# NewtonBot / newton_bot_search.py
# Uploaded by Kolyadual via huggingface_hub (commit 9250542, verified)
#!/usr/bin/env python3
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from ddgs import DDGS
import requests
from bs4 import BeautifulSoup
import time
import re
from fastlid import fastlid
def detect_language(text):
    """Detect the language of *text* with fastlid.

    Returns one of the supported codes ("ru", "en", "de", "fr", "es").
    A language that is detected but not supported falls back to "en";
    a detection failure falls back to "ru".
    """
    allowed_langs = {"ru", "en", "de", "fr", "es"}
    try:
        # fastlid returns a (language, confidence) pair; the score is unused.
        lang, score = fastlid(text)
    except Exception:
        # Was a bare `except:` (also swallowed SystemExit/KeyboardInterrupt).
        # NOTE(review): failure defaults to Russian while an unsupported
        # detected language defaults to English — confirm this asymmetry
        # is intentional.
        return "ru"
    return lang if lang in allowed_langs else "en"
# --- Model loading ---
# Load the tokenizer and causal-LM weights from the local "./model" directory.
print(" Newton Bot загружается...")
MODEL_PATH = "./model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    # fp16 on GPU halves memory; CPU falls back to fp32 (fp16 inference
    # is poorly supported on CPU).
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"  # let accelerate place layers on available devices
)
print("✅ Готов к работе с интернетом и языками!\n")
def clean_text(html):
    """Extract readable text from an HTML document.

    Strips <script>/<style> and common boilerplate containers
    (nav/footer/header), collapses all whitespace runs to single spaces,
    and truncates the result to 2000 characters.
    Returns "" for empty/None input.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, "html.parser")
    # Drop non-content elements before extracting text.
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()
    # One whitespace-collapsing pass replaces the original
    # splitlines/split/join pipeline, which produced the same result.
    return re.sub(r'\s+', ' ', soup.get_text()).strip()[:2000]
def web_search(query, max_results=2):
    """Search DuckDuckGo for *query* and return concatenated page text.

    Fetches up to *max_results* result pages, cleans their HTML with
    clean_text(), and returns a context string capped at 2500 characters.
    Returns "" when the search fails or yields nothing usable.
    """
    print(f"🔍 Поиск: {query}")
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=max_results))
        if not results:
            return ""
        context = ""
        for i, res in enumerate(results):
            print(f" 💻 {i+1}. {res['href'][:60]}")
            try:
                resp = requests.get(res["href"], timeout=5, headers={
                    "User-Agent": "Mozilla/5.0 (compatible; NewtonBot/1.0)"
                })
                text = clean_text(resp.text)
                if text:
                    context += f"[{res['title']}]: {text[:1000]}\n\n"
            except Exception as e:
                # Best-effort: one unreachable page must not abort the
                # whole search, but the failure should be visible
                # (previously swallowed silently).
                print(f"⚠️ Не удалось загрузить {res['href'][:60]}: {e}")
            finally:
                # Politeness delay between requests — runs even when a
                # fetch failed (the original `continue` skipped it).
                time.sleep(1)
        return context[:2500].strip()
    except Exception as e:
        print(f"⚠️ Ошибка поиска: {e}")
        return ""
def generate_response(prompt, user_lang="en"):
    """Generate a chat reply for *prompt*.

    Steers the model with a system prompt matching *user_lang*
    (falls back to English for unknown codes) and returns only the
    newly generated text, stripped of special tokens.
    """
    system_prompts = {
        "ru": "Ты — Newton Bot. Отвечай на том же языке, что и пользователь. Будь кратким и точным.",
        "en": "You are Newton Bot. Respond in the same language as the user. Be concise and accurate.",
        "de": "Du bist Newton Bot. Antworte in der gleichen Sprache wie der Benutzer. Sei präzise und hilfreich.",
        "fr": "Vous êtes Newton Bot. Répondez dans la même langue que l'utilisateur. Soyez clair et concis.",
        "es": "Eres Newton Bot. Responde en el mismo idioma que el usuario. Sé claro y preciso."
    }
    chat = [
        {"role": "system", "content": system_prompts.get(user_lang, system_prompts["en"])},
        {"role": "user", "content": prompt},
    ]
    rendered = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer(rendered, return_tensors="pt").to(model.device)
    generated = model.generate(
        **encoded,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Slice off the prompt tokens so only the model's answer is decoded.
    prompt_len = encoded.input_ids.shape[1]
    reply_tokens = generated[0][prompt_len:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()
# --- Main REPL loop ---
print("Newton Bot: отвечает на языке запроса, ищет в интернете!\n")
while True:
    try:
        user_input = input("Вы: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Exit gracefully on Ctrl+D / Ctrl+C instead of a traceback.
        print("\n👋 Пока!")
        break
    if not user_input:
        # Skip empty input — previously it went straight to language
        # detection and the model.
        continue
    if user_input.lower() in ["выход", "exit", "quit"]:
        print("👋 Пока!")
        break
    lang = detect_language(user_input)
    print(f" Язык: {lang.upper()}")
    # Only search the web for non-trivial queries (> 5 characters).
    web_context = web_search(user_input, max_results=2) if len(user_input) > 5 else ""
    if web_context:
        full_prompt = f"На основе информации:\n{web_context}\n\nОтветь на вопрос: {user_input}"
    else:
        full_prompt = user_input
    reply = generate_response(full_prompt, user_lang=lang)
    print(f" Newton Bot ({lang}): {reply}\n")