import re
import time

import requests
import torch
from bs4 import BeautifulSoup
from ddgs import DDGS
from fastlid import fastlid
from transformers import AutoTokenizer, AutoModelForCausalLM
def detect_language(text):
    """Detect the language of *text*, restricted to the supported set.

    Uses fastlid for identification. Returns one of {"ru", "en", "de",
    "fr", "es"}; any other detected language falls back to "en", and a
    detection failure falls back to "ru".
    """
    try:
        lang, score = fastlid(text)
        # Only languages we have a system prompt for are accepted.
        allowed_langs = {"ru", "en", "de", "fr", "es"}
        return lang if lang in allowed_langs else "en"
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any real detection error means "ru".
        return "ru"
# --- Model startup -------------------------------------------------------
print(" Newton Bot загружается...")
MODEL_PATH = "./model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# fp16 on GPU halves memory; fp32 on CPU, where fp16 is poorly supported.
# device_map="auto" lets accelerate place layers across available devices.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
print("✅ Готов к работе с интернетом и языками!\n")
def clean_text(html):
    """Strip an HTML document down to its visible text.

    Removes script/style and page-chrome elements, collapses every run
    of whitespace to a single space, and truncates the result to 2000
    characters. Returns "" for empty/None input.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, "html.parser")
    # Drop non-content elements so only readable page text remains.
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()
    # str.split() with no argument splits on any whitespace run and drops
    # empties — one pass replacing the strip/split/re.sub pipeline.
    text = " ".join(soup.get_text().split())
    return text[:2000]
def web_search(query, max_results=2):
    """Search DuckDuckGo for *query* and return concatenated page text.

    Fetches up to *max_results* result pages, cleans each one, and joins
    them as "[title]: text" paragraphs capped at 2500 characters.
    Returns "" when nothing is found or the search itself fails.
    """
    print(f"🔍 Поиск: {query}")
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=max_results))
        if not results:
            return ""

        context = ""
        for i, res in enumerate(results):
            print(f" 💻 {i+1}. {res['href'][:60]}")
            try:
                resp = requests.get(res["href"], timeout=5, headers={
                    "User-Agent": "Mozilla/5.0 (compatible; NewtonBot/1.0)"
                })
                # Skip HTTP error pages (4xx/5xx) instead of scraping
                # their bodies as if they were content.
                resp.raise_for_status()
                text = clean_text(resp.text)
                if text:
                    context += f"[{res['title']}]: {text[:1000]}\n\n"
            except Exception:
                # Best-effort: one unreachable page must not kill the search.
                continue
            time.sleep(1)  # be polite between fetches
        return context[:2500].strip()
    except Exception as e:
        print(f"⚠️ Ошибка поиска: {e}")
        return ""
def generate_response(prompt, user_lang="en"):
    """Generate a chat reply for *prompt* in the user's language.

    Builds a system+user conversation with a per-language system prompt
    (falling back to English for unknown codes), runs sampled generation
    (temperature 0.8, top-p 0.95, up to 300 new tokens) on the
    module-level model, and returns only the newly generated text.
    """
    system_prompts = {
        "ru": "Ты — Newton Bot. Отвечай на том же языке, что и пользователь. Будь кратким и точным.",
        "en": "You are Newton Bot. Respond in the same language as the user. Be concise and accurate.",
        "de": "Du bist Newton Bot. Antworte in der gleichen Sprache wie der Benutzer. Sei präzise und hilfreich.",
        "fr": "Vous êtes Newton Bot. Répondez dans la même langue que l'utilisateur. Soyez clair et concis.",
        "es": "Eres Newton Bot. Responde en el mismo idioma que el usuario. Sé claro y preciso."
    }
    system_msg = system_prompts.get(user_lang, system_prompts["en"])

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    # Inference only: no_grad avoids building an autograd graph,
    # saving memory without changing the generated tokens.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=True,
            temperature=0.8,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id
        )
    # Slice off the prompt tokens so only the model's reply is decoded.
    response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
    return response.strip()
| print("Newton Bot: отвечает на языке запроса, ищет в интернете!\n") |
|
|
| while True: |
| user_input = input("Вы: ").strip() |
| if user_input.lower() in ["выход", "exit", "quit"]: |
| print("👋 Пока!") |
| break |
|
|
| lang = detect_language(user_input) |
| print(f" Язык: {lang.upper()}") |
|
|
| web_context = web_search(user_input, max_results=2) if len(user_input) > 5 else "" |
|
|
| if web_context: |
| full_prompt = f"На основе информации:\n{web_context}\n\nОтветь на вопрос: {user_input}" |
| else: |
| full_prompt = user_input |
|
|
| reply = generate_response(full_prompt, user_lang=lang) |
| print(f" Newton Bot ({lang}): {reply}\n") |
|
|