# NewtonBot / newton_bot_search.py
# Uploaded by Kolyadual via huggingface_hub (commit 9250542, verified)
#!/usr/bin/env python3
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from ddgs import DDGS
import requests
from bs4 import BeautifulSoup
import time
import re
from fastlid import fastlid
def detect_language(text):
    """Detect the language of *text* with fastlid.

    Returns one of the supported codes ("ru", "en", "de", "fr", "es").
    A language that is detected but not supported falls back to "en";
    a detection failure falls back to "ru".
    """
    allowed_langs = {"ru", "en", "de", "fr", "es"}
    try:
        # fastlid returns a (language, confidence) pair; the score is unused.
        lang, score = fastlid(text)
    except Exception:
        # Was a bare `except:` (also swallowed SystemExit/KeyboardInterrupt).
        # NOTE(review): failure defaults to Russian while an unsupported
        # detected language defaults to English — confirm this asymmetry
        # is intentional.
        return "ru"
    return lang if lang in allowed_langs else "en"
# --- Model loading ---
# Load the tokenizer and causal-LM weights from the local "./model" directory.
print(" Newton Bot загружается...")
MODEL_PATH = "./model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    # fp16 on GPU halves memory; CPU falls back to fp32 (fp16 inference
    # is poorly supported on CPU).
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"  # let accelerate place layers on available devices
)
print("✅ Готов к работе с интернетом и языками!\n")
def clean_text(html):
    """Extract readable text from an HTML document.

    Strips <script>/<style> and common boilerplate containers
    (nav/footer/header), collapses all whitespace runs to single spaces,
    and truncates the result to 2000 characters.
    Returns "" for empty/None input.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, "html.parser")
    # Drop non-content elements before extracting text.
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()
    # One whitespace-collapsing pass replaces the original
    # splitlines/split/join pipeline, which produced the same result.
    return re.sub(r'\s+', ' ', soup.get_text()).strip()[:2000]
def web_search(query, max_results=2):
    """Search DuckDuckGo for *query* and return concatenated page text.

    Fetches up to *max_results* result pages, cleans their HTML with
    clean_text(), and returns a context string capped at 2500 characters.
    Returns "" when the search fails or yields nothing usable.
    """
    print(f"🔍 Поиск: {query}")
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=max_results))
        if not results:
            return ""
        context = ""
        for i, res in enumerate(results):
            print(f" 💻 {i+1}. {res['href'][:60]}")
            try:
                resp = requests.get(res["href"], timeout=5, headers={
                    "User-Agent": "Mozilla/5.0 (compatible; NewtonBot/1.0)"
                })
                text = clean_text(resp.text)
                if text:
                    context += f"[{res['title']}]: {text[:1000]}\n\n"
            except Exception as e:
                # Best-effort: one unreachable page must not abort the
                # whole search, but the failure should be visible
                # (previously swallowed silently).
                print(f"⚠️ Не удалось загрузить {res['href'][:60]}: {e}")
            finally:
                # Politeness delay between requests — runs even when a
                # fetch failed (the original `continue` skipped it).
                time.sleep(1)
        return context[:2500].strip()
    except Exception as e:
        print(f"⚠️ Ошибка поиска: {e}")
        return ""
def generate_response(prompt, user_lang="en"):
    """Generate a chat reply for *prompt*.

    Steers the model with a system prompt matching *user_lang*
    (falls back to English for unknown codes) and returns only the
    newly generated text, stripped of special tokens.
    """
    system_prompts = {
        "ru": "Ты — Newton Bot. Отвечай на том же языке, что и пользователь. Будь кратким и точным.",
        "en": "You are Newton Bot. Respond in the same language as the user. Be concise and accurate.",
        "de": "Du bist Newton Bot. Antworte in der gleichen Sprache wie der Benutzer. Sei präzise und hilfreich.",
        "fr": "Vous êtes Newton Bot. Répondez dans la même langue que l'utilisateur. Soyez clair et concis.",
        "es": "Eres Newton Bot. Responde en el mismo idioma que el usuario. Sé claro y preciso."
    }
    chat = [
        {"role": "system", "content": system_prompts.get(user_lang, system_prompts["en"])},
        {"role": "user", "content": prompt},
    ]
    rendered = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer(rendered, return_tensors="pt").to(model.device)
    generated = model.generate(
        **encoded,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Slice off the prompt tokens so only the model's answer is decoded.
    prompt_len = encoded.input_ids.shape[1]
    reply_tokens = generated[0][prompt_len:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()
# --- Main REPL loop ---
print("Newton Bot: отвечает на языке запроса, ищет в интернете!\n")
while True:
    try:
        user_input = input("Вы: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Exit gracefully on Ctrl+D / Ctrl+C instead of a traceback.
        print("\n👋 Пока!")
        break
    if not user_input:
        # Skip empty input — previously it went straight to language
        # detection and the model.
        continue
    if user_input.lower() in ["выход", "exit", "quit"]:
        print("👋 Пока!")
        break
    lang = detect_language(user_input)
    print(f" Язык: {lang.upper()}")
    # Only search the web for non-trivial queries (> 5 characters).
    web_context = web_search(user_input, max_results=2) if len(user_input) > 5 else ""
    if web_context:
        full_prompt = f"На основе информации:\n{web_context}\n\nОтветь на вопрос: {user_input}"
    else:
        full_prompt = user_input
    reply = generate_response(full_prompt, user_lang=lang)
    print(f" Newton Bot ({lang}): {reply}\n")