# book_reader_app / llm_service.py
# Uploaded by randusertry — "Upload 11 files" (commit 3e6b783, verified)
"""LLM service: translation via HuggingFace (Llama / EuroLLM / Aya) and grammar analysis via Stanza API."""
import requests
from base import SUPPORTED_LANGUAGES
import settings
# Model identifiers, as served through the HuggingFace Router.
LLAMA_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
EUROLLM_MODEL = "utter-project/EuroLLM-22B-Instruct-2512"
AYA_MODEL = "CohereLabs/aya-expanse-32b"
# OpenAI-compatible chat-completions endpoint of the HuggingFace Router.
HF_ROUTER_URL = "https://router.huggingface.co/v1/chat/completions"
# Languages where EuroLLM is preferred (EU official small languages)
# NOTE(review): assumes SUPPORTED_LANGUAGES entries carry optional boolean
# 'small' / 'regional' flags — confirm against base.SUPPORTED_LANGUAGES.
EUROLLM_CODES = {
    lang['code'] for lang in SUPPORTED_LANGUAGES
    if lang.get('small') and not lang.get('regional')
}
# Regional / minority languages -> Aya
AYA_CODES = {
    lang['code'] for lang in SUPPORTED_LANGUAGES
    if lang.get('regional')
}
def _get_lang_info(code: str) -> dict | None:
    """Return the SUPPORTED_LANGUAGES entry whose 'code' matches, or None if unknown."""
    return next((entry for entry in SUPPORTED_LANGUAGES if entry['code'] == code), None)
def _pick_model(source_lang_code: str) -> str:
    """Route to the best model based on source language.

    Regional/minority languages go to Aya, small EU languages to EuroLLM,
    and everything else falls back to Llama.
    """
    if source_lang_code in AYA_CODES:
        return AYA_MODEL
    return EUROLLM_MODEL if source_lang_code in EUROLLM_CODES else LLAMA_MODEL
def _call_hf_chat(model: str, messages: list[dict], max_tokens: int = 1500) -> str:
    """POST a chat-completions request to the HuggingFace Router and return the reply text.

    Raises requests.HTTPError on a non-2xx response.
    """
    response = requests.post(
        HF_ROUTER_URL,
        json={
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
        },
        headers={
            "Authorization": f"Bearer {settings.HF_TOKEN}",
            "Content-Type": "application/json",
        },
        timeout=120,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
def translate_pages(text: str, source_lang_code: str) -> str:
    """Translate *text* from the source language into English via the routed model."""
    info = _get_lang_info(source_lang_code)
    language = info['name'] if info else source_lang_code
    system_prompt = "You are a professional translator. Translate the following text to English accurately, preserving formatting and paragraph breaks. Output ONLY the translation."
    user_prompt = f"Translate from {language} to English:\n\n{text}"
    return _call_hf_chat(
        _pick_model(source_lang_code),
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
def explain_selection(text: str, source_lang_code: str) -> str:
    """Explain a highlighted selection: English translation plus grammatical and syntactical notes."""
    info = _get_lang_info(source_lang_code)
    language = info['name'] if info else source_lang_code
    system_prompt = (
        f"You are a {language} language expert. The user highlights a passage in {language}. "
        "Provide:\n1. An English translation of the passage.\n"
        "2. Grammatical notes (parts of speech, cases, tenses, moods).\n"
        "3. Syntactical analysis (sentence structure, clauses, word order).\n"
        "4. Any idiomatic or cultural notes.\n"
        "Be concise but thorough."
    )
    return _call_hf_chat(
        _pick_model(source_lang_code),
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
    )
def _flatten_stanza_words(data) -> list:
    """Flatten a Stanza API response into a single list of word entries.

    The API has been observed to return several shapes:
      - a flat list of word dicts (detected via 'text'/'lemma' keys on the first item);
      - a list of sentences, each either a list of words or a {"words": [...]} dict;
      - a top-level dict with a "sentences" key.
    Any other payload (e.g. a string) raises, matching the caller's
    best-effort error handling.
    """
    if isinstance(data, list):
        # Flat word list: first element already looks like a word dict.
        if len(data) > 0 and isinstance(data[0], dict) and ('text' in data[0] or 'lemma' in data[0]):
            return data
        words = []
        for sent in data:
            if isinstance(sent, list):
                words.extend(sent)
            elif isinstance(sent, dict) and "words" in sent:
                words.extend(sent["words"])
        return words
    # Dict-shaped payload: {"sentences": [...]}. Non-dict payloads raise
    # AttributeError here, which the caller converts to an error entry.
    words = []
    for sent in data.get("sentences", []):
        words.extend(sent if isinstance(sent, list) else sent.get("words", []))
    return words


def word_by_word_analysis(text: str, source_lang_code: str) -> dict:
    """Perform word-by-word morpho-syntactic analysis using the Stanza API,
    plus a full-sentence translation.

    Returns a dict:
      - "translation": English translation of *text*, or "[Translation failed]"
        when the translation call raises.
      - "words": list of {"form", "grammar_comments", "lemma"} entries with
        punctuation tokens skipped, or a single [{"error": ...}] entry when
        the Stanza API call fails. This function never raises.
    """
    lang_info = _get_lang_info(source_lang_code)
    # Fall back to English when the language code is unknown.
    lang_code = lang_info['code'] if lang_info else "en"
    api_url = f"https://randusertry-stanzalazymodels.hf.space/{lang_code}/analyze"
    # Translate the full selected text first (best-effort; never raises).
    sentence_translation = ""
    try:
        sentence_translation = translate_pages(text, source_lang_code)
    except Exception:
        sentence_translation = "[Translation failed]"
    try:
        resp = requests.post(api_url, json={"text": text}, timeout=60)
        if resp.status_code != 200:
            return {"translation": sentence_translation, "words": [{"error": f"Stanza API returned {resp.status_code}"}]}
        word_objects = []
        for word in _flatten_stanza_words(resp.json()):
            if not isinstance(word, dict):
                continue
            # Field names vary between API versions: pos/upos, morph/feats.
            upos = word.get("pos") or word.get("upos", "")
            morph = word.get("morph") or word.get("feats", "")
            if upos == "PUNCT":
                continue  # punctuation carries no useful grammar info
            word_objects.append({
                "form": word.get("text"),
                "grammar_comments": f"{upos} {morph}".strip(),
                "lemma": word.get("lemma"),
            })
        return {"translation": sentence_translation, "words": word_objects}
    except Exception as e:
        # Best-effort endpoint: surface the failure in the payload rather than raising.
        return {"translation": sentence_translation, "words": [{"error": str(e)}]}