"""LLM service: translation via HuggingFace (Llama / EuroLLM / Aya) and grammar analysis via Stanza API."""
import requests
from base import SUPPORTED_LANGUAGES
import settings
# Model identifiers used on the HuggingFace Router.
LLAMA_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
EUROLLM_MODEL = "utter-project/EuroLLM-22B-Instruct-2512"
AYA_MODEL = "CohereLabs/aya-expanse-32b"

HF_ROUTER_URL = "https://router.huggingface.co/v1/chat/completions"

# Source-language codes routed to EuroLLM: small official EU languages
# that are not marked as regional/minority.
EUROLLM_CODES = {
    entry['code']
    for entry in SUPPORTED_LANGUAGES
    if entry.get('small') and not entry.get('regional')
}

# Source-language codes routed to Aya: regional / minority languages.
AYA_CODES = {
    entry['code']
    for entry in SUPPORTED_LANGUAGES
    if entry.get('regional')
}
def _get_lang_info(code: str) -> dict | None:
    """Return the SUPPORTED_LANGUAGES entry whose 'code' matches, or None."""
    return next(
        (entry for entry in SUPPORTED_LANGUAGES if entry['code'] == code),
        None,
    )
def _pick_model(source_lang_code: str) -> str:
    """Route to the best model based on source language.

    Regional/minority languages go to Aya, small EU official languages
    to EuroLLM, and everything else to the Llama default.
    """
    if source_lang_code in AYA_CODES:
        return AYA_MODEL
    return EUROLLM_MODEL if source_lang_code in EUROLLM_CODES else LLAMA_MODEL
def _call_hf_chat(model: str, messages: list[dict], max_tokens: int = 1500) -> str:
    """Send a chat-completion request to the HuggingFace Router.

    Returns the assistant message text; raises requests.HTTPError on a
    non-2xx response.
    """
    response = requests.post(
        HF_ROUTER_URL,
        json={
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
        },
        headers={
            "Authorization": f"Bearer {settings.HF_TOKEN}",
            "Content-Type": "application/json",
        },
        timeout=120,
    )
    response.raise_for_status()
    body = response.json()
    return body["choices"][0]["message"]["content"]
def translate_pages(text: str, source_lang_code: str) -> str:
    """Translate text from the given source language into English."""
    info = _get_lang_info(source_lang_code)
    language = info['name'] if info else source_lang_code
    system_prompt = (
        "You are a professional translator. Translate the following text to English "
        "accurately, preserving formatting and paragraph breaks. Output ONLY the translation."
    )
    return _call_hf_chat(
        _pick_model(source_lang_code),
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Translate from {language} to English:\n\n{text}"},
        ],
    )
def explain_selection(text: str, source_lang_code: str) -> str:
    """Explain a highlighted selection: translation plus grammar/syntax notes in English."""
    info = _get_lang_info(source_lang_code)
    language = info['name'] if info else source_lang_code
    system_prompt = (
        f"You are a {language} language expert. The user highlights a passage in {language}. "
        "Provide:\n1. An English translation of the passage.\n"
        "2. Grammatical notes (parts of speech, cases, tenses, moods).\n"
        "3. Syntactical analysis (sentence structure, clauses, word order).\n"
        "4. Any idiomatic or cultural notes.\n"
        "Be concise but thorough."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    return _call_hf_chat(_pick_model(source_lang_code), messages)
def _extract_words(data) -> list:
    """Flatten a Stanza API response into a flat list of word dicts.

    Handles the response shapes the API returns: a flat list of word
    dicts, a list of sentences (each a list or a {"words": [...]} dict),
    or a top-level {"sentences": [...]} dict.
    """
    if isinstance(data, list):
        # Flat list of word dicts: first element already looks like a word.
        if data and isinstance(data[0], dict) and ('text' in data[0] or 'lemma' in data[0]):
            return data
        words = []
        for sent in data:
            if isinstance(sent, list):
                words.extend(sent)
            elif isinstance(sent, dict) and "words" in sent:
                words.extend(sent["words"])
        return words
    words = []
    for sent in data.get("sentences", []):
        words.extend(sent if isinstance(sent, list) else sent.get("words", []))
    return words


def word_by_word_analysis(text: str, source_lang_code: str) -> dict:
    """Perform word-by-word morpho-syntactic analysis via the Stanza API.

    Returns {"translation": <full-text English translation>,
             "words": [{"form", "grammar_comments", "lemma"}, ...]};
    on failure "words" holds a single {"error": ...} entry.
    """
    lang_info = _get_lang_info(source_lang_code)
    lang_code = lang_info['code'] if lang_info else "en"
    api_url = f"https://randusertry-stanzalazymodels.hf.space/{lang_code}/analyze"

    # Best-effort full-text translation; analysis proceeds even if it fails.
    try:
        sentence_translation = translate_pages(text, source_lang_code)
    except Exception:
        sentence_translation = "[Translation failed]"

    try:
        resp = requests.post(api_url, json={"text": text}, timeout=60)
        if resp.status_code != 200:
            return {"translation": sentence_translation, "words": [{"error": f"Stanza API returned {resp.status_code}"}]}
        word_objects = []
        for word in _extract_words(resp.json()):
            if not isinstance(word, dict):
                continue
            # Final `or ""` guards against explicit nulls: dict.get(key, "")
            # still returns None when the key is PRESENT with a null value
            # (Stanza emits "feats": null for featureless tokens), which
            # previously leaked the string "None" into grammar_comments.
            upos = word.get("pos") or word.get("upos") or ""
            morph = word.get("morph") or word.get("feats") or ""
            if upos == "PUNCT":
                continue  # punctuation has no grammar worth annotating
            word_objects.append({
                "form": word.get("text"),
                "grammar_comments": f"{upos} {morph}".strip(),
                "lemma": word.get("lemma"),
            })
        return {"translation": sentence_translation, "words": word_objects}
    except Exception as e:
        # Surface network/parse failures in-band so the caller can render them.
        return {"translation": sentence_translation, "words": [{"error": str(e)}]}