| |
|
|
| |
| |
| |
| import os |
|
|
# Progress banners for the environment-setup cell.
# NOTE(review): only the status messages are present in this export; the shell
# steps they describe (install / clone / download / unzip) are not visible
# here -- presumably they were notebook `!` commands. Confirm against the
# original notebook before relying on these "done" messages.
for _setup_message in (
    "--- 1. Installing All Libraries ---",
    "✅ Libraries installed.",
    "\n--- 2. Cloning IndicLID Repository ---",
    "✅ Repository cloned.",
    "\n--- 3. Downloading and Unzipping IndicLID Models ---",
    "✅ Download commands executed. Unzipping now...",
    "✅ Unzip commands executed.",
    "\n🎉🎉🎉 SETUP COMPLETE. You can now proceed to Step 2. 🎉🎉🎉",
):
    print(_setup_message)
|
|
|
|
| import shutil |
| import os |
|
|
| |
# Relocate the installed `transformers` package directory into the IndicLID
# inference tree.
# NOTE(review): the source path hard-codes the python3.12 dist-packages
# location; confirm it matches the runtime in use.
source = "/usr/local/lib/python3.12/dist-packages/transformers"
destination = "/content/IndicLID/Inference/ai4bharat/"

# Make sure the destination directory exists before moving into it.
os.makedirs(destination, exist_ok=True)

# shutil.move() places `source` *inside* an existing destination directory,
# so this is the path the folder ends up at.
_moved_target = os.path.join(destination, os.path.basename(source))

if os.path.exists(_moved_target):
    # A previous run already moved the folder. Moving again would raise
    # (destination exists / source missing), so make the cell re-runnable.
    moved_path = _moved_target
    print(f"Folder already present at: {moved_path}")
else:
    # First run: same behaviour as before (still raises if `source` is missing).
    moved_path = shutil.move(source, destination)
    print(f"Folder moved to: {moved_path}")
|
|
|
|
|
|
| |
| |
| |
| import os |
| import sys |
| import torch |
print("--- Applying your original add_safe_globals fix... ---")

# Put the cloned IndicLID inference directory on the import path (only once).
_indiclid_inference_dir = "/content/IndicLID/Inference"
if _indiclid_inference_dir not in sys.path:
    sys.path.append(_indiclid_inference_dir)

from transformers.models.bert.modeling_bert import (
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput
)
from transformers.models.bert.configuration_bert import BertConfig
import torch.nn as nn
from torch.nn.modules.sparse import Embedding
from torch.nn.modules.container import ModuleList
from torch.nn.modules.linear import Linear
from torch.nn.modules.normalization import LayerNorm
from torch.nn.modules.dropout import Dropout

# Classes registered with torch.serialization.add_safe_globals before the
# IndicLID model is constructed.
# NOTE(review): presumably the IndicLID checkpoints pickle these classes and
# torch.load would otherwise reject them -- confirm.
_safe_classes = [
    # BERT building blocks
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig,
    # generic torch.nn layers
    Embedding, ModuleList, Linear, LayerNorm, Dropout,
]
torch.serialization.add_safe_globals(_safe_classes)
print("✅ Comprehensive safe globals added successfully.")

# These imports come after the sys.path update above; `ai4bharat.IndicLID` is
# expected to resolve from the appended directory.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from ai4bharat.IndicLID import IndicLID

print("--- Loading all models into memory... ---")
# Use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Language identifier.
lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
print("✅ IndicLID model loaded successfully.")

# Indic -> English translation model, its tokenizer and the pre/post-processor.
MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to(device)
ip = IndicProcessor(inference=True)
print("✅ IndicTrans2 1B model loaded.")

print("🎉 ALL MODELS ARE LOADED. Proceed to direct batch prediction tests.")
|
|
|
|
import subprocess
import sys

# Show the module search path of the running interpreter.
print(sys.path)

# `pip show transformers` is notebook shorthand, not Python syntax. Run it
# through the current interpreter's pip and print the captured output so it
# is visible wherever this code runs.
_pip_show = subprocess.run(
    [sys.executable, "-m", "pip", "show", "transformers"],
    capture_output=True,
    text=True,
    check=False,
)
print(_pip_show.stdout or _pip_show.stderr)
|
|
|
|
|
|
| |
| |
| |
|
|
| from transformers import AutoTokenizer, AutoModelForCausalLM |
| import torch |
|
|
print("--- Loading RomanSetu model compatible with transformers 4.40.2... ---")

# Candidate checkpoints, tried in order; the first one that loads is kept.
model_options = [
    "ai4bharat/romansetu-cpt-roman-100m",
    "ai4bharat/romansetu-cpt-roman-200m",
]

rs_model = None
rs_tokenizer = None

# Half precision is only requested on a GPU; on CPU fall back to float32.
_rs_dtype = torch.float16 if torch.device(device).type == "cuda" else torch.float32

for model_id in model_options:
    print(f"Trying model: {model_id}")
    try:
        # Load into temporaries so a failed model load does not leave a
        # tokenizer bound without its model.
        _candidate_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        _candidate_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=_rs_dtype).to(device)
    except Exception as e:
        # Best effort: report and move on to the next candidate.
        print(f"❌ {model_id} failed: {e}")
        continue
    rs_tokenizer, rs_model = _candidate_tokenizer, _candidate_model
    print(f"✅ {model_id} loaded successfully.")
    break

if rs_model is None:
    print("❌ All RomanSetu models failed. Continuing with transliteration-based approach.")
|
|
def translate_with_romansetu(text, max_new_tokens=50):
    """Translate romanized text to English, best effort.

    When a RomanSetu model was loaded (``rs_model`` is not None) the text is
    wrapped in an instruction prompt and the model's continuation is returned.
    Sampling is enabled, so repeated calls may give different results.

    When no RomanSetu model is available, the text is transliterated from
    ITRANS to Devanagari and translated with the already loaded IndicTrans2
    model, always using ``hin_Deva`` as the source language.

    Args:
        text: Romanized input sentence.
        max_new_tokens: Generation budget for the RomanSetu path.

    Returns:
        The English translation, or ``text`` unchanged when anything fails or
        the generated translation is two characters or shorter.
    """
    if rs_model is None:
        # Fallback: ITRANS -> Devanagari -> IndicTrans2.
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        try:
            native_text = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
            pre = ip.preprocess_batch([native_text], src_lang="hin_Deva", tgt_lang="eng_Latn")
            inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
            with torch.no_grad():
                out = model.generate(**inputs, num_beams=3, max_length=100)
            dec = tokenizer.batch_decode(out, skip_special_tokens=True)
            # postprocess_batch expects the *target* language of the output.
            post = ip.postprocess_batch(dec, lang="eng_Latn")
            return post[0]
        except Exception as e:
            # Best effort: report the failure and hand back the input.
            print(f"Fallback translation error for '{text}': {e}")
            return text

    try:
        prompt = f"Translate this romanized Indian text to English: {text}"
        inputs = rs_tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = rs_model.generate(
                inputs.input_ids,
                attention_mask=inputs.get("attention_mask"),
                max_new_tokens=max_new_tokens,
                num_beams=2,
                temperature=0.7,
                do_sample=True,
                pad_token_id=rs_tokenizer.eos_token_id,
            )

        # generate() returns a batch; decode() takes one sequence. Decode only
        # the tokens produced after the prompt instead of string-stripping the
        # prompt from the full text.
        prompt_length = inputs.input_ids.shape[1]
        translation = rs_tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()
        return translation if len(translation) > 2 else text

    except Exception as e:
        print(f"RomanSetu translation error for '{text}': {e}")
        return text
|
|
# Status lines printed once the RomanSetu/fallback translator is defined.
print(
    "✅ RomanSetu/fallback translation function defined.",
    "🎉 SETUP COMPLETE with fallback mechanism.",
    sep="\n",
)
|
|
|
|
| |
| |
| |
|
|
print("--- Installing and loading IndicXlit for better romanized text handling ---")

from ai4bharat.transliteration import XlitEngine
import torch

# Language name -> code passed to XlitEngine, in construction order.
_xlit_language_codes = {
    "hindi": "hi",
    "bengali": "bn",
    "tamil": "ta",
    "telugu": "te",
    "gujarati": "gu",
    "kannada": "kn",
    "malayalam": "ml",
    "punjabi": "pa",
    "marathi": "mr",
    "urdu": "ur",
}

try:
    # One engine per language; if any constructor fails, none are kept.
    xlit_engines = {
        _language: XlitEngine(_code, beam_width=4, rescore=True)
        for _language, _code in _xlit_language_codes.items()
    }
    print("✅ Multiple IndicXlit engines loaded successfully.")
except Exception as e:
    print(f"❌ Error loading IndicXlit: {e}")
    print("💡 Falling back to basic transliteration.")
    xlit_engines = {}
|
|
# Language name -> key under which an IndicXlit engine reports its output.
_XLIT_LANG_CODES = {
    "hindi": "hi", "bengali": "bn", "tamil": "ta", "telugu": "te",
    "gujarati": "gu", "kannada": "kn", "malayalam": "ml",
    "punjabi": "pa", "marathi": "mr", "urdu": "ur",
}


def enhanced_transliterate_with_xlit(text, target_lang):
    """Transliterate romanized ``text`` into the script of ``target_lang``.

    Uses the IndicXlit engine registered for the language in ``xlit_engines``.
    When no engine is available for it, falls back to ITRANS-based
    transliteration from ``indic_transliteration`` (Devanagari for unknown
    languages).

    Args:
        text: Romanized word or sentence.
        target_lang: Language name, case-insensitive (e.g. ``"Hindi"``).

    Returns:
        The transliterated string, or ``text`` unchanged when transliteration
        fails on either path.
    """
    lang_key = target_lang.lower()

    if not xlit_engines or lang_key not in xlit_engines:
        # No engine for this language: rule-based fallback. Errors are reported
        # and the input returned, matching the engine path below.
        try:
            from indic_transliteration import sanscript
            from indic_transliteration.sanscript import transliterate
            # NOTE(review): 'urdu' is passed as a raw scheme name -- confirm
            # that sanscript actually defines such a scheme.
            script_map = {
                "hindi": sanscript.DEVANAGARI, "bengali": sanscript.BENGALI,
                "tamil": sanscript.TAMIL, "telugu": sanscript.TELUGU,
                "kannada": sanscript.KANNADA, "malayalam": sanscript.MALAYALAM,
                "gujarati": sanscript.GUJARATI, "punjabi": sanscript.GURMUKHI,
                "marathi": sanscript.DEVANAGARI, "urdu": 'urdu'
            }
            target_script = script_map.get(lang_key, sanscript.DEVANAGARI)
            return transliterate(text, sanscript.ITRANS, target_script)
        except Exception as e:
            print(f"Fallback transliteration error for '{text}': {e}")
            return text

    try:
        engine = xlit_engines[lang_key]
        lang_code = _XLIT_LANG_CODES.get(lang_key, "hi")

        if ' ' in text:
            # Several words: sentence API; the result is looked up by code.
            result = engine.translit_sentence(text)
            return result.get(lang_code, text)

        # Single word: take the top candidate; keep the input when the engine
        # returned no candidates.
        result = engine.translit_word(text, topk=1)
        candidates = result.get(lang_code) or [text]
        return candidates[0]

    except Exception as e:
        print(f"IndicXlit error for '{text}': {e}")
        return text
|
|
# Status lines printed once the IndicXlit helper is defined.
print(
    "✅ Enhanced transliteration function defined.",
    "🎉 INDICXLIT SETUP COMPLETE.",
    sep="\n",
)
|
|
|
|
| import pandas as pd |
| from indic_transliteration import sanscript |
| from indic_transliteration.sanscript import transliterate |
|
|
| |
# Detected language codes grouped by the translation settings they share:
# (detected codes, display name, transliteration target script, source code
# handed to the translation model). Several detected languages are
# deliberately folded into Hindi or Bengali settings here.
_LID_GROUPS = (
    (("hin_Deva", "hin_Latn", "mai_Deva", "mai_Latn"),
     "Hindi", sanscript.DEVANAGARI, "hin_Deva"),
    (("ben_Beng", "ben_Latn", "asm_Beng", "asm_Latn"),
     "Bengali", sanscript.BENGALI, "ben_Beng"),
    (("tam_Tamil", "tam_Taml", "tam_Latn"),
     "Tamil", sanscript.TAMIL, "tam_Taml"),
    (("tel_Telu", "tel_Latn"), "Telugu", sanscript.TELUGU, "tel_Telu"),
    (("kan_Knda", "kan_Latn"), "Kannada", sanscript.KANNADA, "kan_Knda"),
    (("mal_Mlym", "mal_Latn"), "Malayalam", sanscript.MALAYALAM, "mal_Mlym"),
    (("guj_Gujr", "guj_Latn"), "Gujarati", sanscript.GUJARATI, "guj_Gujr"),
    (("pan_Guru", "pan_Latn"), "Punjabi", sanscript.GURMUKHI, "pan_Guru"),
    (("mar_Deva", "mar_Latn"), "Marathi", sanscript.DEVANAGARI, "mar_Deva"),
    (("urd_Arab", "urd_Latn"), "Urdu", 'urdu', "urd_Arab"),
    # Romanized-only detections that are handled with the Hindi settings.
    (("snd_Latn", "nep_Latn", "kok_Latn", "gom_Latn", "brx_Latn"),
     "Hindi", sanscript.DEVANAGARI, "hin_Deva"),
)

# Detected code -> {"name", "script", "it_code"}; every code gets its own dict.
LID_TO_TRANSLATE = {
    _lid_code: {"name": _name, "script": _script, "it_code": _it_code}
    for _lid_codes, _name, _script, _it_code in _LID_GROUPS
    for _lid_code in _lid_codes
}
|
|
def enhanced_transliterate_robust(text, target_script):
    """Transliterate casually romanized text into ``target_script``.

    The text is lower-cased and stripped, doubled vowels are rewritten to the
    single-letter forms used below, and the result is transliterated from the
    ITRANS scheme.

    Args:
        text: Romanized input.
        target_script: Target scheme passed to ``transliterate``.

    Returns:
        The transliterated text, or ``text`` unchanged when the result is empty
        or an error occurs (the error is printed).
    """
    try:
        cleaned_text = text.lower().strip()

        # Only substitutions that change the text are kept; the former
        # identity pairs ('kh' -> 'kh', ...) were no-ops. Order is preserved.
        for old, new in (("aa", "A"), ("ee", "I"), ("oo", "U"), ("ou", "au")):
            cleaned_text = cleaned_text.replace(old, new)

        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text

    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
|
|
def _indictrans2_to_english(native_text, src_code):
    """Translate one native-script sentence to English with the loaded model."""
    batch = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
    encoded = tokenizer(batch, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        generated = model.generate(**encoded, num_beams=5, max_length=256, early_stopping=True)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    # postprocess_batch expects the *target* language of the decoded text.
    return ip.postprocess_batch(decoded, lang="eng_Latn")[0]


def detect_and_translate_robust(texts, batch_size=64):
    """Detect the language of each text and translate it to English.

    Each prediction from ``lid.batch_predict`` is looked up in
    ``LID_TO_TRANSLATE``. Romanized detections (codes ending in ``_Latn``) are
    first transliterated to the mapped native script; the text is then
    translated one sentence at a time.

    Args:
        texts: Sequence of input strings.
        batch_size: Batch size forwarded to the language identifier.

    Returns:
        A DataFrame with one row per prediction and the columns
        ``original_text``, ``detected_lang``, ``script_type``, ``confidence``,
        ``translation_method`` and ``english_translation``. Unsupported
        languages and translation failures are reported in the last two
        columns rather than raised. Empty input gives an empty frame with
        those columns.
    """
    columns = [
        "original_text", "detected_lang", "script_type",
        "confidence", "translation_method", "english_translation",
    ]
    if len(texts) == 0:
        return pd.DataFrame(columns=columns)

    results = []
    for item in lid.batch_predict(texts, batch_size):
        # Predictions are handled both as dicts and as 4-item sequences.
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
        else:
            text, lang_code, score, _model_name = item
            score = float(score)

        is_romanized = lang_code.endswith("_Latn")
        lang_info = LID_TO_TRANSLATE.get(lang_code)

        if lang_info is None:
            translation = f"Language '{lang_code}' not supported for translation"
            method = "Unsupported"
        else:
            try:
                src_code = lang_info["it_code"]

                if is_romanized:
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Enhanced Transliteration + IndicTrans2 (detected as {lang_code})"
                    print(f"Enhanced: '{text}' → '{native_text}' (detected: {lang_code})")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected as {lang_code})"

                translation = _indictrans2_to_english(native_text, src_code)

            except Exception as e:
                # Keep going with the remaining texts; record the failure.
                translation = f"Translation error: {str(e)}"
                method = "Error"

        results.append({
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "translation_method": method,
            "english_translation": translation,
        })

    return pd.DataFrame(results, columns=columns)
|
|
print("✅ Robust translation function with expanded language mapping defined")

# Smoke-test inputs: a mix of native-script and romanized sentences.
sample_texts = [
    "यहाँ कितने लोग हैं?",        # Devanagari script
    "tum kaha ho",               # romanized
    "aaj mausam suhana hai",     # romanized
    "aap kaise hain",            # romanized
    "আমি ভালো আছি।",             # Bengali script
    "ami bhalo achi",            # romanized
    "mera naam rahul hai",       # romanized
    "main office jaa raha hun",  # romanized
]

print("🔍 Testing robust approach with expanded language mapping...")
_sample_batch_size = 16
df_results = detect_and_translate_robust(sample_texts, batch_size=_sample_batch_size)
display(df_results)
|
|
|
|
| |
| |
| |
|
|
| import pandas as pd |
| from indic_transliteration import sanscript |
| from indic_transliteration.sanscript import transliterate |
|
|
| |
# One test sentence per language as (language, native script, romanized).
_SAMPLE_ROWS = [
    ("Assamese", "আপুনি কেনেকৈ আছেন?", "apuni kenekoi asen?"),
    ("Bengali", "তুমি কেমন আছো?", "tumi kemon acho?"),
    ("Bodo", "नांगनि फाथै खौ?", "nangni phathai kho?"),
    ("Dogri", "तुसीं केहे हो?", "tusi kehe ho?"),
    ("Gujarati", "તમે કેમ છો?", "tame kem cho?"),
    ("Hindi", "तुम कैसे हो?", "tum kaise ho?"),
    ("Kannada", "ನೀವು ಹೇಗಿದ್ದೀರಾ?", "neevu hegiddira?"),
    ("Kashmiri", "तुस की छै?", "tus ki chhai?"),
    ("Konkani", "तुम कशें आसा?", "tum kashen asa?"),
    ("Maithili", "अहाँ कथी छी?", "ahaan kathi chhi?"),
    ("Malayalam", "സുഖമായിരോ?", "sukhamaayiro?"),
    ("Manipuri", "नमस्कार, नखोंगबा तौ?", "namaskaar, nakhongba tau?"),
    ("Marathi", "तू कसा आहेस?", "tu kasa ahes?"),
    ("Nepali", "तिमी कस्तो छौ?", "timi kasto chau?"),
    ("Odia", "ତୁମେ କେମିତି ଅଛ?", "tume kemiti achha?"),
    ("Punjabi", "ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?", "tusi kiven ho?"),
    ("Sanskrit", "भवतः कथम् अस्ति?", "bhavatah katham asti?"),
    ("Santali", "ᱥᱟᱱᱛᱟᱲᱤ ᱠᱚᱱᱛᱮᱞᱤ ᱟᱹᱲᱤ?", "santalii konteli adii?"),
    ("Sindhi", "توهان ڪيئن آهيو؟", "tohan kayn aahiyo?"),
    ("Tamil", "நீங்கள் எப்படி இருக்கிறீர்கள்?", "neenga epdi irukeenga?"),
    ("Telugu", "మీరు ఎలా ఉన్నారు?", "meeru ela unnaru?"),
    ("Urdu", "آپ کیسے ہیں؟", "aap kaise hain?"),
]

# Language name -> (native-script sentence, romanized sentence).
sample_sentences = {
    _language: (_native, _roman) for _language, _native, _roman in _SAMPLE_ROWS
}
|
|
| |
# Detected language codes grouped by the translation settings they share:
# (detected codes, display name, transliteration target script, source code
# handed to the translation model). Several detected languages are
# deliberately folded into Hindi or Bengali settings here.
_LID_GROUPS = (
    (("hin_Deva", "hin_Latn", "mai_Deva", "mai_Latn",
      "nep_Latn", "snd_Latn", "kok_Latn", "brx_Latn"),
     "Hindi", sanscript.DEVANAGARI, "hin_Deva"),
    (("ben_Beng", "ben_Latn", "asm_Beng", "asm_Latn"),
     "Bengali", sanscript.BENGALI, "ben_Beng"),
    (("tam_Tamil", "tam_Taml", "tam_Latn"),
     "Tamil", sanscript.TAMIL, "tam_Taml"),
    (("tel_Telu", "tel_Latn"), "Telugu", sanscript.TELUGU, "tel_Telu"),
    (("kan_Knda", "kan_Latn"), "Kannada", sanscript.KANNADA, "kan_Knda"),
    (("mal_Mlym", "mal_Latn"), "Malayalam", sanscript.MALAYALAM, "mal_Mlym"),
    (("guj_Gujr", "guj_Latn"), "Gujarati", sanscript.GUJARATI, "guj_Gujr"),
    (("pan_Guru", "pan_Latn"), "Punjabi", sanscript.GURMUKHI, "pan_Guru"),
    (("mar_Deva", "mar_Latn"), "Marathi", sanscript.DEVANAGARI, "mar_Deva"),
    (("urd_Arab", "urd_Latn"), "Urdu", 'urdu', "urd_Arab"),
)

# Detected code -> {"name", "script", "it_code"}; every code gets its own dict.
LID_TO_TRANSLATE = {
    _lid_code: {"name": _name, "script": _script, "it_code": _it_code}
    for _lid_codes, _name, _script, _it_code in _LID_GROUPS
    for _lid_code in _lid_codes
}
|
|
def enhanced_transliterate_robust(text, target_script):
    """Transliterate casually romanized text into ``target_script``.

    The text is lower-cased and stripped, doubled vowels are rewritten to the
    single-letter forms used below, and the result is transliterated from the
    ITRANS scheme.

    Args:
        text: Romanized input.
        target_script: Target scheme passed to ``transliterate``.

    Returns:
        The transliterated text, or ``text`` unchanged when the result is empty
        or an error occurs (the error is printed).
    """
    try:
        cleaned_text = text.lower().strip()
        # Only substitutions that change the text are kept; the former
        # identity pairs ('kh' -> 'kh', ...) were no-ops. Order is preserved.
        for old, new in (("aa", "A"), ("ee", "I"), ("oo", "U"), ("ou", "au")):
            cleaned_text = cleaned_text.replace(old, new)
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text
    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
|
|
def _translate_sample_to_english(native_text, src_code):
    """Translate one native-script sentence to English with the loaded model."""
    batch = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
    encoded = tokenizer(batch, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        generated = model.generate(**encoded, num_beams=5, max_length=256, early_stopping=True)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    # postprocess_batch expects the *target* language of the decoded text.
    return ip.postprocess_batch(decoded, lang="eng_Latn")[0]


def test_all_22_languages(texts, batch_size=32):
    """Run detection and translation over the multi-language test set.

    Each prediction from ``lid.batch_predict`` is looked up in
    ``LID_TO_TRANSLATE``. Romanized detections (codes ending in ``_Latn``) are
    transliterated to the mapped native script before being translated to
    English one sentence at a time.

    Args:
        texts: Sequence of input strings.
        batch_size: Batch size forwarded to the language identifier.

    Returns:
        A DataFrame with one row per prediction and the columns ``language``
        (the input text, truncated to 20 characters plus "..."),
        ``original_text``, ``detected_lang``, ``script_type``, ``confidence``,
        ``method`` and ``english_translation``. Unsupported languages and
        translation failures are reported in the last two columns rather than
        raised. Empty input gives an empty frame with those columns.
    """
    columns = [
        "language", "original_text", "detected_lang", "script_type",
        "confidence", "method", "english_translation",
    ]
    if len(texts) == 0:
        return pd.DataFrame(columns=columns)

    results = []
    for item in lid.batch_predict(texts, batch_size):
        # Predictions are handled both as dicts and as 4-item sequences.
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
        else:
            text, lang_code, score, _model_name = item
            score = float(score)

        is_romanized = lang_code.endswith("_Latn")
        lang_info = LID_TO_TRANSLATE.get(lang_code)

        if lang_info is None:
            translation = f"Language '{lang_code}' not supported"
            method = "Unsupported"
        else:
            try:
                src_code = lang_info["it_code"]

                if is_romanized:
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Transliteration+IndicTrans2 (detected: {lang_code})"
                    print(f"Romanized: '{text}' → '{native_text}'")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected: {lang_code})"

                translation = _translate_sample_to_english(native_text, src_code)

            except Exception as e:
                # Keep going with the remaining texts; record the failure.
                translation = f"Translation error: {str(e)}"
                method = "Error"

        results.append({
            "language": text[:20] + "..." if len(text) > 20 else text,
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "method": method,
            "english_translation": translation,
        })

    return pd.DataFrame(results, columns=columns)
|
|
| |
print("🔍 Creating test dataset for all 22 official Indian languages...")
# Flatten the samples to [native, romanized, native, romanized, ...].
all_test_texts = []
for native, roman in sample_sentences.values():
    all_test_texts.extend((native, roman))

print(f"📊 Testing {len(all_test_texts)} samples ({len(sample_sentences)} languages × 2 scripts)...")

# Run detection + translation over the whole set and show the table.
df_results = test_all_22_languages(all_test_texts, batch_size=32)

print("\n🎯 COMPLETE TEST RESULTS:")
display(df_results)

# Row masks for the summary counts below.
_native_rows = df_results['script_type'] == 'Native'
_romanized_rows = df_results['script_type'] == 'Romanized'
# A row counts as translated when its text carries neither failure marker.
_translated_rows = ~df_results['english_translation'].str.contains('error|not supported', case=False)

print("\n📈 SUMMARY STATISTICS:")
print(f"Total samples tested: {len(df_results)}")
print(f"Languages detected: {df_results['detected_lang'].nunique()}")
print(f"Native script samples: {len(df_results[_native_rows])}")
print(f"Romanized samples: {len(df_results[_romanized_rows])}")
print(f"Successfully translated: {len(df_results[_translated_rows])}")
|
|
|
|
| import pandas as pd |
|
|
def detailed_translation_summary(df_results):
    """Print a detailed report of translation results and return the tables.

    A row counts as failed when its ``english_translation`` contains "error"
    or "not supported" (case-insensitive substring match), which are the
    markers written for translation failures and unsupported languages.

    Side effect: a boolean ``successful_translation`` column is added to
    ``df_results`` in place.

    Args:
        df_results: DataFrame with at least the columns ``original_text``,
            ``detected_lang``, ``script_type``, ``confidence`` and
            ``english_translation``.

    Returns:
        A tuple ``(lang_summary, script_summary, error_df)``: per-language
        statistics, per-script statistics, and the failed rows.
    """
    failure_pattern = 'error|not supported'
    df_results['successful_translation'] = ~df_results['english_translation'].str.contains(
        failure_pattern, case=False, na=False
    )

    total_samples = len(df_results)
    success_total = int(df_results['successful_translation'].sum())
    native_total = int((df_results['script_type'] == 'Native').sum())
    # Row count, not the (rows, columns) shape tuple.
    romanized_total = int((df_results['script_type'] == 'Romanized').sum())

    print("\n=========== OVERALL SUMMARY ===========")
    print(f"Total samples tested: {total_samples}")
    print(f"Languages detected: {df_results['detected_lang'].nunique()}")
    print(f"Native script samples: {native_total}")
    print(f"Romanized samples: {romanized_total}")
    print(f"Successfully translated: {success_total}")

    # Guard the division so an empty frame does not divide by zero.
    overall_success_rate = (success_total / total_samples * 100) if total_samples else 0.0
    print(f"Overall success rate: {overall_success_rate:.1f}%")

    print("\n=========== DETAILED LANGUAGE BREAKDOWN ===========")
    lang_summary = df_results.groupby('detected_lang').agg(
        total_samples=('original_text', 'count'),
        native_count=('script_type', lambda x: (x == 'Native').sum()),
        romanized_count=('script_type', lambda x: (x == 'Romanized').sum()),
        # Confidence is parsed as numbers; unparsable values become NaN.
        mean_confidence=('confidence', lambda x: pd.to_numeric(x, errors='coerce').mean()),
        success=('successful_translation', 'sum'),
        error_count=('successful_translation', lambda x: (~x).sum())
    ).reset_index().sort_values('total_samples', ascending=False)

    lang_summary['success_rate'] = (lang_summary['success'] / lang_summary['total_samples'] * 100).round(1)
    print(lang_summary)

    print("\n=========== TOP PERFORMING LANGUAGES ===========")
    top_performers = lang_summary[lang_summary['success_rate'] >= 90].sort_values('success_rate', ascending=False)
    if len(top_performers) > 0:
        print(top_performers[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with 90%+ success rate")

    print("\n=========== CHALLENGING LANGUAGES ===========")
    challenging = lang_summary[lang_summary['success_rate'] < 50].sort_values('success_rate')
    if len(challenging) > 0:
        print(challenging[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with <50% success rate")

    print("\n=========== ERROR ANALYSIS ===========")
    error_df = df_results[~df_results['successful_translation']]
    print(f"Total errors: {len(error_df)}")
    if len(error_df) > 0:
        print("\nError samples:")
        print(error_df[['original_text', 'detected_lang', 'script_type', 'confidence', 'english_translation']])
    else:
        print("No errors found!")

    print("\n=========== SUCCESS BREAKDOWN BY SCRIPT ===========")
    script_summary = df_results.groupby('script_type').agg(
        total_samples=('original_text', 'count'),
        successful=('successful_translation', 'sum'),
        success_rate=('successful_translation', lambda x: x.mean() * 100)
    ).round(1)
    print(script_summary)

    print("\n=========== DETECTION CONFIDENCE ANALYSIS ===========")
    confidence_summary = lang_summary[['detected_lang', 'mean_confidence']].sort_values('mean_confidence', ascending=False)
    print("Top 10 most confident detections:")
    print(confidence_summary.head(10))

    return lang_summary, script_summary, error_df
|
|
| |
# Usage hint for the summary helper.
for _hint_line in (
    "✅ Detailed summary function defined",
    "\n📋 To run on your test results:",
    " lang_summary, script_summary, error_df = detailed_translation_summary(df_results)",
    " display(lang_summary)",
    " display(error_df)",
):
    print(_hint_line)

# Build the report for the most recent results and show the two tables.
_summary_tables = detailed_translation_summary(df_results)
lang_summary, script_summary, error_df = _summary_tables

for _table in (lang_summary, error_df):
    display(_table)
|
|