| |
|
|
|
|
| import gradio as gr |
| from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer |
|
|
# Single checkpoint identifier used for both loads below, so the model and
# the tokenizer always come from the same pre-trained release.
_CHECKPOINT = "facebook/m2m100_1.2B"

# Both objects are created once, when this module is executed, and are then
# reused by every translation request.
tokenizer = M2M100Tokenizer.from_pretrained(_CHECKPOINT)
model = M2M100ForConditionalGeneration.from_pretrained(_CHECKPOINT)
|
|
|
|
# Blurb passed as ``description=`` to the Gradio interface. It names the
# model in use and the languages (with their M2M100 codes) that the demo
# offers; Burmese was missing from the list even though the UI offers it.
this_description = '''
Using facebook/m2m100_1.2B pre-trained model.

Burmese(my)
Chinese(zh)
English(en)
Hindi(hi)
Japanese(ja)
Sinhalese(si)
Thai(th)
Vietnamese(vi)
...

'''
|
|
| |
# Maps a human-readable language name to the language code that is handed to
# the tokenizer (``src_lang`` / ``get_lang_id``) in ``m2m_translate``.
# Some keys bundle several names for one language, separated by ';'.
lang_codes = {
    "Afrikaans": "af",
    "Amharic": "am",
    "Arabic": "ar",
    "Asturian": "ast",
    "Azerbaijani": "az",
    "Bashkir": "ba",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Breton": "br",
    "Bosnian": "bs",
    "Catalan; Valencian": "ca",
    "Cebuano": "ceb",
    "Czech": "cs",
    "Welsh": "cy",
    "Danish": "da",
    "German": "de",
    "Greek": "el",  # was misspelled "Greeek"
    "English": "en",
    "Spanish": "es",
    "Estonian": "et",
    "Persian": "fa",
    "Fulah": "ff",
    "Finnish": "fi",
    "French": "fr",
    "Western Frisian": "fy",
    "Irish": "ga",
    "Gaelic; Scottish Gaelic": "gd",
    "Galician": "gl",
    "Gujarati": "gu",
    "Hausa": "ha",
    "Hebrew": "he",
    "Hindi": "hi",
    "Croatian": "hr",
    "Haitian; Haitian Creole": "ht",
    "Hungarian": "hu",
    "Armenian": "hy",
    "Indonesian": "id",
    "Igbo": "ig",
    "Iloko": "ilo",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jv",
    "Georgian": "ka",
    "Kazakh": "kk",
    "Central Khmer": "km",
    "Kannada": "kn",
    "Korean": "ko",
    "Luxembourgish; Letzeburgesch": "lb",
    "Ganda": "lg",
    "Lingala": "ln",
    "Lao": "lo",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Malagasy": "mg",
    "Macedonian": "mk",
    "Malayalam": "ml",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Burmese": "my",
    "Nepali": "ne",
    "Dutch; Flemish": "nl",
    "Norwegian": "no",
    "Northern Sotho": "ns",
    "Occitan": "oc",
    "Oriya": "or",
    "Panjabi; Punjabi": "pa",
    "Polish": "pl",
    "Pushto": "ps",
    "Portuguese": "pt",
    "Romanian; Moldavian; Moldovan": "ro",
    "Russian": "ru",
    "Sindhi": "sd",
    "Sinhala; Sinhalese": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Albanian": "sq",
    "Serbian": "sr",
    "Swati": "ss",
    "Sundanese": "su",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Tswana": "tn",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Wolof": "wo",
    "Xhosa": "xh",
    "Yiddish": "yi",
    "Yoruba": "yo",
    "Chinese": "zh",
    "Zulu": "zu",
}
|
|
|
|
def _resolve_lang_code(language):
    """Return the language code for *language* using ``lang_codes``.

    Lookup order:
      1. exact key match (e.g. ``"Sinhala; Sinhalese"``);
      2. case-insensitive match against any of the ';'-separated names that
         make up a key (so ``"Sinhala"`` and ``"Sinhalese"`` both resolve),
         or against the code itself (e.g. ``"si"``).

    Raises:
        KeyError: if *language* is not a string or matches nothing.
    """
    if isinstance(language, str):
        if language in lang_codes:
            return lang_codes[language]
        wanted = language.strip().casefold()
        for names, code in lang_codes.items():
            aliases = (alias.strip().casefold() for alias in names.split(";"))
            if wanted == code or wanted in aliases:
                return code
    raise KeyError(f"Unsupported language: {language!r}")


def m2m_translate(Input_Text, from_lang, to_lang):
    """Translate *Input_Text* from *from_lang* to *to_lang*.

    Args:
        Input_Text: text to translate.
        from_lang: name of the source language (a key of ``lang_codes`` or
            one of the ';'-separated names inside a key).
        to_lang: name of the target language, same rules as *from_lang*.

    Returns:
        The first decoded sequence produced by ``model.generate``, or an
        empty string when *Input_Text* is empty or only whitespace.

    Raises:
        KeyError: if either language name cannot be resolved.
    """
    # Nothing to translate: skip tokenization and generation entirely.
    if not Input_Text or not Input_Text.strip():
        return ""

    # Resolve both codes before touching the shared tokenizer, so an unknown
    # target language cannot leave ``tokenizer.src_lang`` half-updated.
    source_code = _resolve_lang_code(from_lang)
    target_code = _resolve_lang_code(to_lang)

    tokenizer.src_lang = source_code
    encoded_input = tokenizer(Input_Text, return_tensors="pt")

    generated_tokens = model.generate(
        **encoded_input,
        max_new_tokens=200,
        # The target language is selected by forcing its language id as the
        # first generated token.
        forced_bos_token_id=tokenizer.get_lang_id(target_code),
    )

    decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return decoded[0]
|
|
|
|
# Languages offered by both radio groups. The selected string is passed
# unchanged to ``m2m_translate``, which uses it as a key into ``lang_codes``,
# so every entry here must be an actual key of that table. The former
# 'Sinhala' entry was not ('Sinhala; Sinhalese' is), which made that choice
# fail with a KeyError.
_UI_LANGUAGES = [
    'Burmese',
    'Chinese',
    'English',
    'Hindi',
    'Japanese',
    'Sinhala; Sinhalese',
    'Thai',
    'Vietnamese',
]

# Web UI: one text box plus source/target language selectors feeding
# ``m2m_translate``; the translated string is shown as plain text.
iface = gr.Interface(
    fn=m2m_translate,
    title="M2M100 Text Translation",
    description=this_description,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter text", label="Text input"),
        gr.Radio(
            choices=list(_UI_LANGUAGES),
            value='Vietnamese',
            label='From language',
        ),
        gr.Radio(
            choices=list(_UI_LANGUAGES),
            value='English',
            label='To language',
        ),
    ],
    outputs="text",
)

iface.launch()
|
|