import gc import gradio as gr import spaces import torch from transformers import AutoModelForCausalLM, AutoTokenizer current_model_id = None tokenizer = None model = None LANGUAGES = [ "Arabic", "Bengali", "Burmese", "Cantonese", "Chinese", "Czech", "Dutch", "English", "Filipino", "French", "German", "Gujarati", "Hebrew", "Hindi", "Indonesian", "Italian", "Japanese", "Kazakh", "Khmer", "Korean", "Malay", "Marathi", "Mongolian", "Persian", "Polish", "Portuguese", "Russian", "Spanish", "Tamil", "Telugu", "Thai", "Tibetan", "Traditional Chinese", "Turkish", "Ukrainian", "Urdu", "Uyghur", "Vietnamese" ] MODELS = [ "tencent/Hy-MT2-30B-A3B", "tencent/Hy-MT2-7B", "tencent/Hy-MT2-1.8B" ] def load_model(model_id): global current_model_id, tokenizer, model if current_model_id == model_id: return print(f"Switching model from {current_model_id} to {model_id}...") if model is not None: del model del tokenizer gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() print(f"Loading tokenizer for {model_id}...") tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) print(f"Loading model {model_id}...") model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True, ) model.eval() current_model_id = model_id print("Model loaded successfully.") @spaces.GPU def translate(source_text, target_lang, selected_model): global current_model_id, tokenizer, model if not source_text.strip(): return "" try: load_model(selected_model) prompt = f"Translate the following text into {target_lang}. Note that you should only output the translated result without any additional explanation:\n\n{source_text}" messages = [{"role": "user", "content": prompt}] inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device) if "30B" in selected_model: gen_kwargs = { "temperature": 0.7, "top_p": 1.0, "repetition_penalty": 1.0, } else: gen_kwargs = { "temperature": 0.7, "top_p": 0.6, "top_k": 20, "repetition_penalty": 1.05, } with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=4096, **gen_kwargs ) response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True) return response except Exception as e: return f"Error during generation: {str(e)}\n\n(Note: Zero GPU environments may timeout or run out of memory when loading large models dynamically.)" with gr.Blocks(title="Hy-MT2 Translator") as demo: gr.Markdown("# Hy-MT2 Translator") gr.Markdown("https://huggingface.co/collections/tencent/hy-mt2") with gr.Row(): with gr.Column(): source_text = gr.Textbox(label="Source Text", lines=8, placeholder="Enter text to translate...") with gr.Row(): target_lang = gr.Dropdown(choices=LANGUAGES, value="English", label="Target Language") model_selector = gr.Dropdown(choices=MODELS, value="tencent/Hy-MT2-1.8B", label="Model") translate_btn = gr.Button("Translate", variant="primary") with gr.Column(): output_text = gr.Textbox(label="Translated Text", lines=12, interactive=False) translate_btn.click( fn=translate, inputs=[source_text, target_lang, model_selector], outputs=output_text ) if __name__ == "__main__": demo.launch()