"""Minimal Gradio chat UI around the ``sapientinc/HRM-Text-1B`` causal LM.

Importing this module loads the tokenizer and model and moves the model to
CUDA; running it as a script launches the Gradio chat interface.
"""

import gradio as gr
import torch
from spaces import GPU
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "sapientinc/HRM-Text-1B"

# Markers used to cut the answer out of the decoded sequence.
_ANSWER_START_MARKER = "<|object_ref_end|>"
_ANSWER_END_MARKER = "<|box_end|>"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
).cuda().eval()


def _extract_answer(decoded: str) -> str:
    """Return the text between the answer-start and answer-end markers.

    If the start marker is missing, the slice begins at the start of
    ``decoded`` (previously ``find`` returning -1 silently dropped the first
    17 characters). If the end marker is missing, the slice runs to the end
    of ``decoded``.
    """
    marker_pos = decoded.find(_ANSWER_START_MARKER)
    start = 0 if marker_pos == -1 else marker_pos + len(_ANSWER_START_MARKER)
    # Search for the end marker only after the start so that an earlier
    # occurrence can never produce an empty or inverted slice.
    end = decoded.find(_ANSWER_END_MARKER, start)
    if end == -1:
        return decoded[start:]
    return decoded[start:end]


@GPU
def generate_response(message, history):
    """Generate a single-turn reply for ``message``.

    Args:
        message: The user's latest message; it is placed in the prompt as-is.
        history: Previous chat turns as passed by ``gr.ChatInterface``.
            Currently ignored: every request is handled as a standalone
            question.

    Returns:
        The decoded text following ``<|object_ref_end|>`` up to (but not
        including) ``<|box_end|>``, or up to the end of the output when that
        marker is absent.
    """
    conversation = message

    # "synth,cot composite" condition prefix, i.e. reasoning / CoT style
    # (presumably other modes are listed in the model card; verify there).
    condition = "<|quad_end|><|object_ref_end|>"
    prompt = f"<|im_start|>{condition}{conversation} "

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Mark the prompt as a single bidirectional prefix block
    # (presumably consumed by the model's PrefixLM mask; confirm against the
    # model's remote code).
    inputs["token_type_ids"] = torch.ones_like(inputs["input_ids"])

    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=1000, do_sample=False)

    # Special tokens are kept on purpose: the markers are needed for slicing.
    full_output = tokenizer.decode(out[0], skip_special_tokens=False)
    # NOTE(review): the decoded sequence includes the prompt, so the returned
    # text appears to start with the echoed user message; confirm intended.
    return _extract_answer(full_output)


chatbot = gr.ChatInterface(
    fn=generate_response,
    title="Simple ZeroGPU Chatbot",
    description="A simple chatbot using HRM-Text for test, running on Hugging Face ZeroGPU. use only one question in any round",
    examples=[
        "Explain why the sky is blue?",
        "Introduce yourself.",
        "explain why ship float on water?",
    ],
)

if __name__ == "__main__":
    chatbot.launch()