import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from spaces import GPU


# Hugging Face Hub repository that the tokenizer and the model are loaded from.
model_id = "sapientinc/HRM-Text-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the weights in bfloat16. trust_remote_code=True allows transformers to
# execute the model code shipped with the repository.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    dtype=torch.bfloat16,
)
# Move the model to the CUDA device, then switch it to evaluation mode.
model = model.cuda()
model = model.eval()


# Mark the prompt as a single bidirectional prefix block — see "PrefixLM mask" below.


@GPU
def generate_response(message, history):
    """Generate a single-turn reply to ``message`` with the loaded model.

    Args:
        message: Text of the current user turn; inserted verbatim into the prompt.
        history: Earlier turns passed in by ``gr.ChatInterface``. Unused: only
            the current message is sent to the model.

    Returns:
        The decoded text that follows the ``<|object_ref_end|>`` condition
        marker, cut at the first ``<|box_end|>`` after it. If no
        ``<|box_end|>`` is present, everything after the condition marker is
        returned. If the condition marker itself is missing from the decoded
        text, the text is taken from its beginning instead.
    """
    condition_end_marker = "<|object_ref_end|>"
    reply_end_marker = "<|box_end|>"

    # Condition tokens placed right after <|im_start|>. The original author's
    # note labels this combination "synth,cot composite" (reasoning / CoT style).
    condition = "<|quad_end|><|object_ref_end|>"
    # The app is single-turn (see the interface description), so `history` is
    # deliberately not folded into the prompt.
    prompt = f"<|im_start|>{condition}{message} "

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Mark the whole prompt as a single bidirectional prefix block (PrefixLM mask).
    inputs["token_type_ids"] = torch.ones_like(inputs["input_ids"])

    # Gradients are not needed for generation; sampling is disabled.
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=1000, do_sample=False)
    # Special tokens are kept because the markers below are located in the text.
    full_output = tokenizer.decode(out[0], skip_special_tokens=False)

    # Drop everything up to and including the condition marker. partition()
    # reports whether the marker was found, so a missing marker no longer
    # produces a bogus start offset that chops arbitrary leading characters.
    # NOTE(review): the decoded sequence appears to include the prompt, so the
    # returned text likely starts with the echoed message — confirm that this
    # is intended.
    _, found_marker, after_condition = full_output.partition(condition_end_marker)
    reply = after_condition if found_marker else full_output

    # Keep only the text before the first end-of-reply marker; when the marker
    # is absent, partition() returns the whole string as its first element.
    return reply.partition(reply_end_marker)[0]
# Chat UI: each submitted message is answered by generate_response.
chatbot = gr.ChatInterface(
    fn=generate_response,
    examples=[
        "Explain why the sky is blue?",
        "Introduce yourself.",
        "explain why ship float on water?",
    ],
    description="A simple chatbot using HRM-Text for test, running on Hugging Face ZeroGPU. use only one question in any round",
    title="Simple ZeroGPU Chatbot",
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    chatbot.launch()