import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Identifier of the fine-tuned checkpoint handed to from_pretrained().
model_id = "HedronCreeper/gemma-2b-security-bot"

# Step 1: tokenizer belonging to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Step 2: model weights.
# float16 is requested to keep the memory footprint down, and
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)

def chat_func(message: str) -> str:
    """Generate a single-turn reply from the model for *message*.

    The message is wrapped in the turn markers used during training,
    a continuation is sampled with ``model.generate``, and only the newly
    generated text is returned.

    Args:
        message: The user's text as entered in the UI.

    Returns:
        The model's reply with special tokens removed and surrounding
        whitespace stripped.
    """
    # Prepare the prompt format we used in training
    prompt = f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"

    # Tokenize input and move the tensors to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate response; gradients are not needed for inference
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=True,
            temperature=0.7,
            top_k=50,
        )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # position instead of splitting the decoded text on the word "model":
    # a textual split truncates any reply that itself contains "model"
    # (e.g. "threat model").
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Step 3: minimal UI -- one text input wired to chat_func, one text output.
demo = gr.Interface(
    title="Gemma Security Bot (Raw Test)",
    fn=chat_func,
    inputs=gr.Textbox(label="Message the Security Bot"),
    outputs=gr.Textbox(label="Response"),
)

# Launch the interface only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()