"""Gradio front end for a raw test of the Gemma security bot.

Loads the tokenizer and model named by ``model_id`` at import time and builds
a one-textbox interface: one message in, one sampled reply out.
"""

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# The path to your specific model
model_id = "HedronCreeper/gemma-2b-security-bot"

# 1. Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 2. Load Model
# We use device_map="auto" to let the system handle memory allocation
# and torch_dtype=torch.float16 to try and keep it somewhat lean
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)


def chat_func(message: str) -> str:
    """Generate one sampled reply to ``message``.

    Args:
        message: The user's text, inserted verbatim into the training-time
            prompt template.

    Returns:
        The generated continuation only (prompt excluded), with special
        tokens removed and surrounding whitespace stripped.
    """
    # Prepare the prompt format we used in training
    prompt = f"user\n{message}\nmodel\n"

    # Tokenize input and move the tensors to wherever the model was placed
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate response; no gradients are needed for inference
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=True,
            temperature=0.7,
            top_k=50,
        )

    # For a causal LM, generate() returns the prompt tokens followed by the
    # new ones. Slice the prompt off by token count instead of splitting the
    # decoded text on "model": a reply that itself contains that word (e.g.
    # "threat model") would otherwise be truncated to whatever follows it.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


# 3. Simple Interface
demo = gr.Interface(
    fn=chat_func,
    inputs=gr.Textbox(label="Message the Security Bot"),
    outputs=gr.Textbox(label="Response"),
    title="Gemma Security Bot (Raw Test)",
)

if __name__ == "__main__":
    demo.launch()