"""Gradio chat demo that streams replies from SmolLM2-135M-Instruct."""

from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Load model and tokenizer once at import time so every request reuses them.
model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Strict system prompt to keep the small model grounded.
SYSTEM_PROMPT = (
    "You are a helpful, very brief assistant. "
    "Do not imagine stories or contexts. "
    "Answer only what is asked."
)


def _content_to_text(content):
    """Return the plain-text part of a chat message's content, or ""."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): some Gradio versions appear to deliver content as a
        # list of typed parts such as {"type": "text", "text": "..."} --
        # confirm against the installed version. Only textual parts are kept.
        return "".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    # None (pending reply), file tuples, components, etc. carry no text.
    return ""


def _history_to_messages(history):
    """Convert Gradio chat history into a list of role/content dicts.

    Accepts both history layouts: ``(user, assistant)`` pairs and
    openai-style ``{"role": ..., "content": ...}`` dicts. Turns without
    textual content are skipped.
    """
    turns = []
    for entry in history or []:
        if isinstance(entry, dict):
            # Messages-style history: one dict per turn.
            role = entry.get("role")
            text = _content_to_text(entry.get("content"))
            if role in ("user", "assistant") and text:
                turns.append({"role": role, "content": text})
        else:
            # Tuple-style history: one (user, assistant) pair per exchange.
            user_msg, assistant_msg = entry
            for role, content in (("user", user_msg), ("assistant", assistant_msg)):
                text = _content_to_text(content)
                if text:
                    turns.append({"role": role, "content": text})
    return turns


def generate_response(message, history):
    """Stream the assistant's reply to ``message``.

    Args:
        message: The latest user message.
        history: Previous conversation turns as supplied by
            ``gr.ChatInterface`` (pair-style or messages-style).

    Yields:
        The reply accumulated so far, growing with each generated chunk.
    """
    # Build chat format: system prompt, prior turns, then the new message.
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    # Convert to the model's specific prompt format.
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer([input_text], return_tensors="pt").to(model.device)

    # The streamer hands decoded text from the generation thread to this one.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=150,  # Keep responses short to prevent yapping
        temperature=0.3,  # Low temp = more "sane"
        repetition_penalty=1.2,
        do_sample=True,
    )

    # generate() blocks until done, so run it in a separate thread while this
    # generator consumes the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    try:
        # Yield the text as it comes in.
        partial_text = ""
        for new_text in streamer:
            partial_text += new_text
            yield partial_text
    finally:
        # Wait for the worker even if the consumer stops early, so the next
        # request does not overlap a still-running generate() call.
        thread.join()


demo = gr.ChatInterface(
    fn=generate_response,
    title="Actually Fast AI",
    description="SmolLM2 135M with Streaming. No more imaginary stories!",
)

if __name__ == "__main__":
    demo.launch()