Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer | |
| import torch | |
| from threading import Thread | |
# Checkpoint identifier; the tokenizer and the model weights are both loaded
# from this same id so they stay in sync.
model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Loaded once at import time and shared by every chat request.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
def _content_to_text(content):
    """Return the plain text carried by one chat entry, or None if it has none."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # NOTE(review): newer Gradio releases appear to pass content as a list
        # of {"type": "text", "text": ...} parts -- confirm against the
        # installed version. Parts without a string "text" field are ignored.
        parts = [
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        ]
        return "".join(parts) if parts else None
    return None


def _history_to_messages(history):
    """Normalize chat history into a list of {"role", "content"} dicts.

    Accepts either (user, assistant) pairs or role/content dicts. Entries
    without text content, or with a role other than user/assistant, are
    skipped.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            turns = [(entry.get("role"), entry.get("content"))]
        else:
            user_msg, assistant_msg = entry
            turns = [("user", user_msg), ("assistant", assistant_msg)]
        for role, content in turns:
            text = _content_to_text(content)
            if role in ("user", "assistant") and text is not None:
                messages.append({"role": role, "content": text})
    return messages


def generate_response(message, history):
    """Stream the model's reply to ``message`` given the prior conversation.

    Args:
        message: The latest user message.
        history: Previous turns, either as (user, assistant) pairs or as
            dicts with "role" and "content" keys.

    Yields:
        The reply text accumulated so far, growing with each streamed chunk.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has ended.
    """
    # Strict system prompt to keep it grounded
    system_prompt = (
        "You are a helpful, very brief assistant. "
        "Do not imagine stories or contexts. "
        "Answer only what is asked."
    )

    # Build chat format: system prompt, prior turns, then the new message.
    messages = [
        {"role": "system", "content": system_prompt},
        *_history_to_messages(history),
        {"role": "user", "content": message},
    ]

    # Convert to the model's specific format. The template already emits the
    # special tokens, so the tokenizer must not add them a second time.
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer([input_text], return_tensors="pt", add_special_tokens=False)

    # Set up the streamer
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=150,  # Keep responses short to prevent yapping
        temperature=0.3,  # Low temp = more "sane"
        repetition_penalty=1.2,
        do_sample=True,
    )

    errors = []

    def _run_generation():
        # If generate() fails, the streamer never receives its end-of-stream
        # signal and the consumer loop below would block forever. End the
        # stream explicitly and keep the exception for the caller.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            errors.append(exc)
            streamer.end()

    # Run generation in a separate thread so chunks can be yielded as they arrive.
    thread = Thread(target=_run_generation)
    thread.start()

    # Yield the text as it comes in
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    # The stream has ended, so the worker is finishing; reap it and surface
    # any generation error instead of silently returning a truncated reply.
    thread.join()
    if errors:
        raise errors[0]
# Chat UI that calls generate_response for every user message.
demo = gr.ChatInterface(
    generate_response,
    title="Actually Fast AI",
    description="SmolLM2 135M with Streaming. No more imaginary stories!",
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()