"""Gradio chat UI that streams replies from DeepSeek-R1-Distill-Qwen-1.5B."""

from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Model ID for DeepSeek-R1-Distill-Qwen-1.5B
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Load tokenizer and model once, at import time.
# bfloat16 is requested to reduce memory use compared with full precision.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True,
)

# System prompt to keep the model focused.
SYSTEM_PROMPT = (
    "You are DeepSeek-R1, a helpful assistant. "
    "Use the tags to show your reasoning."
)


def _content_to_text(content):
    """Flatten one chat-message ``content`` value into plain text.

    Accepts a plain string, ``None``, a dict carrying a ``"text"`` entry, or
    a list of such parts. Anything that is not text is dropped (returns
    ``""``) because the model behind this app only consumes text.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        return "".join(_content_to_text(part) for part in content)
    return ""


def _history_to_messages(history):
    """Convert the UI's chat history into role/content message dicts.

    Two history shapes are accepted, so the app does not depend on which one
    the installed Gradio version delivers
    (NOTE(review): confirm the shape for the Gradio version in use):

    * a sequence of ``(user_msg, assistant_msg)`` pairs, or
    * a sequence of ``{"role": ..., "content": ...}`` dicts.

    Turns without usable text (for example a pending ``None`` reply) are
    skipped rather than forwarded to the chat template.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            text = _content_to_text(turn.get("content"))
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
            continue

        user_msg, assistant_msg = turn
        for role, content in (("user", user_msg), ("assistant", assistant_msg)):
            text = _content_to_text(content)
            if text:
                messages.append({"role": role, "content": text})
    return messages


def _run_generation(generation_kwargs, streamer, errors):
    """Run ``model.generate`` in a worker thread and never leave the reader hanging.

    If generation raises, the exception is stored in ``errors`` and the
    streamer is ended explicitly so the consuming loop terminates instead of
    blocking forever on an empty queue.
    """
    try:
        model.generate(**generation_kwargs)
    except Exception as exc:  # thread boundary: hand the error to the caller
        errors.append(exc)
        streamer.end()


def generate_response(message, history):
    """Stream the assistant's reply to ``message`` given the prior ``history``.

    Args:
        message: The new user message text.
        history: Earlier turns of the conversation, as supplied by
            ``gr.ChatInterface`` (see ``_history_to_messages`` for the
            accepted shapes).

    Yields:
        The reply text accumulated so far, growing with each streamed chunk.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has ended.
    """
    # Build conversation with history
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    # Prepare the input
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer([input_text], return_tensors="pt").to(model.device)

    # Setup streaming
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=512,
        temperature=0.6,
        repetition_penalty=1.1,
        do_sample=True,
    )

    # Run in a thread so the UI doesn't freeze
    errors = []
    thread = Thread(
        target=_run_generation, args=(generation_kwargs, streamer, errors)
    )
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    thread.join()
    if errors:
        raise errors[0]


# Create the Interface (No 'theme' argument to avoid Gradio 6 errors)
demo = gr.ChatInterface(
    fn=generate_response,
    title="DeepSeek-R1 (1.5B) - Smart Slow AI",
    description="Streaming enabled. Watch it think!",
)

if __name__ == "__main__":
    demo.launch()