File size: 2,106 Bytes
3912358
 
 
 
 
 
 
 
 
9fbbe42
3912358
 
 
9fbbe42
3912358
 
 
 
 
9fbbe42
 
3912358
 
 
 
 
 
 
 
9fbbe42
3912358
 
 
 
 
 
 
 
 
 
9fbbe42
 
 
3912358
 
 
 
 
 
 
 
 
 
 
9fbbe42
3912358
 
9fbbe42
 
3912358
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

# Hugging Face Hub repository of the checkpoint this app serves
# (DeepSeek-R1-Distill-Qwen-1.5B).
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# The tokenizer and the weights are loaded once, at import time, and are then
# shared by every chat request.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# bfloat16 weights are requested to roughly halve RAM use and avoid
# out-of-memory crashes; device placement is left to device_map="auto".
_MODEL_LOAD_OPTIONS = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "low_cpu_mem_usage": True,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **_MODEL_LOAD_OPTIONS)

def _content_to_text(content):
    """Flatten one chat message payload into plain text."""
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        # Structured part such as {"type": "text", "text": "..."}; parts that
        # carry no text field (e.g. file attachments) contribute nothing.
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        pieces = (_content_to_text(part) for part in content)
        return "\n".join(piece for piece in pieces if piece)
    return str(content)


def _history_to_messages(history):
    """Convert chat history (role/content dicts or (user, assistant) pairs) into text-only messages."""
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # "messages" format: one dict per message.
            entries = [(turn.get("role"), turn.get("content"))]
        else:
            # Legacy "tuples" format: one (user, assistant) pair per exchange.
            user_msg, assistant_msg = turn
            entries = [("user", user_msg), ("assistant", assistant_msg)]
        for role, content in entries:
            text = _content_to_text(content)
            # Skip empty turns and any roles other than user/assistant so the
            # chat template only ever sees well-formed conversation messages.
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
    return messages


def generate_response(message, history):
    """Stream the model's reply to ``message`` given the prior ``history``.

    Args:
        message: The newest user message. Normally a string; a dict with a
            "text" field is also accepted.
        history: Earlier conversation turns, either as a list of
            ``{"role": ..., "content": ...}`` dicts or as a list of
            ``(user_message, assistant_message)`` pairs. ``None`` or an empty
            list means a fresh conversation.

    Yields:
        The reply accumulated so far, growing with each decoded chunk, so the
        caller can re-render the full partial answer on every update.
    """
    # System prompt to keep the model focused
    system_prompt = "You are DeepSeek-R1, a helpful assistant. Use the <think> tags to show your reasoning."

    # Build conversation with history
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": _content_to_text(message)})

    # Prepare the input
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The rendered template already contains whatever special tokens it needs;
    # tokenizing with the default add_special_tokens=True could prepend a
    # second BOS token, so it is disabled here.
    inputs = tokenizer([input_text], return_tensors="pt", add_special_tokens=False).to(model.device)

    # Setup streaming
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=512,
        temperature=0.6,
        repetition_penalty=1.1,
        do_sample=True,
    )

    # Run generation in a worker thread so this generator can yield text
    # while tokens are still being produced.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    # The streamer is exhausted, so generation is finishing; wait for the
    # worker to exit so no thread is left dangling after the reply completes.
    thread.join()

# Chat UI backed by generate_response. No 'theme' argument is passed to the
# constructor, to avoid Gradio 6 errors.
demo = gr.ChatInterface(
    generate_response,
    description="Streaming enabled. Watch it think!",
    title="DeepSeek-R1 (1.5B) - Smart Slow AI",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()