Eeppa committed on
Commit
9fbbe42
·
verified ·
1 Parent(s): 1a31472

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -16
app.py CHANGED
@@ -7,18 +7,18 @@ from threading import Thread
7
  model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
8
 
9
  # Load tokenizer and model
10
- # Using low_cpu_mem_usage to stay within Space limits
11
  tokenizer = AutoTokenizer.from_pretrained(model_id)
12
  model = AutoModelForCausalLM.from_pretrained(
13
  model_id,
14
- torch_dtype="auto",
15
  device_map="auto",
16
  low_cpu_mem_usage=True
17
  )
18
 
19
  def generate_response(message, history):
20
- # DeepSeek works best with a clear instruction
21
- system_prompt = "You are DeepSeek-R1, a helpful assistant. Keep your answers direct and avoid hallucinating outside contexts."
22
 
23
  # Build conversation with history
24
  messages = [{"role": "system", "content": system_prompt}]
@@ -27,7 +27,7 @@ def generate_response(message, history):
27
  messages.append({"role": "assistant", "content": assistant_msg})
28
  messages.append({"role": "user", "content": message})
29
 
30
- # Prepare the input using the official template
31
  input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
32
  inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
33
 
@@ -38,8 +38,9 @@ def generate_response(message, history):
38
  inputs,
39
  streamer=streamer,
40
  max_new_tokens=512,
41
- temperature=0.6, # DeepSeek R1 recommends 0.5-0.7
42
- repetition_penalty=1.1 # Prevents the "yapping" loop
 
43
  )
44
 
45
  # Run in a thread so the UI doesn't freeze
@@ -49,19 +50,13 @@ def generate_response(message, history):
49
  partial_text = ""
50
  for new_text in streamer:
51
  partial_text += new_text
52
- # DeepSeek-R1 often uses <think> tags. We'll leave them in so you see the 'reasoning'.
53
  yield partial_text
54
 
55
- # Create the Interface
56
  demo = gr.ChatInterface(
57
  fn=generate_response,
58
- title="DeepSeek-R1 (1.5B) - The Smarter Slow AI",
59
- description="Now streaming! Watch it 'think' before it speaks. Still slow, but way smarter."
60
- )
61
-
62
- if __name__ == "__main__":
63
- # We apply the theme here instead, or just leave it default
64
- demo.launch()
65
  )
66
 
67
  if __name__ == "__main__":
 
7
  model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
8
 
9
  # Load tokenizer and model
10
+ # Using bfloat16 to save 50% RAM and avoid crashes
11
  tokenizer = AutoTokenizer.from_pretrained(model_id)
12
  model = AutoModelForCausalLM.from_pretrained(
13
  model_id,
14
+ torch_dtype=torch.bfloat16,
15
  device_map="auto",
16
  low_cpu_mem_usage=True
17
  )
18
 
19
  def generate_response(message, history):
20
+ # System prompt to keep the model focused
21
+ system_prompt = "You are DeepSeek-R1, a helpful assistant. Use the <think> tags to show your reasoning."
22
 
23
  # Build conversation with history
24
  messages = [{"role": "system", "content": system_prompt}]
 
27
  messages.append({"role": "assistant", "content": assistant_msg})
28
  messages.append({"role": "user", "content": message})
29
 
30
+ # Prepare the input
31
  input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
32
  inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
33
 
 
38
  inputs,
39
  streamer=streamer,
40
  max_new_tokens=512,
41
+ temperature=0.6,
42
+ repetition_penalty=1.1,
43
+ do_sample=True
44
  )
45
 
46
  # Run in a thread so the UI doesn't freeze
 
50
  partial_text = ""
51
  for new_text in streamer:
52
  partial_text += new_text
 
53
  yield partial_text
54
 
55
+ # Create the Interface (No 'theme' argument to avoid Gradio 6 errors)
56
  demo = gr.ChatInterface(
57
  fn=generate_response,
58
+ title="DeepSeek-R1 (1.5B) - Smart Slow AI",
59
+ description="Streaming enabled. Watch it think!"
 
 
 
 
 
60
  )
61
 
62
  if __name__ == "__main__":