Spaces:

Eeppa
/

VerySlowAISmartedition

Running

App Files Community

Eeppa committed 11 days ago

Commit

9fbbe42

verified ·

1 Parent(s): 1a31472

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -16

app.py CHANGED Viewed

@@ -7,18 +7,18 @@ from threading import Thread
 model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # Load tokenizer and model
-# Using low_cpu_mem_usage to stay within Space limits
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    torch_dtype="auto",
     device_map="auto",
     low_cpu_mem_usage=True
 )
 def generate_response(message, history):
-    # DeepSeek works best with a clear instruction
-    system_prompt = "You are DeepSeek-R1, a helpful assistant. Keep your answers direct and avoid hallucinating outside contexts."
     # Build conversation with history
     messages = [{"role": "system", "content": system_prompt}]
@@ -27,7 +27,7 @@ def generate_response(message, history):
         messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
-    # Prepare the input using the official template
     input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
@@ -38,8 +38,9 @@ def generate_response(message, history):
         inputs,
         streamer=streamer,
         max_new_tokens=512,
-        temperature=0.6,      # DeepSeek R1 recommends 0.5-0.7
-        repetition_penalty=1.1 # Prevents the "yapping" loop
     )
     # Run in a thread so the UI doesn't freeze
@@ -49,19 +50,13 @@ def generate_response(message, history):
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
-        # DeepSeek-R1 often uses <think> tags. We'll leave them in so you see the 'reasoning'.
         yield partial_text
-# Create the Interface
 demo = gr.ChatInterface(
     fn=generate_response,
-    title="DeepSeek-R1 (1.5B) - The Smarter Slow AI",
-    description="Now streaming! Watch it 'think' before it speaks. Still slow, but way smarter."
-)
-if __name__ == "__main__":
-    # We apply the theme here instead, or just leave it default
-    demo.launch()
 )
 if __name__ == "__main__":

 model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 # Load tokenizer and model
+# Using bfloat16 to save 50% RAM and avoid crashes
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
+    torch_dtype=torch.bfloat16,
     device_map="auto",
     low_cpu_mem_usage=True
 )
 def generate_response(message, history):
+    # System prompt to keep the model focused
+    system_prompt = "You are DeepSeek-R1, a helpful assistant. Use the <think> tags to show your reasoning."
     # Build conversation with history
     messages = [{"role": "system", "content": system_prompt}]
         messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
+    # Prepare the input
     input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
         inputs,
         streamer=streamer,
         max_new_tokens=512,
+        temperature=0.6,
+        repetition_penalty=1.1,
+        do_sample=True
     )
     # Run in a thread so the UI doesn't freeze
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
         yield partial_text
+# Create the Interface (No 'theme' argument to avoid Gradio 6 errors)
 demo = gr.ChatInterface(
     fn=generate_response,
+    title="DeepSeek-R1 (1.5B) - Smart Slow AI",
+    description="Streaming enabled. Watch it think!"
 )
 if __name__ == "__main__":