wangsheng committed on
Commit
a151dea
·
verified ·
1 Parent(s): 058be44

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +423 -0
app.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## 2. Hugging Face Gradio Demo (app.py)
3
+
4
+ # ```python  (stray Markdown code fence, commented out: a bare fence is a syntax error in a .py file)
5
+ # app.py
6
+ import gradio as gr
7
+ import torch
8
+ from transformers import AutoTokenizer, AutoModelForCausalLM
9
+ import os
10
+ from pathlib import Path
11
+ import json
12
+ import time
13
+
14
# Configuration
MODEL_NAME = "deepseek-ai/DeepSeek-V4-Pro"  # repository id passed to from_pretrained()
MODEL_CACHE_DIR = "./model_cache"  # passed as cache_dir when loading model and tokenizer
MAX_CONTEXT_LENGTH = 1_000_000  # 1M tokens; longer prompts are rejected before generation
DEFAULT_MAX_TOKENS = 2048
DEFAULT_TEMPERATURE = 0.7

# Model and tokenizer will be loaded lazily (populated by load_model() on first use)
model = None
tokenizer = None
24
+
25
def load_model():
    """Load the model and tokenizer on first use and cache them in module globals.

    Subsequent calls return the cached objects without reloading.

    Returns:
        tuple: ``(model, tokenizer)``.

    Raises:
        Exception: whatever ``from_pretrained`` raises; in that case the module
            globals are left untouched so a later call retries from scratch.
    """
    global model, tokenizer

    # Both objects are needed by callers, so (re)load if either one is missing.
    if model is None or tokenizer is None:
        print("Loading tokenizer...")
        # NOTE: trust_remote_code=True executes Python code shipped in the model
        # repository; only use it with repositories you trust.
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            cache_dir=MODEL_CACHE_DIR,
            trust_remote_code=True,
        )

        print("Loading model... This may take several minutes...")
        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            cache_dir=MODEL_CACHE_DIR,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )

        # Publish only after both loads succeeded so the globals are never
        # left half-initialised by a failed model load.
        tokenizer = loaded_tokenizer
        model = loaded_model
        print("Model loaded successfully!")

    return model, tokenizer
49
+
50
def generate_response(
    message,
    history,
    thinking_mode="Think High",
    max_tokens=DEFAULT_MAX_TOKENS,
    temperature=DEFAULT_TEMPERATURE,
    top_p=1.0,
    top_k=50,
    system_prompt="",
):
    """Generate a single (non-streaming) reply from the model.

    Args:
        message: The new user message.
        history: Previous turns as a sequence of ``[user_text, assistant_text]``
            pairs; a falsy ``assistant_text`` is skipped.
        thinking_mode: One of ``"Non-think"``, ``"Think High"``, ``"Think Max"``.
        max_tokens: Upper bound on newly generated tokens. ``"Think Max"``
            doubles it, capped at 32768.
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k cutoff (used only when sampling).
        system_prompt: Optional system message placed first in the conversation.

    Returns:
        str: The generated text followed by a footer with timing, token count
        and temperature.

    Raises:
        gr.Error: If the reasoning mode is unknown, the prompt exceeds
            ``MAX_CONTEXT_LENGTH`` tokens, or generation fails.
    """
    # Map thinking mode to model format
    thinking_mode_map = {
        "Non-think": "non_thinking",
        "Think High": "thinking",
        "Think Max": "thinking_max",
    }
    # Validate before the (potentially very slow) model load so a bad value
    # gives a clear message instead of a wrapped KeyError.
    if thinking_mode not in thinking_mode_map:
        raise gr.Error(f"Unknown reasoning mode: {thinking_mode!r}")

    # Load model if not loaded
    model, tokenizer = load_model()

    # Build conversation: optional system prompt, past turns, then the new message
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    for turn in history:
        messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": message})

    try:
        # Try to use the custom encoding if available
        try:
            from encoding_dsv4 import encode_messages
        except ImportError:
            # Fallback: simple "Role: text" concatenation ending with an open
            # assistant turn for the model to complete.
            role_labels = {"system": "System", "user": "User", "assistant": "Assistant"}
            prompt = "".join(
                f"{role_labels[msg['role']]}: {msg['content']}\n\n" for msg in messages
            )
            prompt += "Assistant: "
        else:
            prompt = encode_messages(
                messages,
                thinking_mode=thinking_mode_map[thinking_mode],
            )

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt")

        # Check context length
        input_length = inputs["input_ids"].shape[1]
        if input_length > MAX_CONTEXT_LENGTH:
            raise gr.Error(f"Input too long: {input_length} tokens. Maximum: {MAX_CONTEXT_LENGTH}")

        # Put the inputs where the model expects them. The model is loaded with
        # device_map="auto", so follow model.device rather than assuming CUDA.
        inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

        # For Think Max mode, allow a larger generation budget
        max_new_tokens = int(max_tokens)
        if thinking_mode == "Think Max":
            max_new_tokens = min(max_new_tokens * 2, 32768)

        do_sample = temperature > 0
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            # Some tokenizers define no pad token; fall back to EOS for padding.
            pad_token_id = tokenizer.eos_token_id

        generation_config = {
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
            "pad_token_id": pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
        }
        if do_sample:
            # Sampling knobs are only meaningful (and only valid) when sampling;
            # temperature=0 must not be forwarded to generate().
            generation_config.update(
                temperature=temperature,
                top_p=top_p,
                top_k=int(top_k),
            )

        # Generate response (blocking call; the full reply is returned at once)
        start_time = time.perf_counter()
        outputs = model.generate(**inputs, **generation_config)

        # Decode only the newly generated tokens. Slicing the decoded string by
        # len(prompt) is unreliable because decoding with skip_special_tokens
        # need not reproduce the prompt text character for character.
        new_token_ids = outputs[0][input_length:]
        response = tokenizer.decode(new_token_ids, skip_special_tokens=True)
        generation_time = time.perf_counter() - start_time

        # Add generation info
        response += f"\n\n---\n⚡ Generated in {generation_time:.2f}s | 📊 {len(new_token_ids)} tokens | 🌡️ Temperature: {temperature}"

        return response

    except gr.Error:
        # Already a user-facing error (e.g. "Input too long"); do not re-wrap it.
        raise
    except Exception as e:
        raise gr.Error(f"Generation failed: {str(e)}") from e
154
+
155
def clear_chat():
    """Clear chat history.

    Returns:
        tuple: ``(None, None)`` — one reset value for each of two outputs.
    """
    return None, None
158
+
159
# Create the Gradio interface
with gr.Blocks(
    title="DeepSeek-V4 Demo",
    theme=gr.themes.Soft(),
    css="""
    .deepseek-header {
        text-align: center;
        margin-bottom: 20px;
    }
    .deepseek-header h1 {
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5em;
    }
    .model-info {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 20px;
        border-radius: 10px;
        margin-bottom: 20px;
    }
    .benchmark-grid {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
        gap: 10px;
        margin: 10px 0;
    }
    .benchmark-item {
        background: rgba(255,255,255,0.1);
        padding: 10px;
        border-radius: 5px;
        text-align: center;
    }
    """,
) as demo:
    gr.HTML("""
    <div class="deepseek-header">
        <h1>🚀 DeepSeek-V4</h1>
        <p>Towards Highly Efficient Million-Token Context Intelligence</p>
    </div>
    """)

    # The message box is created up front, unrendered, so that the example
    # prompts in the left column can target it. It is placed into the layout
    # further down with message.render(). (Previously the examples were built
    # before `message` existed, so they were never connected to any input.)
    message = gr.Textbox(
        label="Your Message",
        placeholder="Type your message here... (Shift+Enter for new line, Enter to send)",
        lines=3,
        scale=9,
        render=False,
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Model info panel
            gr.HTML("""
            <div class="model-info">
                <h3>📊 Model Specifications</h3>
                <div class="benchmark-grid">
                    <div class="benchmark-item">
                        <b>1.6T</b><br>Total Parameters
                    </div>
                    <div class="benchmark-item">
                        <b>49B</b><br>Activated Parameters
                    </div>
                    <div class="benchmark-item">
                        <b>1M</b><br>Context Length
                    </div>
                    <div class="benchmark-item">
                        <b>32T+</b><br>Training Tokens
                    </div>
                </div>

                <h3>🎯 Key Benchmarks</h3>
                <div class="benchmark-grid">
                    <div class="benchmark-item">
                        <b>93.5</b><br>LiveCodeBench
                    </div>
                    <div class="benchmark-item">
                        <b>3206</b><br>Codeforces Rating
                    </div>
                    <div class="benchmark-item">
                        <b>87.5</b><br>MMLU-Pro
                    </div>
                    <div class="benchmark-item">
                        <b>80.6%</b><br>SWE Verified
                    </div>
                </div>

                <h3>💡 Innovation Highlights</h3>
                <ul>
                    <li>Hybrid Attention (CSA + HCA)</li>
                    <li>Manifold-Constrained Hyper-Connections</li>
                    <li>Muon Optimizer</li>
                    <li>Two-stage Post-training</li>
                    <li>FP4 + FP8 Mixed Precision</li>
                </ul>
            </div>
            """)

            # Configuration panel
            with gr.Group():
                gr.Markdown("### ⚙️ Configuration")

                thinking_mode = gr.Radio(
                    choices=["Non-think", "Think High", "Think Max"],
                    value="Think High",
                    label="Reasoning Mode",
                    info="Non-think: Fast responses | Think High: Careful analysis | Think Max: Maximum reasoning",
                )

                system_prompt = gr.Textbox(
                    label="System Prompt",
                    placeholder="Enter system instructions...",
                    lines=3,
                    value="You are DeepSeek-V4, an advanced AI assistant with strong reasoning capabilities. Provide accurate and helpful responses.",
                )

                with gr.Accordion("Advanced Parameters", open=False):
                    max_tokens = gr.Slider(
                        minimum=64,
                        maximum=32768,
                        value=2048,
                        step=64,
                        label="Max Tokens",
                        info="Maximum number of tokens to generate",
                    )

                    temperature = gr.Slider(
                        minimum=0.0,
                        maximum=2.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                        info="Higher values = more creative, lower = more focused",
                    )

                    top_p = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=1.0,
                        step=0.05,
                        label="Top P",
                    )

                    top_k = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=50,
                        step=1,
                        label="Top K",
                    )

            # Quick examples (clicking one fills the message box)
            gr.Markdown("### 💬 Example Prompts")
            examples = gr.Examples(
                examples=[
                    ["Explain quantum entanglement like I'm 5 years old"],
                    ["Write a Python function to find prime numbers using the Sieve of Eratosthenes"],
                    ["What are the key differences between DeepSeek-V4 and previous versions?"],
                    ["Solve this math problem: Find the derivative of f(x) = x³sin(x)"],
                    ["Design a REST API for a todo application"],
                ],
                inputs=[message],
            )

        with gr.Column(scale=2):
            # Chat interface
            chatbot = gr.Chatbot(
                label="Chat with DeepSeek-V4",
                height=600,
                show_copy_button=True,
                avatar_images=(
                    "https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm.svg",
                    "https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm.svg",
                ),
            )

            with gr.Row():
                # Place the pre-built message box here in the layout.
                message.render()
                send_btn = gr.Button("Send", variant="primary", scale=1)

            with gr.Row():
                clear_btn = gr.Button("Clear Chat", size="sm")
                stop_btn = gr.Button("Stop Generation", size="sm", variant="stop")

            # Status indicator
            status = gr.Textbox(
                label="Status",
                value="Ready to chat! Select your configuration and start a conversation.",
                interactive=False,
            )

    # Event handlers
    def respond(message, history, thinking_mode, system_prompt, max_tokens, temperature, top_p, top_k):
        """Main response handler.

        Generator that yields ``(message_box_value, chat_history, status_text)``
        triples: first a "Generating..." update showing the user's message,
        then the final reply or an error entry.
        """
        if not message or not message.strip():
            # This function is a generator, so the update must be yielded;
            # a plain `return value` would never reach the UI.
            yield "", history, "Please enter a message."
            return

        # Work on a copy so the caller's list is not mutated.
        history = list(history or [])
        history.append([message, None])

        yield "", history, "Generating..."

        try:
            response = generate_response(
                message,
                history[:-1],  # prior turns only; the new message is passed separately
                thinking_mode,
                max_tokens,
                temperature,
                top_p,
                top_k,
                system_prompt,
            )

            history[-1][1] = response
            yield "", history, "Ready"

        except Exception as e:
            # Top-level UI boundary: show the failure in the chat and status box.
            history[-1][1] = f"Error: {str(e)}"
            yield "", history, f"Error: {str(e)}"

    # Wire up events
    respond_inputs = [message, chatbot, thinking_mode, system_prompt, max_tokens, temperature, top_p, top_k]
    respond_outputs = [message, chatbot, status]

    submit_event = message.submit(
        respond,
        inputs=respond_inputs,
        outputs=respond_outputs,
    )

    click_event = send_btn.click(
        respond,
        inputs=respond_inputs,
        outputs=respond_outputs,
    )

    clear_btn.click(
        lambda: ([], "Chat cleared. Ready for new conversation."),
        outputs=[chatbot, status],
    )

    # Stop generation: cancel any running respond() event, not just relabel the status.
    # NOTE(review): a blocking model.generate() call that is already in progress
    # may still run to completion in the background — confirm against Gradio's
    # cancellation semantics.
    stop_btn.click(
        lambda: "Generation stopped by user.",
        outputs=[status],
        cancels=[submit_event, click_event],
    )

    # Footer
    gr.HTML("""
    <div style="text-align: center; margin-top: 20px; padding: 20px; color: #666;">
        <p>
            <a href="https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro" target="_blank">📦 Model Card</a> |
            <a href="https://github.com/deepseek-ai/DeepSeek-V4" target="_blank">📖 Documentation</a> |
            <a href="https://deepseek.ai" target="_blank">🌐 Homepage</a>
        </p>
        <p>⚠️ This is a preview version. Results may vary. For production use, please deploy with proper infrastructure.</p>
        <p>License: MIT | DeepSeek-AI © 2026</p>
    </div>
    """)
414
+
415
if __name__ == "__main__":
    # Launch the demo: enable the request queue (at most 20 waiting requests),
    # then start the server.
    demo.queue(max_size=20).launch(
        server_name="0.0.0.0",  # bind on all network interfaces
        server_port=7860,
        share=False,  # Set to True for temporary public link
        debug=False,
        show_error=True,
    )