wangsheng committed on
Commit
2867a45
·
verified ·
1 Parent(s): 67589cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +658 -359
app.py CHANGED
@@ -1,423 +1,722 @@
1
-
2
- ## 2. Hugging Face Gradio Demo (app.py)
3
-
4
- ```python
5
  # app.py
6
  import gradio as gr
7
- import torch
8
- from transformers import AutoTokenizer, AutoModelForCausalLM
9
  import os
10
- from pathlib import Path
11
- import json
12
  import time
 
13
 
14
- # Configuration
15
- MODEL_NAME = "deepseek-ai/DeepSeek-V4-Pro"
16
- MODEL_CACHE_DIR = "./model_cache"
17
- MAX_CONTEXT_LENGTH = 1000000 # 1M tokens
18
- DEFAULT_MAX_TOKENS = 2048
19
- DEFAULT_TEMPERATURE = 0.7
20
 
21
- # Model and tokenizer will be loaded lazily
22
- model = None
23
- tokenizer = None
 
 
 
24
 
25
- def load_model():
26
- """Load model and tokenizer (lazy loading)"""
27
- global model, tokenizer
28
-
29
- if model is None:
30
- print("Loading tokenizer...")
31
- tokenizer = AutoTokenizer.from_pretrained(
32
- MODEL_NAME,
33
- cache_dir=MODEL_CACHE_DIR,
34
- trust_remote_code=True
35
- )
36
-
37
- print("Loading model... This may take several minutes...")
38
- model = AutoModelForCausalLM.from_pretrained(
39
- MODEL_NAME,
40
- cache_dir=MODEL_CACHE_DIR,
41
- device_map="auto",
42
- torch_dtype=torch.bfloat16,
43
- trust_remote_code=True,
44
- low_cpu_mem_usage=True
45
  )
46
- print("Model loaded successfully!")
47
 
48
- return model, tokenizer
 
 
 
49
 
 
50
  def generate_response(
51
- message,
52
- history,
53
- thinking_mode="Think High",
54
- max_tokens=DEFAULT_MAX_TOKENS,
55
- temperature=DEFAULT_TEMPERATURE,
56
- top_p=1.0,
57
- top_k=50,
58
- system_prompt=""
59
- ):
60
- """Generate response from the model"""
 
61
 
62
- # Load model if not loaded
63
- model, tokenizer = load_model()
 
 
 
64
 
65
- # Build conversation history
66
- messages = []
67
 
68
- # Add system prompt if provided
69
- if system_prompt:
70
- messages.append({"role": "system", "content": system_prompt})
71
 
72
- # Add chat history
73
- for h in history:
74
- messages.append({"role": "user", "content": h[0]})
75
- if h[1]:
76
- messages.append({"role": "assistant", "content": h[1]})
77
 
78
  # Add current message
79
  messages.append({"role": "user", "content": message})
80
 
81
- # Map thinking mode to model format
82
- thinking_mode_map = {
83
- "Non-think": "non_thinking",
84
- "Think High": "thinking",
85
- "Think Max": "thinking_max"
86
- }
87
 
88
  try:
89
- # Try to use the custom encoding if available
90
- try:
91
- from encoding_dsv4 import encode_messages
92
- prompt = encode_messages(
93
- messages,
94
- thinking_mode=thinking_mode_map[thinking_mode]
95
- )
96
- except ImportError:
97
- # Fallback: simple concatenation
98
- prompt = ""
99
- for msg in messages:
100
- if msg["role"] == "system":
101
- prompt += f"System: {msg['content']}\n\n"
102
- elif msg["role"] == "user":
103
- prompt += f"User: {msg['content']}\n\n"
104
- elif msg["role"] == "assistant":
105
- prompt += f"Assistant: {msg['content']}\n\n"
106
- prompt += "Assistant: "
107
 
108
- # Tokenize input
109
- inputs = tokenizer(prompt, return_tensors="pt")
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- # Move to appropriate device
112
- if torch.cuda.is_available():
113
- inputs = {k: v.cuda() for k, v in inputs.items()}
114
 
115
- # Check context length
116
- input_length = inputs['input_ids'].shape[1]
117
- if input_length > MAX_CONTEXT_LENGTH:
118
- raise gr.Error(f"Input too long: {input_length} tokens. Maximum: {MAX_CONTEXT_LENGTH}")
119
 
120
- # Generate with streaming
121
- start_time = time.time()
122
 
123
- generation_config = {
124
- "max_new_tokens": max_tokens,
125
- "temperature": temperature,
126
- "top_p": top_p,
127
- "top_k": top_k,
128
- "do_sample": True if temperature > 0 else False,
129
- "pad_token_id": tokenizer.pad_token_id,
130
- "eos_token_id": tokenizer.eos_token_id,
131
- }
132
 
133
- # For Think Max mode, adjust parameters
134
- if thinking_mode == "Think Max":
135
- generation_config["max_new_tokens"] = min(max_tokens * 2, 32768)
 
136
 
137
- # Generate response
138
- outputs = model.generate(**inputs, **generation_config)
 
 
 
 
139
 
140
- # Decode response
141
- full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
142
- response = full_output[len(prompt):]
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  end_time = time.time()
145
- generation_time = end_time - start_time
 
146
 
147
- # Add generation info
148
- response += f"\n\n---\n⚡ Generated in {generation_time:.2f}s | 📊 {len(outputs[0]) - input_length} tokens | 🌡️ Temperature: {temperature}"
 
149
 
150
- return response
 
151
 
152
  except Exception as e:
153
- raise gr.Error(f"Generation failed: {str(e)}")
154
-
155
- def clear_chat():
156
- """Clear chat history"""
157
- return None, None
158
 
159
- # Create the Gradio interface
160
- with gr.Blocks(
161
- title="DeepSeek-V4 Demo",
162
- theme=gr.themes.Soft(),
163
- css="""
164
- .deepseek-header {
165
- text-align: center;
166
- margin-bottom: 20px;
167
- }
168
- .deepseek-header h1 {
169
- background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
170
- -webkit-background-clip: text;
171
- -webkit-text-fill-color: transparent;
172
- font-size: 2.5em;
173
- }
174
- .model-info {
175
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
176
- color: white;
177
- padding: 20px;
178
- border-radius: 10px;
179
- margin-bottom: 20px;
180
- }
181
- .benchmark-grid {
182
- display: grid;
183
- grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
184
- gap: 10px;
185
- margin: 10px 0;
186
- }
187
- .benchmark-item {
188
- background: rgba(255,255,255,0.1);
189
- padding: 10px;
190
- border-radius: 5px;
191
- text-align: center;
192
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  """
194
- ) as demo:
195
- gr.HTML("""
196
- <div class="deepseek-header">
197
- <h1>🚀 DeepSeek-V4</h1>
198
- <p>Towards Highly Efficient Million-Token Context Intelligence</p>
199
- </div>
200
- """)
201
-
202
- with gr.Row():
203
- with gr.Column(scale=1):
204
- # Model info panel
205
- gr.HTML("""
206
- <div class="model-info">
207
- <h3>📊 Model Specifications</h3>
208
- <div class="benchmark-grid">
209
- <div class="benchmark-item">
210
- <b>1.6T</b><br>Total Parameters
211
- </div>
212
- <div class="benchmark-item">
213
- <b>49B</b><br>Activated Parameters
214
- </div>
215
- <div class="benchmark-item">
216
- <b>1M</b><br>Context Length
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  </div>
218
- <div class="benchmark-item">
219
- <b>32T+</b><br>Training Tokens
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  </div>
 
 
 
 
 
 
 
 
 
221
  </div>
 
 
 
 
 
222
 
223
- <h3>🎯 Key Benchmarks</h3>
224
- <div class="benchmark-grid">
225
- <div class="benchmark-item">
226
- <b>93.5</b><br>LiveCodeBench
227
- </div>
228
- <div class="benchmark-item">
229
- <b>3206</b><br>Codeforces Rating
230
- </div>
231
- <div class="benchmark-item">
232
- <b>87.5</b><br>MMLU-Pro
233
- </div>
234
- <div class="benchmark-item">
235
- <b>80.6%</b><br>SWE Verified
236
- </div>
237
- </div>
238
 
239
- <h3>💡 Innovation Highlights</h3>
240
- <ul>
241
- <li>Hybrid Attention (CSA + HCA)</li>
242
- <li>Manifold-Constrained Hyper-Connections</li>
243
- <li>Muon Optimizer</li>
244
- <li>Two-stage Post-training</li>
245
- <li>FP4 + FP8 Mixed Precision</li>
246
- </ul>
247
- </div>
248
- """)
249
-
250
- # Configuration panel
251
- with gr.Group():
252
- gr.Markdown("### ⚙️ Configuration")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
- thinking_mode = gr.Radio(
255
- choices=["Non-think", "Think High", "Think Max"],
256
- value="Think High",
257
- label="Reasoning Mode",
258
- info="Non-think: Fast responses | Think High: Careful analysis | Think Max: Maximum reasoning"
 
 
 
 
 
 
 
259
  )
260
-
261
- system_prompt = gr.Textbox(
262
- label="System Prompt",
263
- placeholder="Enter system instructions...",
264
- lines=3,
265
- value="You are DeepSeek-V4, an advanced AI assistant with strong reasoning capabilities. Provide accurate and helpful responses."
 
 
 
 
 
 
 
 
266
  )
267
 
268
- with gr.Accordion("Advanced Parameters", open=False):
269
- max_tokens = gr.Slider(
270
- minimum=64,
271
- maximum=32768,
272
- value=2048,
273
- step=64,
274
- label="Max Tokens",
275
- info="Maximum number of tokens to generate"
276
- )
277
-
278
- temperature = gr.Slider(
279
- minimum=0.0,
280
- maximum=2.0,
281
- value=0.7,
282
- step=0.1,
283
- label="Temperature",
284
- info="Higher values = more creative, lower = more focused"
285
  )
286
-
287
- top_p = gr.Slider(
288
- minimum=0.0,
289
- maximum=1.0,
290
- value=1.0,
291
- step=0.05,
292
- label="Top P"
 
 
 
293
  )
294
-
295
- top_k = gr.Slider(
296
- minimum=1,
297
- maximum=100,
298
- value=50,
299
- step=1,
300
- label="Top K"
301
  )
302
-
303
- # Quick examples
304
- gr.Markdown("### 💬 Example Prompts")
305
- examples = gr.Examples(
306
- examples=[
307
- ["Explain quantum entanglement like I'm 5 years old"],
308
- ["Write a Python function to find prime numbers using the Sieve of Eratosthenes"],
309
- ["What are the key differences between DeepSeek-V4 and previous versions?"],
310
- ["Solve this math problem: Find the derivative of f(x) = x³sin(x)"],
311
- ["Design a REST API for a todo application"],
312
- ],
313
- inputs=[message] if 'message' in locals() else None,
314
- )
315
-
316
- with gr.Column(scale=2):
317
- # Chat interface
318
- chatbot = gr.Chatbot(
319
- label="Chat with DeepSeek-V4",
320
- height=600,
321
- show_copy_button=True,
322
- avatar_images=(
323
- "https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm.svg",
324
- "https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm.svg"
325
  )
326
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
 
328
- with gr.Row():
329
- message = gr.Textbox(
330
- label="Your Message",
331
- placeholder="Type your message here... (Shift+Enter for new line, Enter to send)",
332
- lines=3,
333
- scale=9
 
334
  )
335
- send_btn = gr.Button("Send", variant="primary", scale=1)
336
-
337
- with gr.Row():
338
- clear_btn = gr.Button("Clear Chat", size="sm")
339
- stop_btn = gr.Button("Stop Generation", size="sm", variant="stop")
340
 
341
- # Status indicator
342
- status = gr.Textbox(
343
- label="Status",
344
- value="Ready to chat! Select your configuration and start a conversation.",
345
- interactive=False
346
- )
347
-
348
- # Event handlers
349
- def respond(message, history, thinking_mode, system_prompt, max_tokens, temperature, top_p, top_k):
350
- """Main response handler"""
351
- if not message.strip():
352
- return "", history, "Please enter a message."
 
 
 
353
 
354
- history = history or []
355
- history.append([message, None])
 
 
 
 
 
 
 
 
356
 
357
- yield "", history, "Generating..."
 
 
 
 
 
 
 
 
 
358
 
359
- try:
360
- response = generate_response(
361
- message,
362
- history[:-1],
363
- thinking_mode,
364
- max_tokens,
365
- temperature,
366
- top_p,
367
- top_k,
368
- system_prompt
369
  )
370
-
371
- history[-1][1] = response
372
- yield "", history, "Ready"
373
-
374
- except Exception as e:
375
- history[-1][1] = f"Error: {str(e)}"
376
- yield "", history, f"Error: {str(e)}"
377
-
378
- # Wire up events
379
- submit_event = message.submit(
380
- respond,
381
- inputs=[message, chatbot, thinking_mode, system_prompt, max_tokens, temperature, top_p, top_k],
382
- outputs=[message, chatbot, status]
383
- )
384
-
385
- send_btn.click(
386
- respond,
387
- inputs=[message, chatbot, thinking_mode, system_prompt, max_tokens, temperature, top_p, top_k],
388
- outputs=[message, chatbot, status]
389
- )
390
-
391
- clear_btn.click(
392
- lambda: ([], "Chat cleared. Ready for new conversation."),
393
- outputs=[chatbot, status]
394
- )
395
-
396
- # Stop generation
397
- stop_btn.click(
398
- lambda: "Generation stopped by user.",
399
- outputs=[status]
400
- )
401
 
402
- # Footer
403
- gr.HTML("""
404
- <div style="text-align: center; margin-top: 20px; padding: 20px; color: #666;">
405
- <p>
406
- <a href="https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro" target="_blank">📦 Model Card</a> |
407
- <a href="https://github.com/deepseek-ai/DeepSeek-V4" target="_blank">📖 Documentation</a> |
408
- <a href="https://deepseek.ai" target="_blank">🌐 Homepage</a>
409
- </p>
410
- <p>⚠️ This is a preview version. Results may vary. For production use, please deploy with proper infrastructure.</p>
411
- <p>License: MIT | DeepSeek-AI © 2026</p>
412
- </div>
413
- """)
414
 
 
415
  if __name__ == "__main__":
416
- # Launch the demo
417
- demo.queue(max_size=20).launch(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  server_name="0.0.0.0",
419
  server_port=7860,
420
- share=False, # Set to True for temporary public link
421
  debug=False,
422
- show_error=True
 
423
  )
 
 
 
 
 
1
  # app.py
2
  import gradio as gr
3
+ from openai import OpenAI
 
4
  import os
 
 
5
  import time
6
+ from typing import List, Tuple, Optional
7
 
8
# ==================== Configuration ====================
# Default persona/instructions injected as the system message for every chat.
DEFAULT_SYSTEM_PROMPT = "You are DeepSeek-V4, an advanced AI assistant with strong reasoning capabilities. Provide accurate, helpful, and well-reasoned responses."

# UI reasoning-mode label -> API `reasoning_effort` value.
REASONING_EFFORT_MAP = {
    "Non-think": "minimal",
    "Think High": "high",
    "Think Max": "maximum",
}

# UI reasoning-mode label -> API thinking toggle ("disabled" skips the
# chain-of-thought phase entirely).
THINKING_TYPE_MAP = {
    "Non-think": "disabled",
    "Think High": "enabled",
    "Think Max": "enabled",
}
24
+
25
# ==================== API Client Setup ====================
def get_client():
    """Build an OpenAI-SDK client pointed at the DeepSeek API endpoint.

    Raises:
        ValueError: if the DEEPSEEK_API_KEY environment variable is unset.
    """
    api_key = os.environ.get('DEEPSEEK_API_KEY')
    if api_key:
        return OpenAI(
            api_key=api_key,
            base_url="https://api.deepseek.com"
        )

    # Fail fast with setup instructions instead of letting the SDK raise a
    # less actionable authentication error on the first request.
    raise ValueError(
        "⚠️ DEEPSEEK_API_KEY not found!\n\n"
        "Please set your API key:\n"
        "1. Get your key from: https://platform.deepseek.com/api_keys\n"
        "2. Set environment variable:\n"
        " export DEEPSEEK_API_KEY='your-api-key-here'\n"
        " or create a .env file with: DEEPSEEK_API_KEY=your-api-key-here"
    )
43
 
44
# ==================== Response Generation ====================
def generate_response(
    message: str,
    history: List[Tuple[str, str]],
    thinking_mode: str = "Think High",
    max_tokens: int = 4096,
    temperature: float = 0.7,
    top_p: float = 1.0,
    system_prompt: str = DEFAULT_SYSTEM_PROMPT,
    show_thinking: bool = True
) -> Tuple[str, List[Tuple[str, str]], str, str, str]:
    """
    Generate a (non-streaming) response using the DeepSeek API.

    Returns:
        Tuple of (empty_message, updated_history, response_text,
        thinking_text, status).

    Note: the return annotation previously declared a 4-tuple while the
    function (and its own docstring) returned five elements; the annotation
    is now the correct 5-tuple.
    """
    # Guard: nothing to do for an empty/whitespace-only message.
    if not message.strip():
        return "", history, "", "", "Please enter a message."

    client = get_client()

    # Build the messages array: system prompt, prior turns, current turn.
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Map the UI mode to API parameters (defaults mirror "Think High").
    reasoning_effort = REASONING_EFFORT_MAP.get(thinking_mode, "high")
    thinking_type = THINKING_TYPE_MAP.get(thinking_mode, "enabled")

    try:
        start_time = time.time()

        # NOTE(review): `reasoning_effort` and the `thinking` extra_body field
        # are DeepSeek-specific extensions -- confirm against current API docs.
        response = client.chat.completions.create(
            model="deepseek-v4-pro",
            messages=messages,
            stream=False,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            reasoning_effort=reasoning_effort,
            extra_body={
                "thinking": {"type": thinking_type}
            }
        )

        generation_time = time.time() - start_time

        # Extract response content.
        message_obj = response.choices[0].message
        content = message_obj.content or ""

        # Reasoning/chain-of-thought text, when the API returned any.
        thinking_content = ""
        if hasattr(message_obj, 'reasoning_content') and message_obj.reasoning_content:
            thinking_content = message_obj.reasoning_content

        # What goes into the chat bubble: optionally prepend the reasoning.
        full_response = content
        if show_thinking and thinking_content:
            full_response = f"{thinking_content}\n\n---\n\n{content}"

        # Token-usage summary, if the API reported usage.
        if hasattr(response, 'usage') and response.usage:
            usage = response.usage
            tokens_info = f"📊 Input: {usage.prompt_tokens} tokens | Output: {usage.completion_tokens} tokens | Total: {usage.total_tokens} tokens"
        else:
            tokens_info = ""

        status = f"✅ Generated in {generation_time:.2f}s | 🎯 Mode: {thinking_mode} | {tokens_info}"

        return "", history + [(message, full_response)], content, thinking_content, status

    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary, and any
        # API/SDK failure should surface in the chat rather than crash Gradio.
        error_msg = f"❌ Error: {str(e)}"
        return "", history + [(message, error_msg)], "", "", error_msg
133
+
134
def generate_response_stream(
    message: str,
    history: List[Tuple[str, str]],
    thinking_mode: str = "Think High",
    max_tokens: int = 4096,
    temperature: float = 0.7,
    top_p: float = 1.0,
    system_prompt: str = DEFAULT_SYSTEM_PROMPT,
    show_thinking: bool = True
):
    """
    Stream a response from the DeepSeek API.

    Yields:
        Tuples of (empty_message, updated_history, content_so_far,
        thinking_so_far, status) -- one per received chunk, plus a final
        summary yield when the stream completes.
    """
    if not message.strip():
        yield "", history, "", "", "Please enter a message."
        return

    client = get_client()

    # Build the messages array: system prompt, prior turns, current turn.
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    reasoning_effort = REASONING_EFFORT_MAP.get(thinking_mode, "high")
    thinking_type = THINKING_TYPE_MAP.get(thinking_mode, "enabled")

    try:
        start_time = time.time()

        # NOTE(review): `reasoning_effort` and the `thinking` extra_body field
        # are DeepSeek-specific extensions -- confirm against current API docs.
        stream = client.chat.completions.create(
            model="deepseek-v4-pro",
            messages=messages,
            stream=True,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            reasoning_effort=reasoning_effort,
            extra_body={
                "thinking": {"type": thinking_type}
            }
        )

        content_chunks = []
        thinking_chunks = []

        for chunk in stream:
            # Robustness: some providers emit keep-alive/usage chunks with an
            # empty `choices` list; skip them instead of raising IndexError.
            if not chunk.choices:
                continue

            # Hoist the repeated chunk.choices[0].delta attribute chain.
            delta = chunk.choices[0].delta
            if delta.content:
                content_chunks.append(delta.content)

            # Reasoning tokens arrive on a separate, optional delta field.
            if getattr(delta, 'reasoning_content', None):
                thinking_chunks.append(delta.reasoning_content)

            current_content = ''.join(content_chunks)
            current_thinking = ''.join(thinking_chunks)

            full_response = current_content
            if show_thinking and current_thinking:
                full_response = f"🧠 Thinking:\n{current_thinking}\n\n💬 Response:\n{current_content}"

            elapsed = time.time() - start_time
            status = f"🔄 Streaming... ({elapsed:.1f}s) | Mode: {thinking_mode}"

            yield "", history + [(message, full_response)], current_content, current_thinking, status

        # Final yield with the complete response.
        end_time = time.time()
        final_content = ''.join(content_chunks)
        final_thinking = ''.join(thinking_chunks)

        full_response = final_content
        if show_thinking and final_thinking:
            full_response = f"🧠 Thinking:\n{final_thinking}\n\n💬 Response:\n{final_content}"

        status = f"✅ Done in {end_time - start_time:.2f}s | Mode: {thinking_mode}"
        yield "", history + [(message, full_response)], final_content, final_thinking, status

    except Exception as e:
        # Broad catch is deliberate: surface any API/SDK failure in the UI.
        # BUG FIX: the prefix was " Error:" (missing the ❌ marker used by the
        # non-streaming path); made consistent.
        error_msg = f"❌ Error: {str(e)}"
        yield "", history + [(message, error_msg)], "", "", error_msg
 
 
 
223
 
224
+ # ==================== Gradio Interface ====================
225
+ def create_demo():
226
+ """Create the Gradio interface"""
227
+
228
+ # Custom CSS
229
+ custom_css = """
230
+ :root {
231
+ --primary: #667eea;
232
+ --secondary: #764ba2;
233
+ }
234
+
235
+ .deepseek-header {
236
+ text-align: center;
237
+ margin-bottom: 20px;
238
+ padding: 30px;
239
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
240
+ border-radius: 16px;
241
+ color: white;
242
+ }
243
+
244
+ .deepseek-header h1 {
245
+ font-size: 2.8em;
246
+ font-weight: 800;
247
+ margin: 0;
248
+ text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
249
+ }
250
+
251
+ .deepseek-header p {
252
+ font-size: 1.2em;
253
+ opacity: 0.95;
254
+ margin: 10px 0 0 0;
255
+ }
256
+
257
+ .model-info {
258
+ background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
259
+ padding: 20px;
260
+ border-radius: 12px;
261
+ margin-bottom: 20px;
262
+ border: 1px solid #e0e0e0;
263
+ }
264
+
265
+ .benchmark-grid {
266
+ display: grid;
267
+ grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
268
+ gap: 12px;
269
+ margin: 15px 0;
270
+ }
271
+
272
+ .benchmark-item {
273
+ background: white;
274
+ padding: 12px;
275
+ border-radius: 8px;
276
+ text-align: center;
277
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
278
+ transition: transform 0.2s;
279
+ }
280
+
281
+ .benchmark-item:hover {
282
+ transform: translateY(-2px);
283
+ box-shadow: 0 4px 8px rgba(0,0,0,0.15);
284
+ }
285
+
286
+ .benchmark-item .value {
287
+ font-size: 1.5em;
288
+ font-weight: 700;
289
+ color: #667eea;
290
+ }
291
+
292
+ .benchmark-item .label {
293
+ font-size: 0.85em;
294
+ color: #666;
295
+ margin-top: 4px;
296
+ }
297
+
298
+ .chat-container {
299
+ border: 1px solid #e0e0e0;
300
+ border-radius: 12px;
301
+ overflow: hidden;
302
+ }
303
+
304
+ .thinking-box {
305
+ background: #f8f9fa;
306
+ border-left: 4px solid #667eea;
307
+ padding: 15px;
308
+ margin: 10px 0;
309
+ border-radius: 8px;
310
+ font-style: italic;
311
+ color: #555;
312
+ }
313
+
314
+ .thinking-box::before {
315
+ content: "🧠 Thinking Process";
316
+ display: block;
317
+ font-weight: 600;
318
+ color: #667eea;
319
+ margin-bottom: 8px;
320
+ }
321
+
322
+ .response-box {
323
+ background: white;
324
+ padding: 15px;
325
+ border-radius: 8px;
326
+ line-height: 1.6;
327
+ }
328
+
329
+ .status-bar {
330
+ padding: 10px;
331
+ background: #f5f5f5;
332
+ border-radius: 8px;
333
+ font-family: monospace;
334
+ font-size: 0.9em;
335
+ }
336
+
337
+ .mode-indicator {
338
+ display: inline-block;
339
+ padding: 4px 12px;
340
+ border-radius: 20px;
341
+ font-size: 0.85em;
342
+ font-weight: 600;
343
+ margin-right: 8px;
344
+ }
345
+
346
+ .mode-non-think {
347
+ background: #e3f2fd;
348
+ color: #1976d2;
349
+ }
350
+
351
+ .mode-think-high {
352
+ background: #f3e5f5;
353
+ color: #7b1fa2;
354
+ }
355
+
356
+ .mode-think-max {
357
+ background: #fce4ec;
358
+ color: #c62828;
359
+ }
360
+
361
+ .api-key-warning {
362
+ background: #fff3cd;
363
+ border: 1px solid #ffc107;
364
+ color: #856404;
365
+ padding: 15px;
366
+ border-radius: 8px;
367
+ margin: 10px 0;
368
+ }
369
  """
370
+
371
+ with gr.Blocks(
372
+ title="DeepSeek-V4 Pro - API Demo",
373
+ theme=gr.themes.Soft(),
374
+ css=custom_css,
375
+ analytics_enabled=False
376
+ ) as demo:
377
+
378
+ # Header
379
+ gr.HTML("""
380
+ <div class="deepseek-header">
381
+ <h1>🚀 DeepSeek-V4 Pro</h1>
382
+ <p>Towards Highly Efficient Million-Token Context Intelligence</p>
383
+ <p style="font-size: 0.9em; opacity: 0.8;">Powered by DeepSeek API • 1.6T Parameters • 49B Activated</p>
384
+ </div>
385
+ """)
386
+
387
+ # Main layout
388
+ with gr.Row():
389
+ # Left sidebar - Configuration
390
+ with gr.Column(scale=1, min_width=350):
391
+ # Model Info Card
392
+ gr.HTML("""
393
+ <div class="model-info">
394
+ <h3 style="margin-top: 0;">📊 Model Specifications</h3>
395
+ <div class="benchmark-grid">
396
+ <div class="benchmark-item">
397
+ <div class="value">1.6T</div>
398
+ <div class="label">Total Parameters</div>
399
+ </div>
400
+ <div class="benchmark-item">
401
+ <div class="value">49B</div>
402
+ <div class="label">Activated Parameters</div>
403
+ </div>
404
+ <div class="benchmark-item">
405
+ <div class="value">1M</div>
406
+ <div class="label">Context Length</div>
407
+ </div>
408
+ <div class="benchmark-item">
409
+ <div class="value">32T+</div>
410
+ <div class="label">Training Tokens</div>
411
+ </div>
412
  </div>
413
+
414
+ <h3>🎯 Key Benchmarks</h3>
415
+ <div class="benchmark-grid">
416
+ <div class="benchmark-item">
417
+ <div class="value">93.5</div>
418
+ <div class="label">LiveCodeBench</div>
419
+ </div>
420
+ <div class="benchmark-item">
421
+ <div class="value">3206</div>
422
+ <div class="label">Codeforces Rating</div>
423
+ </div>
424
+ <div class="benchmark-item">
425
+ <div class="value">87.5</div>
426
+ <div class="label">MMLU-Pro</div>
427
+ </div>
428
+ <div class="benchmark-item">
429
+ <div class="value">80.6%</div>
430
+ <div class="label">SWE Verified</div>
431
+ </div>
432
  </div>
433
+
434
+ <h3>💡 Key Innovations</h3>
435
+ <ul style="padding-left: 20px;">
436
+ <li>Hybrid Attention (CSA + HCA)</li>
437
+ <li>Manifold-Constrained Hyper-Connections</li>
438
+ <li>Muon Optimizer</li>
439
+ <li>Two-stage Post-training</li>
440
+ <li>FP4 + FP8 Mixed Precision</li>
441
+ </ul>
442
  </div>
443
+ """)
444
+
445
+ # Configuration Panel
446
+ with gr.Group():
447
+ gr.Markdown("### ⚙️ Generation Settings")
448
 
449
+ thinking_mode = gr.Radio(
450
+ choices=["Non-think", "Think High", "Think Max"],
451
+ value="Think High",
452
+ label="🧠 Reasoning Mode",
453
+ info="""
454
+ Non-think: Fast, intuitive responses for daily tasks
455
+ Think High: Deliberate reasoning for complex problems
456
+ • Think Max: Maximum effort for hardest challenges
457
+ """
458
+ )
 
 
 
 
 
459
 
460
+ show_thinking = gr.Checkbox(
461
+ value=True,
462
+ label="📝 Show Thinking Process",
463
+ info="Display the model's reasoning steps"
464
+ )
465
+
466
+ system_prompt = gr.Textbox(
467
+ label="📋 System Prompt",
468
+ value=DEFAULT_SYSTEM_PROMPT,
469
+ lines=3,
470
+ max_lines=5
471
+ )
472
+
473
+ with gr.Accordion("🔧 Advanced Parameters", open=False):
474
+ max_tokens = gr.Slider(
475
+ minimum=64,
476
+ maximum=32768,
477
+ value=4096,
478
+ step=64,
479
+ label="Max Tokens"
480
+ )
481
+
482
+ temperature = gr.Slider(
483
+ minimum=0.0,
484
+ maximum=2.0,
485
+ value=0.7,
486
+ step=0.05,
487
+ label="Temperature",
488
+ info="0 = deterministic, 1+ = creative"
489
+ )
490
+
491
+ top_p = gr.Slider(
492
+ minimum=0.0,
493
+ maximum=1.0,
494
+ value=1.0,
495
+ step=0.05,
496
+ label="Top P"
497
+ )
498
+
499
+ stream_output = gr.Checkbox(
500
+ value=True,
501
+ label="📡 Stream Output",
502
+ info="Show response as it's generated"
503
+ )
504
 
505
+ # Quick examples
506
+ gr.Markdown("### 💡 Quick Examples")
507
+ examples = [
508
+ "Explain quantum computing to a 10-year-old",
509
+ "Write a Python function for Fibonacci with memoization",
510
+ "What are the key features of DeepSeek-V4?",
511
+ "Solve: If x² + y² = 25 and x + y = 7, find x and y",
512
+ "Design a REST API for a social media platform",
513
+ ]
514
+ gr.Examples(
515
+ examples=examples,
516
+ inputs=gr.Textbox(label="Click to try", visible=False),
517
  )
518
+
519
+ # Right - Chat Interface
520
+ with gr.Column(scale=2):
521
+ # Chatbot
522
+ chatbot = gr.Chatbot(
523
+ label="💬 Chat with DeepSeek-V4 Pro",
524
+ height=550,
525
+ show_copy_button=True,
526
+ bubble_full_width=False,
527
+ avatar_images=(
528
+ "https://api.dicebear.com/7.x/bottts/svg?seed=user&backgroundColor=667eea",
529
+ "https://api.dicebear.com/7.x/bottts/svg?seed=assistant&backgroundColor=764ba2"
530
+ ),
531
+ layout="panel"
532
  )
533
 
534
+ # Thinking process display
535
+ with gr.Accordion("🧠 Thinking Process", open=True, visible=True):
536
+ thinking_display = gr.Markdown(
537
+ value="*The model's reasoning will appear here...*",
538
+ elem_classes="thinking-box"
 
 
 
 
 
 
 
 
 
 
 
 
539
  )
540
+
541
+ # Input area
542
+ with gr.Row():
543
+ message_input = gr.Textbox(
544
+ label="Your Message",
545
+ placeholder="Type your message here... (Shift+Enter for new line)",
546
+ lines=2,
547
+ max_lines=5,
548
+ scale=9,
549
+ autofocus=True
550
  )
551
+ send_btn = gr.Button(
552
+ "🚀 Send",
553
+ variant="primary",
554
+ scale=1,
555
+ size="lg"
 
 
556
  )
557
+
558
+ # Control buttons
559
+ with gr.Row():
560
+ clear_btn = gr.Button("🗑️ Clear Chat", size="sm")
561
+ stop_btn = gr.Button("⏹️ Stop", size="sm", variant="stop", visible=False)
562
+ retry_btn = gr.Button("🔄 Retry", size="sm", variant="secondary")
563
+
564
+ # Status bar
565
+ status_display = gr.Textbox(
566
+ label="Status",
567
+ value="✅ Ready | Using DeepSeek API (deepseek-v4-pro)",
568
+ interactive=False,
569
+ elem_classes="status-bar"
 
 
 
 
 
 
 
 
 
 
570
  )
571
+
572
+ # Footer
573
+ gr.HTML("""
574
+ <div style="text-align: center; margin-top: 30px; padding: 20px; color: #666; border-top: 1px solid #e0e0e0;">
575
+ <p style="margin: 5px 0;">
576
+ <a href="https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro" target="_blank">📦 Model Card</a> |
577
+ <a href="https://platform.deepseek.com/api_keys" target="_blank">🔑 Get API Key</a> |
578
+ <a href="https://platform.deepseek.com/docs" target="_blank">📚 API Docs</a> |
579
+ <a href="https://deepseek.ai" target="_blank">🌐 Homepage</a>
580
+ </p>
581
+ <p style="margin: 5px 0; font-size: 0.9em;">
582
+ ⚡ Powered by DeepSeek API • Streaming Available • MIT License
583
+ </p>
584
+ <p style="margin: 5px 0; font-size: 0.8em; opacity: 0.7;">
585
+ DeepSeek-AI © 2026 • All benchmarks are for reference only
586
+ </p>
587
+ </div>
588
+ """)
589
+
590
+ # ==================== Event Handlers ====================
591
+
592
def process_message(
    message: str,
    history: List[Tuple[str, str]],
    thinking_mode: str,
    show_thinking: bool,
    system_prompt: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    stream_output: bool
):
    """Dispatch a chat turn to the streaming or non-streaming generator.

    Always a generator: Gradio consumes yielded 4-tuples of
    (message_box_value, chat_history, thinking_text, status_text).

    BUG FIX: the original used `return <tuple>` in the empty-message and
    missing-API-key branches. Because this function contains `yield`, it is
    a generator, and `return value` inside a generator silently ends
    iteration without emitting the value -- so those branches updated
    nothing in the UI. They now `yield` the outputs, then return.
    """
    if not message.strip():
        yield message, history, "", "Please enter a message."
        return

    # Check the API key up front so the user gets an actionable message in-chat.
    if not os.environ.get('DEEPSEEK_API_KEY'):
        yield (
            message,
            history + [(message, "⚠️ **API Key Missing**\n\nPlease set your `DEEPSEEK_API_KEY` environment variable.\nGet one at: https://platform.deepseek.com/api_keys")],
            "",
            "❌ API Key not configured"
        )
        return

    if stream_output:
        # Streaming mode: forward each partial update as it arrives.
        for msg, hist, content, thinking, status in generate_response_stream(
            message, history, thinking_mode, max_tokens,
            temperature, top_p, system_prompt, show_thinking
        ):
            yield msg, hist, thinking, status
    else:
        # Non-streaming: single result, yielded once to fit the generator API.
        msg, hist, content, thinking, status = generate_response(
            message, history, thinking_mode, max_tokens,
            temperature, top_p, system_prompt, show_thinking
        )
        yield msg, hist, thinking, status
631
 
632
+ # Wire up send button
633
+ send_event = send_btn.click(
634
+ fn=process_message,
635
+ inputs=[
636
+ message_input, chatbot, thinking_mode, show_thinking,
637
+ system_prompt, max_tokens, temperature, top_p, stream_output
638
+ ],
639
+ outputs=[message_input, chatbot, thinking_display, status_display],
640
+ show_progress="hidden"
641
+ )
642
 
643
+ # Wire up Enter key
644
+ enter_event = message_input.submit(
645
+ fn=process_message,
646
+ inputs=[
647
+ message_input, chatbot, thinking_mode, show_thinking,
648
+ system_prompt, max_tokens, temperature, top_p, stream_output
649
+ ],
650
+ outputs=[message_input, chatbot, thinking_display, status_display],
651
+ show_progress="hidden"
652
+ )
653
 
654
+ # Clear chat
655
def clear_chat():
    """Reset the chat UI: empty history, placeholder thinking text, ready status."""
    fresh_history = []
    thinking_placeholder = "*The model's reasoning will appear here...*"
    ready_status = "✅ Chat cleared. Ready for new conversation."
    return fresh_history, thinking_placeholder, ready_status
661
+
662
+ clear_btn.click(
663
+ fn=clear_chat,
664
+ outputs=[chatbot, thinking_display, status_display]
665
+ )
666
+
667
+ # Retry last message
668
def retry_last(history):
    """Drop the most recent chat exchange and hand its user message back
    so it can be edited/resent from the input box."""
    if history:
        *earlier, newest = history
        return earlier, newest[0]
    return history, "No message to retry."
674
+
675
+ retry_btn.click(
676
+ fn=retry_last,
677
+ inputs=[chatbot],
678
+ outputs=[chatbot, message_input]
679
+ )
680
+
681
+ # Mode change indicator
682
def update_mode_indicator(mode):
    """Return an HTML badge (<span>) styled for the selected reasoning mode."""
    badge_css = {
        "Non-think": "mode-non-think",
        "Think High": "mode-think-high",
        "Think Max": "mode-think-max",
    }.get(mode, "")
    return f'<span class="mode-indicator {badge_css}">{mode}</span>'
 
 
690
 
691
+ return demo
 
 
 
 
 
 
 
 
 
 
 
692
 
693
# ==================== Main ====================
if __name__ == "__main__":
    # Warn (but still launch) when the API key is missing, so the console
    # shows actionable setup steps instead of a later opaque failure.
    api_key = os.environ.get('DEEPSEEK_API_KEY')
    if not api_key:
        print("\n" + "=" * 60)
        print("⚠️ DEEPSEEK_API_KEY not found!")
        print("=" * 60)
        print("\nTo get started:")
        print("1. Get your API key: https://platform.deepseek.com/api_keys")
        print("2. Set the environment variable:")
        print(" export DEEPSEEK_API_KEY='your-key-here'")
        print("\nOr create a .env file:")
        print(' echo DEEPSEEK_API_KEY=your-key-here > .env')
        print("\n" + "=" * 60 + "\n")

    # Create and launch the demo.
    demo = create_demo()
    # BUG FIX: `concurrency_count` was removed in Gradio 4.x; passing it
    # together with the Gradio-4 `default_concurrency_limit` (already used
    # here) raises a TypeError at startup. Only the supported parameter is kept.
    demo.queue(
        max_size=50,
        default_concurrency_limit=10
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
        show_error=True,
        favicon_path=None
    )