Spaces:

akhaliq
/

MiniCPM-V-4.6

Running on Zero

App Files Files Community

akhaliq HF Staff committed 9 days ago

Commit

2866f2d

1 Parent(s): ee98620

feat: implement streaming inference with history support and update UI theme

Browse files

Files changed (3) hide show

app.py +103 -109
index.html +387 -146
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -2,27 +2,33 @@ import os
 import torch
 import re
 import av
 from PIL import Image
-from transformers import AutoModelForImageTextToText, AutoProcessor
 from gradio import Server
 from gradio.data_classes import FileData
 from fastapi.responses import HTMLResponse
 import spaces
-# Load model and processor
-model_id = "openbmb/MiniCPM-V-4.6"
-print(f"Loading model: {model_id}...")
-processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
-model = AutoModelForImageTextToText.from_pretrained(
-    model_id,
-    torch_dtype=torch.bfloat16,
     trust_remote_code=True,
-    device_map="cuda"
-)
 def load_video(video_path, max_frames=64):
-    """Utility to load video frames using PyAV."""
     try:
         container = av.open(video_path)
         frames = []
@@ -30,11 +36,9 @@ def load_video(video_path, max_frames=64):
         total_frames = stream.frames
         if total_frames <= 0:
-            print("Frame count unknown, decoding all and sampling...")
             temp_frames = []
             for frame in container.decode(video=0):
                 temp_frames.append(frame.to_image())
             if len(temp_frames) > max_frames:
                 indices = [int(i * len(temp_frames) / max_frames) for i in range(max_frames)]
                 frames = [temp_frames[i] for i in indices]
@@ -55,116 +59,106 @@ def load_video(video_path, max_frames=64):
         print(f"Error loading video: {e}")
         return None
-# Utility for response normalization
-_PATTERN = re.compile(
-    r'(```[\s\S]*?```|`[^`]+`|\$\$[\s\S]*?\$\$|\$[^$]+\$|\\\([\s\S]*?\\\)|\\\[[\s\S]*?\\\])'
-    r'|(?<!\\)(?:\\r\\n|\\[nr])'
-)
 def normalize_response_text(text: str) -> str:
-    if not isinstance(text, str) or "\\" not in text:
-        return text
-    return _PATTERN.sub(lambda m: m.group(1) or '\n', text)
 app = Server()
 @app.api()
 @spaces.GPU(duration=120)
-def predict(message: str, file: FileData = None, downsample_mode: str = "16x") -> str:
     """
-    General inference endpoint for both image and video.
     """
-    if file is None:
-        # Text-only inference
-        messages = [{"role": "user", "content": [{"type": "text", "text": message}]}]
-        inputs = processor.apply_chat_template(
-            messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_dict=True,
-            return_tensors="pt"
-        ).to(model.device)
-    else:
-        file_path = file["path"]
-        # Robust detection: Try opening with AV first to see if it's a video
-        is_video = False
-        try:
-            container = av.open(file_path)
-            if len(container.streams.video) > 0:
-                is_video = True
-            container.close()
-        except:
             is_video = False
-        if is_video:
-            print(f"Processing as video: {file_path}")
-            frames = load_video(file_path, max_frames=64)
-            if frames is None or len(frames) == 0:
-                return "Error: Could not decode video file."
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "video": frames},
-                        {"type": "text", "text": message},
-                    ],
-                }
-            ]
-            inputs = processor.apply_chat_template(
-                messages, tokenize=True, add_generation_prompt=True,
-                return_dict=True, return_tensors="pt",
-                processor_kwargs={
-                    "downsample_mode": downsample_mode,
-                    "max_num_frames": 64,
-                    "stack_frames": 1,
-                    "max_slice_nums": 1,
-                    "use_image_id": False,
-                    "do_sample_frames": False, # Fix: Avoid requiring metadata since we already sampled
-                }
-            ).to(model.device)
-        else:
-            print(f"Processing as image: {file_path}")
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image", "url": file_path},
-                        {"type": "text", "text": message},
-                    ],
-                }
-            ]
-            inputs = processor.apply_chat_template(
-                messages, tokenize=True, add_generation_prompt=True,
-                return_dict=True, return_tensors="pt",
-                processor_kwargs={
-                    "downsample_mode": downsample_mode,
-                    "max_slice_nums": 9,
-                }
-            ).to(model.device)
-    with torch.no_grad():
-        generate_kwargs = {
-            **inputs,
-            "max_new_tokens": 1024,
-            "do_sample": True,
-            "temperature": 0.7
         }
-        if file is not None:
-            generate_kwargs["downsample_mode"] = downsample_mode
-        generated_ids = model.generate(**generate_kwargs)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )
-    return normalize_response_text(output_text[0])
 @app.get("/", response_class=HTMLResponse)
 async def homepage():

 import torch
 import re
 import av
+import uuid
+import copy
+import threading
+import time
 from PIL import Image
+from transformers import AutoProcessor, MiniCPMV4_6ForConditionalGeneration, TextIteratorStreamer
 from gradio import Server
 from gradio.data_classes import FileData
 from fastapi.responses import HTMLResponse
 import spaces
+# ---------- Globals & Model Loading ----------
+MODEL_ID = "openbmb/MiniCPM-V-4.6"
+print(f"Loading processor: {MODEL_ID}")
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+print(f"Loading model: {MODEL_ID}")
+model = MiniCPMV4_6ForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="sdpa",
     trust_remote_code=True,
+    device_map="cuda"
+).eval()
+# ---------- Helper Functions ----------
 def load_video(video_path, max_frames=64):
     try:
         container = av.open(video_path)
         frames = []
         total_frames = stream.frames
         if total_frames <= 0:
             temp_frames = []
             for frame in container.decode(video=0):
                 temp_frames.append(frame.to_image())
             if len(temp_frames) > max_frames:
                 indices = [int(i * len(temp_frames) / max_frames) for i in range(max_frames)]
                 frames = [temp_frames[i] for i in indices]
         print(f"Error loading video: {e}")
         return None
 def normalize_response_text(text: str) -> str:
+    """Return ``text`` with literal backslash-n sequences turned into newlines.
+
+    Every two-character sequence ``\\n`` (a backslash followed by ``n``) is
+    replaced by a real newline character. The replacement is applied across
+    the whole string, so it also affects such sequences inside code or math
+    spans. ``text`` must be a ``str``; no other escapes are touched.
+    """
+    # Basic normalization, could be expanded
+    return text.replace("\\n", "\n")
+# ---------- Inference Endpoint ----------
 app = Server()
 @app.api()
 @spaces.GPU(duration=120)
+def predict(
+    message: str,
+    history: list[list[str, str]] = None,
+    files: list[FileData] = None,
+    thinking_mode: bool = True,
+    max_new_tokens: int = 1024,
+    temperature: float = 0.7,
+    top_p: float = 0.8,
+    top_k: int = 100,
+    max_frames: int = 64
+):
     """
+    Streaming inference endpoint with history support.
+
+    Builds a chat from the earlier text turns in ``history`` followed by the
+    current user turn (attached ``files`` plus ``message``), runs
+    ``model.generate`` on a worker thread and yields the response as it grows.
+
+    Args:
+        message: Text of the current user turn.
+        history: Earlier turns as ``[user_text, assistant_text]`` pairs; empty
+            entries are skipped. ``None`` or empty means no prior context.
+        files: Attachments for the current turn. A file in which PyAV finds at
+            least one video stream is sampled with ``load_video``; anything
+            else is opened with PIL and converted to RGB.
+        thinking_mode: Forwarded to the chat template as ``enable_thinking``.
+        max_new_tokens: Upper bound on generated tokens.
+        temperature: Sampling temperature; ``0`` (or less) selects
+            non-sampling decoding.
+        top_p: Nucleus sampling threshold, used only when sampling.
+        top_k: Top-k sampling cutoff, used only when sampling.
+        max_frames: Maximum frames sampled per video, also passed to the
+            processor as ``max_num_frames``.
+
+    Yields:
+        The full response text generated so far (not just the newest chunk),
+        passed through ``normalize_response_text``.
     """
+    messages = []
+    # Replay earlier turns (text only) so the model sees the conversation so far.
+    for turn in history or []:
+        if turn[0]:
+            messages.append({"role": "user", "content": [{"type": "text", "text": turn[0]}]})
+        if turn[1]:
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": turn[1]}]})
+    content = []
+    for f in files or []:
+        file_path = f["path"]
+        # Probe with PyAV: a file exposing a video stream is handled as video.
+        # NOTE(review): some still-image formats may also report a video
+        # stream when opened this way - confirm images take the image branch.
+        try:
+            with av.open(file_path) as container:
+                is_video = len(container.streams.video) > 0
+        except Exception:
+            # PyAV could not read it; fall back to treating it as an image.
+            is_video = False
+        if is_video:
+            frames = load_video(file_path, max_frames=max_frames)
+            # Best effort: a video that fails to decode is silently left out.
+            if frames:
+                content.append({"type": "video", "video": frames})
+        else:
+            # convert() returns a detached copy, so the file can be closed here.
+            with Image.open(file_path) as img:
+                content.append({"type": "image", "image": img.convert("RGB")})
+    content.append({"type": "text", "text": message})
+    # Append (do not reassign) so the history turns collected above are kept.
+    messages.append({"role": "user", "content": content})
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+        enable_thinking=thinking_mode,
+        processor_kwargs={
+            "max_num_frames": max_frames,
         }
+    ).to(model.device)
+    # Cast float tensors to bfloat16 (the dtype the model was loaded with);
+    # iterate over a snapshot because entries are rewritten in the loop.
+    for k, v in list(inputs.items()):
+        if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
+            inputs[k] = v.to(dtype=torch.bfloat16)
+    streamer = TextIteratorStreamer(
+        processor.tokenizer,
+        skip_prompt=True,
+        skip_special_tokens=True,
     )
+    do_sample = temperature > 0
+    generate_kwargs = {
+        **inputs,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": do_sample,
+        "streamer": streamer,
+    }
+    if do_sample:
+        # Sampling knobs only apply when sampling; passing temperature=0
+        # alongside do_sample=False is rejected or warned about by generate().
+        generate_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)
+    # Run generation on a worker thread so this generator can stream chunks.
+    thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
+    full_text = ""
+    for new_text in streamer:
+        full_text += new_text
+        yield normalize_response_text(full_text)
+    # The streamer is exhausted, so generation is finishing; reap the worker.
+    thread.join()
 @app.get("/", response_class=HTMLResponse)
 async def homepage():

index.html CHANGED Viewed

@@ -3,19 +3,20 @@
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
-    <title>MiniCPM-V | OpenBMB</title>
     <script src="https://cdn.tailwindcss.com"></script>
     <link href="https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700&family=Inter:wght@300;400;500;600&display=swap" rel="stylesheet">
     <script src="https://unpkg.com/lucide@latest"></script>
     <style>
         :root {
-            --bg: #0A0C10;
             --blue: #3B5BFF;
             --cyan: #27D4EA;
             --text: #FFFFFF;
-            --text-muted: #6E7681;
             --glass: rgba(255, 255, 255, 0.03);
-            --glass-border: rgba(255, 255, 255, 0.1);
         }
         body {
@@ -26,7 +27,7 @@
             margin: 0;
             display: flex;
             flex-direction: column;
-            overflow: hidden; /* Prevent body scroll */
         }
         h1, h2, h3 { font-family: 'Outfit', sans-serif; }
@@ -34,46 +35,53 @@
         .chat-scroll-area {
             flex: 1;
             overflow-y: auto;
-            padding-bottom: 120px; /* Space for floating input */
             -webkit-overflow-scrolling: touch;
         }
-        /* Modern Scrollbar */
-        .chat-scroll-area::-webkit-scrollbar {
-            width: 5px;
-        }
-        .chat-scroll-area::-webkit-scrollbar-track {
-            background: transparent;
-        }
-        .chat-scroll-area::-webkit-scrollbar-thumb {
-            background: rgba(255, 255, 255, 0.1);
-            border-radius: 10px;
-        }
         .message-bubble {
             max-width: 85%;
             animation: fadeIn 0.4s cubic-bezier(0.16, 1, 0.3, 1) forwards;
         }
         @keyframes fadeIn {
-            from { opacity: 0; transform: translateY(10px); }
             to { opacity: 1; transform: translateY(0); }
         }
         .user-message {
             background: linear-gradient(135deg, var(--blue), var(--cyan));
             color: #FFFFFF;
-            box-shadow: 0 10px 30px rgba(59, 91, 255, 0.2);
         }
         .bot-message {
-            background: rgba(255, 255, 255, 0.04);
             border: 1px solid var(--glass-border);
         }
         .typing-dot {
-            width: 4px;
-            height: 4px;
             background: var(--cyan);
             border-radius: 50%;
             animation: bounce 1.4s infinite ease-in-out;
@@ -87,94 +95,205 @@
         }
         .input-pill {
-            background: rgba(255, 255, 255, 0.05);
-            backdrop-filter: blur(20px);
-            -webkit-backdrop-filter: blur(20px);
             border: 1px solid var(--glass-border);
-            transition: all 0.3s ease;
         }
         .input-pill:focus-within {
-            border-color: var(--blue);
-            box-shadow: 0 0 30px rgba(59, 91, 255, 0.1);
         }
         .logo-glow {
-            filter: drop-shadow(0 0 10px rgba(39, 212, 234, 0.3));
         }
         .send-btn {
             background: linear-gradient(135deg, var(--blue), var(--cyan));
-            transition: transform 0.2s ease, opacity 0.2s ease;
         }
-        .send-btn:active { transform: scale(0.95); }
-        #user-input::placeholder { color: #555; }
     </style>
 </head>
 <body>
-    <!-- Minimalist Header -->
-    <header class="h-20 flex items-center justify-between px-6 md:px-12 shrink-0 z-50">
         <div class="flex items-center gap-4">
-            <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/1670387859384-633fe7784b362488336bbfad.png"
-                 alt="OpenBMB" class="w-10 h-10 logo-glow">
             <div>
-                <h1 class="text-xl font-bold tracking-tight">MiniCPM-V</h1>
-                <p class="text-[10px] text-muted uppercase tracking-[0.2em] font-medium">By OpenBMB</p>
             </div>
         </div>
-        <div class="hidden md:flex items-center gap-2 text-[10px] font-bold text-muted uppercase tracking-widest">
-            <span class="w-1.5 h-1.5 rounded-full bg-[#27D4EA] animate-pulse"></span>
-            Vision System Online
         </div>
     </header>
-    <!-- Chat Messages Scroll Area -->
-    <main id="chat-messages" class="chat-scroll-area px-4 md:px-0">
-        <div class="max-w-3xl mx-auto space-y-8 pt-4">
-            <!-- Bot Greeting -->
             <div class="flex gap-4 items-start">
-                <div class="bot-message p-6 rounded-3xl rounded-tl-none message-bubble shadow-2xl">
                     <p class="text-white/90 leading-relaxed text-[15px]">
-                        Welcome to <span class="font-bold text-[#27D4EA]">MiniCPM-V 4.6</span>.
-                        I can analyze images and videos with high efficiency.
                         <br><br>
-                        Drop a file below to begin.
                     </p>
                 </div>
             </div>
         </div>
     </main>
-    <!-- Floating Input Bar -->
     <div class="fixed bottom-0 left-0 right-0 p-6 md:p-10 pointer-events-none">
         <div class="max-w-3xl mx-auto pointer-events-auto">
-            <!-- Media Preview -->
-            <div id="preview-container" class="hidden mb-6 animate-in">
-                <div class="relative inline-block group">
-                    <img id="image-preview" src="" class="h-36 w-auto rounded-3xl border border-white/20 shadow-2xl hidden object-cover" />
-                    <video id="video-preview" class="h-36 w-auto rounded-3xl border border-white/20 shadow-2xl hidden object-cover" muted loop></video>
-                    <button id="cancel-file" class="absolute -top-3 -right-3 bg-white text-black rounded-full p-2 shadow-xl hover:bg-neutral-200 transition-all">
-                        <i data-lucide="x" class="w-4 h-4"></i>
-                    </button>
-                </div>
             </div>
-            <!-- Pill Input -->
-            <div class="input-pill rounded-[2.5rem] p-2 flex items-end gap-2 pr-3 shadow-2xl">
                 <div class="flex items-center">
-                    <input type="file" id="file-input" class="hidden" accept="image/*,video/*">
-                    <button id="upload-trigger" class="p-4 text-white/30 hover:text-[#27D4EA] transition-colors">
-                        <i data-lucide="paperclip" class="w-6 h-6"></i>
                     </button>
                 </div>
-                <textarea id="user-input" rows="1" placeholder="Type your message..."
-                          class="flex-1 bg-transparent border-none focus:ring-0 text-white py-4 px-1 resize-none max-h-40 scrollbar-none text-[16px] leading-relaxed"></textarea>
-                <button id="send-btn" class="send-btn w-12 h-12 text-white rounded-full flex items-center justify-center disabled:opacity-20 disabled:grayscale group shrink-0 mb-1">
-                    <i data-lucide="arrow-up" class="w-5 h-5 group-hover:scale-110 transition-transform" id="send-icon"></i>
                     <i data-lucide="loader-2" class="w-5 h-5 animate-spin hidden" id="loading-icon"></i>
                 </button>
             </div>
@@ -186,20 +305,33 @@
         lucide.createIcons();
-        const chatMessages = document.getElementById('chat-messages');
         const userInput = document.getElementById('user-input');
         const sendBtn = document.getElementById('send-btn');
         const fileInput = document.getElementById('file-input');
         const uploadTrigger = document.getElementById('upload-trigger');
         const previewContainer = document.getElementById('preview-container');
-        const imagePreview = document.getElementById('image-preview');
-        const videoPreview = document.getElementById('video-preview');
-        const cancelFile = document.getElementById('cancel-file');
         const sendIcon = document.getElementById('send-icon');
         const loadingIcon = document.getElementById('loading-icon');
-        let selectedFile = null;
         let client = null;
         async function init() {
             try {
@@ -208,119 +340,222 @@
         }
         init();
-        // UI Interactions
         uploadTrigger.onclick = () => fileInput.click();
         fileInput.onchange = (e) => {
-            const file = e.target.files[0];
-            if (file) {
-                selectedFile = file;
-                previewContainer.classList.remove('hidden');
-                const url = URL.createObjectURL(file);
-                if (file.type.startsWith('image/')) {
-                    imagePreview.src = url;
-                    imagePreview.classList.remove('hidden');
-                    videoPreview.classList.add('hidden');
-                } else {
-                    videoPreview.src = url;
-                    videoPreview.classList.remove('hidden');
-                    imagePreview.classList.add('hidden');
-                    videoPreview.play();
-                }
-            }
         };
-        cancelFile.onclick = () => {
-            selectedFile = null;
-            fileInput.value = '';
-            previewContainer.classList.add('hidden');
-            imagePreview.src = '';
-            videoPreview.src = '';
-        };
-        function appendMessage(role, text, mediaUrl = null, mediaType = null) {
             const div = document.createElement('div');
             div.className = `flex gap-4 items-start ${role === 'user' ? 'flex-row-reverse' : ''}`;
             let mediaHtml = '';
-            if (mediaUrl) {
-                if (mediaType.startsWith('image')) {
-                    mediaHtml = `<img src="${mediaUrl}" class="max-w-xs md:max-w-md rounded-3xl mb-4 border border-white/10" />`;
-                } else {
-                    mediaHtml = `<video src="${mediaUrl}" controls class="max-w-xs md:max-w-md rounded-3xl mb-4 border border-white/10"></video>`;
-                }
             }
             const bubbleClass = role === 'user' ? 'user-message' : 'bot-message';
             div.innerHTML = `
-                <div class="${bubbleClass} p-6 rounded-[2rem] ${role === 'user' ? 'rounded-tr-none' : 'rounded-tl-none'} message-bubble shadow-xl">
                     ${mediaHtml}
-                    <p class="leading-relaxed text-[15px] whitespace-pre-wrap font-medium">${text}</p>
                 </div>
             `;
-            // Get the inner container
-            const container = chatMessages.querySelector('.max-w-3xl');
-            container.appendChild(div);
-            // Smooth scroll to bottom
-            chatMessages.scrollTo({ top: chatMessages.scrollHeight, behavior: 'smooth' });
         }
         async function sendMessage() {
             const text = userInput.value.trim();
-            if (!text && !selectedFile) return;
             const content = text;
-            const file = selectedFile;
             userInput.value = '';
             userInput.style.height = 'auto';
-            const fileUrl = file ? URL.createObjectURL(file) : null;
-            const fileType = file ? file.type : null;
-            appendMessage('user', content, fileUrl, fileType);
-            cancelFile.click();
             sendIcon.classList.add('hidden');
             loadingIcon.classList.remove('hidden');
-            sendBtn.disabled = true;
-            const thinkingId = 'think-' + Date.now();
-            const thinkingDiv = document.createElement('div');
-            thinkingDiv.id = thinkingId;
-            thinkingDiv.className = 'flex gap-4 items-start';
-            thinkingDiv.innerHTML = `
-                <div class="bot-message p-6 rounded-[2rem] rounded-tl-none message-bubble flex items-center gap-4">
-                    <div class="flex gap-1.5">
-                        <div class="typing-dot"></div><div class="typing-dot"></div><div class="typing-dot"></div>
-                    </div>
-                </div>
-            `;
-            const container = chatMessages.querySelector('.max-w-3xl');
-            container.appendChild(thinkingDiv);
-            chatMessages.scrollTo({ top: chatMessages.scrollHeight, behavior: 'smooth' });
             try {
-                let fileData = file ? handle_file(file) : null;
-                const result = await client.predict("/predict", {
                     message: content,
-                    file: fileData,
-                    downsample_mode: "16x"
                 });
-                document.getElementById(thinkingId).remove();
-                appendMessage('bot', result.data);
             } catch (err) {
-                document.getElementById(thinkingId).remove();
-                appendMessage('bot', "The system encountered an error. Please check your file format and try again.");
             } finally {
-                sendIcon.classList.remove('hidden');
-                loadingIcon.classList.add('hidden');
-                sendBtn.disabled = false;
             }
         }
         sendBtn.onclick = sendMessage;
         userInput.onkeydown = (e) => {
             if (e.key === 'Enter' && !e.shiftKey) {
@@ -328,6 +563,12 @@
                 sendMessage();
             }
         };
     </script>
 </body>
 </html>

 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
+    <title>MiniCPM-V | OpenBMB Premium</title>
     <script src="https://cdn.tailwindcss.com"></script>
     <link href="https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700&family=Inter:wght@300;400;500;600&display=swap" rel="stylesheet">
     <script src="https://unpkg.com/lucide@latest"></script>
     <style>
         :root {
+            --bg: #05070A;
             --blue: #3B5BFF;
             --cyan: #27D4EA;
             --text: #FFFFFF;
+            --text-muted: #8B949E;
             --glass: rgba(255, 255, 255, 0.03);
+            --glass-border: rgba(255, 255, 255, 0.08);
+            --accent: #3B5BFF;
         }
         body {
             margin: 0;
             display: flex;
             flex-direction: column;
+            overflow: hidden;
         }
         h1, h2, h3 { font-family: 'Outfit', sans-serif; }
         .chat-scroll-area {
             flex: 1;
             overflow-y: auto;
+            padding-bottom: 140px;
             -webkit-overflow-scrolling: touch;
+            scroll-behavior: smooth;
         }
+        .chat-scroll-area::-webkit-scrollbar { width: 4px; }
+        .chat-scroll-area::-webkit-scrollbar-track { background: transparent; }
+        .chat-scroll-area::-webkit-scrollbar-thumb { background: rgba(255, 255, 255, 0.1); border-radius: 10px; }
         .message-bubble {
             max-width: 85%;
             animation: fadeIn 0.4s cubic-bezier(0.16, 1, 0.3, 1) forwards;
+            position: relative;
         }
         @keyframes fadeIn {
+            from { opacity: 0; transform: translateY(15px); }
             to { opacity: 1; transform: translateY(0); }
         }
         .user-message {
             background: linear-gradient(135deg, var(--blue), var(--cyan));
             color: #FFFFFF;
+            box-shadow: 0 8px 25px rgba(59, 91, 255, 0.15);
+            border-radius: 24px 24px 4px 24px;
         }
         .bot-message {
+            background: rgba(255, 255, 255, 0.03);
             border: 1px solid var(--glass-border);
+            border-radius: 24px 24px 24px 4px;
+            backdrop-filter: blur(10px);
+        }
+        .thinking-block {
+            background: rgba(59, 91, 255, 0.05);
+            border-left: 3px solid var(--blue);
+            padding: 12px 16px;
+            margin-bottom: 12px;
+            border-radius: 4px 12px 12px 4px;
+            font-size: 14px;
+            color: var(--text-muted);
+            font-style: italic;
         }
         .typing-dot {
+            width: 4px; height: 4px;
             background: var(--cyan);
             border-radius: 50%;
             animation: bounce 1.4s infinite ease-in-out;
         }
         .input-pill {
+            background: rgba(255, 255, 255, 0.04);
+            backdrop-filter: blur(25px);
+            -webkit-backdrop-filter: blur(25px);
             border: 1px solid var(--glass-border);
+            transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
         }
         .input-pill:focus-within {
+            border-color: rgba(59, 91, 255, 0.4);
+            background: rgba(255, 255, 255, 0.06);
+            box-shadow: 0 0 40px rgba(59, 91, 255, 0.08);
         }
         .logo-glow {
+            filter: drop-shadow(0 0 15px rgba(39, 212, 234, 0.4));
         }
         .send-btn {
             background: linear-gradient(135deg, var(--blue), var(--cyan));
+            transition: all 0.3s ease;
         }
+        .send-btn:hover:not(:disabled) { transform: scale(1.05); filter: brightness(1.1); }
+        .send-btn:active:not(:disabled) { transform: scale(0.95); }
+        .settings-panel {
+            background: rgba(10, 12, 16, 0.95);
+            backdrop-filter: blur(30px);
+            border-left: 1px solid var(--glass-border);
+            transition: transform 0.4s cubic-bezier(0.16, 1, 0.3, 1);
+        }
+        .control-slider {
+            -webkit-appearance: none;
+            width: 100%;
+            height: 4px;
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 2px;
+            outline: none;
+        }
+        .control-slider::-webkit-slider-thumb {
+            -webkit-appearance: none;
+            width: 12px; height: 12px;
+            background: var(--blue);
+            border-radius: 50%;
+            cursor: pointer;
+            transition: scale 0.2s;
+        }
+        .control-slider::-webkit-slider-thumb:hover { scale: 1.2; }
+        .toggle-switch {
+            width: 36px; height: 20px;
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 10px;
+            position: relative;
+            cursor: pointer;
+            transition: background 0.3s;
+        }
+        .toggle-switch.active { background: var(--blue); }
+        .toggle-switch::after {
+            content: '';
+            position: absolute;
+            top: 2px; left: 2px;
+            width: 16px; height: 16px;
+            background: white;
+            border-radius: 50%;
+            transition: transform 0.3s;
+        }
+        .toggle-switch.active::after { transform: translateX(16px); }
+        .media-preview-item {
+            position: relative;
+            animation: scaleIn 0.3s ease-out;
+        }
+        @keyframes scaleIn { from { scale: 0.8; opacity: 0; } to { scale: 1; opacity: 1; } }
+        .shimmer {
+            background: linear-gradient(90deg, transparent, rgba(255,255,255,0.05), transparent);
+            background-size: 200% 100%;
+            animation: shimmer 2s infinite;
+        }
+        @keyframes shimmer { 0% { background-position: -200% 0; } 100% { background-position: 200% 0; } }
     </style>
 </head>
 <body>
+    <!-- Header -->
+    <header class="h-20 flex items-center justify-between px-6 md:px-12 shrink-0 z-50 border-b border-white/5">
         <div class="flex items-center gap-4">
+            <div class="relative">
+                <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/1670387859384-633fe7784b362488336bbfad.png"
+                     alt="OpenBMB" class="w-10 h-10 logo-glow">
+                <div class="absolute -bottom-1 -right-1 w-3 h-3 bg-green-500 rounded-full border-2 border-[var(--bg)]"></div>
+            </div>
             <div>
+                <h1 class="text-xl font-bold tracking-tight bg-clip-text text-transparent bg-gradient-to-r from-white to-white/60">MiniCPM-V</h1>
+                <p class="text-[10px] text-muted uppercase tracking-[0.2em] font-bold opacity-50">By OpenBMB</p>
             </div>
         </div>
+        <div class="flex items-center gap-6">
+            <div class="hidden md:flex items-center gap-2 text-[10px] font-bold text-muted uppercase tracking-widest bg-white/5 px-3 py-1.5 rounded-full border border-white/5">
+                <span class="w-1.5 h-1.5 rounded-full bg-[#27D4EA] animate-pulse"></span>
+                v4.6 Intelligence Engine
+            </div>
+            <button id="toggle-settings" class="p-2 text-white/40 hover:text-white transition-colors relative">
+                <i data-lucide="sliders-horizontal" class="w-5 h-5"></i>
+            </button>
         </div>
     </header>
+    <!-- Settings Panel (Side) -->
+    <div id="settings-panel" class="fixed top-0 right-0 h-full w-80 z-[100] translate-x-full settings-panel p-8 flex flex-col gap-8 shadow-[-20px_0_50px_rgba(0,0,0,0.5)]">
+        <div class="flex items-center justify-between">
+            <h2 class="text-lg font-bold">Engine Settings</h2>
+            <button id="close-settings" class="text-white/40 hover:text-white"><i data-lucide="x" class="w-5 h-5"></i></button>
+        </div>
+        <div class="space-y-6">
+            <div class="flex items-center justify-between">
+                <span class="text-sm font-medium text-white/70">Thinking Mode</span>
+                <div id="thinking-toggle" class="toggle-switch active"></div>
+            </div>
+            <div class="flex items-center justify-between">
+                <span class="text-sm font-medium text-white/70">Streaming</span>
+                <div id="streaming-toggle" class="toggle-switch active"></div>
+            </div>
+            <div class="space-y-3">
+                <div class="flex justify-between text-xs font-bold text-white/40 uppercase tracking-widest">
+                    <span>Temperature</span>
+                    <span id="temp-val">0.7</span>
+                </div>
+                <input type="range" id="temp-slider" min="0" max="2" step="0.01" value="0.7" class="control-slider">
+            </div>
+            <div class="space-y-3">
+                <div class="flex justify-between text-xs font-bold text-white/40 uppercase tracking-widest">
+                    <span>Max Tokens</span>
+                    <span id="tokens-val">1024</span>
+                </div>
+                <input type="range" id="tokens-slider" min="64" max="4096" step="64" value="1024" class="control-slider">
+            </div>
+            <div class="space-y-3">
+                <div class="flex justify-between text-xs font-bold text-white/40 uppercase tracking-widest">
+                    <span>Top-P</span>
+                    <span id="p-val">0.8</span>
+                </div>
+                <input type="range" id="p-slider" min="0" max="1" step="0.05" value="0.8" class="control-slider">
+            </div>
+            <button onclick="clearHistory()" class="w-full py-4 rounded-2xl bg-red-500/10 border border-red-500/20 text-red-500 text-sm font-bold hover:bg-red-500/20 transition-all flex items-center justify-center gap-2">
+                <i data-lucide="trash-2" class="w-4 h-4"></i>
+                Clear Conversation
+            </button>
+        </div>
+        <div class="mt-auto p-4 bg-white/5 rounded-2xl border border-white/5 text-[10px] text-white/30 leading-relaxed">
+            MiniCPM-V 4.6 is a multimodal large language model with strong OCR and reasoning capabilities.
+        </div>
+    </div>
+    <!-- Chat Area -->
+    <main id="chat-messages" class="chat-scroll-area px-4">
+        <div class="max-w-3xl mx-auto space-y-8 pt-8" id="chat-container">
+            <!-- Greeting -->
             <div class="flex gap-4 items-start">
+                <div class="bot-message p-6 message-bubble shadow-2xl">
                     <p class="text-white/90 leading-relaxed text-[15px]">
+                        Welcome to <span class="font-bold text-[#27D4EA]">MiniCPM-V 4.6 Premium</span>.
+                        I've been upgraded with <span class="text-white font-bold">Thinking Mode</span> and <span class="text-white font-bold">Multimodal Streaming</span>.
                         <br><br>
+                        Upload multiple images or a video to see me in action.
                     </p>
                 </div>
             </div>
         </div>
     </main>
+    <!-- Input Area -->
     <div class="fixed bottom-0 left-0 right-0 p-6 md:p-10 pointer-events-none">
         <div class="max-w-3xl mx-auto pointer-events-auto">
+            <!-- Multi-file Preview -->
+            <div id="preview-container" class="hidden mb-6 flex flex-wrap gap-3 max-h-40 overflow-y-auto p-2 scrollbar-none">
+                <!-- Preview items will be injected here -->
             </div>
+            <!-- Input Bar -->
+            <div class="input-pill rounded-[2rem] p-2 flex items-end gap-2 pr-3 shadow-2xl overflow-hidden">
                 <div class="flex items-center">
+                    <input type="file" id="file-input" class="hidden" accept="image/*,video/*" multiple>
+                    <button id="upload-trigger" class="p-4 text-white/30 hover:text-[#27D4EA] transition-colors relative group">
+                        <i data-lucide="plus" class="w-6 h-6"></i>
+                        <span id="file-count-badge" class="hidden absolute top-3 right-3 w-4 h-4 bg-blue-600 text-[9px] font-bold rounded-full flex items-center justify-center">0</span>
                     </button>
                 </div>
+                <textarea id="user-input" rows="1" placeholder="Describe what you see..."
+                          class="flex-1 bg-transparent border-none focus:ring-0 text-white py-4 px-1 resize-none max-h-48 scrollbar-none text-[16px] leading-relaxed"></textarea>
+                <button id="send-btn" class="send-btn w-12 h-12 text-white rounded-full flex items-center justify-center disabled:opacity-20 disabled:grayscale shrink-0 mb-1">
+                    <i data-lucide="arrow-up" class="w-5 h-5" id="send-icon"></i>
                     <i data-lucide="loader-2" class="w-5 h-5 animate-spin hidden" id="loading-icon"></i>
                 </button>
             </div>
         lucide.createIcons();
+        // DOM Elements
+        // Cached references to the static nodes that the handlers below read and mutate.
+        const chatContainer = document.getElementById('chat-container');
+        const chatScrollArea = document.getElementById('chat-messages');
         const userInput = document.getElementById('user-input');
         const sendBtn = document.getElementById('send-btn');
         const fileInput = document.getElementById('file-input');
         const uploadTrigger = document.getElementById('upload-trigger');
         const previewContainer = document.getElementById('preview-container');
+        const fileCountBadge = document.getElementById('file-count-badge');
         const sendIcon = document.getElementById('send-icon');
         const loadingIcon = document.getElementById('loading-icon');
+        const settingsPanel = document.getElementById('settings-panel');
+        const toggleSettings = document.getElementById('toggle-settings');
+        const closeSettings = document.getElementById('close-settings');
+        const thinkingToggle = document.getElementById('thinking-toggle');
+        const streamingToggle = document.getElementById('streaming-toggle');
+        const tempSlider = document.getElementById('temp-slider');
+        const tokensSlider = document.getElementById('tokens-slider');
+        const pSlider = document.getElementById('p-slider');
+        // Mutable UI state shared by the handlers below.
+        // Files picked in the file input that have not been sent yet.
+        let selectedFiles = [];
         // NOTE(review): presumably assigned inside init(); the assignment is not visible in this hunk.
         let client = null;
+        // Tracks whether the side panel is slid into view (written by the open/close handlers).
+        let isSettingsOpen = false;
+        // [userText, botAnswer] pairs, sent as `history` with every /predict call.
+        let chatHistory = [];
+        // The in-flight client.submit() job, or null when idle; used by the stop button to cancel.
+        let currentJob = null;
         async function init() {
             try {
         }
         init();
+        // Settings Logic
+        toggleSettings.onclick = () => {
+            isSettingsOpen = true;
+            settingsPanel.classList.remove('translate-x-full');
+        };
+        closeSettings.onclick = () => {
+            isSettingsOpen = false;
+            settingsPanel.classList.add('translate-x-full');
+        };
+        const setupToggle = (el) => {
+            el.onclick = () => el.classList.toggle('active');
+        };
+        setupToggle(thinkingToggle);
+        setupToggle(streamingToggle);
+        const setupSlider = (slider, valEl) => {
+            slider.oninput = () => valEl.textContent = slider.value;
+        };
+        setupSlider(tempSlider, document.getElementById('temp-val'));
+        setupSlider(tokensSlider, document.getElementById('tokens-val'));
+        setupSlider(pSlider, document.getElementById('p-val'));
+        // File Handling
         uploadTrigger.onclick = () => fileInput.click();
         fileInput.onchange = (e) => {
+            const files = Array.from(e.target.files);
+            selectedFiles = [...selectedFiles, ...files];
+            renderPreviews();
         };
+        function renderPreviews() {
+            previewContainer.innerHTML = '';
+            if (selectedFiles.length > 0) {
+                previewContainer.classList.remove('hidden');
+                fileCountBadge.classList.remove('hidden');
+                fileCountBadge.textContent = selectedFiles.length;
+                selectedFiles.forEach((file, index) => {
+                    const url = URL.createObjectURL(file);
+                    const item = document.createElement('div');
+                    item.className = 'media-preview-item h-24 w-24 rounded-2xl overflow-hidden border border-white/20 shadow-lg';
+                    if (file.type.startsWith('image/')) {
+                        item.innerHTML = `<img src="${url}" class="w-full h-full object-cover">`;
+                    } else {
+                        item.innerHTML = `<video src="${url}" class="w-full h-full object-cover" muted></video><div class="absolute inset-0 flex items-center justify-center bg-black/20"><i data-lucide="play" class="w-6 h-6 text-white"></i></div>`;
+                    }
+                    const removeBtn = document.createElement('button');
+                    removeBtn.className = 'absolute -top-1 -right-1 bg-red-500 text-white rounded-full p-1 shadow-lg scale-75';
+                    removeBtn.innerHTML = '<i data-lucide="x" class="w-3 h-3"></i>';
+                    removeBtn.onclick = (e) => {
+                        e.stopPropagation();
+                        selectedFiles.splice(index, 1);
+                        renderPreviews();
+                    };
+                    item.appendChild(removeBtn);
+                    previewContainer.appendChild(item);
+                });
+                lucide.createIcons();
+            } else {
+                previewContainer.classList.add('hidden');
+                fileCountBadge.classList.add('hidden');
+            }
+        }
+        // Message Handling
+        function appendMessage(role, text = '', files = []) {
             const div = document.createElement('div');
             div.className = `flex gap-4 items-start ${role === 'user' ? 'flex-row-reverse' : ''}`;
             let mediaHtml = '';
+            if (files.length > 0) {
+                mediaHtml = '<div class="flex flex-wrap gap-2 mb-4">';
+                files.forEach(file => {
+                    const url = typeof file === 'string' ? file : URL.createObjectURL(file);
+                    const type = typeof file === 'string' ? (file.match(/\.(mp4|webm|mkv)/i) ? 'video' : 'image') : (file.type.startsWith('video') ? 'video' : 'image');
+                    if (type === 'image') {
+                        mediaHtml += `<img src="${url}" class="h-48 rounded-2xl border border-white/10 shadow-lg object-contain bg-black/20" />`;
+                    } else {
+                        mediaHtml += `<video src="${url}" controls class="h-48 rounded-2xl border border-white/10 shadow-lg" />`;
+                    }
+                });
+                mediaHtml += '</div>';
             }
             const bubbleClass = role === 'user' ? 'user-message' : 'bot-message';
             div.innerHTML = `
+                <div class="${bubbleClass} p-6 message-bubble shadow-xl">
                     ${mediaHtml}
+                    <div class="thinking-container hidden"></div>
+                    <div class="content-container leading-relaxed text-[15px] whitespace-pre-wrap font-medium">${text}</div>
                 </div>
             `;
+            chatContainer.appendChild(div);
+            chatScrollArea.scrollTo({ top: chatScrollArea.scrollHeight, behavior: 'smooth' });
+            return div;
+        }
+        function updateBotMessage(div, fullText) {
+            const thinkingContainer = div.querySelector('.thinking-container');
+            const contentContainer = div.querySelector('.content-container');
+            const thinkMatch = fullText.match(/<think>([\s\S]*?)<\/think>/);
+            const thinkingText = thinkMatch ? thinkMatch[1].trim() : (fullText.includes('<think>') && !fullText.includes('</think>') ? fullText.split('<think>')[1].trim() : '');
+            const actualText = fullText.replace(/<think>[\s\S]*?<\/think>/, '').trim();
+            if (thinkingText) {
+                thinkingContainer.classList.remove('hidden');
+                thinkingContainer.innerHTML = `<div class="thinking-block">${thinkingText}</div>`;
+            } else {
+                thinkingContainer.classList.add('hidden');
+            }
+            contentContainer.textContent = actualText;
+            chatScrollArea.scrollTo({ top: chatScrollArea.scrollHeight, behavior: 'smooth' });
+            return actualText;
         }
         async function sendMessage() {
             const text = userInput.value.trim();
+            if (!text && selectedFiles.length === 0) return;
+            const filesToUpload = [...selectedFiles];
             const content = text;
             userInput.value = '';
             userInput.style.height = 'auto';
+            appendMessage('user', content, filesToUpload);
+            selectedFiles = [];
+            renderPreviews();
             sendIcon.classList.add('hidden');
             loadingIcon.classList.remove('hidden');
+            sendBtn.innerHTML = '<i data-lucide="square" class="w-5 h-5 fill-white"></i>';
+            sendBtn.classList.remove('send-btn');
+            sendBtn.classList.add('bg-red-500/20', 'hover:bg-red-500/40', 'border', 'border-red-500/50');
+            lucide.createIcons();
+            let isStopped = false;
+            sendBtn.onclick = () => {
+                if (currentJob) {
+                    currentJob.cancel();
+                    isStopped = true;
+                    resetSendBtn();
+                }
+            };
+            // Bot response placeholder
+            const botDiv = appendMessage('bot', '');
+            const contentContainer = botDiv.querySelector('.content-container');
+            contentContainer.innerHTML = '<div class="flex gap-1.5 py-2"><div class="typing-dot"></div><div class="typing-dot"></div><div class="typing-dot"></div></div>';
             try {
+                const gradioFiles = filesToUpload.length > 0 ? filesToUpload.map(f => handle_file(f)) : null;
+                currentJob = client.submit("/predict", {
                     message: content,
+                    history: chatHistory,
+                    files: gradioFiles,
+                    thinking_mode: thinkingToggle.classList.contains('active'),
+                    max_new_tokens: parseInt(tokensSlider.value),
+                    temperature: parseFloat(tempSlider.value),
+                    top_p: parseFloat(pSlider.value),
+                    top_k: 100,
+                    max_frames: 64
                 });
+                let finalAnswer = "";
+                for await (const msg of currentJob) {
+                    if (isStopped) break;
+                    if (msg.type === "data" && msg.data) {
+                        const chunk = msg.data[0];
+                        finalAnswer = updateBotMessage(botDiv, chunk);
+                    }
+                }
+                if (!isStopped) {
+                    chatHistory.push([content, finalAnswer]);
+                }
             } catch (err) {
+                console.error(err);
+                if (!isStopped) {
+                    contentContainer.textContent = "I encountered an error while processing your request. Please try again.";
+                }
             } finally {
+                resetSendBtn();
+                currentJob = null;
             }
         }
+        function resetSendBtn() {
+            sendBtn.innerHTML = '<i data-lucide="arrow-up" class="w-5 h-5" id="send-icon"></i>';
+            sendBtn.className = 'send-btn w-12 h-12 text-white rounded-full flex items-center justify-center disabled:opacity-20 disabled:grayscale shrink-0 mb-1';
+            sendBtn.onclick = sendMessage;
+            lucide.createIcons();
+        }
+        // Exposed on `window` because the "Clear Conversation" button in the settings
+        // panel invokes it through an inline onclick attribute.
+        window.clearHistory = function() {
+            // Drop the [user, bot] pairs that would be sent as `history` with the next request.
+            chatHistory = [];
+            // Replace every bubble in the chat with a single notice from the bot.
+            chatContainer.innerHTML = `
+                <div class="flex gap-4 items-start">
+                    <div class="bot-message p-6 message-bubble shadow-2xl">
+                        <p class="text-white/90 leading-relaxed text-[15px]">
+                            History cleared. How can I help you today?
+                        </p>
+                    </div>
+                </div>
+            `;
+            // Reuse the close button's handler to slide the settings panel away.
+            closeSettings.click();
+        }
         sendBtn.onclick = sendMessage;
         userInput.onkeydown = (e) => {
             if (e.key === 'Enter' && !e.shiftKey) {
                 sendMessage();
             }
         };
+        // Auto-resize textarea
+        // Collapse to 'auto' first so scrollHeight is measured from the current content;
+        // this lets the box shrink again when text is deleted, not only grow.
+        userInput.oninput = () => {
+            userInput.style.height = 'auto';
+            userInput.style.height = userInput.scrollHeight + 'px';
+        };
     </script>
 </body>
 </html>

requirements.txt CHANGED Viewed

@@ -8,3 +8,4 @@ pillow
 av
 accelerate
 sentencepiece

 av
 accelerate
 sentencepiece
+modelscope-studio