MiniCPM-V-4.6-Demo

Running on Zero

App Files Files Community

userisuser Cursor commited on 6 days ago

Commit

1b5e2bb

1 Parent(s): b74a8c4

Merge PR2 updated Gradio server app

Browse files

Thanks to @akhaliq for the contribution.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (4) hide show

README.md +1 -2
app.py +381 -32
index.html +940 -0
requirements.txt +9 -12

README.md CHANGED Viewed

@@ -4,12 +4,11 @@ emoji: 🪐
 colorFrom: indigo
 colorTo: pink
 sdk: gradio
-sdk_version: 5.50.0
 python_version: "3.12"
 app_file: app.py
 models:
 - openbmb/MiniCPM-V-4.6
-- openbmb/MiniCPM-V-4.6-Thinking
 pinned: true
 short_description: MiniCPM-V 4.6 Ultra-Efficient Multimodal AI
 ---

 colorFrom: indigo
 colorTo: pink
 sdk: gradio
+sdk_version: 6.14.0
 python_version: "3.12"
 app_file: app.py
 models:
 - openbmb/MiniCPM-V-4.6
 pinned: true
 short_description: MiniCPM-V 4.6 Ultra-Efficient Multimodal AI
 ---

app.py CHANGED Viewed

@@ -1,35 +1,384 @@
 import os
 import spaces
-from v46 import app as v46_app
-INSTRUCT_MODEL_ID = os.environ.get("V46_INSTRUCT_MODEL_ID", "openbmb/MiniCPM-V-4.6")
-THINKING_MODEL_ID = os.environ.get("V46_THINKING_MODEL_ID", "openbmb/MiniCPM-V-4.6-Thinking")
-DEVICE = os.environ.get("V46_DEVICE", "cuda")
-DEFAULT_THINKING = os.environ.get("V46_DEFAULT_THINKING", "0") == "1"
-GPU_DURATION = int(os.environ.get("V46_GPU_DURATION", "300"))
-print(
-    f"[official-space] loading models at module startup: "
-    f"instruct={INSTRUCT_MODEL_ID}, thinking={THINKING_MODEL_ID}, device={DEVICE}",
-    flush=True,
-)
-v46_app.load_models(
-    instruct_path=INSTRUCT_MODEL_ID,
-    thinking_path=THINKING_MODEL_ID,
-    device=DEVICE,
-)
-# ZeroGPU docs recommend placing models on cuda at module level and
-# decorating GPU-dependent callbacks.
-v46_app.native_chat_respond = spaces.GPU(duration=GPU_DURATION)(v46_app.native_chat_respond)
-v46_app.native_fewshot_respond = spaces.GPU(duration=GPU_DURATION)(v46_app.native_fewshot_respond)
-demo = v46_app.build_ui(v46_app.DEFAULT_MODEL_NAME, default_thinking=DEFAULT_THINKING)
-demo.queue(api_open=False).launch(
-    share=False,
-    show_api=False,
-    server_name="0.0.0.0",
-    allowed_paths=[v46_app.UPLOAD_LOG_DIR],
-    app_kwargs=v46_app.http_request_logging_app_kwargs(),
-)

 import os
+import torch
+import re
+import av
+import uuid
+import copy
+import threading
+import time
+import shutil
+from PIL import Image
+from transformers import AutoProcessor, MiniCPMV4_6ForConditionalGeneration, TextIteratorStreamer
+from gradio import Server
+from gradio.data_classes import FileData
+from fastapi.responses import HTMLResponse
+import logging
+# Silence asyncio noise from ZeroGPU cleanup
+logging.getLogger("asyncio").setLevel(logging.CRITICAL)
+from starlette.middleware import Middleware
+import hashlib
+import base64
+import json
+# ---------- Logging Middleware ----------
+def _headers_from_asgi(raw_headers) -> list[dict]:
+    headers = []
+    for raw_key, raw_value in raw_headers or []:
+        headers.append({
+            "name": raw_key.decode("latin-1", errors="replace"),
+            "value": raw_value.decode("latin-1", errors="replace"),
+        })
+    return headers
+def _header_value(headers: list[dict], name: str) -> str:
+    name = name.lower()
+    for header in headers:
+        if header["name"].lower() == name:
+            return header["value"]
+    return ""
+def _body_text(data: bytes, content_type: str) -> str | None:
+    if not data: return ""
+    lower_type = (content_type or "").lower()
+    if "text/" in lower_type or "json" in lower_type or "x-www-form-urlencoded" in lower_type:
+        return data.decode("utf-8", errors="replace")
+    return None
+def _body_record(data: bytes, content_type: str) -> dict:
+    return {
+        "size": len(data),
+        "sha256": hashlib.sha256(data).hexdigest() if data else "",
+        "base64": base64.b64encode(data).decode("ascii") if data else "",
+        "text": _body_text(data, content_type),
+    }
+def _append_http_log(record: dict) -> None:
+    os.makedirs(os.path.dirname(HTTP_LOG_FILE), exist_ok=True)
+    line = json.dumps(record, ensure_ascii=False, separators=(",", ":"))
+    with HTTP_LOG_LOCK:
+        with open(HTTP_LOG_FILE, "a", encoding="utf-8") as f:
+            f.write(line + "\n")
+class HTTPRequestLogMiddleware:
+    def __init__(self, app):
+        self.app = app
+    async def __call__(self, scope, receive, send):
+        if scope.get("type") != "http":
+            await self.app(scope, receive, send)
+            return
+        started = time.time()
+        request_id = uuid.uuid4().hex[:12]
+        request_body = bytearray()
+        response_headers = []
+        response_body = bytearray()
+        status_code = None
+        async def receive_wrapper():
+            message = await receive()
+            if message.get("type") == "http.request":
+                chunk = message.get("body", b"")
+                if chunk: request_body.extend(chunk)
+            return message
+        async def send_wrapper(message):
+            nonlocal status_code, response_headers
+            if message.get("type") == "http.response.start":
+                status_code = message.get("status")
+                response_headers = _headers_from_asgi(message.get("headers", []))
+            elif message.get("type") == "http.response.body":
+                chunk = message.get("body", b"")
+                if chunk: response_body.extend(chunk)
+            await send(message)
+        try:
+            await self.app(scope, receive_wrapper, send_wrapper)
+        finally:
+            request_headers = _headers_from_asgi(scope.get("headers", []))
+            client = scope.get("client") or (None, None)
+            record = {
+                "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(started)),
+                "request_id": request_id,
+                "client_id": _header_value(request_headers, "x-v46-client-id"),
+                "method": scope.get("method"),
+                "path": scope.get("path"),
+                "status_code": status_code,
+                "duration_ms": round((time.time() - started) * 1000, 2),
+            }
+            try:
+                _append_http_log(record)
+            except Exception as e:
+                print(f"Logging error: {e}")
 import spaces
+from typing import Generator
+# ---------- Globals & Model Loading ----------
+MODEL_ID = "openbmb/MiniCPM-V-4.6"
+print(f"Loading processor: {MODEL_ID}")
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+print(f"Loading model: {MODEL_ID}")
+model = MiniCPMV4_6ForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="sdpa",
+    trust_remote_code=True,
+    device_map="cuda"
+).eval()
+# ---------- Logging & Helper Functions ----------
+PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
+LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
+UPLOAD_LOG_DIR = os.path.join(LOG_DIR, "uploads")
+HTTP_LOG_FILE = os.path.join(LOG_DIR, "http_requests.jsonl")
+RAW_OUTPUT_LOG_FILE = os.path.join(LOG_DIR, "raw_model_outputs.jsonl")
+HTTP_LOG_LOCK = threading.Lock()
+RAW_OUTPUT_LOG_LOCK = threading.Lock()
+def _append_raw_output_log(record: dict) -> None:
+    os.makedirs(os.path.dirname(RAW_OUTPUT_LOG_FILE), exist_ok=True)
+    line = json.dumps(record, ensure_ascii=False, separators=(",", ":"))
+    with RAW_OUTPUT_LOG_LOCK:
+        with open(RAW_OUTPUT_LOG_FILE, "a", encoding="utf-8") as f:
+            f.write(line + "\n")
+def log_raw_model_output(session_id: str, **record) -> None:
+    payload = {
+        "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "session_id": session_id,
+        **record,
+    }
+    try:
+        _append_raw_output_log(payload)
+    except Exception as e:
+        print(f"Logging error: {e}")
+def load_video(video_path, max_frames=64):
+    """Fast video loading using PyAV timestamp seeking."""
+    try:
+        container = av.open(video_path)
+        stream = container.streams.video[0]
+        stream.thread_count = 8
+        duration = stream.duration
+        if duration is None or duration <= 0:
+            frames = [f.to_image() for f in container.decode(video=0)]
+            if len(frames) > max_frames:
+                indices = [int(i * len(frames) / max_frames) for i in range(max_frames)]
+                return [frames[i] for i in indices]
+            return frames
+        indices = [int(i * duration / max_frames) for i in range(max_frames)]
+        frames = []
+        for ts in indices:
+            container.seek(ts, stream=stream)
+            for frame in container.decode(video=0):
+                frames.append(frame.to_image())
+                break
+        container.close()
+        return frames
+    except Exception as e:
+        print(f"Error loading video: {e}")
+        return None
+def persist_uploaded_files(files: list, session_id: str) -> list:
+    """Copy Gradio temp uploads into the project log directory."""
+    if not files: return []
+    dest_dir = os.path.join(UPLOAD_LOG_DIR, session_id or "session")
+    os.makedirs(dest_dir, exist_ok=True)
+    persisted = []
+    for f in files:
+        src = f["path"] if isinstance(f, dict) else f
+        if not os.path.isfile(src):
+            persisted.append(src)
+            continue
+        base = os.path.basename(src)
+        stamp = time.strftime("%Y%m%dT%H%M%SZ", time.gmtime())
+        dest = os.path.join(dest_dir, f"{stamp}-{uuid.uuid4().hex[:8]}-{base}")
+        shutil.copy2(src, dest)
+        persisted.append(dest)
+    return persisted
+def normalize_response_text(text: str) -> str:
+    """Robust conversion of literal \n to newlines while protecting code/LaTeX."""
+    if not isinstance(text, str) or "\\" not in text:
+        return text
+    protected = {}
+    counter = [0]
+    def _convert(v):
+        v = re.sub(r"(?<!\\)(?:\\r\\n|\\n|\\r){2,}", lambda m: "\n" * len(re.findall(r"\\n|\\r", m.group(0))), v)
+        v = re.sub(r"(?<!\\)\\r\\n", "\n", v)
+        v = re.sub(r"(?<!\\)\\n(?![a-zA-Z])", "\n", v)
+        return v
+    def _protect(m):
+        key = f"\x00P{counter[0]}\x00"
+        counter[0] += 1
+        protected[key] = m.group(0)
+        return key
+    res = text
+    res = re.sub(r"```[\s\S]*?```", lambda m: _protect(re.match(r"```[\s\S]*?```", _convert(m.group(0)))), res) # Simplified for parity
+    res = re.sub(r"`[^`]+`", _protect, res)
+    res = _convert(res)
+    for k, v in protected.items(): res = res.replace(k, v)
+    return res
+# ---------- Inference Endpoint ----------
+demo = Server()
+@demo.api()
+@spaces.GPU(duration=120)
+def predict(
+    message: str,
+    history: list[list] = None,
+    files: list[FileData] = None,
+    thinking_mode: bool = True,
+    max_new_tokens: int = 1024,
+    temperature: float = 0.7,
+    top_p: float = 0.8,
+    top_k: int = 100,
+    max_frames: int = 64,
+    generation_mode: str = "Sampling"
+) -> Generator[str, None, None]:
+    """
+    Streaming inference endpoint with history support.
+    """
+    session_id = str(uuid.uuid4())
+    # Persist files in background to avoid blocking user (parity audit)
+    if files:
+        threading.Thread(target=persist_uploaded_files, args=(files, session_id), daemon=True).start()
+    messages = []
+    # Process history
+    if history:
+        for turn in history:
+            # history turn is [user_text, assistant_text, [optional_file_paths]]
+            user_text = turn[0]
+            assistant_text = turn[1]
+            turn_files = turn[2] if len(turn) > 2 else []
+            h_content = []
+            if turn_files:
+                for f_path in turn_files:
+                    # In history, we don't have mime_type, so we check extension
+                    ext = os.path.splitext(f_path)[1].lower()
+                    if ext in {".mp4", ".mkv", ".mov", ".avi", ".webm"}:
+                        v_frames = load_video(f_path, max_frames=max_frames)
+                        if v_frames:
+                            h_content.append({"type": "video", "video": v_frames})
+                        else:
+                            h_content.append({"type": "video", "path": f_path})
+                    else:
+                        try:
+                            img = Image.open(f_path).convert("RGB")
+                            h_content.append({"type": "image", "image": img})
+                        except Exception:
+                            v_frames = load_video(f_path, max_frames=max_frames)
+                            if v_frames:
+                                h_content.append({"type": "video", "video": v_frames})
+                            else:
+                                h_content.append({"type": "video", "path": f_path})
+            if user_text:
+                h_content.append({"type": "text", "text": user_text})
+            if h_content:
+                messages.append({"role": "user", "content": h_content})
+            if assistant_text:
+                messages.append({"role": "assistant", "content": [{"type": "text", "text": assistant_text}]})
+    content = []
+    if files:
+        for f in files:
+            file_path = f["path"]
+            try:
+                # Try image first
+                img = Image.open(file_path).convert("RGB")
+                content.append({"type": "image", "image": img})
+            except Exception:
+                # Fallback to manual video frame extraction (bypasses broken torchvision)
+                v_frames = load_video(file_path, max_frames=max_frames)
+                if v_frames:
+                    content.append({"type": "video", "video": v_frames})
+                else:
+                    print(f"Failed to load video: {file_path}")
+    if message:
+        content.append({"type": "text", "text": message})
+    if content:
+        messages.append({"role": "user", "content": content})
+    # Prepare inputs with Advanced Parameters for MiniCPM-V 4.6
+    with torch.no_grad():
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+            enable_thinking=thinking_mode,
+            processor_kwargs={
+                "downsample_mode": "16x",
+                "max_slice_nums": 1 if any(it.get("type") == "video" for msg in messages for it in msg["content"]) else 9,
+                "use_image_id": False if any(it.get("type") == "video" for msg in messages for it in msg["content"]) else True,
+                "videos_kwargs": {
+                    "max_num_frames": max_frames,
+                    "do_sample_frames": False, # Frames are already sampled by load_video
+                    "stack_frames": 1,
+                }
+            }
+        ).to(model.device)
+    for k, v in inputs.items():
+        if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
+            inputs[k] = v.to(dtype=torch.bfloat16)
+    streamer = TextIteratorStreamer(
+        processor.tokenizer,
+        skip_prompt=True,
+        skip_special_tokens=True,
+    )
+    sampling = (generation_mode == "Sampling")
+    generate_kwargs = {
+        **inputs,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": sampling,
+        "streamer": streamer,
+        "downsample_mode": "16x"
+    }
+    if sampling:
+        generate_kwargs.update({
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+        })
+    else:
+        generate_kwargs.update({"num_beams": 1})
+    thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
+    full_text = ""
+    for new_text in streamer:
+        full_text += new_text
+        yield normalize_response_text(full_text)
+    log_raw_model_output(session_id, message=message, response=full_text, variant="thinking" if thinking_mode else "instruct")
+@demo.get("/", response_class=HTMLResponse)
+async def homepage():
+    html_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.html")
+    with open(html_path, "r", encoding="utf-8") as f:
+        return f.read()
+if __name__ == "__main__":
+    demo.launch(
+        show_error=True,
+        app_kwargs={"middleware": [Middleware(HTTPRequestLogMiddleware)]}
+    )

index.html ADDED Viewed

	@@ -0,0 +1,940 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
+    <title>MiniCPM-V | OpenBMB Premium</title>
+    <script src="https://cdn.tailwindcss.com"></script>
+    <link href="https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700&family=Inter:wght@300;400;500;600&display=swap" rel="stylesheet">
+    <script src="https://unpkg.com/lucide@latest"></script>
+    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
+    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.0/dist/katex.min.css">
+    <script src="https://cdn.jsdelivr.net/npm/katex@0.16.0/dist/katex.min.js"></script>
+    <script src="https://cdn.jsdelivr.net/npm/katex@0.16.0/dist/contrib/auto-render.min.js"></script>
+    <style>
+        :root {
+            --bg: #05070A;
+            --blue: #3B5BFF;
+            --cyan: #27D4EA;
+            --text: #FFFFFF;
+            --text-muted: #8B949E;
+            --glass: rgba(255, 255, 255, 0.03);
+            --glass-border: rgba(255, 255, 255, 0.08);
+            --accent: #3B5BFF;
+        }
+        body {
+            font-family: 'Inter', sans-serif;
+            background-color: var(--bg);
+            color: var(--text);
+            height: 100vh;
+            margin: 0;
+            display: flex;
+            flex-direction: column;
+            overflow: hidden;
+        }
+        h1, h2, h3 { font-family: 'Outfit', sans-serif; }
+        .chat-scroll-area {
+            flex: 1;
+            overflow-y: auto;
+            padding-bottom: 140px;
+            -webkit-overflow-scrolling: touch;
+            scroll-behavior: smooth;
+        }
+        .chat-scroll-area::-webkit-scrollbar { width: 4px; }
+        .chat-scroll-area::-webkit-scrollbar-track { background: transparent; }
+        .chat-scroll-area::-webkit-scrollbar-thumb { background: rgba(255, 255, 255, 0.1); border-radius: 10px; }
+        .message-bubble {
+            max-width: 85%;
+            animation: fadeIn 0.4s cubic-bezier(0.16, 1, 0.3, 1) forwards;
+            position: relative;
+        }
+        @keyframes fadeIn {
+            from { opacity: 0; transform: translateY(15px); }
+            to { opacity: 1; transform: translateY(0); }
+        }
+        .user-message {
+            background: linear-gradient(135deg, var(--blue), var(--cyan));
+            color: #FFFFFF;
+            box-shadow: 0 8px 25px rgba(59, 91, 255, 0.15);
+            border-radius: 24px 24px 4px 24px;
+        }
+        .bot-message {
+            background: rgba(255, 255, 255, 0.03);
+            border: 1px solid var(--glass-border);
+            border-radius: 24px 24px 24px 4px;
+            backdrop-filter: blur(10px);
+        }
+        .thinking-block {
+            background: rgba(59, 91, 255, 0.05);
+            border-left: 3px solid var(--blue);
+            padding: 12px 16px;
+            margin-bottom: 12px;
+            border-radius: 4px 12px 12px 4px;
+            font-size: 14px;
+            color: var(--text-muted);
+            font-style: italic;
+        }
+        .typing-dot {
+            width: 4px; height: 4px;
+            background: var(--cyan);
+            border-radius: 50%;
+            animation: bounce 1.4s infinite ease-in-out;
+        }
+        /* Tab Styles */
+        .tab-btn {
+            @apply px-6 py-3 text-sm font-bold text-white/40 border-b-2 border-transparent transition-all;
+        }
+        .tab-btn.active {
+            @apply text-white border-white;
+        }
+        .tab-content {
+            display: none;
+        }
+        .tab-content.active {
+            display: flex;
+        }
+        @keyframes bounce {
+            0%, 80%, 100% { transform: scale(0.3); opacity: 0.4; }
+            40% { transform: scale(1); opacity: 1; }
+        }
+        .input-pill {
+            background: rgba(255, 255, 255, 0.04);
+            backdrop-filter: blur(25px);
+            -webkit-backdrop-filter: blur(25px);
+            border: 1px solid var(--glass-border);
+            transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
+        }
+        .input-pill:focus-within {
+            border-color: rgba(59, 91, 255, 0.4);
+            background: rgba(255, 255, 255, 0.06);
+            box-shadow: 0 0 40px rgba(59, 91, 255, 0.08);
+        }
+        .logo-glow {
+            filter: drop-shadow(0 0 15px rgba(39, 212, 234, 0.4));
+        }
+        .send-btn {
+            background: linear-gradient(135deg, var(--blue), var(--cyan));
+            transition: all 0.3s ease;
+        }
+        .send-btn:hover:not(:disabled) { transform: scale(1.05); filter: brightness(1.1); }
+        .send-btn:active:not(:disabled) { transform: scale(0.95); }
+        .settings-panel {
+            background: rgba(10, 12, 16, 0.95);
+            backdrop-filter: blur(30px);
+            border-left: 1px solid var(--glass-border);
+            transition: transform 0.4s cubic-bezier(0.16, 1, 0.3, 1);
+        }
+        .control-slider {
+            -webkit-appearance: none;
+            width: 100%;
+            height: 4px;
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 2px;
+            outline: none;
+        }
+        .control-slider::-webkit-slider-thumb {
+            -webkit-appearance: none;
+            width: 12px; height: 12px;
+            background: var(--blue);
+            border-radius: 50%;
+            cursor: pointer;
+            transition: scale 0.2s;
+        }
+        .control-slider::-webkit-slider-thumb:hover { scale: 1.2; }
+        .toggle-switch {
+            width: 36px; height: 20px;
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 10px;
+            position: relative;
+            cursor: pointer;
+            transition: background 0.3s;
+        }
+        .toggle-switch.active { background: var(--blue); }
+        .toggle-switch::after {
+            content: '';
+            position: absolute;
+            top: 2px; left: 2px;
+            width: 16px; height: 16px;
+            background: white;
+            border-radius: 50%;
+            transition: transform 0.3s;
+        }
+        .toggle-switch.active::after { transform: translateX(16px); }
+        .media-preview-item {
+            position: relative;
+            animation: scaleIn 0.3s ease-out;
+        }
+        @keyframes scaleIn { from { scale: 0.8; opacity: 0; } to { scale: 1; opacity: 1; } }
+        .shimmer {
+            background: linear-gradient(90deg, transparent, rgba(255,255,255,0.05), transparent);
+            background-size: 200% 100%;
+            animation: shimmer 2s infinite;
+        }
+        @keyframes shimmer { 0% { background-position: -200% 0; } 100% { background-position: 200% 0; } }
+    </style>
+</head>
+<body>
+    <!-- Header -->
+    <header class="h-20 flex items-center justify-between px-6 md:px-12 shrink-0 z-50 border-b border-white/5">
+        <div class="flex items-center gap-4">
+            <div class="relative">
+                <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/1670387859384-633fe7784b362488336bbfad.png"
+                     alt="OpenBMB" class="w-10 h-10 logo-glow">
+                <div class="absolute -bottom-1 -right-1 w-3 h-3 bg-green-500 rounded-full border-2 border-[var(--bg)]"></div>
+            </div>
+            <div>
+                <h1 class="text-xl font-bold tracking-tight bg-clip-text text-transparent bg-gradient-to-r from-white to-white/60">MiniCPM-V</h1>
+                <p class="text-[10px] text-muted uppercase tracking-[0.2em] font-bold opacity-50">By OpenBMB</p>
+            </div>
+        </div>
+        <div class="flex items-center gap-6">
+            <div class="hidden md:flex items-center gap-2 text-[10px] font-bold text-muted uppercase tracking-widest bg-white/5 px-3 py-1.5 rounded-full border border-white/5">
+                <span class="w-1.5 h-1.5 rounded-full bg-[#27D4EA] animate-pulse"></span>
+                v4.6 Intelligence Engine
+            </div>
+            <button id="toggle-settings" class="p-2.5 rounded-xl hover:bg-white/5 text-white/40 hover:text-white transition-all relative">
+                <i data-lucide="sliders-horizontal" class="w-5 h-5"></i>
+            </button>
+        </div>
+    </header>
+    <!-- Settings Panel (Side) -->
+    <div id="settings-panel" class="fixed top-0 right-0 h-full w-80 z-[100] translate-x-full settings-panel p-8 flex flex-col gap-8 shadow-[-20px_0_50px_rgba(0,0,0,0.5)]">
+        <div class="flex items-center justify-between">
+            <h2 class="text-lg font-bold">Engine Settings</h2>
+            <button id="close-settings" class="text-white/40 hover:text-white"><i data-lucide="x" class="w-5 h-5"></i></button>
+        </div>
+        <div class="space-y-6">
+            <div class="flex items-center justify-between">
+                <span class="text-sm font-medium text-white/70">Thinking Mode</span>
+                <div id="thinking-toggle" class="toggle-switch active"></div>
+            </div>
+            <div class="flex items-center justify-between">
+                <span class="text-sm font-medium text-white/70">Streaming</span>
+                <div id="streaming-toggle" class="toggle-switch active"></div>
+            </div>
+            <div class="space-y-3">
+                <span class="text-xs font-bold text-white/40 uppercase tracking-widest">Generation Mode</span>
+                <div class="flex gap-2 p-1 bg-white/5 rounded-xl border border-white/10">
+                    <button id="mode-sampling" class="flex-1 py-2 text-xs font-bold rounded-lg bg-white/10 text-white transition-all">Sampling</button>
+                    <button id="mode-beam" class="flex-1 py-2 text-xs font-bold rounded-lg text-white/40 hover:text-white transition-all">Beam Search</button>
+                </div>
+            </div>
+            <div class="space-y-3">
+                <div class="flex justify-between text-xs font-bold text-white/40 uppercase tracking-widest">
+                    <span>Max Tokens</span>
+                    <span id="tokens-val">2048</span>
+                </div>
+                <input type="range" id="tokens-slider" min="64" max="16384" step="64" value="2048" class="control-slider">
+            </div>
+            <div class="space-y-3">
+                <div class="flex justify-between text-xs font-bold text-white/40 uppercase tracking-widest">
+                    <span>Temperature</span>
+                    <span id="temp-val">0.7</span>
+                </div>
+                <input type="range" id="temp-slider" min="0" max="2" step="0.01" value="0.7" class="control-slider">
+            </div>
+            <div class="space-y-3">
+                <div class="flex justify-between text-xs font-bold text-white/40 uppercase tracking-widest">
+                    <span>Top-P</span>
+                    <span id="p-val">0.8</span>
+                </div>
+                <input type="range" id="p-slider" min="0" max="1" step="0.05" value="0.8" class="control-slider">
+            </div>
+            <div class="space-y-3">
+                <div class="flex justify-between text-xs font-bold text-white/40 uppercase tracking-widest">
+                    <span>Top-K</span>
+                    <span id="k-val">100</span>
+                </div>
+                <input type="range" id="k-slider" min="0" max="200" step="1" value="100" class="control-slider">
+            </div>
+            <div class="space-y-3">
+                <div class="flex justify-between text-xs font-bold text-white/40 uppercase tracking-widest">
+                    <span>Max Frames</span>
+                    <span id="frames-val">64</span>
+                </div>
+                <input type="range" id="frames-slider" min="8" max="256" step="8" value="64" class="control-slider">
+            </div>
+            <button id="open-fewshot" class="w-full py-4 rounded-2xl bg-white/5 hover:bg-white/10 border border-white/5 transition-all flex items-center justify-center gap-2 group mb-2">
+                <i data-lucide="sparkles" class="w-4 h-4 text-[#27D4EA] group-hover:scale-110 transition-transform"></i>
+                <span class="text-sm font-bold">Few-Shot Builder</span>
+            </button>
+            <button onclick="clearHistory()" class="w-full py-4 rounded-2xl bg-red-500/10 border border-red-500/20 text-red-500 text-sm font-bold hover:bg-red-500/20 transition-all flex items-center justify-center gap-2">
+                <i data-lucide="trash-2" class="w-4 h-4"></i>
+                Clear Conversation
+            </button>
+        </div>
+    </div>
+    <!-- Help Modal -->
+    <div id="help-modal" class="fixed inset-0 z-[200] flex items-center justify-center bg-black/80 backdrop-blur-sm hidden p-6">
+        <div class="max-w-4xl w-full bg-[#0D1117] border border-white/10 rounded-[32px] overflow-hidden shadow-2xl flex flex-col max-h-[90vh]">
+            <div class="p-8 border-b border-white/10 flex items-center justify-between">
+                <h2 class="text-2xl font-bold">How to use MiniCPM-V 4.6</h2>
+                <button id="close-help" class="text-white/40 hover:text-white"><i data-lucide="x" class="w-6 h-6"></i></button>
+            </div>
+            <div class="p-8 overflow-y-auto space-y-12">
+                <div class="grid grid-cols-1 md:grid-cols-3 gap-8">
+                    <div class="space-y-4">
+                        <div class="aspect-video bg-white/5 rounded-2xl overflow-hidden border border-white/5">
+                            <img src="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/m_bear2.gif" class="w-full h-full object-cover">
+                        </div>
+                        <h3 class="font-bold text-lg">1. Multi-Image Chat</h3>
+                        <p class="text-sm text-white/50 leading-relaxed">Upload multiple images at once. Use the "+" button to select files or drag them into the input area.</p>
+                    </div>
+                    <div class="space-y-4">
+                        <div class="aspect-video bg-white/5 rounded-2xl overflow-hidden border border-white/5">
+                            <img src="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/video2.gif" class="w-full h-full object-cover">
+                        </div>
+                        <h3 class="font-bold text-lg">2. Video Intelligence</h3>
+                        <p class="text-sm text-white/50 leading-relaxed">Upload videos for temporal reasoning. MiniCPM-V will sample frames and describe events over time.</p>
+                    </div>
+                    <div class="space-y-4">
+                        <div class="aspect-video bg-white/5 rounded-2xl overflow-hidden border border-white/5">
+                            <img src="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/fshot.gif" class="w-full h-full object-cover">
+                        </div>
+                        <h3 class="font-bold text-lg">3. Few-Shot Learning</h3>
+                        <p class="text-sm text-white/50 leading-relaxed">Provide examples to guide the model. Add turns with correct answers to teach the model a specific style.</p>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    <!-- Chat Area (Tab 1) -->
+    <div id="tab-chat" class="tab-content active flex-col h-full">
+        <main id="chat-messages" class="chat-scroll-area px-4 flex-1">
+            <div class="max-w-3xl mx-auto space-y-8 pt-24 pb-40" id="chat-container">
+            </div>
+        </main>
+        <!-- Input Area -->
+        <div class="fixed bottom-0 left-0 right-0 p-6 md:p-10 pointer-events-none z-50">
+            <div class="max-w-3xl mx-auto pointer-events-auto">
+                <!-- Multi-file Preview -->
+                <div id="preview-container" class="hidden mb-6 flex flex-wrap gap-3 max-h-40 overflow-y-auto p-2 scrollbar-none">
+                    <!-- Preview items will be injected here -->
+                </div>
+                <!-- Input Bar -->
+                <div class="input-pill rounded-[2rem] p-2 flex items-end shadow-2xl overflow-hidden bg-white/5 backdrop-blur-xl border border-white/10">
+                    <input type="file" id="file-input" class="hidden" accept="image/*,video/*" multiple>
+                    <button id="upload-trigger" class="w-12 h-12 flex items-center justify-center text-white/50 hover:text-[#27D4EA] transition-colors relative shrink-0 mb-1">
+                        <i data-lucide="plus" class="w-6 h-6"></i>
+                        <span id="file-count-badge" class="absolute top-2 right-2 w-4 h-4 bg-indigo-500 text-[10px] text-white rounded-full flex items-center justify-center hidden shadow-lg">0</span>
+                    </button>
+                    <textarea id="user-input" placeholder="Ask MiniCPM-V..." class="flex-1 bg-transparent border-none focus:ring-0 text-white placeholder-white/30 py-4 px-2 resize-none max-h-48 leading-relaxed font-medium" rows="1"></textarea>
+                    <div class="flex items-center gap-1 mb-1 pr-2">
+                        <button onclick="regenerate()" class="w-10 h-10 flex items-center justify-center text-white/30 hover:text-white transition-colors shrink-0" title="Regenerate last response">
+                            <i data-lucide="refresh-cw" class="w-4 h-4"></i>
+                        </button>
+                        <button id="send-btn" class="send-btn w-12 h-12 text-white rounded-full flex items-center justify-center disabled:opacity-20 disabled:grayscale shrink-0">
+                            <i data-lucide="arrow-up" class="w-5 h-5" id="send-icon"></i>
+                            <div id="loading-icon" class="hidden"><div class="w-5 h-5 border-2 border-white/30 border-t-white rounded-full animate-spin"></div></div>
+                        </button>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    <!-- Few-Shot Area (Tab 2) -->
+    <div id="tab-fewshot" class="tab-content flex-col items-center pt-32 px-4 h-full overflow-y-auto">
+        <div class="max-w-3xl w-full space-y-8 pb-20">
+            <div class="flex items-center justify-between">
+                <div class="space-y-2">
+                    <h2 class="text-2xl font-bold tracking-tight">Few-Shot Builder</h2>
+                    <p class="text-white/40 text-sm">Add custom examples to guide the model's behavior.</p>
+                </div>
+                <button id="return-chat" class="px-6 py-2 rounded-full bg-white/5 hover:bg-white/10 border border-white/10 transition-all flex items-center gap-2 text-xs font-bold uppercase tracking-widest">
+                    <i data-lucide="arrow-left" class="w-4 h-4"></i>
+                    Back to Chat
+                </button>
+            </div>
+            <div class="grid grid-cols-1 md:grid-cols-2 gap-6">
+                <div class="space-y-4">
+                    <div class="aspect-video bg-white/5 rounded-3xl border border-white/10 flex items-center justify-center relative overflow-hidden group cursor-pointer" onclick="document.getElementById('fs-file').click()">
+                        <input type="file" id="fs-file" class="hidden" accept="image/*">
+                        <img id="fs-preview" class="hidden w-full h-full object-contain">
+                        <div id="fs-placeholder" class="flex flex-col items-center gap-2 text-white/20">
+                            <i data-lucide="image" class="w-10 h-10"></i>
+                            <span class="text-[10px] font-bold uppercase tracking-[0.2em]">Upload Example Image</span>
+                        </div>
+                        <div class="absolute inset-0 bg-indigo-500/10 opacity-0 group-hover:opacity-100 transition-opacity flex items-center justify-center font-bold text-[10px] uppercase tracking-widest">Update Image</div>
+                    </div>
+                </div>
+                <div class="space-y-4 flex flex-col">
+                    <textarea id="fs-user" placeholder="User question for this example..." class="flex-1 bg-white/5 border border-white/10 rounded-2xl p-4 text-sm text-white placeholder-white/20 resize-none focus:border-indigo-500/50 focus:ring-0 transition-all"></textarea>
+                    <textarea id="fs-bot" placeholder="Model's expected answer..." class="flex-1 bg-white/5 border border-white/10 rounded-2xl p-4 text-sm text-white placeholder-white/20 resize-none focus:border-indigo-500/50 focus:ring-0 transition-all"></textarea>
+                </div>
+            </div>
+            <div class="flex gap-4">
+                <button id="fs-add" class="flex-1 py-4 rounded-2xl bg-white/10 hover:bg-white/20 border border-white/10 font-bold text-xs uppercase tracking-widest transition-all">Add to Context</button>
+                <button id="fs-gen" class="flex-1 py-4 rounded-2xl bg-indigo-500 hover:bg-indigo-600 font-bold text-xs uppercase tracking-widest transition-all shadow-lg shadow-indigo-500/20">Ask with Context</button>
+            </div>
+            <div class="p-6 rounded-3xl bg-indigo-500/5 border border-indigo-500/10 space-y-4">
+                <div class="flex items-center gap-2 text-indigo-400">
+                    <i data-lucide="info" class="w-4 h-4"></i>
+                    <span class="text-[10px] font-bold uppercase tracking-widest">Few-Shot Pro Tip</span>
+                </div>
+                <p class="text-xs text-white/40 leading-relaxed italic">"Demonstrations help the model learn complex formatting or specific domain knowledge by example. Upload an image, type a question and the ideal answer, then click 'Add to Context' to teach the model before you start chatting."</p>
+            </div>
+        </div>
+    </div>
+    <script type="module">
+        import { Client, handle_file } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
+        lucide.createIcons();
+        // DOM Elements
+        const chatContainer = document.getElementById('chat-container');
+        const chatScrollArea = document.getElementById('chat-messages');
+        const userInput = document.getElementById('user-input');
+        const sendBtn = document.getElementById('send-btn');
+        const fileInput = document.getElementById('file-input');
+        const uploadTrigger = document.getElementById('upload-trigger');
+        const previewContainer = document.getElementById('preview-container');
+        const fileCountBadge = document.getElementById('file-count-badge');
+        const sendIcon = document.getElementById('send-icon');
+        const loadingIcon = document.getElementById('loading-icon');
+        const settingsPanel = document.getElementById('settings-panel');
+        const toggleSettings = document.getElementById('toggle-settings');
+        const closeSettings = document.getElementById('close-settings');
+        const thinkingToggle = document.getElementById('thinking-toggle');
+        const streamingToggle = document.getElementById('streaming-toggle');
+        const tempSlider = document.getElementById('temp-slider');
+        const tokensSlider = document.getElementById('tokens-slider');
+        const tokensVal = document.getElementById('tokens-val');
+        const pSlider = document.getElementById('p-slider');
+        const pVal = document.getElementById('p-val');
+        const kSlider = document.getElementById('k-slider');
+        const kVal = document.getElementById('k-val');
+        const framesSlider = document.getElementById('frames-slider');
+        const framesVal = document.getElementById('frames-val');
+        const modeSampling = document.getElementById('mode-sampling');
+        const modeBeam = document.getElementById('mode-beam');
+        const helpModal = document.getElementById('help-modal');
+        const closeHelp = document.getElementById('close-help');
+        let client = null;
+        let selectedFiles = [];
+        let isSettingsOpen = false;
+        let generationMode = 'Sampling';
+        modeSampling.onclick = () => {
+            generationMode = 'Sampling';
+            modeSampling.classList.add('bg-white/10');
+            modeSampling.classList.remove('text-white/40');
+            modeBeam.classList.remove('bg-white/10');
+            modeBeam.classList.add('text-white/40');
+        };
+        modeBeam.onclick = () => {
+            generationMode = 'Beam Search';
+            modeBeam.classList.add('bg-white/10');
+            modeBeam.classList.remove('text-white/40');
+            modeSampling.classList.remove('bg-white/10');
+            modeSampling.classList.add('text-white/40');
+            // Disable streaming toggle for beam search parity
+            if (streamingToggle.classList.contains('active')) {
+                streamingToggle.click();
+            }
+        };
+        // Help Modal Logic
+        window.openHelp = () => helpModal.classList.remove('hidden');
+        closeHelp.onclick = () => helpModal.classList.add('hidden');
+        tempSlider.oninput = () => tempVal.textContent = tempSlider.value;
+        tokensSlider.oninput = () => tokensVal.textContent = tokensSlider.value;
+        pSlider.oninput = () => pVal.textContent = pSlider.value;
+        kSlider.oninput = () => kVal.textContent = kSlider.value;
+        framesSlider.oninput = () => framesVal.textContent = framesSlider.value;
+        // Tab Switching Logic
+        const tabChat = document.getElementById('tab-chat');
+        const tabFewshot = document.getElementById('tab-fewshot');
+        function switchTab(target) {
+            if (target === 'chat') {
+                tabChat.classList.add('active');
+                tabFewshot.classList.remove('active');
+            } else {
+                tabFewshot.classList.add('active');
+                tabChat.classList.remove('active');
+            }
+        }
+        const openFewShot = document.getElementById('open-fewshot');
+        const returnChat = document.getElementById('return-chat');
+        openFewShot.onclick = () => {
+            toggleSettingsSidebar(false);
+            switchTab('fewshot');
+        };
+        returnChat.onclick = () => switchTab('chat');
+        // Few-Shot Builder
+        const fsFile = document.getElementById('fs-file');
+        const fsPreview = document.getElementById('fs-preview');
+        const fsPlaceholder = document.getElementById('fs-placeholder');
+        const fsUser = document.getElementById('fs-user');
+        const fsBot = document.getElementById('fs-bot');
+        const fsAddBtn = document.getElementById('fs-add');
+        const fsGenBtn = document.getElementById('fs-gen');
+        let fsSelectedFile = null;
+        fsFile.onchange = (e) => {
+            const file = e.target.files[0];
+            if (file) {
+                fsSelectedFile = file;
+                const reader = new FileReader();
+                reader.onload = (re) => {
+                    fsPreview.src = re.target.result;
+                    fsPreview.classList.remove('hidden');
+                    fsPlaceholder.classList.add('hidden');
+                };
+                reader.readAsDataURL(file);
+            }
+        };
+        fsAddBtn.onclick = () => {
+            if (!fsUser.value.trim() && !fsSelectedFile) {
+                alert("Please provide at least a question or an image.");
+                return;
+            }
+            // Add to history as a "completed turn"
+            const userText = fsUser.value.trim();
+            const botText = fsBot.value.trim();
+            // Parity with reference: add to chatHistory and show in UI
+            chatHistory.push([userText || null, botText || null, fsSelectedFile ? [handle_file(fsSelectedFile)] : []]);
+            // Visual feedback
+            appendMessage('user', userText || "(Example Image)", fsSelectedFile ? [fsSelectedFile] : []);
+            if (botText) appendMessage('bot', botText);
+            // Clear inputs
+            fsUser.value = '';
+            fsBot.value = '';
+            fsSelectedFile = null;
+            fsPreview.classList.add('hidden');
+            fsPlaceholder.classList.remove('hidden');
+            switchTab('chat');
+        };
+        fsGenBtn.onclick = async () => {
+            if (!fsUser.value.trim() && !fsSelectedFile) return;
+            // Switch to chat tab and trigger generation with current FS inputs
+            const tempUser = fsUser.value;
+            const tempFile = fsSelectedFile;
+            switchTab('chat');
+            userInput.value = tempUser;
+            if (tempFile) {
+                selectedFiles = [tempFile];
+                renderPreviews();
+            }
+            sendMessage();
+            // Clear FS inputs
+            fsUser.value = '';
+            fsBot.value = '';
+            fsSelectedFile = null;
+            fsPreview.classList.add('hidden');
+            fsPlaceholder.classList.remove('hidden');
+        };
+        let chatHistory = [];
+        let currentJob = null;
+        async function init() {
+            try {
+                client = await Client.connect(window.location.origin);
+            } catch (err) { console.error("Gradio Connection Error", err); }
+        }
+        init();
+        // Settings Logic
+        const toggleSettingsSidebar = (open) => {
+            isSettingsOpen = open;
+            if (open) {
+                settingsPanel.classList.remove('translate-x-full');
+                settingsPanel.classList.add('translate-x-0');
+            } else {
+                settingsPanel.classList.add('translate-x-full');
+                settingsPanel.classList.remove('translate-x-0');
+            }
+        };
+        toggleSettings.onclick = (e) => {
+            e.stopPropagation();
+            toggleSettingsSidebar(true);
+        };
+        closeSettings.onclick = (e) => {
+            e.stopPropagation();
+            toggleSettingsSidebar(false);
+        };
+        // Close sidebar when clicking outside
+        document.addEventListener('click', (e) => {
+            if (isSettingsOpen && !settingsPanel.contains(e.target) && !toggleSettings.contains(e.target)) {
+                toggleSettingsSidebar(false);
+            }
+        });
+        const setupToggle = (el) => {
+            el.onclick = () => el.classList.toggle('active');
+        };
+        setupToggle(thinkingToggle);
+        setupToggle(streamingToggle);
+        const setupSlider = (slider, valEl) => {
+            slider.oninput = () => valEl.textContent = slider.value;
+        };
+        setupSlider(tempSlider, document.getElementById('temp-val'));
+        setupSlider(tokensSlider, document.getElementById('tokens-val'));
+        setupSlider(pSlider, document.getElementById('p-val'));
+        // File Handling
+        uploadTrigger.onclick = () => fileInput.click();
+        fileInput.onchange = (e) => {
+            const files = Array.from(e.target.files);
+            selectedFiles = [...selectedFiles, ...files];
+            renderPreviews();
+        };
+        function renderPreviews() {
+            previewContainer.innerHTML = '';
+            if (selectedFiles.length > 0) {
+                previewContainer.classList.remove('hidden');
+                fileCountBadge.classList.remove('hidden');
+                fileCountBadge.textContent = selectedFiles.length;
+                selectedFiles.forEach((file, index) => {
+                    const url = URL.createObjectURL(file);
+                    const item = document.createElement('div');
+                    item.className = 'media-preview-item h-24 w-24 rounded-2xl overflow-hidden border border-white/20 shadow-lg';
+                    if (file.type.startsWith('image/')) {
+                        item.innerHTML = `<img src="${url}" class="w-full h-full object-cover">`;
+                    } else {
+                        item.innerHTML = `<video src="${url}" class="w-full h-full object-cover" muted></video><div class="absolute inset-0 flex items-center justify-center bg-black/20"><i data-lucide="play" class="w-6 h-6 text-white"></i></div>`;
+                    }
+                    const removeBtn = document.createElement('button');
+                    removeBtn.className = 'absolute -top-1 -right-1 bg-red-500 text-white rounded-full p-1 shadow-lg scale-75';
+                    removeBtn.innerHTML = '<i data-lucide="x" class="w-3 h-3"></i>';
+                    removeBtn.onclick = (e) => {
+                        e.stopPropagation();
+                        selectedFiles.splice(index, 1);
+                        renderPreviews();
+                    };
+                    item.appendChild(removeBtn);
+                    previewContainer.appendChild(item);
+                });
+                lucide.createIcons();
+            } else {
+                previewContainer.classList.add('hidden');
+                fileCountBadge.classList.add('hidden');
+            }
+        }
+        // Message Handling
+        function appendMessage(role, text = '', files = []) {
+            const div = document.createElement('div');
+            div.className = `flex gap-4 items-start ${role === 'user' ? 'flex-row-reverse' : ''}`;
+            let mediaHtml = '';
+            if (files.length > 0) {
+                mediaHtml = '<div class="flex flex-wrap gap-2 mb-4">';
+                files.forEach(file => {
+                    const url = typeof file === 'string' ? file : URL.createObjectURL(file);
+                    const type = typeof file === 'string' ? (file.match(/\.(mp4|webm|mkv)/i) ? 'video' : 'image') : (file.type.startsWith('video') ? 'video' : 'image');
+                    if (type === 'image') {
+                        mediaHtml += `<img src="${url}" class="h-48 rounded-2xl border border-white/10 shadow-lg object-contain bg-black/20" />`;
+                    } else {
+                        mediaHtml += `<video src="${url}" controls class="h-48 rounded-2xl border border-white/10 shadow-lg" />`;
+                    }
+                });
+                mediaHtml += '</div>';
+            }
+            const bubbleClass = role === 'user' ? 'user-message' : 'bot-message';
+            div.innerHTML = `
+                <div class="${bubbleClass} p-6 message-bubble shadow-xl">
+                    ${mediaHtml}
+                    <div class="thinking-container hidden"></div>
+                    <div class="content-container leading-relaxed text-[15px] whitespace-pre-wrap font-medium">${marked.parse(text)}</div>
+                </div>
+            `;
+            chatContainer.appendChild(div);
+            const contentContainer = div.querySelector('.content-container');
+            if (window.renderMathInElement) {
+                renderMathInElement(contentContainer, {
+                    delimiters: [
+                        {left: '$$', right: '$$', display: true},
+                        {left: '$', right: '$', display: false},
+                        {left: '\\(', right: '\\)', display: false},
+                        {left: '\\[', right: '\\]', display: true}
+                    ],
+                    throwOnError: false
+                });
+            }
+            chatScrollArea.scrollTo({ top: chatScrollArea.scrollHeight, behavior: 'smooth' });
+            return div;
+        }
+        function updateBotMessage(div, fullText) {
+            const thinkingContainer = div.querySelector('.thinking-container');
+            const contentContainer = div.querySelector('.content-container');
+            const thinkMatch = fullText.match(/<think>([\s\S]*?)<\/think>/);
+            const thinkingText = thinkMatch ? thinkMatch[1].trim() : (fullText.includes('<think>') && !fullText.includes('</think>') ? fullText.split('<think>')[1].trim() : '');
+            const actualText = fullText.replace(/<think>[\s\S]*?<\/think>/, '').trim();
+            if (thinkingText) {
+                thinkingContainer.classList.remove('hidden');
+                thinkingContainer.innerHTML = `<div class="thinking-block">${marked.parse(thinkingText)}</div>`;
+            } else {
+                thinkingContainer.classList.add('hidden');
+            }
+            contentContainer.innerHTML = marked.parse(actualText);
+            // Render Math
+            [thinkingContainer, contentContainer].forEach(el => {
+                if (window.renderMathInElement) {
+                    renderMathInElement(el, {
+                        delimiters: [
+                            {left: '$$', right: '$$', display: true},
+                            {left: '$', right: '$', display: false},
+                            {left: '\\(', right: '\\)', display: false},
+                            {left: '\\[', right: '\\]', display: true}
+                        ],
+                        throwOnError: false
+                    });
+                }
+            });
+            chatScrollArea.scrollTo({ top: chatScrollArea.scrollHeight, behavior: 'smooth' });
+            return actualText;
+        }
+        async function sendMessage() {
+            const text = userInput.value.trim();
+            if (!text && selectedFiles.length === 0) return;
+            const filesToUpload = [...selectedFiles];
+            const content = text;
+            userInput.value = '';
+            userInput.style.height = 'auto';
+            appendMessage('user', content, filesToUpload);
+            selectedFiles = [];
+            renderPreviews();
+            sendIcon.classList.add('hidden');
+            loadingIcon.classList.remove('hidden');
+            sendBtn.innerHTML = '<i data-lucide="square" class="w-5 h-5 fill-white"></i>';
+            sendBtn.classList.remove('send-btn');
+            sendBtn.classList.add('bg-red-500/20', 'hover:bg-red-500/40', 'border', 'border-red-500/50');
+            lucide.createIcons();
+            let isStopped = false;
+            sendBtn.onclick = () => {
+                if (currentJob) {
+                    currentJob.cancel();
+                    isStopped = true;
+                    resetSendBtn();
+                }
+            };
+            // Bot response placeholder
+            const botDiv = appendMessage('bot', '');
+            const contentContainer = botDiv.querySelector('.content-container');
+            contentContainer.innerHTML = '<div class="flex gap-1.5 py-2"><div class="typing-dot"></div><div class="typing-dot"></div><div class="typing-dot"></div></div>';
+            try {
+                const gradioFiles = filesToUpload.length > 0 ? filesToUpload.map(f => handle_file(f)) : null;
+                currentJob = client.submit("/predict", {
+                    message: content,
+                    history: chatHistory,
+                    files: gradioFiles,
+                    thinking_mode: thinkingToggle.classList.contains('active'),
+                    max_new_tokens: parseInt(tokensSlider.value),
+                    temperature: parseFloat(tempSlider.value),
+                    top_p: parseFloat(pSlider.value),
+                    top_k: parseInt(kSlider.value),
+                    max_frames: parseInt(framesSlider.value),
+                    generation_mode: generationMode
+                });
+                let finalAnswer = "";
+                for await (const msg of currentJob) {
+                    if (isStopped) break;
+                    if (msg.type === "data" && msg.data) {
+                        const chunk = msg.data[0];
+                        finalAnswer = updateBotMessage(botDiv, chunk);
+                    }
+                }
+                if (!isStopped) {
+                    chatHistory.push([content, finalAnswer]);
+                }
+            } catch (err) {
+                console.error(err);
+                if (!isStopped) {
+                    contentContainer.textContent = "I encountered an error while processing your request. Please try again.";
+                }
+            } finally {
+                resetSendBtn();
+                currentJob = null;
+            }
+        }
+        function resetSendBtn() {
+            sendBtn.innerHTML = '<i data-lucide="arrow-up" class="w-5 h-5" id="send-icon"></i>';
+            sendBtn.className = 'send-btn w-12 h-12 text-white rounded-full flex items-center justify-center disabled:opacity-20 disabled:grayscale shrink-0 mb-1';
+            sendBtn.onclick = sendMessage;
+            lucide.createIcons();
+        }
+        window.clearHistory = function() {
+            chatHistory = [];
+            chatContainer.innerHTML = `
+                <div class="flex gap-4 items-start">
+                    <div class="bot-message p-6 message-bubble shadow-2xl">
+                        <p class="text-white/90 leading-relaxed text-[15px]">
+                            History cleared. How can I help you today?
+                        </p>
+                    </div>
+                </div>
+            `;
+            closeSettings.click();
+        }
+        async function regenerate() {
+            if (chatHistory.length === 0) return;
+            const lastTurn = chatHistory.pop();
+            // Remove last assistant message from UI
+            chatContainer.removeChild(chatContainer.lastChild);
+            // Re-send last user message
+            userInput.value = lastTurn[0];
+            // We need to re-handle files if we want perfect parity, but for now we re-send text
+            sendMessage();
+        }
+        thinkingToggle.onclick = () => {
+            thinkingToggle.classList.toggle('active');
+            if (chatHistory.length > 0) {
+                if (confirm("Changing Thinking Mode will clear your current conversation history. Proceed?")) {
+                    window.clearHistory();
+                } else {
+                    thinkingToggle.classList.toggle('active');
+                }
+            }
+        };
+        // Client ID injection (Parity with reference)
+        const header = "x-v46-client-id";
+        const key = "minicpm_v46_demo_client_id";
+        let clientId = localStorage.getItem(key);
+        if (!clientId) {
+            clientId = "local-" + Math.random().toString(36).slice(2) + Date.now().toString(36);
+            localStorage.setItem(key, clientId);
+        }
+        window.__minicpmV46ClientId = clientId;
+        // Patch fetch to include client ID
+        const originalFetch = window.fetch;
+        window.fetch = function(input, init) {
+            const nextInit = init ? Object.assign({}, init) : {};
+            const headers = new Headers(nextInit.headers || (input instanceof Request ? input.headers : undefined));
+            headers.set(header, clientId);
+            nextInit.headers = headers;
+            return originalFetch.call(this, input, nextInit);
+        };
+        // Few-Shot Logic (Enhanced for Parity)
+        window.addFewShot = function() {
+            // In the reference, this is a form. Here we can use a modal or simple prompts.
+            // For true parity, let's add a more structured prompt sequence.
+            const hasImage = confirm("Include an image in this example?");
+            if (hasImage) {
+                // In a real web app we'd open a file picker, but here we can just ask the user to upload it normally
+                // and then "Mark as Few-Shot".
+                // For now, let's keep it simple as a guided prompt.
+                alert("Please upload your image normally, then click 'Add Example' in the settings to mark the current turn as few-shot.");
+            }
+        };
+        // Initial Button Wiring
+        sendBtn.onclick = sendMessage;
+        userInput.onkeydown = (e) => {
+            if (e.key === 'Enter' && !e.shiftKey) {
+                e.preventDefault();
+                sendMessage();
+            }
+        };
+        // Auto-resize textarea
+        userInput.oninput = () => {
+            userInput.style.height = 'auto';
+            userInput.style.height = userInput.scrollHeight + 'px';
+        };
+        // Make regenerate global for the HTML onclick
+        window.regenerate = regenerate;
+    </script>
+</body>
+</html>

requirements.txt CHANGED Viewed

@@ -1,15 +1,12 @@
 torch
 torchvision
-torchcodec
-transformers==5.8.0
-gradio==5.50.0
-modelscope_studio==1.6.1
-Pillow>=10.0
-decord>=0.6.0
-huggingface_hub>=1.0
-tokenizers>=0.22.0,<=0.23.0
-regex>=2025.10.22
-mistral_common>=1.11.0
-accelerate>=1.1.0
-av==17.0.1
 spaces

+transformers>=4.44.0
 torch
 torchvision
+gradio==6.14.0
+fastapi
 spaces
+pillow
+av
+accelerate
+sentencepiece
+uvicorn>=0.14.0
+websockets>=10.4