Spaces:

userisuser
/

MiniCPM-V-4.6-Demo-Gradio-Server

Runtime error

App Files Files Community

userisuser Cursor committed 9 days ago

Commit

ecb8ee5

1 Parent(s): 5d404cd

Deploy MiniCPM-V 4.6 Gradio Server demo

Browse files

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (5) hide show

README.md +23 -8
app.py +176 -0
index.html +333 -0
requirements.txt +8 -0
style.css +28 -0

README.md CHANGED Viewed

@@ -1,13 +1,28 @@
 ---
-title: MiniCPM V 4.6 Demo Gradio Server
-emoji: 🐢
-colorFrom: purple
-colorTo: green
 sdk: gradio
-sdk_version: 6.14.0
-python_version: '3.13'
-app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: MiniCPM-V 4.6 Demo
+emoji: 🪐
+colorFrom: indigo
+colorTo: pink
 sdk: gradio
+models:
+- openbmb/MiniCPM-V-4.6
 pinned: false
+short_description: MiniCPM-V 4.6 Ultra-Efficient Multimodal AI
 ---
+# MiniCPM-V 4.6 Demo
+This Space hosts the official **MiniCPM-V 4.6** multimodal demo using the Gradio Server architecture.
+## Features
+- **Ultra-Efficient**: Powered by MiniCPM-V 4.6 for fast image and video understanding.
+- **ZeroGPU Optimization**: Uses dynamic GPU allocation for high performance.
+- **Modern Minimalist UI**: Sleek, mobile-friendly interface designed by OpenBMB.
+## Technical Stack
+- **Backend**: Gradio Server (FastAPI)
+- **Frontend**: Custom HTML/JS/CSS with Lucide icons
+- **Vision Logic**: Manual frame extraction via PyAV for robust video support
+---
+For more information, visit the [MiniCPM-V GitHub Repository](https://github.com/OpenBMB/MiniCPM-V).

app.py ADDED Viewed

	@@ -0,0 +1,176 @@

+import os
+import torch
+import re
+import av
+from PIL import Image
+from transformers import AutoModelForImageTextToText, AutoProcessor
+from gradio import Server
+from gradio.data_classes import FileData
+from fastapi.responses import HTMLResponse
+import spaces
+# Load model and processor
+model_id = "openbmb/MiniCPM-V-4.6"
+print(f"Loading model: {model_id}...")
+processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+model = AutoModelForImageTextToText.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+    device_map="cuda"
+)
+def load_video(video_path, max_frames=64):
+    """Utility to load video frames using PyAV."""
+    try:
+        container = av.open(video_path)
+        frames = []
+        stream = container.streams.video[0]
+        total_frames = stream.frames
+        if total_frames <= 0:
+            print("Frame count unknown, decoding all and sampling...")
+            temp_frames = []
+            for frame in container.decode(video=0):
+                temp_frames.append(frame.to_image())
+            if len(temp_frames) > max_frames:
+                indices = [int(i * len(temp_frames) / max_frames) for i in range(max_frames)]
+                frames = [temp_frames[i] for i in indices]
+            else:
+                frames = temp_frames
+        else:
+            indices = [int(i * total_frames / max_frames) for i in range(max_frames)]
+            current_idx = 0
+            for i, frame in enumerate(container.decode(video=0)):
+                if current_idx < len(indices) and i == indices[current_idx]:
+                    frames.append(frame.to_image())
+                    current_idx += 1
+                if current_idx >= len(indices):
+                    break
+        container.close()
+        return frames
+    except Exception as e:
+        print(f"Error loading video: {e}")
+        return None
+# Utility for response normalization
+_PATTERN = re.compile(
+    r'(```[\s\S]*?```|`[^`]+`|\$\$[\s\S]*?\$\$|\$[^$]+\$|\\\([\s\S]*?\\\)|\\\[[\s\S]*?\\\])'
+    r'|(?<!\\)(?:\\r\\n|\\[nr])'
+)
+def normalize_response_text(text: str) -> str:
+    if not isinstance(text, str) or "\\" not in text:
+        return text
+    return _PATTERN.sub(lambda m: m.group(1) or '\n', text)
+app = Server()
+@app.api()
+@spaces.GPU(duration=120)
+def predict(message: str, file: FileData = None, downsample_mode: str = "16x") -> str:
+    """
+    General inference endpoint for both image and video.
+    """
+    if file is None:
+        # Text-only inference
+        messages = [{"role": "user", "content": [{"type": "text", "text": message}]}]
+        inputs = processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt"
+        ).to(model.device)
+    else:
+        file_path = file["path"]
+        # Robust detection: Try opening with AV first to see if it's a video
+        is_video = False
+        try:
+            container = av.open(file_path)
+            if len(container.streams.video) > 0:
+                is_video = True
+            container.close()
+        except:
+            is_video = False
+        if is_video:
+            print(f"Processing as video: {file_path}")
+            frames = load_video(file_path, max_frames=64)
+            if frames is None or len(frames) == 0:
+                return "Error: Could not decode video file."
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "video", "video": frames},
+                        {"type": "text", "text": message},
+                    ],
+                }
+            ]
+            inputs = processor.apply_chat_template(
+                messages, tokenize=True, add_generation_prompt=True,
+                return_dict=True, return_tensors="pt",
+                processor_kwargs={
+                    "downsample_mode": downsample_mode,
+                    "max_num_frames": 64,
+                    "stack_frames": 1,
+                    "max_slice_nums": 1,
+                    "use_image_id": False,
+                    "do_sample_frames": False, # Fix: Avoid requiring metadata since we already sampled
+                }
+            ).to(model.device)
+        else:
+            print(f"Processing as image: {file_path}")
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "url": file_path},
+                        {"type": "text", "text": message},
+                    ],
+                }
+            ]
+            inputs = processor.apply_chat_template(
+                messages, tokenize=True, add_generation_prompt=True,
+                return_dict=True, return_tensors="pt",
+                processor_kwargs={
+                    "downsample_mode": downsample_mode,
+                    "max_slice_nums": 9,
+                }
+            ).to(model.device)
+    with torch.no_grad():
+        generate_kwargs = {
+            **inputs,
+            "max_new_tokens": 1024,
+            "do_sample": True,
+            "temperature": 0.7
+        }
+        if file is not None:
+            generate_kwargs["downsample_mode"] = downsample_mode
+        generated_ids = model.generate(**generate_kwargs)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    return normalize_response_text(output_text[0])
+@app.get("/", response_class=HTMLResponse)
+async def homepage():
+    html_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.html")
+    with open(html_path, "r", encoding="utf-8") as f:
+        return f.read()
+if __name__ == "__main__":  # start the server only when run as a script, not on import
+    app.launch(show_error=True)  # NOTE(review): show_error presumably surfaces server errors to the client - confirm

index.html ADDED Viewed

	@@ -0,0 +1,333 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
+    <title>MiniCPM-V | OpenBMB</title>
+    <script src="https://cdn.tailwindcss.com"></script>
+    <link href="https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700&family=Inter:wght@300;400;500;600&display=swap" rel="stylesheet">
+    <script src="https://unpkg.com/lucide@latest"></script>
+    <style>
+        :root {
+            --bg: #0A0C10;
+            --blue: #3B5BFF;
+            --cyan: #27D4EA;
+            --text: #FFFFFF;
+            --text-muted: #6E7681;
+            --glass: rgba(255, 255, 255, 0.03);
+            --glass-border: rgba(255, 255, 255, 0.1);
+        }
+        body {
+            font-family: 'Inter', sans-serif;
+            background-color: var(--bg);
+            color: var(--text);
+            height: 100vh;
+            margin: 0;
+            display: flex;
+            flex-direction: column;
+            overflow: hidden; /* Prevent body scroll */
+        }
+        h1, h2, h3 { font-family: 'Outfit', sans-serif; }
+        .chat-scroll-area {
+            flex: 1;
+            overflow-y: auto;
+            padding-bottom: 120px; /* Space for floating input */
+            -webkit-overflow-scrolling: touch;
+        }
+        /* Modern Scrollbar */
+        .chat-scroll-area::-webkit-scrollbar {
+            width: 5px;
+        }
+        .chat-scroll-area::-webkit-scrollbar-track {
+            background: transparent;
+        }
+        .chat-scroll-area::-webkit-scrollbar-thumb {
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 10px;
+        }
+        .message-bubble {
+            max-width: 85%;
+            animation: fadeIn 0.4s cubic-bezier(0.16, 1, 0.3, 1) forwards;
+        }
+        @keyframes fadeIn {
+            from { opacity: 0; transform: translateY(10px); }
+            to { opacity: 1; transform: translateY(0); }
+        }
+        .user-message {
+            background: linear-gradient(135deg, var(--blue), var(--cyan));
+            color: #FFFFFF;
+            box-shadow: 0 10px 30px rgba(59, 91, 255, 0.2);
+        }
+        .bot-message {
+            background: rgba(255, 255, 255, 0.04);
+            border: 1px solid var(--glass-border);
+        }
+        .typing-dot {
+            width: 4px;
+            height: 4px;
+            background: var(--cyan);
+            border-radius: 50%;
+            animation: bounce 1.4s infinite ease-in-out;
+        }
+        .typing-dot:nth-child(2) { animation-delay: 0.2s; }
+        .typing-dot:nth-child(3) { animation-delay: 0.4s; }
+        @keyframes bounce {
+            0%, 80%, 100% { transform: scale(0.3); opacity: 0.4; }
+            40% { transform: scale(1); opacity: 1; }
+        }
+        .input-pill {
+            background: rgba(255, 255, 255, 0.05);
+            backdrop-filter: blur(20px);
+            -webkit-backdrop-filter: blur(20px);
+            border: 1px solid var(--glass-border);
+            transition: all 0.3s ease;
+        }
+        .input-pill:focus-within {
+            border-color: var(--blue);
+            box-shadow: 0 0 30px rgba(59, 91, 255, 0.1);
+        }
+        .logo-glow {
+            filter: drop-shadow(0 0 10px rgba(39, 212, 234, 0.3));
+        }
+        .send-btn {
+            background: linear-gradient(135deg, var(--blue), var(--cyan));
+            transition: transform 0.2s ease, opacity 0.2s ease;
+        }
+        .send-btn:active { transform: scale(0.95); }
+        #user-input::placeholder { color: #555; }
+        /* Utility classes used by the markup below that are neither part of
+           Tailwind's default theme nor defined elsewhere in this stylesheet. */
+        .text-muted { color: var(--text-muted); }
+        .scrollbar-none { scrollbar-width: none; }
+        .scrollbar-none::-webkit-scrollbar { display: none; }
+    </style>
+</head>
+<body>
+    <!-- Minimalist Header -->
+    <header class="h-20 flex items-center justify-between px-6 md:px-12 shrink-0 z-50">
+        <div class="flex items-center gap-4">
+            <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/1670387859384-633fe7784b362488336bbfad.png"
+                 alt="OpenBMB" class="w-10 h-10 logo-glow">
+            <div>
+                <h1 class="text-xl font-bold tracking-tight">MiniCPM-V</h1>
+                <p class="text-[10px] text-muted uppercase tracking-[0.2em] font-medium">By OpenBMB</p>
+            </div>
+        </div>
+        <div class="hidden md:flex items-center gap-2 text-[10px] font-bold text-muted uppercase tracking-widest">
+            <span class="w-1.5 h-1.5 rounded-full bg-[#27D4EA] animate-pulse"></span>
+            Vision System Online
+        </div>
+    </header>
+    <!-- Chat Messages Scroll Area -->
+    <main id="chat-messages" class="chat-scroll-area px-4 md:px-0">
+        <div class="max-w-3xl mx-auto space-y-8 pt-4">
+            <!-- Bot Greeting -->
+            <div class="flex gap-4 items-start">
+                <div class="bot-message p-6 rounded-3xl rounded-tl-none message-bubble shadow-2xl">
+                    <p class="text-white/90 leading-relaxed text-[15px]">
+                        Welcome to <span class="font-bold text-[#27D4EA]">MiniCPM-V 4.6</span>.
+                        I can analyze images and videos with high efficiency.
+                        <br><br>
+                        Drop a file below to begin.
+                    </p>
+                </div>
+            </div>
+        </div>
+    </main>
+    <!-- Floating Input Bar -->
+    <div class="fixed bottom-0 left-0 right-0 p-6 md:p-10 pointer-events-none">
+        <div class="max-w-3xl mx-auto pointer-events-auto">
+            <!-- Media Preview -->
+            <div id="preview-container" class="hidden mb-6 animate-in">
+                <div class="relative inline-block group">
+                    <img id="image-preview" src="" class="h-36 w-auto rounded-3xl border border-white/20 shadow-2xl hidden object-cover" />
+                    <video id="video-preview" class="h-36 w-auto rounded-3xl border border-white/20 shadow-2xl hidden object-cover" muted loop></video>
+                    <button id="cancel-file" class="absolute -top-3 -right-3 bg-white text-black rounded-full p-2 shadow-xl hover:bg-neutral-200 transition-all">
+                        <i data-lucide="x" class="w-4 h-4"></i>
+                    </button>
+                </div>
+            </div>
+            <!-- Pill Input -->
+            <div class="input-pill rounded-[2.5rem] p-2 flex items-end gap-2 pr-3 shadow-2xl">
+                <div class="flex items-center">
+                    <input type="file" id="file-input" class="hidden" accept="image/*,video/*">
+                    <button id="upload-trigger" class="p-4 text-white/30 hover:text-[#27D4EA] transition-colors">
+                        <i data-lucide="paperclip" class="w-6 h-6"></i>
+                    </button>
+                </div>
+                <textarea id="user-input" rows="1" placeholder="Type your message..."
+                          class="flex-1 bg-transparent border-none focus:ring-0 text-white py-4 px-1 resize-none max-h-40 scrollbar-none text-[16px] leading-relaxed"></textarea>
+                <button id="send-btn" class="send-btn w-12 h-12 text-white rounded-full flex items-center justify-center disabled:opacity-20 disabled:grayscale group shrink-0 mb-1">
+                    <i data-lucide="arrow-up" class="w-5 h-5 group-hover:scale-110 transition-transform" id="send-icon"></i>
+                    <i data-lucide="loader-2" class="w-5 h-5 animate-spin hidden" id="loading-icon"></i>
+                </button>
+            </div>
+        </div>
+    </div>
+    <script type="module">
+        import { Client, handle_file } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
+        lucide.createIcons();
+        const chatMessages = document.getElementById('chat-messages');
+        const userInput = document.getElementById('user-input');
+        const sendBtn = document.getElementById('send-btn');
+        const fileInput = document.getElementById('file-input');
+        const uploadTrigger = document.getElementById('upload-trigger');
+        const previewContainer = document.getElementById('preview-container');
+        const imagePreview = document.getElementById('image-preview');
+        const videoPreview = document.getElementById('video-preview');
+        const cancelFile = document.getElementById('cancel-file');
+        const sendIcon = document.getElementById('send-icon');
+        const loadingIcon = document.getElementById('loading-icon');
+        let selectedFile = null;
+        let client = null;
+        async function init() {
+            try {
+                client = await Client.connect(window.location.origin);
+            } catch (err) { console.error("Gradio Connection Error", err); }
+        }
+        init();
+        // UI Interactions
+        uploadTrigger.onclick = () => fileInput.click();
+        fileInput.onchange = (e) => {
+            const file = e.target.files[0];
+            if (file) {
+                selectedFile = file;
+                previewContainer.classList.remove('hidden');
+                const url = URL.createObjectURL(file);
+                if (file.type.startsWith('image/')) {
+                    imagePreview.src = url;
+                    imagePreview.classList.remove('hidden');
+                    videoPreview.classList.add('hidden');
+                } else {
+                    videoPreview.src = url;
+                    videoPreview.classList.remove('hidden');
+                    imagePreview.classList.add('hidden');
+                    videoPreview.play();
+                }
+            }
+        };
+        cancelFile.onclick = () => {
+            selectedFile = null;
+            fileInput.value = '';
+            previewContainer.classList.add('hidden');
+            imagePreview.src = '';
+            videoPreview.src = '';
+        };
+        function appendMessage(role, text, mediaUrl = null, mediaType = null) {
+            const div = document.createElement('div');
+            div.className = `flex gap-4 items-start ${role === 'user' ? 'flex-row-reverse' : ''}`;
+            let mediaHtml = '';
+            if (mediaUrl) {
+                if (mediaType.startsWith('image')) {
+                    mediaHtml = `<img src="${mediaUrl}" class="max-w-xs md:max-w-md rounded-3xl mb-4 border border-white/10" />`;
+                } else {
+                    mediaHtml = `<video src="${mediaUrl}" controls class="max-w-xs md:max-w-md rounded-3xl mb-4 border border-white/10"></video>`;
+                }
+            }
+            const bubbleClass = role === 'user' ? 'user-message' : 'bot-message';
+            div.innerHTML = `
+                <div class="${bubbleClass} p-6 rounded-[2rem] ${role === 'user' ? 'rounded-tr-none' : 'rounded-tl-none'} message-bubble shadow-xl">
+                    ${mediaHtml}
+                    <p class="leading-relaxed text-[15px] whitespace-pre-wrap font-medium">${text}</p>
+                </div>
+            `;
+            // Get the inner container
+            const container = chatMessages.querySelector('.max-w-3xl');
+            container.appendChild(div);
+            // Smooth scroll to bottom
+            chatMessages.scrollTo({ top: chatMessages.scrollHeight, behavior: 'smooth' });
+        }
+        async function sendMessage() {
+            const text = userInput.value.trim();
+            if (!text && !selectedFile) return;
+            const content = text;
+            const file = selectedFile;
+            userInput.value = '';
+            userInput.style.height = 'auto';
+            const fileUrl = file ? URL.createObjectURL(file) : null;
+            const fileType = file ? file.type : null;
+            appendMessage('user', content, fileUrl, fileType);
+            cancelFile.click();
+            sendIcon.classList.add('hidden');
+            loadingIcon.classList.remove('hidden');
+            sendBtn.disabled = true;
+            const thinkingId = 'think-' + Date.now();
+            const thinkingDiv = document.createElement('div');
+            thinkingDiv.id = thinkingId;
+            thinkingDiv.className = 'flex gap-4 items-start';
+            thinkingDiv.innerHTML = `
+                <div class="bot-message p-6 rounded-[2rem] rounded-tl-none message-bubble flex items-center gap-4">
+                    <div class="flex gap-1.5">
+                        <div class="typing-dot"></div><div class="typing-dot"></div><div class="typing-dot"></div>
+                    </div>
+                </div>
+            `;
+            const container = chatMessages.querySelector('.max-w-3xl');
+            container.appendChild(thinkingDiv);
+            chatMessages.scrollTo({ top: chatMessages.scrollHeight, behavior: 'smooth' });
+            try {
+                let fileData = file ? handle_file(file) : null;
+                const result = await client.predict("/predict", {
+                    message: content,
+                    file: fileData,
+                    downsample_mode: "16x"
+                });
+                document.getElementById(thinkingId).remove();
+                appendMessage('bot', result.data);
+            } catch (err) {
+                document.getElementById(thinkingId).remove();
+                appendMessage('bot', "The system encountered an error. Please check your file format and try again.");
+            } finally {
+                sendIcon.classList.remove('hidden');
+                loadingIcon.classList.add('hidden');
+                sendBtn.disabled = false;
+            }
+        }
+        sendBtn.onclick = sendMessage;
+        userInput.onkeydown = (e) => {
+            if (e.key === 'Enter' && !e.shiftKey) {
+                e.preventDefault();
+                sendMessage();
+            }
+        };
+    </script>
+</body>
+</html>

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+transformers>=4.44.0
+torch
+torchvision
+gradio
+fastapi
+spaces
+av
+pillow

style.css ADDED Viewed

	@@ -0,0 +1,28 @@

+body {
+	padding: 2rem;
+	font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
+}
+h1 {
+	font-size: 16px;
+	margin-top: 0;
+}
+p {
+	color: rgb(107, 114, 128);
+	font-size: 15px;
+	margin-bottom: 10px;
+	margin-top: 5px;
+}
+.card {
+	max-width: 620px;
+	margin: 0 auto;
+	padding: 16px;
+	border: 1px solid lightgray;
+	border-radius: 16px;
+}
+.card p:last-child {
+	margin-bottom: 0;
+}