Spaces:

akhaliq
/

MiniCPM-V-4.6

Running on Zero

App Files Files Community

akhaliq HF Staff committed 9 days ago

Commit

f1f0cc8

1 Parent(s): ddaae0c

feat: implement multimodal MiniCPM-V 4.6 inference application with FastAPI and custom web interface

Browse files

Files changed (3) hide show

app.py +112 -0
index.html +336 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import os
+import torch
+import re
+from PIL import Image
+from transformers import AutoModelForImageTextToText, AutoProcessor
+from gradio import Server
+from gradio.data_classes import FileData
+from fastapi.responses import HTMLResponse
+import spaces
+# Load model and processor
+# Both objects are created once at import time and reused by predict() on every request.
+# NOTE(review): trust_remote_code=True is passed to both loaders — confirm the model repo is trusted.
+model_id = "openbmb/MiniCPM-V-4.6"
+print(f"Loading model: {model_id}...")
+processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+model = AutoModelForImageTextToText.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+    device_map="auto"
+)
+# Utility for response normalization
+# The pattern has two alternatives:
+#   1. Capture group 1: spans that must be kept verbatim — ``` fenced blocks, `inline code`,
+#      and spans delimited by $$...$$, $...$, \(...\) or \[...\].
+#   2. No group: a literal (escaped) backslash-r-backslash-n, backslash-n or backslash-r
+#      sequence that is not itself preceded by a backslash.
+# Group 1 is listed first so escape sequences inside those spans are consumed with the span.
+_PATTERN = re.compile(
+    r'(```[\s\S]*?```|`[^`]+`|\$\$[\s\S]*?\$\$|\$[^$]+\$|\\\([\s\S]*?\\\)|\\\[[\s\S]*?\\\])'
+    r'|(?<!\\)(?:\\r\\n|\\[nr])'
+)
+def normalize_response_text(text: str) -> str:
+    if not isinstance(text, str) or "\\" not in text:
+        return text
+    return _PATTERN.sub(lambda m: m.group(1) or '\n', text)
+# Application object; the @app.api() and @app.get() decorators below attach handlers to it.
+app = Server()
+@spaces.GPU
+@app.api()
+def predict(message: str, file: FileData = None, downsample_mode: str = "16x"):
+    """
+    General inference endpoint for both image and video.
+    """
+    if file is None:
+        # Text-only inference (standard LLM behavior)
+        messages = [{"role": "user", "content": [{"type": "text", "text": message}]}]
+    else:
+        file_path = file["path"]
+        is_video = any(file_path.lower().endswith(ext) for ext in ['.mp4', '.mkv', '.mov', '.avi'])
+        if is_video:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "video", "url": file_path},
+                        {"type": "text", "text": message},
+                    ],
+                }
+            ]
+            # Video specific params
+            inputs = processor.apply_chat_template(
+                messages, tokenize=True, add_generation_prompt=True,
+                return_dict=True, return_tensors="pt",
+                downsample_mode=downsample_mode,
+                max_num_frames=64, # Optimized for speed
+                stack_frames=1,
+                max_slice_nums=1,
+                use_image_id=False,
+            ).to(model.device)
+        else:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "url": file_path},
+                        {"type": "text", "text": message},
+                    ],
+                }
+            ]
+            # Image specific params
+            inputs = processor.apply_chat_template(
+                messages, tokenize=True, add_generation_prompt=True,
+                return_dict=True, return_tensors="pt",
+                downsample_mode=downsample_mode,
+                max_slice_nums=9,
+            ).to(model.device)
+    with torch.no_grad():
+        generated_ids = model.generate(
+            **inputs,
+            downsample_mode=downsample_mode,
+            max_new_tokens=1024,
+            do_sample=True,
+            temperature=0.7
+        )
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    return normalize_response_text(output_text[0])
+@app.get("/", response_class=HTMLResponse)
+async def homepage():
+    html_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.html")
+    with open(html_path, "r", encoding="utf-8") as f:
+        return f.read()
+# Launch the app only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    app.launch(show_error=True)

index.html ADDED Viewed

	@@ -0,0 +1,336 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>MiniCPM-V 4.6 | Next-Gen Multimodal AI</title>
+    <script src="https://cdn.tailwindcss.com"></script>
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
+    <script src="https://unpkg.com/lucide@latest"></script>
+    <style>
+        :root {
+            --glass-bg: rgba(17, 24, 39, 0.7);
+            --glass-border: rgba(255, 255, 255, 0.1);
+            --accent: #6366f1;
+            --accent-glow: rgba(99, 102, 241, 0.3);
+        }
+        body {
+            font-family: 'Inter', sans-serif;
+            background-color: #030712;
+            color: #f3f4f6;
+            overflow-x: hidden;
+        }
+        .glass {
+            background: var(--glass-bg);
+            backdrop-filter: blur(12px);
+            border: 1px solid var(--glass-border);
+        }
+        .chat-container {
+            height: calc(100vh - 180px);
+            scrollbar-width: thin;
+            scrollbar-color: var(--glass-border) transparent;
+        }
+        .chat-container::-webkit-scrollbar {
+            width: 6px;
+        }
+        .chat-container::-webkit-scrollbar-thumb {
+            background: var(--glass-border);
+            border-radius: 10px;
+        }
+        .message-anim {
+            animation: slideUp 0.3s ease-out forwards;
+        }
+        @keyframes slideUp {
+            from { opacity: 0; transform: translateY(10px); }
+            to { opacity: 1; transform: translateY(0); }
+        }
+        .gradient-text {
+            background: linear-gradient(135deg, #818cf8, #c084fc);
+            -webkit-background-clip: text;
+            -webkit-text-fill-color: transparent;
+        }
+        .glow-button {
+            transition: all 0.3s ease;
+        }
+        .glow-button:hover {
+            box-shadow: 0 0 20px var(--accent-glow);
+            transform: translateY(-1px);
+        }
+        .file-preview-container {
+            position: relative;
+            display: inline-block;
+        }
+        .remove-file {
+            position: absolute;
+            top: -8px;
+            right: -8px;
+            background: #ef4444;
+            border-radius: 50%;
+            padding: 2px;
+            cursor: pointer;
+            display: none;
+        }
+        .file-preview-container:hover .remove-file {
+            display: block;
+        }
+        /* No #loading-spinner rule on purpose: the spinner starts hidden through its Tailwind
+           `hidden` class, which the script adds and removes. An ID-level `display: none` would
+           outrank that class and keep the spinner invisible even while a request is running. */
+    </style>
+</head>
+<body class="min-h-screen flex flex-col">
+    <!-- Header -->
+    <header class="h-16 glass fixed top-0 w-full z-50 flex items-center justify-between px-6 border-b border-white/5">
+        <div class="flex items-center gap-3">
+            <div class="w-8 h-8 bg-indigo-600 rounded-lg flex items-center justify-center">
+                <i data-lucide="zap" class="w-5 h-5 text-white"></i>
+            </div>
+            <h1 class="text-xl font-bold tracking-tight gradient-text">MiniCPM-V 4.6</h1>
+        </div>
+        <div class="flex items-center gap-6 text-sm font-medium text-gray-400">
+            <a href="#" class="hover:text-white transition-colors">Docs</a>
+            <a href="#" class="hover:text-white transition-colors">GitHub</a>
+            <div class="h-4 w-[1px] bg-white/10"></div>
+            <button class="glass px-4 py-1.5 rounded-full text-xs border border-white/10 hover:bg-white/5 transition-all">
+                v4.6.0-stable
+            </button>
+        </div>
+    </header>
+    <!-- Sidebar -->
+    <aside class="fixed left-0 top-16 w-64 h-full glass border-r border-white/5 p-4 hidden md:block">
+        <div class="mb-8">
+            <h2 class="text-xs font-semibold text-gray-500 uppercase tracking-wider mb-4">Mode Settings</h2>
+            <div class="space-y-4">
+                <div>
+                    <label class="text-xs text-gray-400 mb-2 block">Downsample Mode</label>
+                    <select id="downsample-mode" class="w-full bg-black/40 border border-white/10 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-1 focus:ring-indigo-500">
+                        <option value="16x">16x (Fast)</option>
+                        <option value="4x">4x (Finer Detail)</option>
+                    </select>
+                </div>
+            </div>
+        </div>
+        <div>
+            <h2 class="text-xs font-semibold text-gray-500 uppercase tracking-wider mb-4">Quick Actions</h2>
+            <button class="w-full text-left px-3 py-2 text-sm text-gray-400 hover:text-white hover:bg-white/5 rounded-lg transition-all flex items-center gap-3">
+                <i data-lucide="image" class="w-4 h-4"></i> Image Analysis
+            </button>
+            <button class="w-full text-left px-3 py-2 text-sm text-gray-400 hover:text-white hover:bg-white/5 rounded-lg transition-all flex items-center gap-3">
+                <i data-lucide="video" class="w-4 h-4"></i> Video Understanding
+            </button>
+        </div>
+    </aside>
+    <!-- Main Chat Area -->
+    <main class="flex-1 mt-16 md:ml-64 p-4 md:p-8 flex flex-col">
+        <div id="chat-messages" class="chat-container space-y-6 pb-24 overflow-y-auto">
+            <!-- Welcome Message -->
+            <div class="flex gap-4 max-w-3xl mx-auto items-start message-anim">
+                <div class="w-8 h-8 rounded-full bg-indigo-500/20 flex items-center justify-center shrink-0 border border-indigo-500/30">
+                    <i data-lucide="bot" class="w-4 h-4 text-indigo-400"></i>
+                </div>
+                <div class="glass p-5 rounded-2xl rounded-tl-none border border-white/5">
+                    <p class="text-gray-200 leading-relaxed">
+                        Hello! I am <strong>MiniCPM-V 4.6</strong>, an ultra-efficient multimodal assistant. I can help you understand images and videos with high precision.
+                        <br><br>
+                        Try uploading an image or a video to get started!
+                    </p>
+                </div>
+            </div>
+        </div>
+        <!-- Input Section -->
+        <div class="fixed bottom-0 left-0 md:left-64 right-0 p-4 bg-gradient-to-t from-[#030712] via-[#030712] to-transparent">
+            <div class="max-w-4xl mx-auto glass rounded-2xl p-2 border border-white/10 shadow-2xl">
+                <div id="preview-area" class="px-4 py-2 hidden">
+                    <div class="file-preview-container">
+                        <img id="image-preview" src="" class="h-20 w-auto rounded-lg border border-white/10 hidden" />
+                        <video id="video-preview" class="h-20 w-auto rounded-lg border border-white/10 hidden" muted loop></video>
+                        <div id="remove-file-btn" class="remove-file"><i data-lucide="x" class="w-3 h-3 text-white"></i></div>
+                    </div>
+                </div>
+                <div class="flex items-end gap-2 px-2 pb-1 pt-1">
+                    <button id="upload-btn" class="p-3 text-gray-400 hover:text-white hover:bg-white/5 rounded-xl transition-all">
+                        <i data-lucide="paperclip" class="w-5 h-5"></i>
+                    </button>
+                    <input type="file" id="file-input" class="hidden" accept="image/*,video/*">
+                    <textarea id="user-input" rows="1" placeholder="Ask anything about the media..." class="flex-1 bg-transparent border-none focus:ring-0 text-white placeholder-gray-500 py-3 resize-none max-h-48 scrollbar-none" oninput="this.style.height = ''; this.style.height = this.scrollHeight + 'px'"></textarea>
+                    <button id="send-btn" class="bg-indigo-600 hover:bg-indigo-500 text-white p-3 rounded-xl glow-button flex items-center justify-center disabled:opacity-50 disabled:cursor-not-allowed">
+                        <i data-lucide="arrow-up" class="w-5 h-5" id="send-icon"></i>
+                        <i data-lucide="loader-2" class="w-5 h-5 animate-spin hidden" id="loading-spinner"></i>
+                    </button>
+                </div>
+            </div>
+            <p class="text-[10px] text-center text-gray-600 mt-2">MiniCPM-V 4.6 may produce inaccurate information about people, places, or facts.</p>
+        </div>
+    </main>
+    <script type="module">
+        import { Client, handle_file } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
+        // Initialize Lucide icons
+        lucide.createIcons();
+        // Element references used by the handlers below (looked up after createIcons() has run).
+        const chatMessages = document.getElementById('chat-messages');
+        const userInput = document.getElementById('user-input');
+        const sendBtn = document.getElementById('send-btn');
+        const fileInput = document.getElementById('file-input');
+        const uploadBtn = document.getElementById('upload-btn');
+        const previewArea = document.getElementById('preview-area');
+        const imagePreview = document.getElementById('image-preview');
+        const videoPreview = document.getElementById('video-preview');
+        const removeFileBtn = document.getElementById('remove-file-btn');
+        const downsampleMode = document.getElementById('downsample-mode');
+        const sendIcon = document.getElementById('send-icon');
+        const loadingSpinner = document.getElementById('loading-spinner');
+        // File attached to the message being composed; null when nothing is attached.
+        let selectedFile = null;
+        // Backend client; stays null until initClient() connects successfully.
+        let client = null;
+        async function initClient() {
+            try {
+                client = await Client.connect(window.location.origin);
+                console.log("Gradio Client Connected");
+            } catch (error) {
+                console.error("Failed to connect to Gradio backend:", error);
+            }
+        }
+        initClient();
+        uploadBtn.onclick = () => fileInput.click();
+        fileInput.onchange = (e) => {
+            const file = e.target.files[0];
+            if (file) {
+                selectedFile = file;
+                previewArea.classList.remove('hidden');
+                const url = URL.createObjectURL(file);
+                if (file.type.startsWith('image/')) {
+                    imagePreview.src = url;
+                    imagePreview.classList.remove('hidden');
+                    videoPreview.classList.add('hidden');
+                } else {
+                    videoPreview.src = url;
+                    videoPreview.classList.remove('hidden');
+                    imagePreview.classList.add('hidden');
+                    videoPreview.play();
+                }
+            }
+        };
+        removeFileBtn.onclick = () => {
+            selectedFile = null;
+            fileInput.value = '';
+            previewArea.classList.add('hidden');
+            imagePreview.src = '';
+            videoPreview.src = '';
+            videoPreview.pause();
+        };
+        function addMessage(role, content, fileUrl = null, fileType = null) {
+            const div = document.createElement('div');
+            div.className = `flex gap-4 max-w-3xl mx-auto items-start message-anim ${role === 'user' ? 'flex-row-reverse' : ''}`;
+            const icon = role === 'user' ? 'user' : 'bot';
+            const iconColor = role === 'user' ? 'gray' : 'indigo';
+            let mediaHtml = '';
+            if (fileUrl) {
+                if (fileType.startsWith('image')) {
+                    mediaHtml = `<img src="${fileUrl}" class="max-w-xs rounded-lg mb-3 border border-white/10" />`;
+                } else {
+                    mediaHtml = `<video src="${fileUrl}" controls class="max-w-xs rounded-lg mb-3 border border-white/10"></video>`;
+                }
+            }
+            div.innerHTML = `
+                <div class="w-8 h-8 rounded-full bg-${iconColor}-500/20 flex items-center justify-center shrink-0 border border-${iconColor}-500/30">
+                    <i data-lucide="${icon}" class="w-4 h-4 text-${iconColor}-400"></i>
+                </div>
+                <div class="glass p-5 rounded-2xl ${role === 'user' ? 'rounded-tr-none' : 'rounded-tl-none'} border border-white/5">
+                    ${mediaHtml}
+                    <div class="text-gray-200 leading-relaxed whitespace-pre-wrap">${content}</div>
+                </div>
+            `;
+            chatMessages.appendChild(div);
+            lucide.createIcons();
+            chatMessages.scrollTop = chatMessages.scrollHeight;
+        }
+        async function handleSend() {
+            const text = userInput.value.trim();
+            if (!text && !selectedFile) return;
+            const currentFile = selectedFile;
+            const currentText = text;
+            const currentMode = downsampleMode.value;
+            // Clear input
+            userInput.value = '';
+            userInput.style.height = 'auto';
+            const fileUrl = currentFile ? URL.createObjectURL(currentFile) : null;
+            const fileType = currentFile ? currentFile.type : null;
+            addMessage('user', currentText, fileUrl, fileType);
+            // Show loading
+            sendIcon.classList.add('hidden');
+            loadingSpinner.classList.remove('hidden');
+            sendBtn.disabled = true;
+            try {
+                let fileData = null;
+                if (currentFile) {
+                    fileData = handle_file(currentFile);
+                }
+                const result = await client.predict("/predict", {
+                    message: currentText,
+                    file: fileData,
+                    downsample_mode: currentMode
+                });
+                addMessage('assistant', result.data);
+            } catch (error) {
+                console.error("Prediction failed:", error);
+                addMessage('assistant', "Sorry, I encountered an error while processing your request.");
+            } finally {
+                sendIcon.classList.remove('hidden');
+                loadingSpinner.classList.add('hidden');
+                sendBtn.disabled = false;
+                removeFileBtn.onclick(); // Reset preview
+            }
+        }
+        sendBtn.onclick = handleSend;
+        userInput.onkeydown = (e) => {
+            if (e.key === 'Enter' && !e.shiftKey) {
+                e.preventDefault();
+                handleSend();
+            }
+        };
+    </script>
+</body>
+</html>

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+transformers>=4.44.0
+torch
+torchvision
+gradio
+fastapi
+spaces
+pillow
+av
+accelerate
+sentencepiece