akhaliq HF Staff committed on
Commit
f1f0cc8
·
1 Parent(s): ddaae0c

feat: implement multimodal MiniCPM-V 4.6 inference application with FastAPI and custom web interface

Browse files
Files changed (3) hide show
  1. app.py +112 -0
  2. index.html +336 -0
  3. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import re
4
+ from PIL import Image
5
+ from transformers import AutoModelForImageTextToText, AutoProcessor
6
+ from gradio import Server
7
+ from gradio.data_classes import FileData
8
+ from fastapi.responses import HTMLResponse
9
+ import spaces
10
+
11
+ # Load model and processor
12
+ model_id = "openbmb/MiniCPM-V-4.6"
13
+ print(f"Loading model: {model_id}...")
14
+
15
+ processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
16
+ model = AutoModelForImageTextToText.from_pretrained(
17
+ model_id,
18
+ torch_dtype=torch.bfloat16,
19
+ trust_remote_code=True,
20
+ device_map="auto"
21
+ )
22
+
23
+ # Utility for response normalization
24
+ _PATTERN = re.compile(
25
+ r'(```[\s\S]*?```|`[^`]+`|\$\$[\s\S]*?\$\$|\$[^$]+\$|\\\([\s\S]*?\\\)|\\\[[\s\S]*?\\\])'
26
+ r'|(?<!\\)(?:\\r\\n|\\[nr])'
27
+ )
28
+
29
def normalize_response_text(text: str) -> str:
    r"""Convert literal ``\n`` / ``\r`` / ``\r\n`` escape sequences to newlines.

    Spans captured by group 1 of the module-level ``_PATTERN`` are copied
    through untouched; every other match is replaced with a single real
    newline character.

    Args:
        text: Decoded model output.

    Returns:
        The normalized string. Non-string values and strings without any
        backslash are returned unchanged.
    """
    needs_work = isinstance(text, str) and "\\" in text
    if not needs_work:
        return text

    def _replacement(match):
        # Group 1 is set only for protected spans; keep those verbatim.
        protected = match.group(1)
        if protected:
            return protected
        return '\n'

    return _PATTERN.sub(_replacement, text)
33
+
34
+ app = Server()
35
+
36
# NOTE(review): decorators apply bottom-up, so app.api() receives the raw
# function and spaces.GPU wraps whatever app.api() returns. Confirm that the
# registered endpoint really executes under the GPU decorator.
@spaces.GPU
@app.api()
def predict(message: str, file: FileData = None, downsample_mode: str = "16x"):
    """
    General inference endpoint for text-only, image and video requests.

    Args:
        message: The user's prompt text.
        file: Optional uploaded media. Its "path" entry is inspected; files
            ending in .mp4/.mkv/.mov/.avi are sent as video, anything else
            as an image. When omitted, a text-only prompt is sent.
        downsample_mode: Forwarded to the processor (for media inputs) and
            to ``model.generate``.

    Returns:
        The decoded model reply, passed through ``normalize_response_text``.
    """
    video_extensions = ('.mp4', '.mkv', '.mov', '.avi')

    content = []
    template_kwargs = {}
    if file is not None:
        file_path = file["path"]
        if file_path.lower().endswith(video_extensions):
            content.append({"type": "video", "url": file_path})
            # Video specific params
            template_kwargs = {
                "downsample_mode": downsample_mode,
                "max_num_frames": 64,  # Optimized for speed
                "stack_frames": 1,
                "max_slice_nums": 1,
                "use_image_id": False,
            }
        else:
            content.append({"type": "image", "url": file_path})
            # Image specific params
            template_kwargs = {
                "downsample_mode": downsample_mode,
                "max_slice_nums": 9,
            }
    content.append({"type": "text", "text": message})
    messages = [{"role": "user", "content": content}]

    # Always tokenize here. Previously the text-only path built `messages`
    # but never created `inputs`, so generation failed with UnboundLocalError.
    inputs = processor.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True,
        return_dict=True, return_tensors="pt",
        **template_kwargs,
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            downsample_mode=downsample_mode,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.7,
        )

    # generate() returns prompt + completion; drop the prompt tokens per row.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return normalize_response_text(output_text[0])
104
+
105
@app.get("/", response_class=HTMLResponse)
async def homepage():
    """Serve the contents of ``index.html`` located next to this module."""
    app_dir = os.path.dirname(os.path.abspath(__file__))
    index_file = os.path.join(app_dir, "index.html")
    with open(index_file, "r", encoding="utf-8") as handle:
        page = handle.read()
    return page
110
+
111
+ if __name__ == "__main__":
112
+ app.launch(show_error=True)
index.html ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>MiniCPM-V 4.6 | Next-Gen Multimodal AI</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
9
+ <script src="https://unpkg.com/lucide@latest"></script>
10
+ <style>
11
+ :root {
12
+ --glass-bg: rgba(17, 24, 39, 0.7);
13
+ --glass-border: rgba(255, 255, 255, 0.1);
14
+ --accent: #6366f1;
15
+ --accent-glow: rgba(99, 102, 241, 0.3);
16
+ }
17
+
18
+ body {
19
+ font-family: 'Inter', sans-serif;
20
+ background-color: #030712;
21
+ color: #f3f4f6;
22
+ overflow-x: hidden;
23
+ }
24
+
25
+ .glass {
26
+ background: var(--glass-bg);
27
+ backdrop-filter: blur(12px);
28
+ border: 1px solid var(--glass-border);
29
+ }
30
+
31
+ .chat-container {
32
+ height: calc(100vh - 180px);
33
+ scrollbar-width: thin;
34
+ scrollbar-color: var(--glass-border) transparent;
35
+ }
36
+
37
+ .chat-container::-webkit-scrollbar {
38
+ width: 6px;
39
+ }
40
+
41
+ .chat-container::-webkit-scrollbar-thumb {
42
+ background: var(--glass-border);
43
+ border-radius: 10px;
44
+ }
45
+
46
+ .message-anim {
47
+ animation: slideUp 0.3s ease-out forwards;
48
+ }
49
+
50
+ @keyframes slideUp {
51
+ from { opacity: 0; transform: translateY(10px); }
52
+ to { opacity: 1; transform: translateY(0); }
53
+ }
54
+
55
+ .gradient-text {
56
+ background: linear-gradient(135deg, #818cf8, #c084fc);
57
+ -webkit-background-clip: text;
58
+ -webkit-text-fill-color: transparent;
59
+ }
60
+
61
+ .glow-button {
62
+ transition: all 0.3s ease;
63
+ }
64
+
65
+ .glow-button:hover {
66
+ box-shadow: 0 0 20px var(--accent-glow);
67
+ transform: translateY(-1px);
68
+ }
69
+
70
+ .file-preview-container {
71
+ position: relative;
72
+ display: inline-block;
73
+ }
74
+
75
+ .remove-file {
76
+ position: absolute;
77
+ top: -8px;
78
+ right: -8px;
79
+ background: #ef4444;
80
+ border-radius: 50%;
81
+ padding: 2px;
82
+ cursor: pointer;
83
+ display: none;
84
+ }
85
+
86
+ .file-preview-container:hover .remove-file {
87
+ display: block;
88
+ }
89
+
90
+ #loading-spinner {
91
+ display: none;
92
+ }
93
+ </style>
94
+ </head>
95
+ <body class="min-h-screen flex flex-col">
96
+ <!-- Header -->
97
+ <header class="h-16 glass fixed top-0 w-full z-50 flex items-center justify-between px-6 border-b border-white/5">
98
+ <div class="flex items-center gap-3">
99
+ <div class="w-8 h-8 bg-indigo-600 rounded-lg flex items-center justify-center">
100
+ <i data-lucide="zap" class="w-5 h-5 text-white"></i>
101
+ </div>
102
+ <h1 class="text-xl font-bold tracking-tight gradient-text">MiniCPM-V 4.6</h1>
103
+ </div>
104
+ <div class="flex items-center gap-6 text-sm font-medium text-gray-400">
105
+ <a href="#" class="hover:text-white transition-colors">Docs</a>
106
+ <a href="#" class="hover:text-white transition-colors">GitHub</a>
107
+ <div class="h-4 w-[1px] bg-white/10"></div>
108
+ <button class="glass px-4 py-1.5 rounded-full text-xs border border-white/10 hover:bg-white/5 transition-all">
109
+ v4.6.0-stable
110
+ </button>
111
+ </div>
112
+ </header>
113
+
114
+ <!-- Sidebar -->
115
+ <aside class="fixed left-0 top-16 w-64 h-full glass border-r border-white/5 p-4 hidden md:block">
116
+ <div class="mb-8">
117
+ <h2 class="text-xs font-semibold text-gray-500 uppercase tracking-wider mb-4">Mode Settings</h2>
118
+ <div class="space-y-4">
119
+ <div>
120
+ <label class="text-xs text-gray-400 mb-2 block">Downsample Mode</label>
121
+ <select id="downsample-mode" class="w-full bg-black/40 border border-white/10 rounded-lg px-3 py-2 text-sm focus:outline-none focus:ring-1 focus:ring-indigo-500">
122
+ <option value="16x">16x (Fast)</option>
123
+ <option value="4x">4x (Finer Detail)</option>
124
+ </select>
125
+ </div>
126
+ </div>
127
+ </div>
128
+
129
+ <div>
130
+ <h2 class="text-xs font-semibold text-gray-500 uppercase tracking-wider mb-4">Quick Actions</h2>
131
+ <button class="w-full text-left px-3 py-2 text-sm text-gray-400 hover:text-white hover:bg-white/5 rounded-lg transition-all flex items-center gap-3">
132
+ <i data-lucide="image" class="w-4 h-4"></i> Image Analysis
133
+ </button>
134
+ <button class="w-full text-left px-3 py-2 text-sm text-gray-400 hover:text-white hover:bg-white/5 rounded-lg transition-all flex items-center gap-3">
135
+ <i data-lucide="video" class="w-4 h-4"></i> Video Understanding
136
+ </button>
137
+ </div>
138
+ </aside>
139
+
140
+ <!-- Main Chat Area -->
141
+ <main class="flex-1 mt-16 md:ml-64 p-4 md:p-8 flex flex-col">
142
+ <div id="chat-messages" class="chat-container space-y-6 pb-24 overflow-y-auto">
143
+ <!-- Welcome Message -->
144
+ <div class="flex gap-4 max-w-3xl mx-auto items-start message-anim">
145
+ <div class="w-8 h-8 rounded-full bg-indigo-500/20 flex items-center justify-center shrink-0 border border-indigo-500/30">
146
+ <i data-lucide="bot" class="w-4 h-4 text-indigo-400"></i>
147
+ </div>
148
+ <div class="glass p-5 rounded-2xl rounded-tl-none border border-white/5">
149
+ <p class="text-gray-200 leading-relaxed">
150
+ Hello! I am <strong>MiniCPM-V 4.6</strong>, an ultra-efficient multimodal assistant. I can help you understand images and videos with high precision.
151
+ <br><br>
152
+ Try uploading an image or a video to get started!
153
+ </p>
154
+ </div>
155
+ </div>
156
+ </div>
157
+
158
+ <!-- Input Section -->
159
+ <div class="fixed bottom-0 left-0 md:left-64 right-0 p-4 bg-gradient-to-t from-[#030712] via-[#030712] to-transparent">
160
+ <div class="max-w-4xl mx-auto glass rounded-2xl p-2 border border-white/10 shadow-2xl">
161
+ <div id="preview-area" class="px-4 py-2 hidden">
162
+ <div class="file-preview-container">
163
+ <img id="image-preview" src="" class="h-20 w-auto rounded-lg border border-white/10 hidden" />
164
+ <video id="video-preview" class="h-20 w-auto rounded-lg border border-white/10 hidden" muted loop></video>
165
+ <div id="remove-file-btn" class="remove-file"><i data-lucide="x" class="w-3 h-3 text-white"></i></div>
166
+ </div>
167
+ </div>
168
+
169
+ <div class="flex items-end gap-2 px-2 pb-1 pt-1">
170
+ <button id="upload-btn" class="p-3 text-gray-400 hover:text-white hover:bg-white/5 rounded-xl transition-all">
171
+ <i data-lucide="paperclip" class="w-5 h-5"></i>
172
+ </button>
173
+ <input type="file" id="file-input" class="hidden" accept="image/*,video/*">
174
+
175
+ <textarea id="user-input" rows="1" placeholder="Ask anything about the media..." class="flex-1 bg-transparent border-none focus:ring-0 text-white placeholder-gray-500 py-3 resize-none max-h-48 scrollbar-none" oninput="this.style.height = ''; this.style.height = this.scrollHeight + 'px'"></textarea>
176
+
177
+ <button id="send-btn" class="bg-indigo-600 hover:bg-indigo-500 text-white p-3 rounded-xl glow-button flex items-center justify-center disabled:opacity-50 disabled:cursor-not-allowed">
178
+ <i data-lucide="arrow-up" class="w-5 h-5" id="send-icon"></i>
179
+ <i data-lucide="loader-2" class="w-5 h-5 animate-spin hidden" id="loading-spinner"></i>
180
+ </button>
181
+ </div>
182
+ </div>
183
+ <p class="text-[10px] text-center text-gray-600 mt-2">MiniCPM-V 4.6 may produce inaccurate information about people, places, or facts.</p>
184
+ </div>
185
+ </main>
186
+
187
+ <script type="module">
188
+ import { Client, handle_file } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
189
+
190
+ // Initialize Lucide icons
191
+ lucide.createIcons();
192
+
193
+ const chatMessages = document.getElementById('chat-messages');
194
+ const userInput = document.getElementById('user-input');
195
+ const sendBtn = document.getElementById('send-btn');
196
+ const fileInput = document.getElementById('file-input');
197
+ const uploadBtn = document.getElementById('upload-btn');
198
+ const previewArea = document.getElementById('preview-area');
199
+ const imagePreview = document.getElementById('image-preview');
200
+ const videoPreview = document.getElementById('video-preview');
201
+ const removeFileBtn = document.getElementById('remove-file-btn');
202
+ const downsampleMode = document.getElementById('downsample-mode');
203
+ const sendIcon = document.getElementById('send-icon');
204
+ const loadingSpinner = document.getElementById('loading-spinner');
205
+
206
+ let selectedFile = null;
207
+ let client = null;
208
+
209
+ async function initClient() {
210
+ try {
211
+ client = await Client.connect(window.location.origin);
212
+ console.log("Gradio Client Connected");
213
+ } catch (error) {
214
+ console.error("Failed to connect to Gradio backend:", error);
215
+ }
216
+ }
217
+
218
+ initClient();
219
+
220
+ uploadBtn.onclick = () => fileInput.click();
221
+
222
+ fileInput.onchange = (e) => {
223
+ const file = e.target.files[0];
224
+ if (file) {
225
+ selectedFile = file;
226
+ previewArea.classList.remove('hidden');
227
+
228
+ const url = URL.createObjectURL(file);
229
+ if (file.type.startsWith('image/')) {
230
+ imagePreview.src = url;
231
+ imagePreview.classList.remove('hidden');
232
+ videoPreview.classList.add('hidden');
233
+ } else {
234
+ videoPreview.src = url;
235
+ videoPreview.classList.remove('hidden');
236
+ imagePreview.classList.add('hidden');
237
+ videoPreview.play();
238
+ }
239
+ }
240
+ };
241
+
242
+ removeFileBtn.onclick = () => {
243
+ selectedFile = null;
244
+ fileInput.value = '';
245
+ previewArea.classList.add('hidden');
246
+ imagePreview.src = '';
247
+ videoPreview.src = '';
248
+ videoPreview.pause();
249
+ };
250
+
251
/**
 * Append a chat bubble to the message list and scroll it into view.
 *
 * @param {string} role - 'user' renders a right-aligned bubble; any other value renders the assistant style.
 * @param {*} content - Message text. Inserted as plain text, never parsed as HTML.
 * @param {?string} fileUrl - Optional URL of attached media to show above the text.
 * @param {?string} fileType - MIME type of the attachment; values starting with 'image' render an <img>, anything else a <video>.
 */
function addMessage(role, content, fileUrl = null, fileType = null) {
    const isUser = role === 'user';
    const icon = isUser ? 'user' : 'bot';
    const iconColor = isUser ? 'gray' : 'indigo';

    const div = document.createElement('div');
    div.className = `flex gap-4 max-w-3xl mx-auto items-start message-anim ${isUser ? 'flex-row-reverse' : ''}`;

    // Only fixed, locally-computed values are interpolated into this markup.
    div.innerHTML = `
        <div class="w-8 h-8 rounded-full bg-${iconColor}-500/20 flex items-center justify-center shrink-0 border border-${iconColor}-500/30">
            <i data-lucide="${icon}" class="w-4 h-4 text-${iconColor}-400"></i>
        </div>
        <div class="glass p-5 rounded-2xl ${isUser ? 'rounded-tr-none' : 'rounded-tl-none'} border border-white/5">
            <div class="text-gray-200 leading-relaxed whitespace-pre-wrap"></div>
        </div>
    `;

    const bubble = div.lastElementChild;
    const textEl = bubble.firstElementChild;

    // User input and model output are untrusted: textContent prevents
    // them from being interpreted as markup (HTML/script injection).
    textEl.textContent = content == null ? '' : String(content);

    if (fileUrl) {
        const isImage = (fileType || '').startsWith('image');
        const media = document.createElement(isImage ? 'img' : 'video');
        media.src = fileUrl;
        if (!isImage) {
            media.controls = true;
        }
        media.className = 'max-w-xs rounded-lg mb-3 border border-white/10';
        bubble.insertBefore(media, textEl);
    }

    chatMessages.appendChild(div);
    lucide.createIcons();
    chatMessages.scrollTop = chatMessages.scrollHeight;
}
280
+
281
/**
 * Send the current prompt (and optional attachment) to the /predict
 * endpoint and render both the user message and the assistant reply.
 */
async function handleSend() {
    // The Enter key handler bypasses the disabled button, so block
    // overlapping requests here as well.
    if (sendBtn.disabled) return;

    const text = userInput.value.trim();
    if (!text && !selectedFile) return;

    const currentFile = selectedFile;
    const currentMode = downsampleMode.value;

    // Clear the composer right away. Resetting the attachment only after
    // the response arrived could discard a file picked during the request.
    userInput.value = '';
    userInput.style.height = 'auto';
    removeFileBtn.onclick();

    const fileUrl = currentFile ? URL.createObjectURL(currentFile) : null;
    const fileType = currentFile ? currentFile.type : null;
    addMessage('user', text, fileUrl, fileType);

    // Show loading
    sendIcon.classList.add('hidden');
    loadingSpinner.classList.remove('hidden');
    sendBtn.disabled = true;

    try {
        // The initial connection may have failed or not finished yet.
        if (!client) {
            await initClient();
        }
        if (!client) {
            throw new Error("Gradio client is not connected");
        }

        const result = await client.predict("/predict", {
            message: text,
            file: currentFile ? handle_file(currentFile) : null,
            downsample_mode: currentMode
        });

        // The Gradio client returns outputs as an array; unwrap the single value.
        const reply = Array.isArray(result.data) ? result.data[0] : result.data;
        addMessage('assistant', reply);
    } catch (error) {
        console.error("Prediction failed:", error);
        addMessage('assistant', "Sorry, I encountered an error while processing your request.");
    } finally {
        sendIcon.classList.remove('hidden');
        loadingSpinner.classList.add('hidden');
        sendBtn.disabled = false;
    }
}
325
+
326
+ sendBtn.onclick = handleSend;
327
+
328
+ userInput.onkeydown = (e) => {
329
+ if (e.key === 'Enter' && !e.shiftKey) {
330
+ e.preventDefault();
331
+ handleSend();
332
+ }
333
+ };
334
+ </script>
335
+ </body>
336
+ </html>
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers>=4.44.0
2
+ torch
3
+ torchvision
4
+ gradio
5
+ fastapi
6
+ spaces
7
+ pillow
8
+ av
9
+ accelerate
10
+ sentencepiece