userisuser Cursor committed on
Commit
ecb8ee5
·
1 Parent(s): 5d404cd

Deploy MiniCPM-V 4.6 Gradio Server demo

Browse files

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (5) hide show
  1. README.md +23 -8
  2. app.py +176 -0
  3. index.html +333 -0
  4. requirements.txt +8 -0
  5. style.css +28 -0
README.md CHANGED
@@ -1,13 +1,28 @@
1
  ---
2
- title: MiniCPM V 4.6 Demo Gradio Server
3
- emoji: 🐒
4
- colorFrom: purple
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 6.14.0
8
- python_version: '3.13'
9
- app_file: app.py
10
  pinned: false
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: MiniCPM-V 4.6 Demo
3
+ emoji: πŸͺ
4
+ colorFrom: indigo
5
+ colorTo: pink
6
  sdk: gradio
7
+ models:
8
+ - openbmb/MiniCPM-V-4.6
 
9
  pinned: false
10
+ short_description: MiniCPM-V 4.6 Ultra-Efficient Multimodal AI
11
  ---
12
 
13
+ # MiniCPM-V 4.6 Demo
14
+
15
+ This Space hosts the official **MiniCPM-V 4.6** multimodal demo using the Gradio Server architecture.
16
+
17
+ ## Features
18
+ - **Ultra-Efficient**: Powered by MiniCPM-V 4.6 for fast image and video understanding.
19
+ - **ZeroGPU Optimization**: Uses dynamic GPU allocation for high performance.
20
+ - **Modern Minimalist UI**: Sleek, mobile-friendly interface designed by OpenBMB.
21
+
22
+ ## Technical Stack
23
+ - **Backend**: Gradio Server (FastAPI)
24
+ - **Frontend**: Custom HTML/JS/CSS with Lucide icons
25
+ - **Vision Logic**: Manual frame extraction via PyAV for robust video support
26
+
27
+ ---
28
+ For more information, visit the [MiniCPM-V GitHub Repository](https://github.com/OpenBMB/MiniCPM-V).
app.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import re
4
+ import av
5
+ from PIL import Image
6
+ from transformers import AutoModelForImageTextToText, AutoProcessor
7
+ from gradio import Server
8
+ from gradio.data_classes import FileData
9
+ from fastapi.responses import HTMLResponse
10
+ import spaces
11
+
12
+ # Load model and processor
13
+ model_id = "openbmb/MiniCPM-V-4.6"
14
+ print(f"Loading model: {model_id}...")
15
+
16
+ processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
17
+ model = AutoModelForImageTextToText.from_pretrained(
18
+ model_id,
19
+ torch_dtype=torch.bfloat16,
20
+ trust_remote_code=True,
21
+ device_map="cuda"
22
+ )
23
+
24
def load_video(video_path, max_frames=64):
    """Decode a video with PyAV and return at most ``max_frames`` evenly spaced frames.

    Args:
        video_path: Location of the video, passed straight to ``av.open``.
        max_frames: Upper bound on the number of frames returned.

    Returns:
        A list of PIL images in decode order, or ``None`` when the file cannot
        be opened or decoded (the error is printed rather than raised).
    """
    try:
        # The context manager closes the container even if decoding raises.
        with av.open(video_path) as container:
            total_frames = container.streams.video[0].frames

            if total_frames <= 0:
                # No frame count reported: decode everything first, then sample.
                print("Frame count unknown, decoding all and sampling...")
                decoded = [frame.to_image() for frame in container.decode(video=0)]
                if len(decoded) <= max_frames:
                    return decoded
                return [
                    decoded[int(i * len(decoded) / max_frames)]
                    for i in range(max_frames)
                ]

            # Never request more samples than there are frames. Otherwise the
            # index list contains duplicates, the cursor below gets stuck on
            # the first repeated index, and only one frame would be returned.
            sample_count = min(max_frames, total_frames)
            wanted = [int(i * total_frames / sample_count) for i in range(sample_count)]
            if not wanted:
                return []

            frames = []
            cursor = 0
            for position, frame in enumerate(container.decode(video=0)):
                if position == wanted[cursor]:
                    frames.append(frame.to_image())
                    cursor += 1
                    if cursor >= len(wanted):
                        # Everything requested is collected; stop decoding early.
                        break
            return frames
    except Exception as e:
        print(f"Error loading video: {e}")
        return None
57
+
58
# Response normalization: turn literal "\n" / "\r" / "\r\n" escape sequences
# that appear in the text into real newlines, while leaving
# code spans and math spans untouched.
#
# Group 1 captures the protected spans (``` fences, `inline code`, $$...$$,
# $...$, \(...\), \[...\]); the second alternative matches an escaped line
# break that is not itself preceded by a backslash.
_PATTERN = re.compile(
    r'(```[\s\S]*?```|`[^`]+`|\$\$[\s\S]*?\$\$|\$[^$]+\$|\\\([\s\S]*?\\\)|\\\[[\s\S]*?\\\])'
    r'|(?<!\\)(?:\\r\\n|\\[nr])'
)


def normalize_response_text(text: str) -> str:
    """Replace escaped line breaks in ``text`` with real newlines.

    Non-string values and strings without any backslash are returned
    unchanged. Code and math spans captured by ``_PATTERN`` are kept verbatim.
    """
    if isinstance(text, str) and "\\" in text:

        def _restore(match):
            # A protected span is put back as-is; anything else that matched
            # is an escaped line break and becomes a single newline.
            protected = match.group(1)
            return protected if protected else '\n'

        return _PATTERN.sub(_restore, text)
    return text
68
+
69
+ app = Server()
70
+
71
def _has_video_stream(file_path):
    """Return True when PyAV can open ``file_path`` and reports a video stream."""
    try:
        # The context manager closes the container even if stream inspection raises.
        with av.open(file_path) as container:
            return len(container.streams.video) > 0
    except Exception:
        # Anything PyAV cannot open is handled as an image by the caller.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return False


@app.api()
@spaces.GPU(duration=120)
def predict(message: str, file: FileData = None, downsample_mode: str = "16x") -> str:
    """
    General inference endpoint for text-only, image and video requests.

    Args:
        message: The user's text prompt.
        file: Optional uploaded file; its ``"path"`` entry is probed with PyAV
            to decide between the video and the image branch.
        downsample_mode: Forwarded to the processor and to ``model.generate``
            whenever a file is attached.

    Returns:
        The decoded, newline-normalized model reply, or an error message
        string when a video cannot be decoded.
    """
    # Arguments shared by every apply_chat_template call.
    template_kwargs = {
        "tokenize": True,
        "add_generation_prompt": True,
        "return_dict": True,
        "return_tensors": "pt",
    }

    if file is None:
        # Text-only inference
        content = [{"type": "text", "text": message}]
    else:
        file_path = file["path"]

        # NOTE(review): FFmpeg may also open still images as single-frame
        # video streams, in which case they take the video branch - confirm
        # this is the intended routing.
        if _has_video_stream(file_path):
            print(f"Processing as video: {file_path}")
            frames = load_video(file_path, max_frames=64)
            if not frames:
                return "Error: Could not decode video file."

            content = [
                {"type": "video", "video": frames},
                {"type": "text", "text": message},
            ]
            template_kwargs["processor_kwargs"] = {
                "downsample_mode": downsample_mode,
                "max_num_frames": 64,
                "stack_frames": 1,
                "max_slice_nums": 1,
                "use_image_id": False,
                # Frames were already sampled by load_video, so the processor
                # must not ask for video metadata to sample again.
                "do_sample_frames": False,
            }
        else:
            print(f"Processing as image: {file_path}")
            content = [
                {"type": "image", "url": file_path},
                {"type": "text", "text": message},
            ]
            template_kwargs["processor_kwargs"] = {
                "downsample_mode": downsample_mode,
                "max_slice_nums": 9,
            }

    messages = [{"role": "user", "content": content}]
    inputs = processor.apply_chat_template(messages, **template_kwargs).to(model.device)

    with torch.no_grad():
        generate_kwargs = {
            **inputs,
            "max_new_tokens": 1024,
            "do_sample": True,
            "temperature": 0.7
        }

        if file is not None:
            generate_kwargs["downsample_mode"] = downsample_mode

        generated_ids = model.generate(**generate_kwargs)

    # Drop the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return normalize_response_text(output_text[0])
168
+
169
@app.get("/", response_class=HTMLResponse)
async def homepage():
    """Serve the ``index.html`` file located in the same directory as this module."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(module_dir, "index.html"), "r", encoding="utf-8") as page:
        html = page.read()
    return html
174
+
175
+ if __name__ == "__main__":
176
+ app.launch(show_error=True)
index.html ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
6
+ <title>MiniCPM-V | OpenBMB</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ <link href="https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700&family=Inter:wght@300;400;500;600&display=swap" rel="stylesheet">
9
+ <script src="https://unpkg.com/lucide@latest"></script>
10
+ <style>
11
+ :root {
12
+ --bg: #0A0C10;
13
+ --blue: #3B5BFF;
14
+ --cyan: #27D4EA;
15
+ --text: #FFFFFF;
16
+ --text-muted: #6E7681;
17
+ --glass: rgba(255, 255, 255, 0.03);
18
+ --glass-border: rgba(255, 255, 255, 0.1);
19
+ }
20
+
21
+ body {
22
+ font-family: 'Inter', sans-serif;
23
+ background-color: var(--bg);
24
+ color: var(--text);
25
+ height: 100vh;
26
+ margin: 0;
27
+ display: flex;
28
+ flex-direction: column;
29
+ overflow: hidden; /* Prevent body scroll */
30
+ }
31
+
32
+ h1, h2, h3 { font-family: 'Outfit', sans-serif; }
33
+
34
+ .chat-scroll-area {
35
+ flex: 1;
36
+ overflow-y: auto;
37
+ padding-bottom: 120px; /* Space for floating input */
38
+ -webkit-overflow-scrolling: touch;
39
+ }
40
+
41
+ /* Modern Scrollbar */
42
+ .chat-scroll-area::-webkit-scrollbar {
43
+ width: 5px;
44
+ }
45
+ .chat-scroll-area::-webkit-scrollbar-track {
46
+ background: transparent;
47
+ }
48
+ .chat-scroll-area::-webkit-scrollbar-thumb {
49
+ background: rgba(255, 255, 255, 0.1);
50
+ border-radius: 10px;
51
+ }
52
+
53
+ .message-bubble {
54
+ max-width: 85%;
55
+ animation: fadeIn 0.4s cubic-bezier(0.16, 1, 0.3, 1) forwards;
56
+ }
57
+
58
+ @keyframes fadeIn {
59
+ from { opacity: 0; transform: translateY(10px); }
60
+ to { opacity: 1; transform: translateY(0); }
61
+ }
62
+
63
+ .user-message {
64
+ background: linear-gradient(135deg, var(--blue), var(--cyan));
65
+ color: #FFFFFF;
66
+ box-shadow: 0 10px 30px rgba(59, 91, 255, 0.2);
67
+ }
68
+
69
+ .bot-message {
70
+ background: rgba(255, 255, 255, 0.04);
71
+ border: 1px solid var(--glass-border);
72
+ }
73
+
74
+ .typing-dot {
75
+ width: 4px;
76
+ height: 4px;
77
+ background: var(--cyan);
78
+ border-radius: 50%;
79
+ animation: bounce 1.4s infinite ease-in-out;
80
+ }
81
+ .typing-dot:nth-child(2) { animation-delay: 0.2s; }
82
+ .typing-dot:nth-child(3) { animation-delay: 0.4s; }
83
+
84
+ @keyframes bounce {
85
+ 0%, 80%, 100% { transform: scale(0.3); opacity: 0.4; }
86
+ 40% { transform: scale(1); opacity: 1; }
87
+ }
88
+
89
+ .input-pill {
90
+ background: rgba(255, 255, 255, 0.05);
91
+ backdrop-filter: blur(20px);
92
+ -webkit-backdrop-filter: blur(20px);
93
+ border: 1px solid var(--glass-border);
94
+ transition: all 0.3s ease;
95
+ }
96
+
97
+ .input-pill:focus-within {
98
+ border-color: var(--blue);
99
+ box-shadow: 0 0 30px rgba(59, 91, 255, 0.1);
100
+ }
101
+
102
+ .logo-glow {
103
+ filter: drop-shadow(0 0 10px rgba(39, 212, 234, 0.3));
104
+ }
105
+
106
+ .send-btn {
107
+ background: linear-gradient(135deg, var(--blue), var(--cyan));
108
+ transition: transform 0.2s ease, opacity 0.2s ease;
109
+ }
110
+ .send-btn:active { transform: scale(0.95); }
111
+
112
+ #user-input::placeholder { color: #555; }
113
+ </style>
114
+ </head>
115
+ <body>
116
+
117
+ <!-- Minimalist Header -->
118
+ <header class="h-20 flex items-center justify-between px-6 md:px-12 shrink-0 z-50">
119
+ <div class="flex items-center gap-4">
120
+ <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/1670387859384-633fe7784b362488336bbfad.png"
121
+ alt="OpenBMB" class="w-10 h-10 logo-glow">
122
+ <div>
123
+ <h1 class="text-xl font-bold tracking-tight">MiniCPM-V</h1>
124
+ <p class="text-[10px] text-muted uppercase tracking-[0.2em] font-medium">By OpenBMB</p>
125
+ </div>
126
+ </div>
127
+ <div class="hidden md:flex items-center gap-2 text-[10px] font-bold text-muted uppercase tracking-widest">
128
+ <span class="w-1.5 h-1.5 rounded-full bg-[#27D4EA] animate-pulse"></span>
129
+ Vision System Online
130
+ </div>
131
+ </header>
132
+
133
+ <!-- Chat Messages Scroll Area -->
134
+ <main id="chat-messages" class="chat-scroll-area px-4 md:px-0">
135
+ <div class="max-w-3xl mx-auto space-y-8 pt-4">
136
+ <!-- Bot Greeting -->
137
+ <div class="flex gap-4 items-start">
138
+ <div class="bot-message p-6 rounded-3xl rounded-tl-none message-bubble shadow-2xl">
139
+ <p class="text-white/90 leading-relaxed text-[15px]">
140
+ Welcome to <span class="font-bold text-[#27D4EA]">MiniCPM-V 4.6</span>.
141
+ I can analyze images and videos with high efficiency.
142
+ <br><br>
143
+ Drop a file below to begin.
144
+ </p>
145
+ </div>
146
+ </div>
147
+ </div>
148
+ </main>
149
+
150
+ <!-- Floating Input Bar -->
151
+ <div class="fixed bottom-0 left-0 right-0 p-6 md:p-10 pointer-events-none">
152
+ <div class="max-w-3xl mx-auto pointer-events-auto">
153
+ <!-- Media Preview -->
154
+ <div id="preview-container" class="hidden mb-6 animate-in">
155
+ <div class="relative inline-block group">
156
+ <img id="image-preview" src="" class="h-36 w-auto rounded-3xl border border-white/20 shadow-2xl hidden object-cover" />
157
+ <video id="video-preview" class="h-36 w-auto rounded-3xl border border-white/20 shadow-2xl hidden object-cover" muted loop></video>
158
+ <button id="cancel-file" class="absolute -top-3 -right-3 bg-white text-black rounded-full p-2 shadow-xl hover:bg-neutral-200 transition-all">
159
+ <i data-lucide="x" class="w-4 h-4"></i>
160
+ </button>
161
+ </div>
162
+ </div>
163
+
164
+ <!-- Pill Input -->
165
+ <div class="input-pill rounded-[2.5rem] p-2 flex items-end gap-2 pr-3 shadow-2xl">
166
+ <div class="flex items-center">
167
+ <input type="file" id="file-input" class="hidden" accept="image/*,video/*">
168
+ <button id="upload-trigger" class="p-4 text-white/30 hover:text-[#27D4EA] transition-colors">
169
+ <i data-lucide="paperclip" class="w-6 h-6"></i>
170
+ </button>
171
+ </div>
172
+
173
+ <textarea id="user-input" rows="1" placeholder="Type your message..."
174
+ class="flex-1 bg-transparent border-none focus:ring-0 text-white py-4 px-1 resize-none max-h-40 scrollbar-none text-[16px] leading-relaxed"></textarea>
175
+
176
+ <button id="send-btn" class="send-btn w-12 h-12 text-white rounded-full flex items-center justify-center disabled:opacity-20 disabled:grayscale group shrink-0 mb-1">
177
+ <i data-lucide="arrow-up" class="w-5 h-5 group-hover:scale-110 transition-transform" id="send-icon"></i>
178
+ <i data-lucide="loader-2" class="w-5 h-5 animate-spin hidden" id="loading-icon"></i>
179
+ </button>
180
+ </div>
181
+ </div>
182
+ </div>
183
+
184
+ <script type="module">
185
+ import { Client, handle_file } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
186
+
187
+ lucide.createIcons();
188
+
189
+ const chatMessages = document.getElementById('chat-messages');
190
+ const userInput = document.getElementById('user-input');
191
+ const sendBtn = document.getElementById('send-btn');
192
+ const fileInput = document.getElementById('file-input');
193
+ const uploadTrigger = document.getElementById('upload-trigger');
194
+ const previewContainer = document.getElementById('preview-container');
195
+ const imagePreview = document.getElementById('image-preview');
196
+ const videoPreview = document.getElementById('video-preview');
197
+ const cancelFile = document.getElementById('cancel-file');
198
+ const sendIcon = document.getElementById('send-icon');
199
+ const loadingIcon = document.getElementById('loading-icon');
200
+
201
+ let selectedFile = null;
202
+ let client = null;
203
+
204
+ async function init() {
205
+ try {
206
+ client = await Client.connect(window.location.origin);
207
+ } catch (err) { console.error("Gradio Connection Error", err); }
208
+ }
209
+ init();
210
+
211
+ // UI Interactions
212
+ uploadTrigger.onclick = () => fileInput.click();
213
+ fileInput.onchange = (e) => {
214
+ const file = e.target.files[0];
215
+ if (file) {
216
+ selectedFile = file;
217
+ previewContainer.classList.remove('hidden');
218
+ const url = URL.createObjectURL(file);
219
+ if (file.type.startsWith('image/')) {
220
+ imagePreview.src = url;
221
+ imagePreview.classList.remove('hidden');
222
+ videoPreview.classList.add('hidden');
223
+ } else {
224
+ videoPreview.src = url;
225
+ videoPreview.classList.remove('hidden');
226
+ imagePreview.classList.add('hidden');
227
+ videoPreview.play();
228
+ }
229
+ }
230
+ };
231
+
232
+ cancelFile.onclick = () => {
233
+ selectedFile = null;
234
+ fileInput.value = '';
235
+ previewContainer.classList.add('hidden');
236
+ imagePreview.src = '';
237
+ videoPreview.src = '';
238
+ };
239
+
240
/**
 * Append one chat bubble to the message list and scroll it into view.
 *
 * The bubble is built from DOM nodes and the text is assigned through
 * `textContent`, so neither user input nor model output is ever parsed
 * as HTML.
 *
 * @param {string} role - 'user' for right-aligned user bubbles; anything else renders as a bot bubble.
 * @param {string} text - Message text, shown verbatim.
 * @param {?string} mediaUrl - Optional URL of an attached image or video.
 * @param {?string} mediaType - MIME type of the attachment; types starting with 'image' render as <img>, everything else as <video>.
 */
function appendMessage(role, text, mediaUrl = null, mediaType = null) {
    const isUser = role === 'user';

    const row = document.createElement('div');
    row.className = `flex gap-4 items-start ${isUser ? 'flex-row-reverse' : ''}`;

    const bubble = document.createElement('div');
    bubble.className = `${isUser ? 'user-message' : 'bot-message'} p-6 rounded-[2rem] ${isUser ? 'rounded-tr-none' : 'rounded-tl-none'} message-bubble shadow-xl`;

    if (mediaUrl) {
        // A missing MIME type falls through to the <video> branch instead of throwing.
        const isImage = (mediaType || '').startsWith('image');
        const media = document.createElement(isImage ? 'img' : 'video');
        media.src = mediaUrl;
        if (!isImage) {
            media.controls = true;
        }
        media.className = 'max-w-xs md:max-w-md rounded-3xl mb-4 border border-white/10';
        bubble.appendChild(media);
    }

    const paragraph = document.createElement('p');
    paragraph.className = 'leading-relaxed text-[15px] whitespace-pre-wrap font-medium';
    // textContent (not innerHTML) keeps markup inside the message inert.
    paragraph.textContent = text == null ? '' : String(text);
    bubble.appendChild(paragraph);

    row.appendChild(bubble);

    // Messages live in the inner centered column of the scroll area.
    const container = chatMessages.querySelector('.max-w-3xl');
    container.appendChild(row);

    // Smooth scroll to bottom
    chatMessages.scrollTo({ top: chatMessages.scrollHeight, behavior: 'smooth' });
}
269
+
270
/**
 * Send the current text and optional attachment to the "/predict" endpoint
 * and render the reply as a bot bubble.
 *
 * Shows the user's message immediately, displays a typing indicator while
 * the request is pending, and always restores the send button afterwards.
 */
async function sendMessage() {
    // Enter can fire while a request is still in flight; the disabled
    // button marks that state, so ignore the extra trigger.
    if (sendBtn.disabled) return;

    const text = userInput.value.trim();
    if (!text && !selectedFile) return;

    const content = text;
    const file = selectedFile;

    userInput.value = '';
    userInput.style.height = 'auto';
    const fileUrl = file ? URL.createObjectURL(file) : null;
    const fileType = file ? file.type : null;

    appendMessage('user', content, fileUrl, fileType);
    // Reuse the cancel handler to clear the selection and hide the preview.
    cancelFile.click();

    sendIcon.classList.add('hidden');
    loadingIcon.classList.remove('hidden');
    sendBtn.disabled = true;

    // Typing indicator shown until the reply (or an error) arrives.
    const thinkingDiv = document.createElement('div');
    thinkingDiv.id = 'think-' + Date.now();
    thinkingDiv.className = 'flex gap-4 items-start';
    thinkingDiv.innerHTML = `
        <div class="bot-message p-6 rounded-[2rem] rounded-tl-none message-bubble flex items-center gap-4">
            <div class="flex gap-1.5">
                <div class="typing-dot"></div><div class="typing-dot"></div><div class="typing-dot"></div>
            </div>
        </div>
    `;
    const container = chatMessages.querySelector('.max-w-3xl');
    container.appendChild(thinkingDiv);
    chatMessages.scrollTo({ top: chatMessages.scrollHeight, behavior: 'smooth' });

    try {
        // The initial connection may have failed; retry once before giving up.
        if (!client) {
            client = await Client.connect(window.location.origin);
        }

        const fileData = file ? handle_file(file) : null;
        const result = await client.predict("/predict", {
            message: content,
            file: fileData,
            downsample_mode: "16x"
        });

        // The client returns endpoint outputs as an array; the reply is its first entry.
        const reply = Array.isArray(result.data) ? result.data[0] : result.data;

        thinkingDiv.remove();
        appendMessage('bot', reply);
    } catch (err) {
        console.error("Prediction failed", err);
        thinkingDiv.remove();
        appendMessage('bot', "The system encountered an error. Please check your file format and try again.");
    } finally {
        sendIcon.classList.remove('hidden');
        loadingIcon.classList.add('hidden');
        sendBtn.disabled = false;
    }
}
323
+
324
+ sendBtn.onclick = sendMessage;
325
+ userInput.onkeydown = (e) => {
326
+ if (e.key === 'Enter' && !e.shiftKey) {
327
+ e.preventDefault();
328
+ sendMessage();
329
+ }
330
+ };
331
+ </script>
332
+ </body>
333
+ </html>
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers>=4.44.0
2
+ torch
3
+ torchvision
4
+ gradio
5
+ fastapi
6
+ spaces
7
+ av
8
+ pillow
style.css ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ body {
2
+ padding: 2rem;
3
+ font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
4
+ }
5
+
6
+ h1 {
7
+ font-size: 16px;
8
+ margin-top: 0;
9
+ }
10
+
11
+ p {
12
+ color: rgb(107, 114, 128);
13
+ font-size: 15px;
14
+ margin-bottom: 10px;
15
+ margin-top: 5px;
16
+ }
17
+
18
+ .card {
19
+ max-width: 620px;
20
+ margin: 0 auto;
21
+ padding: 16px;
22
+ border: 1px solid lightgray;
23
+ border-radius: 16px;
24
+ }
25
+
26
+ .card p:last-child {
27
+ margin-bottom: 0;
28
+ }