akhaliq HF Staff committed on
Commit
0062fa7
·
1 Parent(s): 10871c8

feat: implement PyAV-based video frame extraction and update model processing parameters for MiniCPM-V 4.6

Browse files
Files changed (1) hide show
  1. app.py +51 -7
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import torch
3
  import re
 
4
  import uuid
5
  import copy
6
  import threading
@@ -157,6 +158,33 @@ def log_raw_model_output(session_id: str, **record) -> None:
157
  print(f"Logging error: {e}")
158
 
159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  def persist_uploaded_files(files: list, session_id: str) -> list:
161
  """Copy Gradio temp uploads into the project log directory."""
162
  if not files: return []
@@ -239,14 +267,21 @@ def predict(
239
  # In history, we don't have mime_type, so we check extension
240
  ext = os.path.splitext(f_path)[1].lower()
241
  if ext in {".mp4", ".mkv", ".mov", ".avi", ".webm"}:
242
- h_content.append({"type": "video", "path": f_path})
 
 
 
 
243
  else:
244
  try:
245
  img = Image.open(f_path).convert("RGB")
246
  h_content.append({"type": "image", "image": img})
247
  except Exception:
248
- # Fallback to video path if image fails
249
- h_content.append({"type": "video", "path": f_path})
 
 
 
250
 
251
  if user_text:
252
  h_content.append({"type": "text", "text": user_text})
@@ -265,8 +300,12 @@ def predict(
265
  img = Image.open(file_path).convert("RGB")
266
  content.append({"type": "image", "image": img})
267
  except Exception:
268
- # Fallback to video
269
- content.append({"type": "video", "path": file_path})
 
 
 
 
270
 
271
  if message:
272
  content.append({"type": "text", "text": message})
@@ -274,7 +313,7 @@ def predict(
274
  if content:
275
  messages.append({"role": "user", "content": content})
276
 
277
- # Prepare inputs using native processor template
278
  with torch.no_grad():
279
  inputs = processor.apply_chat_template(
280
  messages,
@@ -283,7 +322,11 @@ def predict(
283
  return_dict=True,
284
  return_tensors="pt",
285
  enable_thinking=thinking_mode,
286
- processor_kwargs={"videos_kwargs": {"max_num_frames": max_frames}}
 
 
 
 
287
  ).to(model.device)
288
 
289
  for k, v in inputs.items():
@@ -302,6 +345,7 @@ def predict(
302
  "max_new_tokens": max_new_tokens,
303
  "do_sample": sampling,
304
  "streamer": streamer,
 
305
  }
306
  if sampling:
307
  generate_kwargs.update({
 
1
  import os
2
  import torch
3
  import re
4
+ import av
5
  import uuid
6
  import copy
7
  import threading
 
158
  print(f"Logging error: {e}")
159
 
160
 
161
def load_video(video_path, max_frames=64):
    """Sample up to ``max_frames`` frames from a video as PIL images using PyAV.

    When the video stream reports a positive duration, the function seeks to
    ``max_frames`` evenly spaced timestamps and keeps the first frame decoded
    after each seek. Seeking lands on a keyframe at or before the requested
    timestamp, so the sampled frames are approximate (and may repeat on videos
    with sparse keyframes) in exchange for speed.

    When the duration is unknown, every frame is decoded and the result is
    subsampled evenly down to ``max_frames``.

    Args:
        video_path: Path (or anything ``av.open`` accepts) of the video.
        max_frames: Maximum number of frames to return; must be a positive
            number. Integral floats (e.g. from a UI slider) are accepted.

    Returns:
        A list of PIL images (possibly empty), or ``None`` if the video could
        not be opened or decoded, or if ``max_frames`` is invalid.
    """
    try:
        # Normalise slider-style values (64.0) and reject non-positive counts
        # up front instead of failing later with a ZeroDivisionError.
        max_frames = int(max_frames)
        if max_frames <= 0:
            raise ValueError(f"max_frames must be positive, got {max_frames}")

        # The context manager closes the container on every exit path,
        # including the early returns and decode errors below.
        with av.open(video_path) as container:
            stream = container.streams.video[0]
            stream.thread_count = 8
            duration = stream.duration

            if duration is None or duration <= 0:
                # No usable duration: decode everything, then subsample.
                frames = [f.to_image() for f in container.decode(video=0)]
                if len(frames) > max_frames:
                    return [
                        frames[int(i * len(frames) / max_frames)]
                        for i in range(max_frames)
                    ]
                return frames

            # Seek offsets are expressed in the stream's time_base; streams
            # may not start at 0, so offset the targets by start_time.
            start = stream.start_time or 0
            frames = []
            for i in range(max_frames):
                container.seek(start + int(i * duration / max_frames), stream=stream)
                # Only the first frame after the seek point is needed.
                frame = next(container.decode(video=0), None)
                if frame is not None:
                    frames.append(frame.to_image())
            return frames
    except Exception as e:
        # Best-effort loader: callers treat None as "could not extract frames".
        print(f"Error loading video: {e}")
        return None
187
+
188
  def persist_uploaded_files(files: list, session_id: str) -> list:
189
  """Copy Gradio temp uploads into the project log directory."""
190
  if not files: return []
 
267
  # In history, we don't have mime_type, so we check extension
268
  ext = os.path.splitext(f_path)[1].lower()
269
  if ext in {".mp4", ".mkv", ".mov", ".avi", ".webm"}:
270
+ v_frames = load_video(f_path, max_frames=max_frames)
271
+ if v_frames:
272
+ h_content.append({"type": "video", "video": v_frames})
273
+ else:
274
+ h_content.append({"type": "video", "path": f_path})
275
  else:
276
  try:
277
  img = Image.open(f_path).convert("RGB")
278
  h_content.append({"type": "image", "image": img})
279
  except Exception:
280
+ v_frames = load_video(f_path, max_frames=max_frames)
281
+ if v_frames:
282
+ h_content.append({"type": "video", "video": v_frames})
283
+ else:
284
+ h_content.append({"type": "video", "path": f_path})
285
 
286
  if user_text:
287
  h_content.append({"type": "text", "text": user_text})
 
300
  img = Image.open(file_path).convert("RGB")
301
  content.append({"type": "image", "image": img})
302
  except Exception:
303
+ # Fallback to manual video frame extraction (bypasses broken torchvision)
304
+ v_frames = load_video(file_path, max_frames=max_frames)
305
+ if v_frames:
306
+ content.append({"type": "video", "video": v_frames})
307
+ else:
308
+ print(f"Failed to load video: {file_path}")
309
 
310
  if message:
311
  content.append({"type": "text", "text": message})
 
313
  if content:
314
  messages.append({"role": "user", "content": content})
315
 
316
+ # Prepare inputs with Advanced Parameters for MiniCPM-V 4.6
317
  with torch.no_grad():
318
  inputs = processor.apply_chat_template(
319
  messages,
 
322
  return_dict=True,
323
  return_tensors="pt",
324
  enable_thinking=thinking_mode,
325
+ downsample_mode="16x",
326
+ max_num_frames=max_frames,
327
+ stack_frames=1,
328
+ max_slice_nums=1 if any(it.get("type") == "video" for msg in messages for it in msg["content"]) else 9,
329
+ use_image_id=False if any(it.get("type") == "video" for msg in messages for it in msg["content"]) else True
330
  ).to(model.device)
331
 
332
  for k, v in inputs.items():
 
345
  "max_new_tokens": max_new_tokens,
346
  "do_sample": sampling,
347
  "streamer": streamer,
348
+ "downsample_mode": "16x"
349
  }
350
  if sampling:
351
  generate_kwargs.update({