Spaces:

akhaliq
/

MiniCPM-V-4.6

Running on Zero

App Files Files Community

akhaliq HF Staff committed 8 days ago

Commit

0062fa7

1 Parent(s): 10871c8

feat: implement PyAV-based video frame extraction and update model processing parameters for MiniCPM-V 4.6

Browse files

Files changed (1) hide show

app.py +51 -7

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import torch
 import re
 import uuid
 import copy
 import threading
@@ -157,6 +158,33 @@ def log_raw_model_output(session_id: str, **record) -> None:
         print(f"Logging error: {e}")
 def persist_uploaded_files(files: list, session_id: str) -> list:
     """Copy Gradio temp uploads into the project log directory."""
     if not files: return []
@@ -239,14 +267,21 @@ def predict(
                     # In history, we don't have mime_type, so we check extension
                     ext = os.path.splitext(f_path)[1].lower()
                     if ext in {".mp4", ".mkv", ".mov", ".avi", ".webm"}:
-                        h_content.append({"type": "video", "path": f_path})
                     else:
                         try:
                             img = Image.open(f_path).convert("RGB")
                             h_content.append({"type": "image", "image": img})
                         except Exception:
-                            # Fallback to video path if image fails
-                            h_content.append({"type": "video", "path": f_path})
             if user_text:
                 h_content.append({"type": "text", "text": user_text})
@@ -265,8 +300,12 @@ def predict(
                 img = Image.open(file_path).convert("RGB")
                 content.append({"type": "image", "image": img})
             except Exception:
-                # Fallback to video
-                content.append({"type": "video", "path": file_path})
     if message:
         content.append({"type": "text", "text": message})
@@ -274,7 +313,7 @@ def predict(
     if content:
         messages.append({"role": "user", "content": content})
-    # Prepare inputs using native processor template
     with torch.no_grad():
         inputs = processor.apply_chat_template(
             messages,
@@ -283,7 +322,11 @@ def predict(
             return_dict=True,
             return_tensors="pt",
             enable_thinking=thinking_mode,
-            processor_kwargs={"videos_kwargs": {"max_num_frames": max_frames}}
         ).to(model.device)
     for k, v in inputs.items():
@@ -302,6 +345,7 @@ def predict(
         "max_new_tokens": max_new_tokens,
         "do_sample": sampling,
         "streamer": streamer,
     }
     if sampling:
         generate_kwargs.update({

 import os
 import torch
 import re
+import av
 import uuid
 import copy
 import threading
         print(f"Logging error: {e}")
def load_video(video_path: str, max_frames: int = 64):
    """Sample up to ``max_frames`` frames from a video using PyAV.

    When the video stream reports a positive duration, ``max_frames``
    evenly spaced timestamps are seeked to and the first frame decoded after
    each seek is kept. Seeking lands on a keyframe, so neighbouring samples
    may be the same picture for short clips or sparse keyframes.

    When the duration is missing or non-positive, every frame is decoded and
    the result is subsampled evenly by index down to ``max_frames``.

    Args:
        video_path: Path of the video file to open.
        max_frames: Upper bound on the number of frames returned.

    Returns:
        A list of PIL images (possibly empty), or ``None`` if the video could
        not be opened or decoded. Callers treat any falsy result as failure.
    """
    if max_frames <= 0:
        # Nothing was requested; avoid opening (and possibly fully decoding)
        # the file just to return an empty list.
        return []
    try:
        # The context manager closes the container on every exit path,
        # including the early return below and any decoding error.
        with av.open(video_path) as container:
            stream = container.streams.video[0]
            stream.thread_count = 8
            duration = stream.duration

            if duration is None or duration <= 0:
                # No usable duration: decode everything, then subsample.
                frames = [f.to_image() for f in container.decode(video=0)]
                total = len(frames)
                if total > max_frames:
                    return [frames[i * total // max_frames] for i in range(max_frames)]
                return frames

            # Seek offsets are expressed in the stream's time base and stream
            # timestamps begin at ``start_time``, which is not always zero.
            start = stream.start_time or 0
            frames = []
            for i in range(max_frames):
                container.seek(start + i * duration // max_frames, stream=stream)
                # Keep only the first frame decoded after the seek point.
                for frame in container.decode(video=0):
                    frames.append(frame.to_image())
                    break
            return frames
    except Exception as e:
        # Best-effort loader: callers fall back to another code path when
        # this returns None, so report the problem instead of raising.
        print(f"Error loading video: {e}")
        return None
 def persist_uploaded_files(files: list, session_id: str) -> list:
     """Copy Gradio temp uploads into the project log directory."""
     if not files: return []
                     # In history, we don't have mime_type, so we check extension
                     ext = os.path.splitext(f_path)[1].lower()
                     if ext in {".mp4", ".mkv", ".mov", ".avi", ".webm"}:
+                        v_frames = load_video(f_path, max_frames=max_frames)
+                        if v_frames:
+                            h_content.append({"type": "video", "video": v_frames})
+                        else:
+                            h_content.append({"type": "video", "path": f_path})
                     else:
                         try:
                             img = Image.open(f_path).convert("RGB")
                             h_content.append({"type": "image", "image": img})
                         except Exception:
+                            v_frames = load_video(f_path, max_frames=max_frames)
+                            if v_frames:
+                                h_content.append({"type": "video", "video": v_frames})
+                            else:
+                                h_content.append({"type": "video", "path": f_path})
             if user_text:
                 h_content.append({"type": "text", "text": user_text})
                 img = Image.open(file_path).convert("RGB")
                 content.append({"type": "image", "image": img})
             except Exception:
+                # Fallback to manual video frame extraction (bypasses broken torchvision)
+                v_frames = load_video(file_path, max_frames=max_frames)
+                if v_frames:
+                    content.append({"type": "video", "video": v_frames})
+                else:
+                    print(f"Failed to load video: {file_path}")
     if message:
         content.append({"type": "text", "text": message})
     if content:
         messages.append({"role": "user", "content": content})
+    # Prepare inputs with Advanced Parameters for MiniCPM-V 4.6
     with torch.no_grad():
         inputs = processor.apply_chat_template(
             messages,
             return_dict=True,
             return_tensors="pt",
             enable_thinking=thinking_mode,
+            downsample_mode="16x",
+            max_num_frames=max_frames,
+            stack_frames=1,
+            max_slice_nums=1 if any(it.get("type") == "video" for msg in messages for it in msg["content"]) else 9,
+            use_image_id=False if any(it.get("type") == "video" for msg in messages for it in msg["content"]) else True
         ).to(model.device)
     for k, v in inputs.items():
         "max_new_tokens": max_new_tokens,
         "do_sample": sampling,
         "streamer": streamer,
+        "downsample_mode": "16x"
     }
     if sampling:
         generate_kwargs.update({