Spaces:

akhaliq
/

MiniCPM-V-4.6

Running on Zero

App Files Files Community

akhaliq HF Staff committed 9 days ago

Commit

32967e1

1 Parent(s): a11cb66

feat: implement manual video frame loading using PyAV to support direct frame passing for video processing

Browse files

Files changed (1) hide show

app.py +40 -5

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import torch
 import re
 from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
 from gradio import Server
@@ -17,9 +18,41 @@ model = AutoModelForImageTextToText.from_pretrained(
     model_id,
     torch_dtype=torch.bfloat16,
     trust_remote_code=True,
-    device_map="cuda"
 )
 # Utility for response normalization
 _PATTERN = re.compile(
     r'(```[\s\S]*?```|`[^`]+`|\$\$[\s\S]*?\$\$|\$[^$]+\$|\\\([\s\S]*?\\\)|\\\[[\s\S]*?\\\])'
@@ -54,26 +87,29 @@ def predict(message: str, file: FileData = None, downsample_mode: str = "16x") -
         is_video = any(file_path.lower().endswith(ext) for ext in ['.mp4', '.mkv', '.mov', '.avi'])
         if is_video:
             messages = [
                 {
                     "role": "user",
                     "content": [
-                        {"type": "video", "url": file_path},
                         {"type": "text", "text": message},
                     ],
                 }
             ]
-            # Video specific params
             inputs = processor.apply_chat_template(
                 messages, tokenize=True, add_generation_prompt=True,
                 return_dict=True, return_tensors="pt",
                 downsample_mode=downsample_mode,
-                max_num_frames=64, # Optimized for speed
                 stack_frames=1,
                 max_slice_nums=1,
                 use_image_id=False,
             ).to(model.device)
         else:
             messages = [
                 {
                     "role": "user",
@@ -83,7 +119,6 @@ def predict(message: str, file: FileData = None, downsample_mode: str = "16x") -
                     ],
                 }
             ]
-            # Image specific params
             inputs = processor.apply_chat_template(
                 messages, tokenize=True, add_generation_prompt=True,
                 return_dict=True, return_tensors="pt",

 import os
 import torch
 import re
+import av
 from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
 from gradio import Server
     model_id,
     torch_dtype=torch.bfloat16,
     trust_remote_code=True,
+    device_map="cuda"
 )
+def load_video(video_path, max_frames=64):
+    """Utility to load video frames using PyAV."""
+    container = av.open(video_path)
+    frames = []
+    # Get total frames to sample uniformly
+    stream = container.streams.video[0]
+    total_frames = stream.frames
+    if total_frames <= 0: # Some containers don't report frame count
+        print("Frame count unknown, decoding all and sampling...")
+        temp_frames = []
+        for frame in container.decode(video=0):
+            temp_frames.append(frame.to_image())
+        if len(temp_frames) > max_frames:
+            indices = [int(i * len(temp_frames) / max_frames) for i in range(max_frames)]
+            frames = [temp_frames[i] for i in indices]
+        else:
+            frames = temp_frames
+    else:
+        # Sample max_frames uniformly
+        indices = [int(i * total_frames / max_frames) for i in range(max_frames)]
+        current_idx = 0
+        for i, frame in enumerate(container.decode(video=0)):
+            if current_idx < len(indices) and i == indices[current_idx]:
+                frames.append(frame.to_image())
+                current_idx += 1
+            if current_idx >= len(indices):
+                break
+    container.close()
+    return frames
 # Utility for response normalization
 _PATTERN = re.compile(
     r'(```[\s\S]*?```|`[^`]+`|\$\$[\s\S]*?\$\$|\$[^$]+\$|\\\([\s\S]*?\\\)|\\\[[\s\S]*?\\\])'
         is_video = any(file_path.lower().endswith(ext) for ext in ['.mp4', '.mkv', '.mov', '.avi'])
         if is_video:
+            print(f"Processing video: {file_path}")
+            # Load video frames manually to avoid torchvision decode error
+            frames = load_video(file_path, max_frames=64)
             messages = [
                 {
                     "role": "user",
                     "content": [
+                        {"type": "video", "video": frames}, # Pass frames directly
                         {"type": "text", "text": message},
                     ],
                 }
             ]
             inputs = processor.apply_chat_template(
                 messages, tokenize=True, add_generation_prompt=True,
                 return_dict=True, return_tensors="pt",
                 downsample_mode=downsample_mode,
+                max_num_frames=64,
                 stack_frames=1,
                 max_slice_nums=1,
                 use_image_id=False,
             ).to(model.device)
         else:
+            print(f"Processing image: {file_path}")
             messages = [
                 {
                     "role": "user",
                     ],
                 }
             ]
             inputs = processor.apply_chat_template(
                 messages, tokenize=True, add_generation_prompt=True,
                 return_dict=True, return_tensors="pt",