Spaces: Running on Zero
Commit d0f93f4 (initial commit)
Initial Molmo-Point HF Spaces app
Files changed:
- .gitattributes +4 -0
- README.md +12 -0
- app.py +578 -0
- example-images/boat1.jpeg +3 -0
- example-images/boat2.jpeg +3 -0
- example-images/messy1.jpg +3 -0
- example-images/messy2.jpg +3 -0
- example-images/messy3.jpg +3 -0
- example-images/messy4.jpg +3 -0
- example-videos/arena_basketball.mp4 +3 -0
- example-videos/backflip.mp4 +3 -0
- example-videos/penguins.mp4 +3 -0
- pre-requirements.txt +1 -0
- requirements.txt +14 -0
.gitattributes
ADDED
@@ -0,0 +1,4 @@
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,12 @@
+---
+title: Molmo-Point Demo
+emoji: 👆
+colorFrom: indigo
+colorTo: gray
+sdk: gradio
+sdk_version: 6.3.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+short_description: Molmo-Point - Image & Video Pointing & Tracking
+---
app.py
ADDED
@@ -0,0 +1,578 @@
+import functools
+import math
+import os
+import tempfile
+from collections import defaultdict
+
+import cv2
+import numpy as np
+import PIL
+import torch
+from PIL import Image, ImageDraw, ImageFile
+from transformers import AutoModelForImageTextToText, AutoProcessor
+
+import gradio as gr
+import spaces
+from molmo_utils import process_vision_info
+
+from typing import Iterable
+from gradio.themes import Soft
+from gradio.themes.utils import colors, fonts, sizes
+
+Image.MAX_IMAGE_PIXELS = None
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+# ── Constants ──────────────────────────────────────────────────────────────────
+
+MODEL_ID = "allenai/MolmoPoint-8B"
+MAX_IMAGE_SIZE = 512
+MAX_VIDEO_HEIGHT = 512
+POINT_SIZE = 0.01
+KEYFRAME_HOLD_FRAMES = 3
+SHOW_TRAILS = True
+MAX_NEW_TOKENS = 2048
+MAX_FPS = 10
+
+COLORS = [
+    "rgb(255, 100, 180)",
+    "rgb(100, 180, 255)",
+    "rgb(180, 255, 100)",
+    "rgb(255, 180, 100)",
+    "rgb(100, 255, 180)",
+    "rgb(180, 100, 255)",
+    "rgb(255, 255, 100)",
+    "rgb(100, 255, 255)",
+    "rgb(255, 120, 120)",
+    "rgb(120, 255, 255)",
+    "rgb(255, 255, 120)",
+    "rgb(255, 120, 255)",
+]
+
+# ── Model loading ──────────────────────────────────────────────────────────────
+
+print(f"Loading {MODEL_ID}...")
+processor = AutoProcessor.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    padding_side="left",
+)
+
+model = AutoModelForImageTextToText.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    dtype="bfloat16",
+    device_map="auto",
+)
+print("Model loaded successfully.")
+
+# ── Helper functions ───────────────────────────────────────────────────────────
+
+
+def _parse_rgb(color_str):
+    """Parse 'rgb(r, g, b)' to (r, g, b) tuple."""
+    nums = color_str.replace("rgb(", "").replace(")", "").split(",")
+    return tuple(int(n.strip()) for n in nums)
+
+
+COLORS_BGR = [(_parse_rgb(c)[2], _parse_rgb(c)[1], _parse_rgb(c)[0]) for c in COLORS]
+
+
+def is_tracking_output(generated_text: str) -> bool:
+    """Detect tracking from model output by checking for <tracks tag."""
+    return generated_text.strip().startswith("<tracks")
+
+
+def cast_float_bf16(t: torch.Tensor):
+    if torch.is_floating_point(t):
+        t = t.to(torch.bfloat16)
+    return t
+
+
+def draw_points(image, points):
+    if isinstance(image, np.ndarray):
+        annotation = PIL.Image.fromarray(image)
+    else:
+        annotation = image.copy()
+    draw = ImageDraw.Draw(annotation)
+    w, h = annotation.size
+    size = max(5, int(max(w, h) * POINT_SIZE))
+    for i, (x, y) in enumerate(points):
+        color = COLORS[0]
+        draw.ellipse((x - size, y - size, x + size, y + size), fill=color, outline=None)
+    return annotation
+
+
+def draw_points_colored(image, points_with_ids):
+    """Draw points with per-instance-ID colors for tracking visualization."""
+    if isinstance(image, np.ndarray):
+        annotation = PIL.Image.fromarray(image)
+    else:
+        annotation = image.copy()
+    draw = ImageDraw.Draw(annotation)
+    w, h = annotation.size
+    size = max(5, int(max(w, h) * POINT_SIZE))
+    for object_id, x, y in points_with_ids:
+        color = COLORS[(object_id - 1) % len(COLORS)]
+        draw.ellipse((x - size, y - size, x + size, y + size), fill=color, outline=None)
+    return annotation
+
+
+def format_points_list(points, is_video=False):
+    """Format extracted points as a flat Python list string."""
+    if not points:
+        return "[]"
+    rows = []
+    if is_video:
+        for object_id, ts, x, y in points:
+            rows.append(f"[{int(object_id)}, {float(ts):.2f}, {float(x):.1f}, {float(y):.1f}]")
+    else:
+        for object_id, ix, x, y in points:
+            rows.append(f"[{int(object_id)}, {int(ix)}, {float(x):.1f}, {float(y):.1f}]")
+    return "[" + ", ".join(rows) + "]"
+
+
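
Note: a quick sketch of what format_points_list returns, using made-up points rather than real model output (video points are (id, timestamp, x, y) tuples):

example_points = [(1, 0.0, 245.0, 180.5), (1, 0.5, 250.2, 182.1)]  # illustrative only
format_points_list(example_points, is_video=True)
# -> "[[1, 0.00, 245.0, 180.5], [1, 0.50, 250.2, 182.1]]"
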
+def _interpolate_keyframes(keyframes, total_frames):
+    """Linearly interpolate positions between keyframes.
+
+    keyframes: sorted list of (frame_idx, x, y)
+    Returns dict {frame_idx: (x, y)} for every frame from first to last keyframe.
+    """
+    if not keyframes:
+        return {}
+    positions = {}
+    for i in range(len(keyframes)):
+        f_idx, x, y = keyframes[i]
+        positions[f_idx] = (x, y)
+        if i + 1 < len(keyframes):
+            nf, nx, ny = keyframes[i + 1]
+            span = nf - f_idx
+            if span > 1:
+                for t in range(1, span):
+                    alpha = t / span
+                    positions[f_idx + t] = (x + alpha * (nx - x), y + alpha * (ny - y))
+    return positions
+
+
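
Note: a minimal worked example of _interpolate_keyframes with made-up keyframes. An object seen at (0, 0) on frame 0 and (8, 4) on frame 4 gets evenly spaced positions on frames 1-3:

kfs = [(0, 0.0, 0.0), (4, 8.0, 4.0)]  # illustrative keyframes
_interpolate_keyframes(kfs, total_frames=5)
# -> {0: (0.0, 0.0), 1: (2.0, 1.0), 2: (4.0, 2.0), 3: (6.0, 3.0), 4: (8.0, 4.0)}
# total_frames is accepted but unused; interpolation spans first to last keyframe.
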
+
def create_annotated_video(video_path, points, metadata, tracking):
|
| 157 |
+
"""Draw points on the original video with interpolation and fading trails.
|
| 158 |
+
|
| 159 |
+
Points format: [(object_id, timestamp, x, y), ...]
|
| 160 |
+
Coordinates are in the processed frame space (metadata["video_size"]).
|
| 161 |
+
"""
|
| 162 |
+
cap = cv2.VideoCapture(video_path)
|
| 163 |
+
fps = cap.get(cv2.CAP_PROP_FPS)
|
| 164 |
+
vid_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
| 165 |
+
vid_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
| 166 |
+
|
| 167 |
+
proc_w, proc_h = metadata["video_size"]
|
| 168 |
+
scale_x = vid_w / proc_w
|
| 169 |
+
scale_y = vid_h / proc_h
|
| 170 |
+
|
| 171 |
+
# Build per-object keyframes: {obj_id: [(frame_idx, x, y), ...]}
|
| 172 |
+
obj_keyframes = defaultdict(list)
|
| 173 |
+
for object_id, ts, x, y in points:
|
| 174 |
+
f_idx = int(round(float(ts) * fps))
|
| 175 |
+
sx, sy = float(x) * scale_x, float(y) * scale_y
|
| 176 |
+
obj_keyframes[int(object_id)].append((f_idx, sx, sy))
|
| 177 |
+
|
| 178 |
+
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 179 |
+
obj_positions = {}
|
| 180 |
+
obj_keyframe_set = {}
|
| 181 |
+
for obj_id, kfs in obj_keyframes.items():
|
| 182 |
+
kfs.sort(key=lambda k: k[0])
|
| 183 |
+
obj_positions[obj_id] = _interpolate_keyframes(kfs, total_frames)
|
| 184 |
+
raw_kf = set(f_idx for f_idx, _, _ in kfs)
|
| 185 |
+
obj_keyframe_set[obj_id] = set(
|
| 186 |
+
f for kf in raw_kf for f in range(kf - KEYFRAME_HOLD_FRAMES, kf + KEYFRAME_HOLD_FRAMES + 1)
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
+
out_path = tempfile.mktemp(suffix=".mp4")
|
| 190 |
+
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
|
| 191 |
+
out = cv2.VideoWriter(out_path, fourcc, fps, (vid_w, vid_h))
|
| 192 |
+
|
| 193 |
+
radius = max(5, int(max(vid_w, vid_h) * POINT_SIZE))
|
| 194 |
+
trail_length = int(fps * 2)
|
| 195 |
+
obj_history = defaultdict(list)
|
| 196 |
+
|
| 197 |
+
current_frame = 0
|
| 198 |
+
while cap.isOpened():
|
| 199 |
+
ret, frame = cap.read()
|
| 200 |
+
if not ret:
|
| 201 |
+
break
|
| 202 |
+
|
| 203 |
+
for obj_id, positions in obj_positions.items():
|
| 204 |
+
if current_frame in positions:
|
| 205 |
+
px, py = positions[current_frame]
|
| 206 |
+
obj_history[obj_id].append((px, py))
|
| 207 |
+
if len(obj_history[obj_id]) > trail_length:
|
| 208 |
+
obj_history[obj_id] = obj_history[obj_id][-trail_length:]
|
| 209 |
+
|
| 210 |
+
if tracking:
|
| 211 |
+
color = COLORS_BGR[(obj_id - 1) % len(COLORS_BGR)]
|
| 212 |
+
else:
|
| 213 |
+
color = COLORS_BGR[0]
|
| 214 |
+
|
| 215 |
+
# Draw fading trail
|
| 216 |
+
trail = obj_history[obj_id]
|
| 217 |
+
n_trail = len(trail)
|
| 218 |
+
if SHOW_TRAILS and n_trail >= 2:
|
| 219 |
+
for i in range(n_trail - 1):
|
| 220 |
+
alpha = (i + 1) / n_trail
|
| 221 |
+
trail_color = tuple(int(c * alpha) for c in color)
|
| 222 |
+
thickness = max(1, int(radius * 0.6 * alpha))
|
| 223 |
+
pt1 = (int(trail[i][0]), int(trail[i][1]))
|
| 224 |
+
pt2 = (int(trail[i + 1][0]), int(trail[i + 1][1]))
|
| 225 |
+
cv2.line(frame, pt1, pt2, trail_color, thickness)
|
| 226 |
+
|
| 227 |
+
# Solid on keyframes, outline-only on interpolated frames
|
| 228 |
+
if current_frame in obj_keyframe_set[obj_id]:
|
| 229 |
+
cv2.circle(frame, (int(px), int(py)), radius, color, -1)
|
| 230 |
+
cv2.circle(frame, (int(px), int(py)), radius + 2, (255, 255, 255), 2)
|
| 231 |
+
else:
|
| 232 |
+
cv2.circle(frame, (int(px), int(py)), radius, color, 2)
|
| 233 |
+
|
| 234 |
+
out.write(frame)
|
| 235 |
+
current_frame += 1
|
| 236 |
+
|
| 237 |
+
cap.release()
|
| 238 |
+
out.release()
|
| 239 |
+
return out_path
|
| 240 |
+
|
| 241 |
+
|
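
Note: how the keyframe hold window above behaves, as a standalone sketch with an illustrative frame index. With KEYFRAME_HOLD_FRAMES = 3, a model keyframe at frame 30 is drawn solid for seven frames and outline-only elsewhere:

raw_kf = {30}  # illustrative keyframe set
hold = set(f for kf in raw_kf for f in range(kf - 3, kf + 3 + 1))
# -> {27, 28, 29, 30, 31, 32, 33}
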
+# ── Inference functions ────────────────────────────────────────────────────────
+
+
+@spaces.GPU
+def process_images(user_text, input_images, max_tokens):
+    if not input_images:
+        return "Please upload at least one image.", [], "[]"
+
+    pil_images = []
+    for img_path in input_images:
+        if isinstance(img_path, tuple):
+            img_path = img_path[0]
+        pil_images.append(Image.open(img_path).convert("RGB"))
+
+    # Build messages
+    content = [dict(type="text", text=user_text)]
+    for img in pil_images:
+        content.append(dict(type="image", image=img))
+    messages = [{"role": "user", "content": content}]
+
+    # Process inputs
+    images, _, _ = process_vision_info(messages)
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    print(f"Prompt: {text}")
+
+    inputs = processor(
+        images=images,
+        text=text,
+        padding=True,
+        return_tensors="pt",
+        return_pointing_metadata=True,
+    )
+    metadata = inputs.pop("metadata")
+    inputs = {k: cast_float_bf16(v.to(model.device)) for k, v in inputs.items()}
+
+    # Generate
+    with torch.inference_mode():
+        with torch.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+            output = model.generate(
+                **inputs,
+                logits_processor=model.build_logit_processor_from_inputs(inputs),
+                max_new_tokens=int(max_tokens),
+                temperature=0
+            )
+
+    generated_tokens = output[0, inputs["input_ids"].size(1):]
+    generated_text = processor.decode(generated_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+
+    # Extract points
+    points = model.extract_image_points(
+        generated_text,
+        metadata["token_pooling"],
+        metadata["subpatch_mapping"],
+        metadata["image_sizes"],
+    )
+
+    points_table = format_points_list(points, is_video=False)
+
+    print(f"Output text: {generated_text}")
+    print("Extracted points:", points_table)
+
+    if points:
+        group_by_index = defaultdict(list)
+        for object_id, ix, x, y in points:
+            group_by_index[ix].append((x, y))
+        annotated = []
+        for ix, pts in group_by_index.items():
+            annotated.append(draw_points(images[ix], pts))
+        return generated_text, annotated, points_table
+
+    return generated_text, pil_images, points_table
+
+
+@spaces.GPU
+def process_video(user_text, video_path, frame_sample_mode, max_frames, max_fps, max_tokens):
+    if not video_path:
+        return "Please upload a video.", None, [], "[]"
+
+    # Build messages
+    video_kwargs_msg = {
+        "num_frames": int(max_frames),
+        "frame_sample_mode": frame_sample_mode,
+    }
+    if max_fps is not None and max_fps > 0:
+        video_kwargs_msg["max_fps"] = int(max_fps)
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                dict(type="text", text=user_text),
+                dict(type="video", video=video_path, **video_kwargs_msg),
+            ],
+        }
+    ]
+
+    # Process vision info
+    _, videos, video_kwargs = process_vision_info(messages)
+    videos, video_metadatas = zip(*videos)
+    videos, video_metadatas = list(videos), list(video_metadatas)
+
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    print(f"Prompt: {text}")
+
+    inputs = processor(
+        videos=videos,
+        video_metadata=video_metadatas,
+        text=text,
+        padding=True,
+        return_tensors="pt",
+        return_pointing_metadata=True,
+        **video_kwargs,
+    )
+    metadata = inputs.pop("metadata")
+    inputs = {k: cast_float_bf16(v.to(model.device)) for k, v in inputs.items()}
+
+    # Generate
+    with torch.inference_mode():
+        with torch.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+            output = model.generate(
+                **inputs,
+                logits_processor=model.build_logit_processor_from_inputs(inputs),
+                max_new_tokens=int(max_tokens),
+            )
+
+    generated_tokens = output[0, inputs["input_ids"].size(1):]
+    generated_text = processor.decode(generated_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+
+    # Extract points
+    points = model.extract_video_points(
+        generated_text,
+        metadata["token_pooling"],
+        metadata["subpatch_mapping"],
+        metadata["timestamps"],
+        metadata["video_size"],
+    )
+
+    tracking = is_tracking_output(generated_text)
+    annotated_video = None
+    annotated_frames = []
+    points_table = format_points_list(points, is_video=True)
+
+    print(f"Output text: {generated_text}")
+    print("Extracted points:", points_table)
+
+    if points:
+        print(f"Extracted {len(points)} points. Tracking={tracking}")
+
+        # Build annotated frames on sampled video frames
+        if tracking:
+            group_by_time = defaultdict(list)
+            for object_id, ts, x, y in points:
+                group_by_time[ts].append((object_id, x, y))
+            group_by_frame = defaultdict(list)
+            for ts, pts_with_ids in group_by_time.items():
+                ix = int(np.argmin(np.abs(metadata["timestamps"] - ts)))
+                group_by_frame[ix] += pts_with_ids
+            for ix, pts_with_ids in sorted(group_by_frame.items()):
+                frame_img = draw_points_colored(videos[0][ix], pts_with_ids)
+                ts = metadata["timestamps"][ix]
+                annotated_frames.append((frame_img, f"t={ts:.2f}s"))
+        else:
+            group_by_time = defaultdict(list)
+            for object_id, ts, x, y in points:
+                group_by_time[ts].append((x, y))
+            group_by_frame = defaultdict(list)
+            for ts, pts in group_by_time.items():
+                ix = int(np.argmin(np.abs(metadata["timestamps"] - ts)))
+                group_by_frame[ix] += pts
+            for ix, pts in sorted(group_by_frame.items()):
+                frame_img = draw_points(videos[0][ix], pts)
+                ts = metadata["timestamps"][ix]
+                annotated_frames.append((frame_img, f"t={ts:.2f}s"))
+
+        # Annotated video with interpolation + trails
+        annotated_video = create_annotated_video(video_path, points, metadata, tracking)
+
+    return generated_text, annotated_video, annotated_frames, points_table
+
+
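
Note: the np.argmin(np.abs(metadata["timestamps"] - ts)) lookup in process_video snaps each predicted timestamp to the nearest sampled frame. A standalone sketch with assumed frame times:

import numpy as np

timestamps = np.array([0.0, 0.5, 1.0, 1.5])   # illustrative sampled-frame times
ts = 0.6                                      # a predicted point timestamp
ix = int(np.argmin(np.abs(timestamps - ts)))  # -> 1, i.e. the 0.5 s frame
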
+# ── Gradio UI ──────────────────────────────────────────────────────────────────
+
+# Read processor defaults for video settings
+_default_frame_sample_mode = processor.video_processor.frame_sample_mode
+_default_max_frames = processor.video_processor.num_frames
+
+css = """
+#col-container {
+    margin: 0 auto;
+    max-width: 960px;
+}
+#main-title h1 {font-size: 2.3em !important;}
+#input_image image {
+    object-fit: contain !important;
+}
+#input_video video {
+    object-fit: contain !important;
+}
+.gallery-item img {
+    border: none !important;
+    outline: none !important;
+}
+"""
+
+with gr.Blocks() as demo:
+    gr.Markdown("# **Molmo-Point Demo**", elem_id="main-title")
+    gr.Markdown(
+        "Image & video pointing and tracking using the "
+        "[MolmoPoint-8B](https://huggingface.co/allenai/MolmoPoint-8B) pointing model."
+    )
+
+    with gr.Row():
+        # ── LEFT COLUMN: Inputs ──
+        with gr.Column():
+            with gr.Tabs() as input_tabs:
+                with gr.TabItem("Video Pointing & Tracking", id="video_tab") as video_tab:
+                    video = gr.Video(label="Input Video", elem_id="input_video", height=MAX_VIDEO_HEIGHT)
+                with gr.TabItem("Image(s) Pointing", id="image_tab") as image_tab:
+                    images_input = gr.Gallery(
+                        label="Input Images", elem_id="input_image", type="filepath", height=MAX_IMAGE_SIZE,
+                    )
+
+            input_text = gr.Textbox(placeholder="Enter the prompt", label="Input text")
+
+            with gr.Row(visible=True) as video_params_row:
+                frame_sample_mode = gr.Dropdown(choices=[_default_frame_sample_mode, "fps"], value=_default_frame_sample_mode, label="frame_sample_mode")
+                max_frames = gr.Number(value=_default_max_frames, label="max_frames")
+                max_fps = gr.Number(value=MAX_FPS, label="max_fps")
+            max_tok_slider = gr.Slider(label="max_tokens", minimum=1, maximum=4096, step=1, value=MAX_NEW_TOKENS)
+
+            with gr.Row():
+                submit_button = gr.Button("Submit", variant="primary", scale=3)
+                clear_all_button = gr.ClearButton(
+                    components=[video, images_input, input_text], value="Clear All", scale=1,
+                )
+
+        # ── RIGHT COLUMN: Outputs ──
+        with gr.Column():
+            with gr.Tabs():
+                with gr.TabItem("Output Text"):
+                    output_text = gr.Textbox(placeholder="Output text", label="Output text", lines=10)
+                with gr.TabItem("Extracted Points"):
+                    output_points = gr.Textbox(
+                        label="Extracted Points ([[id, time/index, x, y]])", lines=15,
+                    )
+
+            with gr.Tabs(visible=True) as video_output_tabs:
+                with gr.TabItem("Annotated Video"):
+                    output_video = gr.Video(label="Annotated Video", height=MAX_VIDEO_HEIGHT)
+                with gr.TabItem("Annotated Frames"):
+                    gr.Markdown("*Click a frame to zoom in. Press Esc to go back.*")
+                    output_annotations = gr.Gallery(label="Annotated Frames (Video)", height=MAX_IMAGE_SIZE)
+
+            with gr.Group(visible=False) as image_output_group:
+                gr.Markdown("*Click a frame to zoom in. Press Esc to go back.*")
+                output_annotations_img = gr.Gallery(label="Annotated Images", height=MAX_IMAGE_SIZE)
+
+    # ── Examples ──
+    with gr.Group(visible=True) as video_examples_group:
+        gr.Markdown("### Video Examples")
+        gr.Examples(
+            examples=[
+                ["example-videos/penguins.mp4", "Track all the penguins."],
+                ["example-videos/arena_basketball.mp4", "Track the players in yellow uniform in 1 fps."],
+            ],
+            inputs=[video, input_text],
+            label="Video Pointing & Tracking Examples",
+        )
+
+    with gr.Group(visible=False) as image_examples_group:
+        gr.Markdown("### Image Examples")
+        gr.Examples(
+            examples=[
+                [["example-images/boat1.jpeg", "example-images/boat2.jpeg"], "Point to the boats."],
+                [["example-images/messy1.jpg", "example-images/messy2.jpg", "example-images/messy3.jpg", "example-images/messy4.jpg"], "Point to the scissors."],
+            ],
+            inputs=[images_input, input_text],
+            label="Image Pointing Examples",
+        )
+
+    # ── Tab switching: toggle visibility + track active tab ──
+    active_tab = gr.State("video")
+
+    def _select_video_tab():
+        return (
+            "video",
+            gr.update(visible=True),   # video_examples_group
+            gr.update(visible=False),  # image_examples_group
+            gr.update(visible=True),   # video_params_row
+            gr.update(visible=True),   # video_output_tabs
+            gr.update(visible=False),  # image_output_group
+        )
+
+    def _select_image_tab():
+        return (
+            "image",
+            gr.update(visible=False),  # video_examples_group
+            gr.update(visible=True),   # image_examples_group
+            gr.update(visible=False),  # video_params_row
+            gr.update(visible=False),  # video_output_tabs
+            gr.update(visible=True),   # image_output_group
+        )
+
+    tab_outputs = [active_tab, video_examples_group, image_examples_group, video_params_row, video_output_tabs, image_output_group]
+    video_tab.select(fn=_select_video_tab, outputs=tab_outputs)
+    image_tab.select(fn=_select_image_tab, outputs=tab_outputs)
+
+    def _show_fps_tip(generated_text, current_max_fps):
+        """Show a toast notification if max_fps doesn't match the detected task type."""
+        tracking = "<tracks" in generated_text
+        pointing = "<point" in generated_text
+        if pointing and int(current_max_fps) != 2:
+            gr.Info("Tip: For best video pointing results, set max_fps=2.")
+        elif tracking and int(current_max_fps) != 10:
+            gr.Info("Tip: For best tracking results, set max_fps=10.")
+
+    def dispatch_submit(tab, user_text, video_path, input_images,
+                        fsm, mf, mfps, max_tok):
+        if tab == "image":
+            text_out, img_gallery, pts = process_images(user_text, input_images, max_tok)
+            return text_out, pts, None, [], img_gallery
+        else:
+            text_out, ann_video, ann_frames, pts = process_video(
+                user_text, video_path, fsm, mf, mfps, max_tok,
+            )
+            _show_fps_tip(text_out, mfps)
+            return text_out, pts, ann_video, ann_frames, []
+
+    submit_button.click(
+        fn=dispatch_submit,
+        inputs=[active_tab, input_text, video, images_input,
+                frame_sample_mode, max_frames, max_fps, max_tok_slider],
+        outputs=[output_text, output_points, output_video, output_annotations, output_annotations_img],
+    )
+
+if __name__ == "__main__":
+    demo.launch(css=css, ssr_mode=False, show_error=True, share=True)
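
Note: stripped of the Gradio UI, the image-pointing path in app.py reduces to a short script. A minimal sketch following the same calls the file makes above (your_image.jpg is a placeholder; app.py routes images through molmo_utils.process_vision_info first, which this sketch skips, and the pointing-specific processor/model methods come from the MolmoPoint-8B remote code):

import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

MODEL_ID = "allenai/MolmoPoint-8B"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True, padding_side="left")
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, trust_remote_code=True, dtype="bfloat16", device_map="auto"
)

image = Image.open("your_image.jpg").convert("RGB")  # placeholder input
messages = [{"role": "user", "content": [
    {"type": "text", "text": "Point to the scissors."},
    {"type": "image", "image": image},
]}]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(images=[image], text=text, padding=True,
                   return_tensors="pt", return_pointing_metadata=True)
metadata = inputs.pop("metadata")
# Move to device and cast floating tensors to bfloat16, as cast_float_bf16 does in app.py.
inputs = {k: (v.to(torch.bfloat16) if torch.is_floating_point(v) else v).to(model.device)
          for k, v in inputs.items()}

with torch.inference_mode():
    output = model.generate(
        **inputs,
        logits_processor=model.build_logit_processor_from_inputs(inputs),
        max_new_tokens=2048,
    )
generated = processor.decode(output[0, inputs["input_ids"].size(1):], skip_special_tokens=True)
points = model.extract_image_points(
    generated, metadata["token_pooling"], metadata["subpatch_mapping"], metadata["image_sizes"]
)
print(points)  # [(object_id, image_index, x, y), ...]
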
example-images/boat1.jpeg
ADDED
Git LFS Details

example-images/boat2.jpeg
ADDED
Git LFS Details

example-images/messy1.jpg
ADDED
Git LFS Details

example-images/messy2.jpg
ADDED
Git LFS Details

example-images/messy3.jpg
ADDED
Git LFS Details

example-images/messy4.jpg
ADDED
Git LFS Details
example-videos/arena_basketball.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a965ceced2053d1e456b2ce4e4a3fc87a64e4520af7743e91885a2ae11dc237
+size 12297652
example-videos/backflip.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10ac0f73fc374bd6ebb63f3d8d145bb11ef1c713b71f433e31c98b1b0f536018
+size 11171759
example-videos/penguins.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:856bfacc3de618a5154fc6dd0240ad375a8f76faa486070a996007f17f9d3624
+size 1689459
pre-requirements.txt
ADDED
@@ -0,0 +1 @@
+pip>=23.0.0
requirements.txt
ADDED
@@ -0,0 +1,14 @@
+git+https://github.com/huggingface/transformers.git@v4.57.1
+git+https://github.com/huggingface/accelerate.git
+torch==2.8.0
+torchvision
+pillow
+einops
+decord2
+molmo_utils
+opencv-python
+numpy
+gradio
+spaces
+kernels
+hf_xet