File size: 1,629 Bytes
b2c2640
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python3
"""
Sanity check: load MiniCPM-o 4.5 and run a single sample through it.

Picks one video from sync eval set, passes video + audio + prompt, prints
the model's response.
"""

from __future__ import annotations

import os
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))
from minicpmo_inference import load_model, run_inference


def main():
    """Smoke-test MiniCPM-o 4.5 on a single sync-eval sample.

    Selects the alphabetically first original clip under the sync eval set,
    locates its pre-extracted audio track, runs one round of inference with a
    sync-detection prompt, and prints the model's response. Exits with status
    1 if either the video or its audio counterpart is missing.
    """
    source_dir = Path("/opt/dlami/nvme/video_source/original/uag_oops")
    wav_dir = Path("/opt/dlami/nvme/video_source/extracted_audio/original/uag_oops")

    clips = sorted(source_dir.glob("*.mp4"))
    if not clips:
        print(f"ERROR: no videos found at {source_dir}")
        sys.exit(1)
    clip = clips[0]

    # Audio lives alongside under wav_dir with the same basename, .wav suffix.
    wav = wav_dir / clip.with_suffix(".wav").name
    if not wav.exists():
        print(f"ERROR: audio not found for {clip.name}")
        sys.exit(1)

    print(f"Video: {clip}")
    print(f"Audio: {wav}")
    print()

    model, tokenizer = load_model()

    question = (
        "Watch this video and listen to its audio carefully. "
        "Determine whether the audio and video tracks are synchronized. "
        "Explain your reasoning."
    )

    print("=== Running inference ===")
    # temperature=0.0 requests deterministic (greedy) decoding for a
    # reproducible sanity check.
    answer = run_inference(
        model,
        tokenizer,
        video_path=str(clip),
        audio_path=str(wav),
        prompt=question,
        max_new_tokens=128,
        temperature=0.0,
    )
    print()
    print("=== Response ===")
    print(answer)


if __name__ == "__main__":
    main()