anonymouscla committed
Commit 26b567b · verified · 1 Parent(s): 6d0e4db

infer: accept HF Hub repo id for --adapter-dir

Files changed (1)
  1. infer.py +413 -0
infer.py ADDED
@@ -0,0 +1,413 @@
+ """Run inference with the anonymous judge LoRA adapter.
+
+ The script can either load files from a local directory or pull them
+ directly from the Hugging Face Hub. By default it points at the
+ companion repository ``anonymouscla/physground-judger9B``:
+
+     # From the Hub (no clone needed):
+     python infer.py --video demo.mp4 --caption "A ball rolls down a ramp." --metric SA
+     python infer.py --video demo.mp4 --caption "A ball rolls down a ramp." --law gravity
+
+     # From a local clone of the model repo:
+     python infer.py --adapter-dir /path/to/local/clone --video demo.mp4 \
+         --caption "A ball rolls down a ramp." --law gravity
+
+ It loads:
+ - adapter_config.json to find the base model
+ - adapter_model.safetensors through PEFT
+ - subq+human.yaml to render the scoring prompt
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import re
+ from pathlib import Path
+ from typing import Any
+
+ import torch
+ import yaml
+ from peft import PeftModel
+ from transformers import AutoProcessor
+
+
+ GENERAL_SUB_QUESTIONS: dict[str, list[str]] = {
+     "SA": [
+         "Are the main objects in the caption present in the video?",
+         "Are the key actions or interactions from the caption visible?",
+         "Are important scene attributes and relationships preserved?",
+         "Does the video avoid major contradictions to the caption?",
+     ],
+     "PTV": [
+         "Do causes appear before their effects?",
+         "Do physical events unfold in a plausible temporal order?",
+         "Are motion transitions continuous rather than abrupt jumps or loops?",
+         "Does the sequence avoid impossible reversals or repeated resets?",
+     ],
+     "persistence": [
+         "Do objects maintain consistent existence throughout the video?",
+         "Do objects keep a stable shape, size, color, and texture?",
+         "Do objects avoid disappearing, appearing, or transforming unexpectedly?",
+         "Do objects preserve identity through motion and brief occlusion?",
+     ],
+ }
+
+
+ PHYSICAL_CRITERIA: dict[str, str] = {
+     "gravity": "Do unsupported objects fall downward? Do thrown objects follow a curved trajectory? Does poured liquid fall with gravity?",
+     "inertia": "Do stationary objects remain still unless acted upon? Do moving objects maintain their motion unless stopped by friction, collision, or an obstacle?",
+     "momentum": "After collision, push, or pull, is the direction of motion reasonable? Ignore speed magnitude.",
+     "impenetrability": "Do objects maintain impenetrability -- no passing through each other?",
+     "collision": "After impact, is there reasonable bounce/shatter/deformation? Does response match impact force?",
+     "material": "Does each material respond according to its properties? (glass shatters, rubber bounces, metal is rigid, cloth deforms softly, etc.)",
+     "buoyancy": "Do dense objects sink? Do wood/plastic float?",
+     "displacement": "When you add more liquid or put an object into it, does the liquid level rise in a realistic way? Does it overflow when full?",
+     "flow_dynamics": "Does the liquid's overall motion behave realistically over time -- flowing along surfaces, spreading, draining naturally?",
+     "boundary_interaction": "When the liquid hits a boundary such as a rock face, container wall, or floor, does it respond realistically? Do local splash, rebound, or split patterns on impact look physically plausible?",
+     "fluid_continuity": "Does the liquid avoid disappearing or appearing out of nowhere? Small splashes that briefly break apart are okay.",
+     "reflection": "Does the reflection roughly match objects and colors in the scene, and avoid completely unrelated content?",
+     "shadow": "Are shadow directions consistent with light source? Do shadows move with objects?",
+ }
+
+
+ PHYSICAL_SUB_QUESTIONS: dict[str, list[str]] = {
+     "gravity": [
+         "Do unsupported objects or liquids move downward over time?",
+         "Do thrown or falling objects follow a plausible gravity-driven path?",
+         "Does the video avoid objects floating or rising without support?",
+     ],
+     "inertia": [
+         "Do stationary objects remain still unless a visible force acts on them?",
+         "Do moving objects continue plausibly until friction, collision, or an obstacle changes their motion?",
+         "Does the video avoid unexplained starts, stops, or direction changes?",
+     ],
+     "momentum": [
+         "After contact, push, pull, or collision, are motion directions plausible?",
+         "Does the reacting object move in a direction consistent with the interaction?",
+         "Does the video avoid impossible reversals or unrelated motion changes?",
+     ],
+     "impenetrability": [
+         "Do solid objects avoid passing through one another?",
+         "Do contacts and overlaps remain physically plausible?",
+         "Does the video avoid obvious clipping or penetration artifacts?",
+     ],
+     "collision": [
+         "Does impact cause a plausible bounce, break, deformation, or transfer of motion?",
+         "Is the response direction consistent with the collision?",
+         "Does the response avoid being much too weak, too strong, or unrelated to the impact?",
+     ],
+     "material": [
+         "Do objects respond consistently with their apparent material?",
+         "Are rigid, soft, brittle, elastic, or fluid-like objects animated appropriately?",
+         "Does the video avoid material behavior that contradicts the scene?",
+     ],
+     "buoyancy": [
+         "Do objects sink or float in a way consistent with apparent density?",
+         "Does the floating or sinking behavior stay stable over time?",
+         "Does the video avoid unsupported hovering or impossible underwater motion?",
+     ],
+     "displacement": [
+         "Does liquid level rise when volume is added or an object enters it?",
+         "Does overflow happen only when the container is plausibly full?",
+         "Does the liquid volume remain visually plausible?",
+     ],
+     "flow_dynamics": [
+         "Does liquid flow along surfaces, spread, or drain naturally?",
+         "Does the flow direction follow gravity and boundaries?",
+         "Does the video avoid abrupt stops, reversals, or unsupported uphill flow?",
+     ],
+     "boundary_interaction": [
+         "Does liquid react plausibly when hitting a wall, floor, container, or obstacle?",
+         "Are splash, rebound, or split patterns locally plausible?",
+         "Does the liquid remain consistent after interacting with boundaries?",
+     ],
+     "fluid_continuity": [
+         "Does liquid avoid disappearing or appearing without cause?",
+         "Does the amount of liquid remain broadly consistent?",
+         "Are splashes and separations temporary and physically plausible?",
+     ],
+     "reflection": [
+         "Does the reflection match nearby objects, colors, and motion?",
+         "Does the reflected content stay spatially consistent with the scene?",
+         "Does the video avoid unrelated or impossible reflection content?",
+     ],
+     "shadow": [
+         "Are shadows consistent with the apparent light source direction?",
+         "Do shadows move with the objects that cast them?",
+         "Does the video avoid missing, detached, or contradictory shadows?",
+     ],
+ }
+
+
+ def load_json(path: Path) -> dict[str, Any]:
+     with path.open() as f:
+         return json.load(f)
+
+
+ def load_yaml(path: Path) -> dict[str, Any]:
+     with path.open() as f:
+         return yaml.safe_load(f)
+
+
+ def questions_block(questions: list[str]) -> str:
+     return "\n".join(f"{idx}. {question}" for idx, question in enumerate(questions, 1))
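+
+ # e.g. questions_block(["A?", "B?"]) -> "1. A?\n2. B?"
+
+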
+ def build_prompt(
+     cfg: dict[str, Any],
+     caption: str,
+     *,
+     metric: str | None = None,
+     law: str | None = None,
+     criteria: str | None = None,
+ ) -> tuple[str, str, str]:
+     if metric:
+         if metric not in GENERAL_SUB_QUESTIONS:
+             raise ValueError(f"unknown metric: {metric}")
+         prompt = cfg["eval_prompts"][metric].format(
+             prompt=caption,
+             questions_block=questions_block(GENERAL_SUB_QUESTIONS[metric]),
+         )
+         return cfg["system_prompt"], prompt, metric
+
+     if not law:
+         raise ValueError("either --metric or --law is required")
+     if law not in PHYSICAL_CRITERIA:
+         raise ValueError(f"unknown law: {law}")
+     prompt = cfg["physical_template"].format(
+         prompt=caption,
+         law=law,
+         criteria=criteria or PHYSICAL_CRITERIA[law],
+         questions_block=questions_block(PHYSICAL_SUB_QUESTIONS[law]),
+     )
+     return cfg["system_prompt"], prompt, law
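+
+ # build_prompt returns (system_prompt, user_prompt, score_key): score_key is
+ # the metric or law name and is later used as the JSON key when parsing the
+ # model's score.
+
+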
+ def load_base_model(base_id: str, dtype: torch.dtype, device_map: str):
+     # Try the newest multimodal auto-class first, falling back for older
+     # transformers releases that only expose Vision2Seq or CausalLM.
+     errors: list[str] = []
+     for class_name in (
+         "AutoModelForImageTextToText",
+         "AutoModelForVision2Seq",
+         "AutoModelForCausalLM",
+     ):
+         try:
+             module = __import__("transformers", fromlist=[class_name])
+             model_cls = getattr(module, class_name)
+             return model_cls.from_pretrained(
+                 base_id,
+                 torch_dtype=dtype,
+                 device_map=device_map,
+                 trust_remote_code=True,
+             )
+         except Exception as exc:  # pragma: no cover - depends on local transformers version
+             errors.append(f"{class_name}: {exc}")
+     raise RuntimeError("failed to load base model:\n" + "\n".join(errors))
+
+
+ def resolve_adapter_dir(source: str) -> Path:
+     """Return a local directory holding the adapter files.
+
+     If ``source`` is a directory containing ``adapter_config.json`` it is used
+     as-is. Otherwise ``source`` is interpreted as a HF Hub repo id and the
+     snapshot is downloaded into the local cache.
+     """
+     candidate = Path(source)
+     if candidate.is_dir() and (candidate / "adapter_config.json").exists():
+         return candidate
+     try:
+         from huggingface_hub import snapshot_download
+     except ImportError as exc:
+         raise ImportError(
+             "huggingface_hub is required to fetch the adapter from the Hub. "
+             "Install it with: pip install huggingface_hub"
+         ) from exc
+     return Path(snapshot_download(repo_id=source))
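+
+ # Examples (paths illustrative): resolve_adapter_dir("./my-clone") uses the
+ # directory as-is, while resolve_adapter_dir("anonymouscla/physground-judger9B")
+ # downloads the Hub snapshot into the local HF cache and returns that path.
+
+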
+ def load_model(adapter_source: str, dtype: torch.dtype, device_map: str) -> tuple[Any, Any, Path]:
+     adapter_dir = resolve_adapter_dir(adapter_source)
+     adapter_cfg = load_json(adapter_dir / "adapter_config.json")
+     base_id = adapter_cfg["base_model_name_or_path"]
+     processor = AutoProcessor.from_pretrained(base_id, trust_remote_code=True)
+     base = load_base_model(base_id, dtype=dtype, device_map=device_map)
+     model = PeftModel.from_pretrained(base, adapter_dir)
+     model.eval()
+     return processor, model, adapter_dir
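+
+ # Typical call, mirroring main's defaults:
+ #   processor, model, adapter_dir = load_model(
+ #       "anonymouscla/physground-judger9B", torch.bfloat16, "auto")
+
+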
+ def build_messages(system_prompt: str, user_prompt: str, video_path: Path) -> list[dict[str, Any]]:
+     return [
+         {"role": "system", "content": system_prompt},
+         {
+             "role": "user",
+             "content": [
+                 {"type": "video", "video": str(video_path)},
+                 {"type": "text", "text": user_prompt},
+             ],
+         },
+     ]
+
+
+ def prepare_inputs(
+     processor: Any,
+     messages: list[dict[str, Any]],
+     device: torch.device,
+     *,
+     fps: float,
+     max_pixels: int,
+ ) -> dict[str, Any]:
+     text = processor.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True,
+     )
+
+     try:
+         from qwen_vl_utils import process_vision_info
+     except ImportError as exc:
+         raise ImportError(
+             "qwen-vl-utils is required for local video inference. "
+             "Install it with: pip install qwen-vl-utils[decord]"
+         ) from exc
+
+     # Attach sampling hints to every video entry without overriding explicit ones.
+     for msg in messages:
+         content = msg.get("content")
+         if isinstance(content, list):
+             for item in content:
+                 if item.get("type") == "video":
+                     item.setdefault("fps", fps)
+                     item.setdefault("max_pixels", max_pixels)
+
+     # Older qwen-vl-utils releases do not accept return_video_kwargs.
+     try:
+         image_inputs, video_inputs, video_kwargs = process_vision_info(
+             messages,
+             return_video_kwargs=True,
+         )
+     except TypeError:
+         image_inputs, video_inputs = process_vision_info(messages)
+         video_kwargs = {}
+
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+         **video_kwargs,
+     )
+     return inputs.to(device)
+
+
+ def decode_generated(processor: Any, inputs: dict[str, Any], generated_ids: torch.Tensor) -> str:
+     input_len = inputs["input_ids"].shape[1]
+     generated_ids = generated_ids[:, input_len:]
+     return processor.batch_decode(
+         generated_ids,
+         skip_special_tokens=True,
+         clean_up_tokenization_spaces=False,
+     )[0].strip()
+
+
+ def parse_score(text: str, key: str) -> int | None:
+     # Prefer a well-formed JSON object; fall back to a loose `key: digit` match.
+     match = re.search(r"\{.*?\}", text, flags=re.S)
+     if match:
+         try:
+             obj = json.loads(match.group(0))
+             value = obj.get(key)
+             if isinstance(value, int) and 1 <= value <= 5:
+                 return value
+         except json.JSONDecodeError:
+             pass
+     match = re.search(rf'"?{re.escape(key)}"?\s*:\s*([1-5])', text)
+     if match:
+         return int(match.group(1))
+     return None
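+
+ # Examples (score values illustrative):
+ #   parse_score('{"gravity": 4}', "gravity") -> 4
+ #   parse_score("SA: 3 because ...", "SA") -> 3
+ #   parse_score("no usable score", "SA") -> None
+
+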
+ def dtype_from_name(name: str) -> torch.dtype:
+     if name == "bfloat16":
+         return torch.bfloat16
+     if name == "float16":
+         return torch.float16
+     if name == "float32":
+         return torch.float32
+     raise ValueError(f"unsupported dtype: {name}")
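+
+
+ # main() prints a single JSON object to stdout, e.g. (values illustrative):
+ #   {"key": "gravity", "score": 4, "raw": "{\"gravity\": 4}"}
+ # "score" is None whenever the model response cannot be parsed.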
+ def main() -> None:
+     parser = argparse.ArgumentParser(description="Infer with the anonymous judge adapter.")
+     parser.add_argument(
+         "--adapter-dir",
+         default="anonymouscla/physground-judger9B",
+         help=(
+             "Local directory with adapter_config.json + adapter_model.safetensors "
+             "+ subq+human.yaml, or a HF Hub repo id "
+             "(default: anonymouscla/physground-judger9B)."
+         ),
+     )
+     parser.add_argument("--video", required=True, type=Path)
+     parser.add_argument("--caption", required=True)
+     group = parser.add_mutually_exclusive_group(required=True)
+     group.add_argument("--metric", choices=["SA", "PTV", "persistence"])
+     group.add_argument("--law", choices=sorted(PHYSICAL_CRITERIA))
+     parser.add_argument("--criteria", help="Override physical-law criterion text.")
+     parser.add_argument("--max-new-tokens", type=int, default=64)
+     parser.add_argument("--temperature", type=float, default=0.0)
+     parser.add_argument("--fps", type=float, default=2.0)
+     parser.add_argument("--max-pixels", type=int, default=360 * 640)
+     parser.add_argument("--dtype", choices=["bfloat16", "float16", "float32"], default="bfloat16")
+     parser.add_argument("--device-map", default="auto")
+     parser.add_argument("--print-prompt", action="store_true")
+     args = parser.parse_args()
+
+     if not args.video.is_file():
+         raise FileNotFoundError(args.video)
+
+     dtype = dtype_from_name(args.dtype)
+     processor, model, adapter_dir = load_model(
+         args.adapter_dir, dtype=dtype, device_map=args.device_map
+     )
+
+     prompt_cfg = load_yaml(adapter_dir / "subq+human.yaml")
+     system_prompt, user_prompt, score_key = build_prompt(
+         prompt_cfg,
+         args.caption,
+         metric=args.metric,
+         law=args.law,
+         criteria=args.criteria,
+     )
+
+     if args.print_prompt:
+         print("SYSTEM:")
+         print(system_prompt)
+         print("\nUSER:")
+         print(user_prompt)
+         print()
+
+     device = next(model.parameters()).device
+     messages = build_messages(system_prompt, user_prompt, args.video)
+     inputs = prepare_inputs(
+         processor,
+         messages,
+         device,
+         fps=args.fps,
+         max_pixels=args.max_pixels,
+     )
+
+     # Greedy decoding by default; drop the None temperature so generate()
+     # only receives keys that apply.
+     generation_kwargs: dict[str, Any] = {
+         "max_new_tokens": args.max_new_tokens,
+         "do_sample": args.temperature > 0,
+         "temperature": args.temperature if args.temperature > 0 else None,
+     }
+     generation_kwargs = {k: v for k, v in generation_kwargs.items() if v is not None}
+
+     with torch.inference_mode():
+         generated_ids = model.generate(**inputs, **generation_kwargs)
+
+     raw = decode_generated(processor, inputs, generated_ids)
+     score = parse_score(raw, score_key)
+     print(json.dumps({"key": score_key, "score": score, "raw": raw}, ensure_ascii=False, indent=2))
+
+
+ if __name__ == "__main__":
+     main()