| """Run inference with the anonymous judge LoRA adapter. |
| |
| The script can either load files from a local directory or pull them |
| directly from the Hugging Face Hub. By default it points at the |
| companion repository ``anonymouscla/physground-judger9B``: |
| |
| # From the Hub (no clone needed): |
| python infer.py --video demo.mp4 --caption "A ball rolls down a ramp." --metric SA |
| python infer.py --video demo.mp4 --caption "A ball rolls down a ramp." --law gravity |
| |
| # From a local clone of the model repo: |
| python infer.py --adapter-dir /path/to/local/clone --video demo.mp4 \ |
| --caption "A ball rolls down a ramp." --law gravity |
| |
| It loads: |
| - adapter_config.json to find the base model |
| - adapter_model.safetensors through PEFT |
| - subq+human.yaml to render the scoring prompt |
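
On success the script prints a JSON object with three fields: ``key``
(the metric or law that was scored), ``score`` (an integer 1-5, or
null when no score could be parsed), and ``raw`` (the full decoded
model output).

The helpers can also be driven programmatically. A minimal sketch,
using the same defaults as the CLI:

    processor, model, adapter_dir = load_model(
        "anonymouscla/physground-judger9B",
        dtype=torch.bfloat16,
        device_map="auto",
    )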
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import re |
| from pathlib import Path |
| from typing import Any |
|
|
| import torch |
| import yaml |
| from peft import PeftModel |
| from transformers import AutoProcessor |
|
|
|
|
| GENERAL_SUB_QUESTIONS: dict[str, list[str]] = { |
| "SA": [ |
| "Are the main objects in the caption present in the video?", |
| "Are the key actions or interactions from the caption visible?", |
| "Are important scene attributes and relationships preserved?", |
| "Does the video avoid major contradictions to the caption?", |
| ], |
| "PTV": [ |
| "Do causes appear before their effects?", |
| "Do physical events unfold in a plausible temporal order?", |
| "Are motion transitions continuous rather than abrupt jumps or loops?", |
| "Does the sequence avoid impossible reversals or repeated resets?", |
| ], |
| "persistence": [ |
| "Do objects maintain consistent existence throughout the video?", |
| "Do objects keep a stable shape, size, color, and texture?", |
| "Do objects avoid disappearing, appearing, or transforming unexpectedly?", |
| "Do objects preserve identity through motion and brief occlusion?", |
| ], |
| } |
|
|
|
|
| PHYSICAL_CRITERIA: dict[str, str] = { |
| "gravity": "Do unsupported objects fall downward? Do thrown objects follow a curved trajectory? Does poured liquid fall with gravity?", |
| "inertia": "Do stationary objects remain still unless acted upon? Do moving objects maintain their motion unless stopped by friction, collision, or an obstacle?", |
| "momentum": "After collision, push, or pull, is the direction of motion reasonable? Ignore speed magnitude.", |
| "impenetrability": "Do objects maintain impenetrability -- no passing through each other?", |
| "collision": "After impact, is there reasonable bounce/shatter/deformation? Does response match impact force?", |
| "material": "Does each material respond according to its properties? (glass shatters, rubber bounces, metal is rigid, cloth deforms softly, etc.)", |
| "buoyancy": "Do dense objects sink? Do wood/plastic float?", |
| "displacement": "When you add more liquid or put an object into it, does the liquid level rise in a realistic way? Does it overflow when full?", |
| "flow_dynamics": "Does the liquid's overall motion behave realistically over time -- flowing along surfaces, spreading, draining naturally?", |
| "boundary_interaction": "When the liquid hits a boundary such as a rock face, container wall, or floor, does it respond realistically? Do local splash, rebound, or split patterns on impact look physically plausible?", |
| "fluid_continuity": "Does the liquid avoid disappearing or appearing out of nowhere? Small splashes that briefly break apart are okay.", |
| "reflection": "Does the reflection roughly match objects and colors in the scene, and avoid completely unrelated content?", |
| "shadow": "Are shadow directions consistent with light source? Do shadows move with objects?", |
| } |
|
|
|
|
| PHYSICAL_SUB_QUESTIONS: dict[str, list[str]] = { |
| "gravity": [ |
| "Do unsupported objects or liquids move downward over time?", |
| "Do thrown or falling objects follow a plausible gravity-driven path?", |
| "Does the video avoid objects floating or rising without support?", |
| ], |
| "inertia": [ |
| "Do stationary objects remain still unless a visible force acts on them?", |
| "Do moving objects continue plausibly until friction, collision, or an obstacle changes their motion?", |
| "Does the video avoid unexplained starts, stops, or direction changes?", |
| ], |
| "momentum": [ |
| "After contact, push, pull, or collision, are motion directions plausible?", |
| "Does the reacting object move in a direction consistent with the interaction?", |
| "Does the video avoid impossible reversals or unrelated motion changes?", |
| ], |
| "impenetrability": [ |
| "Do solid objects avoid passing through one another?", |
| "Do contacts and overlaps remain physically plausible?", |
| "Does the video avoid obvious clipping or penetration artifacts?", |
| ], |
| "collision": [ |
| "Does impact cause a plausible bounce, break, deformation, or transfer of motion?", |
| "Is the response direction consistent with the collision?", |
| "Does the response avoid being much too weak, too strong, or unrelated to the impact?", |
| ], |
| "material": [ |
| "Do objects respond consistently with their apparent material?", |
| "Are rigid, soft, brittle, elastic, or fluid-like objects animated appropriately?", |
| "Does the video avoid material behavior that contradicts the scene?", |
| ], |
| "buoyancy": [ |
| "Do objects sink or float in a way consistent with apparent density?", |
| "Does the floating or sinking behavior stay stable over time?", |
| "Does the video avoid unsupported hovering or impossible underwater motion?", |
| ], |
| "displacement": [ |
| "Does liquid level rise when volume is added or an object enters it?", |
| "Does overflow happen only when the container is plausibly full?", |
| "Does the liquid volume remain visually plausible?", |
| ], |
| "flow_dynamics": [ |
| "Does liquid flow along surfaces, spread, or drain naturally?", |
| "Does the flow direction follow gravity and boundaries?", |
| "Does the video avoid abrupt stops, reversals, or unsupported uphill flow?", |
| ], |
| "boundary_interaction": [ |
| "Does liquid react plausibly when hitting a wall, floor, container, or obstacle?", |
| "Are splash, rebound, or split patterns locally plausible?", |
| "Does the liquid remain consistent after interacting with boundaries?", |
| ], |
| "fluid_continuity": [ |
| "Does liquid avoid disappearing or appearing without cause?", |
| "Does the amount of liquid remain broadly consistent?", |
| "Are splashes and separations temporary and physically plausible?", |
| ], |
| "reflection": [ |
| "Does the reflection match nearby objects, colors, and motion?", |
| "Does the reflected content stay spatially consistent with the scene?", |
| "Does the video avoid unrelated or impossible reflection content?", |
| ], |
| "shadow": [ |
| "Are shadows consistent with the apparent light source direction?", |
| "Do shadows move with the objects that cast them?", |
| "Does the video avoid missing, detached, or contradictory shadows?", |
| ], |
| } |
|
|
|
|
| def load_json(path: Path) -> dict[str, Any]: |
| with path.open() as f: |
| return json.load(f) |
|
|
|
|
| def load_yaml(path: Path) -> dict[str, Any]: |
| with path.open() as f: |
| return yaml.safe_load(f) |
|
|
|
|
| def questions_block(questions: list[str]) -> str: |
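    """Render the sub-questions as a numbered (1-based), one-per-line block."""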
| return "\n".join(f"{idx}. {question}" for idx, question in enumerate(questions, 1)) |
|
|
|
|
| def build_prompt( |
| cfg: dict[str, Any], |
| caption: str, |
| *, |
| metric: str | None = None, |
| law: str | None = None, |
| criteria: str | None = None, |
| ) -> tuple[str, str, str]: |
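    """Render the system and user prompts for one general metric or physical law.

    Returns ``(system_prompt, user_prompt, score_key)``, where ``score_key``
    is the name under which ``parse_score`` later looks up the 1-5 score.
    """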
    if metric:
        if metric not in GENERAL_SUB_QUESTIONS:
            raise ValueError(f"unknown metric: {metric}")
        prompt = cfg["eval_prompts"][metric].format(
            prompt=caption,
            questions_block=questions_block(GENERAL_SUB_QUESTIONS[metric]),
        )
        return cfg["system_prompt"], prompt, metric

    if not law:
        raise ValueError("either --metric or --law is required")
    if law not in PHYSICAL_CRITERIA:
        raise ValueError(f"unknown law: {law}")
    prompt = cfg["physical_template"].format(
        prompt=caption,
        law=law,
        criteria=criteria or PHYSICAL_CRITERIA[law],
        questions_block=questions_block(PHYSICAL_SUB_QUESTIONS[law]),
    )
    return cfg["system_prompt"], prompt, law


def load_base_model(base_id: str, dtype: torch.dtype, device_map: str):
    """Load the base model, trying the multimodal auto classes first.

    Which auto class is available depends on the installed transformers
    version, so each candidate is tried in turn and the collected errors
    are reported only if all of them fail.
    """
    import transformers

    errors: list[str] = []
    for class_name in (
        "AutoModelForImageTextToText",
        "AutoModelForVision2Seq",
        "AutoModelForCausalLM",
    ):
        model_cls = getattr(transformers, class_name, None)
        if model_cls is None:
            errors.append(f"{class_name}: not available in this transformers version")
            continue
        try:
            return model_cls.from_pretrained(
                base_id,
                torch_dtype=dtype,
                device_map=device_map,
                trust_remote_code=True,
            )
        except Exception as exc:  # fall through to the next auto class
            errors.append(f"{class_name}: {exc}")
    raise RuntimeError("failed to load base model:\n" + "\n".join(errors))


def resolve_adapter_dir(source: str) -> Path:
    """Return a local directory holding the adapter files.

    If ``source`` is a directory containing ``adapter_config.json`` it is used
    as-is. Otherwise ``source`` is interpreted as a HF Hub repo id and the
    snapshot is downloaded into the local cache.
    """
    candidate = Path(source)
    if candidate.is_dir() and (candidate / "adapter_config.json").exists():
        return candidate
    try:
        from huggingface_hub import snapshot_download
    except ImportError as exc:
        raise ImportError(
            "huggingface_hub is required to fetch the adapter from the Hub. "
            "Install it with: pip install huggingface_hub"
        ) from exc
    return Path(snapshot_download(repo_id=source))


def load_model(adapter_source: str, dtype: torch.dtype, device_map: str) -> tuple[Any, Any, Path]:
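    """Resolve the adapter files, load their base model, and attach the adapter."""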
    adapter_dir = resolve_adapter_dir(adapter_source)
    adapter_cfg = load_json(adapter_dir / "adapter_config.json")
    base_id = adapter_cfg["base_model_name_or_path"]
    processor = AutoProcessor.from_pretrained(base_id, trust_remote_code=True)
    base = load_base_model(base_id, dtype=dtype, device_map=device_map)
    model = PeftModel.from_pretrained(base, adapter_dir)
    model.eval()
    return processor, model, adapter_dir


def build_messages(system_prompt: str, user_prompt: str, video_path: Path) -> list[dict[str, Any]]:
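    """Build Qwen-style chat messages with the video attached before the text prompt."""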
    return [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "video", "video": str(video_path)},
                {"type": "text", "text": user_prompt},
            ],
        },
    ]


def prepare_inputs(
    processor: Any,
    messages: list[dict[str, Any]],
    device: torch.device,
    *,
    fps: float,
    max_pixels: int,
) -> dict[str, Any]:
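    """Render the chat template and decode the video into tensor inputs.

    ``fps`` and ``max_pixels`` are attached to every video entry before
    ``qwen_vl_utils`` samples frames; ``setdefault`` keeps any values the
    message already carries.
    """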
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    try:
        from qwen_vl_utils import process_vision_info
    except ImportError as exc:
        raise ImportError(
            "qwen-vl-utils is required for local video inference. "
            "Install it with: pip install qwen-vl-utils[decord]"
        ) from exc

    for msg in messages:
        content = msg.get("content")
        if isinstance(content, list):
            for item in content:
                if item.get("type") == "video":
                    item.setdefault("fps", fps)
                    item.setdefault("max_pixels", max_pixels)

    # Newer qwen-vl-utils releases also return per-video kwargs (such as the
    # effective fps) that need to be forwarded to the processor.
    try:
        image_inputs, video_inputs, video_kwargs = process_vision_info(
            messages,
            return_video_kwargs=True,
        )
    except TypeError:
        # Older qwen-vl-utils without return_video_kwargs support.
        image_inputs, video_inputs = process_vision_info(messages)
        video_kwargs = {}

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    return inputs.to(device)


def decode_generated(processor: Any, inputs: dict[str, Any], generated_ids: torch.Tensor) -> str:
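    """Drop the prompt tokens and decode only the newly generated suffix."""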
| input_len = inputs["input_ids"].shape[1] |
| generated_ids = generated_ids[:, input_len:] |
| return processor.batch_decode( |
| generated_ids, |
| skip_special_tokens=True, |
| clean_up_tokenization_spaces=False, |
| )[0].strip() |
|
|
|
|
| def parse_score(text: str, key: str) -> int | None: |
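    """Extract an integer 1-5 score for ``key`` from the model output.

    First try to parse the first ``{...}`` span as JSON and read ``key``;
    if that fails, fall back to matching a ``"key": digit`` pair anywhere
    in the text.

    >>> parse_score('{"gravity": 4}', "gravity")
    4
    """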
    match = re.search(r"\{.*?\}", text, flags=re.S)
    if match:
        try:
            obj = json.loads(match.group(0))
            value = obj.get(key)
            if isinstance(value, int) and 1 <= value <= 5:
                return value
        except json.JSONDecodeError:
            pass
    match = re.search(rf'"?{re.escape(key)}"?\s*:\s*([1-5])', text)
    if match:
        return int(match.group(1))
    return None


def dtype_from_name(name: str) -> torch.dtype:
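    """Map a --dtype CLI name to the corresponding torch dtype."""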
| if name == "bfloat16": |
| return torch.bfloat16 |
| if name == "float16": |
| return torch.float16 |
| if name == "float32": |
| return torch.float32 |
| raise ValueError(f"unsupported dtype: {name}") |
|
|
|
|
| def main() -> None: |
| parser = argparse.ArgumentParser(description="Infer with the anonymous judge adapter.") |
| parser.add_argument( |
| "--adapter-dir", |
| default="anonymouscla/physground-judger9B", |
| help=( |
| "Local directory with adapter_config.json + adapter_model.safetensors " |
| "+ subq+human.yaml, or a HF Hub repo id " |
| "(default: anonymouscla/physground-judger9B)." |
| ), |
| ) |
| parser.add_argument("--video", required=True, type=Path) |
| parser.add_argument("--caption", required=True) |
| group = parser.add_mutually_exclusive_group(required=True) |
| group.add_argument("--metric", choices=["SA", "PTV", "persistence"]) |
| group.add_argument("--law", choices=sorted(PHYSICAL_CRITERIA)) |
| parser.add_argument("--criteria", help="Override physical-law criterion text.") |
| parser.add_argument("--max-new-tokens", type=int, default=64) |
| parser.add_argument("--temperature", type=float, default=0.0) |
| parser.add_argument("--fps", type=float, default=2.0) |
| parser.add_argument("--max-pixels", type=int, default=360 * 640) |
| parser.add_argument("--dtype", choices=["bfloat16", "float16", "float32"], default="bfloat16") |
| parser.add_argument("--device-map", default="auto") |
| parser.add_argument("--print-prompt", action="store_true") |
| args = parser.parse_args() |
|
|
| if not args.video.is_file(): |
| raise FileNotFoundError(args.video) |
|
|
| dtype = dtype_from_name(args.dtype) |
| processor, model, adapter_dir = load_model( |
| args.adapter_dir, dtype=dtype, device_map=args.device_map |
| ) |
|
|
| prompt_cfg = load_yaml(adapter_dir / "subq+human.yaml") |
| system_prompt, user_prompt, score_key = build_prompt( |
| prompt_cfg, |
| args.caption, |
| metric=args.metric, |
| law=args.law, |
| criteria=args.criteria, |
| ) |
|
|
| if args.print_prompt: |
| print("SYSTEM:") |
| print(system_prompt) |
| print("\nUSER:") |
| print(user_prompt) |
| print() |
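
    # With --device-map auto the model may span several devices; place the
    # inputs on the device of the model's first parameter.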
    device = next(model.parameters()).device
    messages = build_messages(system_prompt, user_prompt, args.video)
    inputs = prepare_inputs(
        processor,
        messages,
        device,
        fps=args.fps,
        max_pixels=args.max_pixels,
    )

    # Greedy decoding by default; sampling is enabled only for a positive
    # temperature, and None values are filtered out so generate() never
    # receives an unused temperature argument.
    generation_kwargs: dict[str, Any] = {
        "max_new_tokens": args.max_new_tokens,
        "do_sample": args.temperature > 0,
        "temperature": args.temperature if args.temperature > 0 else None,
    }
    generation_kwargs = {k: v for k, v in generation_kwargs.items() if v is not None}

    with torch.inference_mode():
        generated_ids = model.generate(**inputs, **generation_kwargs)

    raw = decode_generated(processor, inputs, generated_ids)
    score = parse_score(raw, score_key)
    print(json.dumps({"key": score_key, "score": score, "raw": raw}, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()