Restore all project files from original repo

Files changed (8) hide show

build_index.py +316 -0
compute_stats.py +98 -0
eval_kitchen.py +263 -0
eval_sim.py +171 -0
filtered_index.json +0 -0
infer_so101.py +223 -0
norm_stats.json +38 -0
so100_dataset.py +325 -0

build_index.py ADDED Viewed

	@@ -0,0 +1,316 @@

+#!/usr/bin/env python3
+"""
+Build a filtered training index from community_dataset_v3 on disk.
+Applies:
+  - Robot type filter (so100/so101 variants only)
+  - Schema filter (2 cameras, 6-DOF, 30fps)
+  - Episode length filter (5s-60s)
+  - Per-task cap (default 200)
+  - Per-contributor cap (default 200)
+  - Excludes datasets with file count mismatches
+Outputs filtered_index.json with all info needed to train.
+"""
+import argparse
+import glob
+import json
+import random
+from collections import defaultdict
+from pathlib import Path
+import av
+import pandas as pd
+def get_video_duration(video_path: Path) -> float:
+    """Get video duration in seconds by reading container metadata (fast, no decoding)."""
+    try:
+        container = av.open(str(video_path))
+        stream = container.streams.video[0]
+        duration = float(stream.duration * stream.time_base)
+        container.close()
+        return duration
+    except Exception:
+        return 0.0
+def load_dataset_meta(dataset_root: Path) -> dict | None:
+    """Load and validate a single dataset's metadata."""
+    info_path = dataset_root / "meta" / "info.json"
+    if not info_path.exists():
+        return None
+    info = json.load(open(info_path))
+    # Robot type filter
+    robot = info.get("robot_type", "")
+    if robot not in ("so100", "so101", "so100_follower", "so101_follower"):
+        return None
+    # Schema filter: exactly the 2-camera, 6-DOF schema
+    features = info.get("features", {})
+    expected_keys = {
+        "action", "episode_index", "frame_index", "index",
+        "observation.images.image", "observation.images.image2",
+        "observation.state", "task_index", "timestamp",
+    }
+    if set(features.keys()) != expected_keys:
+        return None
+    # Dimension check
+    if features.get("action", {}).get("shape") != [6]:
+        return None
+    if features.get("observation.state", {}).get("shape") != [6]:
+        return None
+    # FPS check
+    if info.get("fps") != 30:
+        return None
+    # Resolution check
+    for cam_key in ("observation.images.image", "observation.images.image2"):
+        shape = features.get(cam_key, {}).get("shape", [])
+        if len(shape) < 2 or shape[0] != 480 or shape[1] != 640:
+            return None
+    # Load tasks
+    tasks_path = dataset_root / "meta" / "tasks.jsonl"
+    tasks = {}
+    if tasks_path.exists():
+        for line in open(tasks_path):
+            line = line.strip()
+            if line:
+                t = json.loads(line)
+                tasks[t["task_index"]] = t["task"]
+    # Integrity check: video and parquet file counts
+    total_eps = info.get("total_episodes", 0)
+    vids = glob.glob(str(dataset_root / "videos" / "**" / "*.mp4"), recursive=True)
+    parquets = glob.glob(str(dataset_root / "data" / "**" / "*.parquet"), recursive=True)
+    expected_vids = total_eps * 2  # 2 cameras
+    if len(vids) != expected_vids or len(parquets) != total_eps:
+        return None
+    # Load episode metadata if available
+    episodes = []
+    ep_jsonl = dataset_root / "meta" / "episodes.jsonl"
+    if ep_jsonl.exists():
+        for line in open(ep_jsonl):
+            line = line.strip()
+            if line:
+                episodes.append(json.loads(line))
+    return {
+        "robot_type": robot,
+        "total_episodes": total_eps,
+        "total_frames": info.get("total_frames", 0),
+        "fps": info["fps"],
+        "tasks": tasks,
+        "episodes": episodes,
+        "features": {k: v.get("shape") for k, v in features.items()},
+    }
+def build_index(
+    data_root: Path,
+    max_per_task: int = 200,
+    max_per_contributor: int = 200,
+    min_episode_frames: int = 150,
+    max_episode_frames: int = 1800,
+    seed: int = 42,
+) -> dict:
+    """Build filtered training index."""
+    rng = random.Random(seed)
+    # Discover all contributor/dataset pairs
+    contributors = sorted([
+        d for d in data_root.iterdir()
+        if d.is_dir() and not d.name.startswith(".")
+    ])
+    # Phase 1: Load all valid datasets
+    all_episodes = []  # (contributor, dataset_name, episode_idx, task, num_frames)
+    datasets_passed = 0
+    datasets_rejected = 0
+    skipped_missing = 0
+    skipped_video_mismatch = 0
+    for contrib_dir in contributors:
+        if not contrib_dir.is_dir():
+            continue
+        contributor = contrib_dir.name
+        for ds_dir in sorted(contrib_dir.iterdir()):
+            if not ds_dir.is_dir():
+                continue
+            meta = load_dataset_meta(ds_dir)
+            if meta is None:
+                datasets_rejected += 1
+                continue
+            datasets_passed += 1
+            dataset_name = f"{contributor}/{ds_dir.name}"
+            # Default task if none specified
+            if not meta["tasks"]:
+                meta["tasks"] = {0: "(no task)"}
+            # Build episode list by reading actual parquet files
+            # Trust the parquet row count, not metadata
+            for ep_idx in range(meta["total_episodes"]):
+                parquet_path = ds_dir / f"data/chunk-000/episode_{ep_idx:06d}.parquet"
+                if not parquet_path.exists():
+                    skipped_missing += 1
+                    continue
+                # Read actual row count and timestamps from parquet
+                pf_full = pd.read_parquet(parquet_path, columns=["frame_index", "timestamp"])
+                actual_length = len(pf_full)
+                if actual_length < min_episode_frames or actual_length > max_episode_frames:
+                    continue
+                # Also verify both video files exist
+                vid1 = ds_dir / f"videos/chunk-000/observation.images.image/episode_{ep_idx:06d}.mp4"
+                vid2 = ds_dir / f"videos/chunk-000/observation.images.image2/episode_{ep_idx:06d}.mp4"
+                if not vid1.exists() or not vid2.exists():
+                    skipped_missing += 1
+                    continue
+                # Verify video duration covers all parquet timestamps
+                # The last frame's timestamp must be within the video duration
+                last_timestamp = float(pf_full["timestamp"].iloc[-1])
+                vid1_duration = get_video_duration(vid1)
+                vid2_duration = get_video_duration(vid2)
+                min_vid_duration = min(vid1_duration, vid2_duration)
+                if min_vid_duration > 0 and last_timestamp > min_vid_duration:
+                    # Video is shorter than parquet claims — truncate to what the video covers
+                    # Find the last frame index where timestamp <= video duration
+                    valid_mask = pf_full["timestamp"] <= min_vid_duration
+                    actual_length = int(valid_mask.sum())
+                    if actual_length < min_episode_frames:
+                        skipped_video_mismatch += 1
+                        continue
+                # Get task from episodes.jsonl if available, else default
+                task_idx = 0
+                if meta["episodes"]:
+                    for ep_meta in meta["episodes"]:
+                        if ep_meta.get("episode_index") == ep_idx:
+                            task_idx = ep_meta.get("task_index", 0)
+                            break
+                task = meta["tasks"].get(task_idx, "(no task)")
+                all_episodes.append((contributor, dataset_name, ep_idx, task, actual_length))
+    print(f"Datasets: {datasets_passed} passed, {datasets_rejected} rejected")
+    print(f"Episodes verified: {len(all_episodes)}, skipped missing: {skipped_missing}, skipped video mismatch: {skipped_video_mismatch}")
+    print(f"Episodes before caps: {len(all_episodes)}")
+    # Phase 2: Apply per-task cap
+    task_buckets = defaultdict(list)
+    for ep in all_episodes:
+        task_buckets[ep[3]].append(ep)
+    after_task_cap = []
+    tasks_capped = 0
+    for task, eps in task_buckets.items():
+        rng.shuffle(eps)
+        if len(eps) > max_per_task:
+            tasks_capped += 1
+        after_task_cap.extend(eps[:max_per_task])
+    print(f"Episodes after per-task cap ({max_per_task}): {len(after_task_cap)} ({tasks_capped} tasks capped)")
+    # Phase 3: Apply per-contributor cap
+    contrib_buckets = defaultdict(list)
+    for ep in after_task_cap:
+        contrib_buckets[ep[0]].append(ep)
+    final_episodes = []
+    contribs_capped = 0
+    for contributor, eps in contrib_buckets.items():
+        rng.shuffle(eps)
+        if len(eps) > max_per_contributor:
+            contribs_capped += 1
+        final_episodes.extend(eps[:max_per_contributor])
+    print(f"Episodes after per-contributor cap ({max_per_contributor}): {len(final_episodes)} ({contribs_capped} contributors capped)")
+    # Phase 4: Build the index
+    # Sort for determinism
+    final_episodes.sort(key=lambda x: (x[1], x[2]))
+    # Collect unique tasks
+    unique_tasks = sorted(set(ep[3] for ep in final_episodes))
+    task_to_idx = {t: i for i, t in enumerate(unique_tasks)}
+    # Collect unique datasets used
+    datasets_used = sorted(set(ep[1] for ep in final_episodes))
+    # Build episode entries
+    entries = []
+    total_frames = 0
+    for contributor, dataset_name, ep_idx, task, num_frames in final_episodes:
+        entries.append({
+            "dataset": dataset_name,
+            "episode_index": ep_idx,
+            "task": task,
+            "task_index": task_to_idx[task],
+            "num_frames": num_frames,
+        })
+        total_frames += num_frames
+    index = {
+        "source_repo": "HuggingFaceVLA/community_dataset_v3",
+        "filters": {
+            "max_per_task": max_per_task,
+            "max_per_contributor": max_per_contributor,
+            "min_episode_frames": min_episode_frames,
+            "max_episode_frames": max_episode_frames,
+            "seed": seed,
+        },
+        "summary": {
+            "datasets": len(datasets_used),
+            "episodes": len(entries),
+            "unique_tasks": len(unique_tasks),
+            "total_frames": total_frames,
+            "est_hours": total_frames / 30 / 3600,
+        },
+        "tasks": unique_tasks,
+        "datasets_used": datasets_used,
+        "episodes": entries,
+    }
+    return index
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-root", type=Path, default=Path.home() / "lap" / "community_dataset_v3")
+    parser.add_argument("--output", type=Path, default=Path(__file__).parent / "filtered_index.json")
+    parser.add_argument("--max-per-task", type=int, default=200)
+    parser.add_argument("--max-per-contributor", type=int, default=200)
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+    index = build_index(
+        args.data_root,
+        max_per_task=args.max_per_task,
+        max_per_contributor=args.max_per_contributor,
+        seed=args.seed,
+    )
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    with open(args.output, "w") as f:
+        json.dump(index, f, indent=2)
+    print(f"\nSaved to {args.output}")
+    print(f"  Datasets: {index['summary']['datasets']}")
+    print(f"  Episodes: {index['summary']['episodes']}")
+    print(f"  Tasks: {index['summary']['unique_tasks']}")
+    print(f"  Frames: {index['summary']['total_frames']:,}")
+    print(f"  Est. hours: {index['summary']['est_hours']:.1f}")

compute_stats.py ADDED Viewed

	@@ -0,0 +1,98 @@

+#!/usr/bin/env python3
+"""
+Compute normalization statistics (mean/std) for state and action across the filtered dataset.
+Only reads parquet files — no video decoding, so it's fast.
+"""
+import argparse
+import json
+import time
+from pathlib import Path
+import numpy as np
+import pandas as pd
+def compute_stats(data_root: Path, index_path: Path) -> dict:
+    with open(index_path) as f:
+        index = json.load(f)
+    # Collect all unique (dataset, episode) pairs
+    episode_set = set()
+    for ep in index["episodes"]:
+        episode_set.add((ep["dataset"], ep["episode_index"]))
+    print(f"Computing stats from {len(episode_set)} episodes...")
+    # Online mean/variance computation (Welford's algorithm)
+    state_sum = np.zeros(6, dtype=np.float64)
+    state_sq_sum = np.zeros(6, dtype=np.float64)
+    action_sum = np.zeros(6, dtype=np.float64)
+    action_sq_sum = np.zeros(6, dtype=np.float64)
+    n_state = 0
+    n_action = 0
+    start = time.time()
+    for i, (dataset, ep_idx) in enumerate(sorted(episode_set)):
+        parquet_path = data_root / dataset / f"data/chunk-000/episode_{ep_idx:06d}.parquet"
+        if not parquet_path.exists():
+            continue
+        df = pd.read_parquet(parquet_path)
+        states = np.stack(df["observation.state"].values).astype(np.float64)
+        actions = np.stack(df["action"].values).astype(np.float64)
+        state_sum += states.sum(axis=0)
+        state_sq_sum += (states ** 2).sum(axis=0)
+        n_state += len(states)
+        action_sum += actions.sum(axis=0)
+        action_sq_sum += (actions ** 2).sum(axis=0)
+        n_action += len(actions)
+        if (i + 1) % 1000 == 0:
+            elapsed = time.time() - start
+            rate = (i + 1) / elapsed
+            eta = (len(episode_set) - i - 1) / rate
+            print(f"  [{i+1}/{len(episode_set)}] {rate:.0f} eps/s, ETA: {eta:.0f}s")
+    state_mean = state_sum / n_state
+    state_std = np.sqrt(state_sq_sum / n_state - state_mean ** 2)
+    action_mean = action_sum / n_action
+    action_std = np.sqrt(action_sq_sum / n_action - action_mean ** 2)
+    elapsed = time.time() - start
+    print(f"Done in {elapsed:.1f}s ({n_state:,} state frames, {n_action:,} action frames)")
+    print(f"\nState mean: {state_mean}")
+    print(f"State std:  {state_std}")
+    print(f"Action mean: {action_mean}")
+    print(f"Action std:  {action_std}")
+    stats = {
+        "observation.state": {
+            "mean": state_mean.tolist(),
+            "std": state_std.tolist(),
+        },
+        "action": {
+            "mean": action_mean.tolist(),
+            "std": action_std.tolist(),
+        },
+    }
+    return stats
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-root", type=Path, default=Path.home() / "lap" / "community_dataset_v3")
+    parser.add_argument("--index", type=Path, default=Path(__file__).parent / "filtered_index.json")
+    parser.add_argument("--output", type=Path, default=Path(__file__).parent / "norm_stats.json")
+    args = parser.parse_args()
+    stats = compute_stats(args.data_root, args.index)
+    with open(args.output, "w") as f:
+        json.dump(stats, f, indent=2)
+    print(f"\nSaved to {args.output}")

eval_kitchen.py ADDED Viewed

	@@ -0,0 +1,263 @@

+#!/usr/bin/env python3
+"""
+Evaluate Pi0.5 checkpoints in the RoboCasa kitchen sim.
+Compares base model vs finetuned model side by side.
+The policy is loaded on load_policy()'s default device ("cuda"); MuJoCo renders headlessly via EGL.
+Usage:
+  python eval_kitchen.py --checkpoint /mnt/hdd/pi05-training/full_run/checkpoints/004000/pretrained_model
+  python eval_kitchen.py --checkpoint lerobot/pi05_base  # base model comparison
+  python eval_kitchen.py --compare  # run both and save side-by-side
+"""
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+# EGL rendering for headless MuJoCo
+os.environ["MUJOCO_GL"] = "egl"
+import imageio
+import numpy as np
+import torch
+sys.path.insert(0, str(Path(__file__).parent))
+sys.path.insert(0, str(Path.home() / "lerobot" / "src"))
+sys.path.insert(0, "/mnt/hdd/pi05-training/robocasa_test")
+from so100_kitchen_env import SO100KitchenEnv
+def load_policy(checkpoint_path, device="cuda"):
+    """Load Pi0.5 policy."""
+    from lerobot.policies.pi05.modeling_pi05 import PI05Policy
+    print(f"Loading policy from {checkpoint_path} ({device})...")
+    policy = PI05Policy.from_pretrained(str(checkpoint_path))
+    policy = policy.to(device)
+    policy.eval()
+    return policy
+def build_batch(env_obs, camera_image, task, stats, device="cuda"):
+    """Convert kitchen env observation to Pi0.5 batch format."""
+    import torchvision.transforms.functional as TF
+    # Image: (H, W, 3) uint8 -> (1, 3, 224, 224) float32
+    image = torch.from_numpy(camera_image).permute(2, 0, 1).float() / 255.0
+    image = image.unsqueeze(0)
+    image_224 = TF.resize(image, [224, 224], antialias=True)
+    # ImageNet normalization
+    mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
+    std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
+    image_224 = (image_224 - mean) / std
+    # State: joint positions in radians -> degrees (LeRobot scale), then normalize
+    joint_pos = env_obs["joint_pos"]
+    state_degrees = np.degrees(joint_pos)
+    state = torch.tensor(state_degrees, dtype=torch.float32).unsqueeze(0)
+    state_mean = torch.tensor(stats["observation.state"]["mean"], dtype=torch.float32)
+    state_std = torch.tensor(stats["observation.state"]["std"], dtype=torch.float32)
+    state = (state - state_mean) / (state_std + 1e-8)
+    # Pad to 32 dims
+    state_padded = torch.zeros(1, 32)
+    state_padded[:, :6] = state
+    # Tokenize
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained("google/paligemma-3b-pt-224")
+    state_discrete = ((state[0].clamp(-1, 1) + 1) / 2 * 255).int()
+    state_str = " ".join(str(v.item()) for v in state_discrete)
+    prompt = f"Task: {task}, State: {state_str};\nAction: "
+    tokens = tokenizer(
+        prompt, padding="max_length", max_length=200,
+        truncation=True, return_tensors="pt",
+    )
+    return {
+        "observation.images.base_0_rgb": image_224.to(device),
+        "observation.images.left_wrist_0_rgb": image_224.to(device),
+        "observation.state": state_padded.to(device),
+        "observation.language.tokens": tokens["input_ids"].to(device),
+        "observation.language.attention_mask": tokens["attention_mask"].bool().to(device),
+    }
+def decode_actions(raw_actions, stats):
+    """Convert model output to joint angle radians."""
+    actions = raw_actions[0, :, :6].cpu().numpy()
+    action_mean = np.array(stats["action"]["mean"])
+    action_std = np.array(stats["action"]["std"])
+    actions = actions * action_std + action_mean
+    return np.radians(actions)
+def run_episode(policy, env, task, stats, num_steps=200, camera="robot_workspace", show_live=True):
+    """Run one episode, return frames and joint trajectories."""
+    obs = env.reset()
+    frames = []
+    joint_history = []
+    chunk_actions = None
+    chunk_idx = 0
+    for step in range(num_steps):
+        if chunk_actions is None or chunk_idx >= len(chunk_actions):
+            camera_image = env.render(camera)
+            with torch.no_grad():
+                batch = build_batch(obs, camera_image, task, stats, device=next(policy.parameters()).device)
+                action = policy.select_action(batch)
+                chunk_actions = decode_actions(action.unsqueeze(0), stats)
+                chunk_idx = 0
+        action = chunk_actions[chunk_idx]
+        chunk_idx += 1
+        obs, reward, done, info = env.step(action)
+        frame = env.render(camera)
+        frames.append(frame)
+        joint_history.append(obs["joint_pos"].copy())
+        # Live display via cv2 (static camera)
+        if show_live:
+            try:
+                import cv2
+                cv2.imshow("SO-100 Kitchen Sim", cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+                if cv2.waitKey(1) & 0xFF == ord('q'):
+                    print("Quit by user")
+                    break
+            except Exception:
+                pass
+        if step % 25 == 0:
+            pos = obs["joint_pos"]
+            print(f"  step {step:>3}: joints=[{pos[0]:.2f} {pos[1]:.2f} {pos[2]:.2f} {pos[3]:.2f} {pos[4]:.2f} {pos[5]:.3f}]")
+    return frames, np.array(joint_history)
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--checkpoint", type=str, default=None)
+    parser.add_argument("--task", type=str, default="pick up the mug and place it on the plate")
+    parser.add_argument("--steps", type=int, default=200)
+    parser.add_argument("--output-dir", type=str, default="/mnt/hdd/pi05-training/eval_kitchen")
+    parser.add_argument("--compare", action="store_true", help="Run base vs finetuned comparison")
+    parser.add_argument("--viewer", action="store_true", help="Use MuJoCo interactive viewer (mouse orbit/pan/zoom)")
+    parser.add_argument("--finetuned-checkpoint", type=str,
+                        default="/mnt/hdd/pi05-training/full_run/checkpoints/004000/pretrained_model")
+    args = parser.parse_args()
+    os.makedirs(args.output_dir, exist_ok=True)
+    with open(Path(__file__).parent / "norm_stats.json") as f:
+        stats = json.load(f)
+    env = SO100KitchenEnv()
+    if args.viewer:
+        # Interactive MuJoCo viewer with mouse controls
+        import mujoco.viewer
+        import time as _time
+        policy = load_policy(args.checkpoint or "lerobot/pi05_base")
+        obs = env.reset()
+        chunk_actions = None
+        chunk_idx = 0
+        device = next(policy.parameters()).device
+        print(f"Launching interactive viewer. Task: '{args.task}'")
+        print("Mouse: Left=rotate, Right=pan, Scroll=zoom")
+        print("Close window to exit.")
+        viewer = mujoco.viewer.launch_passive(env.model, env.data)
+        step = 0
+        while viewer.is_running():
+            # Get action from policy
+            if chunk_actions is None or chunk_idx >= len(chunk_actions):
+                camera_image = env.render("overview")
+                with torch.no_grad():
+                    batch = build_batch(obs, camera_image, args.task, stats, device=device)
+                    action = policy.select_action(batch)
+                    chunk_actions = decode_actions(action.unsqueeze(0), stats)
+                    chunk_idx = 0
+            act = chunk_actions[chunk_idx]
+            chunk_idx += 1
+            # Apply action to actuators
+            from so100_kitchen_env import JOINT_NAMES
+            for i, name in enumerate(JOINT_NAMES):
+                aid = env.actuator_ids.get(name)
+                if aid is not None:
+                    env.data.ctrl[aid] = act[i]
+            # Step physics
+            mujoco.mj_step(env.model, env.data)
+            viewer.sync()
+            # Update obs
+            joint_pos = np.array([env.data.qpos[env.model.jnt_qposadr[env.joint_ids[n]]] for n in JOINT_NAMES])
+            obs = {"joint_pos": joint_pos}
+            step += 1
+            if step % 50 == 0:
+                print(f"  step {step}: joints=[{' '.join(f'{j:.2f}' for j in joint_pos)}]")
+            _time.sleep(0.02)  # ~50Hz
+        viewer.close()
+    elif args.compare:
+        # Run both base and finetuned
+        print("=== BASE MODEL ===")
+        base_policy = load_policy("lerobot/pi05_base")
+        base_frames, base_joints = run_episode(base_policy, env, args.task, stats, args.steps)
+        del base_policy
+        print("\n=== FINETUNED MODEL ===")
+        ft_policy = load_policy(args.finetuned_checkpoint)
+        ft_frames, ft_joints = run_episode(ft_policy, env, args.task, stats, args.steps)
+        del ft_policy
+        # Save videos
+        imageio.mimsave(f"{args.output_dir}/base_model.mp4", base_frames, fps=25)
+        imageio.mimsave(f"{args.output_dir}/finetuned_model.mp4", ft_frames, fps=25)
+        # Save side-by-side frames at key timesteps
+        for t in [0, 50, 100, 150, 199]:
+            if t < len(base_frames) and t < len(ft_frames):
+                combined = np.concatenate([base_frames[t], ft_frames[t]], axis=1)
+                imageio.imwrite(f"{args.output_dir}/compare_step_{t:03d}.png", combined)
+        # Print joint trajectory summary
+        print("\n=== COMPARISON ===")
+        print(f"Base model - joint range: {base_joints.min(axis=0)} to {base_joints.max(axis=0)}")
+        print(f"Finetuned  - joint range: {ft_joints.min(axis=0)} to {ft_joints.max(axis=0)}")
+        print(f"Base model - total motion: {np.abs(np.diff(base_joints, axis=0)).sum():.2f} rad")
+        print(f"Finetuned  - total motion: {np.abs(np.diff(ft_joints, axis=0)).sum():.2f} rad")
+        print(f"\nSaved to {args.output_dir}/")
+    elif args.checkpoint:
+        policy = load_policy(args.checkpoint)
+        frames, joints = run_episode(policy, env, args.task, stats, args.steps)
+        name = Path(args.checkpoint).parent.name if "checkpoint" in args.checkpoint else "model"
+        imageio.mimsave(f"{args.output_dir}/{name}.mp4", frames, fps=25)
+        for t in [0, len(frames)//2, len(frames)-1]:
+            imageio.imwrite(f"{args.output_dir}/{name}_step_{t:03d}.png", frames[t])
+        print(f"Saved {len(frames)} frames to {args.output_dir}/")
+    else:
+        print("Specify --checkpoint or --compare")
+if __name__ == "__main__":
+    main()

eval_sim.py ADDED Viewed

	@@ -0,0 +1,171 @@

+#!/usr/bin/env python3
+"""
+Evaluate a Pi0.5 checkpoint in the SO-100 MuJoCo sim.
+Renders a video of the model controlling the arm.
+Usage:
+  python eval_sim.py --checkpoint outputs/scale_up_1k/checkpoints/000500/pretrained_model
+  python eval_sim.py --checkpoint lerobot/pi05_base  # test base model
+"""
+import argparse
+import sys
+from pathlib import Path
+import imageio
+import numpy as np
+import torch
+sys.path.insert(0, str(Path(__file__).parent))
+sys.path.insert(0, str(Path.home() / "lerobot" / "src"))
+from gym_so100.env import SO100Env
+from gym_so100.constants import normalize_lerobot_to_gym_so100
+def load_policy(checkpoint_path, device="cuda"):
+    """Load Pi0.5 policy from checkpoint."""
+    from lerobot.policies.pi05.modeling_pi05 import PI05Policy
+    print(f"Loading policy from {checkpoint_path}...")
+    policy = PI05Policy.from_pretrained(str(checkpoint_path))
+    policy = policy.to(device)
+    policy.eval()
+    return policy
+def build_batch(obs, task, stats, device="cuda"):
+    """Convert sim observation to Pi0.5 batch format."""
+    # Image: sim gives (H, W, 3) uint8 -> (1, 3, H, W) float32 [0,1]
+    image = torch.from_numpy(obs["pixels"]).permute(2, 0, 1).float() / 255.0
+    image = image.unsqueeze(0)  # add batch dim
+    # Resize to 224x224
+    import torchvision.transforms.functional as TF
+    image_224 = TF.resize(image, [224, 224], antialias=True)
+    # ImageNet normalization
+    mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
+    std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
+    image_224 = (image_224 - mean) / std
+    # State: sim gives radians, convert to degrees (LeRobot scale)
+    agent_pos = obs["agent_pos"].copy()
+    agent_pos_degrees = np.degrees(agent_pos)
+    state = torch.tensor(agent_pos_degrees, dtype=torch.float32).unsqueeze(0)
+    # Normalize state with our stats
+    state_mean = torch.tensor(stats["observation.state"]["mean"], dtype=torch.float32)
+    state_std = torch.tensor(stats["observation.state"]["std"], dtype=torch.float32)
+    state = (state - state_mean) / (state_std + 1e-8)
+    # Pad state to 32 dims
+    state_padded = torch.zeros(1, 32)
+    state_padded[:, :6] = state
+    # Tokenize task
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained("google/paligemma-3b-pt-224")
+    # Discretize state for prompt (Pi0.5 format)
+    state_discrete = ((state[0].clamp(-1, 1) + 1) / 2 * 255).int()
+    state_str = " ".join(str(v.item()) for v in state_discrete)
+    prompt = f"Task: {task}, State: {state_str};\nAction: "
+    tokens = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=200,
+        truncation=True,
+        return_tensors="pt",
+    )
+    batch = {
+        "observation.images.base_0_rgb": image_224.to(device),
+        "observation.images.left_wrist_0_rgb": image_224.to(device),
+        "observation.state": state_padded.to(device),
+        "observation.language.tokens": tokens["input_ids"].to(device),
+        "observation.language.attention_mask": tokens["attention_mask"].bool().to(device),
+    }
+    return batch
+def decode_actions(raw_actions, stats):
+    """Convert model output actions back to LeRobot scale, then to sim radians."""
+    actions = raw_actions[0, :, :6].cpu().numpy()  # (chunk_size, 6)
+    # Unnormalize from MEAN_STD
+    action_mean = np.array(stats["action"]["mean"])
+    action_std = np.array(stats["action"]["std"])
+    actions = actions * action_std + action_mean
+    # Now in LeRobot degree-scale. Convert to radians for sim.
+    actions_rad = np.radians(actions)
+    return actions_rad
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--checkpoint", type=str, required=True)
+    parser.add_argument("--task", type=str, default="pick up the cube and place it in the bin")
+    parser.add_argument("--steps", type=int, default=200)
+    parser.add_argument("--output", type=str, default="sim_eval.mp4")
+    parser.add_argument("--device", type=str, default="cuda")
+    args = parser.parse_args()
+    import json
+    with open(Path(__file__).parent / "norm_stats.json") as f:
+        stats = json.load(f)
+    # Load policy
+    policy = load_policy(args.checkpoint, args.device)
+    # Create sim
+    env = SO100Env(task="so100_cube_to_bin", obs_type="so100_pixels_agent_pos")
+    obs, info = env.reset()
+    frames = []
+    print(f"Running {args.steps} sim steps with task: '{args.task}'")
+    chunk_actions = None
+    chunk_idx = 0
+    for step in range(args.steps):
+        # Get new action chunk from policy every N steps
+        if chunk_actions is None or chunk_idx >= len(chunk_actions):
+            with torch.no_grad():
+                batch = build_batch(obs, args.task, stats, args.device)
+                action = policy.select_action(batch)
+                chunk_actions = decode_actions(action.unsqueeze(0), stats)
+                chunk_idx = 0
+        # Apply one action from the chunk
+        action = chunk_actions[chunk_idx]
+        chunk_idx += 1
+        # Normalize radians to sim's [-1, 1] action space
+        joint_mins = np.array([-1.92, -3.32, -0.174, -1.66, -2.79, -0.174])
+        joint_maxs = np.array([1.92, 0.174, 3.14, 1.66, 2.79, 1.75])
+        sim_action = 2.0 * (action - joint_mins) / (joint_maxs - joint_mins) - 1.0
+        sim_action = np.clip(sim_action, -1.0, 1.0)
+        obs, reward, terminated, truncated, info = env.step(sim_action.astype(np.float32))
+        frame = env.render()
+        frames.append(frame)
+        if step % 20 == 0:
+            pos = obs["agent_pos"]
+            print(f"  step {step:>3}: pos=[{pos[0]:.2f} {pos[1]:.2f} {pos[2]:.2f} {pos[3]:.2f} {pos[4]:.2f} {pos[5]:.3f}] reward={reward:.3f}")
+        if terminated or truncated:
+            print(f"Episode ended at step {step}")
+            break
+    # Save video
+    imageio.mimsave(args.output, frames, fps=25)
+    print(f"Saved {len(frames)} frames to {args.output}")
+if __name__ == "__main__":
+    main()

filtered_index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

infer_so101.py ADDED Viewed

	@@ -0,0 +1,223 @@

+#!/usr/bin/env python3
+"""
+Run Pi0.5 inference on SO-101.
+Uses LeRobot's FeetechMotorsBus with calibration for correct normalization,
+but bypasses lerobot_record's problematic control loop.
+Usage:
+  python infer_so101.py --task "pick up the blue football"
+"""
+import argparse
+import json
+import logging
+import sys
+import time
+from pathlib import Path
+import cv2
+import numpy as np
+import scservo_sdk as scs
+import torch
+# Make modules that live next to this script importable.
+sys.path.insert(0, str(Path(__file__).parent))
+# Also expose a local LeRobot checkout at ~/lerobot/src. It is inserted at
+# index 0, so it takes precedence over every later sys.path entry.
+sys.path.insert(0, str(Path.home() / "lerobot" / "src"))
+# Root logger at WARNING level; main() reports its progress through log.warning().
+logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(message)s', datefmt='%H:%M:%S')
+log = logging.getLogger()
+# Joint names, in the order main() uses for the state vector and action names.
+MOTOR_NAMES = ["shoulder_pan", "shoulder_lift", "elbow_flex", "wrist_flex", "wrist_roll", "gripper"]
+# Servo bus IDs; main() iterates these for its raw torque-off fallback.
+MOTOR_IDS = [1, 2, 3, 4, 5, 6]
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--task", type=str, required=True)
+    parser.add_argument("--checkpoint", type=str,
+                        default="/mnt/hdd/pi05-training/full_run/checkpoints/015000/pretrained_model")
+    parser.add_argument("--port", type=str, default="/dev/ttyACM0")
+    parser.add_argument("--cam-front", type=int, default=2)
+    parser.add_argument("--cam-wrist", type=int, default=0)
+    parser.add_argument("--max-steps", type=int, default=0, help="0 = run until Ctrl+C")
+    args = parser.parse_args()
+    # --- Connect motors using LeRobot's bus (for calibration/normalization) ---
+    from lerobot.motors.feetech.feetech import FeetechMotorsBus
+    from lerobot.motors import Motor, MotorNormMode, MotorCalibration
+    bus = FeetechMotorsBus(
+        port=args.port,
+        motors={
+            'shoulder_pan': Motor(1, 'sts3215', MotorNormMode.RANGE_M100_100),
+            'shoulder_lift': Motor(2, 'sts3215', MotorNormMode.RANGE_M100_100),
+            'elbow_flex': Motor(3, 'sts3215', MotorNormMode.RANGE_M100_100),
+            'wrist_flex': Motor(4, 'sts3215', MotorNormMode.RANGE_M100_100),
+            'wrist_roll': Motor(5, 'sts3215', MotorNormMode.RANGE_M100_100),
+            'gripper': Motor(6, 'sts3215', MotorNormMode.RANGE_0_100),
+        },
+    )
+    bus.connect()
+    # Load calibration
+    cal_path = Path.home() / ".cache/huggingface/lerobot/calibration/robots/so_follower/my_so101.json"
+    cal = json.load(open(cal_path))
+    cal_dict = {name: MotorCalibration(**vals) for name, vals in cal.items()}
+    bus.write_calibration(cal_dict)
+    log.warning("Bus connected with calibration")
+    # Configure motors the same way LeRobot does in so_follower.configure()
+    # This uses torque_disabled() context which disables torque, configures, re-enables
+    with bus.torque_disabled():
+        bus.configure_motors()
+        for motor in bus.motors:
+            bus.write("Operating_Mode", motor, 0)  # Position mode
+            bus.write("P_Coefficient", motor, 16)
+            bus.write("I_Coefficient", motor, 0)
+            bus.write("D_Coefficient", motor, 32)
+            bus.write("Goal_Velocity", motor, 600)  # Slow velocity limit
+            bus.write("Acceleration", motor, 50)     # Gentle acceleration
+            if motor == "gripper":
+                bus.write("Max_Torque_Limit", motor, 500)
+                bus.write("Protection_Current", motor, 250)
+                bus.write("Overload_Torque", motor, 25)
+    # torque_disabled() re-enables torque on exit
+    # Velocity and acceleration limits prevent snapping
+    log.warning("Motors configured and torque enabled (velocity/accel limited)")
+    # --- Open cameras ---
+    cap_front = cv2.VideoCapture(args.cam_front)
+    cap_wrist = cv2.VideoCapture(args.cam_wrist)
+    for cap in [cap_front, cap_wrist]:
+        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
+        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
+    log.warning("Cameras open")
+    # --- Load policy + preprocessor + postprocessor ---
+    from lerobot.policies.factory import make_pre_post_processors
+    from lerobot.policies.utils import prepare_observation_for_inference, make_robot_action
+    from lerobot.configs.policies import PreTrainedConfig
+    from lerobot.processor.rename_processor import rename_stats
+    from lerobot.policies.pi05.modeling_pi05 import PI05Policy
+    log.warning("Loading Pi0.5...")
+    policy_cfg = PreTrainedConfig.from_pretrained(args.checkpoint)
+    policy_cfg.pretrained_path = Path(args.checkpoint)
+    policy = PI05Policy.from_pretrained(args.checkpoint)
+    policy = policy.to("cuda")
+    policy.eval()
+    policy.reset()
+    # Build stats from checkpoint's saved preprocessor
+    rename_map = {
+        "observation.images.front": "observation.images.base_0_rgb",
+        "observation.images.wrist": "observation.images.left_wrist_0_rgb",
+    }
+    preprocessor, postprocessor = make_pre_post_processors(
+        policy_cfg=policy_cfg,
+        pretrained_path=policy_cfg.pretrained_path,
+        preprocessor_overrides={
+            "device_processor": {"device": "cuda"},
+            "rename_observations_processor": {"rename_map": rename_map},
+        },
+    )
+    action_names = [f"{name}.pos" for name in MOTOR_NAMES]
+    ds_features = {"action": {"names": action_names}}
+    # --- Set up live camera display ---
+    try:
+        import rerun as rr
+        rr.init("so101_inference", spawn=True)
+        use_rerun = True
+        log.warning("Rerun viewer launched — live camera feed")
+    except ImportError:
+        use_rerun = False
+        log.warning("Rerun not available, no live view")
+    log.warning(f"Running: '{args.task}' — Ctrl+C to stop")
+    step = 0
+    try:
+        while args.max_steps == 0 or step < args.max_steps:
+            t0 = time.perf_counter()
+            # 1. Read motor positions (calibrated/normalized by bus)
+            try:
+                pos_dict = bus.sync_read("Present_Position", num_retry=5)
+            except ConnectionError:
+                bus.port_handler.is_using = False
+                bus.port_handler.ser.reset_input_buffer()
+                continue
+            # Build observation dict
+            state_array = np.array([pos_dict[name] for name in MOTOR_NAMES], dtype=np.float32)
+            # 2. Capture camera images
+            ret_f, frame_front = cap_front.read()
+            ret_w, frame_wrist = cap_wrist.read()
+            if not ret_f or not ret_w:
+                continue
+            # Live display
+            if use_rerun:
+                rr.set_time_sequence("step", step)
+                rr.log("camera/front", rr.Image(frame_front))
+                rr.log("camera/wrist", rr.Image(frame_wrist))
+                rr.log("state", rr.BarChart([pos_dict[n] for n in MOTOR_NAMES]))
+            observation = {
+                "observation.images.front": frame_front,
+                "observation.images.wrist": frame_wrist,
+                "observation.state": state_array,
+            }
+            # 3. Inference
+            with torch.inference_mode():
+                obs = prepare_observation_for_inference(
+                    observation, torch.device("cuda"), args.task, "so101_follower"
+                )
+                obs = preprocessor(obs)
+                action = policy.select_action(obs)
+                action = postprocessor(action)
+            # 4. Convert to motor commands
+            robot_action = make_robot_action(action, ds_features)
+            # 5. Send to motors (calibrated/normalized by bus)
+            goal_pos = {name: robot_action[f"{name}.pos"] for name in MOTOR_NAMES}
+            try:
+                bus.sync_write("Goal_Position", goal_pos)
+            except ConnectionError:
+                bus.port_handler.is_using = False
+                bus.port_handler.ser.reset_input_buffer()
+            dt = time.perf_counter() - t0
+            step += 1
+            if step % 10 == 0:
+                pos_str = " ".join(f"{pos_dict[n]:>7.1f}" for n in MOTOR_NAMES)
+                act_str = " ".join(f"{robot_action[f'{n}.pos']:>7.1f}" for n in MOTOR_NAMES)
+                log.warning(f"step {step:>4} | state=[{pos_str}] | action=[{act_str}] | {dt*1000:.0f}ms")
+    except KeyboardInterrupt:
+        log.warning("Stopped by user")
+    finally:
+        log.warning("Disabling torque...")
+        try:
+            bus.disable_torque()
+        except Exception:
+            for mid in MOTOR_IDS:
+                try:
+                    bus.packet_handler.write1ByteTxRx(bus.port_handler, mid, 40, 0)
+                except Exception:
+                    pass
+        bus.disconnect()
+        cap_front.release()
+        cap_wrist.release()
+        log.warning("Done")
+if __name__ == "__main__":
+    main()

norm_stats.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "observation.state": {
+    "mean": [
+      3.2129562341482223,
+      81.25934383631572,
+      97.87567545165706,
+      58.2558965428857,
+      -3.869688922486154,
+      13.552276313577162
+    ],
+    "std": [
+      26.932913188864053,
+      85.10186432539234,
+      60.096302230313775,
+      32.18041942119004,
+      64.69174273514702,
+      17.38995233769721
+    ]
+  },
+  "action": {
+    "mean": [
+      3.2667901525244267,
+      82.01517467950833,
+      96.44080348317482,
+      58.19181662702153,
+      -3.898391972920288,
+      11.117041393936647
+    ],
+    "std": [
+      27.026112586762707,
+      85.80857081004108,
+      60.86058528648729,
+      32.566689386004555,
+      64.99547212544971,
+      17.279498490768535
+    ]
+  }
+}

so100_dataset.py ADDED Viewed

	@@ -0,0 +1,325 @@

+#!/usr/bin/env python3
+"""
+Custom PyTorch Dataset that reads directly from community_dataset_v3 v2.1 files on disk.
+No merging, no conversion, no copying. Just reads parquets + decodes video frames.
+Returns raw (unnormalized) data in the format LeRobotDataset returns — the existing
+Pi0.5 preprocessor handles normalization, padding, tokenization, and device placement.
+Provides a .meta adapter so lerobot_train.py can use it as a drop-in replacement.
+"""
+import json
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import torch
+from torch.utils.data import Dataset
+class _DatasetMeta:
+    """
+    Lightweight adapter that provides the .meta interface lerobot_train.py expects.
+    Wraps our filtered index + precomputed stats.
+    """
+    def __init__(self, index: dict, stats: dict, data_root: Path):
+        self.repo_id = "SO100Dataset/local"
+        self.root = data_root
+        # Stats: training script expects dict[str, dict[str, torch.Tensor]]
+        self.stats = {}
+        for key, s in stats.items():
+            self.stats[key] = {
+                "mean": torch.tensor(s["mean"], dtype=torch.float32),
+                "std": torch.tensor(s["std"], dtype=torch.float32),
+                # Preprocessor may also look for min/max/quantiles.
+                # Approximate them from mean/std for MEAN_STD normalization.
+                "min": torch.tensor(s["mean"], dtype=torch.float32) - 3 * torch.tensor(s["std"], dtype=torch.float32),
+                "max": torch.tensor(s["mean"], dtype=torch.float32) + 3 * torch.tensor(s["std"], dtype=torch.float32),
+            }
+        # Tasks
+        self.tasks = pd.DataFrame(
+            {"task_index": range(len(index["tasks"]))},
+            index=index["tasks"],
+        )
+        # Features
+        self._features = {
+            "observation.images.image": {
+                "dtype": "video",
+                "shape": [3, 480, 640],
+                "names": ["channels", "height", "width"],
+            },
+            "observation.images.image2": {
+                "dtype": "video",
+                "shape": [3, 480, 640],
+                "names": ["channels", "height", "width"],
+            },
+            "observation.state": {
+                "dtype": "float32",
+                "shape": [6],
+            },
+            "action": {
+                "dtype": "float32",
+                "shape": [6],
+            },
+            "timestamp": {"dtype": "float32", "shape": []},
+            "frame_index": {"dtype": "int64", "shape": []},
+            "episode_index": {"dtype": "int64", "shape": []},
+            "index": {"dtype": "int64", "shape": []},
+            "task_index": {"dtype": "int64", "shape": []},
+        }
+        self.info = {
+            "fps": 30,
+            "robot_type": "so100",
+            "total_episodes": index["summary"]["episodes"],
+            "total_frames": index["summary"]["total_frames"],
+        }
+    @property
+    def fps(self):
+        return 30
+    @property
+    def features(self):
+        return self._features
+    @property
+    def camera_keys(self):
+        return ["observation.images.image", "observation.images.image2"]
+    @property
+    def video_keys(self):
+        return ["observation.images.image", "observation.images.image2"]
+    @property
+    def image_keys(self):
+        return []
+    @property
+    def total_episodes(self):
+        return self.info["total_episodes"]
+    @property
+    def total_frames(self):
+        return self.info["total_frames"]
+    @property
+    def robot_type(self):
+        return "so100"
+class SO100Dataset(Dataset):
+    """
+    Loads filtered SO-100/101 episodes from community_dataset_v3 on disk.
+    Each sample is one frame with an action chunk of the next `chunk_size` steps.
+    Returns raw unnormalized data — the Pi0.5 preprocessor handles normalization.
+    Provides .meta property compatible with lerobot_train.py.
+    """
+    def __init__(
+        self,
+        data_root: str | Path,
+        index_path: str | Path,
+        stats_path: str | Path | None = None,
+        video_backend: str = "pyav",
+        chunk_size: int = 50,
+        image_transforms=None,
+    ):
+        self.data_root = Path(data_root)
+        self.video_backend = video_backend
+        self.chunk_size = chunk_size
+        self.image_transforms = image_transforms
+        self.fps = 30
+        # Load index
+        with open(index_path) as f:
+            self._index = json.load(f)
+        self.tasks = self._index["tasks"]
+        # Load stats
+        raw_stats = {}
+        if stats_path and Path(stats_path).exists():
+            with open(stats_path) as f:
+                raw_stats = json.load(f)
+        # Create meta adapter
+        self.meta = _DatasetMeta(self._index, raw_stats, self.data_root)
+        # Build flat frame-level index
+        self._frame_index = []
+        self._episode_offsets = []
+        for ep in self._index["episodes"]:
+            dataset_path = self.data_root / ep["dataset"]
+            ep_idx = ep["episode_index"]
+            task = ep["task"]
+            task_idx = ep["task_index"]
+            num_frames = ep["num_frames"]
+            # Only include frames where a full action chunk fits
+            valid_frames = max(0, num_frames - self.chunk_size)
+            if valid_frames == 0:
+                continue
+            start = len(self._frame_index)
+            self._episode_offsets.append(start)
+            for frame_idx in range(valid_frames):
+                self._frame_index.append((
+                    dataset_path, ep_idx, frame_idx,
+                    num_frames, task, task_idx,
+                ))
+        # Parquet cache
+        self._parquet_cache = {}
+        self._cache_max = 200
+    def __len__(self):
+        return len(self._frame_index)
+    @property
+    def num_episodes(self):
+        return len(self._episode_offsets)
+    @property
+    def num_frames(self):
+        return len(self._frame_index)
+    @property
+    def episodes(self):
+        return None  # Use all episodes (no further filtering)
+    @property
+    def features(self):
+        return self.meta.features
+    @property
+    def video(self):
+        return True
+    @property
+    def camera_keys(self):
+        return self.meta.camera_keys
+    @property
+    def video_frame_keys(self):
+        return self.meta.camera_keys
+    def _load_parquet(self, dataset_path: Path, episode_index: int) -> pd.DataFrame:
+        """Load and cache a parquet file."""
+        key = (str(dataset_path), episode_index)
+        if key in self._parquet_cache:
+            return self._parquet_cache[key]
+        parquet_path = dataset_path / f"data/chunk-000/episode_{episode_index:06d}.parquet"
+        df = pd.read_parquet(parquet_path)
+        if len(self._parquet_cache) >= self._cache_max:
+            oldest_key = next(iter(self._parquet_cache))
+            del self._parquet_cache[oldest_key]
+        self._parquet_cache[key] = df
+        return df
+    def _decode_video_frame(self, video_path: Path, timestamp: float) -> torch.Tensor:
+        """Decode a single frame from an MP4 at the given timestamp. Returns (C, H, W) float32 [0,1]."""
+        if self.video_backend == "torchcodec":
+            from torchcodec.decoders import VideoDecoder
+            decoder = VideoDecoder(str(video_path))
+            frame = decoder.get_frame_played_at(timestamp)
+            return frame.data.float() / 255.0
+        else:
+            import av
+            container = av.open(str(video_path))
+            stream = container.streams.video[0]
+            target_pts = int(timestamp / float(stream.time_base))
+            container.seek(target_pts, stream=stream)
+            for frame in container.decode(video=0):
+                arr = frame.to_ndarray(format="rgb24")
+                tensor = torch.from_numpy(arr).permute(2, 0, 1).float() / 255.0
+                container.close()
+                return tensor
+            container.close()
+            raise RuntimeError(f"Could not decode frame at t={timestamp} from {video_path}")
+    def __getitem__(self, idx: int) -> dict:
+        # Retry with a different sample if this one has corrupt/mismatched video
+        for _attempt in range(5):
+            try:
+                return self._get_sample(idx)
+            except (IndexError, RuntimeError, OSError) as e:
+                # Video duration doesn't match parquet timestamps, or file is corrupt.
+                # Pick a random different index and try again.
+                import random
+                idx = random.randint(0, len(self._frame_index) - 1)
+        # If all retries fail, raise
+        return self._get_sample(idx)
+    def _get_sample(self, idx: int) -> dict:
+        dataset_path, ep_idx, frame_idx, num_frames, task, task_idx = self._frame_index[idx]
+        df = self._load_parquet(dataset_path, ep_idx)
+        # Current frame
+        row = df.iloc[frame_idx]
+        state = torch.tensor(row["observation.state"], dtype=torch.float32)
+        timestamp = float(row["timestamp"])
+        # Action chunk: next chunk_size actions starting from current frame
+        action_end = min(frame_idx + self.chunk_size, len(df))
+        action_rows = df.iloc[frame_idx:action_end]
+        actions = torch.tensor(
+            np.stack(action_rows["action"].values),
+            dtype=torch.float32,
+        )
+        # Pad with last action if near episode end
+        if actions.shape[0] < self.chunk_size:
+            pad = actions[-1:].expand(self.chunk_size - actions.shape[0], -1)
+            actions = torch.cat([actions, pad], dim=0)
+        # Decode video frames
+        video_dir = dataset_path / "videos" / "chunk-000"
+        ep_str = f"episode_{ep_idx:06d}.mp4"
+        image1 = self._decode_video_frame(
+            video_dir / "observation.images.image" / ep_str, timestamp
+        )
+        image2 = self._decode_video_frame(
+            video_dir / "observation.images.image2" / ep_str, timestamp
+        )
+        if self.image_transforms is not None:
+            image1 = self.image_transforms(image1)
+            image2 = self.image_transforms(image2)
+        return {
+            "observation.images.image": image1,       # (3, 480, 640) float32 [0,1]
+            "observation.images.image2": image2,       # (3, 480, 640) float32 [0,1]
+            "observation.state": state,                # (6,) float32, raw values
+            "action": actions,                         # (50, 6) float32, raw values
+            "task": task,                              # str
+            "task_index": torch.tensor(task_idx),
+            "timestamp": torch.tensor(timestamp),
+            "frame_index": torch.tensor(frame_idx),
+            "episode_index": torch.tensor(ep_idx),
+            "index": torch.tensor(idx),
+        }
+    def __repr__(self):
+        return (
+            f"SO100Dataset(\n"
+            f"  data_root='{self.data_root}',\n"
+            f"  episodes={self.num_episodes},\n"
+            f"  frames={self.num_frames:,},\n"
+            f"  tasks={len(self.tasks)},\n"
+            f"  video_backend='{self.video_backend}',\n"
+            f")"
+        )