haofuly committed
Commit 5e4171f · verified · 1 Parent(s): 45ac12e

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes; see the raw diff for the rest.

Files changed (50)
  1. capvector-pi05/examples/libero/Dockerfile +59 -0
  2. capvector-pi05/examples/libero/README.md +71 -0
  3. capvector-pi05/examples/libero/main.py +219 -0
  4. capvector-pi05/examples/libero/requirements.in +11 -0
  5. capvector-pi05/examples/libero/requirements.txt +136 -0
  6. capvector-pi05/examples/simple_client/Dockerfile +32 -0
  7. capvector-pi05/examples/simple_client/README.md +30 -0
  8. capvector-pi05/examples/simple_client/compose.yml +42 -0
  9. capvector-pi05/examples/simple_client/main.py +187 -0
  10. capvector-pi05/examples/simple_client/requirements.in +5 -0
  11. capvector-pi05/examples/simple_client/requirements.txt +30 -0
  12. capvector-pi05/examples/ur5/README.md +142 -0
  13. capvector-pi05/packages/openpi-client/pyproject.toml +23 -0
  14. capvector-pi05/packages/openpi-client/src/openpi_client/__init__.py +1 -0
  15. capvector-pi05/packages/openpi-client/src/openpi_client/action_chunk_broker.py +50 -0
  16. capvector-pi05/packages/openpi-client/src/openpi_client/base_policy.py +12 -0
  17. capvector-pi05/packages/openpi-client/src/openpi_client/image_tools.py +78 -0
  18. capvector-pi05/packages/openpi-client/src/openpi_client/image_tools_test.py +37 -0
  19. capvector-pi05/packages/openpi-client/src/openpi_client/msgpack_numpy.py +57 -0
  20. capvector-pi05/packages/openpi-client/src/openpi_client/msgpack_numpy_test.py +45 -0
  21. capvector-pi05/packages/openpi-client/src/openpi_client/runtime/agent.py +17 -0
  22. capvector-pi05/packages/openpi-client/src/openpi_client/runtime/agents/policy_agent.py +18 -0
  23. capvector-pi05/packages/openpi-client/src/openpi_client/runtime/environment.py +32 -0
  24. capvector-pi05/packages/openpi-client/src/openpi_client/runtime/runtime.py +92 -0
  25. capvector-pi05/packages/openpi-client/src/openpi_client/runtime/subscriber.py +20 -0
  26. capvector-pi05/packages/openpi-client/src/openpi_client/websocket_client_policy.py +55 -0
  27. capvector-pi05/scripts/__init__.py +0 -0
  28. capvector-pi05/scripts/compute_norm_stats.py +117 -0
  29. capvector-pi05/scripts/docker/compose.yml +29 -0
  30. capvector-pi05/scripts/docker/install_docker_ubuntu22.sh +37 -0
  31. capvector-pi05/scripts/docker/install_nvidia_container_toolkit.sh +17 -0
  32. capvector-pi05/scripts/docker/serve_policy.Dockerfile +38 -0
  33. capvector-pi05/scripts/serve_policy.py +122 -0
  34. capvector-pi05/scripts/train.py +280 -0
  35. capvector-pi05/scripts/train_align_pytorch.py +658 -0
  36. capvector-pi05/scripts/train_pytorch.py +632 -0
  37. capvector-pi05/scripts/train_regular_loss_pytorch.py +754 -0
  38. capvector-pi05/scripts/train_test.py +30 -0
  39. capvector-pi05/src/openpi/__init__.py +0 -0
  40. capvector-pi05/src/openpi/conftest.py +17 -0
  41. capvector-pi05/src/openpi/models/__init__.py +0 -0
  42. capvector-pi05/src/openpi/models/gemma.py +459 -0
  43. capvector-pi05/src/openpi/models/gemma_fast.py +437 -0
  44. capvector-pi05/src/openpi/models/lora.py +148 -0
  45. capvector-pi05/src/openpi/models/lora_test.py +94 -0
  46. capvector-pi05/src/openpi/models/model.py +335 -0
  47. capvector-pi05/src/openpi/models/model_test.py +94 -0
  48. capvector-pi05/src/openpi/models/pi0.py +279 -0
  49. capvector-pi05/src/openpi/models/pi0_config.py +108 -0
  50. capvector-pi05/src/openpi/models/pi0_fast.py +313 -0
capvector-pi05/examples/libero/Dockerfile ADDED
@@ -0,0 +1,59 @@
+ # Dockerfile for the LIBERO benchmark.
+
+ # Build the container:
+ # docker build . -t libero -f examples/libero/Dockerfile
+
+ # Run the container:
+ # docker run --rm -it --network=host -v .:/app -v /tmp/.X11-unix:/tmp/.X11-unix:ro -e DISPLAY=$DISPLAY --gpus all libero /bin/bash
+
+ FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04@sha256:2d913b09e6be8387e1a10976933642c73c840c0b735f0bf3c28d97fc9bc422e0
+ COPY --from=ghcr.io/astral-sh/uv:0.5.1 /uv /uvx /bin/
+
+ RUN apt-get update && \
+     apt-get install -y \
+     make \
+     g++ \
+     clang \
+     libosmesa6-dev \
+     libgl1-mesa-glx \
+     libglew-dev \
+     libglfw3-dev \
+     libgles2-mesa-dev \
+     libglib2.0-0 \
+     libsm6 \
+     libxrender1 \
+     libxext6
+
+ WORKDIR /app
+
+ # Copy from the cache instead of linking since it's a mounted volume
+ ENV UV_LINK_MODE=copy
+
+ # Write the virtual environment outside of the project directory so it doesn't
+ # leak out of the container when we mount the application code.
+ ENV UV_PROJECT_ENVIRONMENT=/.venv
+
+ # Copy the requirements files so we can install dependencies.
+ # The rest of the project is mounted as a volume, so we don't need to rebuild on changes.
+ # This strategy is best for development-style usage.
+ COPY ./examples/libero/requirements.txt /tmp/requirements.txt
+ COPY ./third_party/libero/requirements.txt /tmp/requirements-libero.txt
+ COPY ./packages/openpi-client/pyproject.toml /tmp/openpi-client/pyproject.toml
+
+ # Install python dependencies.
+ RUN uv venv --python 3.8 $UV_PROJECT_ENVIRONMENT
+ RUN uv pip sync /tmp/requirements.txt /tmp/requirements-libero.txt /tmp/openpi-client/pyproject.toml --extra-index-url https://download.pytorch.org/whl/cu113 --index-strategy=unsafe-best-match
+ ENV PYTHONPATH=/app:/app/packages/openpi-client/src:/app/third_party/libero
+
+ # Create a default config file to avoid an input prompt from LIBERO's init script.
+ # https://github.com/Lifelong-Robot-Learning/LIBERO/blob/master/libero/libero/__init__.py
+ ENV LIBERO_CONFIG_PATH=/tmp/libero
+ RUN mkdir -p /tmp/libero && cat <<'EOF' > /tmp/libero/config.yaml
+ benchmark_root: /app/third_party/libero/libero/libero
+ bddl_files: /app/third_party/libero/libero/libero/bddl_files
+ init_states: /app/third_party/libero/libero/libero/init_files
+ datasets: /app/third_party/libero/libero/datasets
+ assets: /app/third_party/libero/libero/libero/assets
+ EOF
+
+ CMD ["/bin/bash", "-c", "source /.venv/bin/activate && python examples/libero/main.py $CLIENT_ARGS"]
capvector-pi05/examples/libero/README.md ADDED
@@ -0,0 +1,71 @@
+ # LIBERO Benchmark
+
+ This example runs the LIBERO benchmark: https://github.com/Lifelong-Robot-Learning/LIBERO
+
+ Note: When updating `requirements.txt` in this directory, add the flag `--extra-index-url https://download.pytorch.org/whl/cu113` to the `uv pip compile` command recorded at the top of `requirements.txt`.
+
+ This example requires git submodules to be initialized. Don't forget to run:
+
+ ```bash
+ git submodule update --init --recursive
+ ```
+
+ ## With Docker (recommended)
+
+ ```bash
+ # Grant access to the X11 server:
+ sudo xhost +local:docker
+
+ # To run with the default checkpoint and task suite:
+ SERVER_ARGS="--env LIBERO" docker compose -f examples/libero/compose.yml up --build
+
+ # To run with glx for Mujoco instead (use this if you have egl errors):
+ MUJOCO_GL=glx SERVER_ARGS="--env LIBERO" docker compose -f examples/libero/compose.yml up --build
+ ```
+
+ You can customize the loaded checkpoint by providing additional `SERVER_ARGS` (see `scripts/serve_policy.py`), and the LIBERO task suite by providing additional `CLIENT_ARGS` (see `examples/libero/main.py`).
+ For example:
+
+ ```bash
+ # To load a custom checkpoint (located in the top-level openpi/ directory):
+ export SERVER_ARGS="--env LIBERO policy:checkpoint --policy.config pi05_libero --policy.dir ./my_custom_checkpoint"
+
+ # To run the libero_10 task suite:
+ export CLIENT_ARGS="--args.task-suite-name libero_10"
+ ```
+
+ ## Without Docker (not recommended)
+
+ Terminal window 1:
+
+ ```bash
+ # Create virtual environment
+ uv venv --python 3.8 examples/libero/.venv
+ source examples/libero/.venv/bin/activate
+ uv pip sync examples/libero/requirements.txt third_party/libero/requirements.txt --extra-index-url https://download.pytorch.org/whl/cu113 --index-strategy=unsafe-best-match
+ uv pip install -e packages/openpi-client
+ uv pip install -e third_party/libero
+ export PYTHONPATH=$PYTHONPATH:$PWD/third_party/libero
+
+ # Run the simulation
+ python examples/libero/main.py
+
+ # To run with glx for Mujoco instead (use this if you have egl errors):
+ MUJOCO_GL=glx python examples/libero/main.py
+ ```
+
+ Terminal window 2:
+
+ ```bash
+ # Run the server
+ uv run scripts/serve_policy.py --env LIBERO
+ ```
+
+ ## Results
+
+ If you want to reproduce the following numbers, you can evaluate the checkpoint at `gs://openpi-assets/checkpoints/pi05_libero/`. This
+ checkpoint was trained in openpi with the `pi05_libero` config.
+
+ | Model | Libero Spatial | Libero Object | Libero Goal | Libero 10 | Average |
+ |-------|----------------|---------------|-------------|-----------|---------|
+ | π0.5 @ 30k (finetuned) | 98.8 | 98.2 | 98.0 | 92.4 | 96.85 |
capvector-pi05/examples/libero/main.py ADDED
@@ -0,0 +1,219 @@
+ import collections
+ import dataclasses
+ import logging
+ import math
+ import pathlib
+
+ import imageio
+ from libero.libero import benchmark
+ from libero.libero import get_libero_path
+ from libero.libero.envs import OffScreenRenderEnv
+ import numpy as np
+ from openpi_client import image_tools
+ from openpi_client import websocket_client_policy as _websocket_client_policy
+ import tqdm
+ import tyro
+
+ LIBERO_DUMMY_ACTION = [0.0] * 6 + [-1.0]
+ LIBERO_ENV_RESOLUTION = 256  # resolution used to render training data
+
+
+ @dataclasses.dataclass
+ class Args:
+     #################################################################################################################
+     # Model server parameters
+     #################################################################################################################
+     host: str = "0.0.0.0"
+     port: int = 8000
+     resize_size: int = 224
+     replan_steps: int = 5
+
+     #################################################################################################################
+     # LIBERO environment-specific parameters
+     #################################################################################################################
+     task_suite_name: str = (
+         "libero_spatial"  # Task suite. Options: libero_spatial, libero_object, libero_goal, libero_10, libero_90
+     )
+     num_steps_wait: int = 10  # Number of steps to wait for objects to stabilize in sim
+     num_trials_per_task: int = 50  # Number of rollouts per task
+
+     #################################################################################################################
+     # Utils
+     #################################################################################################################
+     video_out_path: str = "data/libero/videos"  # Path to save videos
+
+     seed: int = 7  # Random Seed (for reproducibility)
+
+
+ def eval_libero(args: Args) -> None:
+     # Set random seed
+     np.random.seed(args.seed)
+
+     # Initialize LIBERO task suite
+     benchmark_dict = benchmark.get_benchmark_dict()
+     task_suite = benchmark_dict[args.task_suite_name]()
+     num_tasks_in_suite = task_suite.n_tasks
+     logging.info(f"Task suite: {args.task_suite_name}")
+
+     pathlib.Path(args.video_out_path).mkdir(parents=True, exist_ok=True)
+
+     if args.task_suite_name == "libero_spatial":
+         max_steps = 220  # longest training demo has 193 steps
+     elif args.task_suite_name == "libero_object":
+         max_steps = 280  # longest training demo has 254 steps
+     elif args.task_suite_name == "libero_goal":
+         max_steps = 300  # longest training demo has 270 steps
+     elif args.task_suite_name == "libero_10":
+         max_steps = 520  # longest training demo has 505 steps
+     elif args.task_suite_name == "libero_90":
+         max_steps = 400  # longest training demo has 373 steps
+     else:
+         raise ValueError(f"Unknown task suite: {args.task_suite_name}")
+
+     client = _websocket_client_policy.WebsocketClientPolicy(args.host, args.port)
+
+     # Start evaluation
+     total_episodes, total_successes = 0, 0
+     for task_id in tqdm.tqdm(range(num_tasks_in_suite)):
+         # Get task
+         task = task_suite.get_task(task_id)
+
+         # Get default LIBERO initial states
+         initial_states = task_suite.get_task_init_states(task_id)
+
+         # Initialize LIBERO environment and task description
+         env, task_description = _get_libero_env(task, LIBERO_ENV_RESOLUTION, args.seed)
+
+         # Start episodes
+         task_episodes, task_successes = 0, 0
+         for episode_idx in tqdm.tqdm(range(args.num_trials_per_task)):
+             logging.info(f"\nTask: {task_description}")
+
+             # Reset environment
+             env.reset()
+             action_plan = collections.deque()
+
+             # Set initial states
+             obs = env.set_init_state(initial_states[episode_idx])
+
+             # Setup
+             t = 0
+             replay_images = []
+
+             logging.info(f"Starting episode {task_episodes+1}...")
+             while t < max_steps + args.num_steps_wait:
+                 try:
+                     # IMPORTANT: Do nothing for the first few timesteps because the simulator drops objects
+                     # and we need to wait for them to fall
+                     if t < args.num_steps_wait:
+                         obs, reward, done, info = env.step(LIBERO_DUMMY_ACTION)
+                         t += 1
+                         continue
+
+                     # Get preprocessed image
+                     # IMPORTANT: rotate 180 degrees to match train preprocessing
+                     img = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1])
+                     wrist_img = np.ascontiguousarray(obs["robot0_eye_in_hand_image"][::-1, ::-1])
+                     img = image_tools.convert_to_uint8(
+                         image_tools.resize_with_pad(img, args.resize_size, args.resize_size)
+                     )
+                     wrist_img = image_tools.convert_to_uint8(
+                         image_tools.resize_with_pad(wrist_img, args.resize_size, args.resize_size)
+                     )
+
+                     # Save preprocessed image for replay video
+                     replay_images.append(img)
+
+                     if not action_plan:
+                         # Finished executing previous action chunk -- compute new chunk
+                         # Prepare observations dict
+                         element = {
+                             "observation/image": img,
+                             "observation/wrist_image": wrist_img,
+                             "observation/state": np.concatenate(
+                                 (
+                                     obs["robot0_eef_pos"],
+                                     _quat2axisangle(obs["robot0_eef_quat"]),
+                                     obs["robot0_gripper_qpos"],
+                                 )
+                             ),
+                             "prompt": str(task_description),
+                         }
+
+                         # Query model to get action
+                         action_chunk = client.infer(element)["actions"]
+                         assert (
+                             len(action_chunk) >= args.replan_steps
+                         ), f"We want to replan every {args.replan_steps} steps, but policy only predicts {len(action_chunk)} steps."
+                         action_plan.extend(action_chunk[: args.replan_steps])
+
+                     action = action_plan.popleft()
+
+                     # Execute action in environment
+                     obs, reward, done, info = env.step(action.tolist())
+                     if done:
+                         task_successes += 1
+                         total_successes += 1
+                         break
+                     t += 1
+
+                 except Exception as e:
+                     logging.error(f"Caught exception: {e}")
+                     break
+
+             task_episodes += 1
+             total_episodes += 1
+
+             # Save a replay video of the episode
+             suffix = "success" if done else "failure"
+             task_segment = task_description.replace(" ", "_")
+             imageio.mimwrite(
+                 pathlib.Path(args.video_out_path) / f"rollout_{task_segment}_{suffix}.mp4",
+                 [np.asarray(x) for x in replay_images],
+                 fps=10,
+             )
+
+             # Log current results
+             logging.info(f"Success: {done}")
+             logging.info(f"# episodes completed so far: {total_episodes}")
+             logging.info(f"# successes: {total_successes} ({total_successes / total_episodes * 100:.1f}%)")
+
+         # Log final results
+         logging.info(f"Current task success rate: {float(task_successes) / float(task_episodes)}")
+         logging.info(f"Current total success rate: {float(total_successes) / float(total_episodes)}")
+
+     logging.info(f"Total success rate: {float(total_successes) / float(total_episodes)}")
+     logging.info(f"Total episodes: {total_episodes}")
+
+
+ def _get_libero_env(task, resolution, seed):
+     """Initializes and returns the LIBERO environment, along with the task description."""
+     task_description = task.language
+     task_bddl_file = pathlib.Path(get_libero_path("bddl_files")) / task.problem_folder / task.bddl_file
+     env_args = {"bddl_file_name": task_bddl_file, "camera_heights": resolution, "camera_widths": resolution}
+     env = OffScreenRenderEnv(**env_args)
+     env.seed(seed)  # IMPORTANT: seed seems to affect object positions even when using fixed initial state
+     return env, task_description
+
+
+ def _quat2axisangle(quat):
+     """
+     Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
+     """
+     # clip quaternion
+     if quat[3] > 1.0:
+         quat[3] = 1.0
+     elif quat[3] < -1.0:
+         quat[3] = -1.0
+
+     den = np.sqrt(1.0 - quat[3] * quat[3])
+     if math.isclose(den, 0.0):
+         # This is (close to) a zero degree rotation, immediately return
+         return np.zeros(3)
+
+     return (quat[:3] * 2.0 * math.acos(quat[3])) / den
+
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO)
+     tyro.cli(eval_libero)
capvector-pi05/examples/libero/requirements.in ADDED
@@ -0,0 +1,11 @@
+ imageio[ffmpeg]
+ numpy==1.22.4
+ tqdm
+ tyro
+ PyYaml
+ opencv-python==4.6.0.66
+ torch==1.11.0+cu113
+ torchvision==0.12.0+cu113
+ torchaudio==0.11.0+cu113
+ robosuite==1.4.1
+ matplotlib==3.5.3
capvector-pi05/examples/libero/requirements.txt ADDED
@@ -0,0 +1,136 @@
+ # This file was autogenerated by uv via the following command:
+ #   uv pip compile examples/libero/requirements.in -o examples/libero/requirements.txt --python-version 3.8 --index-strategy=unsafe-best-match
+ absl-py==2.1.0
+     # via mujoco
+ certifi==2024.12.14
+     # via requests
+ charset-normalizer==3.4.0
+     # via requests
+ cycler==0.12.1
+     # via matplotlib
+ docstring-parser==0.16
+     # via tyro
+ etils==1.3.0
+     # via mujoco
+ eval-type-backport==0.2.0
+     # via tyro
+ evdev==1.7.1
+     # via pynput
+ fonttools==4.55.3
+     # via matplotlib
+ glfw==1.12.0
+     # via mujoco
+ idna==3.10
+     # via requests
+ imageio==2.35.1
+     # via -r examples/libero/requirements.in
+ imageio-ffmpeg==0.5.1
+     # via imageio
+ importlib-metadata==8.5.0
+     # via typeguard
+ importlib-resources==6.4.5
+     # via etils
+ kiwisolver==1.4.7
+     # via matplotlib
+ llvmlite==0.36.0
+     # via numba
+ markdown-it-py==3.0.0
+     # via rich
+ matplotlib==3.5.3
+     # via -r examples/libero/requirements.in
+ mdurl==0.1.2
+     # via markdown-it-py
+ mujoco==3.2.3
+     # via robosuite
+ numba==0.53.1
+     # via robosuite
+ numpy==1.22.4
+     # via
+     #   -r examples/libero/requirements.in
+     #   imageio
+     #   matplotlib
+     #   mujoco
+     #   numba
+     #   opencv-python
+     #   robosuite
+     #   scipy
+     #   torchvision
+ opencv-python==4.6.0.66
+     # via
+     #   -r examples/libero/requirements.in
+     #   robosuite
+ packaging==24.2
+     # via matplotlib
+ pillow==10.4.0
+     # via
+     #   imageio
+     #   matplotlib
+     #   robosuite
+     #   torchvision
+ psutil==6.1.0
+     # via imageio
+ pygments==2.18.0
+     # via rich
+ pynput==1.7.7
+     # via robosuite
+ pyopengl==3.1.7
+     # via mujoco
+ pyparsing==3.1.4
+     # via matplotlib
+ python-dateutil==2.9.0.post0
+     # via matplotlib
+ python-xlib==0.33
+     # via pynput
+ pyyaml==6.0.2
+     # via -r examples/libero/requirements.in
+ requests==2.32.3
+     # via torchvision
+ rich==13.9.4
+     # via tyro
+ robosuite==1.4.1
+     # via -r examples/libero/requirements.in
+ scipy==1.10.1
+     # via robosuite
+ setuptools==75.3.0
+     # via
+     #   imageio-ffmpeg
+     #   numba
+ shtab==1.7.1
+     # via tyro
+ six==1.17.0
+     # via
+     #   pynput
+     #   python-dateutil
+     #   python-xlib
+ termcolor==2.4.0
+     # via robosuite
+ torch==1.11.0+cu113
+     # via
+     #   -r examples/libero/requirements.in
+     #   torchaudio
+     #   torchvision
+ torchaudio==0.11.0+cu113
+     # via -r examples/libero/requirements.in
+ torchvision==0.12.0+cu113
+     # via -r examples/libero/requirements.in
+ tqdm==4.67.1
+     # via -r examples/libero/requirements.in
+ typeguard==4.4.0
+     # via tyro
+ typing-extensions==4.12.2
+     # via
+     #   etils
+     #   rich
+     #   torch
+     #   torchvision
+     #   typeguard
+     #   tyro
+ tyro==0.9.2
+     # via -r examples/libero/requirements.in
+ urllib3==2.2.3
+     # via requests
+ zipp==3.20.2
+     # via
+     #   etils
+     #   importlib-metadata
+     #   importlib-resources
capvector-pi05/examples/simple_client/Dockerfile ADDED
@@ -0,0 +1,32 @@
+ # Dockerfile for the simple client.
+
+ # Build the container:
+ # docker build . -t simple_client -f examples/simple_client/Dockerfile
+
+ # Run the container:
+ # docker run --rm -it --network=host -v .:/app simple_client /bin/bash
+
+ FROM python:3.7-slim
+ COPY --from=ghcr.io/astral-sh/uv:0.5.1 /uv /uvx /bin/
+
+ WORKDIR /app
+
+ # Copy from the cache instead of linking since it's a mounted volume
+ ENV UV_LINK_MODE=copy
+
+ # Write the virtual environment outside of the project directory so it doesn't
+ # leak out of the container when we mount the application code.
+ ENV UV_PROJECT_ENVIRONMENT=/.venv
+
+ # Copy the requirements files so we can install dependencies.
+ # The rest of the project is mounted as a volume, so we don't need to rebuild on changes.
+ # This strategy is best for development-style usage.
+ COPY ./examples/simple_client/requirements.txt /tmp/requirements.txt
+ COPY ./packages/openpi-client/pyproject.toml /tmp/openpi-client/pyproject.toml
+
+ # Install python dependencies.
+ RUN uv venv --python 3.11.9 $UV_PROJECT_ENVIRONMENT
+ RUN uv pip sync /tmp/requirements.txt /tmp/openpi-client/pyproject.toml
+ ENV PYTHONPATH=/app:/app/src:/app/packages/openpi-client/src
+
+ CMD /bin/bash -c "source /.venv/bin/activate && python examples/simple_client/main.py $SERVER_ARGS"
capvector-pi05/examples/simple_client/README.md ADDED
@@ -0,0 +1,30 @@
+ # Simple Client
+
+ A minimal client that sends observations to the server and prints the inference rate.
+
+ You can specify the runtime environment with the `--env` flag. You can see the available options by running:
+
+ ```bash
+ uv run examples/simple_client/main.py --help
+ ```
+
+ ## With Docker
+
+ ```bash
+ export SERVER_ARGS="--env ALOHA_SIM"
+ docker compose -f examples/simple_client/compose.yml up --build
+ ```
+
+ ## Without Docker
+
+ Terminal window 1:
+
+ ```bash
+ uv run examples/simple_client/main.py --env DROID
+ ```
+
+ Terminal window 2:
+
+ ```bash
+ uv run scripts/serve_policy.py --env DROID
+ ```
capvector-pi05/examples/simple_client/compose.yml ADDED
@@ -0,0 +1,42 @@
+ # Run with:
+ # docker compose -f examples/simple_client/compose.yml up --build
+ services:
+   runtime:
+     image: simple_client
+     depends_on:
+       - openpi_server
+     build:
+       context: ../..
+       dockerfile: examples/simple_client/Dockerfile
+     init: true
+     tty: true
+     network_mode: host
+     volumes:
+       - $PWD:/app
+     environment:
+       - SERVER_ARGS
+
+   openpi_server:
+     image: openpi_server
+     build:
+       context: ../..
+       dockerfile: scripts/docker/serve_policy.Dockerfile
+     init: true
+     tty: true
+     network_mode: host
+     volumes:
+       - $PWD:/app
+       - ${OPENPI_DATA_HOME:-~/.cache/openpi}:/openpi_assets
+     environment:
+       - SERVER_ARGS
+       - OPENPI_DATA_HOME=/openpi_assets
+       - IS_DOCKER=true
+
+     # Comment out this block if not running on a machine with GPUs.
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               count: 1
+               capabilities: [gpu]
capvector-pi05/examples/simple_client/main.py ADDED
@@ -0,0 +1,187 @@
+ import dataclasses
+ import enum
+ import logging
+ import pathlib
+ import time
+
+ import numpy as np
+ from openpi_client import websocket_client_policy as _websocket_client_policy
+ import polars as pl
+ import rich.console
+ import rich.table
+ import tqdm
+ import tyro
+
+ logger = logging.getLogger(__name__)
+
+
+ class EnvMode(enum.Enum):
+     """Supported environments."""
+
+     ALOHA = "aloha"
+     ALOHA_SIM = "aloha_sim"
+     DROID = "droid"
+     LIBERO = "libero"
+
+
+ @dataclasses.dataclass
+ class Args:
+     """Command line arguments."""
+
+     # Host to connect to the server.
+     host: str = "0.0.0.0"
+     # Port to connect to the server. If None, the server will use the default port.
+     port: int | None = 8000
+     # API key to use for the server.
+     api_key: str | None = None
+     # Number of steps to run the policy for.
+     num_steps: int = 20
+     # Path to save the timings to a parquet file. (e.g., timing.parquet)
+     timing_file: pathlib.Path | None = None
+     # Environment to run the policy in.
+     env: EnvMode = EnvMode.ALOHA_SIM
+
+
+ class TimingRecorder:
+     """Records timing measurements for different keys."""
+
+     def __init__(self) -> None:
+         self._timings: dict[str, list[float]] = {}
+
+     def record(self, key: str, time_ms: float) -> None:
+         """Record a timing measurement for the given key."""
+         if key not in self._timings:
+             self._timings[key] = []
+         self._timings[key].append(time_ms)
+
+     def get_stats(self, key: str) -> dict[str, float]:
+         """Get statistics for the given key."""
+         times = self._timings[key]
+         return {
+             "mean": float(np.mean(times)),
+             "std": float(np.std(times)),
+             "p25": float(np.quantile(times, 0.25)),
+             "p50": float(np.quantile(times, 0.50)),
+             "p75": float(np.quantile(times, 0.75)),
+             "p90": float(np.quantile(times, 0.90)),
+             "p95": float(np.quantile(times, 0.95)),
+             "p99": float(np.quantile(times, 0.99)),
+         }
+
+     def print_all_stats(self) -> None:
+         """Print statistics for all keys in a concise format."""
+         table = rich.table.Table(
+             title="[bold blue]Timing Statistics[/bold blue]",
+             show_header=True,
+             header_style="bold white",
+             border_style="blue",
+             title_justify="center",
+         )
+
+         # Add metric column with custom styling
+         table.add_column("Metric", style="cyan", justify="left", no_wrap=True)
+
+         # Add statistical columns with consistent styling
+         stat_columns = [
+             ("Mean", "yellow", "mean"),
+             ("Std", "yellow", "std"),
+             ("P25", "magenta", "p25"),
+             ("P50", "magenta", "p50"),
+             ("P75", "magenta", "p75"),
+             ("P90", "magenta", "p90"),
+             ("P95", "magenta", "p95"),
+             ("P99", "magenta", "p99"),
+         ]
+
+         for name, style, _ in stat_columns:
+             table.add_column(name, justify="right", style=style, no_wrap=True)
+
+         # Add rows for each metric with formatted values
+         for key in sorted(self._timings.keys()):
+             stats = self.get_stats(key)
+             values = [f"{stats[stat_key]:.1f}" for _, _, stat_key in stat_columns]
+             table.add_row(key, *values)
+
+         # Print with custom console settings
+         console = rich.console.Console(width=None, highlight=True)
+         console.print(table)
+
+     def write_parquet(self, path: pathlib.Path) -> None:
+         """Save the timings to a parquet file."""
+         logger.info(f"Writing timings to {path}")
+         frame = pl.DataFrame(self._timings)
+         path.parent.mkdir(parents=True, exist_ok=True)
+         frame.write_parquet(path)
+
+
+ def main(args: Args) -> None:
+     obs_fn = {
+         EnvMode.ALOHA: _random_observation_aloha,
+         EnvMode.ALOHA_SIM: _random_observation_aloha,
+         EnvMode.DROID: _random_observation_droid,
+         EnvMode.LIBERO: _random_observation_libero,
+     }[args.env]
+
+     policy = _websocket_client_policy.WebsocketClientPolicy(
+         host=args.host,
+         port=args.port,
+         api_key=args.api_key,
+     )
+     logger.info(f"Server metadata: {policy.get_server_metadata()}")
+
+     # Send a few observations to make sure the model is loaded.
+     for _ in range(2):
+         policy.infer(obs_fn())
+
+     timing_recorder = TimingRecorder()
+
+     for _ in tqdm.trange(args.num_steps, desc="Running policy"):
+         inference_start = time.time()
+         action = policy.infer(obs_fn())
+         timing_recorder.record("client_infer_ms", 1000 * (time.time() - inference_start))
+         for key, value in action.get("server_timing", {}).items():
+             timing_recorder.record(f"server_{key}", value)
+         for key, value in action.get("policy_timing", {}).items():
+             timing_recorder.record(f"policy_{key}", value)
+
+     timing_recorder.print_all_stats()
+
+     if args.timing_file is not None:
+         timing_recorder.write_parquet(args.timing_file)
+
+
+ def _random_observation_aloha() -> dict:
+     return {
+         "state": np.ones((14,)),
+         "images": {
+             "cam_high": np.random.randint(256, size=(3, 224, 224), dtype=np.uint8),
+             "cam_low": np.random.randint(256, size=(3, 224, 224), dtype=np.uint8),
+             "cam_left_wrist": np.random.randint(256, size=(3, 224, 224), dtype=np.uint8),
+             "cam_right_wrist": np.random.randint(256, size=(3, 224, 224), dtype=np.uint8),
+         },
+         "prompt": "do something",
+     }
+
+
+ def _random_observation_droid() -> dict:
+     return {
+         "observation/exterior_image_1_left": np.random.randint(256, size=(224, 224, 3), dtype=np.uint8),
+         "observation/wrist_image_left": np.random.randint(256, size=(224, 224, 3), dtype=np.uint8),
+         "observation/joint_position": np.random.rand(7),
+         "observation/gripper_position": np.random.rand(1),
+         "prompt": "do something",
+     }
+
+
+ def _random_observation_libero() -> dict:
+     return {
+         "observation/state": np.random.rand(8),
+         "observation/image": np.random.randint(256, size=(224, 224, 3), dtype=np.uint8),
+         "observation/wrist_image": np.random.randint(256, size=(224, 224, 3), dtype=np.uint8),
+         "prompt": "do something",
+     }
+
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO)
+     main(tyro.cli(Args))
capvector-pi05/examples/simple_client/requirements.in ADDED
@@ -0,0 +1,5 @@
+ numpy>=1.22.4,<2.0.0
+ rich
+ tqdm
+ tyro
+ polars
capvector-pi05/examples/simple_client/requirements.txt ADDED
@@ -0,0 +1,30 @@
+ # This file was autogenerated by uv via the following command:
+ #   uv pip compile examples/simple_client/requirements.in -o examples/simple_client/requirements.txt --python-version 3.11.9
+ docstring-parser==0.16
+     # via tyro
+ markdown-it-py==3.0.0
+     # via rich
+ mdurl==0.1.2
+     # via markdown-it-py
+ numpy==1.26.4
+     # via -r examples/simple_client/requirements.in
+ polars==1.30.0
+     # via -r examples/simple_client/requirements.in
+ pygments==2.19.1
+     # via rich
+ rich==14.0.0
+     # via
+     #   -r examples/simple_client/requirements.in
+     #   tyro
+ shtab==1.7.2
+     # via tyro
+ tqdm==4.67.1
+     # via -r examples/simple_client/requirements.in
+ typeguard==4.4.2
+     # via tyro
+ typing-extensions==4.13.2
+     # via
+     #   typeguard
+     #   tyro
+ tyro==0.9.22
+     # via -r examples/simple_client/requirements.in
capvector-pi05/examples/ur5/README.md ADDED
@@ -0,0 +1,142 @@
+ # UR5 Example
+
+ Below we provide an outline of how to implement the key components mentioned in the "Finetune on your data" section of the [README](../README.md) for finetuning on UR5 datasets.
+
+ First, we will define the `UR5Inputs` and `UR5Outputs` classes, which map the UR5 environment to the model and vice versa. Check the corresponding file in `src/openpi/policies/libero_policy.py` for comments explaining each line.
+
+ ```python
+ @dataclasses.dataclass(frozen=True)
+ class UR5Inputs(transforms.DataTransformFn):
+     model_type: _model.ModelType = _model.ModelType.PI0
+
+     def __call__(self, data: dict) -> dict:
+         # First, concatenate the joints and gripper into the state vector.
+         state = np.concatenate([data["joints"], data["gripper"]])
+
+         # Parse images to uint8 (H,W,C) if needed, since LeRobot automatically
+         # stores them as float32 (C,H,W); this step is skipped for policy inference.
+         base_image = _parse_image(data["base_rgb"])
+         wrist_image = _parse_image(data["wrist_rgb"])
+
+         # Create inputs dict.
+         inputs = {
+             "state": state,
+             "image": {
+                 "base_0_rgb": base_image,
+                 "left_wrist_0_rgb": wrist_image,
+                 # Since there is no right wrist, replace with zeros.
+                 "right_wrist_0_rgb": np.zeros_like(base_image),
+             },
+             "image_mask": {
+                 "base_0_rgb": np.True_,
+                 "left_wrist_0_rgb": np.True_,
+                 # Since the "slot" for the right wrist is not used, this mask is set
+                 # to False (pi0-FAST expects all masks to be True instead).
+                 "right_wrist_0_rgb": np.True_ if self.model_type == _model.ModelType.PI0_FAST else np.False_,
+             },
+         }
+
+         if "actions" in data:
+             inputs["actions"] = data["actions"]
+
+         # Pass the prompt (aka language instruction) to the model.
+         if "prompt" in data:
+             inputs["prompt"] = data["prompt"]
+
+         return inputs
+
+
+ @dataclasses.dataclass(frozen=True)
+ class UR5Outputs(transforms.DataTransformFn):
+     def __call__(self, data: dict) -> dict:
+         # Since the robot has 7 action dimensions (6 DoF + gripper), return the first 7 dims.
+         return {"actions": np.asarray(data["actions"][:, :7])}
+ ```
+
+ Next, we will define the `LeRobotUR5DataConfig` class, which defines how to process raw UR5 data from the LeRobot dataset for training. For a full example, see the `LeRobotLiberoDataConfig` config in the [training config file](https://github.com/physical-intelligence/openpi/blob/main/src/openpi/training/config.py).
+
+ ```python
+ @dataclasses.dataclass(frozen=True)
+ class LeRobotUR5DataConfig(DataConfigFactory):
+     @override
+     def create(self, assets_dirs: pathlib.Path, model_config: _model.BaseModelConfig) -> DataConfig:
+         # Boilerplate for remapping keys from the LeRobot dataset. We assume no renaming is needed here.
+         repack_transform = _transforms.Group(
+             inputs=[
+                 _transforms.RepackTransform(
+                     {
+                         "base_rgb": "image",
+                         "wrist_rgb": "wrist_image",
+                         "joints": "joints",
+                         "gripper": "gripper",
+                         "prompt": "prompt",
+                     }
+                 )
+             ]
+         )
+
+         # These transforms are the ones we wrote earlier.
+         data_transforms = _transforms.Group(
+             inputs=[UR5Inputs(model_type=model_config.model_type)],
+             outputs=[UR5Outputs()],
+         )
+
+         # Convert absolute actions to delta actions.
+         # By convention, we do not convert the gripper action (7th dimension).
+         delta_action_mask = _transforms.make_bool_mask(6, -1)
+         data_transforms = data_transforms.push(
+             inputs=[_transforms.DeltaActions(delta_action_mask)],
+             outputs=[_transforms.AbsoluteActions(delta_action_mask)],
+         )
+
+         # Model transforms include things like tokenizing the prompt and action targets.
+         # You do not need to change anything here for your own dataset.
+         model_transforms = ModelTransformFactory()(model_config)
+
+         # We return all data transforms for training and inference. No need to change anything here.
+         return dataclasses.replace(
+             self.create_base_config(assets_dirs),
+             repack_transforms=repack_transform,
+             data_transforms=data_transforms,
+             model_transforms=model_transforms,
+         )
+ ```
+
+ Finally, we define the `TrainConfig` for our UR5 dataset. Here, we define a config for fine-tuning pi0 on our UR5 dataset. See the [training config file](https://github.com/physical-intelligence/openpi/blob/main/src/openpi/training/config.py) for more examples, e.g. for pi0-FAST or for LoRA fine-tuning.
+
+ ```python
+ TrainConfig(
+     name="pi0_ur5",
+     model=pi0.Pi0Config(),
+     data=LeRobotUR5DataConfig(
+         repo_id="your_username/ur5_dataset",
+         # This config lets us reload the UR5 normalization stats from the base model checkpoint.
+         # Reloading normalization stats can help transfer pre-trained models to new environments.
+         # See the [norm_stats.md](../docs/norm_stats.md) file for more details.
+         assets=AssetsConfig(
+             assets_dir="gs://openpi-assets/checkpoints/pi0_base/assets",
+             asset_id="ur5e",
+         ),
+         base_config=DataConfig(
+             # This flag determines whether we load the prompt (i.e. the task instruction) from the
+             # ``task`` field in the LeRobot dataset. The recommended setting is True.
+             prompt_from_task=True,
+         ),
+     ),
+     # Load the pi0 base model checkpoint.
+     weight_loader=weight_loaders.CheckpointWeightLoader("gs://openpi-assets/checkpoints/pi0_base/params"),
+     num_train_steps=30_000,
+ )
+ ```
capvector-pi05/packages/openpi-client/pyproject.toml ADDED
@@ -0,0 +1,23 @@
+ [project]
+ name = "openpi-client"
+ version = "0.1.0"
+ requires-python = ">=3.7"
+ dependencies = [
+     "dm-tree>=0.1.8",
+     "msgpack>=1.0.5",
+     "numpy>=1.22.4,<2.0.0",
+     "pillow>=9.0.0",
+     "tree>=0.2.4",
+     "typing-extensions>=4.0.0",  # used by action_chunk_broker, policy_agent, and websocket_client_policy
+     "websockets>=11.0",
+ ]
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [tool.uv]
+ dev-dependencies = ["pytest>=8.3.4"]
+
+ [tool.ruff]
+ line-length = 120
+ target-version = "py37"
capvector-pi05/packages/openpi-client/src/openpi_client/__init__.py ADDED
@@ -0,0 +1 @@
+ __version__ = "0.1.0"
capvector-pi05/packages/openpi-client/src/openpi_client/action_chunk_broker.py ADDED
@@ -0,0 +1,50 @@
+ from typing import Dict
+
+ import numpy as np
+ import tree
+ from typing_extensions import override
+
+ from openpi_client import base_policy as _base_policy
+
+
+ class ActionChunkBroker(_base_policy.BasePolicy):
+     """Wraps a policy to return action chunks one-at-a-time.
+
+     Assumes that the first dimension of all action fields is the chunk size.
+
+     A new inference call to the inner policy is only made when the current
+     list of chunks is exhausted.
+     """
+
+     def __init__(self, policy: _base_policy.BasePolicy, action_horizon: int):
+         self._policy = policy
+         self._action_horizon = action_horizon
+         self._cur_step: int = 0
+
+         self._last_results: Dict[str, np.ndarray] | None = None
+
+     @override
+     def infer(self, obs: Dict) -> Dict:  # noqa: UP006
+         if self._last_results is None:
+             self._last_results = self._policy.infer(obs)
+             self._cur_step = 0
+
+         def slicer(x):
+             if isinstance(x, np.ndarray):
+                 return x[self._cur_step, ...]
+             return x
+
+         results = tree.map_structure(slicer, self._last_results)
+         self._cur_step += 1
+
+         if self._cur_step >= self._action_horizon:
+             self._last_results = None
+
+         return results
+
+     @override
+     def reset(self) -> None:
+         self._policy.reset()
+         self._last_results = None
+         self._cur_step = 0
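To make the broker's contract concrete, here is a minimal usage sketch. The host, port, and horizon below are illustrative assumptions, not values fixed by this commit:

```python
# Hypothetical wiring: one server round-trip per `action_horizon` infer() calls.
from openpi_client import action_chunk_broker
from openpi_client import websocket_client_policy

remote = websocket_client_policy.WebsocketClientPolicy(host="localhost", port=8000)
policy = action_chunk_broker.ActionChunkBroker(remote, action_horizon=5)

# Each infer() returns one step sliced from the cached chunk; a new chunk is
# only requested from the server once the previous one is exhausted.
# for observation in observations:
#     step = policy.infer(observation)
```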
capvector-pi05/packages/openpi-client/src/openpi_client/base_policy.py ADDED
@@ -0,0 +1,12 @@
+ import abc
+ from typing import Dict
+
+
+ class BasePolicy(abc.ABC):
+     @abc.abstractmethod
+     def infer(self, obs: Dict) -> Dict:
+         """Infer actions from observations."""
+
+     def reset(self) -> None:
+         """Reset the policy to its initial state."""
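For reference, a subclass only needs to implement `infer`; `reset` is optional. A toy sketch (the class and its fixed chunk shape are illustrative, not part of the package):

```python
from typing import Dict

import numpy as np

from openpi_client import base_policy as _base_policy


class ConstantPolicy(_base_policy.BasePolicy):
    """Toy policy that returns the same 5-step, 7-dim action chunk for any observation."""

    def infer(self, obs: Dict) -> Dict:
        return {"actions": np.zeros((5, 7))}
```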
capvector-pi05/packages/openpi-client/src/openpi_client/image_tools.py ADDED
@@ -0,0 +1,78 @@
+ from typing import Tuple, Union
+
+ import numpy as np
+ from PIL import Image
+
+
+ def convert_to_uint8(img: np.ndarray) -> np.ndarray:
+     """Converts an image to uint8 if it is a float image.
+
+     This is important for reducing the size of the image when sending it over the network.
+     """
+     if np.issubdtype(img.dtype, np.floating):
+         img = (255 * img).astype(np.uint8)
+     return img
+
+
+ def resize_with_pad(
+     images: np.ndarray, height: int, width: int, method=Image.BILINEAR, return_mask: bool = False
+ ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
+     """Replicates tf.image.resize_with_pad for multiple images using PIL. Resizes a batch of images to a target
+     height and width without distortion by padding with zeros.
+
+     Args:
+         images: A batch of images in [..., height, width, channel] format.
+         height: The target height of the image.
+         width: The target width of the image.
+         method: The interpolation method to use. Default is bilinear.
+         return_mask: If True, also return a boolean mask that is True for real pixels and False for padding.
+
+     Returns:
+         The resized images in [..., height, width, channel], plus the padding mask in [..., height, width]
+         if `return_mask` is True.
+     """
+     # If the images are already the correct size, return them as is.
+     if images.shape[-3:-1] == (height, width):
+         if return_mask:
+             img_padding_mask = np.ones((*images.shape[:-3], height, width), dtype=bool)
+             return images, img_padding_mask
+         return images
+
+     original_shape = images.shape
+
+     images = images.reshape(-1, *original_shape[-3:])
+
+     resized_results = [_resize_with_pad_pil(Image.fromarray(im), height, width, method=method) for im in images]
+     resized_images, img_padding_mask = zip(*resized_results)
+     resized_images = np.stack(resized_images)
+     img_padding_mask = np.stack(img_padding_mask)
+
+     if return_mask:
+         return (
+             resized_images.reshape(*original_shape[:-3], *resized_images.shape[-3:]),
+             img_padding_mask.reshape(*original_shape[:-3], *img_padding_mask.shape[-2:]),
+         )
+     return resized_images.reshape(*original_shape[:-3], *resized_images.shape[-3:])
+
+
+ def _resize_with_pad_pil(image: Image.Image, height: int, width: int, method: int) -> Tuple[Image.Image, np.ndarray]:
+     """Replicates tf.image.resize_with_pad for one image using PIL. Resizes an image to a target height and
+     width without distortion by padding with zeros. Also returns the boolean padding mask.
+
+     Unlike the jax version, note that PIL uses [width, height, channel] ordering instead of [batch, h, w, c].
+     """
+     cur_width, cur_height = image.size
+     if cur_width == width and cur_height == height:
+         # No need to resize if the image is already the correct size.
+         return image, np.ones((height, width), dtype=bool)
+
+     ratio = max(cur_width / width, cur_height / height)
+     resized_height = int(cur_height / ratio)
+     resized_width = int(cur_width / ratio)
+     resized_image = image.resize((resized_width, resized_height), resample=method)
+
+     zero_image = Image.new(resized_image.mode, (width, height), 0)
+     pad_height = max(0, int((height - resized_height) / 2))
+     pad_width = max(0, int((width - resized_width) / 2))
+     zero_image.paste(resized_image, (pad_width, pad_height))
+     assert zero_image.size == (width, height)
+
+     img_padding_mask = np.zeros((height, width), dtype=bool)
+     img_padding_mask[pad_height : pad_height + resized_height, pad_width : pad_width + resized_width] = True
+
+     return zero_image, img_padding_mask
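A short sketch of the `return_mask` path defined above (the input shapes are illustrative):

```python
import numpy as np

from openpi_client import image_tools

# A batch of two 256x320 RGB frames, letterboxed into 224x224.
images = np.random.randint(256, size=(2, 256, 320, 3), dtype=np.uint8)
resized, mask = image_tools.resize_with_pad(images, 224, 224, return_mask=True)

assert resized.shape == (2, 224, 224, 3)
assert mask.shape == (2, 224, 224)  # True where real pixels landed, False on padding
```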
capvector-pi05/packages/openpi-client/src/openpi_client/image_tools_test.py ADDED
@@ -0,0 +1,37 @@
+ import numpy as np
+
+ import openpi_client.image_tools as image_tools
+
+
+ def test_resize_with_pad_shapes():
+     # Test case 1: Resize image to larger dimensions
+     images = np.zeros((2, 10, 10, 3), dtype=np.uint8)  # Input images of shape (batch_size, height, width, channels)
+     height = 20
+     width = 20
+     resized_images = image_tools.resize_with_pad(images, height, width)
+     assert resized_images.shape == (2, height, width, 3)
+     assert np.all(resized_images == 0)
+
+     # Test case 2: Resize image to smaller dimensions
+     images = np.zeros((3, 30, 30, 3), dtype=np.uint8)
+     height = 15
+     width = 15
+     resized_images = image_tools.resize_with_pad(images, height, width)
+     assert resized_images.shape == (3, height, width, 3)
+     assert np.all(resized_images == 0)
+
+     # Test case 3: Resize image with the same dimensions
+     images = np.zeros((1, 50, 50, 3), dtype=np.uint8)
+     height = 50
+     width = 50
+     resized_images = image_tools.resize_with_pad(images, height, width)
+     assert resized_images.shape == (1, height, width, 3)
+     assert np.all(resized_images == 0)
+
+     # Test case 4: Resize image with odd-numbered padding
+     images = np.zeros((1, 256, 320, 3), dtype=np.uint8)
+     height = 60
+     width = 80
+     resized_images = image_tools.resize_with_pad(images, height, width)
+     assert resized_images.shape == (1, height, width, 3)
+     assert np.all(resized_images == 0)
capvector-pi05/packages/openpi-client/src/openpi_client/msgpack_numpy.py ADDED
@@ -0,0 +1,57 @@
+ """Adds NumPy array support to msgpack.
+
+ msgpack is good for (de)serializing data over a network for multiple reasons:
+ - msgpack is secure (as opposed to pickle/dill/etc which allow for arbitrary code execution)
+ - msgpack is widely used and has good cross-language support
+ - msgpack does not require a schema (as opposed to protobuf/flatbuffers/etc) which is convenient in dynamically typed
+   languages like Python and JavaScript
+ - msgpack is fast and efficient (as opposed to readable formats like JSON/YAML/etc); I found that msgpack was ~4x
+   faster than pickle for serializing large arrays using the below strategy
+
+ The code below is adapted from https://github.com/lebedov/msgpack-numpy. The reason not to use that library directly
+ is that it falls back to pickle for object arrays.
+ """
+
+ import functools
+
+ import msgpack
+ import numpy as np
+
+
+ def pack_array(obj):
+     if (isinstance(obj, (np.ndarray, np.generic))) and obj.dtype.kind in ("V", "O", "c"):
+         raise ValueError(f"Unsupported dtype: {obj.dtype}")
+
+     if isinstance(obj, np.ndarray):
+         return {
+             b"__ndarray__": True,
+             b"data": obj.tobytes(),
+             b"dtype": obj.dtype.str,
+             b"shape": obj.shape,
+         }
+
+     if isinstance(obj, np.generic):
+         return {
+             b"__npgeneric__": True,
+             b"data": obj.item(),
+             b"dtype": obj.dtype.str,
+         }
+
+     return obj
+
+
+ def unpack_array(obj):
+     if b"__ndarray__" in obj:
+         return np.ndarray(buffer=obj[b"data"], dtype=np.dtype(obj[b"dtype"]), shape=obj[b"shape"])
+
+     if b"__npgeneric__" in obj:
+         return np.dtype(obj[b"dtype"]).type(obj[b"data"])
+
+     return obj
+
+
+ Packer = functools.partial(msgpack.Packer, default=pack_array)
+ packb = functools.partial(msgpack.packb, default=pack_array)
+
+ Unpacker = functools.partial(msgpack.Unpacker, object_hook=unpack_array)
+ unpackb = functools.partial(msgpack.unpackb, object_hook=unpack_array)
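The `packb`/`unpackb` round trip preserves array dtype and shape; a small sketch (the observation dict is illustrative):

```python
import numpy as np

from openpi_client import msgpack_numpy

obs = {"state": np.zeros(7, dtype=np.float32), "prompt": "pick up the cup"}
packed = msgpack_numpy.packb(obs)  # bytes, safe to send over a websocket
restored = msgpack_numpy.unpackb(packed)

assert restored["state"].dtype == np.float32
assert np.array_equal(obs["state"], restored["state"])
```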
capvector-pi05/packages/openpi-client/src/openpi_client/msgpack_numpy_test.py ADDED
@@ -0,0 +1,45 @@
+ import numpy as np
+ import pytest
+ import tree
+
+ from openpi_client import msgpack_numpy
+
+
+ def _check(expected, actual):
+     if isinstance(expected, np.ndarray):
+         assert expected.shape == actual.shape
+         assert expected.dtype == actual.dtype
+         assert np.array_equal(expected, actual, equal_nan=expected.dtype.kind == "f")
+     else:
+         assert expected == actual
+
+
+ @pytest.mark.parametrize(
+     "data",
+     [
+         1,  # int
+         1.0,  # float
+         "hello",  # string
+         np.bool_(True),  # boolean scalar
+         np.array([1, 2, 3])[0],  # int scalar
+         np.str_("asdf"),  # string scalar
+         [1, 2, 3],  # list
+         {"key": "value"},  # dict
+         {"key": [1, 2, 3]},  # nested dict
+         np.array(1.0),  # 0D array
+         np.array([1, 2, 3], dtype=np.int32),  # 1D integer array
+         np.array(["asdf", "qwer"]),  # string array
+         np.array([True, False]),  # boolean array
+         np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),  # 2D float array
+         np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], dtype=np.int16),  # 3D integer array
+         np.array([np.nan, np.inf, -np.inf]),  # special float values
+         {"arr": np.array([1, 2, 3]), "nested": {"arr": np.array([4, 5, 6])}},  # nested dict with arrays
+         [np.array([1, 2]), np.array([3, 4])],  # list of arrays
+         np.zeros((3, 4, 5), dtype=np.float32),  # 3D zeros
+         np.ones((2, 3), dtype=np.float64),  # 2D ones with double precision
+     ],
+ )
+ def test_pack_unpack(data):
+     packed = msgpack_numpy.packb(data)
+     unpacked = msgpack_numpy.unpackb(packed)
+     tree.map_structure(_check, data, unpacked)
capvector-pi05/packages/openpi-client/src/openpi_client/runtime/agent.py ADDED
@@ -0,0 +1,17 @@
+ import abc
+
+
+ class Agent(abc.ABC):
+     """An Agent is the thing with agency, i.e. the entity that makes decisions.
+
+     Agents receive observations about the state of the world, and return actions
+     to take in response.
+     """
+
+     @abc.abstractmethod
+     def get_action(self, observation: dict) -> dict:
+         """Query the agent for the next action."""
+
+     @abc.abstractmethod
+     def reset(self) -> None:
+         """Reset the agent to its initial state."""
capvector-pi05/packages/openpi-client/src/openpi_client/runtime/agents/policy_agent.py ADDED
@@ -0,0 +1,18 @@
+ from typing_extensions import override
+
+ from openpi_client import base_policy as _base_policy
+ from openpi_client.runtime import agent as _agent
+
+
+ class PolicyAgent(_agent.Agent):
+     """An agent that uses a policy to determine actions."""
+
+     def __init__(self, policy: _base_policy.BasePolicy) -> None:
+         self._policy = policy
+
+     @override
+     def get_action(self, observation: dict) -> dict:
+         return self._policy.infer(observation)
+
+     @override
+     def reset(self) -> None:
+         self._policy.reset()
capvector-pi05/packages/openpi-client/src/openpi_client/runtime/environment.py ADDED
@@ -0,0 +1,32 @@
+ import abc
+
+
+ class Environment(abc.ABC):
+     """An Environment represents the robot and the environment it inhabits.
+
+     The primary contract of environments is that they can be queried for observations
+     about their state, and have actions applied to them to change that state.
+     """
+
+     @abc.abstractmethod
+     def reset(self) -> None:
+         """Reset the environment to its initial state.
+
+         This will be called once before starting each episode.
+         """
+
+     @abc.abstractmethod
+     def is_episode_complete(self) -> bool:
+         """Allow the environment to signal that the episode is complete.
+
+         This will be called after each step. It should return `True` if the episode is
+         complete (either successfully or unsuccessfully), and `False` otherwise.
+         """
+
+     @abc.abstractmethod
+     def get_observation(self) -> dict:
+         """Query the environment for the current state."""
+
+     @abc.abstractmethod
+     def apply_action(self, action: dict) -> None:
+         """Take an action in the environment."""
capvector-pi05/packages/openpi-client/src/openpi_client/runtime/runtime.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import logging
+ import threading
+ import time
+ 
+ from openpi_client.runtime import agent as _agent
+ from openpi_client.runtime import environment as _environment
+ from openpi_client.runtime import subscriber as _subscriber
+ 
+ 
+ class Runtime:
+     """The core module orchestrating interactions between key components of the system."""
+ 
+     def __init__(
+         self,
+         environment: _environment.Environment,
+         agent: _agent.Agent,
+         subscribers: list[_subscriber.Subscriber],
+         max_hz: float = 0,
+         num_episodes: int = 1,
+         max_episode_steps: int = 0,
+     ) -> None:
+         self._environment = environment
+         self._agent = agent
+         self._subscribers = subscribers
+         self._max_hz = max_hz
+         self._num_episodes = num_episodes
+         self._max_episode_steps = max_episode_steps
+ 
+         self._in_episode = False
+         self._episode_steps = 0
+ 
+     def run(self) -> None:
+         """Runs the configured number of episodes, then resets the environment one final time."""
+         for _ in range(self._num_episodes):
+             self._run_episode()
+ 
+         # Final reset; this is important for real environments to move the robot to its home position.
+         self._environment.reset()
+ 
+     def run_in_new_thread(self) -> threading.Thread:
+         """Runs the runtime loop in a new thread."""
+         thread = threading.Thread(target=self.run)
+         thread.start()
+         return thread
+ 
+     def mark_episode_complete(self) -> None:
+         """Marks the end of an episode."""
+         self._in_episode = False
+ 
+     def _run_episode(self) -> None:
+         """Runs a single episode."""
+         logging.info("Starting episode...")
+         self._environment.reset()
+         self._agent.reset()
+         for subscriber in self._subscribers:
+             subscriber.on_episode_start()
+ 
+         self._in_episode = True
+         self._episode_steps = 0
+         step_time = 1 / self._max_hz if self._max_hz > 0 else 0
+         last_step_time = time.time()
+ 
+         while self._in_episode:
+             self._step()
+             self._episode_steps += 1
+ 
+             # Sleep to maintain the desired frame rate.
+             now = time.time()
+             dt = now - last_step_time
+             if dt < step_time:
+                 time.sleep(step_time - dt)
+                 last_step_time = time.time()
+             else:
+                 last_step_time = now
+ 
+         logging.info("Episode completed.")
+         for subscriber in self._subscribers:
+             subscriber.on_episode_end()
+ 
+     def _step(self) -> None:
+         """A single step of the runtime loop."""
+         observation = self._environment.get_observation()
+         action = self._agent.get_action(observation)
+         self._environment.apply_action(action)
+ 
+         for subscriber in self._subscribers:
+             subscriber.on_step(observation, action)
+ 
+         if self._environment.is_episode_complete() or (
+             self._max_episode_steps > 0 and self._episode_steps >= self._max_episode_steps
+         ):
+             self.mark_episode_complete()
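Wiring the pieces together is straightforward. A sketch, under the assumption that `PolicyAgent` (see `runtime/agents/policy_agent.py`) wraps a `BasePolicy` as its only required argument:

    from openpi_client import websocket_client_policy
    from openpi_client.runtime import runtime as _runtime
    from openpi_client.runtime.agents import policy_agent as _policy_agent

    policy = websocket_client_policy.WebsocketClientPolicy(host="localhost", port=8000)

    runtime = _runtime.Runtime(
        environment=DummyEnvironment(),  # any Environment subclass, e.g. the sketch above
        agent=_policy_agent.PolicyAgent(policy=policy),
        subscribers=[],
        max_hz=20,  # cap the control loop at 20 Hz; 0 disables rate limiting
        num_episodes=1,
        max_episode_steps=100,  # 0 means no per-episode step limit
    )
    runtime.run()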
capvector-pi05/packages/openpi-client/src/openpi_client/runtime/subscriber.py ADDED
@@ -0,0 +1,20 @@
+ import abc
+ 
+ 
+ class Subscriber(abc.ABC):
+     """Subscribes to events in the runtime.
+ 
+     Subscribers can be used to save data, visualize, etc.
+     """
+ 
+     @abc.abstractmethod
+     def on_episode_start(self) -> None:
+         """Called when an episode starts."""
+ 
+     @abc.abstractmethod
+     def on_step(self, observation: dict, action: dict) -> None:
+         """Called after each step with that step's observation and action."""
+ 
+     @abc.abstractmethod
+     def on_episode_end(self) -> None:
+         """Called when an episode ends."""
capvector-pi05/packages/openpi-client/src/openpi_client/websocket_client_policy.py ADDED
@@ -0,0 +1,55 @@
+ import logging
+ import time
+ from typing import Dict, Optional, Tuple
+ 
+ from typing_extensions import override
+ import websockets.sync.client
+ 
+ from openpi_client import base_policy as _base_policy
+ from openpi_client import msgpack_numpy
+ 
+ 
+ class WebsocketClientPolicy(_base_policy.BasePolicy):
+     """Implements the Policy interface by communicating with a server over websocket.
+ 
+     See WebsocketPolicyServer for a corresponding server implementation.
+     """
+ 
+     def __init__(self, host: str = "0.0.0.0", port: Optional[int] = None, api_key: Optional[str] = None) -> None:
+         self._uri = f"ws://{host}"
+         if port is not None:
+             self._uri += f":{port}"
+         self._packer = msgpack_numpy.Packer()
+         self._api_key = api_key
+         self._ws, self._server_metadata = self._wait_for_server()
+ 
+     def get_server_metadata(self) -> Dict:
+         return self._server_metadata
+ 
+     def _wait_for_server(self) -> Tuple[websockets.sync.client.ClientConnection, Dict]:
+         logging.info(f"Waiting for server at {self._uri}...")
+         while True:
+             try:
+                 headers = {"Authorization": f"Api-Key {self._api_key}"} if self._api_key else None
+                 conn = websockets.sync.client.connect(
+                     self._uri, compression=None, max_size=None, additional_headers=headers
+                 )
+                 metadata = msgpack_numpy.unpackb(conn.recv())
+                 return conn, metadata
+             except ConnectionRefusedError:
+                 logging.info("Still waiting for server...")
+                 time.sleep(5)
+ 
+     @override
+     def infer(self, obs: Dict) -> Dict:  # noqa: UP006
+         data = self._packer.pack(obs)
+         self._ws.send(data)
+         response = self._ws.recv()
+         if isinstance(response, str):
+             # We're expecting bytes; if the server sends a string, it's an error.
+             raise RuntimeError(f"Error in inference server:\n{response}")
+         return msgpack_numpy.unpackb(response)
+ 
+     @override
+     def reset(self) -> None:
+         pass
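Typical client-side usage looks like the sketch below. The observation keys and shapes are placeholders that depend on the policy being served; `image_tools.resize_with_pad` is the resizing helper shipped in this package:

    import numpy as np

    from openpi_client import image_tools, websocket_client_policy

    # Blocks (retrying every 5 s) until the policy server is reachable.
    client = websocket_client_policy.WebsocketClientPolicy(host="localhost", port=8000)
    print(client.get_server_metadata())

    obs = {
        "observation/image": image_tools.resize_with_pad(np.zeros((480, 640, 3), dtype=np.uint8), 224, 224),
        "observation/state": np.zeros(8, dtype=np.float32),
        "prompt": "pick up the cube",
    }
    result = client.infer(obs)
    print(result["actions"].shape)  # action chunk; shape depends on the model config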
capvector-pi05/scripts/__init__.py ADDED
File without changes
capvector-pi05/scripts/compute_norm_stats.py ADDED
@@ -0,0 +1,117 @@
+ """Compute normalization statistics for a config.
+ 
+ This script is used to compute the normalization statistics for a given config. It
+ will compute the mean and standard deviation of the data in the dataset and save it
+ to the config assets directory.
+ """
+ 
+ import numpy as np
+ import tqdm
+ import tyro
+ 
+ import openpi.models.model as _model
+ import openpi.shared.normalize as normalize
+ import openpi.training.config as _config
+ import openpi.training.data_loader as _data_loader
+ import openpi.transforms as transforms
+ 
+ 
+ class RemoveStrings(transforms.DataTransformFn):
+     def __call__(self, x: dict) -> dict:
+         return {k: v for k, v in x.items() if not np.issubdtype(np.asarray(v).dtype, np.str_)}
+ 
+ 
+ def create_torch_dataloader(
+     data_config: _config.DataConfig,
+     action_horizon: int,
+     batch_size: int,
+     model_config: _model.BaseModelConfig,
+     num_workers: int,
+     max_frames: int | None = None,
+ ) -> tuple[_data_loader.Dataset, int]:
+     if data_config.repo_id is None:
+         raise ValueError("Data config must have a repo_id")
+     dataset = _data_loader.create_torch_dataset(data_config, action_horizon, model_config)
+     dataset = _data_loader.TransformedDataset(
+         dataset,
+         [
+             *data_config.repack_transforms.inputs,
+             *data_config.data_transforms.inputs,
+             # Remove strings since they are not supported by JAX and are not needed to compute norm stats.
+             RemoveStrings(),
+         ],
+     )
+     if max_frames is not None and max_frames < len(dataset):
+         num_batches = max_frames // batch_size
+         shuffle = True
+     else:
+         num_batches = len(dataset) // batch_size
+         shuffle = False
+     data_loader = _data_loader.TorchDataLoader(
+         dataset,
+         local_batch_size=batch_size,
+         num_workers=num_workers,
+         shuffle=shuffle,
+         num_batches=num_batches,
+     )
+     return data_loader, num_batches
+ 
+ 
+ def create_rlds_dataloader(
+     data_config: _config.DataConfig,
+     action_horizon: int,
+     batch_size: int,
+     max_frames: int | None = None,
+ ) -> tuple[_data_loader.Dataset, int]:
+     dataset = _data_loader.create_rlds_dataset(data_config, action_horizon, batch_size, shuffle=False)
+     dataset = _data_loader.IterableTransformedDataset(
+         dataset,
+         [
+             *data_config.repack_transforms.inputs,
+             *data_config.data_transforms.inputs,
+             # Remove strings since they are not supported by JAX and are not needed to compute norm stats.
+             RemoveStrings(),
+         ],
+         is_batched=True,
+     )
+     if max_frames is not None and max_frames < len(dataset):
+         num_batches = max_frames // batch_size
+     else:
+         # NOTE: this length is currently hard-coded for DROID.
+         num_batches = len(dataset) // batch_size
+     data_loader = _data_loader.RLDSDataLoader(
+         dataset,
+         num_batches=num_batches,
+     )
+     return data_loader, num_batches
+ 
+ 
+ def main(config_name: str, max_frames: int | None = None):
+     config = _config.get_config(config_name)
+     data_config = config.data.create(config.assets_dirs, config.model)
+ 
+     if data_config.rlds_data_dir is not None:
+         data_loader, num_batches = create_rlds_dataloader(
+             data_config, config.model.action_horizon, config.batch_size, max_frames
+         )
+     else:
+         data_loader, num_batches = create_torch_dataloader(
+             data_config, config.model.action_horizon, config.batch_size, config.model, config.num_workers, max_frames
+         )
+ 
+     keys = ["state", "actions"]
+     stats = {key: normalize.RunningStats() for key in keys}
+ 
+     for batch in tqdm.tqdm(data_loader, total=num_batches, desc="Computing stats"):
+         for key in keys:
+             stats[key].update(np.asarray(batch[key]))
+ 
+     norm_stats = {key: stat.get_statistics() for key, stat in stats.items()}
+ 
+     output_path = config.assets_dirs / data_config.repo_id
+     print(f"Writing stats to: {output_path}")
+     normalize.save(output_path, norm_stats)
+ 
+ 
+ if __name__ == "__main__":
+     tyro.cli(main)
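The internals of `normalize.RunningStats` are not shown here, but conceptually it is a one-pass accumulator over count, sum, and sum of squares. A minimal sketch of the idea (not the actual implementation):

    import numpy as np


    class SimpleRunningStats:
        """One-pass mean/std accumulator; a sketch of what RunningStats does conceptually."""

        def __init__(self):
            self.count = 0
            self.sum = None
            self.sum_sq = None

        def update(self, batch: np.ndarray) -> None:
            # Flatten everything except the trailing feature dimension.
            x = batch.reshape(-1, batch.shape[-1]).astype(np.float64)
            if self.sum is None:
                self.sum = x.sum(axis=0)
                self.sum_sq = (x**2).sum(axis=0)
            else:
                self.sum += x.sum(axis=0)
                self.sum_sq += (x**2).sum(axis=0)
            self.count += x.shape[0]

        def get_statistics(self) -> dict:
            mean = self.sum / self.count
            # std via E[x^2] - mean^2, clipped at 0 to guard against rounding error.
            std = np.sqrt(np.maximum(self.sum_sq / self.count - mean**2, 0.0))
            return {"mean": mean, "std": std}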
capvector-pi05/scripts/docker/compose.yml ADDED
@@ -0,0 +1,29 @@
+ # Run with:
+ # docker compose -f scripts/docker/compose.yml up --build
+ services:
+   openpi_server:
+     image: openpi_server
+     build:
+       context: ../..
+       dockerfile: scripts/docker/serve_policy.Dockerfile
+     init: true
+     tty: true
+     network_mode: host
+     # Mount the configured openpi data home at /openpi_assets inside the container.
+     # Populate AWS credentials inside the container.
+     volumes:
+       - $PWD:/app
+       - ${OPENPI_DATA_HOME:-~/.cache/openpi}:/openpi_assets
+     environment:
+       - SERVER_ARGS
+       - OPENPI_DATA_HOME=/openpi_assets
+       - IS_DOCKER=true
+ 
+     # Comment out this block if not running on a machine with GPUs.
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               count: 1
+               capabilities: [gpu]
capvector-pi05/scripts/docker/install_docker_ubuntu22.sh ADDED
@@ -0,0 +1,37 @@
+ #!/bin/bash
+ 
+ # Add Docker's official GPG key:
+ sudo apt-get update
+ sudo apt-get install -y ca-certificates curl
+ sudo install -m 0755 -d /etc/apt/keyrings
+ sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+ sudo chmod a+r /etc/apt/keyrings/docker.asc
+ 
+ # Add the repository to Apt sources:
+ echo \
+     "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+     $(. /etc/os-release && echo "$VERSION_CODENAME") stable" |
+     sudo tee /etc/apt/sources.list.d/docker.list >/dev/null
+ sudo apt-get update
+ 
+ sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+ 
+ # Add the current user to the 'docker' group, which allows them to use docker commands (docker build, docker run, etc).
+ # See https://docs.docker.com/engine/install/linux-postinstall/
+ username=$(whoami)
+ sudo usermod -aG docker "$username"
+ 
+ # Configure docker to start automatically on system boot.
+ sudo systemctl enable docker.service
+ sudo systemctl enable containerd.service
+ 
+ # https://forums.docker.com/t/docker-credential-desktop-exe-executable-file-not-found-in-path-using-wsl2/100225/5
+ # Note: the file-existence test needs -f; a bare [ path ] is always true.
+ if [ -f ~/.docker/config.json ]; then
+     sed -i 's/credsStore/credStore/g' ~/.docker/config.json
+ fi
+ 
+ echo ""
+ echo "********************************************************************"
+ echo "**** Restart to allow Docker permission changes to take effect. ****"
+ echo "********************************************************************"
+ echo ""
capvector-pi05/scripts/docker/install_nvidia_container_toolkit.sh ADDED
@@ -0,0 +1,17 @@
+ #!/bin/bash
+ 
+ # Installs the NVIDIA Container Toolkit, which allows Docker containers to access NVIDIA GPUs.
+ # NVIDIA's official documentation: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
+ 
+ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg &&
+     curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list |
+     sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' |
+     sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+ 
+ # NVIDIA's documentation omits 'sudo' in the following command, but it is required.
+ sudo sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
+ sudo apt-get update
+ sudo apt-get install -y nvidia-container-toolkit
+ 
+ sudo nvidia-ctk runtime configure --runtime=docker
+ sudo systemctl restart docker
capvector-pi05/scripts/docker/serve_policy.Dockerfile ADDED
@@ -0,0 +1,38 @@
+ # Dockerfile for serving a PI policy.
+ # Based on UV's instructions: https://docs.astral.sh/uv/guides/integration/docker/#developing-in-a-container
+ 
+ # Build the container:
+ # docker build . -t openpi_server -f scripts/docker/serve_policy.Dockerfile
+ 
+ # Run the container:
+ # docker run --rm -it --network=host -v .:/app --gpus=all openpi_server /bin/bash
+ 
+ FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04@sha256:2d913b09e6be8387e1a10976933642c73c840c0b735f0bf3c28d97fc9bc422e0
+ COPY --from=ghcr.io/astral-sh/uv:0.5.1 /uv /uvx /bin/
+ 
+ WORKDIR /app
+ 
+ # Needed because LeRobot uses git-lfs.
+ RUN apt-get update && apt-get install -y git git-lfs linux-headers-generic build-essential clang
+ 
+ # Copy from the cache instead of linking since it's a mounted volume.
+ ENV UV_LINK_MODE=copy
+ 
+ # Write the virtual environment outside of the project directory so it doesn't
+ # leak out of the container when we mount the application code.
+ ENV UV_PROJECT_ENVIRONMENT=/.venv
+ 
+ # Install the project's dependencies using the lockfile and settings.
+ RUN uv venv --python 3.11.9 $UV_PROJECT_ENVIRONMENT
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     --mount=type=bind,source=uv.lock,target=uv.lock \
+     --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+     --mount=type=bind,source=packages/openpi-client/pyproject.toml,target=packages/openpi-client/pyproject.toml \
+     --mount=type=bind,source=packages/openpi-client/src,target=packages/openpi-client/src \
+     GIT_LFS_SKIP_SMUDGE=1 uv sync --frozen --no-install-project --no-dev
+ 
+ # Copy transformers_replace files while preserving the directory structure.
+ COPY src/openpi/models_pytorch/transformers_replace/ /tmp/transformers_replace/
+ RUN /.venv/bin/python -c "import transformers; print(transformers.__file__)" | xargs dirname | xargs -I{} cp -r /tmp/transformers_replace/* {} && rm -rf /tmp/transformers_replace
+ 
+ CMD /bin/bash -c "uv run scripts/serve_policy.py $SERVER_ARGS"
capvector-pi05/scripts/serve_policy.py ADDED
@@ -0,0 +1,122 @@
+ import dataclasses
+ import enum
+ import logging
+ import socket
+ 
+ import tyro
+ 
+ from openpi.policies import policy as _policy
+ from openpi.policies import policy_config as _policy_config
+ from openpi.serving import websocket_policy_server
+ from openpi.training import config as _config
+ 
+ 
+ class EnvMode(enum.Enum):
+     """Supported environments."""
+ 
+     ALOHA = "aloha"
+     ALOHA_SIM = "aloha_sim"
+     DROID = "droid"
+     LIBERO = "libero"
+ 
+ 
+ @dataclasses.dataclass
+ class Checkpoint:
+     """Load a policy from a trained checkpoint."""
+ 
+     # Training config name (e.g., "pi0_aloha_sim").
+     config: str
+     # Checkpoint directory (e.g., "checkpoints/pi0_aloha_sim/exp/10000").
+     dir: str
+ 
+ 
+ @dataclasses.dataclass
+ class Default:
+     """Use the default policy for the given environment."""
+ 
+ 
+ @dataclasses.dataclass
+ class Args:
+     """Arguments for the serve_policy script."""
+ 
+     # Environment to serve the policy for. This is only used when serving default policies.
+     env: EnvMode = EnvMode.ALOHA_SIM
+ 
+     # If provided, will be used in case the "prompt" key is not present in the data, or if the model doesn't have a
+     # default prompt.
+     default_prompt: str | None = None
+ 
+     # Port to serve the policy on.
+     port: int = 8000
+     # Record the policy's behavior for debugging.
+     record: bool = False
+ 
+     # Specifies how to load the policy. If not provided, the default policy for the environment will be used.
+     policy: Checkpoint | Default = dataclasses.field(default_factory=Default)
+ 
+ 
+ # Default checkpoints that should be used for each environment.
+ DEFAULT_CHECKPOINT: dict[EnvMode, Checkpoint] = {
+     EnvMode.ALOHA: Checkpoint(
+         config="pi05_aloha",
+         dir="gs://openpi-assets/checkpoints/pi05_base",
+     ),
+     EnvMode.ALOHA_SIM: Checkpoint(
+         config="pi0_aloha_sim",
+         dir="gs://openpi-assets/checkpoints/pi0_aloha_sim",
+     ),
+     EnvMode.DROID: Checkpoint(
+         config="pi05_droid",
+         dir="gs://openpi-assets/checkpoints/pi05_droid",
+     ),
+     EnvMode.LIBERO: Checkpoint(
+         config="pi05_libero",
+         dir="gs://openpi-assets/checkpoints/pi05_libero",
+     ),
+ }
+ 
+ 
+ def create_default_policy(env: EnvMode, *, default_prompt: str | None = None) -> _policy.Policy:
+     """Create a default policy for the given environment."""
+     if checkpoint := DEFAULT_CHECKPOINT.get(env):
+         return _policy_config.create_trained_policy(
+             _config.get_config(checkpoint.config), checkpoint.dir, default_prompt=default_prompt
+         )
+     raise ValueError(f"Unsupported environment mode: {env}")
+ 
+ 
+ def create_policy(args: Args) -> _policy.Policy:
+     """Create a policy from the given arguments."""
+     match args.policy:
+         case Checkpoint():
+             return _policy_config.create_trained_policy(
+                 _config.get_config(args.policy.config), args.policy.dir, default_prompt=args.default_prompt
+             )
+         case Default():
+             return create_default_policy(args.env, default_prompt=args.default_prompt)
+ 
+ 
+ def main(args: Args) -> None:
+     policy = create_policy(args)
+     policy_metadata = policy.metadata
+ 
+     # Record the policy's behavior.
+     if args.record:
+         policy = _policy.PolicyRecorder(policy, "policy_records")
+ 
+     hostname = socket.gethostname()
+     local_ip = socket.gethostbyname(hostname)
+     logging.info("Creating server (host: %s, ip: %s)", hostname, local_ip)
+ 
+     server = websocket_policy_server.WebsocketPolicyServer(
+         policy=policy,
+         host="0.0.0.0",
+         port=args.port,
+         metadata=policy_metadata,
+     )
+     server.serve_forever()
+ 
+ 
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO, force=True)
+     main(tyro.cli(Args))
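Besides the `tyro` CLI, the same entrypoint can be driven programmatically. A sketch (the config name and checkpoint directory are placeholders; any trained config/checkpoint pair works):

    # Run from the repository root with the scripts/ directory on sys.path.
    import serve_policy

    args = serve_policy.Args(
        port=8000,
        policy=serve_policy.Checkpoint(
            config="pi05_libero",
            dir="gs://openpi-assets/checkpoints/pi05_libero",
        ),
    )
    serve_policy.main(args)  # serves until interrupted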
capvector-pi05/scripts/train.py ADDED
@@ -0,0 +1,280 @@
+ import dataclasses
+ import functools
+ import logging
+ import platform
+ from typing import Any
+ 
+ import etils.epath as epath
+ import flax.nnx as nnx
+ from flax.training import common_utils
+ import flax.traverse_util as traverse_util
+ import jax
+ import jax.experimental
+ import jax.numpy as jnp
+ import numpy as np
+ import optax
+ import tqdm_loggable.auto as tqdm
+ import wandb
+ 
+ import openpi.models.model as _model
+ import openpi.shared.array_typing as at
+ import openpi.shared.nnx_utils as nnx_utils
+ import openpi.training.checkpoints as _checkpoints
+ import openpi.training.config as _config
+ import openpi.training.data_loader as _data_loader
+ import openpi.training.optimizer as _optimizer
+ import openpi.training.sharding as sharding
+ import openpi.training.utils as training_utils
+ import openpi.training.weight_loaders as _weight_loaders
+ 
+ 
+ def init_logging():
+     """Custom logging format for better readability."""
+     level_mapping = {"DEBUG": "D", "INFO": "I", "WARNING": "W", "ERROR": "E", "CRITICAL": "C"}
+ 
+     class CustomFormatter(logging.Formatter):
+         def format(self, record):
+             record.levelname = level_mapping.get(record.levelname, record.levelname)
+             return super().format(record)
+ 
+     formatter = CustomFormatter(
+         fmt="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)-80s (%(process)d:%(filename)s:%(lineno)s)",
+         datefmt="%H:%M:%S",
+     )
+ 
+     logger = logging.getLogger()
+     logger.setLevel(logging.INFO)
+     logger.handlers[0].setFormatter(formatter)
+ 
+ 
+ def init_wandb(config: _config.TrainConfig, *, resuming: bool, log_code: bool = False, enabled: bool = True):
+     if not enabled:
+         wandb.init(mode="disabled")
+         return
+ 
+     ckpt_dir = config.checkpoint_dir
+     if not ckpt_dir.exists():
+         raise FileNotFoundError(f"Checkpoint directory {ckpt_dir} does not exist.")
+     if resuming:
+         run_id = (ckpt_dir / "wandb_id.txt").read_text().strip()
+         wandb.init(id=run_id, resume="must", project=config.project_name)
+     else:
+         wandb.init(
+             name=config.exp_name,
+             config=dataclasses.asdict(config),
+             project=config.project_name,
+         )
+         (ckpt_dir / "wandb_id.txt").write_text(wandb.run.id)
+ 
+     if log_code:
+         wandb.run.log_code(epath.Path(__file__).parent.parent)
+ 
+ 
+ def _load_weights_and_validate(loader: _weight_loaders.WeightLoader, params_shape: at.Params) -> at.Params:
+     """Loads and validates the weights. Returns a loaded subset of the weights."""
+     loaded_params = loader.load(params_shape)
+     at.check_pytree_equality(expected=params_shape, got=loaded_params, check_shapes=True, check_dtypes=True)
+ 
+     # Remove jax.ShapeDtypeStruct from the loaded params. This makes sure that only the loaded params are returned.
+     return traverse_util.unflatten_dict(
+         {k: v for k, v in traverse_util.flatten_dict(loaded_params).items() if not isinstance(v, jax.ShapeDtypeStruct)}
+     )
+ 
+ 
+ @at.typecheck
+ def init_train_state(
+     config: _config.TrainConfig, init_rng: at.KeyArrayLike, mesh: jax.sharding.Mesh, *, resume: bool
+ ) -> tuple[training_utils.TrainState, Any]:
+     tx = _optimizer.create_optimizer(config.optimizer, config.lr_schedule, weight_decay_mask=None)
+ 
+     def init(rng: at.KeyArrayLike, partial_params: at.Params | None = None) -> training_utils.TrainState:
+         rng, model_rng = jax.random.split(rng)
+         # Initialize the model (and its parameters).
+         model = config.model.create(model_rng)
+ 
+         # Merge the partial params into the model.
+         if partial_params is not None:
+             graphdef, state = nnx.split(model)
+             # This will produce an error if the partial params are not a subset of the state.
+             state.replace_by_pure_dict(partial_params)
+             model = nnx.merge(graphdef, state)
+ 
+         params = nnx.state(model)
+         # Convert frozen params to bfloat16.
+         params = nnx_utils.state_map(params, config.freeze_filter, lambda p: p.replace(p.value.astype(jnp.bfloat16)))
+ 
+         return training_utils.TrainState(
+             step=0,
+             params=params,
+             model_def=nnx.graphdef(model),
+             tx=tx,
+             opt_state=tx.init(params.filter(config.trainable_filter)),
+             ema_decay=config.ema_decay,
+             ema_params=None if config.ema_decay is None else params,
+         )
+ 
+     train_state_shape = jax.eval_shape(init, init_rng)
+     state_sharding = sharding.fsdp_sharding(train_state_shape, mesh, log=True)
+ 
+     if resume:
+         return train_state_shape, state_sharding
+ 
+     partial_params = _load_weights_and_validate(config.weight_loader, train_state_shape.params.to_pure_dict())
+     replicated_sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec())
+ 
+     # Initialize the train state and mix in the partial params.
+     train_state = jax.jit(
+         init,
+         donate_argnums=(1,),  # donate the partial params buffer.
+         in_shardings=replicated_sharding,
+         out_shardings=state_sharding,
+     )(init_rng, partial_params)
+ 
+     return train_state, state_sharding
+ 
+ 
+ @at.typecheck
+ def train_step(
+     config: _config.TrainConfig,
+     rng: at.KeyArrayLike,
+     state: training_utils.TrainState,
+     batch: tuple[_model.Observation, _model.Actions],
+ ) -> tuple[training_utils.TrainState, dict[str, at.Array]]:
+     model = nnx.merge(state.model_def, state.params)
+     model.train()
+ 
+     @at.typecheck
+     def loss_fn(
+         model: _model.BaseModel, rng: at.KeyArrayLike, observation: _model.Observation, actions: _model.Actions
+     ):
+         chunked_loss = model.compute_loss(rng, observation, actions, train=True)
+         return jnp.mean(chunked_loss)
+ 
+     train_rng = jax.random.fold_in(rng, state.step)
+     observation, actions = batch
+ 
+     # Filter out frozen params.
+     diff_state = nnx.DiffState(0, config.trainable_filter)
+     loss, grads = nnx.value_and_grad(loss_fn, argnums=diff_state)(model, train_rng, observation, actions)
+ 
+     params = state.params.filter(config.trainable_filter)
+     updates, new_opt_state = state.tx.update(grads, state.opt_state, params)
+     new_params = optax.apply_updates(params, updates)
+ 
+     # Update the model in place and return the new full state.
+     nnx.update(model, new_params)
+     new_params = nnx.state(model)
+ 
+     new_state = dataclasses.replace(state, step=state.step + 1, params=new_params, opt_state=new_opt_state)
+     if state.ema_decay is not None:
+         new_state = dataclasses.replace(
+             new_state,
+             ema_params=jax.tree.map(
+                 lambda old, new: state.ema_decay * old + (1 - state.ema_decay) * new, state.ema_params, new_params
+             ),
+         )
+ 
+     # Filter out params that aren't kernels.
+     kernel_params = nnx.state(
+         model,
+         nnx.All(
+             nnx.Param,
+             nnx.Not(nnx_utils.PathRegex(".*/(bias|scale|pos_embedding|input_embedding)")),
+             lambda _, x: x.value.ndim > 1,
+         ),
+     )
+     info = {
+         "loss": loss,
+         "grad_norm": optax.global_norm(grads),
+         "param_norm": optax.global_norm(kernel_params),
+     }
+     return new_state, info
+ 
+ 
+ def main(config: _config.TrainConfig):
+     init_logging()
+     logging.info(f"Running on: {platform.node()}")
+ 
+     if config.batch_size % jax.device_count() != 0:
+         raise ValueError(
+             f"Batch size {config.batch_size} must be divisible by the number of devices {jax.device_count()}."
+         )
+ 
+     jax.config.update("jax_compilation_cache_dir", str(epath.Path("~/.cache/jax").expanduser()))
+ 
+     rng = jax.random.key(config.seed)
+     train_rng, init_rng = jax.random.split(rng)
+ 
+     mesh = sharding.make_mesh(config.fsdp_devices)
+     data_sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec(sharding.DATA_AXIS))
+     replicated_sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec())
+ 
+     checkpoint_manager, resuming = _checkpoints.initialize_checkpoint_dir(
+         config.checkpoint_dir,
+         keep_period=config.keep_period,
+         overwrite=config.overwrite,
+         resume=config.resume,
+     )
+     init_wandb(config, resuming=resuming, enabled=config.wandb_enabled)
+ 
+     data_loader = _data_loader.create_data_loader(
+         config,
+         sharding=data_sharding,
+         shuffle=True,
+     )
+     data_iter = iter(data_loader)
+     batch = next(data_iter)
+     logging.info(f"Initialized data loader:\n{training_utils.array_tree_to_info(batch)}")
+ 
+     # Log images from the first batch to sanity check.
+     images_to_log = [
+         wandb.Image(np.concatenate([np.array(img[i]) for img in batch[0].images.values()], axis=1))
+         for i in range(min(5, len(next(iter(batch[0].images.values())))))
+     ]
+     wandb.log({"camera_views": images_to_log}, step=0)
+ 
+     train_state, train_state_sharding = init_train_state(config, init_rng, mesh, resume=resuming)
+     jax.block_until_ready(train_state)
+     logging.info(f"Initialized train state:\n{training_utils.array_tree_to_info(train_state.params)}")
+ 
+     if resuming:
+         train_state = _checkpoints.restore_state(checkpoint_manager, train_state, data_loader)
+ 
+     ptrain_step = jax.jit(
+         functools.partial(train_step, config),
+         in_shardings=(replicated_sharding, train_state_sharding, data_sharding),
+         out_shardings=(train_state_sharding, replicated_sharding),
+         donate_argnums=(1,),
+     )
+ 
+     start_step = int(train_state.step)
+     pbar = tqdm.tqdm(
+         range(start_step, config.num_train_steps),
+         initial=start_step,
+         total=config.num_train_steps,
+         dynamic_ncols=True,
+     )
+ 
+     infos = []
+     for step in pbar:
+         with sharding.set_mesh(mesh):
+             train_state, info = ptrain_step(train_rng, train_state, batch)
+         infos.append(info)
+         if step % config.log_interval == 0:
+             stacked_infos = common_utils.stack_forest(infos)
+             reduced_info = jax.device_get(jax.tree.map(jnp.mean, stacked_infos))
+             info_str = ", ".join(f"{k}={v:.4f}" for k, v in reduced_info.items())
+             pbar.write(f"Step {step}: {info_str}")
+             wandb.log(reduced_info, step=step)
+             infos = []
+         batch = next(data_iter)
+ 
+         if (step % config.save_interval == 0 and step > start_step) or step == config.num_train_steps - 1:
+             _checkpoints.save_state(checkpoint_manager, train_state, data_loader, step)
+ 
+     logging.info("Waiting for checkpoint manager to finish")
+     checkpoint_manager.wait_until_finished()
+ 
+ 
+ if __name__ == "__main__":
+     main(_config.cli())
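When `ema_decay` is set, `train_step` keeps an exponential moving average of the parameters via `jax.tree.map`; numerically the update is `ema = decay * ema + (1 - decay) * new`. A standalone numpy sketch of the same arithmetic:

    import numpy as np

    decay = 0.99
    ema = np.array([1.0, 1.0])  # running average of the parameters
    for step in range(3):
        new_params = np.array([0.0, 2.0])  # stand-in for the post-update parameters
        ema = decay * ema + (1 - decay) * new_params
        print(step, ema)
    # Each step moves the EMA 1% of the way toward the new parameters,
    # so it tracks training smoothly while damping step-to-step noise.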
capvector-pi05/scripts/train_align_pytorch.py ADDED
@@ -0,0 +1,658 @@
+ """
+ PyTorch training entrypoint for PI0/PI05 with multi-GPU and multi-node (DDP) support.
+ This script mirrors the behavior of the JAX trainer (`scripts/train.py`) but runs
+ entirely in PyTorch using the `PI0Pytorch` model and your existing config/data
+ pipeline from `src/openpi/training/config.py` and `src/openpi/training/data_loader.py`.
+ 
+ Usage
+     Single GPU:
+         python scripts/train_align_pytorch.py <config_name> --exp_name <run_name> --save_interval <interval>
+     Example:
+         python scripts/train_align_pytorch.py debug --exp_name pytorch_ddp_test
+         python scripts/train_align_pytorch.py debug --exp_name pytorch_ddp_test --resume  # Resume from latest checkpoint
+     Multi-GPU (single node):
+         torchrun --standalone --nnodes=1 --nproc_per_node=<num_gpus> scripts/train_align_pytorch.py <config_name> --exp_name <run_name>
+     Example:
+         torchrun --standalone --nnodes=1 --nproc_per_node=2 scripts/train_align_pytorch.py pi0_aloha_sim --exp_name pytorch_ddp_test
+         torchrun --standalone --nnodes=1 --nproc_per_node=2 scripts/train_align_pytorch.py pi0_aloha_sim --exp_name pytorch_ddp_test --resume
+     Multi-Node Training:
+         torchrun \
+             --nnodes=<num_nodes> --nproc_per_node=<gpus_per_node> --node_rank=<rank_of_node> \
+             --master_addr=<master_ip> --master_port=<port> \
+             scripts/train_align_pytorch.py <config_name> --exp_name=<run_name> --save_interval <interval>
+ """
+ 
+ import dataclasses
+ import gc
+ import logging
+ import os
+ import platform
+ import shutil
+ import time
+ 
+ import jax
+ import numpy as np
+ import safetensors.torch
+ import torch
+ import torch.distributed as dist
+ import torch.nn.parallel
+ import tqdm
+ import wandb
+ 
+ import openpi.models.pi0_config
+ from openpi.models_pytorch import pi0_align_pytorch, pi0_pytorch, projectors
+ import openpi.shared.normalize as _normalize
+ import openpi.training.config as _config
+ import openpi.training.data_loader as _data
+ 
+ from vggt.models.vggt import VGGT
+ 
+ 
+ def init_logging():
+     level_mapping = {"DEBUG": "D", "INFO": "I", "WARNING": "W", "ERROR": "E", "CRITICAL": "C"}
+ 
+     class CustomFormatter(logging.Formatter):
+         def format(self, record):
+             record.levelname = level_mapping.get(record.levelname, record.levelname)
+             return super().format(record)
+ 
+     formatter = CustomFormatter(
+         fmt="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)-80s (%(process)d:%(filename)s:%(lineno)s)",
+         datefmt="%H:%M:%S",
+     )
+     logger = logging.getLogger()
+     logger.setLevel(logging.INFO)
+     if not logger.handlers:
+         ch = logging.StreamHandler()
+         ch.setFormatter(formatter)
+         logger.addHandler(ch)
+     else:
+         logger.handlers[0].setFormatter(formatter)
+ 
+ 
+ def init_wandb(config: _config.TrainConfig, *, resuming: bool, enabled: bool = True):
+     """Initialize wandb logging."""
+     if not enabled:
+         wandb.init(mode="disabled")
+         return
+ 
+     ckpt_dir = config.checkpoint_dir
+     if not ckpt_dir.exists():
+         raise FileNotFoundError(f"Checkpoint directory {ckpt_dir} does not exist.")
+ 
+     if resuming:
+         run_id = (ckpt_dir / "wandb_id.txt").read_text().strip()
+         wandb.init(id=run_id, resume="must", project=config.project_name)
+     else:
+         wandb.init(
+             name=config.exp_name,
+             config=dataclasses.asdict(config),
+             project=config.project_name,
+         )
+         (ckpt_dir / "wandb_id.txt").write_text(wandb.run.id)
+ 
+ 
+ def setup_ddp():
+     world_size = int(os.environ.get("WORLD_SIZE", "1"))
+     use_ddp = world_size > 1
+     if use_ddp and not torch.distributed.is_initialized():
+         backend = "nccl" if torch.cuda.is_available() else "gloo"
+         torch.distributed.init_process_group(backend=backend, init_method="env://")
+ 
+     # Set up debugging environment variables for DDP issues.
+     if os.environ.get("TORCH_DISTRIBUTED_DEBUG") is None:
+         os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"
+ 
+     local_rank = int(os.environ.get("LOCAL_RANK", os.environ.get("RANK", "0")))
+     device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu")
+     if torch.cuda.is_available():
+         torch.cuda.set_device(device)
+     return use_ddp, local_rank, device
+ 
+ 
+ def cleanup_ddp():
+     if torch.distributed.is_initialized():
+         torch.distributed.barrier()
+         torch.distributed.destroy_process_group()
+ 
+ 
+ def set_seed(seed: int, local_rank: int):
+     torch.manual_seed(seed + local_rank)
+     np.random.seed(seed + local_rank)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed + local_rank)
+ 
+ 
+ def build_datasets(config: _config.TrainConfig):
+     # Use the unified data loader with the PyTorch framework.
+     data_loader = _data.create_data_loader(config, framework="pytorch", shuffle=True)
+     return data_loader, data_loader.data_config()
+ 
+ 
+ def get_model_state_dict(model):
+     """Get the state dict from a model, handling the DDP wrapper."""
+     return (
+         model.module.state_dict()
+         if isinstance(model, torch.nn.parallel.DistributedDataParallel)
+         else model.state_dict()
+     )
+ 
+ 
+ def get_model_parameters(model):
+     """Get the parameters from a model, handling the DDP wrapper."""
+     return (
+         model.module.parameters()
+         if isinstance(model, torch.nn.parallel.DistributedDataParallel)
+         else model.parameters()
+     )
+ 
+ 
+ def save_checkpoint(model, optimizer, global_step, config, is_main, data_config):
+     """Save a checkpoint with model state, optimizer state, and metadata."""
+     if not is_main:
+         return
+ 
+     # Only save if it's time to save or if it's the final step.
+     if (global_step % config.save_interval == 0 and global_step > 0) or global_step == config.num_train_steps - 1:
+         # Create a temporary directory for atomic checkpoint saving.
+         final_ckpt_dir = config.checkpoint_dir / f"{global_step}"
+         tmp_ckpt_dir = config.checkpoint_dir / f"tmp_{global_step}"
+ 
+         # Remove any existing temp directory and create a new one.
+         if tmp_ckpt_dir.exists():
+             shutil.rmtree(tmp_ckpt_dir)
+         tmp_ckpt_dir.mkdir(parents=True, exist_ok=True)
+ 
+         # Save model state using safetensors (handles shared tensors).
+         model_to_save = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
+         safetensors.torch.save_model(model_to_save, tmp_ckpt_dir / "model.safetensors")
+ 
+         # Save optimizer state using the PyTorch format.
+         torch.save(optimizer.state_dict(), tmp_ckpt_dir / "optimizer.pt")
+ 
+         # Save training metadata; the config is stored as a plain dict to avoid JAX/Flax pickling issues.
+         metadata = {
+             "global_step": global_step,
+             "config": dataclasses.asdict(config),
+             "timestamp": time.time(),
+         }
+         torch.save(metadata, tmp_ckpt_dir / "metadata.pt")
+ 
+         # Save norm stats.
+         norm_stats = data_config.norm_stats
+         if norm_stats is not None and data_config.asset_id is not None:
+             _normalize.save(tmp_ckpt_dir / "assets" / data_config.asset_id, norm_stats)
+ 
+         # Atomically move the temp directory to the final location.
+         if final_ckpt_dir.exists():
+             shutil.rmtree(final_ckpt_dir)
+         tmp_ckpt_dir.rename(final_ckpt_dir)
+ 
+         logging.info(f"Saved checkpoint at step {global_step} -> {final_ckpt_dir}")
+ 
+         # Log the checkpoint to wandb.
+         if config.wandb_enabled:
+             wandb.log({"checkpoint_step": global_step}, step=global_step)
+ 
+ 
+ def load_checkpoint(model, optimizer, checkpoint_dir, device):
+     """Load the latest checkpoint and return the global step."""
+     checkpoint_steps = [
+         int(d.name)
+         for d in checkpoint_dir.iterdir()
+         if d.is_dir() and d.name.isdigit() and not d.name.startswith("tmp_")
+     ]
+ 
+     if not checkpoint_steps:
+         raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
+ 
+     latest_step = max(checkpoint_steps)
+     ckpt_dir = checkpoint_dir / f"{latest_step}"
+ 
+     # Clear memory before loading checkpoints.
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+     gc.collect()
+     log_memory_usage(device, latest_step, "before_loading_checkpoint")
+ 
+     try:
+         # Load model state with error handling.
+         logging.info("Loading model state...")
+         safetensors_path = ckpt_dir / "model.safetensors"
+ 
+         if safetensors_path.exists():
+             model_to_load = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
+             safetensors.torch.load_model(model_to_load, safetensors_path, device=str(device))
+             logging.info("Loaded model state from safetensors format")
+         else:
+             raise FileNotFoundError(f"No model checkpoint found at {ckpt_dir}")
+ 
+         torch.cuda.empty_cache()
+         gc.collect()
+         log_memory_usage(device, latest_step, "after_loading_model")
+ 
+         # Load optimizer state with error handling.
+         logging.info("Loading optimizer state...")
+         optimizer_path = ckpt_dir / "optimizer.pt"
+ 
+         if optimizer_path.exists():
+             optimizer_state_dict = torch.load(optimizer_path, map_location=device, weights_only=False)
+             logging.info("Loaded optimizer state from pt format")
+         else:
+             raise FileNotFoundError(f"No optimizer checkpoint found at {ckpt_dir}")
+ 
+         optimizer.load_state_dict(optimizer_state_dict)
+         del optimizer_state_dict
+         torch.cuda.empty_cache()
+         gc.collect()
+         log_memory_usage(device, latest_step, "after_loading_optimizer")
+ 
+         # Load metadata.
+         logging.info("Loading metadata...")
+         metadata = torch.load(ckpt_dir / "metadata.pt", map_location=device, weights_only=False)
+         global_step = metadata.get("global_step", latest_step)
+         del metadata
+         torch.cuda.empty_cache()
+         gc.collect()
+         log_memory_usage(device, latest_step, "after_loading_metadata")
+ 
+         logging.info(f"Successfully loaded all checkpoint components from step {latest_step}")
+         return global_step
+ 
+     except RuntimeError as e:
+         if "out of memory" in str(e):
+             # Clear memory and provide a detailed error message.
+             torch.cuda.empty_cache()
+             gc.collect()
+             logging.error(f"Out of memory error while loading checkpoint: {e!s}")
+             log_memory_usage(device, latest_step, "after_oom_error")
+             raise RuntimeError(
+                 "Out of memory while loading checkpoint. Try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
+             ) from e
+         raise
+ 
+ 
+ def get_latest_checkpoint_step(checkpoint_dir):
+     """Get the latest checkpoint step number from a checkpoint directory."""
+     checkpoint_steps = [
+         int(d.name)
+         for d in checkpoint_dir.iterdir()
+         if d.is_dir() and d.name.isdigit() and not d.name.startswith("tmp_")
+     ]
+     return max(checkpoint_steps) if checkpoint_steps else None
+ 
+ 
+ def log_memory_usage(device, step, phase="unknown"):
+     """Log detailed memory usage information."""
+     if not torch.cuda.is_available():
+         return
+ 
+     memory_allocated = torch.cuda.memory_allocated(device) / 1e9
+     memory_reserved = torch.cuda.memory_reserved(device) / 1e9
+     # "Free" here means reserved-but-unallocated memory within the CUDA caching allocator.
+     memory_free = (torch.cuda.memory_reserved(device) - torch.cuda.memory_allocated(device)) / 1e9
+ 
+     # Get more detailed memory info.
+     memory_stats = torch.cuda.memory_stats(device)
+     max_memory_allocated = memory_stats.get("allocated_bytes.all.peak", 0) / 1e9
+     max_memory_reserved = memory_stats.get("reserved_bytes.all.peak", 0) / 1e9
+ 
+     # Get DDP info if available.
+     ddp_info = ""
+     if dist.is_initialized():
+         ddp_info = f" | DDP: rank={dist.get_rank()}, world_size={dist.get_world_size()}"
+ 
+     logging.info(
+         f"Step {step} ({phase}): GPU memory - allocated: {memory_allocated:.2f}GB, reserved: {memory_reserved:.2f}GB, free: {memory_free:.2f}GB, peak_allocated: {max_memory_allocated:.2f}GB, peak_reserved: {max_memory_reserved:.2f}GB{ddp_info}"
+     )
+ 
+ 
+ def train_loop(config: _config.TrainConfig):
+     use_ddp, local_rank, device = setup_ddp()
+     is_main = (not use_ddp) or (dist.get_rank() == 0)
+     set_seed(config.seed, local_rank)
+ 
+     # Initialize the checkpoint directory and wandb.
+     resuming = False
+     if config.resume:
+         # Find the checkpoint directory based on the experiment name.
+         exp_checkpoint_dir = config.checkpoint_dir
+         if exp_checkpoint_dir.exists():
+             # Use validation to find the latest working checkpoint.
+             latest_step = get_latest_checkpoint_step(exp_checkpoint_dir)
+             if latest_step is not None:
+                 resuming = True
+                 logging.info(
+                     f"Resuming from experiment checkpoint directory: {exp_checkpoint_dir} at step {latest_step}"
+                 )
+             else:
+                 raise FileNotFoundError(f"No valid checkpoints found in {exp_checkpoint_dir} for resume")
+         else:
+             raise FileNotFoundError(f"Experiment checkpoint directory {exp_checkpoint_dir} does not exist for resume")
+     elif config.overwrite and config.checkpoint_dir.exists():
+         shutil.rmtree(config.checkpoint_dir)
+         logging.info(f"Overwriting checkpoint directory: {config.checkpoint_dir}")
+ 
+     # Create the checkpoint directory with the experiment name.
+     if not resuming:
+         # For new runs, create an experiment-specific checkpoint directory.
+         exp_checkpoint_dir = config.checkpoint_dir
+         exp_checkpoint_dir.mkdir(parents=True, exist_ok=True)
+         logging.info(f"Created experiment checkpoint directory: {exp_checkpoint_dir}")
+     else:
+         # For resume, checkpoint_dir is already set to the experiment directory.
+         logging.info(f"Using existing experiment checkpoint directory: {config.checkpoint_dir}")
+ 
+     # Initialize wandb (only on the main process).
+     if is_main:
+         init_wandb(config, resuming=resuming, enabled=config.wandb_enabled)
+ 
+     # Build the data loader using the unified data loader.
+     # Calculate the effective batch size per GPU for DDP:
+     # for N GPUs, each GPU should get batch_size/N samples, so the total across all GPUs is batch_size.
+     world_size = torch.distributed.get_world_size() if use_ddp else 1
+     effective_batch_size = config.batch_size // world_size
+     logging.info(
+         f"Using batch size per GPU: {effective_batch_size} (total batch size across {world_size} GPUs: {config.batch_size})"
+     )
+ 
+     # Pass the original batch size to the data loader - it will handle DDP splitting internally.
+     loader, data_config = build_datasets(config)
+ 
+     # Log sample images to wandb on the first batch.
+     if is_main and config.wandb_enabled and not resuming:
+         # Create a separate data loader for the sample batch to avoid consuming the main loader.
+         sample_data_loader = _data.create_data_loader(config, framework="pytorch", shuffle=False)
+         sample_batch = next(iter(sample_data_loader))
+         # Convert observation and actions to torch tensors.
+         observation, actions = sample_batch
+         sample_batch = observation.to_dict()
+         sample_batch["actions"] = actions
+ 
+         # Create sample images for wandb.
+         images_to_log = []
+         # Get the batch size from the first image tensor.
+         batch_size = next(iter(sample_batch["image"].values())).shape[0]
+         for i in range(min(5, batch_size)):
+             # Concatenate all camera views horizontally for this batch item.
+             # Convert from NCHW to NHWC format for wandb.
+             img_concatenated = torch.cat([img[i].permute(1, 2, 0) for img in sample_batch["image"].values()], axis=1)
+             img_concatenated = img_concatenated.cpu().numpy()
+             images_to_log.append(wandb.Image(img_concatenated))
+ 
+         wandb.log({"camera_views": images_to_log}, step=0)
+ 
+         # Clear the sample batch from memory aggressively.
+         del sample_batch, observation, actions, images_to_log, img_concatenated
+         del sample_data_loader  # Also delete the sample data loader.
+         gc.collect()
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         logging.info("Cleared sample batch and data loader from memory")
+ 
+     # Build the model.
+     if not isinstance(config.model, openpi.models.pi0_config.Pi0Config):
+         # Convert the dataclass to a Pi0Config if needed.
+         model_cfg = openpi.models.pi0_config.Pi0Config(
+             dtype=config.pytorch_training_precision,
+             action_dim=config.model.action_dim,
+             action_horizon=config.model.action_horizon,
+             max_token_len=config.model.max_token_len,
+             paligemma_variant=getattr(config.model, "paligemma_variant", "gemma_2b"),
+             action_expert_variant=getattr(config.model, "action_expert_variant", "gemma_300m"),
+             pi05=getattr(config.model, "pi05", False),
+         )
+     else:
+         model_cfg = config.model
+         # Update dtype to match pytorch_training_precision.
+         object.__setattr__(model_cfg, "dtype", config.pytorch_training_precision)
+ 
+     model = pi0_align_pytorch.PI0Pytorch(model_cfg, config).to(device)
+     vggt_model = VGGT(
+         enable_camera=False,
+         enable_point=False,
+         enable_depth=False,
+         enable_track=False,
+         feature_only=True,
+     ).to(device)
+     align_projector = projectors.AlignProjector(
+         model.LLM_width,
+         config.vggt_dim,
+         config.use_vlm_norm,
+     ).to(device)
+ 
+     if hasattr(model, "gradient_checkpointing_enable"):
+         enable_gradient_checkpointing = True
+         model.gradient_checkpointing_enable()
+         logging.info("Enabled gradient checkpointing for memory optimization")
+     else:
+         enable_gradient_checkpointing = False
+         logging.info("Gradient checkpointing is not supported for this model")
+ 
+     # Log initial memory usage after model creation.
+     if is_main and torch.cuda.is_available():
+         log_memory_usage(device, 0, "after_model_creation")
+ 
+     # Enable memory optimizations for large-scale training.
+     if world_size >= 8:
+         torch.backends.cudnn.benchmark = True
+         torch.backends.cuda.matmul.allow_tf32 = True
+         torch.backends.cudnn.allow_tf32 = True
+         # Set the memory allocation configuration.
+         os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
+         logging.info("Enabled memory optimizations for 8+ GPU training")
+ 
+     if use_ddp:
+         model = torch.nn.parallel.DistributedDataParallel(
+             model,
+             device_ids=[device.index] if device.type == "cuda" else None,
+             find_unused_parameters=True,  # Some parameters may not receive gradients every step; costs extra memory.
+             gradient_as_bucket_view=True,  # Enable for memory efficiency.
+             static_graph=world_size >= 8,  # Enable for 8+ GPUs.
+         )
+         align_projector = torch.nn.parallel.DistributedDataParallel(
+             align_projector,
+             device_ids=[device.index] if device.type == "cuda" else None,
+             find_unused_parameters=True,  # Some parameters may not receive gradients every step; costs extra memory.
+             gradient_as_bucket_view=True,  # Enable for memory efficiency.
+             static_graph=world_size >= 8,  # Enable for 8+ GPUs.
+         )
+ 
+     # Load weights from weight_loader if specified (for fine-tuning).
+     if config.pytorch_weight_path is not None:
+         logging.info(f"Loading weights from: {config.pytorch_weight_path}")
+         model_path = os.path.join(config.pytorch_weight_path, "model.safetensors")
+         safetensors.torch.load_model(
+             (model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model),
+             model_path,
+             strict=False,
+         )
+         logging.info(f"Loaded PyTorch weights from {config.pytorch_weight_path}")
+     if config.vggt_weight_path is not None:
+         vggt_path = os.path.join(config.vggt_weight_path, "model.pt")
+         if not os.path.exists(vggt_path):
+             raise FileNotFoundError(f"VGGT weight file not found at {vggt_path}")
+         vggt_model.load_state_dict(torch.load(vggt_path), strict=False)
+         logging.info(f"Loaded VGGT weights from {config.vggt_weight_path}")
+ 
+     # Optimizer + learning rate schedule from config.
+     warmup_steps = config.lr_schedule.warmup_steps
+     peak_lr = config.lr_schedule.peak_lr
+     decay_steps = config.lr_schedule.decay_steps
+     end_lr = config.lr_schedule.decay_lr
+ 
+     # Create the optimizer with config parameters.
+     optim = torch.optim.AdamW(
+         list(model.parameters()) + list(align_projector.parameters()),
+         lr=peak_lr,
+         betas=(config.optimizer.b1, config.optimizer.b2),
+         eps=config.optimizer.eps,
+         weight_decay=config.optimizer.weight_decay,
+     )
+ 
+     # Load a checkpoint if resuming.
+     global_step = 0
+     if resuming:
+         global_step = load_checkpoint(model, optim, config.checkpoint_dir, device)
+         logging.info(f"Resumed training from step {global_step}")
+ 
+     def lr_schedule(step: int):
+         if step < warmup_steps:
+             # Match JAX behavior: start from peak_lr / (warmup_steps + 1).
+             init_lr = peak_lr / (warmup_steps + 1)
+             return init_lr + (peak_lr - init_lr) * step / warmup_steps
+         # Cosine decay.
+         progress = min(1.0, (step - warmup_steps) / max(1, decay_steps - warmup_steps))
+         cos = 0.5 * (1 + np.cos(np.pi * progress))
+         return end_lr + (peak_lr - end_lr) * cos
+ 
+     model.train()
+     align_projector.train()
+     vggt_model.eval()
+     start_time = time.time()
+     infos = []  # Collect stats over the log interval.
+     if is_main:
+         logging.info(
+             f"Running on: {platform.node()} | world_size={torch.distributed.get_world_size() if use_ddp else 1}"
+         )
+         logging.info(
+             f"Training config: batch_size={config.batch_size}, effective_batch_size={effective_batch_size}, num_train_steps={config.num_train_steps}"
+         )
+         logging.info(f"Memory optimizations: gradient_checkpointing={enable_gradient_checkpointing}")
+         logging.info(
+             f"LR schedule: warmup={warmup_steps}, peak_lr={peak_lr:.2e}, decay_steps={decay_steps}, end_lr={end_lr:.2e}"
+         )
+         logging.info(
+             f"Optimizer: {type(config.optimizer).__name__}, weight_decay={config.optimizer.weight_decay}, clip_norm={config.optimizer.clip_gradient_norm}"
+         )
+         logging.info("EMA is not supported for PyTorch training")
+         logging.info(f"Training precision: {model_cfg.dtype}")
+ 
+     # Training loop - iterate until we reach num_train_steps.
+     pbar = (
+         tqdm.tqdm(total=config.num_train_steps, initial=global_step, desc="Training", disable=not is_main)
+         if is_main
+         else None
+     )
+ 
+     while global_step < config.num_train_steps:
+         # Set the epoch for distributed training.
+         if use_ddp and hasattr(loader, "set_epoch"):
+             loader.set_epoch(global_step // len(loader))
+ 
+         for observation, actions in loader:
+             # Check if we've reached the target number of steps.
+             if global_step >= config.num_train_steps:
+                 break
+ 
+             # The unified data loader returns an (observation, actions) tuple.
+             observation = jax.tree.map(lambda x: x.to(device), observation)  # noqa: PLW2901
+             actions = actions.to(torch.float32)  # noqa: PLW2901
+             actions = actions.to(device)  # noqa: PLW2901
+ 
+             # Update the LR.
+             for pg in optim.param_groups:
+                 pg["lr"] = lr_schedule(global_step)
+ 
+             # Forward pass.
+             action_losses, align_loss = model(observation, actions, vggt=vggt_model, align_proj=align_projector)
+             loss = action_losses + config.align_loss_coeff * align_loss
+ 
+             # Backward pass.
+             loss.backward()
+ 
+             # Log memory usage after the backward pass.
+             if global_step < 5 and is_main and torch.cuda.is_available():
+                 log_memory_usage(device, global_step, "after_backward")
+ 
+             # Gradient clipping (note: only the model's parameters are clipped here, not the projector's).
+             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.optimizer.clip_gradient_norm)
+ 
+             # Optimizer step.
+             optim.step()
+             optim.zero_grad(set_to_none=True)
+ 
+             # Clear gradients more aggressively.
+             for param in model.parameters():
+                 if param.grad is not None:
+                     param.grad.detach_()
+                     param.grad = None
+ 
+             # Collect stats.
+             if is_main:
+                 infos.append(
+                     {
+                         "action_loss": action_losses.item(),
+                         "align_loss": align_loss.item(),
+                         "learning_rate": optim.param_groups[0]["lr"],
+                         "grad_norm": float(grad_norm) if isinstance(grad_norm, torch.Tensor) else grad_norm,
+                     }
+                 )
+ 
+             if is_main and (global_step % config.log_interval == 0):
+                 elapsed = time.time() - start_time
+ 
+                 # Average stats over the log interval.
+                 avg_loss = sum(info["action_loss"] for info in infos) / len(infos)
+                 avg_align_loss = sum(info["align_loss"] for info in infos) / len(infos)
+                 avg_lr = sum(info["learning_rate"] for info in infos) / len(infos)
+ 
+                 avg_grad_norm = None
+                 if any("grad_norm" in info for info in infos):
+                     vals = [
+                         info["grad_norm"] for info in infos if "grad_norm" in info and info["grad_norm"] is not None
+                     ]
+                     if len(vals) > 0:
+                         avg_grad_norm = sum(vals) / len(vals)
+                 logging.info(
+                     f"step={global_step} action_loss={avg_loss:.4f} align_loss={avg_align_loss:.4f} lr={avg_lr:.2e} grad_norm={avg_grad_norm:.2f} time={elapsed:.1f}s"
+                     if avg_grad_norm is not None
+                     else f"step={global_step} action_loss={avg_loss:.4f} align_loss={avg_align_loss:.4f} lr={avg_lr:.2e} time={elapsed:.1f}s"
+                 )
+ 
+                 # Log to wandb.
+                 if config.wandb_enabled and len(infos) > 0:
+                     log_payload = {
+                         "action_loss": avg_loss,
+                         "align_loss": avg_align_loss,
+                         "learning_rate": avg_lr,
+                         "step": global_step,
+                         "time_per_step": elapsed / config.log_interval,
+                     }
+                     if avg_grad_norm is not None:
+                         log_payload["grad_norm"] = avg_grad_norm
+                     wandb.log(log_payload, step=global_step)
+ 
+                 start_time = time.time()
+                 infos = []  # Reset stats collection.
+ 
+             global_step += 1
+             # Save a checkpoint using the mechanism above.
+             save_checkpoint(model, optim, global_step, config, is_main, data_config)
+ 
+             # Update the progress bar.
+             if pbar is not None:
+                 pbar.update(1)
+                 pbar.set_postfix(
+                     {"loss": f"{loss.item():.4f}", "lr": f"{optim.param_groups[0]['lr']:.2e}", "step": global_step}
+                 )
+ 
+     # Close the progress bar.
+     if pbar is not None:
+         pbar.close()
+ 
+     # Finish the wandb run.
+     if is_main and config.wandb_enabled:
+         wandb.finish()
+ 
+     cleanup_ddp()
+ 
+ 
+ def main():
+     init_logging()
+     config = _config.cli()
+     train_loop(config)
+ 
+ 
+ if __name__ == "__main__":
+     main()
capvector-pi05/scripts/train_pytorch.py ADDED
@@ -0,0 +1,632 @@
1
+ """
2
+ PyTorch training entrypoint for PI0/PI05 with multi-GPU and multi-node (DDP) support.
3
+ This script mirrors the behavior of the JAX trainer (`scripts/train.py`) but runs
4
+ entirely in PyTorch using the `PI0Pytorch` model and your existing config/data
5
+ pipeline from `src/openpi/training/config.py` and `src/openpi/training/data_loader.py`.
6
+
7
+ Usage
8
+ Single GPU:
9
+ python scripts/train_pytorch.py <config_name> --exp_name <run_name> --save_interval <interval>
10
+ Example:
11
+ python scripts/train_pytorch.py debug --exp_name pytorch_ddp_test
12
+ python scripts/train_pytorch.py debug --exp_name pytorch_ddp_test --resume # Resume from latest checkpoint
13
+ Multi-GPU (single node):
14
+ torchrun --standalone --nnodes=1 --nproc_per_node=<num_gpus> scripts/train_pytorch.py <config_name> --exp_name <run_name>
15
+ Example:
16
+ torchrun --standalone --nnodes=1 --nproc_per_node=2 scripts/train_pytorch.py pi0_aloha_sim --exp_name pytorch_ddp_test
17
+ torchrun --standalone --nnodes=1 --nproc_per_node=2 scripts/train_pytorch.py pi0_aloha_sim --exp_name pytorch_ddp_test --resume
18
+ Multi-Node Training:
19
+ torchrun \
20
+ --nnodes=<num_nodes> --nproc_per_node=<gpus_per_node> --node_rank=<rank_of_node> \
21
+ --master_addr=<master_ip> --master_port=<port> \
22
+ scripts/train_pytorch.py <config_name> --exp_name=<run_name> --save_interval <interval>
23
+
24
+ """
25
+
26
+ import dataclasses
27
+ import gc
28
+ import logging
29
+ import os
30
+ import platform
31
+ import shutil
32
+ import time
33
+
34
+ import jax
35
+ import numpy as np
36
+ import safetensors.torch
37
+ import torch
38
+ import torch.distributed as dist
39
+ import torch.nn.parallel
40
+ import tqdm
41
+ import wandb
42
+
43
+ import openpi.models.pi0_config
44
+ import openpi.models_pytorch.pi0_pytorch
45
+ import openpi.shared.normalize as _normalize
46
+ import openpi.training.config as _config
47
+ import openpi.training.data_loader as _data
48
+
49
+
50
+ def init_logging():
51
+ level_mapping = {"DEBUG": "D", "INFO": "I", "WARNING": "W", "ERROR": "E", "CRITICAL": "C"}
52
+
53
+ class CustomFormatter(logging.Formatter):
54
+ def format(self, record):
55
+ record.levelname = level_mapping.get(record.levelname, record.levelname)
56
+ return super().format(record)
57
+
58
+ formatter = CustomFormatter(
59
+ fmt="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)-80s (%(process)d:%(filename)s:%(lineno)s)",
60
+ datefmt="%H:%M:%S",
61
+ )
62
+ logger = logging.getLogger()
63
+ logger.setLevel(logging.INFO)
64
+ if not logger.handlers:
65
+ ch = logging.StreamHandler()
66
+ ch.setFormatter(formatter)
67
+ logger.addHandler(ch)
68
+ else:
69
+ logger.handlers[0].setFormatter(formatter)
70
+
71
+
72
+ def init_wandb(config: _config.TrainConfig, *, resuming: bool, enabled: bool = True):
73
+ """Initialize wandb logging."""
74
+ if not enabled:
75
+ wandb.init(mode="disabled")
76
+ return
77
+
78
+ ckpt_dir = config.checkpoint_dir
79
+ if not ckpt_dir.exists():
80
+ raise FileNotFoundError(f"Checkpoint directory {ckpt_dir} does not exist.")
81
+
82
+ if resuming:
83
+ run_id = (ckpt_dir / "wandb_id.txt").read_text().strip()
84
+ wandb.init(id=run_id, resume="must", project=config.project_name)
85
+ else:
86
+ wandb.init(
87
+ name=config.exp_name,
88
+ config=dataclasses.asdict(config),
89
+ project=config.project_name,
90
+ )
91
+ (ckpt_dir / "wandb_id.txt").write_text(wandb.run.id)
92
+
93
+
94
+ def setup_ddp():
95
+ world_size = int(os.environ.get("WORLD_SIZE", "1"))
96
+ use_ddp = world_size > 1
97
+ if use_ddp and not torch.distributed.is_initialized():
98
+ backend = "nccl" if torch.cuda.is_available() else "gloo"
99
+ torch.distributed.init_process_group(backend=backend, init_method="env://")
100
+
101
+ # Set up debugging environment variables for DDP issues
102
+ if os.environ.get("TORCH_DISTRIBUTED_DEBUG") is None:
103
+ os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"
104
+
105
+ local_rank = int(os.environ.get("LOCAL_RANK", os.environ.get("RANK", "0")))
106
+ device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu")
107
+ if torch.cuda.is_available():
108
+ torch.cuda.set_device(device)
109
+ return use_ddp, local_rank, device
110
+
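+ # With init_method="env://", torch.distributed reads MASTER_ADDR, MASTER_PORT,
+ # WORLD_SIZE and RANK from the environment; torchrun sets these (plus LOCAL_RANK)
+ # automatically for every worker.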
111
+
112
+ def cleanup_ddp():
113
+ if torch.distributed.is_initialized():
114
+ torch.distributed.barrier()
115
+ torch.distributed.destroy_process_group()
116
+
117
+
118
+ def set_seed(seed: int, local_rank: int):
119
+ torch.manual_seed(seed + local_rank)
120
+ np.random.seed(seed + local_rank)
121
+ if torch.cuda.is_available():
122
+ torch.cuda.manual_seed_all(seed + local_rank)
123
+
124
+
125
+ def build_datasets(config: _config.TrainConfig):
126
+ # Use the unified data loader with PyTorch framework
127
+ data_loader = _data.create_data_loader(config, framework="pytorch", shuffle=True)
128
+ return data_loader, data_loader.data_config()
129
+
130
+
131
+ def get_model_state_dict(model):
132
+ """Get state dict from model, handling DDP wrapper."""
133
+ return (
134
+ model.module.state_dict()
135
+ if isinstance(model, torch.nn.parallel.DistributedDataParallel)
136
+ else model.state_dict()
137
+ )
138
+
139
+
140
+ def get_model_parameters(model):
141
+ """Get parameters from model, handling DDP wrapper."""
142
+ return (
143
+ model.module.parameters()
144
+ if isinstance(model, torch.nn.parallel.DistributedDataParallel)
145
+ else model.parameters()
146
+ )
147
+
148
+
149
+ def save_checkpoint(model, optimizer, global_step, config, is_main, data_config):
150
+ """Save a checkpoint with model state, optimizer state, and metadata."""
151
+ if not is_main:
152
+ return
153
+
154
+ # Only save if it's time to save or if it's the final step
155
+ if (global_step % config.save_interval == 0 and global_step > 0) or global_step == config.num_train_steps - 1:
156
+ # Create temporary directory for atomic checkpoint saving
157
+ final_ckpt_dir = config.checkpoint_dir / f"{global_step}"
158
+ tmp_ckpt_dir = config.checkpoint_dir / f"tmp_{global_step}"
159
+
160
+ # Remove any existing temp directory and create new one
161
+ if tmp_ckpt_dir.exists():
162
+ shutil.rmtree(tmp_ckpt_dir)
163
+ tmp_ckpt_dir.mkdir(parents=True, exist_ok=True)
164
+
165
+ # Save model state using safetensors (handle shared tensors)
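+ # (safetensors.torch.save_model, unlike save_file, resolves tied/shared tensors
+ # before writing, so models with weight tying serialize without errors.)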
166
+ model_to_save = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
167
+ safetensors.torch.save_model(model_to_save, tmp_ckpt_dir / "model.safetensors")
168
+
169
+ # Save optimizer state using PyTorch format
170
+ torch.save(optimizer.state_dict(), tmp_ckpt_dir / "optimizer.pt")
171
+
172
+ # Save training metadata (the config is stored via dataclasses.asdict as a plain dict to avoid pickling JAX/Flax objects)
173
+ metadata = {
174
+ "global_step": global_step,
175
+ "config": dataclasses.asdict(config),
176
+ "timestamp": time.time(),
177
+ }
178
+ torch.save(metadata, tmp_ckpt_dir / "metadata.pt")
179
+
180
+ # save norm stats
181
+ norm_stats = data_config.norm_stats
182
+ if norm_stats is not None and data_config.asset_id is not None:
183
+ _normalize.save(tmp_ckpt_dir / "assets" / data_config.asset_id, norm_stats)
184
+
185
+ # Atomically move temp directory to final location
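+ # (Writing into a tmp_* directory first and renaming at the end means a crash
+ # mid-save can never leave a half-written step directory; load_checkpoint
+ # ignores tmp_* directories.)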
186
+ if final_ckpt_dir.exists():
187
+ shutil.rmtree(final_ckpt_dir)
188
+ tmp_ckpt_dir.rename(final_ckpt_dir)
189
+
190
+ logging.info(f"Saved checkpoint at step {global_step} -> {final_ckpt_dir}")
191
+
192
+ # Log checkpoint to wandb
193
+ if config.wandb_enabled:
194
+ wandb.log({"checkpoint_step": global_step}, step=global_step)
195
+
196
+
197
+ def load_checkpoint(model, optimizer, checkpoint_dir, device):
198
+ """Load the latest checkpoint and return the global step."""
199
+ checkpoint_steps = [
200
+ int(d.name)
201
+ for d in checkpoint_dir.iterdir()
202
+ if d.is_dir() and d.name.isdigit() and not d.name.startswith("tmp_")
203
+ ]
204
+
205
+ if not checkpoint_steps:
206
+ raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
207
+
208
+ latest_step = max(checkpoint_steps)
209
+ ckpt_dir = checkpoint_dir / f"{latest_step}"
210
+
211
+ # Clear memory before loading checkpoints
212
+ if torch.cuda.is_available():
213
+ torch.cuda.empty_cache()
214
+ gc.collect()
215
+ log_memory_usage(device, latest_step, "before_loading_checkpoint")
216
+
217
+ try:
218
+ # Load model state with error handling
219
+ logging.info("Loading model state...")
220
+ safetensors_path = ckpt_dir / "model.safetensors"
221
+
222
+ if safetensors_path.exists():
223
+ model_to_load = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
224
+ safetensors.torch.load_model(model_to_load, safetensors_path, device=str(device))
225
+ logging.info("Loaded model state from safetensors format")
226
+ else:
227
+ raise FileNotFoundError(f"No model checkpoint found at {ckpt_dir}")
228
+
229
+ torch.cuda.empty_cache()
230
+ gc.collect()
231
+ log_memory_usage(device, latest_step, "after_loading_model")
232
+
233
+ # Load optimizer state with error handling
234
+ logging.info("Loading optimizer state...")
235
+ optimizer_path = ckpt_dir / "optimizer.pt"
236
+
237
+ if optimizer_path.exists():
238
+ optimizer_state_dict = torch.load(optimizer_path, map_location=device, weights_only=False)
239
+ logging.info("Loaded optimizer state from pt format")
240
+ else:
241
+ raise FileNotFoundError(f"No optimizer checkpoint found at {ckpt_dir}")
242
+
243
+ optimizer.load_state_dict(optimizer_state_dict)
244
+ del optimizer_state_dict
245
+ torch.cuda.empty_cache()
246
+ gc.collect()
247
+ log_memory_usage(device, latest_step, "after_loading_optimizer")
248
+
249
+ # Load metadata
250
+ logging.info("Loading metadata...")
251
+ metadata = torch.load(ckpt_dir / "metadata.pt", map_location=device, weights_only=False)
252
+ global_step = metadata.get("global_step", latest_step)
253
+ del metadata
254
+ torch.cuda.empty_cache()
255
+ gc.collect()
256
+ log_memory_usage(device, latest_step, "after_loading_metadata")
257
+
258
+ logging.info(f"Successfully loaded all checkpoint components from step {latest_step}")
259
+ return global_step
260
+
261
+ except RuntimeError as e:
262
+ if "out of memory" in str(e):
263
+ # Clear memory and provide detailed error message
264
+ torch.cuda.empty_cache()
265
+ gc.collect()
266
+ logging.error(f"Out of memory error while loading checkpoint: {e!s}")
267
+ log_memory_usage(device, latest_step, "after_oom_error")
268
+ raise RuntimeError(
269
+ "Out of memory while loading checkpoint. Try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
270
+ ) from e
271
+ raise
272
+
273
+
274
+ def get_latest_checkpoint_step(checkpoint_dir):
275
+ """Get the latest checkpoint step number from a checkpoint directory."""
276
+ checkpoint_steps = [
277
+ int(d.name)
278
+ for d in checkpoint_dir.iterdir()
279
+ if d.is_dir() and d.name.isdigit() and not d.name.startswith("tmp_")
280
+ ]
281
+ return max(checkpoint_steps) if checkpoint_steps else None
282
+
283
+
284
+ def log_memory_usage(device, step, phase="unknown"):
285
+ """Log detailed memory usage information."""
286
+ if not torch.cuda.is_available():
287
+ return
288
+
289
+ memory_allocated = torch.cuda.memory_allocated(device) / 1e9
290
+ memory_reserved = torch.cuda.memory_reserved(device) / 1e9
291
+ # Reserved-but-unallocated memory in CUDA's caching allocator (not total free GPU memory)
+ memory_free = (torch.cuda.memory_reserved(device) - torch.cuda.memory_allocated(device)) / 1e9
293
+
294
+ # Get more detailed memory info
295
+ memory_stats = torch.cuda.memory_stats(device)
296
+ max_memory_allocated = memory_stats.get("allocated_bytes.all.peak", 0) / 1e9
297
+ max_memory_reserved = memory_stats.get("reserved_bytes.all.peak", 0) / 1e9
298
+
299
+ # Get DDP info if available
300
+ ddp_info = ""
301
+ if dist.is_initialized():
302
+ ddp_info = f" | DDP: rank={dist.get_rank()}, world_size={dist.get_world_size()}"
303
+
304
+ logging.info(
305
+ f"Step {step} ({phase}): GPU memory - allocated: {memory_allocated:.2f}GB, reserved: {memory_reserved:.2f}GB, free: {memory_free:.2f}GB, peak_allocated: {max_memory_allocated:.2f}GB, peak_reserved: {max_memory_reserved:.2f}GB{ddp_info}"
306
+ )
307
+
308
+
309
+ def train_loop(config: _config.TrainConfig):
310
+ use_ddp, local_rank, device = setup_ddp()
311
+ is_main = (not use_ddp) or (dist.get_rank() == 0)
312
+ set_seed(config.seed, local_rank)
313
+
314
+ # Initialize checkpoint directory and wandb
315
+ resuming = False
316
+ if config.resume:
317
+ # Find checkpoint directory based on experiment name
318
+ exp_checkpoint_dir = config.checkpoint_dir
319
+ if exp_checkpoint_dir.exists():
320
+ # Use validation to find the latest working checkpoint
321
+ latest_step = get_latest_checkpoint_step(exp_checkpoint_dir)
322
+ if latest_step is not None:
323
+ resuming = True
324
+ logging.info(
325
+ f"Resuming from experiment checkpoint directory: {exp_checkpoint_dir} at step {latest_step}"
326
+ )
327
+ else:
328
+ raise FileNotFoundError(f"No valid checkpoints found in {exp_checkpoint_dir} for resume")
329
+ else:
330
+ raise FileNotFoundError(f"Experiment checkpoint directory {exp_checkpoint_dir} does not exist for resume")
331
+ elif config.overwrite and config.checkpoint_dir.exists():
332
+ shutil.rmtree(config.checkpoint_dir)
333
+ logging.info(f"Overwriting checkpoint directory: {config.checkpoint_dir}")
334
+
335
+ # Create checkpoint directory with experiment name
336
+ if not resuming:
337
+ # For new runs, create experiment-specific checkpoint directory
338
+ exp_checkpoint_dir = config.checkpoint_dir
339
+ exp_checkpoint_dir.mkdir(parents=True, exist_ok=True)
340
+ logging.info(f"Created experiment checkpoint directory: {exp_checkpoint_dir}")
341
+ else:
342
+ # For resume, checkpoint_dir is already set to the experiment directory
343
+ logging.info(f"Using existing experiment checkpoint directory: {config.checkpoint_dir}")
344
+
345
+ # Initialize wandb (only on main process)
346
+ if is_main:
347
+ init_wandb(config, resuming=resuming, enabled=config.wandb_enabled)
348
+
349
+ # Build data loader using the unified data loader
350
+ # Calculate effective batch size per GPU for DDP
351
+ # For N GPUs, each GPU should get batch_size/N samples, so total across all GPUs is batch_size
352
+ world_size = torch.distributed.get_world_size() if use_ddp else 1
353
+ effective_batch_size = config.batch_size // world_size
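+ # Illustrative numbers (not from any shipped config): batch_size=64 with world_size=4
+ # gives 16 samples per GPU per step.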
354
+ logging.info(
355
+ f"Using batch size per GPU: {effective_batch_size} (total batch size across {world_size} GPUs: {config.batch_size})"
356
+ )
357
+
358
+ # Pass the original batch size to data loader - it will handle DDP splitting internally
359
+ loader, data_config = build_datasets(config)
360
+
361
+ # Log sample images to wandb on first batch
362
+ if is_main and config.wandb_enabled and not resuming:
363
+ # Create a separate data loader for sample batch to avoid consuming the main loader
364
+ sample_data_loader = _data.create_data_loader(config, framework="pytorch", shuffle=False)
365
+ sample_batch = next(iter(sample_data_loader))
366
+ # Convert observation and actions to torch tensors
367
+ observation, actions = sample_batch
368
+ sample_batch = observation.to_dict()
369
+ sample_batch["actions"] = actions
370
+
371
+ # Create sample images for wandb
372
+ images_to_log = []
373
+ # Get batch size from the first image tensor
374
+ batch_size = next(iter(sample_batch["image"].values())).shape[0]
375
+ for i in range(min(5, batch_size)):
376
+ # Concatenate all camera views horizontally for this batch item
377
+ # Convert from NCHW to NHWC format for wandb
378
+ img_concatenated = torch.cat([img[i].permute(1, 2, 0) for img in sample_batch["image"].values()], axis=1)
379
+ img_concatenated = img_concatenated.cpu().numpy()
380
+ images_to_log.append(wandb.Image(img_concatenated))
381
+
382
+ wandb.log({"camera_views": images_to_log}, step=0)
383
+
384
+ # Clear sample batch from memory aggressively
385
+ del sample_batch, observation, actions, images_to_log, img_concatenated
386
+ del sample_data_loader # Also delete the sample data loader
387
+ gc.collect()
388
+ if torch.cuda.is_available():
389
+ torch.cuda.empty_cache()
390
+ logging.info("Cleared sample batch and data loader from memory")
391
+
392
+ # Build model
393
+ if not isinstance(config.model, openpi.models.pi0_config.Pi0Config):
394
+ # Convert dataclass to Pi0Config if needed
395
+ model_cfg = openpi.models.pi0_config.Pi0Config(
396
+ dtype=config.pytorch_training_precision,
397
+ action_dim=config.model.action_dim,
398
+ action_horizon=config.model.action_horizon,
399
+ max_token_len=config.model.max_token_len,
400
+ paligemma_variant=getattr(config.model, "paligemma_variant", "gemma_2b"),
401
+ action_expert_variant=getattr(config.model, "action_expert_variant", "gemma_300m"),
402
+ pi05=getattr(config.model, "pi05", False),
403
+ )
404
+ else:
405
+ model_cfg = config.model
406
+ # Update dtype to match pytorch_training_precision
407
+ object.__setattr__(model_cfg, "dtype", config.pytorch_training_precision)
408
+
409
+ model = openpi.models_pytorch.pi0_pytorch.PI0Pytorch(model_cfg).to(device)
410
+
411
+ if hasattr(model, "gradient_checkpointing_enable"):
412
+ enable_gradient_checkpointing = True
413
+ model.gradient_checkpointing_enable()
414
+ logging.info("Enabled gradient checkpointing for memory optimization")
415
+ else:
416
+ enable_gradient_checkpointing = False
417
+ logging.info("Gradient checkpointing is not supported for this model")
418
+
419
+ # Log initial memory usage after model creation
420
+ if is_main and torch.cuda.is_available():
421
+ log_memory_usage(device, 0, "after_model_creation")
422
+
423
+ # Enable memory optimizations for large-scale training
424
+ if world_size >= 8:
425
+ torch.backends.cudnn.benchmark = True
426
+ torch.backends.cuda.matmul.allow_tf32 = True
427
+ torch.backends.cudnn.allow_tf32 = True
428
+ # Set memory allocation configuration. Note: the CUDA caching allocator reads this
+ # variable at its first allocation, so exporting it before launching the process is
+ # more reliable than setting it here.
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
430
+ logging.info("Enabled memory optimizations for 8+ GPU training")
431
+
432
+ if use_ddp:
433
+ model = torch.nn.parallel.DistributedDataParallel(
434
+ model,
435
+ device_ids=[device.index] if device.type == "cuda" else None,
436
+ find_unused_parameters=True, # Enabled: some parameters may not receive gradients every step (costs extra overhead)
437
+ gradient_as_bucket_view=True, # Enable for memory efficiency
438
+ static_graph=world_size >= 8, # Enable for 8+ GPUs
439
+ )
440
+
441
+ # Load weights from weight_loader if specified (for fine-tuning)
442
+ if config.pytorch_weight_path is not None:
443
+ logging.info(f"Loading weights from: {config.pytorch_weight_path}")
444
+
445
+ model_path = os.path.join(config.pytorch_weight_path, "model.safetensors")
446
+ safetensors.torch.load_model(
447
+ (model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model), model_path
448
+ )
449
+ logging.info(f"Loaded PyTorch weights from {config.pytorch_weight_path}")
450
+
451
+ # Optimizer + learning rate schedule from config
452
+ warmup_steps = config.lr_schedule.warmup_steps
453
+ peak_lr = config.lr_schedule.peak_lr
454
+ decay_steps = config.lr_schedule.decay_steps
455
+ end_lr = config.lr_schedule.decay_lr
456
+
457
+ # Create optimizer with config parameters
458
+ optim = torch.optim.AdamW(
459
+ model.parameters(),
460
+ lr=peak_lr,
461
+ betas=(config.optimizer.b1, config.optimizer.b2),
462
+ eps=config.optimizer.eps,
463
+ weight_decay=config.optimizer.weight_decay,
464
+ )
465
+
466
+ # Load checkpoint if resuming
467
+ global_step = 0
468
+ if resuming:
469
+ global_step = load_checkpoint(model, optim, config.checkpoint_dir, device)
470
+ logging.info(f"Resumed training from step {global_step}")
471
+
472
+ def lr_schedule(step: int):
473
+ if step < warmup_steps:
474
+ # Match JAX behavior: start from peak_lr / (warmup_steps + 1)
475
+ init_lr = peak_lr / (warmup_steps + 1)
476
+ return init_lr + (peak_lr - init_lr) * step / warmup_steps
477
+ # cosine decay
478
+ progress = min(1.0, (step - warmup_steps) / max(1, decay_steps - warmup_steps))
479
+ cos = 0.5 * (1 + np.cos(np.pi * progress))
480
+ return end_lr + (peak_lr - end_lr) * cos
481
+
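+ # A worked example of the schedule above, with hypothetical values
+ # warmup_steps=1000, peak_lr=2.5e-5, decay_steps=30000, end_lr=2.5e-6:
+ #   step 0     -> ~2.5e-8   (peak_lr / (warmup_steps + 1))
+ #   step 1000  -> 2.5e-5    (end of linear warmup)
+ #   step 15500 -> ~1.375e-5 (cosine midpoint: end_lr + 0.5 * (peak_lr - end_lr))
+ #   step 30000 -> 2.5e-6    (end_lr, held constant afterwards)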
482
+ model.train()
483
+ start_time = time.time()
484
+ infos = [] # Collect stats over log interval
485
+ if is_main:
486
+ logging.info(
487
+ f"Running on: {platform.node()} | world_size={torch.distributed.get_world_size() if use_ddp else 1}"
488
+ )
489
+ logging.info(
490
+ f"Training config: batch_size={config.batch_size}, effective_batch_size={effective_batch_size}, num_train_steps={config.num_train_steps}"
491
+ )
492
+ logging.info(f"Memory optimizations: gradient_checkpointing={enable_gradient_checkpointing}")
493
+ logging.info(
494
+ f"LR schedule: warmup={warmup_steps}, peak_lr={peak_lr:.2e}, decay_steps={decay_steps}, end_lr={end_lr:.2e}"
495
+ )
496
+ logging.info(
497
+ f"Optimizer: {type(config.optimizer).__name__}, weight_decay={config.optimizer.weight_decay}, clip_norm={config.optimizer.clip_gradient_norm}"
498
+ )
499
+ logging.info("EMA is not supported for PyTorch training")
500
+ logging.info(f"Training precision: {model_cfg.dtype}")
501
+
502
+ # Training loop - iterate until we reach num_train_steps
503
+ pbar = (
504
+ tqdm.tqdm(total=config.num_train_steps, initial=global_step, desc="Training", disable=not is_main)
505
+ if is_main
506
+ else None
507
+ )
508
+
509
+ while global_step < config.num_train_steps:
510
+ # Set epoch for distributed training
511
+ if use_ddp and hasattr(loader, "set_epoch"):
512
+ loader.set_epoch(global_step // len(loader))
513
+
514
+ for observation, actions in loader:
515
+ # Check if we've reached the target number of steps
516
+ if global_step >= config.num_train_steps:
517
+ break
518
+
519
+ # The unified data loader returns (observation, actions) tuple
520
+ observation = jax.tree.map(lambda x: x.to(device), observation) # noqa: PLW2901
521
+ actions = actions.to(torch.float32) # noqa: PLW2901
522
+ actions = actions.to(device) # noqa: PLW2901
523
+
524
+ # Update LR
525
+ for pg in optim.param_groups:
526
+ pg["lr"] = lr_schedule(global_step)
527
+
528
+ # Forward pass
529
+ losses = model(observation, actions)
530
+ # Ensure losses is a tensor and handle different return types
531
+ if isinstance(losses, list | tuple):
532
+ losses = torch.stack(losses)
533
+ elif not isinstance(losses, torch.Tensor):
534
+ losses = torch.tensor(losses, device=device, dtype=torch.float32)
535
+
536
+ loss = losses.mean()
537
+
538
+ # Backward pass
539
+ loss.backward()
540
+
541
+ # Log memory usage after backward pass
542
+ if global_step < 5 and is_main and torch.cuda.is_available():
543
+ log_memory_usage(device, global_step, "after_backward")
544
+
545
+ # Gradient clipping
546
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.optimizer.clip_gradient_norm)
547
+
548
+ # Optimizer step
549
+ optim.step()
550
+ optim.zero_grad(set_to_none=True)
551
+
552
+ # Clear gradients more aggressively
553
+ for param in model.parameters():
554
+ if param.grad is not None:
555
+ param.grad.detach_()
556
+ param.grad = None
557
+
558
+ # Collect stats
559
+ if is_main:
560
+ infos.append(
561
+ {
562
+ "loss": loss.item(),
563
+ "learning_rate": optim.param_groups[0]["lr"],
564
+ "grad_norm": float(grad_norm) if isinstance(grad_norm, torch.Tensor) else grad_norm,
565
+ }
566
+ )
567
+
568
+ if is_main and (global_step % config.log_interval == 0):
569
+ elapsed = time.time() - start_time
570
+
571
+ # Average stats over log interval
572
+ avg_loss = sum(info["loss"] for info in infos) / len(infos)
573
+ avg_lr = sum(info["learning_rate"] for info in infos) / len(infos)
574
+
575
+ avg_grad_norm = None
576
+ if any("grad_norm" in info for info in infos):
577
+ vals = [
578
+ info["grad_norm"] for info in infos if "grad_norm" in info and info["grad_norm"] is not None
579
+ ]
580
+ if len(vals) > 0:
581
+ avg_grad_norm = sum(vals) / len(vals)
582
+ logging.info(
583
+ f"step={global_step} loss={avg_loss:.4f} lr={avg_lr:.2e} grad_norm={avg_grad_norm:.2f} time={elapsed:.1f}s"
584
+ if avg_grad_norm is not None
585
+ else f"step={global_step} loss={avg_loss:.4f} lr={avg_lr:.2e} time={elapsed:.1f}s"
586
+ )
587
+
588
+ # Log to wandb
589
+ if config.wandb_enabled and len(infos) > 0:
590
+ log_payload = {
591
+ "loss": avg_loss,
592
+ "learning_rate": avg_lr,
593
+ "step": global_step,
594
+ "time_per_step": elapsed / config.log_interval,
595
+ }
596
+ if avg_grad_norm is not None:
597
+ log_payload["grad_norm"] = avg_grad_norm
598
+ wandb.log(log_payload, step=global_step)
599
+
600
+ start_time = time.time()
601
+ infos = [] # Reset stats collection
602
+
603
+ global_step += 1
604
+ # Save checkpoint using the new mechanism
605
+ save_checkpoint(model, optim, global_step, config, is_main, data_config)
606
+
607
+ # Update progress bar
608
+ if pbar is not None:
609
+ pbar.update(1)
610
+ pbar.set_postfix(
611
+ {"loss": f"{loss.item():.4f}", "lr": f"{optim.param_groups[0]['lr']:.2e}", "step": global_step}
612
+ )
613
+
614
+ # Close progress bar
615
+ if pbar is not None:
616
+ pbar.close()
617
+
618
+ # Finish wandb run
619
+ if is_main and config.wandb_enabled:
620
+ wandb.finish()
621
+
622
+ cleanup_ddp()
623
+
624
+
625
+ def main():
626
+ init_logging()
627
+ config = _config.cli()
628
+ train_loop(config)
629
+
630
+
631
+ if __name__ == "__main__":
632
+ main()
capvector-pi05/scripts/train_regular_loss_pytorch.py ADDED
@@ -0,0 +1,754 @@
1
+ """
2
+ PyTorch training entrypoint for PI0/PI05 with multi-GPU and multi-node (DDP) support.
3
+ This script mirrors the behavior of the JAX trainer (`scripts/train.py`) but runs
4
+ entirely in PyTorch using the `PI0Pytorch` model and your existing config/data
5
+ pipeline from `src/openpi/training/config.py` and `src/openpi/training/data_loader.py`.
6
+
7
+ Usage
8
+ Single GPU:
9
+ python scripts/train_pytorch.py <config_name> --exp_name <run_name> --save_interval <interval>
10
+ Example:
11
+ python scripts/train_pytorch.py debug --exp_name pytorch_ddp_test
12
+ python scripts/train_pytorch.py debug --exp_name pytorch_ddp_test --resume # Resume from latest checkpoint
13
+ Multi-GPU (single node):
14
+ torchrun --standalone --nnodes=1 --nproc_per_node=<num_gpus> scripts/train_pytorch.py <config_name> --exp_name <run_name>
15
+ Example:
16
+ torchrun --standalone --nnodes=1 --nproc_per_node=2 scripts/train_pytorch.py pi0_aloha_sim --exp_name pytorch_ddp_test
17
+ torchrun --standalone --nnodes=1 --nproc_per_node=2 scripts/train_pytorch.py pi0_aloha_sim --exp_name pytorch_ddp_test --resume
18
+ Multi-Node Training:
19
+ torchrun \
20
+ --nnodes=<num_nodes> --nproc_per_node=<gpus_per_node> --node_rank=<rank_of_node> \
21
+ --master_addr=<master_ip> --master_port=<port> \
22
+ scripts/train_pytorch.py <config_name> --exp_name=<run_name> --save_interval <interval>
23
+
24
+ """
25
+
26
+ import dataclasses
27
+ import gc
28
+ import logging
29
+ import os
30
+ import platform
31
+ from pathlib import Path
32
+ import shutil
33
+ import time
34
+
35
+ import jax
36
+ import numpy as np
37
+ import safetensors.torch
38
+ import torch
39
+ import torch.distributed as dist
40
+ import torch.nn.parallel
41
+ import tqdm
42
+ import wandb
43
+
44
+ import openpi.models.pi0_config
45
+ import openpi.models_pytorch.pi0_pytorch
46
+ import openpi.shared.normalize as _normalize
47
+ import openpi.training.config as _config
48
+ import openpi.training.data_loader as _data
49
+
50
+
51
+ def init_logging():
52
+ level_mapping = {"DEBUG": "D", "INFO": "I", "WARNING": "W", "ERROR": "E", "CRITICAL": "C"}
53
+
54
+ class CustomFormatter(logging.Formatter):
55
+ def format(self, record):
56
+ record.levelname = level_mapping.get(record.levelname, record.levelname)
57
+ return super().format(record)
58
+
59
+ formatter = CustomFormatter(
60
+ fmt="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)-80s (%(process)d:%(filename)s:%(lineno)s)",
61
+ datefmt="%H:%M:%S",
62
+ )
63
+ logger = logging.getLogger()
64
+ logger.setLevel(logging.INFO)
65
+ if not logger.handlers:
66
+ ch = logging.StreamHandler()
67
+ ch.setFormatter(formatter)
68
+ logger.addHandler(ch)
69
+ else:
70
+ logger.handlers[0].setFormatter(formatter)
71
+
72
+
73
+ def init_wandb(config: _config.TrainConfig, *, resuming: bool, enabled: bool = True):
74
+ """Initialize wandb logging."""
75
+ if not enabled:
76
+ wandb.init(mode="disabled")
77
+ return
78
+
79
+ ckpt_dir = config.checkpoint_dir
80
+ if not ckpt_dir.exists():
81
+ raise FileNotFoundError(f"Checkpoint directory {ckpt_dir} does not exist.")
82
+
83
+ if resuming:
84
+ run_id = (ckpt_dir / "wandb_id.txt").read_text().strip()
85
+ wandb.init(id=run_id, resume="must", project=config.project_name)
86
+ else:
87
+ wandb.init(
88
+ name=config.name,
89
+ config=dataclasses.asdict(config),
90
+ project=config.project_name,
91
+ id="-".join([config.name, config.exp_name]),
92
+ )
93
+ (ckpt_dir / "wandb_id.txt").write_text(wandb.run.id)
94
+
95
+
96
+ def setup_ddp():
97
+ world_size = int(os.environ.get("WORLD_SIZE", "1"))
98
+ use_ddp = world_size > 1
99
+ if use_ddp and not torch.distributed.is_initialized():
100
+ backend = "nccl" if torch.cuda.is_available() else "gloo"
101
+ torch.distributed.init_process_group(backend=backend, init_method="env://")
102
+
103
+ # Set up debugging environment variables for DDP issues
104
+ if os.environ.get("TORCH_DISTRIBUTED_DEBUG") is None:
105
+ os.environ["TORCH_DISTRIBUTED_DEBUG"] = "INFO"
106
+
107
+ local_rank = int(os.environ.get("LOCAL_RANK", os.environ.get("RANK", "0")))
108
+ device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu")
109
+ if torch.cuda.is_available():
110
+ torch.cuda.set_device(device)
111
+ return use_ddp, local_rank, device
112
+
113
+
114
+ def cleanup_ddp():
115
+ if torch.distributed.is_initialized():
116
+ torch.distributed.barrier()
117
+ torch.distributed.destroy_process_group()
118
+
119
+
120
+ def set_seed(seed: int, local_rank: int):
121
+ torch.manual_seed(seed + local_rank)
122
+ np.random.seed(seed + local_rank)
123
+ if torch.cuda.is_available():
124
+ torch.cuda.manual_seed_all(seed + local_rank)
125
+
126
+
127
+ def build_datasets(config: _config.TrainConfig):
128
+ # Use the unified data loader with PyTorch framework
129
+ data_loader = _data.create_data_loader(config, framework="pytorch", shuffle=True)
130
+ return data_loader, data_loader.data_config()
131
+
132
+
133
+ def get_model_state_dict(model):
134
+ """Get state dict from model, handling DDP wrapper."""
135
+ return (
136
+ model.module.state_dict()
137
+ if isinstance(model, torch.nn.parallel.DistributedDataParallel)
138
+ else model.state_dict()
139
+ )
140
+
141
+
142
+ def get_model_parameters(model):
143
+ """Get parameters from model, handling DDP wrapper."""
144
+ return (
145
+ model.module.parameters()
146
+ if isinstance(model, torch.nn.parallel.DistributedDataParallel)
147
+ else model.parameters()
148
+ )
149
+
150
+
151
+ def load_regular_vector_dict(path: str | Path) -> dict[str, torch.Tensor]:
+ """Load the regularization vectors, which are used for delta-based regularization."""
+ tensor_path = Path(path)
+ suffix = tensor_path.suffix.lower()
+
+ if suffix in {".pt", ".pth"}:
+ # Torch payloads are expected to wrap the tensors in a {"state_dict": ...} dict.
+ tensors = torch.load(tensor_path, map_location="cpu", weights_only=False, mmap=True)
+ return tensors["state_dict"]
+ if suffix == ".safetensors":
+ # safetensors files are already a flat name -> tensor mapping, so return them as-is
+ # (indexing ["state_dict"] here would raise a KeyError).
+ return safetensors.torch.load_file(str(tensor_path), device="cpu")
+ raise ValueError(f"Unsupported tensor file format: {tensor_path}")
164
+
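+ # For reference, a compatible ".pt"/".pth" payload could be produced along these
+ # lines (a sketch with hypothetical names, not a script shipped in this repo):
+ #   vectors = {name: torch.randn_like(p.detach().cpu()) for name, p in model.named_parameters()}
+ #   torch.save({"state_dict": vectors}, "regularization_vectors.pt")
+ # A ".safetensors" file would instead store the flat name -> tensor mapping directly.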
165
+
166
+ def prepare_regularization_context(
167
+ model,
168
+ config: _config.TrainConfig,
169
+ ) -> dict | None:
170
+ """Load regularization tensors and build the runtime context for delta-based regularization."""
171
+
172
+ # Regularization is optional: skip it when no vector path is configured or the coefficient is zero
173
+ if not config.regularization_vector_path or config.regularization_coeff == 0.0:
174
+ return None
175
+
176
+ # Get the regularization vectors as reference directions
177
+ if config.resume:
178
+ raise ValueError(
179
+ "Delta-based regularization with --resume is not supported in this PyTorch trainer. "
180
+ "This run now keeps the anchor only in memory at startup."
181
+ )
182
+ vector_path = Path(config.regularization_vector_path).expanduser()
183
+ if not vector_path.exists():
184
+ raise FileNotFoundError(f"Regularization vector file does not exist: {vector_path}")
185
+ regularization_vectors = load_regular_vector_dict(vector_path)
186
+
187
+ # Get the model's trainable parameters to be regularized and the corresponding freezing anchors at startup
188
+ model_module = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
189
+
190
+ trainable_entries = []
191
+ missing_vectors = 0
192
+ shape_mismatches = 0
193
+ trainable_param_names = set()
194
+
195
+ for name, param in model_module.named_parameters():
196
+ if not param.requires_grad:
197
+ continue
198
+ trainable_param_names.add(name)
199
+ regularization_vector = regularization_vectors.get(name)
200
+ if regularization_vector is None:
201
+ missing_vectors += 1
202
+ continue
203
+ anchor_param = param.detach().clone().contiguous()
204
+ if regularization_vector.shape != param.shape or anchor_param.shape != param.shape:
205
+ shape_mismatches += 1
206
+ continue
207
+ trainable_entries.append(
208
+ {
209
+ "name": name,
210
+ "param": param,
211
+ "anchor": anchor_param,
212
+ "vector": regularization_vector.to(device=param.device, dtype=param.dtype).contiguous(),
213
+ }
214
+ )
215
+
216
+ logging.info(
217
+ "Regularization coverage: matched=%d missing_vectors=%d shape_mismatches=%d",
218
+ len(trainable_entries),
219
+ missing_vectors,
220
+ shape_mismatches,
221
+ )
222
+
223
+ return {
224
+ "entries": trainable_entries,
225
+ "weight": config.regularization_coeff,
226
+ "vector_path": str(vector_path),
227
+ }
228
+
229
+
230
+ def compute_regularization_loss(regularization_context: dict | None, device: torch.device) -> torch.Tensor:
231
+ """Compute the delta-based regularization loss for the current model parameters."""
232
+ reg_loss = torch.zeros((), device=device, dtype=torch.float32)
233
+
234
+ if not regularization_context:
235
+ return reg_loss
236
+
237
+ for entry in regularization_context["entries"]:
238
+ param = entry["param"]
239
+ anchor = entry["anchor"]
240
+ vector = entry["vector"]
241
+
242
+ delta = (param - anchor).reshape(-1).float()
243
+ direction = vector.reshape(-1).float()
244
+ reg_loss = reg_loss + torch.abs(torch.dot(delta, direction))
245
+
246
+ return reg_loss * regularization_context["weight"]
247
+
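+ # A minimal worked example of the penalty (hypothetical 2-element parameter):
+ #   param = [1.2, 0.9], anchor = [1.0, 1.0], vector = [0.5, -0.5]
+ #   delta = param - anchor = [0.2, -0.1]
+ #   |<delta, vector>| = |0.2*0.5 + (-0.1)*(-0.5)| = 0.15
+ # Movement along the reference direction is penalized; movement orthogonal to it
+ # contributes nothing.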
248
+
249
+ def save_checkpoint(model, optimizer, global_step, config, is_main, data_config):
250
+ """Save a checkpoint with model state, optimizer state, and metadata."""
251
+ if not is_main:
252
+ return
253
+
254
+ # Only save if it's time to save or if it's the final step
255
+ if (global_step % config.save_interval == 0 and global_step > 0) or global_step == config.num_train_steps - 1:
256
+ # Create temporary directory for atomic checkpoint saving
257
+ final_ckpt_dir = config.checkpoint_dir / f"{global_step}"
258
+ tmp_ckpt_dir = config.checkpoint_dir / f"tmp_{global_step}"
259
+
260
+ # Remove any existing temp directory and create new one
261
+ if tmp_ckpt_dir.exists():
262
+ shutil.rmtree(tmp_ckpt_dir)
263
+ tmp_ckpt_dir.mkdir(parents=True, exist_ok=True)
264
+
265
+ # Save model state using safetensors (handle shared tensors)
266
+ model_to_save = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
267
+ safetensors.torch.save_model(model_to_save, tmp_ckpt_dir / "model.safetensors")
268
+
269
+ # Save optimizer state using PyTorch format
270
+ torch.save(optimizer.state_dict(), tmp_ckpt_dir / "optimizer.pt")
271
+
272
+ # Save training metadata (the config is stored via dataclasses.asdict as a plain dict to avoid pickling JAX/Flax objects)
273
+ metadata = {
274
+ "global_step": global_step,
275
+ "config": dataclasses.asdict(config),
276
+ "timestamp": time.time(),
277
+ }
278
+ torch.save(metadata, tmp_ckpt_dir / "metadata.pt")
279
+
280
+ # save norm stats
281
+ norm_stats = data_config.norm_stats
282
+ if norm_stats is not None and data_config.asset_id is not None:
283
+ _normalize.save(tmp_ckpt_dir / "assets" / data_config.asset_id, norm_stats)
284
+
285
+ # Atomically move temp directory to final location
286
+ if final_ckpt_dir.exists():
287
+ shutil.rmtree(final_ckpt_dir)
288
+ tmp_ckpt_dir.rename(final_ckpt_dir)
289
+
290
+ logging.info(f"Saved checkpoint at step {global_step} -> {final_ckpt_dir}")
291
+
292
+ # Log checkpoint to wandb
293
+ if config.wandb_enabled:
294
+ wandb.log({"checkpoint_step": global_step}, step=global_step)
295
+
296
+
297
+ def load_checkpoint(model, optimizer, checkpoint_dir, device):
298
+ """Load the latest checkpoint and return the global step."""
299
+ checkpoint_steps = [
300
+ int(d.name)
301
+ for d in checkpoint_dir.iterdir()
302
+ if d.is_dir() and d.name.isdigit() and not d.name.startswith("tmp_")
303
+ ]
304
+
305
+ if not checkpoint_steps:
306
+ raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
307
+
308
+ latest_step = max(checkpoint_steps)
309
+ ckpt_dir = checkpoint_dir / f"{latest_step}"
310
+
311
+ # Clear memory before loading checkpoints
312
+ if torch.cuda.is_available():
313
+ torch.cuda.empty_cache()
314
+ gc.collect()
315
+ log_memory_usage(device, latest_step, "before_loading_checkpoint")
316
+
317
+ try:
318
+ # Load model state with error handling
319
+ logging.info("Loading model state...")
320
+ safetensors_path = ckpt_dir / "model.safetensors"
321
+
322
+ if safetensors_path.exists():
323
+ model_to_load = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
324
+ safetensors.torch.load_model(model_to_load, safetensors_path, device=str(device))
325
+ logging.info("Loaded model state from safetensors format")
326
+ else:
327
+ raise FileNotFoundError(f"No model checkpoint found at {ckpt_dir}")
328
+
329
+ torch.cuda.empty_cache()
330
+ gc.collect()
331
+ log_memory_usage(device, latest_step, "after_loading_model")
332
+
333
+ # Load optimizer state with error handling
334
+ logging.info("Loading optimizer state...")
335
+ optimizer_path = ckpt_dir / "optimizer.pt"
336
+
337
+ if optimizer_path.exists():
338
+ optimizer_state_dict = torch.load(optimizer_path, map_location=device, weights_only=False)
339
+ logging.info("Loaded optimizer state from pt format")
340
+ else:
341
+ raise FileNotFoundError(f"No optimizer checkpoint found at {ckpt_dir}")
342
+
343
+ optimizer.load_state_dict(optimizer_state_dict)
344
+ del optimizer_state_dict
345
+ torch.cuda.empty_cache()
346
+ gc.collect()
347
+ log_memory_usage(device, latest_step, "after_loading_optimizer")
348
+
349
+ # Load metadata
350
+ logging.info("Loading metadata...")
351
+ metadata = torch.load(ckpt_dir / "metadata.pt", map_location=device, weights_only=False)
352
+ global_step = metadata.get("global_step", latest_step)
353
+ del metadata
354
+ torch.cuda.empty_cache()
355
+ gc.collect()
356
+ log_memory_usage(device, latest_step, "after_loading_metadata")
357
+
358
+ logging.info(f"Successfully loaded all checkpoint components from step {latest_step}")
359
+ return global_step
360
+
361
+ except RuntimeError as e:
362
+ if "out of memory" in str(e):
363
+ # Clear memory and provide detailed error message
364
+ torch.cuda.empty_cache()
365
+ gc.collect()
366
+ logging.error(f"Out of memory error while loading checkpoint: {e!s}")
367
+ log_memory_usage(device, latest_step, "after_oom_error")
368
+ raise RuntimeError(
369
+ "Out of memory while loading checkpoint. Try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
370
+ ) from e
371
+ raise
372
+
373
+
374
+ def get_latest_checkpoint_step(checkpoint_dir):
375
+ """Get the latest checkpoint step number from a checkpoint directory."""
376
+ checkpoint_steps = [
377
+ int(d.name)
378
+ for d in checkpoint_dir.iterdir()
379
+ if d.is_dir() and d.name.isdigit() and not d.name.startswith("tmp_")
380
+ ]
381
+ return max(checkpoint_steps) if checkpoint_steps else None
382
+
383
+
384
+ def log_memory_usage(device, step, phase="unknown"):
385
+ """Log detailed memory usage information."""
386
+ if not torch.cuda.is_available():
387
+ return
388
+
389
+ memory_allocated = torch.cuda.memory_allocated(device) / 1e9
390
+ memory_reserved = torch.cuda.memory_reserved(device) / 1e9
391
+ # Reserved-but-unallocated memory in CUDA's caching allocator (not total free GPU memory)
+ memory_free = (torch.cuda.memory_reserved(device) - torch.cuda.memory_allocated(device)) / 1e9
393
+
394
+ # Get more detailed memory info
395
+ memory_stats = torch.cuda.memory_stats(device)
396
+ max_memory_allocated = memory_stats.get("allocated_bytes.all.peak", 0) / 1e9
397
+ max_memory_reserved = memory_stats.get("reserved_bytes.all.peak", 0) / 1e9
398
+
399
+ # Get DDP info if available
400
+ ddp_info = ""
401
+ if dist.is_initialized():
402
+ ddp_info = f" | DDP: rank={dist.get_rank()}, world_size={dist.get_world_size()}"
403
+
404
+ logging.info(
405
+ f"Step {step} ({phase}): GPU memory - allocated: {memory_allocated:.2f}GB, reserved: {memory_reserved:.2f}GB, free: {memory_free:.2f}GB, peak_allocated: {max_memory_allocated:.2f}GB, peak_reserved: {max_memory_reserved:.2f}GB{ddp_info}"
406
+ )
407
+
408
+
409
+ def train_loop(config: _config.TrainConfig):
410
+ use_ddp, local_rank, device = setup_ddp()
411
+ is_main = (not use_ddp) or (dist.get_rank() == 0)
412
+ set_seed(config.seed, local_rank)
413
+
414
+ # Initialize checkpoint directory and wandb
415
+ resuming = False
416
+ if config.resume:
417
+ # Find checkpoint directory based on experiment name
418
+ exp_checkpoint_dir = config.checkpoint_dir
419
+ if exp_checkpoint_dir.exists():
420
+ # Use validation to find the latest working checkpoint
421
+ latest_step = get_latest_checkpoint_step(exp_checkpoint_dir)
422
+ if latest_step is not None:
423
+ resuming = True
424
+ logging.info(
425
+ f"Resuming from experiment checkpoint directory: {exp_checkpoint_dir} at step {latest_step}"
426
+ )
427
+ else:
428
+ raise FileNotFoundError(f"No valid checkpoints found in {exp_checkpoint_dir} for resume")
429
+ else:
430
+ raise FileNotFoundError(f"Experiment checkpoint directory {exp_checkpoint_dir} does not exist for resume")
431
+ elif config.overwrite and config.checkpoint_dir.exists():
432
+ shutil.rmtree(config.checkpoint_dir)
433
+ logging.info(f"Overwriting checkpoint directory: {config.checkpoint_dir}")
434
+
435
+ # Create checkpoint directory with experiment name
436
+ if not resuming:
437
+ # For new runs, create experiment-specific checkpoint directory
438
+ exp_checkpoint_dir = config.checkpoint_dir
439
+ exp_checkpoint_dir.mkdir(parents=True, exist_ok=True)
440
+ logging.info(f"Created experiment checkpoint directory: {exp_checkpoint_dir}")
441
+ else:
442
+ # For resume, checkpoint_dir is already set to the experiment directory
443
+ logging.info(f"Using existing experiment checkpoint directory: {config.checkpoint_dir}")
444
+
445
+ # Initialize wandb (only on main process)
446
+ if is_main:
447
+ init_wandb(config, resuming=resuming, enabled=config.wandb_enabled)
448
+
449
+ # Build data loader using the unified data loader
450
+ # Calculate effective batch size per GPU for DDP
451
+ # For N GPUs, each GPU should get batch_size/N samples, so total across all GPUs is batch_size
452
+ world_size = torch.distributed.get_world_size() if use_ddp else 1
453
+ effective_batch_size = config.batch_size // world_size
454
+ logging.info(
455
+ f"Using batch size per GPU: {effective_batch_size} (total batch size across {world_size} GPUs: {config.batch_size})"
456
+ )
457
+
458
+ # Pass the original batch size to data loader - it will handle DDP splitting internally
459
+ loader, data_config = build_datasets(config)
460
+
461
+ # Log sample images to wandb on first batch
462
+ if is_main and config.wandb_enabled and not resuming:
463
+ # Create a separate data loader for sample batch to avoid consuming the main loader
464
+ sample_data_loader = _data.create_data_loader(config, framework="pytorch", shuffle=False)
465
+ sample_batch = next(iter(sample_data_loader))
466
+ # Convert observation and actions to torch tensors
467
+ observation, actions = sample_batch
468
+ sample_batch = observation.to_dict()
469
+ sample_batch["actions"] = actions
470
+
471
+ # Create sample images for wandb
472
+ images_to_log = []
473
+ # Get batch size from the first image tensor
474
+ batch_size = next(iter(sample_batch["image"].values())).shape[0]
475
+ for i in range(min(5, batch_size)):
476
+ # Concatenate all camera views horizontally for this batch item
477
+ # Convert from NCHW to NHWC format for wandb
478
+ img_concatenated = torch.cat([img[i].permute(1, 2, 0) for img in sample_batch["image"].values()], axis=1)
479
+ img_concatenated = img_concatenated.cpu().numpy()
480
+ images_to_log.append(wandb.Image(img_concatenated))
481
+
482
+ wandb.log({"camera_views": images_to_log}, step=0)
483
+
484
+ # Clear sample batch from memory aggressively
485
+ del sample_batch, observation, actions, images_to_log, img_concatenated
486
+ del sample_data_loader # Also delete the sample data loader
487
+ gc.collect()
488
+ if torch.cuda.is_available():
489
+ torch.cuda.empty_cache()
490
+ logging.info("Cleared sample batch and data loader from memory")
491
+
492
+ # Build model
493
+ if not isinstance(config.model, openpi.models.pi0_config.Pi0Config):
494
+ # Convert dataclass to Pi0Config if needed
495
+ model_cfg = openpi.models.pi0_config.Pi0Config(
496
+ dtype=config.pytorch_training_precision,
497
+ action_dim=config.model.action_dim,
498
+ action_horizon=config.model.action_horizon,
499
+ max_token_len=config.model.max_token_len,
500
+ paligemma_variant=getattr(config.model, "paligemma_variant", "gemma_2b"),
501
+ action_expert_variant=getattr(config.model, "action_expert_variant", "gemma_300m"),
502
+ pi05=getattr(config.model, "pi05", False),
503
+ )
504
+ else:
505
+ model_cfg = config.model
506
+ # Update dtype to match pytorch_training_precision
507
+ object.__setattr__(model_cfg, "dtype", config.pytorch_training_precision)
508
+
509
+ model = openpi.models_pytorch.pi0_pytorch.PI0Pytorch(model_cfg).to(device)
510
+
511
+ if hasattr(model, "gradient_checkpointing_enable"):
512
+ enable_gradient_checkpointing = True
513
+ model.gradient_checkpointing_enable()
514
+ logging.info("Enabled gradient checkpointing for memory optimization")
515
+ else:
516
+ enable_gradient_checkpointing = False
517
+ logging.info("Gradient checkpointing is not supported for this model")
518
+
519
+ # Log initial memory usage after model creation
520
+ if is_main and torch.cuda.is_available():
521
+ log_memory_usage(device, 0, "after_model_creation")
522
+
523
+ # Enable memory optimizations for large-scale training
524
+ if world_size >= 8:
525
+ torch.backends.cudnn.benchmark = True
526
+ torch.backends.cuda.matmul.allow_tf32 = True
527
+ torch.backends.cudnn.allow_tf32 = True
528
+ # Set memory allocation configuration. Note: the CUDA caching allocator reads this
+ # variable at its first allocation, so exporting it before launching the process is
+ # more reliable than setting it here.
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
530
+ logging.info("Enabled memory optimizations for 8+ GPU training")
531
+
532
+ if use_ddp:
533
+ model = torch.nn.parallel.DistributedDataParallel(
534
+ model,
535
+ device_ids=[device.index] if device.type == "cuda" else None,
536
+ find_unused_parameters=True, # Enabled: some parameters may not receive gradients every step (costs extra overhead)
537
+ gradient_as_bucket_view=True, # Enable for memory efficiency
538
+ static_graph=world_size >= 8, # Enable for 8+ GPUs
539
+ )
540
+
541
+ # Load weights from weight_loader if specified (for fine-tuning)
542
+ if config.pytorch_weight_path is not None:
543
+ logging.info(f"Loading weights from: {config.pytorch_weight_path}")
544
+
545
+ model_path = os.path.join(config.pytorch_weight_path, "model.safetensors")
546
+ safetensors.torch.load_model(
547
+ (model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model), model_path
548
+ )
549
+ logging.info(f"Loaded PyTorch weights from {config.pytorch_weight_path}")
550
+
551
+ regularization_context = prepare_regularization_context(model, config)
552
+
553
+ # Optimizer + learning rate schedule from config
554
+ warmup_steps = config.lr_schedule.warmup_steps
555
+ peak_lr = config.lr_schedule.peak_lr
556
+ decay_steps = config.lr_schedule.decay_steps
557
+ end_lr = config.lr_schedule.decay_lr
558
+
559
+ # Create optimizer with config parameters
560
+ optim = torch.optim.AdamW(
561
+ model.parameters(),
562
+ lr=peak_lr,
563
+ betas=(config.optimizer.b1, config.optimizer.b2),
564
+ eps=config.optimizer.eps,
565
+ weight_decay=config.optimizer.weight_decay,
566
+ )
567
+
568
+ # Load checkpoint if resuming
569
+ global_step = 0
570
+ if resuming:
571
+ global_step = load_checkpoint(model, optim, config.checkpoint_dir, device)
572
+ logging.info(f"Resumed training from step {global_step}")
573
+
574
+ def lr_schedule(step: int):
575
+ if step < warmup_steps:
576
+ # Match JAX behavior: start from peak_lr / (warmup_steps + 1)
577
+ init_lr = peak_lr / (warmup_steps + 1)
578
+ return init_lr + (peak_lr - init_lr) * step / warmup_steps
579
+ # cosine decay
580
+ progress = min(1.0, (step - warmup_steps) / max(1, decay_steps - warmup_steps))
581
+ cos = 0.5 * (1 + np.cos(np.pi * progress))
582
+ return end_lr + (peak_lr - end_lr) * cos
583
+
584
+ model.train()
585
+ start_time = time.time()
586
+ infos = [] # Collect stats over log interval
587
+ if is_main:
588
+ logging.info(
589
+ f"Running on: {platform.node()} | world_size={torch.distributed.get_world_size() if use_ddp else 1}"
590
+ )
591
+ logging.info(
592
+ f"Training config: batch_size={config.batch_size}, effective_batch_size={effective_batch_size}, num_train_steps={config.num_train_steps}"
593
+ )
594
+ logging.info(f"Memory optimizations: gradient_checkpointing={enable_gradient_checkpointing}")
595
+ logging.info(
596
+ f"LR schedule: warmup={warmup_steps}, peak_lr={peak_lr:.2e}, decay_steps={decay_steps}, end_lr={end_lr:.2e}"
597
+ )
598
+ logging.info(
599
+ f"Optimizer: {type(config.optimizer).__name__}, weight_decay={config.optimizer.weight_decay}, clip_norm={config.optimizer.clip_gradient_norm}"
600
+ )
601
+ logging.info("EMA is not supported for PyTorch training")
602
+ logging.info(f"Training precision: {model_cfg.dtype}")
603
+ if regularization_context:
604
+ logging.info(
605
+ "Delta-based regularization: enabled | weight=%.2e | vector=%s",
606
+ config.regularization_coeff,
607
+ regularization_context["vector_path"],
608
+ )
609
+
610
+ # Training loop - iterate until we reach num_train_steps
611
+ pbar = (
612
+ tqdm.tqdm(total=config.num_train_steps, initial=global_step, desc="Training", disable=not is_main)
613
+ if is_main
614
+ else None
615
+ )
616
+
617
+ while global_step < config.num_train_steps:
618
+ # Set epoch for distributed training
619
+ if use_ddp and hasattr(loader, "set_epoch"):
620
+ loader.set_epoch(global_step // len(loader))
621
+
622
+ for observation, actions in loader:
623
+ # Check if we've reached the target number of steps
624
+ if global_step >= config.num_train_steps:
625
+ break
626
+
627
+ # The unified data loader returns (observation, actions) tuple
628
+ observation = jax.tree.map(lambda x: x.to(device), observation) # noqa: PLW2901
629
+ actions = actions.to(torch.float32) # noqa: PLW2901
630
+ actions = actions.to(device) # noqa: PLW2901
631
+
632
+ # Update LR
633
+ for pg in optim.param_groups:
634
+ pg["lr"] = lr_schedule(global_step)
635
+
636
+ # Forward pass
637
+ losses = model(observation, actions)
638
+ # Ensure losses is a tensor and handle different return types
639
+ if isinstance(losses, list | tuple):
640
+ losses = torch.stack(losses)
641
+ elif not isinstance(losses, torch.Tensor):
642
+ losses = torch.tensor(losses, device=device, dtype=torch.float32)
643
+
644
+ action_loss = losses.mean()
645
+ regularization_loss = compute_regularization_loss(regularization_context, device)
646
+ total_loss = action_loss + regularization_loss
647
+
648
+ # Backward pass
649
+ total_loss.backward()
650
+
651
+ # Log memory usage after backward pass
652
+ if global_step < 5 and is_main and torch.cuda.is_available():
653
+ log_memory_usage(device, global_step, "after_backward")
654
+
655
+ # Gradient clipping
656
+ grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.optimizer.clip_gradient_norm)
657
+
658
+ # Optimizer step
659
+ optim.step()
660
+ optim.zero_grad(set_to_none=True)
661
+
662
+ # Clear gradients more aggressively
663
+ for param in model.parameters():
664
+ if param.grad is not None:
665
+ param.grad.detach_()
666
+ param.grad = None
667
+
668
+ # Collect stats
669
+ if is_main:
670
+ infos.append(
671
+ {
672
+ "action_loss": action_loss.item(),
673
+ "regularization_loss": regularization_loss.item(),
674
+ "total_loss": total_loss.item(),
675
+ "learning_rate": optim.param_groups[0]["lr"],
676
+ "grad_norm": float(grad_norm) if isinstance(grad_norm, torch.Tensor) else grad_norm,
677
+ }
678
+ )
679
+
680
+ if is_main and (global_step % config.log_interval == 0):
681
+ elapsed = time.time() - start_time
682
+
683
+ # Average stats over log interval
684
+ avg_action_loss = sum(info["action_loss"] for info in infos) / len(infos)
685
+ avg_regularization_loss = sum(info["regularization_loss"] for info in infos) / len(infos)
686
+ avg_total_loss = sum(info["total_loss"] for info in infos) / len(infos)
687
+ avg_lr = sum(info["learning_rate"] for info in infos) / len(infos)
688
+
689
+ avg_grad_norm = None
690
+ if any("grad_norm" in info for info in infos):
691
+ vals = [
692
+ info["grad_norm"] for info in infos if "grad_norm" in info and info["grad_norm"] is not None
693
+ ]
694
+ if len(vals) > 0:
695
+ avg_grad_norm = sum(vals) / len(vals)
696
+ logging.info(
697
+ f"step={global_step} action_loss={avg_action_loss:.4f} regularization_loss={avg_regularization_loss:.4f} total_loss={avg_total_loss:.4f} lr={avg_lr:.2e} grad_norm={avg_grad_norm:.2f} time={elapsed:.1f}s"
698
+ if avg_grad_norm is not None
699
+ else f"step={global_step} action_loss={avg_action_loss:.4f} regularization_loss={avg_regularization_loss:.4f} total_loss={avg_total_loss:.4f} lr={avg_lr:.2e} time={elapsed:.1f}s"
700
+ )
701
+
702
+ # Log to wandb
703
+ if config.wandb_enabled and len(infos) > 0:
704
+ log_payload = {
705
+ "action_loss": avg_action_loss,
706
+ "regularization_loss": avg_regularization_loss,
707
+ "total_loss": avg_total_loss,
708
+ "learning_rate": avg_lr,
709
+ "step": global_step,
710
+ "time_per_step": elapsed / config.log_interval,
711
+ }
712
+ if avg_grad_norm is not None:
713
+ log_payload["grad_norm"] = avg_grad_norm
714
+ wandb.log(log_payload, step=global_step)
715
+
716
+ start_time = time.time()
717
+ infos = [] # Reset stats collection
718
+
719
+ global_step += 1
720
+ # Save checkpoint using the new mechanism
721
+ save_checkpoint(model, optim, global_step, config, is_main, data_config)
722
+
723
+ # Update progress bar
724
+ if pbar is not None:
725
+ pbar.update(1)
726
+ pbar.set_postfix(
727
+ {
728
+ "action_loss": f"{action_loss.item():.4f}",
729
+ "reg_loss": f"{regularization_loss.item():.4f}",
730
+ "total_loss": f"{total_loss.item():.4f}",
731
+ "lr": f"{optim.param_groups[0]['lr']:.2e}",
732
+ "step": global_step,
733
+ }
734
+ )
735
+
736
+ # Close progress bar
737
+ if pbar is not None:
738
+ pbar.close()
739
+
740
+ # Finish wandb run
741
+ if is_main and config.wandb_enabled:
742
+ wandb.finish()
743
+
744
+ cleanup_ddp()
745
+
746
+
747
+ def main():
748
+ init_logging()
749
+ config = _config.cli()
750
+ train_loop(config)
751
+
752
+
753
+ if __name__ == "__main__":
754
+ main()
capvector-pi05/scripts/train_test.py ADDED
@@ -0,0 +1,30 @@
1
+ import dataclasses
2
+ import os
3
+ import pathlib
4
+
5
+ import pytest
6
+
7
+ os.environ["JAX_PLATFORMS"] = "cpu"
8
+
9
+ from openpi.training import config as _config
10
+
11
+ from . import train
12
+
13
+
14
+ @pytest.mark.parametrize("config_name", ["debug"])
15
+ def test_train(tmp_path: pathlib.Path, config_name: str):
16
+ config = dataclasses.replace(
17
+ _config._CONFIGS_DICT[config_name], # noqa: SLF001
18
+ batch_size=2,
19
+ checkpoint_base_dir=str(tmp_path / "checkpoint"),
20
+ exp_name="test",
21
+ overwrite=False,
22
+ resume=False,
23
+ num_train_steps=2,
24
+ log_interval=1,
25
+ )
26
+ train.main(config)
27
+
28
+ # test resuming
29
+ config = dataclasses.replace(config, resume=True, num_train_steps=4)
30
+ train.main(config)
capvector-pi05/src/openpi/__init__.py ADDED
File without changes
capvector-pi05/src/openpi/conftest.py ADDED
@@ -0,0 +1,17 @@
1
+ import os
2
+
3
+ import pynvml
4
+ import pytest
5
+
6
+
7
+ def set_jax_cpu_backend_if_no_gpu() -> None:
8
+ try:
9
+ pynvml.nvmlInit()
10
+ pynvml.nvmlShutdown()
11
+ except pynvml.NVMLError:
12
+ # No GPU found.
13
+ os.environ["JAX_PLATFORMS"] = "cpu"
14
+
15
+
16
+ def pytest_configure(config: pytest.Config) -> None:
17
+ set_jax_cpu_backend_if_no_gpu()
capvector-pi05/src/openpi/models/__init__.py ADDED
File without changes
capvector-pi05/src/openpi/models/gemma.py ADDED
@@ -0,0 +1,459 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Gemma adaptation for Pi, taken from big_vision.
16
+
17
+ We follow this einsum axis naming convention:
18
+ B: batch
19
+ T: query length
20
+ S: k/v length
21
+ N: num query heads
22
+ K: num k/v heads
23
+ G: num query heads per k/v head
24
+ H: head dim
25
+ D: d_model ("features")
26
+ """
27
+
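As a quick illustration of this naming convention, the grouped-query attention logits computed later in this file contract the head dimension H between queries and keys (standalone toy shapes):

```python
import jax.numpy as jnp

B, T, S, K, G, H = 2, 4, 4, 1, 8, 16  # batch, q len, kv len, kv heads, q heads per kv head, head dim
q = jnp.zeros((B, T, K, G, H))
k = jnp.zeros((B, S, K, H))
logits = jnp.einsum("BTKGH,BSKH->BKGTS", q, k)  # same contraction as in Attention below
print(logits.shape)  # (2, 1, 8, 4, 4)
```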
28
+ from collections.abc import Sequence
29
+ import dataclasses
30
+ from typing import Literal, TypeAlias
31
+
32
+ import einops
33
+ import flax.linen as nn
34
+ import jax
35
+ import jax.numpy as jnp
36
+
37
+ import openpi.models.lora as lora
38
+ import openpi.shared.array_typing as at
39
+ import openpi.training.sharding as sharding
40
+
41
+ PALIGEMMA_VOCAB_SIZE = 257_152
42
+
43
+
44
+ @dataclasses.dataclass
45
+ class Config:
46
+ width: int
47
+ depth: int
48
+ mlp_dim: int
49
+ num_heads: int
50
+ num_kv_heads: int
51
+ head_dim: int
52
+ lora_configs: dict[str, lora.LoRAConfig] = dataclasses.field(default_factory=dict)
53
+
54
+
55
+ Variant = Literal["dummy", "gemma_300m", "gemma_300m_lora", "gemma_2b", "gemma_2b_lora"]
56
+
57
+
58
+ def get_config(variant: Variant) -> Config:
59
+ """Returns config for specified gemma variant."""
60
+ if variant == "dummy":
61
+ return Config(
62
+ width=64,
63
+ depth=4,
64
+ mlp_dim=128,
65
+ num_heads=8,
66
+ num_kv_heads=1,
67
+ head_dim=16,
68
+ )
69
+ if variant == "gemma_300m":
70
+ # 311M params
71
+ return Config(
72
+ width=1024,
73
+ depth=18,
74
+ mlp_dim=4096,
75
+ num_heads=8,
76
+ num_kv_heads=1,
77
+ head_dim=256,
78
+ )
79
+ if variant == "gemma_2b":
80
+ return Config(
81
+ width=2048,
82
+ depth=18,
83
+ mlp_dim=16_384,
84
+ num_heads=8,
85
+ num_kv_heads=1,
86
+ head_dim=256,
87
+ )
88
+ if variant == "gemma_2b_lora":
89
+ return Config(
90
+ width=2048,
91
+ depth=18,
92
+ mlp_dim=16_384,
93
+ num_heads=8,
94
+ num_kv_heads=1,
95
+ head_dim=256,
96
+ lora_configs={"attn": lora.LoRAConfig(rank=16, alpha=16.0), "ffn": lora.LoRAConfig(rank=16, alpha=16.0)},
97
+ )
98
+ if variant == "gemma_300m_lora":
99
+ # 311M params
100
+ return Config(
101
+ width=1024,
102
+ depth=18,
103
+ mlp_dim=4096,
104
+ num_heads=8,
105
+ num_kv_heads=1,
106
+ head_dim=256,
107
+ lora_configs={"attn": lora.LoRAConfig(rank=32, alpha=32.0), "ffn": lora.LoRAConfig(rank=32, alpha=32.0)},
108
+ )
109
+ raise ValueError(f"Unknown variant: {variant}")
110
+
111
+
112
+ @at.typecheck
113
+ class RMSNorm(nn.Module):
114
+ @nn.compact
115
+ def __call__(self, x, cond):
116
+ dtype = x.dtype # original dtype, could be half-precision
117
+ var = jnp.mean(jnp.square(x.astype(jnp.float32)), axis=-1, keepdims=True) # compute variance in float32
118
+ normed_inputs = jnp.asarray(x * jnp.reciprocal(jnp.sqrt(var + 1e-06))) # compute normalization in float32
119
+ if cond is None:
120
+ # regular RMSNorm
121
+ scale = self.param("scale", nn.initializers.zeros_init(), (x.shape[-1],))
122
+ normed_inputs = normed_inputs * (
123
+ 1 + scale
124
+ ) # scale by learned parameter in float32 (matches Flax implementation)
125
+ return normed_inputs.astype(dtype), None # return in original dtype
126
+
127
+ # adaptive RMSNorm
128
+ modulation = nn.Dense(x.shape[-1] * 3, kernel_init=nn.initializers.zeros, dtype=dtype)(cond)
129
+ scale, shift, gate = jnp.split(modulation[:, None, :], 3, axis=-1)
130
+ normed_inputs = normed_inputs * (1 + scale) + shift # scale and shift in float32
131
+ return normed_inputs.astype(dtype), gate
132
+
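In plain JAX, the non-adaptive branch above computes the following (a minimal sketch of the same math; the adaptive branch additionally predicts per-feature scale/shift/gate from the conditioning vector via a zero-initialized Dense layer):

```python
import jax.numpy as jnp

def rms_norm(x, scale, eps=1e-6):
    # x: [..., d]; scale: [d], zero-initialized so (1 + scale) starts at identity
    var = jnp.mean(jnp.square(x.astype(jnp.float32)), axis=-1, keepdims=True)
    return x * jnp.reciprocal(jnp.sqrt(var + eps)) * (1 + scale)
```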
133
+
134
+ @at.typecheck
135
+ class Embedder(nn.Module):
136
+ """Embedder module."""
137
+
138
+ vocab_size: int
139
+ embed_dim: int
140
+
141
+ def setup(self):
142
+ self.input_embedding_table = self.param(
143
+ "input_embedding",
144
+ nn.initializers.normal(),
145
+ (self.vocab_size, self.embed_dim),
146
+ )
147
+
148
+ def encode(self, x):
149
+ x = self.input_embedding_table[(x,)]
150
+ x *= jnp.sqrt(self.embed_dim).astype(x.dtype)
151
+ return x
152
+
153
+ def decode(self, x):
154
+ return jnp.dot(x, self.input_embedding_table.T)
155
+
156
+
157
+ @at.typecheck
158
+ class Attention(nn.Module):
159
+ """Attention module."""
160
+
161
+ configs: Sequence[Config]
162
+
163
+ @nn.compact
164
+ def __call__(self, xs, positions, attn_mask, kv_cache):
165
+ # all experts must share the same head dim, num heads, and num kv heads for self-attention to work
166
+ assert all(config.head_dim == self.configs[0].head_dim for config in self.configs)
167
+ assert all(config.num_heads == self.configs[0].num_heads for config in self.configs)
168
+ assert all(config.num_kv_heads == self.configs[0].num_kv_heads for config in self.configs)
169
+
170
+ dtype = next(x.dtype for x in xs if x is not None) # original dtype, could be half-precision
171
+
172
+ qkvs = []
173
+ for i, (x, config) in enumerate(zip(xs, self.configs, strict=True)):
174
+ if x is None:
175
+ continue
176
+ if config.num_kv_heads == config.num_heads:
177
+ qkv_einsum = lora.Einsum(
178
+ shape=(3, config.num_heads, config.width, config.head_dim),
179
+ name=_name("qkv_einsum", i),
180
+ init_fn=nn.initializers.lecun_normal(in_axis=-2, out_axis=-1, batch_axis=(0, 1)),
181
+ lora_config=config.lora_configs.get("attn"),
182
+ )
183
+ qkvs.append(qkv_einsum("BSD,3KDH->3BSKH", x))
184
+ else:
185
+ q_einsum = lora.Einsum(
186
+ shape=(config.num_heads, config.width, config.head_dim),
187
+ name=_name("q_einsum", i),
188
+ init_fn=nn.initializers.lecun_normal(in_axis=-2, out_axis=-1, batch_axis=(0,)),
189
+ lora_config=config.lora_configs.get("attn"),
190
+ )
191
+ q = q_einsum("BTD,NDH->BTNH", x)
192
+ kv_einsum = lora.Einsum(
193
+ shape=(2, config.num_kv_heads, config.width, config.head_dim),
194
+ name=_name("kv_einsum", i),
195
+ init_fn=nn.initializers.lecun_normal(in_axis=-2, out_axis=-1, batch_axis=(0, 1)),
196
+ lora_config=config.lora_configs.get("attn"),
197
+ )
198
+ k, v = kv_einsum("BSD,2KDH->2BSKH", x)
199
+ qkvs.append((q, k, v))
200
+
201
+ q, k, v = (jnp.concatenate(y, axis=1) for y in zip(*qkvs, strict=True))
202
+
203
+ q = _apply_rope(q, positions=positions)
204
+ q *= self.configs[0].head_dim ** -0.5
205
+
206
+ k = _apply_rope(k, positions=positions)
207
+
208
+ # should still be half-precision here (if input was half-precision)
209
+ assert q.dtype == k.dtype == v.dtype == dtype
210
+
211
+ if kv_cache is not None:
212
+ cache_k, cache_v = kv_cache
213
+ k = jnp.concatenate([cache_k, k], axis=1)
214
+ v = jnp.concatenate([cache_v, v], axis=1)
215
+
216
+ q = einops.rearrange(q, "B T (K G) H -> B T K G H", K=self.configs[0].num_kv_heads)
217
+ logits = jnp.einsum("BTKGH,BSKH->BKGTS", q, k, preferred_element_type=jnp.float32)
218
+
219
+ if attn_mask.shape != (q.shape[0], 1, q.shape[1], k.shape[1]):
220
+ raise ValueError(
221
+ f"Attention mask with shape {attn_mask.shape} but shapes for q and k are: {q.shape} and {k.shape}"
222
+ )
223
+
224
+ # big_neg = jnp.finfo(logits.dtype).min
225
+ big_neg = -2.3819763e38 # See gemma/modules.py
226
+ masked_logits = jnp.where(attn_mask[:, :, None, :, :], logits, big_neg)
227
+
228
+ probs = jax.nn.softmax(masked_logits, axis=-1).astype(dtype)
229
+
230
+ encoded = jnp.einsum("BKGTS,BSKH->BTKGH", probs, v)
231
+ encoded = einops.rearrange(encoded, "B T K G H -> B T (K G) H")
232
+
233
+ out = []
234
+ start = 0
235
+ for i, (x, config) in enumerate(zip(xs, self.configs, strict=True)):
236
+ if x is not None:
237
+ end = start + x.shape[1]
238
+ out_einsum = lora.Einsum(
239
+ shape=(config.num_heads, config.head_dim, config.width),
240
+ name=_name("attn_vec_einsum", i),
241
+ init_fn=nn.initializers.lecun_normal(in_axis=(-3, -2), out_axis=-1),
242
+ lora_config=config.lora_configs.get("attn"),
243
+ )
244
+ out.append(out_einsum("BTNH,NHD->BTD", encoded[:, start:end]))
245
+ start = end
246
+ else:
247
+ out.append(None)
248
+
249
+ return out, (k, v)
250
+
251
+
252
+ @at.typecheck
253
+ class FeedForward(nn.Module):
254
+ """Feed forward module."""
255
+
256
+ features: int
257
+ hidden_dim: int
258
+
259
+ @nn.compact
260
+ def __call__(self, x):
261
+ dtype = x.dtype # original dtype, could be half-precision
262
+ w_gating = self.param(
263
+ "gating_einsum",
264
+ nn.initializers.lecun_normal(in_axis=-2, out_axis=-1, batch_axis=(0,)),
265
+ (2, self.features, self.hidden_dim),
266
+ ).astype(dtype)
267
+ ff_gate = jnp.dot(x, w_gating[0])
268
+ gate_value = nn.gelu(ff_gate)
269
+
270
+ ff1 = jnp.dot(x, w_gating[1])
271
+ activations = gate_value * ff1
272
+
273
+ w_linear = self.param(
274
+ "linear",
275
+ nn.initializers.lecun_normal(in_axis=-2, out_axis=-1),
276
+ (self.hidden_dim, self.features),
277
+ ).astype(dtype)
278
+ outputs = jnp.dot(activations, w_linear)
279
+ assert outputs.dtype == dtype
280
+ return outputs
281
+
282
+
283
+ @at.typecheck
284
+ class Block(nn.Module):
285
+ """Transformer block."""
286
+
287
+ configs: tuple[Config, ...]
288
+
289
+ dropout: float = 0.0
290
+ dropout_bdims: tuple[int, ...] = ()
291
+
292
+ @nn.compact
293
+ def __call__(self, xs, kv_cache, positions, attn_mask, adarms_cond, deterministic=True): # noqa: FBT002
294
+ xs = sharding.activation_sharding_constraint(xs)
295
+ drop = nn.Dropout(self.dropout, self.dropout_bdims) if self.dropout else lambda x, _: x
296
+
297
+ attn = Attention(configs=self.configs, name="attn")
298
+
299
+ pre_attn = []
300
+ gates = []
301
+ for i, x in enumerate(xs):
302
+ if x is not None:
303
+ x, gate = RMSNorm(name=_name("pre_attention_norm", i))(x, adarms_cond[i]) # noqa: PLW2901
304
+ pre_attn.append(x)
305
+ gates.append(gate if x is not None else None)
306
+
307
+ pre_attn = sharding.activation_sharding_constraint(pre_attn)
308
+ post_attn, kv_cache = attn(pre_attn, positions, attn_mask, kv_cache)
309
+ post_attn = jax.tree.map(lambda x: drop(x, deterministic), post_attn)
310
+ post_attn = sharding.activation_sharding_constraint(post_attn)
311
+ xs = [_gated_residual(x, y, gate) for x, y, gate in zip(xs, post_attn, gates, strict=True)]
312
+ xs = sharding.activation_sharding_constraint(xs)
313
+
314
+ out = []
315
+ gates = []
316
+ for i, (x, config) in enumerate(zip(xs, self.configs, strict=True)):
317
+ if x is not None:
318
+ x, gate = RMSNorm(name=_name("pre_ffw_norm", i))(x, adarms_cond[i]) # noqa: PLW2901
319
+ x = lora.FeedForward( # noqa: PLW2901
320
+ features=config.width,
321
+ hidden_dim=config.mlp_dim,
322
+ name=_name("mlp", i),
323
+ lora_config=config.lora_configs.get("ffn"),
324
+ )(x)
325
+ out.append(x)
326
+ gates.append(gate if x is not None else None)
327
+
328
+ out = sharding.activation_sharding_constraint(out)
329
+ out = jax.tree.map(lambda x: drop(x, deterministic), out)
330
+ xs = [_gated_residual(x, y, gate) for x, y, gate in zip(xs, out, gates, strict=True)]
331
+ xs = sharding.activation_sharding_constraint(xs)
332
+
333
+ return xs, kv_cache
334
+
335
+
336
+ KVCache: TypeAlias = tuple[at.Float[at.Array, "l b _t _k _h"], at.Float[at.Array, "l b _t _v _h"]]
337
+
338
+
339
+ @at.typecheck
340
+ class Module(nn.Module):
341
+ """Transformer model, supporting a mixture of different weights for different tokens."""
342
+
343
+ configs: Sequence[Config] # list of configs, one for each expert
344
+ embed_dtype: str
345
+
346
+ dropout: float = 0.0
347
+ dropout_bdims: tuple[int, ...] = () # Every float is dropped independently.
348
+ adarms: bool = False
349
+
350
+ def setup(self):
351
+ # all experts must have the same depth
352
+ assert all(config.depth == self.configs[0].depth for config in self.configs)
353
+
354
+ self.embedder = Embedder(
355
+ vocab_size=PALIGEMMA_VOCAB_SIZE,
356
+ embed_dim=self.configs[0].width, # embedder for first expert only
357
+ name="embedder",
358
+ )
359
+ block_cls = nn.remat(
360
+ Block,
361
+ prevent_cse=False,
362
+ static_argnums=(5,), # 0=self, 6=deterministic
363
+ policy=jax.checkpoint_policies.nothing_saveable,
364
+ )
365
+ self.layers = nn.scan(
366
+ block_cls,
367
+ variable_axes={"params": 0},
368
+ split_rngs={"params": True, "dropout": True},
369
+ in_axes=(
370
+ 0,
371
+ nn.broadcast,
372
+ nn.broadcast,
373
+ nn.broadcast,
374
+ nn.broadcast,
375
+ ), # 0=kv_cache, 1=positions, 2=mask, 3=adarms_cond, 4=deterministic
376
+ length=self.configs[0].depth,
377
+ )(
378
+ configs=self.configs,
379
+ dropout=self.dropout,
380
+ dropout_bdims=self.dropout_bdims,
381
+ )
382
+ self.final_norms = [RMSNorm(name=_name("final_norm", i)) for i in range(len(self.configs))]
383
+
384
+ @at.typecheck
385
+ def embed(self, tokens: at.Int[at.Array, "b t"]) -> at.Float[at.Array, "b t d"]:
386
+ return self.embedder.encode(tokens).astype(self.embed_dtype)
387
+
388
+ @at.typecheck
389
+ def __call__(
390
+ self,
391
+ # list of token arrays, one for each expert, or None if that expert should not be run
392
+ embedded: Sequence[at.Float[at.Array, "b _t _d"] | None],
393
+ positions: at.Int[at.Array, "b t"],
394
+ mask: at.Bool[at.Array, "b t s"],
395
+ adarms_cond: Sequence[at.Float[at.Array, "b _d"] | None] | None = None,
396
+ *,
397
+ kv_cache: KVCache | None = None,
398
+ deterministic: bool = True,
399
+ ) -> tuple[Sequence[at.Float[at.Array, "b _t _d"] | None], KVCache]:
400
+ embedded = jax.tree.map(lambda e: e.astype(self.embed_dtype), embedded)
401
+ mask = jnp.asarray(mask)[:, None, :, :]
402
+ if adarms_cond is None:
403
+ adarms_cond = [None] * len(self.configs)
404
+
405
+ embedded, kv_cache = self.layers(embedded, kv_cache, positions, mask, adarms_cond, deterministic)
406
+
407
+ assert all(e.dtype == jnp.dtype(self.embed_dtype) for e in embedded if e is not None)
408
+
409
+ return [
410
+ f(e, a)[0] if e is not None else e for f, e, a in zip(self.final_norms, embedded, adarms_cond, strict=True)
411
+ ], kv_cache
412
+
413
+ def init(self, use_adarms: Sequence[bool]):
414
+ """Convenience method for initializing all parameters, necessary due to the quirks of linen."""
415
+ self.embed(jnp.zeros((1, 1), dtype=jnp.int32))
416
+ self(
417
+ [jnp.zeros((1, 1, c.width)) for c in self.configs],
418
+ jnp.zeros((1, len(self.configs)), dtype=jnp.int32),
419
+ jnp.zeros((1, len(self.configs), len(self.configs)), dtype=bool),
420
+ adarms_cond=[jnp.zeros((1, c.width)) if u else None for u, c in zip(use_adarms, self.configs, strict=True)],
421
+ )
422
+
423
+
424
+ def _apply_rope(x, *, positions, max_wavelength=10_000):
425
+ """Applies RoPE positions [B, L] to x [B, L, H, D]."""
426
+ freq_exponents = (2.0 / x.shape[-1]) * jnp.arange(x.shape[-1] // 2, dtype=jnp.float32)
427
+ timescale = max_wavelength**freq_exponents
428
+ radians = positions[..., None] / timescale[None, None, :]
429
+ radians = radians[..., None, :]
430
+ assert radians.dtype == jnp.float32
431
+ # radians.shape = [...,L,1,d=D/2]
432
+ sin, cos = jnp.sin(radians), jnp.cos(radians)
433
+ x1, x2 = jnp.split(x, 2, axis=-1)
434
+ res = jnp.concatenate([x1 * cos - x2 * sin, x2 * cos + x1 * sin], axis=-1)
435
+ assert res.dtype == jnp.float32
436
+ # The original bigvision impl allows RoPE to upcast to float32. It is then immediately downcast again to the cache
437
+ # dtype when in inference mode (but not in training mode). I don't think any of this was intentional. Based on the
438
+ # original DeepMind impl, as well as the widely-used transformers impl, it is ok to always downcast back to bfloat16
439
+ # here.
440
+ return res.astype(x.dtype)
441
+
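A useful sanity check (illustrative, not part of the module): because RoPE is a rotation of each (x1, x2) pair, it changes phases but preserves vector norms:

```python
import jax
import jax.numpy as jnp

x = jax.random.normal(jax.random.key(0), (1, 8, 2, 64))    # [B, L, H, D]
positions = jnp.arange(8, dtype=jnp.float32)[None, :]       # [B, L]
rotated = _apply_rope(x, positions=positions)
assert jnp.allclose(jnp.linalg.norm(x, axis=-1), jnp.linalg.norm(rotated, axis=-1), atol=1e-4)
```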
442
+
443
+ def _name(name, i):
444
+ # we name layers like this because we want the first expert's weights to have no suffix (e.g., "attn"), so that they
445
+ # can be loaded seamlessly from the existing PaliGemma checkpoint. subsequent experts will have a suffix (e.g.,
446
+ # "attn_1") and their weights will be initialized from scratch. in practice, we only use two experts -- PaliGemma,
447
+ # and the action expert.
448
+ if i == 0:
449
+ return name
450
+ return f"{name}_{i}"
451
+
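Concretely (illustrative):

```python
assert _name("attn", 0) == "attn"    # expert 0: names match the PaliGemma checkpoint
assert _name("attn", 1) == "attn_1"  # expert 1 (action expert): initialized from scratch
```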
452
+
453
+ def _gated_residual(x, y, gate):
454
+ assert (x is None) == (y is None)
455
+ if x is None:
456
+ return None
457
+ if gate is None:
458
+ return x + y
459
+ return x + y * gate
capvector-pi05/src/openpi/models/gemma_fast.py ADDED
@@ -0,0 +1,437 @@
1
+ # Copyright 2024 Big Vision Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Gemma model implementation from big_vision/models/ppp/gemma.py (with small modifications for NNX compatibility)
17
+ Used for FAST autoregressive policies.
18
+ """
19
+
20
+ import dataclasses
21
+ from typing import Literal, TypeAlias
22
+
23
+ import einops
24
+ import flax.linen as nn
25
+ import jax
26
+ import jax.numpy as jnp
27
+ import ml_collections
28
+
29
+ import openpi.models.lora as lora
30
+ import openpi.shared.array_typing as at
31
+
32
+ Variant = Literal["gemma_2b", "gemma_2b_lora"]
33
+
34
+
35
+ def get_config(variant):
36
+ """Returns config for specified gemma variant."""
37
+ if variant == "gemma_2b":
38
+ return ml_collections.ConfigDict(
39
+ {
40
+ "variant": variant,
41
+ "width": 2048,
42
+ "depth": 18,
43
+ "mlp_dim": 16_384,
44
+ "num_heads": 8,
45
+ "num_kv_heads": 1,
46
+ "head_dim": 256,
47
+ "norm_eps": 1e-6,
48
+ "vocab_size": 257_152,
49
+ "scan": True,
50
+ "remat_policy": "nothing_saveable",
51
+ }
52
+ )
53
+ if variant == "gemma_2b_lora":
54
+ return ml_collections.ConfigDict(
55
+ {
56
+ "variant": variant,
57
+ "width": 2048,
58
+ "depth": 18,
59
+ "mlp_dim": 16_384,
60
+ "num_heads": 8,
61
+ "num_kv_heads": 1,
62
+ "head_dim": 256,
63
+ "norm_eps": 1e-6,
64
+ "vocab_size": 257_152,
65
+ "scan": True,
66
+ "remat_policy": "nothing_saveable",
67
+ "lora_configs": {
68
+ "attn": lora.LoRAConfig(rank=16, alpha=16.0),
69
+ "ffn": lora.LoRAConfig(rank=16, alpha=16.0),
70
+ },
71
+ }
72
+ )
73
+ raise ValueError(f"Unknown variant: {variant}")
74
+
75
+
76
+ @at.typecheck
77
+ class Einsum(nn.Module):
78
+ shape: tuple[int, ...]
79
+
80
+ @nn.compact
81
+ def __call__(self, eqn, x):
82
+ dtype = x.dtype # original dtype, could be half-precision
83
+ w = self.param("w", nn.initializers.zeros_init(), self.shape).astype(dtype)
84
+ return jnp.einsum(eqn, x, w)
85
+
86
+
87
+ @at.typecheck
88
+ class RMSNorm(nn.Module):
89
+ @nn.compact
90
+ def __call__(self, x):
91
+ dtype = x.dtype # original dtype, could be half-precision
92
+ scale = self.param("scale", nn.initializers.zeros_init(), (x.shape[-1],))
93
+ var = jnp.mean(jnp.square(x.astype(jnp.float32)), axis=-1, keepdims=True) # compute variance in float32
94
+ normed_inputs = jnp.asarray(x * jnp.reciprocal(jnp.sqrt(var + 1e-06))) # compute normalization in float32
95
+ normed_inputs = normed_inputs * (
96
+ 1 + scale
97
+ ) # scale by learned parameter in float32 (matches Flax implementation)
98
+ return normed_inputs.astype(dtype) # return in original dtype
99
+
100
+
101
+ @at.typecheck
102
+ class Embedder(nn.Module):
103
+ """Embedder module."""
104
+
105
+ vocab_size: int
106
+ embed_dim: int
107
+
108
+ def setup(self):
109
+ self.input_embedding_table = self.param(
110
+ "input_embedding",
111
+ nn.initializers.zeros_init(),
112
+ (self.vocab_size, self.embed_dim),
113
+ )
114
+
115
+ def encode(self, x):
116
+ x = self.input_embedding_table[(x,)]
117
+ x *= jnp.sqrt(self.embed_dim).astype(x.dtype)
118
+ return x
119
+
120
+ def decode(self, x):
121
+ return jnp.dot(x, self.input_embedding_table.T)
122
+
123
+
124
+ @at.typecheck
125
+ class Attention(nn.Module):
126
+ """Attention module."""
127
+
128
+ num_heads: int
129
+ num_kv_heads: int
130
+ features: int
131
+ head_dim: int
132
+
133
+ cache_dtype: str | None = None
134
+
135
+ lora_config: lora.LoRAConfig | None = None
136
+
137
+ def setup(self):
138
+ if self.num_kv_heads == self.num_heads:
139
+ self.qkv_einsum = lora.Einsum(
140
+ shape=(3, self.num_heads, self.features, self.head_dim),
141
+ name="qkv_einsum",
142
+ init_fn=nn.initializers.lecun_normal(in_axis=-2, out_axis=-1, batch_axis=(0, 1)),
143
+ lora_config=self.lora_config,
144
+ )
145
+ else:
146
+ self.q_einsum = lora.Einsum(
147
+ shape=(self.num_heads, self.features, self.head_dim),
148
+ name="q_einsum",
149
+ init_fn=nn.initializers.lecun_normal(in_axis=-2, out_axis=-1, batch_axis=(0,)),
150
+ lora_config=self.lora_config,
151
+ )
152
+ self.kv_einsum = lora.Einsum(
153
+ shape=(2, self.num_kv_heads, self.features, self.head_dim),
154
+ name="kv_einsum",
155
+ init_fn=nn.initializers.lecun_normal(in_axis=-2, out_axis=-1, batch_axis=(0, 1)),
156
+ lora_config=self.lora_config,
157
+ )
158
+ self.attn_vec_einsum = lora.Einsum(
159
+ shape=(self.num_heads, self.head_dim, self.features),
160
+ name="attn_vec_einsum",
161
+ init_fn=nn.initializers.lecun_normal(in_axis=-2, out_axis=-1, batch_axis=(0,)),
162
+ lora_config=self.lora_config,
163
+ )
164
+
165
+ def _init_cache(self, k, v, cache_size):
166
+ """Initialize KV cache"""
167
+ prefill_len = k.shape[1]
168
+ pad_width = ((0, 0), (0, cache_size - prefill_len), (0, 0), (0, 0))
169
+ cache_dtype = self.cache_dtype or k.dtype
170
+ k_cache = jnp.pad(k.astype(cache_dtype), pad_width)
171
+ v_cache = jnp.pad(v.astype(cache_dtype), pad_width)
172
+ idx = jnp.zeros((k.shape[0],), dtype=jnp.int32) + prefill_len
173
+ return idx, k_cache, v_cache
174
+
175
+ def _update_cache(self, k, v, idx, k_cache, v_cache):
176
+ """Update KV cache with new values"""
177
+ assert k.shape[1] == 1, "Only support kv-cache updates of length 1"
178
+ indices = (0, idx[0], 0, 0)
179
+ cache_dtype = self.cache_dtype or k.dtype
180
+ k_new = jax.lax.dynamic_update_slice(k_cache, k.astype(cache_dtype), indices)
181
+ v_new = jax.lax.dynamic_update_slice(v_cache, v.astype(cache_dtype), indices)
182
+ idx_new = idx + 1
183
+ return idx_new, k_new, v_new
184
+
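A standalone sketch of the cache write performed above (toy shapes; `dynamic_update_slice` writes the new length-1 key/value block at the current index):

```python
import jax
import jax.numpy as jnp

k_cache = jnp.zeros((1, 8, 1, 4))        # [B, cache_len, K, H]
k_step = jnp.ones((1, 1, 1, 4))          # keys for one decode step
idx = jnp.array([3], dtype=jnp.int32)    # current write position
k_cache = jax.lax.dynamic_update_slice(k_cache, k_step, (0, idx[0], 0, 0))
assert float(k_cache[0, 3].sum()) == 4.0
```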
185
+ @nn.compact
186
+ def __call__(self, x, positions, attn_mask, kv_cache, decode, deterministic=True): # noqa: FBT002
187
+ dtype = x.dtype # original dtype, could be half-precision
188
+ if self.num_kv_heads == self.num_heads:
189
+ q, k, v = self.qkv_einsum("BSD,3KDH->3BSKH", x)
190
+ else:
191
+ q = self.q_einsum("BTD,NDH->BTNH", x)
192
+ k, v = self.kv_einsum("BSD,2KDH->2BSKH", x)
193
+
194
+ q = _apply_rope(q, positions=positions) # promotes to float32
195
+ q *= self.head_dim**-0.5
196
+
197
+ k = _apply_rope(k, positions=positions) # promotes to float32
198
+
199
+ if kv_cache is None:
200
+ idx, k_cache, v_cache = self._init_cache(k, v, attn_mask.shape[-1])
201
+ else:
202
+ idx, k_cache, v_cache = kv_cache
203
+ idx, k_cache, v_cache = self._update_cache(k, v, idx, k_cache, v_cache)
204
+
205
+ k, v = k_cache, v_cache
206
+ kv_cache = (idx, k_cache, v_cache)
207
+
208
+ q = einops.rearrange(q, "B T (K G) H -> B T K G H", K=self.num_kv_heads)
209
+ logits = jnp.einsum("BTKGH,BSKH->BKGTS", q, k, preferred_element_type=jnp.float32)
210
+
211
+ if attn_mask.shape != (q.shape[0], 1, q.shape[1], k.shape[1]):
212
+ raise ValueError(
213
+ f"Attention mask with shape {attn_mask.shape} but shapes for q and k are: {q.shape} and {k.shape}"
214
+ )
215
+
216
+ # big_neg = jnp.finfo(logits.dtype).min
217
+ big_neg = -2.3819763e38 # See gemma/modules.py
218
+ masked_logits = jnp.where(attn_mask[:, :, None, :, :], logits, big_neg)
219
+
220
+ probs = jax.nn.softmax(masked_logits, axis=-1).astype(dtype)
221
+
222
+ encoded = jnp.einsum("BKGTS,BSKH->BTKGH", probs, v)
223
+ encoded = einops.rearrange(encoded, "B T K G H -> B T (K G) H")
224
+ return self.attn_vec_einsum("BTNH,NHD->BTD", encoded), kv_cache
225
+
226
+
227
+ @at.typecheck
228
+ class Block(nn.Module):
229
+ """Transformer block."""
230
+
231
+ num_heads: int
232
+ num_kv_heads: int
233
+ embed_dim: int
234
+ head_dim: int
235
+ hidden_dim: int
236
+
237
+ dropout: float = 0.0
238
+ dropout_bdims: tuple[int, ...] = ()
239
+ cache_dtype: str | None = None
240
+ lora_configs: ml_collections.ConfigDict = dataclasses.field(default_factory=ml_collections.ConfigDict)
241
+
242
+ def setup(self):
243
+ self.pre_attention_norm = RMSNorm()
244
+ self.attn = Attention(
245
+ num_heads=self.num_heads,
246
+ num_kv_heads=self.num_kv_heads,
247
+ features=self.embed_dim,
248
+ head_dim=self.head_dim,
249
+ cache_dtype=self.cache_dtype,
250
+ lora_config=self.lora_configs.get("attn"),
251
+ )
252
+ self.pre_ffw_norm = RMSNorm()
253
+ self.mlp = lora.FeedForward(
254
+ features=self.embed_dim, hidden_dim=self.hidden_dim, name="mlp", lora_config=self.lora_configs.get("ffn")
255
+ )
256
+ if self.dropout:
257
+ self.drop = nn.Dropout(self.dropout, self.dropout_bdims)
258
+ else:
259
+ self.drop = lambda x, _: x
260
+
261
+ def __call__(self, x, kv_cache, positions, attn_mask, decode, deterministic=True): # noqa: FBT002
262
+ x = nn.with_logical_constraint(x, ("act_batch", "act_len", "act_emb"))
263
+ inputs_normalized = self.pre_attention_norm(x)
264
+ attn_output, kv_cache = self.attn(inputs_normalized, positions, attn_mask, kv_cache, decode, deterministic)
265
+ attn_output = self.drop(attn_output, deterministic)
266
+ attn_output += x
267
+ residual = attn_output
268
+ attn_output = self.pre_ffw_norm(attn_output)
269
+ outputs = self.mlp(attn_output)
270
+ outputs = self.drop(outputs, deterministic)
271
+ outputs = residual + outputs
272
+ return outputs, kv_cache
273
+
274
+
275
+ KVCache: TypeAlias = tuple[at.Int[at.Array, " b"], at.Float[at.Array, "b _t _k _h"], at.Float[at.Array, "b _t _v _h"]]
276
+
277
+
278
+ @at.typecheck
279
+ class Module(nn.Module):
280
+ """gemma model."""
281
+
282
+ variant: str
283
+
284
+ width: int
285
+ depth: int
286
+ mlp_dim: int
287
+ num_heads: int
288
+ num_kv_heads: int
289
+ head_dim: int
290
+ norm_eps: float
291
+ vocab_size: int
292
+ embed_dtype: str
293
+
294
+ dropout: float = 0.0
295
+ dropout_bdims: tuple[int, ...] = () # Every float is dropped independently.
296
+ cache_dtype: str | None = None
297
+
298
+ scan: bool = False
299
+ remat_policy: str = "none"
300
+ lora_configs: ml_collections.ConfigDict = dataclasses.field(default_factory=ml_collections.ConfigDict)
301
+
302
+ @nn.compact
303
+ def __call__(
304
+ self,
305
+ tokens=None,
306
+ embedded_prefix=None,
307
+ embed_only=False, # noqa: FBT002
308
+ pre_logits=None,
309
+ positions=None,
310
+ mask=None,
311
+ decode=False, # noqa: FBT002
312
+ kv_cache=None,
313
+ deterministic=True, # noqa: FBT002
314
+ return_prelogits=False, # noqa: FBT002
315
+ ):
316
+ """Embed only, or complete forward pass.
317
+
318
+ Args:
319
+ tokens: Embedded and then appended to `embedded_prefix`. Can be None.
320
+ embedded_prefix: Optional prefix that is already embedded.
321
+ embed_only: Whether to compute embeddings only.
322
+ pre_logits: If present, computes logits from `pre_logits` and returns them.
323
+ positions: Optional `[B, T]` array specifying the absolute positions of
324
+ the tokens.
325
+ mask: Optional attention mask `[B, T, S]`.
326
+ decode: Whether to use kv-cache. Caller must pass masks and positions.
327
+ deterministic: Forwarded to all dropout layers.
328
+ return_prelogits: Whether to return the pre-logits.
329
+
330
+ Returns:
331
+ If `embed_only=False`, then `(logits, out)` will be returned.
332
+ If `embed_only=True`, then the embeddings will be returned.
333
+ If `return_prelogits=True`, then the pre-logits will be returned.
334
+ """
335
+ out = {}
336
+
337
+ embedder = Embedder(vocab_size=self.vocab_size, embed_dim=self.width, name="embedder")
338
+
339
+ if pre_logits is not None:
340
+ x = out["pre_logits"] = pre_logits
341
+ logits = out["logits"] = embedder.decode(x)
342
+ return logits, out
343
+
344
+ x = []
345
+ if embedded_prefix is not None:
346
+ x.append(embedded_prefix)
347
+ if tokens is not None:
348
+ x.append(embedder.encode(tokens))
349
+
350
+ x = jnp.concatenate(x, axis=-2)
351
+ x = x.astype(self.embed_dtype)
352
+ batch_size, seq_len, width = x.shape
353
+
354
+ if embed_only:
355
+ return x
356
+
357
+ if decode:
358
+ assert positions is not None and mask is not None, ( # noqa: PT018
359
+ "Must explicitly pass positions and mask for decoding."
360
+ )
361
+
362
+ if positions is None:
363
+ positions = jnp.arange(seq_len).astype(jnp.int32)[None, :]
364
+ assert positions.shape[1] == x.shape[1], (positions.shape, x.shape)
365
+
366
+ if mask is None:
367
+ mask = nn.attention.make_causal_mask(jnp.ones([batch_size, seq_len]))
368
+ if mask.ndim == 3:
369
+ mask = mask[:, None, :, :]
370
+ cache_size = max(seq_len, mask.shape[-1])
371
+ assert mask.shape == (batch_size, 1, seq_len, cache_size), mask.shape
372
+
373
+ if self.remat_policy == "none":
374
+ block_cls = Block
375
+ else:
376
+ block_cls = nn.remat(
377
+ Block,
378
+ prevent_cse=not self.scan,
379
+ static_argnums=(5, 6), # 0=self, 5=decode, 6=deterministic
380
+ policy=getattr(jax.checkpoint_policies, self.remat_policy),
381
+ )
382
+
383
+ block_kw = {
384
+ "num_heads": self.num_heads,
385
+ "head_dim": self.head_dim,
386
+ "num_kv_heads": self.num_kv_heads,
387
+ "embed_dim": width,
388
+ "hidden_dim": self.mlp_dim,
389
+ "dropout": self.dropout,
390
+ "dropout_bdims": self.dropout_bdims,
391
+ "cache_dtype": self.cache_dtype,
392
+ "lora_configs": self.lora_configs,
393
+ }
394
+ layers = self.scope.push("layers")
395
+ blocks = [
396
+ nn.scan(
397
+ block_cls,
398
+ variable_axes={"params": 0},
399
+ split_rngs={"params": True, "dropout": True},
400
+ in_axes=(0, nn.broadcast, nn.broadcast, nn.broadcast, nn.broadcast), # 0=kv_cache, 1=positions, 2=mask
401
+ length=self.depth,
402
+ )(parent=layers, **block_kw)
403
+ ]
404
+ for block in blocks:
405
+ x, kv_cache = block(x, kv_cache, positions, mask, decode, deterministic)
406
+
407
+ assert x.dtype == jnp.dtype(self.embed_dtype) # Sanity check.
408
+ out["encoded"] = x
409
+
410
+ x = RMSNorm(name="final_norm")(x)
411
+ out["pre_logits"] = x
412
+ if return_prelogits:
413
+ return x, kv_cache, out
414
+
415
+ x = embedder.decode(x)
416
+ out["logits"] = x
417
+
418
+ return x, kv_cache, out
419
+
420
+ def init(self):
421
+ """Convenience method for initializing all parameters, necessary due to the quirks of linen."""
422
+ self(jnp.zeros((1, 1), dtype=jnp.int32))
423
+
424
+
425
+ def _apply_rope(x, *, positions, max_wavelength=10_000):
426
+ """Applies RoPE positions [B, L] to x [B, L, H, D]."""
427
+ freq_exponents = (2.0 / x.shape[-1]) * jnp.arange(x.shape[-1] // 2, dtype=jnp.float32)
428
+ timescale = max_wavelength**freq_exponents
429
+ radians = positions[..., None] / timescale[None, None, :]
430
+ radians = radians[..., None, :]
431
+ assert radians.dtype == jnp.float32
432
+ # radians.shape = [...,L,1,d=D/2]
433
+ sin, cos = jnp.sin(radians), jnp.cos(radians)
434
+ x1, x2 = jnp.split(x, 2, axis=-1)
435
+ res = jnp.concatenate([x1 * cos - x2 * sin, x2 * cos + x1 * sin], axis=-1)
436
+ assert res.dtype == jnp.float32
437
+ return res
capvector-pi05/src/openpi/models/lora.py ADDED
@@ -0,0 +1,148 @@
1
+ import math
2
+ import re
3
+
4
+ import flax.linen as nn
5
+ import flax.struct as struct
6
+ import jax.numpy as jnp
7
+
8
+ import openpi.shared.array_typing as at
9
+
10
+
11
+ @struct.dataclass
12
+ class LoRAConfig:
13
+ """Configuration for LoRA."""
14
+
15
+ # LoRA rank.
16
+ rank: int
17
+ # LoRA scaling factor.
18
+ alpha: float = 1.0
19
+ # Initialization function for LoRA parameters.
20
+ init_fn: nn.initializers.Initializer = nn.initializers.normal(stddev=0.01)
21
+ # Enable rank-stabilized LoRA: https://arxiv.org/pdf/2312.03732
22
+ rslora: bool = False
23
+ # Axes in the weight to apply LoRA to. Should typically be the last two axes.
24
+ axes: tuple[int, int] = (-2, -1)
25
+ # Axis label which is used by LoRA in einsum equations. Must not be present in the original equation.
26
+ label: str = "L"
27
+
28
+ @property
29
+ def scaling_value(self) -> float:
30
+ return self.alpha / math.sqrt(self.rank) if self.rslora else self.alpha / self.rank
31
+
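For example, with the `rank=16, alpha=16.0` configuration used by the 2B LoRA variants, the standard scaling is 16/16 = 1.0, while rank-stabilized scaling is 16/sqrt(16) = 4.0, which keeps update magnitudes more stable as the rank grows:

```python
assert LoRAConfig(rank=16, alpha=16.0).scaling_value == 1.0
assert LoRAConfig(rank=16, alpha=16.0, rslora=True).scaling_value == 4.0
```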
32
+
33
+ class Einsum(nn.Module):
34
+ """Einsum with LoRA support. Can be used as a drop-in replacement for the Gemma Einsum."""
35
+
36
+ # Shape of the weight.
37
+ shape: tuple[int, ...]
38
+ # Initialization function for the weight.
39
+ init_fn: nn.initializers.Initializer = nn.initializers.zeros
40
+ # If not None, apply LoRA to the weight.
41
+ lora_config: LoRAConfig | None = None
42
+
43
+ def setup(self):
44
+ self.w = self.param("w", self.init_fn, self.shape)
45
+
46
+ if config := self.lora_config:
47
+ # Setup LoRA parameters.
48
+ shape_a, shape_b = list(self.shape), list(self.shape)
49
+ shape_a[config.axes[1]] = config.rank
50
+ shape_b[config.axes[0]] = config.rank
51
+ self.w_a = self.param("lora_a", config.init_fn, shape_a)
52
+ self.w_b = self.param("lora_b", config.init_fn, shape_b)
53
+
54
+ @nn.compact
55
+ def __call__(self, eqn: str, x):
56
+ dtype = x.dtype # original dtype, could be half-precision
57
+ result = jnp.einsum(eqn, x, self.w.astype(dtype))
58
+
59
+ if config := self.lora_config:
60
+ eqn_a, eqn_b = self._make_lora_eqns(eqn)
61
+ lora = jnp.einsum(eqn_a, x, self.w_a.astype(dtype))
62
+ lora = jnp.einsum(eqn_b, lora, self.w_b.astype(dtype))
63
+ result = result + lora * config.scaling_value
64
+
65
+ return result
66
+
67
+ def _make_lora_eqns(self, eqn: str) -> tuple[str, str]:
68
+ if "L" in eqn:
69
+ raise ValueError(f"L already in eqn: {eqn}")
70
+ if not (m := re.match("(.*),(.*)->(.*)", eqn)):
71
+ raise ValueError(f"Unsupported einsum eqn: {eqn}")
72
+ lhs, rhs, out = m.groups()
73
+
74
+ assert self.lora_config is not None
75
+ a_label, b_label = (rhs[x] for x in self.lora_config.axes)
76
+ label = self.lora_config.label
77
+
78
+ a_rhs = rhs.replace(b_label, label)
79
+ a_out = out.replace(b_label, label)
80
+ eqn_a = f"{lhs},{a_rhs}->{a_out}"
81
+
82
+ b_rhs = rhs.replace(a_label, label)
83
+ eqn_b = f"{a_out},{b_rhs}->{out}"
84
+
85
+ return eqn_a, eqn_b
86
+
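A worked example of the rewriting above, using the query projection equation with the default `axes=(-2, -1)` and label `L` (comments only, to show the string transformations):

```python
# eqn   = "BTD,NDH->BTNH"   ->  a_label = "D", b_label = "H"
# eqn_a = "BTD,NDL->BTNL"   # x @ lora_a: the rank axis L replaces H
# eqn_b = "BTNL,NLH->BTNH"  # intermediate @ lora_b: L replaces D
```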
87
+
88
+ class FeedForward(nn.Module):
89
+ """Feed forward module."""
90
+
91
+ features: int
92
+ hidden_dim: int
93
+ # If not None, apply LoRA to the weight.
94
+ lora_config: LoRAConfig | None = None
95
+
96
+ def setup(self):
97
+ self.w_gating = self.param(
98
+ "gating_einsum",
99
+ nn.initializers.lecun_normal(in_axis=-2, out_axis=-1, batch_axis=(0,)),
100
+ (2, self.features, self.hidden_dim),
101
+ )
102
+ self.w_linear = self.param(
103
+ "linear",
104
+ nn.initializers.lecun_normal(in_axis=-2, out_axis=-1),
105
+ (self.hidden_dim, self.features),
106
+ )
107
+ self.w_gating_lora = None
108
+ self.w_linear_lora = None
109
+ if self.lora_config:
110
+ # Setup LoRA parameters.
111
+ # TODO: follow up with a simplified init_fn api.
112
+ self.w_gating_lora = (
113
+ self.param("gating_einsum_lora_a", self.lora_config.init_fn, (2, self.features, self.lora_config.rank)),
114
+ self.param(
115
+ "gating_einsum_lora_b", self.lora_config.init_fn, (2, self.lora_config.rank, self.hidden_dim)
116
+ ),
117
+ )
118
+ self.w_linear_lora = (
119
+ self.param("linear_lora_a", self.lora_config.init_fn, (self.hidden_dim, self.lora_config.rank)),
120
+ self.param("linear_lora_b", self.lora_config.init_fn, (self.lora_config.rank, self.features)),
121
+ )
122
+
123
+ @nn.compact
124
+ def __call__(self, x):
125
+ dtype = x.dtype # original dtype, could be half-precision
126
+ ff_gate = self._dot(
127
+ x,
128
+ self.w_gating[0],
129
+ None if self.w_gating_lora is None else (self.w_gating_lora[0][0], self.w_gating_lora[1][0]),
130
+ )
131
+ gate_value = nn.gelu(ff_gate)
132
+
133
+ ff1 = self._dot(
134
+ x,
135
+ self.w_gating[1],
136
+ None if self.w_gating_lora is None else (self.w_gating_lora[0][1], self.w_gating_lora[1][1]),
137
+ )
138
+ activations = gate_value * ff1
139
+
140
+ outputs = self._dot(activations, self.w_linear, self.w_linear_lora)
141
+ assert outputs.dtype == dtype
142
+ return outputs
143
+
144
+ def _dot(self, x: at.Array, w: at.Array, lora_weights: tuple[at.Array, at.Array] | None) -> at.Array:
145
+ base = jnp.dot(x, w.astype(x.dtype))
146
+ if lora_weights is None:
147
+ return base
148
+ return base + jnp.dot(jnp.dot(x, lora_weights[0].astype(x.dtype)), lora_weights[1].astype(x.dtype))
capvector-pi05/src/openpi/models/lora_test.py ADDED
@@ -0,0 +1,94 @@
1
+ import flax.linen as nn
2
+ import jax
3
+ import jax.numpy as jnp
4
+
5
+ import openpi.models.lora as lora
6
+
7
+
8
+ def test_lora_einsum_params_shape():
9
+ shape = (3, 8, 32, 4) # (3KDH)
10
+ einsum = lora.Einsum(shape)
11
+ lora0 = lora.Einsum(shape, lora_config=lora.LoRAConfig(rank=2))
12
+ lora1 = lora.Einsum(shape, lora_config=lora.LoRAConfig(rank=2, axes=(1, 2)))
13
+
14
+ key = jax.random.key(0)
15
+ x = jax.random.normal(key, (8, 64, 32)) # (BSD)
16
+ eqn = "BSD,3KDH->3BSKH"
17
+
18
+ # Ensure that lora parameters are not initialized when LoRA is not used.
19
+ params = einsum.init(key, eqn, x)
20
+ assert "lora_a" not in params["params"]
21
+ assert "lora_b" not in params["params"]
22
+
23
+ # Check that default axes work.
24
+ params_lora0 = lora0.init(key, eqn, x)
25
+ assert params_lora0["params"]["lora_a"].shape == (3, 8, 32, 2)
26
+ assert params_lora0["params"]["lora_b"].shape == (3, 8, 2, 4)
27
+
28
+ # Check that user provided axes work.
29
+ params_lora1 = lora1.init(key, eqn, x)
30
+ assert params_lora1["params"]["lora_a"].shape == (3, 8, 2, 4)
31
+ assert params_lora1["params"]["lora_b"].shape == (3, 2, 32, 4)
32
+
33
+
34
+ def test_lora_einsum_same_output():
35
+ shape = (3, 8, 32, 4) # (3KDH)
36
+ einsum = lora.Einsum(shape)
37
+ einsum_lora = lora.Einsum(shape, lora_config=lora.LoRAConfig(rank=2, init_fn=nn.initializers.zeros))
38
+
39
+ key = jax.random.key(0)
40
+ x = jax.random.normal(key, (8, 64, 32)) # (BSD)
41
+ eqn = "BSD,3KDH->3BSKH"
42
+
43
+ params = einsum.init(key, eqn, x)
44
+ output = einsum.apply(params, eqn, x)
45
+
46
+ params_lora = einsum_lora.init(key, eqn, x)
47
+ output_lora = einsum_lora.apply(params_lora, eqn, x)
48
+
49
+ # Results are the same since the LoRA parameters are initialized to zeros.
50
+ assert jnp.allclose(output, output_lora)
51
+
52
+
53
+ def test_lora_ffn_params_shape():
54
+ ffn = lora.FeedForward(features=8, hidden_dim=32)
55
+ ffn_lora = lora.FeedForward(
56
+ features=8,
57
+ hidden_dim=32,
58
+ lora_config=lora.LoRAConfig(rank=2),
59
+ )
60
+
61
+ key = jax.random.key(0)
62
+ x = jax.random.normal(key, (2, 8))
63
+
64
+ params = ffn.init(key, x)
65
+ assert params["params"]["gating_einsum"].shape == (2, 8, 32)
66
+ assert params["params"]["linear"].shape == (32, 8)
67
+
68
+ params_lora = ffn_lora.init(key, x)
69
+ assert params_lora["params"]["gating_einsum"].shape == (2, 8, 32)
70
+ assert params_lora["params"]["linear"].shape == (32, 8)
71
+ assert params_lora["params"]["gating_einsum_lora_a"].shape == (2, 8, 2)
72
+ assert params_lora["params"]["gating_einsum_lora_b"].shape == (2, 2, 32)
73
+ assert params_lora["params"]["linear_lora_a"].shape == (32, 2)
74
+ assert params_lora["params"]["linear_lora_b"].shape == (2, 8)
75
+
76
+
77
+ def test_lora_ffn_same_output():
78
+ ffn = lora.FeedForward(features=8, hidden_dim=32)
79
+ ffn_lora = lora.FeedForward(
80
+ features=8,
81
+ hidden_dim=32,
82
+ lora_config=lora.LoRAConfig(rank=2, init_fn=nn.initializers.zeros),
83
+ )
84
+
85
+ key = jax.random.key(0)
86
+ x = jax.random.normal(key, (2, 8))
87
+
88
+ params = ffn.init(key, x)
89
+ output = ffn.apply(params, x)
90
+
91
+ params_lora = ffn_lora.init(key, x)
92
+ output_lora = ffn_lora.apply(params_lora, x)
93
+
94
+ assert jnp.allclose(output, output_lora)
capvector-pi05/src/openpi/models/model.py ADDED
@@ -0,0 +1,335 @@
1
+ import abc
2
+ from collections.abc import Sequence
3
+ import dataclasses
4
+ import enum
5
+ import logging
6
+ import pathlib
7
+ from typing import Generic, TypeVar
8
+
9
+ import augmax
10
+ from flax import nnx
11
+ from flax import struct
12
+ from flax import traverse_util
13
+ import jax
14
+ import jax.numpy as jnp
15
+ import numpy as np
16
+ import orbax.checkpoint as ocp
17
+ import safetensors
18
+ import torch
19
+
20
+ from openpi.models_pytorch import pi0_pytorch
21
+ from openpi.shared import image_tools
22
+ import openpi.shared.array_typing as at
23
+
24
+ logger = logging.getLogger("openpi")
25
+
26
+ # Type variable for array types (JAX arrays, PyTorch tensors, or numpy arrays)
27
+ ArrayT = TypeVar("ArrayT", bound=jax.Array | torch.Tensor | np.ndarray)
28
+
29
+
30
+ class ModelType(enum.Enum):
31
+ """Supported model types."""
32
+
33
+ PI0 = "pi0"
34
+ PI0_FAST = "pi0_fast"
35
+ PI05 = "pi05"
36
+
37
+
38
+ # The model always expects these images
39
+ IMAGE_KEYS = (
40
+ "base_0_rgb",
41
+ "left_wrist_0_rgb",
42
+ "right_wrist_0_rgb",
43
+ )
44
+
45
+
46
+ # This may need to change if we release a small model.
47
+ IMAGE_RESOLUTION = (224, 224)
48
+
49
+
50
+ # Data format
51
+ #
52
+ # Data transforms produce the model input as a nested dictionary which is later converted
53
+ # into `Observation` and `Actions` objects. See below.
54
+ #
55
+ # In the dictionary form, this data should look like:
56
+ # {
57
+ # # Observation data.
58
+ # "image": {
59
+ # "base_0_rgb": (float32|uint8)[*b, h, w, 3], # RGB image in [-1, 1] or [0, 255]
60
+ # ... # Additional camera views
61
+ # },
62
+ # "image_mask": {
63
+ # "base_0_rgb": bool[*b], # True if image is valid
64
+ # ... # Masks for additional views
65
+ # },
66
+ # "state": float32[*b, s], # Low-dimensional robot state
67
+ # "tokenized_prompt": int32[*b, l], # Optional, tokenized language prompt
68
+ # "tokenized_prompt_mask": bool[*b, l], # Optional, mask for tokenized prompt
69
+ # "token_ar_mask": int32[*b, l], # Optional, autoregressive mask for FAST model
70
+ # "token_loss_mask": bool[*b, l], # Optional, loss mask for FAST model
71
+ #
72
+ # # Actions data.
73
+ # "actions": float32[*b ah ad]
74
+ # }
75
+ # where:
76
+ # *b = batch dimensions
77
+ # h,w = image height/width
78
+ # s = state dimension
79
+ # l = sequence length
80
+ #
81
+ @at.typecheck
82
+ @struct.dataclass
83
+ class Observation(Generic[ArrayT]):
84
+ """Holds observations, i.e., inputs to the model.
85
+
86
+ See `Observation.from_dict` to see the expected dictionary form. This is the format
87
+ that should be produced by the data transforms.
88
+ """
89
+
90
+ # Images, in [-1, 1] float32.
91
+ images: dict[str, at.Float[ArrayT, "*b h w c"]]
92
+ # the padding area for non-rectangular input images is False
93
+ image_padding_mask: dict[str, at.Bool[ArrayT, "*b w c"]]
94
+ # Image masks, with same keys as images.
95
+ image_masks: dict[str, at.Bool[ArrayT, "*b"]]
96
+ # Low-dimensional robot state.
97
+ state: at.Float[ArrayT, "*b s"]
98
+
99
+ # Tokenized prompt.
100
+ tokenized_prompt: at.Int[ArrayT, "*b l"] | None = None
101
+ # Tokenized prompt mask.
102
+ tokenized_prompt_mask: at.Bool[ArrayT, "*b l"] | None = None
103
+
104
+ # pi0-fast model specific fields.
105
+
106
+ # Token auto-regressive mask (for FAST autoregressive model).
107
+ token_ar_mask: at.Int[ArrayT, "*b l"] | None = None
108
+ # Token loss mask (for FAST autoregressive model).
109
+ token_loss_mask: at.Bool[ArrayT, "*b l"] | None = None
110
+
111
+ @classmethod
112
+ def from_dict(cls, data: at.PyTree[ArrayT]) -> "Observation[ArrayT]":
113
+ """This method defines the mapping between unstructured data (i.e., nested dict) to the structured Observation format."""
114
+ # Ensure that tokenized_prompt and tokenized_prompt_mask are provided together.
115
+ if ("tokenized_prompt" in data) != ("tokenized_prompt_mask" in data):
116
+ raise ValueError("tokenized_prompt and tokenized_prompt_mask must be provided together.")
117
+ # If images are uint8, convert them to [-1, 1] float32.
118
+ for key in data["image"]:
119
+ if data["image"][key].dtype == np.uint8:
120
+ data["image"][key] = data["image"][key].astype(np.float32) / 255.0 * 2.0 - 1.0
121
+ elif hasattr(data["image"][key], "dtype") and data["image"][key].dtype == torch.uint8:
122
+ data["image"][key] = data["image"][key].to(torch.float32).permute(0, 3, 1, 2) / 255.0 * 2.0 - 1.0
123
+ return cls(
124
+ images=data["image"],
125
+ image_padding_mask=data.get("image_padding_mask", {}),
126
+ image_masks=data["image_mask"],
127
+ state=data["state"],
128
+ tokenized_prompt=data.get("tokenized_prompt"),
129
+ tokenized_prompt_mask=data.get("tokenized_prompt_mask"),
130
+ token_ar_mask=data.get("token_ar_mask"),
131
+ token_loss_mask=data.get("token_loss_mask"),
132
+ )
133
+
134
+ def to_dict(self) -> at.PyTree[ArrayT]:
135
+ """Convert the Observation to a nested dict."""
136
+ result = dataclasses.asdict(self)
137
+ result["image"] = result.pop("images")
138
+ result["image_mask"] = result.pop("image_masks")
139
+ return result
140
+
141
+
142
+ # Defines the format of the actions. This field is included as "actions" inside the dictionary
143
+ # produced by the data transforms.
144
+ Actions = at.Float[ArrayT, "*b ah ad"]
145
+
146
+
147
+ def preprocess_observation(
148
+ rng: at.KeyArrayLike | None,
149
+ observation: Observation,
150
+ *,
151
+ train: bool = False,
152
+ image_keys: Sequence[str] = IMAGE_KEYS,
153
+ image_resolution: tuple[int, int] = IMAGE_RESOLUTION,
154
+ ) -> Observation:
155
+ """Preprocess the observations by performing image augmentations (if train=True), resizing (if necessary), and
156
+ filling in a default image mask (if necessary).
157
+ """
158
+
159
+ if not set(image_keys).issubset(observation.images):
160
+ raise ValueError(f"images dict missing keys: expected {image_keys}, got {list(observation.images)}")
161
+
162
+ batch_shape = observation.state.shape[:-1]
163
+
164
+ out_images = {}
165
+ for key in image_keys:
166
+ image = observation.images[key]
167
+ if image.shape[1:3] != image_resolution:
168
+ logger.info(f"Resizing image {key} from {image.shape[1:3]} to {image_resolution}")
169
+ image = image_tools.resize_with_pad(image, *image_resolution)
170
+
171
+ if train:
172
+ # Convert from [-1, 1] to [0, 1] for augmax.
173
+ image = image / 2.0 + 0.5
174
+
175
+ transforms = []
176
+ if "wrist" not in key:
177
+ height, width = image.shape[1:3]
178
+ transforms += [
179
+ augmax.RandomCrop(int(width * 0.95), int(height * 0.95)),
180
+ augmax.Resize(width, height),
181
+ augmax.Rotate((-5, 5)),
182
+ ]
183
+ transforms += [
184
+ augmax.ColorJitter(brightness=0.3, contrast=0.4, saturation=0.5),
185
+ ]
186
+ sub_rngs = jax.random.split(rng, image.shape[0])
187
+ image = jax.vmap(augmax.Chain(*transforms))(sub_rngs, image)
188
+
189
+ # Back to [-1, 1].
190
+ image = image * 2.0 - 1.0
191
+
192
+ out_images[key] = image
193
+
194
+ # obtain mask
195
+ out_masks = {}
196
+ for key in out_images:
197
+ if key not in observation.image_masks:
198
+ # do not mask by default
199
+ out_masks[key] = jnp.ones(batch_shape, dtype=jnp.bool)
200
+ else:
201
+ out_masks[key] = jnp.asarray(observation.image_masks[key])
202
+
203
+ return Observation(
204
+ images=out_images,
+ image_padding_mask=observation.image_padding_mask,  # required field; passed through unchanged
205
+ image_masks=out_masks,
206
+ state=observation.state,
207
+ tokenized_prompt=observation.tokenized_prompt,
208
+ tokenized_prompt_mask=observation.tokenized_prompt_mask,
209
+ token_ar_mask=observation.token_ar_mask,
210
+ token_loss_mask=observation.token_loss_mask,
211
+ )
212
+
213
+
214
+ @dataclasses.dataclass(frozen=True)
215
+ class BaseModelConfig(abc.ABC):
216
+ """Configuration shared by all models. Specific models should inherit from this class, and implement the `create`
217
+ method to create the corresponding model.
218
+ """
219
+
220
+ # Action space dimension.
221
+ action_dim: int
222
+ # Action sequence length.
223
+ action_horizon: int
224
+ # Tokenized prompt maximum length.
225
+ max_token_len: int
226
+
227
+ @property
228
+ @abc.abstractmethod
229
+ def model_type(self) -> ModelType:
230
+ """The model type."""
231
+
232
+ @abc.abstractmethod
233
+ def create(self, rng: at.KeyArrayLike) -> "BaseModel":
234
+ """Create a new model, initializing parameters."""
235
+
236
+ def load(self, params: at.Params, *, remove_extra_params: bool = True) -> "BaseModel":
237
+ """Create a model with the given parameters."""
238
+ model = nnx.eval_shape(self.create, jax.random.key(0))
239
+ graphdef, state = nnx.split(model)
240
+ if remove_extra_params:
241
+ params = ocp.transform_utils.intersect_trees(state.to_pure_dict(), params)
242
+ at.check_pytree_equality(expected=state.to_pure_dict(), got=params, check_shapes=True, check_dtypes=False)
243
+ state.replace_by_pure_dict(params)
244
+ return nnx.merge(graphdef, state)
245
+
246
+ def load_pytorch(self, train_config, weight_path: str):
247
+ logger.info(f"train_config: {train_config}")
248
+ model = pi0_pytorch.PI0Pytorch(config=train_config.model)
249
+ safetensors.torch.load_model(model, weight_path)
250
+ return model
251
+
252
+ @abc.abstractmethod
253
+ def inputs_spec(self, *, batch_size: int = 1) -> tuple[Observation, Actions]:
254
+ """Returns the input specification for the model. Values are jax.ShapeDtypeStruct."""
255
+
256
+ def fake_obs(self, batch_size: int = 1) -> Observation:
257
+ observation_spec, _ = self.inputs_spec(batch_size=batch_size)
258
+ return jax.tree.map(lambda x: jnp.ones(x.shape, x.dtype), observation_spec)
259
+
260
+ def fake_act(self, batch_size: int = 1) -> Actions:
261
+ _, action_spec = self.inputs_spec(batch_size=batch_size)
262
+ return jax.tree.map(lambda x: jnp.ones(x.shape, x.dtype), action_spec)
+
+
+@dataclasses.dataclass
+class BaseModel(nnx.Module, abc.ABC):
+    """Base class for all model implementations. Specific models should inherit from this class. They should call
+    super().__init__() to initialize the shared attributes (action_dim, action_horizon, and max_token_len).
+    """
+
+    action_dim: int
+    action_horizon: int
+    max_token_len: int
+
+    @abc.abstractmethod
+    def compute_loss(
+        self,
+        rng: at.KeyArrayLike,
+        observation: Observation,
+        actions: Actions,
+        *,
+        train: bool = False,
+    ) -> at.Float[at.Array, "*b ah"]: ...
+
+    @abc.abstractmethod
+    def sample_actions(self, rng: at.KeyArrayLike, observation: Observation, **kwargs) -> Actions: ...
+
+
+def restore_params(
+    params_path: pathlib.Path | str,
+    *,
+    restore_type: type[np.ndarray] | type[jax.Array] = jax.Array,
+    dtype: jnp.dtype | None = None,
+    sharding: jax.sharding.Sharding | None = None,
+) -> at.Params:
+    """Restores an unstructured params PyTree from a checkpoint.
+
+    This works with checkpoints saved with `save_state` during openpi training (see `training/checkpoints.py`) as
+    well as pre-trained checkpoints released for openpi.
+
+    Args:
+        params_path: The local path to the checkpoint directory.
+        restore_type: The type to restore the params as. Can be set to `np.ndarray` to load the params as numpy arrays.
+        dtype: The dtype to restore all params as. If not provided, the original dtype from the checkpoint is used.
+        sharding: The sharding to use for the params. If not provided, the params will be replicated across all devices.
+
+    Returns:
+        The restored params.
+    """
+    params_path = pathlib.Path(params_path).resolve() if not str(params_path).startswith("gs://") else params_path
+
+    if restore_type is jax.Array and sharding is None:
+        mesh = jax.sharding.Mesh(jax.devices(), ("x",))
+        sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec())
+
+    with ocp.PyTreeCheckpointer() as ckptr:
+        metadata = ckptr.metadata(params_path)
+        item = {"params": metadata["params"]}
+
+        params = ckptr.restore(
+            params_path,
+            ocp.args.PyTreeRestore(
+                item=item,
+                restore_args=jax.tree.map(
+                    lambda _: ocp.ArrayRestoreArgs(sharding=sharding, restore_type=restore_type, dtype=dtype), item
+                ),
+            ),
+        )["params"]
+
+    # If the params were saved with `save_state` during openpi training, every key path will end with "value", which
+    # is added by `nnx.State`. We remove the "value" suffix here and always return what NNX calls a "pure dict".
+    flat_params = traverse_util.flatten_dict(params)
+    if all(kp[-1] == "value" for kp in flat_params):
+        flat_params = {kp[:-1]: v for kp, v in flat_params.items()}
+    return traverse_util.unflatten_dict(flat_params)
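Putting the two pieces together, loading a released checkpoint typically looks like the following sketch (the path is hypothetical; `config` stands for any `BaseModelConfig` subclass instance):

import jax.numpy as jnp

# Hypothetical checkpoint directory; substitute a real one.
params = restore_params("./checkpoints/pi0_base/params", dtype=jnp.bfloat16)
model = config.load(params)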
capvector-pi05/src/openpi/models/model_test.py ADDED
@@ -0,0 +1,94 @@
+from flax import nnx
+import jax
+import pytest
+
+from openpi.models import model as _model
+from openpi.models import pi0_config
+from openpi.models import pi0_fast
+from openpi.shared import download
+from openpi.shared import nnx_utils
+
+
+def test_pi0_model():
+    key = jax.random.key(0)
+    config = pi0_config.Pi0Config()
+    model = config.create(key)
+
+    batch_size = 2
+    obs, act = config.fake_obs(batch_size), config.fake_act(batch_size)
+
+    loss = nnx_utils.module_jit(model.compute_loss)(key, obs, act)
+    assert loss.shape == (batch_size, config.action_horizon)
+
+    actions = nnx_utils.module_jit(model.sample_actions)(key, obs, num_steps=10)
+    assert actions.shape == (batch_size, model.action_horizon, model.action_dim)
+
+
+def test_pi0_lora_model():
+    key = jax.random.key(0)
+    config = pi0_config.Pi0Config(paligemma_variant="gemma_2b_lora")
+    model = config.create(key)
+
+    batch_size = 2
+    obs, act = config.fake_obs(batch_size), config.fake_act(batch_size)
+
+    loss = nnx_utils.module_jit(model.compute_loss)(key, obs, act)
+    assert loss.shape == (batch_size, config.action_horizon)
+
+    actions = nnx_utils.module_jit(model.sample_actions)(key, obs, num_steps=10)
+    assert actions.shape == (batch_size, model.action_horizon, model.action_dim)
+
+
+def test_pi0_fast_model():
+    key = jax.random.key(0)
+    config = pi0_fast.Pi0FASTConfig()
+    model = config.create(key)
+
+    batch_size = 2
+    obs, act = config.fake_obs(batch_size), config.fake_act(batch_size)
+
+    loss = nnx_utils.module_jit(model.compute_loss)(key, obs, act)
+    assert loss.shape == (batch_size,)
+
+    actions = nnx_utils.module_jit(model.sample_actions)(key, obs)
+    assert actions.shape == (batch_size, 256)
+
+
+def test_pi0_fast_lora_model():
+    key = jax.random.key(0)
+    config = pi0_fast.Pi0FASTConfig(paligemma_variant="gemma_2b_lora")
+    model = config.create(key)
+
+    batch_size = 2
+    obs, act = config.fake_obs(batch_size), config.fake_act(batch_size)
+
+    loss = nnx_utils.module_jit(model.compute_loss)(key, obs, act)
+    assert loss.shape == (batch_size,)
+
+    actions = nnx_utils.module_jit(model.sample_actions)(key, obs)
+    assert actions.shape == (batch_size, 256)
+
+    lora_filter = nnx_utils.PathRegex(".*lora.*")
+    model_state = nnx.state(model)
+
+    lora_state_elems = list(model_state.filter(lora_filter))
+    assert len(lora_state_elems) > 0
+
+
+@pytest.mark.manual
+def test_model_restore():
+    key = jax.random.key(0)
+    config = pi0_config.Pi0Config()
+
+    batch_size = 2
+    obs, act = config.fake_obs(batch_size), config.fake_act(batch_size)
+
+    model = config.load(
+        _model.restore_params(download.maybe_download("gs://openpi-assets/checkpoints/pi0_base/params"))
+    )
+
+    loss = model.compute_loss(key, obs, act)
+    assert loss.shape == (batch_size, config.action_horizon)
+
+    actions = model.sample_actions(key, obs, num_steps=10)
+    assert actions.shape == (batch_size, model.action_horizon, model.action_dim)
capvector-pi05/src/openpi/models/pi0.py ADDED
@@ -0,0 +1,279 @@
+import logging
+
+import einops
+import flax.nnx as nnx
+import flax.nnx.bridge as nnx_bridge
+import jax
+import jax.numpy as jnp
+from typing_extensions import override
+
+from openpi.models import model as _model
+from openpi.models import pi0_config
+import openpi.models.gemma as _gemma
+import openpi.models.siglip as _siglip
+from openpi.shared import array_typing as at
+
+logger = logging.getLogger("openpi")
+
+
+def make_attn_mask(input_mask, mask_ar):
+    """Adapted from big_vision.
+
+    Tokens can attend to valid input tokens whose cumulative mask_ar is smaller
+    than or equal to theirs. This way `mask_ar` bool[?B, N] can be used to
+    set up several types of attention, for example:
+
+      [[1 1 1 1 1 1]]: pure causal attention.
+
+      [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between
+          themselves and the last 3 tokens have a causal attention. The first
+          entry could also be a 1 without changing behaviour.
+
+      [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a
+          block can attend all previous blocks and all tokens on the same block.
+
+    Args:
+      input_mask: bool[B, N] true if it is part of the input, false if padding.
+      mask_ar: bool[?B, N] mask that's true where previous tokens cannot depend on
+        it and false where it shares the same attention mask as the previous token.
+    """
+    mask_ar = jnp.broadcast_to(mask_ar, input_mask.shape)
+    cumsum = jnp.cumsum(mask_ar, axis=1)
+    attn_mask = cumsum[:, None, :] <= cumsum[:, :, None]
+    valid_mask = input_mask[:, None, :] * input_mask[:, :, None]
+    return jnp.logical_and(attn_mask, valid_mask)
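A quick numeric check of the prefix-lm case from the docstring, using the function as defined above (values illustrative only):

import jax.numpy as jnp

input_mask = jnp.ones((1, 6), dtype=bool)
mask_ar = jnp.array([[0, 0, 0, 1, 1, 1]], dtype=bool)
mask = make_attn_mask(input_mask, mask_ar)
# Row i is the set of positions token i may attend to: tokens 0-2 (the prefix)
# attend to each other bidirectionally; tokens 3-5 attend to the prefix plus
# causally to earlier suffix tokens.
print(mask[0].astype(int))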
+
+
+@at.typecheck
+def posemb_sincos(
+    pos: at.Real[at.Array, " b"], embedding_dim: int, min_period: float, max_period: float
+) -> at.Float[at.Array, "b {embedding_dim}"]:
+    """Computes sine-cosine positional embedding vectors for scalar positions."""
+    if embedding_dim % 2 != 0:
+        raise ValueError(f"embedding_dim ({embedding_dim}) must be divisible by 2")
+
+    fraction = jnp.linspace(0.0, 1.0, embedding_dim // 2)
+    period = min_period * (max_period / min_period) ** fraction
+    sinusoid_input = jnp.einsum(
+        "i,j->ij",
+        pos,
+        1.0 / period * 2 * jnp.pi,
+        precision=jax.lax.Precision.HIGHEST,
+    )
+    return jnp.concatenate([jnp.sin(sinusoid_input), jnp.cos(sinusoid_input)], axis=-1)
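For intuition: the periods are geometrically spaced between `min_period` and `max_period`, and each position gets a sine and a cosine per period. A small sanity check with `embedding_dim=4` (illustrative values):

import jax.numpy as jnp

emb = posemb_sincos(jnp.array([0.0, 0.5, 1.0]), 4, min_period=4e-3, max_period=4.0)
# Columns: sin at period 4e-3, sin at period 4.0, cos at period 4e-3, cos at period 4.0.
assert emb.shape == (3, 4)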
+
+
+class Pi0(_model.BaseModel):
+    def __init__(self, config: pi0_config.Pi0Config, rngs: nnx.Rngs):
+        super().__init__(config.action_dim, config.action_horizon, config.max_token_len)
+        self.pi05 = config.pi05
+        paligemma_config = _gemma.get_config(config.paligemma_variant)
+        action_expert_config = _gemma.get_config(config.action_expert_variant)
+        # TODO: rewrite gemma in NNX. For now, use bridge.
+        llm = nnx_bridge.ToNNX(
+            _gemma.Module(
+                configs=[paligemma_config, action_expert_config],
+                embed_dtype=config.dtype,
+                adarms=config.pi05,
+            )
+        )
+        llm.lazy_init(rngs=rngs, method="init", use_adarms=[False, True] if config.pi05 else [False, False])
+        img = nnx_bridge.ToNNX(
+            _siglip.Module(
+                num_classes=paligemma_config.width,
+                variant="So400m/14",
+                pool_type="none",
+                scan=True,
+                dtype_mm=config.dtype,
+            )
+        )
+        img.lazy_init(next(iter(config.fake_obs().images.values())), train=False, rngs=rngs)
+        self.PaliGemma = nnx.Dict(llm=llm, img=img)
+        self.action_in_proj = nnx.Linear(config.action_dim, action_expert_config.width, rngs=rngs)
+        if config.pi05:
+            self.time_mlp_in = nnx.Linear(action_expert_config.width, action_expert_config.width, rngs=rngs)
+            self.time_mlp_out = nnx.Linear(action_expert_config.width, action_expert_config.width, rngs=rngs)
+        else:
+            self.state_proj = nnx.Linear(config.action_dim, action_expert_config.width, rngs=rngs)
+            self.action_time_mlp_in = nnx.Linear(2 * action_expert_config.width, action_expert_config.width, rngs=rngs)
+            self.action_time_mlp_out = nnx.Linear(action_expert_config.width, action_expert_config.width, rngs=rngs)
+        self.action_out_proj = nnx.Linear(action_expert_config.width, config.action_dim, rngs=rngs)
+
+        # This attribute gets automatically set by model.train() and model.eval().
+        self.deterministic = True
+
+    @at.typecheck
+    def embed_prefix(
+        self, obs: _model.Observation
+    ) -> tuple[at.Float[at.Array, "b s emb"], at.Bool[at.Array, "b s"], at.Bool[at.Array, " s"]]:
+        input_mask = []
+        ar_mask = []
+        tokens = []
+        # embed images
+        for name in obs.images:
+            image_tokens, _ = self.PaliGemma.img(obs.images[name], train=False)
+
+            tokens.append(image_tokens)
+            input_mask.append(
+                einops.repeat(
+                    obs.image_masks[name],
+                    "b -> b s",
+                    s=image_tokens.shape[1],
+                )
+            )
+            # image tokens attend to each other
+            ar_mask += [False] * image_tokens.shape[1]
+
+        # add language (aka tokenized inputs)
+        if obs.tokenized_prompt is not None:
+            tokenized_inputs = self.PaliGemma.llm(obs.tokenized_prompt, method="embed")
+            tokens.append(tokenized_inputs)
+            input_mask.append(obs.tokenized_prompt_mask)
+            # full attention between image and language inputs
+            ar_mask += [False] * tokenized_inputs.shape[1]
+        tokens = jnp.concatenate(tokens, axis=1)
+        input_mask = jnp.concatenate(input_mask, axis=1)
+        ar_mask = jnp.array(ar_mask)
+        return tokens, input_mask, ar_mask
+
+    @at.typecheck
+    def embed_suffix(
+        self, obs: _model.Observation, noisy_actions: _model.Actions, timestep: at.Float[at.Array, " b"]
+    ) -> tuple[
+        at.Float[at.Array, "b s emb"],
+        at.Bool[at.Array, "b s"],
+        at.Bool[at.Array, " s"],
+        at.Float[at.Array, "b emb"] | None,
+    ]:
+        input_mask = []
+        ar_mask = []
+        tokens = []
+        if not self.pi05:
+            # add a single state token
+            state_token = self.state_proj(obs.state)[:, None, :]
+            tokens.append(state_token)
+            input_mask.append(jnp.ones((obs.state.shape[0], 1), dtype=jnp.bool_))
+            # image/language inputs do not attend to state or actions
+            ar_mask += [True]
+
+        action_tokens = self.action_in_proj(noisy_actions)
+        # embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1]
+        time_emb = posemb_sincos(timestep, self.action_in_proj.out_features, min_period=4e-3, max_period=4.0)
+        if self.pi05:
+            # time MLP (for adaRMS)
+            time_emb = self.time_mlp_in(time_emb)
+            time_emb = nnx.swish(time_emb)
+            time_emb = self.time_mlp_out(time_emb)
+            time_emb = nnx.swish(time_emb)
+            action_expert_tokens = action_tokens
+            adarms_cond = time_emb
+        else:
+            # mix timestep + action information using an MLP (no adaRMS)
+            time_tokens = einops.repeat(time_emb, "b emb -> b s emb", s=self.action_horizon)
+            action_time_tokens = jnp.concatenate([action_tokens, time_tokens], axis=-1)
+            action_time_tokens = self.action_time_mlp_in(action_time_tokens)
+            action_time_tokens = nnx.swish(action_time_tokens)
+            action_time_tokens = self.action_time_mlp_out(action_time_tokens)
+            action_expert_tokens = action_time_tokens
+            adarms_cond = None
+        tokens.append(action_expert_tokens)
+        input_mask.append(jnp.ones(action_expert_tokens.shape[:2], dtype=jnp.bool_))
+        # image/language/state inputs do not attend to action tokens
+        ar_mask += [True] + ([False] * (self.action_horizon - 1))
+        tokens = jnp.concatenate(tokens, axis=1)
+        input_mask = jnp.concatenate(input_mask, axis=1)
+        ar_mask = jnp.array(ar_mask)
+        return tokens, input_mask, ar_mask, adarms_cond
+
+    @override
+    def compute_loss(
+        self, rng: at.KeyArrayLike, observation: _model.Observation, actions: _model.Actions, *, train: bool = False
+    ) -> at.Float[at.Array, "*b ah"]:
+        preprocess_rng, noise_rng, time_rng = jax.random.split(rng, 3)
+        observation = _model.preprocess_observation(preprocess_rng, observation, train=train)
+
+        batch_shape = actions.shape[:-2]
+        noise = jax.random.normal(noise_rng, actions.shape)
+        time = jax.random.beta(time_rng, 1.5, 1, batch_shape) * 0.999 + 0.001
+        time_expanded = time[..., None, None]
+        x_t = time_expanded * noise + (1 - time_expanded) * actions
+        u_t = noise - actions
+
+        # one big forward pass of prefix + suffix at once
+        prefix_tokens, prefix_mask, prefix_ar_mask = self.embed_prefix(observation)
+        suffix_tokens, suffix_mask, suffix_ar_mask, adarms_cond = self.embed_suffix(observation, x_t, time)
+        input_mask = jnp.concatenate([prefix_mask, suffix_mask], axis=1)
+        ar_mask = jnp.concatenate([prefix_ar_mask, suffix_ar_mask], axis=0)
+        attn_mask = make_attn_mask(input_mask, ar_mask)
+        positions = jnp.cumsum(input_mask, axis=1) - 1
+        (prefix_out, suffix_out), _ = self.PaliGemma.llm(
+            [prefix_tokens, suffix_tokens], mask=attn_mask, positions=positions, adarms_cond=[None, adarms_cond]
+        )
+        v_t = self.action_out_proj(suffix_out[:, -self.action_horizon :])
+
+        return jnp.mean(jnp.square(v_t - u_t), axis=-1)
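The loss above is a flow-matching objective: `x_t` linearly interpolates between actions (t=0) and noise (t=1), so the regression target `u_t = noise - actions` is exactly `d x_t / d t`. A tiny model-free check of that identity (shapes illustrative):

import jax
import jax.numpy as jnp

noise = jax.random.normal(jax.random.key(0), (1, 50, 32))
actions = jax.random.normal(jax.random.key(1), (1, 50, 32))

def x(t):
    return t * noise + (1 - t) * actions

# The finite-difference derivative of a linear interpolation is exact.
u_fd = (x(0.6) - x(0.4)) / 0.2
assert jnp.allclose(u_fd, noise - actions, atol=1e-5)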
+
+    @override
+    def sample_actions(
+        self,
+        rng: at.KeyArrayLike,
+        observation: _model.Observation,
+        *,
+        num_steps: int | at.Int[at.Array, ""] = 10,
+        noise: at.Float[at.Array, "b ah ad"] | None = None,
+    ) -> _model.Actions:
+        observation = _model.preprocess_observation(None, observation, train=False)
+        # note that we use the convention more common in diffusion literature, where t=1 is noise and t=0 is the target
+        # distribution. yes, this is the opposite of the pi0 paper, and I'm sorry.
+        dt = -1.0 / num_steps
+        batch_size = observation.state.shape[0]
+        if noise is None:
+            noise = jax.random.normal(rng, (batch_size, self.action_horizon, self.action_dim))
+
+        # first fill KV cache with a forward pass of the prefix
+        prefix_tokens, prefix_mask, prefix_ar_mask = self.embed_prefix(observation)
+        prefix_attn_mask = make_attn_mask(prefix_mask, prefix_ar_mask)
+        positions = jnp.cumsum(prefix_mask, axis=1) - 1
+        _, kv_cache = self.PaliGemma.llm([prefix_tokens, None], mask=prefix_attn_mask, positions=positions)
+
+        def step(carry):
+            x_t, time = carry
+            suffix_tokens, suffix_mask, suffix_ar_mask, adarms_cond = self.embed_suffix(
+                observation, x_t, jnp.broadcast_to(time, batch_size)
+            )
+            # `suffix_attn_mask` is shape (b, suffix_len, suffix_len) indicating how the suffix tokens can attend to
+            # each other
+            suffix_attn_mask = make_attn_mask(suffix_mask, suffix_ar_mask)
+            # `prefix_attn_mask` is shape (b, suffix_len, prefix_len) indicating how the suffix tokens can attend to
+            # the prefix tokens
+            prefix_attn_mask = einops.repeat(prefix_mask, "b p -> b s p", s=suffix_tokens.shape[1])
+            # `full_attn_mask` is shape (b, suffix_len, prefix_len + suffix_len) indicating how the suffix tokens
+            # (which generate the queries) can attend to the full prefix + suffix sequence (which generates the keys
+            # and values)
+            full_attn_mask = jnp.concatenate([prefix_attn_mask, suffix_attn_mask], axis=-1)
+            assert full_attn_mask.shape == (
+                batch_size,
+                suffix_tokens.shape[1],
+                prefix_tokens.shape[1] + suffix_tokens.shape[1],
+            )
+            # `positions` is shape (b, suffix_len) indicating the positions of the suffix tokens
+            positions = jnp.sum(prefix_mask, axis=-1)[:, None] + jnp.cumsum(suffix_mask, axis=-1) - 1
+
+            (prefix_out, suffix_out), _ = self.PaliGemma.llm(
+                [None, suffix_tokens],
+                mask=full_attn_mask,
+                positions=positions,
+                kv_cache=kv_cache,
+                adarms_cond=[None, adarms_cond],
+            )
+            assert prefix_out is None
+            v_t = self.action_out_proj(suffix_out[:, -self.action_horizon :])
+
+            return x_t + dt * v_t, time + dt
+
+        def cond(carry):
+            x_t, time = carry
+            # robust to floating-point error
+            return time >= -dt / 2
+
+        x_0, _ = jax.lax.while_loop(cond, step, (noise, 1.0))
+        return x_0
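The `cond`/`step` pair above is a fixed-step Euler integrator from t=1 down to t=0; the `time >= -dt / 2` guard makes the loop run exactly `num_steps` iterations despite floating-point drift in `time`. A model-free sketch of the same loop structure (assumes nothing from openpi):

import jax
import jax.numpy as jnp

num_steps = 10
dt = -1.0 / num_steps

def step(carry):
    count, time = carry
    return count + 1, time + dt

def cond(carry):
    _, time = carry
    # Stop once time reaches ~0; the half-step margin absorbs rounding error.
    return time >= -dt / 2

count, time = jax.lax.while_loop(cond, step, (0, 1.0))
assert count == num_steps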
capvector-pi05/src/openpi/models/pi0_config.py ADDED
@@ -0,0 +1,108 @@
+import dataclasses
+from typing import TYPE_CHECKING
+
+import flax.nnx as nnx
+import jax
+import jax.numpy as jnp
+from typing_extensions import override
+
+from openpi.models import model as _model
+import openpi.models.gemma as _gemma
+from openpi.shared import array_typing as at
+import openpi.shared.nnx_utils as nnx_utils
+
+if TYPE_CHECKING:
+    from openpi.models.pi0 import Pi0
+
+
+@dataclasses.dataclass(frozen=True)
+class Pi0Config(_model.BaseModelConfig):
+    dtype: str = "bfloat16"
+    paligemma_variant: _gemma.Variant = "gemma_2b"
+    action_expert_variant: _gemma.Variant = "gemma_300m"
+
+    # Set the model specific defaults.
+    action_dim: int = 32
+    action_horizon: int = 50
+    max_token_len: int = None  # type: ignore
+    # Pi05 has two differences from Pi0:
+    # - the state input is part of the discrete language tokens rather than a continuous input that is part of the suffix
+    # - the action expert uses adaRMSNorm to inject the flow matching timestep
+    pi05: bool = False
+    # This config option is not used directly by the model, but it is read by the ModelTransformFactory.
+    discrete_state_input: bool = None  # type: ignore
+
+    def __post_init__(self):
+        if self.max_token_len is None:
+            object.__setattr__(self, "max_token_len", 200 if self.pi05 else 48)
+        if self.discrete_state_input is None:
+            object.__setattr__(self, "discrete_state_input", self.pi05)
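Because the dataclass is frozen, `__post_init__` fills the `None` defaults via `object.__setattr__`, so the `pi05` flag silently changes the token budget. The resulting behavior follows directly from the code above:

assert Pi0Config().max_token_len == 48
assert Pi0Config(pi05=True).max_token_len == 200
assert Pi0Config(pi05=True).discrete_state_input is True
# Explicit values always win over the pi05-dependent defaults.
assert Pi0Config(pi05=True, max_token_len=300).max_token_len == 300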
+
+    @property
+    @override
+    def model_type(self) -> _model.ModelType:
+        if self.pi05:
+            return _model.ModelType.PI05
+        return _model.ModelType.PI0
+
+    @override
+    def create(self, rng: at.KeyArrayLike) -> "Pi0":
+        from openpi.models.pi0 import Pi0
+
+        return Pi0(self, rngs=nnx.Rngs(rng))
+
+    @override
+    def inputs_spec(self, *, batch_size: int = 1) -> tuple[_model.Observation, _model.Actions]:
+        image_spec = jax.ShapeDtypeStruct([batch_size, *_model.IMAGE_RESOLUTION, 3], jnp.float32)
+        image_mask_spec = jax.ShapeDtypeStruct([batch_size], jnp.bool_)
+
+        with at.disable_typechecking():
+            observation_spec = _model.Observation(
+                images={
+                    "base_0_rgb": image_spec,
+                    "left_wrist_0_rgb": image_spec,
+                    "right_wrist_0_rgb": image_spec,
+                },
+                image_masks={
+                    "base_0_rgb": image_mask_spec,
+                    "left_wrist_0_rgb": image_mask_spec,
+                    "right_wrist_0_rgb": image_mask_spec,
+                },
+                state=jax.ShapeDtypeStruct([batch_size, self.action_dim], jnp.float32),
+                tokenized_prompt=jax.ShapeDtypeStruct([batch_size, self.max_token_len], jnp.int32),
+                tokenized_prompt_mask=jax.ShapeDtypeStruct([batch_size, self.max_token_len], bool),
+            )
+        action_spec = jax.ShapeDtypeStruct([batch_size, self.action_horizon, self.action_dim], jnp.float32)
+
+        return observation_spec, action_spec
+
+    def get_freeze_filter(self) -> nnx.filterlib.Filter:
+        """Returns the freeze filter based on the model config."""
+        filters = []
+        has_lora = False
+        gemma_params_filter = nnx_utils.PathRegex(".*llm.*")
+        action_expert_params_filter = nnx_utils.PathRegex(".*llm.*_1.*")
+        if "lora" in self.paligemma_variant:
+            filters.append(
+                gemma_params_filter,
+            )
+            if "lora" not in self.action_expert_variant:
+                # If we only freeze the Gemma params, exclude the action expert params.
+                filters.append(
+                    nnx.Not(action_expert_params_filter),
+                )
+            has_lora = True
+        elif "lora" in self.action_expert_variant:
+            filters.append(
+                action_expert_params_filter,
+            )
+            has_lora = True
+
+        if has_lora:
+            # If any LoRA is used, exclude all LoRA params from the freeze filter so they remain trainable.
+            filters.append(
+                nnx.Not(nnx_utils.PathRegex(".*lora.*")),
+            )
+        if not filters:
+            return nnx.Nothing
+        return nnx.All(*filters)
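A sketch of how such a filter can be applied, using the same `nnx.state` filtering idiom as the LoRA test above (a minimal illustration assuming a constructed model; the actual training loop is not part of this diff):

import jax
from flax import nnx

config = Pi0Config(paligemma_variant="gemma_2b_lora")
model = config.create(jax.random.key(0))

# Partition the model state into frozen and trainable subsets.
freeze_filter = config.get_freeze_filter()
frozen = nnx.state(model, freeze_filter)
trainable = nnx.state(model, nnx.Not(freeze_filter))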
capvector-pi05/src/openpi/models/pi0_fast.py ADDED
@@ -0,0 +1,313 @@
+import dataclasses
+import logging
+from typing import Any
+
+import einops
+import flax.nnx as nnx
+import flax.nnx.bridge as nnx_bridge
+import jax
+import jax.numpy as jnp
+from typing_extensions import override
+
+from openpi.models import model as _model
+import openpi.models.gemma_fast as _gemma
+import openpi.models.siglip as _siglip
+from openpi.shared import array_typing as at
+import openpi.shared.nnx_utils as nnx_utils
+
+logger = logging.getLogger("openpi")
+
+PALIGEMMA_EOS_TOKEN = 1
+
+
+def make_attn_mask(input_mask, mask_ar):
+    """Adapted from big_vision.
+
+    Tokens can attend to valid input tokens whose cumulative mask_ar is smaller
+    than or equal to theirs. This way `mask_ar` bool[?B, N] can be used to
+    set up several types of attention, for example:
+
+      [[1 1 1 1 1 1]]: pure causal attention.
+
+      [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between
+          themselves and the last 3 tokens have a causal attention. The first
+          entry could also be a 1 without changing behaviour.
+
+      [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a
+          block can attend all previous blocks and all tokens on the same block.
+
+    Args:
+      input_mask: bool[B, N] true if it is part of the input, false if padding.
+      mask_ar: bool[?B, N] mask that's true where previous tokens cannot depend on
+        it and false where it shares the same attention mask as the previous token.
+    """
+    mask_ar = jnp.broadcast_to(mask_ar, input_mask.shape)
+    cumsum = jnp.cumsum(mask_ar, axis=1)
+    attn_mask = cumsum[:, None, :] <= cumsum[:, :, None]
+    valid_mask = input_mask[:, None, :] * input_mask[:, :, None]
+    return jnp.logical_and(attn_mask, valid_mask)
+
+
+@jax.vmap
+def left_to_right_align(x, input_mask, attn_mask):
+    """Converts inputs from left-aligned to right-aligned."""
+    # Due to vmap, this operates on a single example (not at the batch level).
+    assert x.ndim == 2
+    assert input_mask.ndim == 1
+    assert attn_mask.ndim == 2
+    assert x.shape[0] == input_mask.shape[0]
+    assert attn_mask.shape[0] == attn_mask.shape[1], attn_mask.shape
+    seqlen = jnp.max(input_mask * jnp.arange(input_mask.shape[0])) + 1
+    x = jnp.roll(x, -seqlen, axis=0)
+    input_mask = jnp.roll(input_mask, -seqlen, axis=0)
+    attn_mask = jnp.roll(attn_mask, -seqlen, axis=(0, 1))
+    return x, input_mask, attn_mask
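A small illustration of the rolling trick on one batch element: with 2 valid tokens in a length-4 buffer, `seqlen = 2`, and rolling by `-seqlen` wraps the valid tokens around to the right edge (values illustrative):

import jax.numpy as jnp

x = jnp.arange(4, dtype=jnp.float32).reshape(1, 4, 1)  # (b=1, s=4, emb=1)
input_mask = jnp.array([[True, True, False, False]])   # 2 valid tokens, left-aligned
attn_mask = jnp.ones((1, 4, 4), dtype=bool)
x_r, mask_r, _ = left_to_right_align(x, input_mask, attn_mask)
assert (mask_r == jnp.array([[False, False, True, True]])).all()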
+
+
+def put_along_last_axis(arr, indices, values):
+    """Like np.put_along_axis(..., axis=-1), since jax is missing it."""
+    assert arr.ndim == indices.ndim == values.ndim, (arr.ndim, indices.ndim, values.ndim)
+    onehot = jax.nn.one_hot(indices, arr.shape[-1], dtype=values.dtype)
+    put_mask = jnp.einsum("...i,...in->...n", jnp.ones(values.shape, jnp.int32), onehot)
+    put_values = jnp.einsum("...i,...in->...n", values, onehot)
+    return jnp.where(put_mask, put_values, arr)
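The scatter is expressed with one-hot einsums rather than indexed assignment, which keeps it JIT-friendly. A quick check of the behavior (illustrative values):

import jax.numpy as jnp

arr = jnp.zeros((1, 4))
out = put_along_last_axis(arr, jnp.array([[2]]), jnp.array([[7.0]]))
assert (out == jnp.array([[0.0, 0.0, 7.0, 0.0]])).all()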
+
+
+@dataclasses.dataclass(frozen=True)
+class Pi0FASTConfig(_model.BaseModelConfig):
+    dtype: str = "bfloat16"
+    paligemma_variant: _gemma.Variant = "gemma_2b"
+
+    # Set the model specific defaults.
+    action_dim: int = 32
+    action_horizon: int = 32
+    max_token_len: int = 250
+
+    # Tokenizer for the FAST model.
+    fast_model_tokenizer: Any | None = None
+    # Keyword arguments for the FAST model tokenizer.
+    fast_model_tokenizer_kwargs: dict[str, Any] | None = None
+
+    @property
+    @override
+    def model_type(self) -> _model.ModelType:
+        return _model.ModelType.PI0_FAST
+
+    @override
+    def create(self, rng: at.KeyArrayLike) -> "Pi0FAST":
+        return Pi0FAST(self, rngs=nnx.Rngs(rng))
+
+    @override
+    def inputs_spec(self, *, batch_size: int = 1) -> tuple[_model.Observation, _model.Actions]:
+        image_spec = jax.ShapeDtypeStruct([batch_size, *_model.IMAGE_RESOLUTION, 3], jnp.float32)
+        image_mask_spec = jax.ShapeDtypeStruct([batch_size], jnp.bool_)
+
+        with at.disable_typechecking():
+            observation_spec = _model.Observation(
+                images={
+                    "base_0_rgb": image_spec,
+                    "base_1_rgb": image_spec,
+                    "wrist_0_rgb": image_spec,
+                },
+                image_masks={
+                    "base_0_rgb": image_mask_spec,
+                    "base_1_rgb": image_mask_spec,
+                    "wrist_0_rgb": image_mask_spec,
+                },
+                state=jax.ShapeDtypeStruct([batch_size, self.action_dim], jnp.float32),
+                tokenized_prompt=jax.ShapeDtypeStruct([batch_size, self.max_token_len], jnp.int32),
+                tokenized_prompt_mask=jax.ShapeDtypeStruct([batch_size, self.max_token_len], bool),
+                token_ar_mask=jax.ShapeDtypeStruct([batch_size, self.max_token_len], jnp.int32),
+                token_loss_mask=jax.ShapeDtypeStruct([batch_size, self.max_token_len], jnp.bool_),
+            )
+        action_spec = jax.ShapeDtypeStruct([batch_size, self.action_horizon, self.action_dim], jnp.float32)
+
+        return observation_spec, action_spec
+
+    def get_freeze_filter(self) -> nnx.filterlib.Filter:
+        """Returns the freeze filter based on the model config."""
+        if "lora" in self.paligemma_variant:
+            return nnx.All(nnx_utils.PathRegex(".*llm.*"), nnx.Not(nnx_utils.PathRegex(".*lora.*")))
+        return nnx.Nothing
+
+
+class Pi0FAST(_model.BaseModel):
+    def __init__(self, config: Pi0FASTConfig, rngs: nnx.Rngs):
+        super().__init__(config.action_dim, config.action_horizon, config.max_token_len)
+        paligemma_config = _gemma.get_config(config.paligemma_variant)
+        # TODO: rewrite gemma in NNX. For now, use bridge.
+        llm = nnx_bridge.ToNNX(
+            _gemma.Module(
+                **paligemma_config,
+                embed_dtype=config.dtype,
+                cache_dtype=config.dtype,
+            )
+        )
+        llm.lazy_init(rngs=rngs, method="init")
+        img = nnx_bridge.ToNNX(
+            _siglip.Module(
+                num_classes=paligemma_config.width,
+                variant="So400m/14",
+                pool_type="none",
+                scan=True,
+                dtype_mm=config.dtype,
+            )
+        )
+        img.lazy_init(next(iter(config.fake_obs().images.values())), train=False, rngs=rngs)
+        self.PaliGemma = nnx.Dict(llm=llm, img=img)
+
+    @at.typecheck
+    def embed_inputs(
+        self, obs: _model.Observation
+    ) -> tuple[at.Float[at.Array, "b s emb"], at.Bool[at.Array, "b s"], at.Int[at.Array, "b s"]]:
+        input_mask = []
+        ar_mask = []
+        token_embeddings = []
+        # embed images
+        for name in obs.images:
+            image_token_embeddings, _ = self.PaliGemma.img(obs.images[name], train=False)
+
+            token_embeddings.append(image_token_embeddings)
+            input_mask.append(
+                einops.repeat(
+                    obs.image_masks[name],
+                    "b -> b s",
+                    s=image_token_embeddings.shape[1],
+                )
+            )
+            # image tokens attend to each other --> AR mask = 0
+            ar_mask.append(0 * input_mask[-1])
+
+        # add tokenized inputs
+        assert obs.tokenized_prompt is not None, "Tokenized prompt is required"
+        assert obs.tokenized_prompt_mask is not None, "Tokenized prompt mask is required"
+        assert obs.token_ar_mask is not None, "Token auto-regressive mask is required"
+        tokenized_inputs_embeddings = self.PaliGemma.llm(obs.tokenized_prompt, embed_only=True)
+        token_embeddings.append(tokenized_inputs_embeddings)
+        input_mask.append(obs.tokenized_prompt_mask)
+        ar_mask.append(obs.token_ar_mask)
+
+        # return embeddings, input mask, and ar mask
+        return (
+            jnp.concatenate(token_embeddings, axis=1),
+            jnp.concatenate(input_mask, axis=1),
+            jnp.concatenate(ar_mask, axis=1),
+        )
+
+    @override
+    def compute_loss(
+        self, rng: at.KeyArrayLike, observation: _model.Observation, actions: _model.Actions, *, train: bool = False
+    ) -> at.Float[at.Array, "*b ah"]:
+        observation = _model.preprocess_observation(
+            rng, observation, train=train, image_keys=list(observation.images.keys())
+        )
+
+        # Compute inputs: one big forward pass of prefix + suffix at once.
+        input_token_embeddings, input_mask, ar_mask = self.embed_inputs(observation)
+        attn_mask = make_attn_mask(input_mask, ar_mask)
+
+        # Compute one-hot targets: we predict the *next* token, so shift the input tokens by one.
+        targets = jax.nn.one_hot(
+            observation.tokenized_prompt[:, 1:],
+            self.PaliGemma.llm.module.vocab_size,
+        )
+
+        # Each input predicts the *next* token, so we don't input the last token.
+        pre_logits, _, _ = self.PaliGemma.llm(
+            embedded_prefix=input_token_embeddings[:, :-1],
+            mask=attn_mask[:, :-1, :-1],
+            return_prelogits=True,
+        )
+
+        # Only decode logits for the target tokens to save memory
+        # (the decoding matmul is large because it is a seq_len x vocab_size dense layer).
+        logits, _ = self.PaliGemma.llm(
+            pre_logits=pre_logits[:, -targets.shape[1] :],
+        )
+        logp = jax.nn.log_softmax(logits, axis=-1)
+
+        # Compute the CE loss on token targets.
+        assert observation.token_loss_mask is not None, "Token loss mask is required"
+        loss_mask = observation.token_loss_mask[:, 1:]
+        token_pplx = jnp.sum(targets * logp, axis=-1)
+        return -jnp.sum(token_pplx * loss_mask, axis=-1) / jnp.clip(jnp.sum(loss_mask, -1), 1)
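The shift-by-one bookkeeping above is the standard next-token setup: inputs drop the last token, targets drop the first, and the loss mask is shifted along with the targets. A model-free sketch of the alignment (hypothetical 5-token sequence):

import jax.numpy as jnp

tokens = jnp.array([[11, 12, 13, 14, 15]])
loss_mask = jnp.array([[False, False, True, True, True]])  # only score the last tokens

inputs = tokens[:, :-1]         # [11, 12, 13, 14] -- each position predicts its successor
targets = tokens[:, 1:]         # [12, 13, 14, 15]
target_mask = loss_mask[:, 1:]  # mask aligned with the targets
assert inputs.shape == targets.shape == target_mask.shape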
+
+    @override
+    def sample_actions(
+        self,
+        rng: at.KeyArrayLike,
+        observation: _model.Observation,
+        *,
+        max_decoding_steps: int | at.Int[at.Array, ""] = 256,
+        temperature: float = 0.0,
+    ) -> _model.Actions:
+        # TODO: this is a hack to get the image keys.
+        observation = _model.preprocess_observation(
+            None, observation, train=False, image_keys=list(observation.images.keys())
+        )
+
+        # embed inputs
+        prefix_token_embeddings, prefix_mask, prefix_ar_mask = self.embed_inputs(observation)
+        prefix_attn_mask = make_attn_mask(prefix_mask, prefix_ar_mask)
+
+        # left to right align all input token sequences
+        prefix_token_embeddings, prefix_mask, prefix_attn_mask = left_to_right_align(
+            prefix_token_embeddings, prefix_mask, prefix_attn_mask
+        )
+        prefill_size = prefix_token_embeddings.shape[1]
+        prefill_len = jnp.sum(prefix_mask, axis=-1)
+        prefix_start = prefill_size - prefill_len
+
+        # first fill KV cache with a forward pass of the prefix
+        # pad attention mask to set the size of the KV cache (prefill_size + max_decoding_steps)
+        prefix_attn_mask = jnp.pad(prefix_attn_mask, ((0, 0), (0, 0), (0, max_decoding_steps)))
+        prefix_positions = jnp.cumsum(prefix_mask, axis=-1) - 1
+        prefix_logits, kv_cache, _ = self.PaliGemma.llm(
+            embedded_prefix=prefix_token_embeddings, mask=prefix_attn_mask, positions=prefix_positions, decode=True
+        )
+
+        # prepare decoding -- the final logit decodes the first output token
+        last_logit = prefix_logits[:, -1:]
+        output_tokens = jnp.zeros((last_logit.shape[0], max_decoding_steps))
+
+        def step(carry):
+            rng, last_logit, output_tokens, cache, _, step = carry
+
+            # Sample a token from the last logit, splitting the RNG for this step.
+            rng, rng_step = jax.random.split(rng)
+            token = jax.lax.cond(
+                temperature > 0.0,
+                lambda _: jax.random.categorical(rng_step, last_logit / temperature, axis=-1),
+                lambda _: jnp.argmax(last_logit, axis=-1),
+                operand=None,
+            )
+            output_tokens = put_along_last_axis(output_tokens, jnp.broadcast_to(step, (token.shape[0], 1)), token)
+
+            # Check for early stopping --> stop if all batch elements have an EOS token.
+            has_eos = jnp.any(token == PALIGEMMA_EOS_TOKEN, axis=-1)
+            all_eos = jnp.all(has_eos)
+
+            # Decode one step.
+            token_embedding = self.PaliGemma.llm(token, embed_only=True)
+            positions = prefill_len[:, None] + step + 1
+            mask = jnp.logical_and(
+                jnp.arange(prefill_size + max_decoding_steps)[None, None, :] >= prefix_start[:, None, None],
+                jnp.arange(prefill_size + max_decoding_steps)[None, None, :]
+                < (jnp.broadcast_to(prefill_size + step + 1, (prefix_start.shape[0], 1, 1))),
+            )
+            last_logit, kv_cache, _ = self.PaliGemma.llm(
+                embedded_prefix=token_embedding, mask=mask, positions=positions, decode=True, kv_cache=cache
+            )
+
+            return rng, last_logit, output_tokens, kv_cache, all_eos, step + 1
+
+        def cond(carry):
+            _, _, _, _, all_eos, step = carry
+            return (~all_eos) & (step < max_decoding_steps)
+
+        # Use lax.while_loop so we can jit the full decoding loop.
+        _, _, output_tokens, _, _, _ = jax.lax.while_loop(
+            cond, step, (rng, last_logit, output_tokens, kv_cache, False, 0)
+        )
+        return output_tokens