lfh committed
Commit eb868a1 · 1 Parent(s): 95a3948

remove files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +0 -35
  2. README.md +0 -57
  3. capvector-oft/.pre-commit-config.yaml +0 -27
  4. capvector-oft/ALOHA.md +0 -157
  5. capvector-oft/LIBERO.md +0 -130
  6. capvector-oft/LICENSE +0 -21
  7. capvector-oft/SETUP.md +0 -24
  8. capvector-oft/capvector/.gitignore +0 -8
  9. capvector-oft/capvector/compute_lora_diff.py +0 -35
  10. capvector-oft/capvector/compute_lora_shell/compute_lora_diff.sh +0 -8
  11. capvector-oft/capvector/initialized_interpolate_shell/get_vector_robotwin.sh +0 -26
  12. capvector-oft/capvector/interpolate.py +0 -247
  13. capvector-oft/capvector/interpolate.sh +0 -26
  14. capvector-oft/capvector/interpolate_robotwin.py +0 -247
  15. capvector-oft/capvector/tools/check_model_config.py +0 -23
  16. capvector-oft/capvector/tools/compute_lora_diff.py +0 -36
  17. capvector-oft/capvector/tools/compute_lora_diff.sh +0 -8
  18. capvector-oft/capvector/tools/vector_analyze.py +0 -153
  19. capvector-oft/capvector/tools/vector_regularize.py +0 -75
  20. capvector-oft/experiments/robot/aloha/aloha_utils.py +0 -85
  21. capvector-oft/experiments/robot/aloha/constants.py +0 -100
  22. capvector-oft/experiments/robot/aloha/preprocess_split_aloha_data.py +0 -260
  23. capvector-oft/experiments/robot/aloha/real_env.py +0 -213
  24. capvector-oft/experiments/robot/aloha/requirements_aloha.txt +0 -26
  25. capvector-oft/experiments/robot/aloha/robot_utils.py +0 -187
  26. capvector-oft/experiments/robot/aloha/run_aloha_eval.py +0 -385
  27. capvector-oft/experiments/robot/libero/libero_requirements.txt +0 -6
  28. capvector-oft/experiments/robot/libero/libero_utils.py +0 -87
  29. capvector-oft/experiments/robot/libero/regenerate_libero_dataset.py +0 -249
  30. capvector-oft/experiments/robot/libero/run_libero_eval.py +0 -540
  31. capvector-oft/experiments/robot/libero/sample_libero_spatial_observation.pkl +0 -3
  32. capvector-oft/experiments/robot/openvla_utils.py +0 -818
  33. capvector-oft/experiments/robot/robot_utils.py +0 -199
  34. capvector-oft/prismatic/__init__.py +0 -1
  35. capvector-oft/prismatic/conf/__init__.py +0 -3
  36. capvector-oft/prismatic/conf/datasets.py +0 -133
  37. capvector-oft/prismatic/conf/models.py +0 -584
  38. capvector-oft/prismatic/conf/vla.py +0 -235
  39. capvector-oft/prismatic/extern/__init__.py +0 -0
  40. capvector-oft/prismatic/extern/hf/__init__.py +0 -0
  41. capvector-oft/prismatic/extern/hf/configuration_prismatic.py +0 -140
  42. capvector-oft/prismatic/extern/hf/modeling_prismatic.py +0 -1085
  43. capvector-oft/prismatic/extern/hf/processing_prismatic.py +0 -252
  44. capvector-oft/prismatic/models/__init__.py +0 -2
  45. capvector-oft/prismatic/models/action_heads.py +0 -211
  46. capvector-oft/prismatic/models/backbones/__init__.py +0 -0
  47. capvector-oft/prismatic/models/backbones/llm/__init__.py +0 -4
  48. capvector-oft/prismatic/models/backbones/llm/base_llm.py +0 -223
  49. capvector-oft/prismatic/models/backbones/llm/llama2.py +0 -102
  50. capvector-oft/prismatic/models/backbones/llm/mistral.py +0 -72
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
README.md DELETED
@@ -1,57 +0,0 @@
- # CapVector: Learning Transferable Capability Vectors in Parametric Space for Vision-Language-Action Models
-
- <div align="center">
-
- [![Paper](https://img.shields.io/badge/Paper-A42C25?style=for-the-badge&logo=arxiv&logoColor=white)](http://arxiv.org/abs/) [![Page](https://img.shields.io/badge/Project--Page-blue?style=for-the-badge&logo=homepage&logoColor=white)](https://capvector.github.io/) [![Hugging Face Collection](https://img.shields.io/badge/Models-fcd022?style=for-the-badge&logo=huggingface&logoColor=white)](https://huggingface.co/haofuly/capvector_models_collection)
-
- </div>
-
- CapVector is a training recipe for vision-language-action (VLA) models that extracts a transferable capability vector from the parameter difference between auxiliary-objective SFT methods and standard SFT methods. This vector is merged into a pretrained VLA to form a stronger initialization, and downstream adaptation uses standard SFT with a lightweight orthogonal regularization loss to preserve the injected capability.
-
-
- ## 🌟 Key Features
- - **Efficient downstream adaptation**: CapVector recovers much of the benefit of auxiliary-objective SFT methods while keeping the downstream overhead close to that of standard SFT.
- - **Versatility**: CapVector supports OpenVLA-based, OpenPi-based, and StarVLA-based backbones.
- - **Generalization**: CapVector is designed to transfer across tasks, environments, and robot embodiments.
-
-
- ## 🚀 Get Started
-
- This repository provides two implementation paths:
- - [`capvector-oft/`](./capvector-oft): OpenVLA-OFT-based implementation
- - [`capvector-pi05/`](./capvector-pi05): OpenPI-based implementation
-
- Choose the subdirectory that matches your base model and training stack, and follow the subproject README for environment setup, data preparation, training, and inference.
-
- [`capvector-pi05/`](./capvector-pi05) provides the capability vector extraction and merging scripts.
-
-
- ## 🌏 Contact
- For further discussion and collaboration, please feel free to contact us via email or WeChat:
-
- | Author | Email | WeChat |
- |:---:|:---:|:---:|
- | Wenxuan Song | songwenxuan0115@gmail.com | swx0757 |
-
-
- ## ❤️ Acknowledgments
-
- CapVector builds on and interfaces with several excellent open-source projects, including:
-
- - [OpenVLA-OFT](https://github.com/moojink/openvla-oft)
- - [OpenPI](https://github.com/Physical-Intelligence/openpi)
-
-
- ## 🖊 Citation
-
- If you find this work useful, please cite:
-
- ```bibtex
- @article{song2026capvector,
-   title = {CapVector: Learning Transferable Capability Vectors in Parametric Space for Vision-Language-Action Models},
-   author = {Song, Wenxuan and Zhao, Han and Li, Fuhao and Zhou, Ziyang and Wang, Xi and Lyu, Jing and Ding, Pengxiang and Wang, Yan and Wang, Donglin and Li, Haoang},
-   journal = {Preprint},
-   year = {2026}
- }
- ```
-
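The orthogonal regularization mentioned in the README is not shown anywhere in this diff. As a rough illustration of the idea only, here is a minimal PyTorch sketch of one plausible form of such a loss; the exact formulation, the `cap_vector` layout, and the `lambda_orth` weighting are assumptions, not the authors' method:

```python
import torch

def orthogonal_reg_loss(model, init_params, cap_vector, eps=1e-8):
    """Squared cosine similarity between the downstream SFT update
    (theta - theta_init) and the injected capability vector. Driving it
    toward zero keeps the update orthogonal to the capability direction,
    so standard SFT is discouraged from erasing the injected capability.
    NOTE: this is an assumed formulation, not the repo's implementation."""
    dot = torch.zeros((), device=next(model.parameters()).device)
    upd_sq, cap_sq = dot.clone(), dot.clone()
    for name, p in model.named_parameters():
        if name not in cap_vector:
            continue
        delta = (p - init_params[name].to(p.device)).flatten()  # SFT update so far
        v = cap_vector[name].flatten().to(p.device)             # injected capability
        dot = dot + (delta * v).sum()
        upd_sq = upd_sq + (delta * delta).sum()
        cap_sq = cap_sq + (v * v).sum()
    cos = dot / (upd_sq.sqrt() * cap_sq.sqrt() + eps)
    return cos.pow(2)

# Hypothetical usage inside the SFT training step:
# loss = sft_loss + lambda_orth * orthogonal_reg_loss(model, init_params, cap_vector)
```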
 
capvector-oft/.pre-commit-config.yaml DELETED
@@ -1,27 +0,0 @@
- # See https://pre-commit.com for more information
- # See https://pre-commit.com/hooks.html for more hooks
- exclude: ".git"
-
- repos:
-   - repo: https://github.com/astral-sh/ruff-pre-commit
-     rev: v0.2.2
-     hooks:
-       - id: ruff
-         args: [ --fix, --exit-non-zero-on-fix ]
-
-   - repo: https://github.com/psf/black
-     rev: 24.2.0
-     hooks:
-       - id: black
-
-   - repo: https://github.com/pre-commit/pre-commit-hooks
-     rev: v4.5.0
-     hooks:
-       - id: check-added-large-files
-       - id: check-ast
-       - id: check-case-conflict
-       - id: check-merge-conflict
-       - id: check-toml
-       - id: check-yaml
-       - id: end-of-file-fixer
-       - id: trailing-whitespace
 
capvector-oft/ALOHA.md DELETED
@@ -1,157 +0,0 @@
- # OpenVLA-OFT+ in Real-World ALOHA Robot Tasks
-
- ## Relevant Files
-
- Evaluation
- * `experiments/robot/aloha/`: ALOHA training and eval files
-   * `run_aloha_eval.py`: ALOHA eval script (CLIENT SIDE; see "SERVER SIDE" below)
-   * `aloha_utils.py`: ALOHA eval utils
-   * Other ALOHA robot environment files copied from the original [ALOHA GitHub repo](https://github.com/tonyzhaozh/aloha):
-     * `constants.py`
-     * `real_env.py`
-     * `robot_utils.py`
- * `experiments/robot/`: General eval utils files
-   * `openvla_utils.py`: OpenVLA-specific eval utils
-   * `robot_utils.py`: Other eval utils
- * `vla-scripts/deploy.py`: VLA server deploy script (SERVER SIDE)
-
- Note: Unlike the LIBERO evaluation setup, we use a server-client interface here. This is particularly useful if the user's machine which commands the robot does not have access to a local GPU with sufficient specs to run the fine-tuned VLA policies.
-
- Training
- * `experiments/robot/aloha/`: ALOHA training and eval files
-   * `preprocess_split_aloha_data.py`: ALOHA data preprocessing script
- * `vla-scripts/finetune.py`: VLA fine-tuning script
-
- ## Setup
-
- Set up a conda environment for training policies and deploying them on the VLA server (see instructions in [SETUP.md](SETUP.md)).
-
- ## Fine-Tuning on ALOHA Robot Data
-
- We assume that you have already collected a set of expert demonstrations on the ALOHA robot.
-
- First, use our `preprocess_split_aloha_data.py` script to preprocess the raw ALOHA dataset: downsize images from 480x640 to 256x256 and split the data into training and validation sets. Below are examples for the `put X into pot` task in our paper (which has 3 possible target objects, 1 per episode):
-
- ```bash
- python experiments/robot/aloha/preprocess_split_aloha_data.py \
-   --dataset_path /scr/moojink/data/aloha1_raw/put_green_pepper_into_pot/ \
-   --out_base_dir /scr/moojink/data/aloha1_preprocessed/ \
-   --percent_val 0.05
- python experiments/robot/aloha/preprocess_split_aloha_data.py \
-   --dataset_path /scr/moojink/data/aloha1_raw/put_red_pepper_into_pot/ \
-   --out_base_dir /scr/moojink/data/aloha1_preprocessed/ \
-   --percent_val 0.05
- python experiments/robot/aloha/preprocess_split_aloha_data.py \
-   --dataset_path /scr/moojink/data/aloha1_raw/put_yellow_corn_into_pot/ \
-   --out_base_dir /scr/moojink/data/aloha1_preprocessed/ \
-   --percent_val 0.05
- ```
-
- Then, convert the preprocessed ALOHA datasets into a single RLDS dataset that is compatible with OpenVLA fine-tuning. This process is the same as in the original OpenVLA repo. See instructions for converting to RLDS [here](https://github.com/moojink/rlds_dataset_builder) (a sample ALOHA preprocessed-to-RLDS conversion script is available [here](https://github.com/moojink/rlds_dataset_builder/blob/main/aloha1_put_X_into_pot_300_demos/aloha1_put_X_into_pot_300_demos_dataset_builder.py); this script converts the three preprocessed datasets above into one unified RLDS dataset, with train/val splits).
-
- After converting to RLDS, register the dataset (which, for the example task above, would be called `aloha1_put_X_into_pot_300_demos`) with our dataloader by adding an entry for it in `configs.py` ([here](prismatic/vla/datasets/rlds/oxe/configs.py#L680)), `transforms.py` ([here](prismatic/vla/datasets/rlds/oxe/transforms.py#L928)), and `mixtures.py` ([here](prismatic/vla/datasets/rlds/oxe/mixtures.py#L216)). For reference, each of these files contains sample entries for the ALOHA datasets that we used in our paper.
-
- Before fine-tuning, set the desired ALOHA action chunk size in [`prismatic/vla/constants.py`](prismatic/vla/constants.py) (see `NUM_ACTIONS_CHUNK` in `ALOHA_CONSTANTS`). We set it to 25 by default because we used a control frequency of 25 Hz in our ALOHA setup to reduce storage costs and training time (while still maintaining smoothness in the robot's motions). If you use 50 Hz, we recommend setting `NUM_ACTIONS_CHUNK` to `50`. In general, 1 second-long action chunks are a good default. Do NOT modify `ACTION_PROPRIO_NORMALIZATION_TYPE`: since the ALOHA robot action space is absolute joint angles, we do not want to use a normalization scheme that clips outlier values (like the Q1-Q99 normalization we used with the relative end-effector pose actions for LIBERO), since that would prevent the model from outputting certain robot joint angles that are crucial for solving the task.
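To make the clipping concern concrete, here is a small illustrative sketch (not from this repo) comparing Q1-Q99 normalization, which clips outliers, against plain min-max bounds normalization, which keeps every absolute joint angle expressible:

```python
import numpy as np

# Toy action dataset: mostly mid-range joint angles, plus rare but
# task-critical extremes (e.g., a far-reaching joint configuration).
actions = np.concatenate([np.random.uniform(-0.5, 0.5, 10_000), [2.8, -2.8]])

# Q1-Q99 normalization (fine for LIBERO's relative EEF actions): the
# 1st/99th percentiles become the [-1, 1] bounds, so the rare extremes
# are clipped and can never be reproduced at inference time.
q1, q99 = np.percentile(actions, [1, 99])
q_norm = np.clip(2 * (actions - q1) / (q99 - q1) - 1, -1, 1)

# Min-max bounds normalization: every reachable joint angle maps to a
# distinct value in [-1, 1], so the extremes remain expressible.
lo, hi = actions.min(), actions.max()
mm_norm = 2 * (actions - lo) / (hi - lo) - 1

print("values pinned to the Q1-Q99 bounds:", int((np.abs(q_norm) == 1).sum()))
print("values pinned to the min-max bounds:", int((np.abs(mm_norm) >= 1).sum()))  # only the two endpoints
```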
-
- Now begin fine-tuning! Below is a sample command to fine-tune OpenVLA using our OFT+ recipe on the `put X into pot` task above (the "+" in "OFT+" means FiLM is included for enhanced language grounding). Replace `X` in the first line with the number of GPUs available to you.
-
- ```bash
- torchrun --standalone --nnodes 1 --nproc-per-node X vla-scripts/finetune.py \
-   --vla_path openvla/openvla-7b \
-   --data_root_dir /PATH/TO/RLDS/DATASETS/DIR/ \
-   --dataset_name aloha1_put_X_into_pot_300_demos \
-   --run_root_dir /YOUR/CHECKPOINTS/AND/LOG/DIR/ \
-   --use_l1_regression True \
-   --use_diffusion False \
-   --use_film True \
-   --num_images_in_input 3 \
-   --use_proprio True \
-   --batch_size 4 \
-   --learning_rate 5e-4 \
-   --num_steps_before_decay 50000 \
-   --max_steps 100005 \
-   --use_val_set True \
-   --val_freq 10000 \
-   --save_freq 10000 \
-   --save_latest_checkpoint_only False \
-   --image_aug True \
-   --lora_rank 32 \
-   --wandb_entity "YOUR_WANDB_ENTITY" \
-   --wandb_project "YOUR_WANDB_PROJECT" \
-   --run_id_note parallel_dec--25_acts_chunk--continuous_acts--L1_regression--3rd_person_img--left_right_wrist_imgs--proprio_state--film
- ```
-
- The above training command should reproduce our OpenVLA-OFT+ results on the `put X into pot` task if `X = 8` and the 100K-step checkpoint is evaluated. It fine-tunes OpenVLA using 3 input images (1 third-person image + 2 wrist camera images). Note that we decay the learning rate after a certain point (50K steps in the command above) since doing so speeds up training convergence (in our experience, the training L1 loss spikes down).
-
- Best practices for fine-tuning:
- * In general, we recommend fine-tuning until the training L1 loss goes below 0.01 and starts to plateau.
- * One way to achieve this is to fine-tune using our default learning rate of `5e-4` until the loss starts to decrease very slowly, then decay the learning rate by 10x to `5e-5` (which should make the loss spike down) and train until the training L1 loss finally plateaus (see the scheduler sketch after this list).
- * Depending on your dataset size, you may need to adjust some hyperparameters. For example, if you use a large dataset with over 300 demos, you may need to decay the learning rate later and train for longer for best performance. Decaying too early can lead to a suboptimal policy.
- * If your task does not require good language grounding (e.g., if there is only one language instruction), FiLM is not necessary; consider setting `--use_film False` to train fewer model parameters.
- * Please be sure to test your policy with the same device/GPU used to train it! Otherwise, performance may drop substantially. You may be able to avoid the performance drop if you merge the LoRA weights into the base model on the downstream device used for testing (e.g., if you train on H100 and then merge on A100 before testing on A100). See our script [vla-scripts/merge_lora_weights_and_save.py](vla-scripts/merge_lora_weights_and_save.py) for merging the LoRA adapter into the base model offline. It's okay if you already merged LoRA weights into the base OpenVLA model during fine-tuning; you can always redownload the base model and merge again as long as you still have the LoRA adapter (`merge_lora_weights_and_save.py` will handle this for you).
-
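The one-time 10x decay described above corresponds to a simple step schedule. A minimal PyTorch sketch of the idea; the optimizer setup here is illustrative, not the repo's `finetune.py`:

```python
import torch

# Illustrative parameters; the real script builds its optimizer from the
# training config (e.g., --learning_rate 5e-4, --num_steps_before_decay 50000).
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.AdamW(params, lr=5e-4)

# Decay the learning rate by 10x once, at the chosen step threshold.
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[50_000], gamma=0.1
)

for step in range(100_005):
    # ... forward pass, L1 loss on the predicted action chunk, backward ...
    optimizer.step()
    optimizer.zero_grad()
    scheduler.step()  # lr stays 5e-4 until step 50K, then drops to 5e-5
```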
- If you run into any issues, please open a new GitHub issue.
-
- ## Launching ALOHA Robot Evaluations
-
- In the primary conda environment (`openvla-oft`), which you will use to launch the VLA server, install a few packages for the server-client interface:
-
- ```bash
- conda activate openvla-oft
- pip install uvicorn fastapi json-numpy
- ```
-
- On the machine that you will use to command the robot, set up a second conda environment that will be used to run the robot environment, query the VLA server, and execute actions in the environment:
-
- ```bash
- # Create and activate client conda environment
- conda create -n openvla-oft-aloha python=3.10 -y
- conda activate openvla-oft-aloha
-
- # Install PyTorch
- # Use a command specific to your machine: https://pytorch.org/get-started/locally/
- pip3 install torch torchvision torchaudio
-
- # Clone openvla-oft repo and pip install to download dependencies
- git clone https://github.com/moojink/openvla-oft.git
- cd openvla-oft
- pip install -e .
-
- # Install packages needed for the ALOHA robot environment
- pip install -r experiments/robot/aloha/requirements_aloha.txt
- ```
-
- Launch the VLA server on the machine that has the GPU you will use to run model inference (using the `openvla-oft` conda environment). Below is a sample command for this (change as needed):
-
- ```bash
- python vla-scripts/deploy.py \
-   --pretrained_checkpoint /PATH/TO/FINETUNED/MODEL/CHECKPOINT/DIR/ \
-   --use_l1_regression True \
-   --use_film True \
-   --num_images_in_input 3 \
-   --use_proprio True \
-   --center_crop True \
-   --unnorm_key aloha1_put_X_into_pot_300_demos
- ```
-
- Then, run the ALOHA evaluation script. Specify the VLA server URL or IP address in the `vla_server_url` argument. Below is a sample command:
-
- ```bash
- python experiments/robot/aloha/run_aloha_eval.py \
-   --center_crop True \
-   --num_open_loop_steps 25 \
-   --use_vla_server True \
-   --vla_server_url <URL OF VLA SERVER> \
-   --num_rollouts_planned <NUM TEST ROLLOUTS> \
-   --max_steps <MAX NUM STEPS PER ROLLOUT>
- ```
-
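For orientation, the client side of this interface boils down to a JSON POST carrying numpy arrays. A hypothetical minimal client sketch follows; the endpoint path, port, and payload keys are assumptions, so see `vla-scripts/deploy.py` and `run_aloha_eval.py` for the real interface:

```python
import json_numpy
import numpy as np
import requests

json_numpy.patch()  # lets the json module round-trip numpy arrays

# Hypothetical observation payload; the actual keys are defined by deploy.py.
observation = {
    "full_image": np.zeros((256, 256, 3), dtype=np.uint8),  # third-person camera
    "state": np.zeros(14),                                   # proprio (both arms)
    "instruction": "put the green pepper into the pot",
}

# Query the VLA server for the next action chunk, then execute it open-loop.
actions = requests.post(
    "http://<URL OF VLA SERVER>:<PORT>/act", json=observation  # endpoint assumed
).json()
```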
- If you run into any issues, please open a new GitHub issue.
-
- ## Troubleshooting Tips
-
- * Tip #1: If you run into a ROS error such as `ImportError: /lib/x86_64-linux-gnu/libp11-kit.so.0: undefined symbol: ffi_type_pointer, version LIBFFI_BASE_7.0`, try running the following command in your client conda environment (`openvla-oft-aloha`):
-
- ```
- conda install -c conda-forge libffi
- ```
 
capvector-oft/LIBERO.md DELETED
@@ -1,130 +0,0 @@
- # OpenVLA-OFT in the LIBERO Simulation Benchmark
-
- ## Relevant Files
-
- Evaluation
- * `experiments/robot/libero/`: LIBERO eval files
-   * `run_libero_eval.py`: LIBERO eval script
-   * `libero_utils.py`: LIBERO eval utils
- * `experiments/robot/`: General eval utils files
-   * `openvla_utils.py`: OpenVLA-specific eval utils
-   * `robot_utils.py`: Other eval utils
-
- Training
- * `vla-scripts/finetune.py`: VLA fine-tuning script
-
-
- ## Setup
-
- Set up a conda environment (see instructions in [SETUP.md](SETUP.md)).
-
- Clone and install the [LIBERO repo](https://github.com/Lifelong-Robot-Learning/LIBERO) and required packages:
-
- ```bash
- git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git
- pip install -e LIBERO
- pip install -r experiments/robot/libero/libero_requirements.txt  # From openvla-oft base dir
- ```
-
- (Optional, if you plan to launch training) To download the [LIBERO datasets](https://huggingface.co/datasets/openvla/modified_libero_rlds) that we used in our fine-tuning experiments, run the command below. This will download the LIBERO-Spatial, LIBERO-Object, LIBERO-Goal, and LIBERO-10 datasets in RLDS data format (~10 GB total). You can use these to fine-tune OpenVLA or train other methods. This step is optional since we provide pretrained OpenVLA-OFT checkpoints below. Note that these are the same datasets used in the original OpenVLA project. If needed, see details on how to download the original non-RLDS datasets [here](https://github.com/openvla/openvla?tab=readme-ov-file#libero-setup).
-
- ```bash
- git clone git@hf.co:datasets/openvla/modified_libero_rlds
- ```
-
- ## Launching LIBERO Evaluations
-
- We fine-tuned OpenVLA via LoRA (r=32) with our OFT recipe on four LIBERO task suites: LIBERO-Spatial, LIBERO-Object, LIBERO-Goal, and LIBERO-10 (also called LIBERO-Long). In the initial version of our paper, we trained one checkpoint for each LIBERO task suite independently. In an updated version of the paper, we conducted an additional experiment in which we trained a single policy on all four task suites combined (results are available in the Additional Experiments section of the Appendix). Overall, the results for the task-specific policies and the combined policy are comparable: 97.1% vs. 96.8% average success rate across the four suites, respectively.
-
- Below are the four independently trained OpenVLA-OFT checkpoints for LIBERO:
- * [moojink/openvla-7b-oft-finetuned-libero-spatial](https://huggingface.co/moojink/openvla-7b-oft-finetuned-libero-spatial)
- * [moojink/openvla-7b-oft-finetuned-libero-object](https://huggingface.co/moojink/openvla-7b-oft-finetuned-libero-object)
- * [moojink/openvla-7b-oft-finetuned-libero-goal](https://huggingface.co/moojink/openvla-7b-oft-finetuned-libero-goal)
- * [moojink/openvla-7b-oft-finetuned-libero-10](https://huggingface.co/moojink/openvla-7b-oft-finetuned-libero-10)
-
- Below is the OpenVLA-OFT checkpoint trained on all four task suites combined:
- * [moojink/openvla-7b-oft-finetuned-libero-spatial-object-goal-10](https://huggingface.co/moojink/openvla-7b-oft-finetuned-libero-spatial-object-goal-10)
-
- To start evaluations with one of the independently trained checkpoints, run one of the commands below. Each will automatically download the appropriate checkpoint listed above. You can set the `TRANSFORMERS_CACHE` and `HF_HOME` environment variables to change where the checkpoint files get cached.
-
- ```bash
- # Launch LIBERO-Spatial evals
- python experiments/robot/libero/run_libero_eval.py \
-   --pretrained_checkpoint moojink/openvla-7b-oft-finetuned-libero-spatial \
-   --task_suite_name libero_spatial
-
- # Launch LIBERO-Object evals
- python experiments/robot/libero/run_libero_eval.py \
-   --pretrained_checkpoint moojink/openvla-7b-oft-finetuned-libero-object \
-   --task_suite_name libero_object
-
- # Launch LIBERO-Goal evals
- python experiments/robot/libero/run_libero_eval.py \
-   --pretrained_checkpoint moojink/openvla-7b-oft-finetuned-libero-goal \
-   --task_suite_name libero_goal
-
- # Launch LIBERO-10 (LIBERO-Long) evals
- python experiments/robot/libero/run_libero_eval.py \
-   --pretrained_checkpoint moojink/openvla-7b-oft-finetuned-libero-10 \
-   --task_suite_name libero_10
- ```
-
- To evaluate the policy trained on all four task suites together, simply swap out the `--pretrained_checkpoint` in the commands above with `moojink/openvla-7b-oft-finetuned-libero-spatial-object-goal-10`.
-
- Notes:
- * The evaluation script runs 500 trials by default (10 tasks x 50 episodes each). You can modify the number of trials per task by setting `--num_trials_per_task`. You can also change the random seed via `--seed`. There are other arguments in the script; we set them to the default values that work with the OpenVLA-OFT checkpoints above.
- * **NOTE: Setting `--center_crop True` is important** because we fine-tuned OpenVLA with random crop augmentations (we took a random crop with 90% area in every training sample, so at test time we simply take the center 90% crop; see the sketch after these notes).
- * The evaluation script logs results locally. You can also log results in Weights & Biases by setting `--use_wandb True` and specifying `--wandb_project <PROJECT>` and `--wandb_entity <ENTITY>`.
- * The results reported in our paper were obtained using **Python 3.10.14, PyTorch 2.2.0, and our [custom transformers v4.40.1 fork](https://github.com/moojink/transformers-openvla-oft.git)** on an **NVIDIA A100 GPU**, averaged over three random seeds. Please stick to these package versions if possible. Note that results may vary slightly if you use a different GPU than the A100. If the discrepancy is large, please post a GitHub issue, and we will look into it.
-
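To make the center-crop note concrete, here is a minimal sketch of a center crop that keeps 90% of the image area (each side is scaled by sqrt(0.9)); it mirrors the described augmentation, not necessarily the repo's exact implementation in the eval utils:

```python
import math

from PIL import Image

def center_crop_90_area(img: Image.Image) -> Image.Image:
    """Center crop keeping 90% of the pixel area, then resize back."""
    w, h = img.size
    scale = math.sqrt(0.9)            # 90% area => sqrt(0.9) per side
    cw, ch = int(w * scale), int(h * scale)
    left, top = (w - cw) // 2, (h - ch) // 2
    cropped = img.crop((left, top, left + cw, top + ch))
    return cropped.resize((w, h), Image.BILINEAR)

# img = Image.open("obs.png"); img = center_crop_90_area(img)
```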
- ## Fine-Tuning on LIBERO Datasets
-
- First, download the LIBERO datasets as described in the Setup section above: `libero_spatial_no_noops`, `libero_object_no_noops`, `libero_goal_no_noops`, `libero_10_no_noops` (`"_no_noops"` stands for "no no-op actions", i.e., training samples with near-zero actions are filtered out).
-
- Then, launch the fine-tuning script with the OFT configuration below, replacing `X` in the first line with the number of GPUs. The command below launches fine-tuning on LIBERO-Spatial with the hyperparameters that we used in our paper. Here, batch size 8 per GPU will require ~62 GB VRAM, and batch size 1 per GPU will require ~25 GB VRAM.
-
- ```bash
- torchrun --standalone --nnodes 1 --nproc-per-node X vla-scripts/finetune.py \
-   --vla_path openvla/openvla-7b \
-   --data_root_dir /PATH/TO/RLDS/DATASETS/DIR/ \
-   --dataset_name libero_spatial_no_noops \
-   --run_root_dir /YOUR/CHECKPOINTS/AND/LOG/DIR/ \
-   --use_l1_regression True \
-   --use_diffusion False \
-   --use_film False \
-   --num_images_in_input 2 \
-   --use_proprio True \
-   --batch_size 8 \
-   --learning_rate 5e-4 \
-   --num_steps_before_decay 100000 \
-   --max_steps 150005 \
-   --save_freq 10000 \
-   --save_latest_checkpoint_only False \
-   --image_aug True \
-   --lora_rank 32 \
-   --wandb_entity "YOUR_WANDB_ENTITY" \
-   --wandb_project "YOUR_WANDB_PROJECT" \
-   --run_id_note parallel_dec--8_acts_chunk--continuous_acts--L1_regression--3rd_person_img--wrist_img--proprio_state
- ```
-
- The above training command should reproduce our OpenVLA-OFT results if `X = 8` and the 150K-step checkpoint is evaluated.
-
- You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. You can also modify other args; e.g., if you want to train with just one input image from the third-person camera and disable proprio state input, you can set `--num_images_in_input 1` and `--use_proprio False`.
-
- In general, we recommend fine-tuning until the training L1 loss goes below 0.01 and starts to plateau (with the above configuration, it should reach ~0.006 L1 loss on LIBERO-Spatial after 150K gradient steps with 10x LR decay after 100K steps). However, for LIBERO-Goal only, we found that the 50K checkpoint (which was at ~0.02 L1 loss) performed best, for unknown reasons. For all other task suites, we found that the 150K checkpoint performed best.
-
- Please be sure to test your policy with the same device/GPU used to train it! Otherwise, performance may drop substantially. You may be able to avoid the performance drop if you merge the LoRA weights into the base model on the downstream device used for testing (e.g., if you train on H100 and then merge on A100 before testing on A100). See our script [vla-scripts/merge_lora_weights_and_save.py](vla-scripts/merge_lora_weights_and_save.py) for merging the LoRA adapter into the base model offline. It's okay if you already merged LoRA weights into the base OpenVLA model during fine-tuning; you can always redownload the base model and merge again as long as you still have the LoRA adapter (`merge_lora_weights_and_save.py` will handle this for you).
-
- If you run into any issues, please open a new GitHub issue. If you do not receive a response within 2 business days, please email Moo Jin Kim (moojink@cs.stanford.edu) to bring the issue to his attention.
 
capvector-oft/LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2025 Moo Jin Kim, Chelsea Finn, Percy Liang.
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
 
capvector-oft/SETUP.md DELETED
@@ -1,24 +0,0 @@
- # Setup Instructions
-
- ## Set Up Conda Environment
-
- ```bash
- # Create and activate conda environment
- conda create -n capvector-openvla-oft python=3.10 -y
- conda activate capvector-openvla-oft
-
- # Install PyTorch
- # Use a command specific to your machine: https://pytorch.org/get-started/locally/
- pip3 install torch torchvision torchaudio
-
- # Clone the CapVector repo and pip install from the OFT subproject to download dependencies
- git clone https://github.com/Songwxuan/CapVector
- cd CapVector/capvector-oft
- pip install -e .
-
- # Install Flash Attention 2 for training (https://github.com/Dao-AILab/flash-attention)
- # =>> If you run into difficulty, try `pip cache remove flash_attn` first
- pip install packaging ninja
- ninja --version; echo $?  # Verify Ninja --> should return exit code "0"
- pip install "flash-attn==2.5.5" --no-build-isolation
- ```
 
capvector-oft/capvector/.gitignore DELETED
@@ -1,8 +0,0 @@
- bin/
- draw_pic/
- feature_vector_ckpt/
- figure/
- id_extrapolation/
- id_interpolation/
- initialized_pt_vla/
- lora_diff/
 
capvector-oft/capvector/compute_lora_diff.py DELETED
@@ -1,35 +0,0 @@
- from safetensors.torch import load_file, save_file
- import torch
- import argparse
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--base", required=True)
-     parser.add_argument("--target", required=True)
-     parser.add_argument("--out", default="lora_diff.safetensors")
-     args = parser.parse_args()
-
-     base = load_file(args.base)
-     target = load_file(args.target)
-
-     diff = {}
-
-     print("=== Key Comparison ===")
-     only_in_base = set(base) - set(target)
-     only_in_target = set(target) - set(base)
-
-     print("Only in base:", list(only_in_base)[:10])
-     print("Only in target:", list(only_in_target)[:10])
-
-     for k in target:
-         if k in base:
-             diff[k] = target[k] - base[k]
-         else:
-             # Parameters present only in the target are retained as-is
-             diff[k] = target[k].clone()
-
-     save_file(diff, args.out)
-     print(f"\nSaved diff to: {args.out}")
-
- if __name__ == "__main__":
-     main()
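Applying a saved diff later is the inverse operation. A hypothetical helper sketch; the function name and usage are illustrative and not part of the repo:

```python
from safetensors.torch import load_file, save_file

def apply_lora_diff(base_path: str, diff_path: str, out_path: str, alpha: float = 1.0):
    """Add a stored adapter diff (scaled by alpha) back onto a base adapter."""
    base = load_file(base_path)
    diff = load_file(diff_path)
    merged = {k: v.clone() for k, v in base.items()}
    for k, d in diff.items():
        # Keys only present in the diff are added as-is (scaled by alpha).
        merged[k] = merged.get(k, 0) + alpha * d
    save_file(merged, out_path)

# apply_lora_diff("adapter_model.safetensors", "lora_diff.safetensors", "merged_adapter.safetensors")
```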
 
capvector-oft/capvector/compute_lora_shell/compute_lora_diff.sh DELETED
@@ -1,8 +0,0 @@
- BASE_ADAPTER="checkpoints/reference_models/openvla_oft_libero_spatial/lora_adapter/adapter_model.safetensors"
- TARGET_ADAPTER="checkpoints/task_models/SF_spatial/lora_adapter/adapter_model.safetensors"
- OUTPUT_DIFF="checkpoints/lora_diff/sf_150000_steps_spatial_adapter_diff.safetensors"
-
- python compute_lora_diff.py \
-   --base "$BASE_ADAPTER" \
-   --target "$TARGET_ADAPTER" \
-   --out "$OUTPUT_DIFF"
 
capvector-oft/capvector/initialized_interpolate_shell/get_vector_robotwin.sh DELETED
@@ -1,26 +0,0 @@
- TASK=bigbin_pot_microwave_qrcode_bowlsthree  # Customize for your task
- VERSION=53
- PT_CKPT="checkpoints/openvla_base"
- TASK_MODEL_CHECKPOINT="checkpoints/task_models/v106.1"
- REFERENCE_MODEL_CHECKPOINT="checkpoints/reference_models/v106.0"
- VECTOR_SAVE_PATH="checkpoints/feature_vectors/feature_vector_with_SF_${TASK}_v${VERSION}.pth"
- INITIALIZED_PT_VLA_PATH="checkpoints/initialized_pt_vla/initailized_openvla_with_SF_${TASK}_v${VERSION}"
- TASK_SUITE_NAME="ALOHA_${TASK}"
-
- python interpolate_robotwin.py \
-   --pretrained_checkpoint "$TASK_MODEL_CHECKPOINT" \
-   --original_pretrained_checkpoint "$REFERENCE_MODEL_CHECKPOINT" \
-   --vector_save_path "$VECTOR_SAVE_PATH" \
-   --initialized_pt_vla_path $INITIALIZED_PT_VLA_PATH \
-   --pt_ckpt $PT_CKPT \
-   --feature_vector_weight 1.1 \
-   --task_suite_name $TASK_SUITE_NAME
-
- # The rsync below copies everything except the VLA backbone weights (e.g., processor
- # and tokenizer files) from the base checkpoint, so that the initialized model is complete.
-
- rsync -av \
-   --ignore-existing \
-   --exclude='*.safetensors' \
-   --exclude='*.back.*' \
-   $PT_CKPT/ \
-   $INITIALIZED_PT_VLA_PATH/
 
capvector-oft/capvector/interpolate.py DELETED
@@ -1,247 +0,0 @@
- """
- Extracts a feature (capability) vector from a fine-tuned OpenVLA-OFT model and
- interpolates it into the original OpenVLA model.
- """
-
-
- import os
- import json
- import logging
-
- import sys
- from collections import deque
- from dataclasses import dataclass
- from enum import Enum
- from pathlib import Path
- from typing import Optional, Union
- from PIL import Image
-
- import draccus
- import numpy as np
- from tqdm import tqdm
- import torch
- import copy
-
- import wandb
-
- REPO_ROOT = Path(__file__).resolve().parents[1]
- if str(REPO_ROOT) not in sys.path:
-     sys.path.append(str(REPO_ROOT))
- from experiments.robot.openvla_utils import (
-     get_action_head,
-     get_noisy_action_projector,
-     get_processor,
-     get_proprio_projector,
-     resize_image_for_policy,
- )
- from experiments.robot.robot_utils import (
-     DATE_TIME,
-     get_action,
-     get_image_resize_size,
-     get_model,
-     invert_gripper_action,
-     normalize_gripper_action,
-     set_seed_everywhere,
- )
- # TaskSuite is required by validate_config below (it was missing from the original imports).
- from experiments.robot.libero.run_libero_eval import TaskSuite, check_unnorm_key
- from prismatic.vla.constants import NUM_ACTIONS_CHUNK
-
-
- # Set up logging
- logging.basicConfig(
-     level=logging.INFO,
-     format="%(asctime)s [%(levelname)s] %(message)s",
-     handlers=[logging.StreamHandler()],
- )
- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class GenerateConfig:
-     # fmt: off
-
-     #################################################################################################################
-     # Model-specific parameters
-     #################################################################################################################
-     model_family: str = "openvla"                    # Model family
-     # The task-specific model after SF fine-tuning
-     pretrained_checkpoint: Union[str, Path] = "checkpoints/task_model"                # Task-specific checkpoint path
-     # The task-specific model after OFT fine-tuning
-     original_pretrained_checkpoint: Union[str, Path] = "checkpoints/reference_model"  # Reference checkpoint path
-     # The feature vector is the difference between the two models, which represents the spatial features
-     vector_save_path: Union[str, Path] = "checkpoints/feature_vectors/feature_vector.pth"
-     # The PT VLA model initialized with the feature vector; naming rule: initailized_{pt_ckpt}_with_{task-specific model name}_${task name on libero}
-     initialized_pt_vla_path: Union[str, Path] = "checkpoints/initialized_pt_vla"
-     # The original pretrained OpenVLA model
-     pt_ckpt: Union[str, Path] = "checkpoints/openvla_base"
-     # The weight of the feature vector when initializing the PT VLA model
-     feature_vector_weight: float = 1                 # Weight of feature vector for interpolation
-
-     use_l1_regression: bool = True                   # If True, uses continuous action head with L1 regression objective
-     use_diffusion: bool = False                      # If True, uses continuous action head with diffusion modeling objective (DDIM)
-     num_diffusion_steps_train: int = 50              # (When `diffusion==True`) Number of diffusion steps used for training
-     num_diffusion_steps_inference: int = 50          # (When `diffusion==True`) Number of diffusion steps used for inference
-     use_film: bool = False                           # If True, uses FiLM to infuse language inputs into visual features
-     num_images_in_input: int = 2                     # Number of images in the VLA input (default: 1)
-     use_proprio: bool = True                         # Whether to include proprio state in input
-
-     center_crop: bool = True                         # Center crop? (if trained w/ random crop image aug)
-     num_open_loop_steps: int = 8                     # Number of actions to execute open-loop before requerying policy
-
-     lora_rank: int = 32                              # Rank of LoRA weight matrix (MAKE SURE THIS MATCHES TRAINING!)
-
-     unnorm_key: Union[str, Path] = ""                # Action un-normalization key
-
-     load_in_8bit: bool = False                       # (For OpenVLA only) Load with 8-bit quantization
-     load_in_4bit: bool = False                       # (For OpenVLA only) Load with 4-bit quantization
-
-     #################################################################################################################
-     # LIBERO environment-specific parameters
-     #################################################################################################################
-     task_suite_name: str = "de"                      # Task suite
-     num_steps_wait: int = 10                         # Number of steps to wait for objects to stabilize in sim
-     num_trials_per_task: int = 50                    # Number of rollouts per task
-     initial_states_path: str = "DEFAULT"             # "DEFAULT", or path to initial states JSON file
-     env_img_res: int = 256                           # Resolution for environment images (not policy input resolution)
-
-     #################################################################################################################
-     # Utils
-     #################################################################################################################
-     run_id_note: Optional[str] = None                # Extra note to add to end of run ID for logging
-     local_log_dir: str = "./experiments/logs"        # Local directory for eval logs
-
-     use_wandb: bool = False                          # Whether to also log results in Weights & Biases
-     wandb_entity: str = "your-wandb-entity"          # Name of WandB entity
-     wandb_project: str = "your-wandb-project"        # Name of WandB project
-
-     seed: int = 7                                    # Random Seed (for reproducibility)
-
- def validate_config(cfg: GenerateConfig) -> None:
-     """Validate configuration parameters."""
-     assert cfg.pretrained_checkpoint is not None, "pretrained_checkpoint must not be None!"
-
-     if "image_aug" in str(cfg.pretrained_checkpoint):
-         assert cfg.center_crop, "Expecting `center_crop==True` because model was trained with image augmentations!"
-
-     assert not (cfg.load_in_8bit and cfg.load_in_4bit), "Cannot use both 8-bit and 4-bit quantization!"
-
-     # Validate task suite
-     assert cfg.task_suite_name in [suite.value for suite in TaskSuite], f"Invalid task suite: {cfg.task_suite_name}"
-
- def initialize_model(cfg: GenerateConfig, only_pt: bool = False):  # Load action_head and noisy_action_projector separately
-     """Initialize model and associated components."""
-     # Load model
-     model = get_model(cfg)
-
-     # Load proprio projector if needed
-     proprio_projector = None
-     if cfg.use_proprio:
-         proprio_projector = get_proprio_projector(
-             cfg,
-             model.llm_dim,
-             proprio_dim=8,  # 8-dimensional proprio for LIBERO
-         )
-
-     # Load action head if needed
-     action_head = None
-     if cfg.use_l1_regression or cfg.use_diffusion:
-         action_head = get_action_head(cfg, model.llm_dim)
-
-     # Load noisy action projector if using diffusion
-     noisy_action_projector = None
-     if cfg.use_diffusion:
-         noisy_action_projector = get_noisy_action_projector(cfg, model.llm_dim)
-
-     # Get OpenVLA processor if needed
-     processor = None
-     if not only_pt:
-         if cfg.model_family == "openvla":
-             processor = get_processor(cfg)
-             check_unnorm_key(cfg, model)
-
-     return model, action_head, proprio_projector, noisy_action_projector, processor
-
- # @draccus.wrap()
- def generate_feature_vector(cfg: GenerateConfig):
-     """Generate a feature vector (parameter differences) between two task-specific models."""
-     # Set random seed
-     set_seed_everywhere(cfg.seed)
-
-     # Initialize model and components
-     model, action_head, proprio_projector, noisy_action_projector, processor = initialize_model(cfg)
-
-     original_config = GenerateConfig(
-         pretrained_checkpoint=cfg.original_pretrained_checkpoint,
-         task_suite_name=cfg.task_suite_name,
-     )
-
-     original_model, original_action_head, original_proprio_projector, original_noisy_action_projector, original_processor = initialize_model(original_config)
-     # action_head and noisy_action_projector modules are not interpolated
-     assert len(model.state_dict()) == len(original_model.state_dict())
-     feature_vector_dict = {}
-     total = len(original_model.state_dict())
-     for name, original_model_param in tqdm(original_model.named_parameters(), total=total):
-         model_param = model.state_dict()[name]
-         feature_vector_dict[name] = (model_param - original_model_param).detach().cpu()
-
-     return feature_vector_dict
-
- # @draccus.wrap()
- def interpolate_feature_vector(cfg: GenerateConfig):
-     """Interpolate the feature vector into the pretrained VLA."""
-     feature_vector_dict = torch.load(cfg.vector_save_path)
-
-     pt_vla_config = GenerateConfig(
-         pretrained_checkpoint=cfg.pt_ckpt,
-         original_pretrained_checkpoint=cfg.original_pretrained_checkpoint,
-         vector_save_path=cfg.vector_save_path,
-         initialized_pt_vla_path=cfg.initialized_pt_vla_path,
-         feature_vector_weight=cfg.feature_vector_weight,
-         pt_ckpt=cfg.pt_ckpt,
-         task_suite_name=cfg.task_suite_name,
-         use_proprio=False,
-         use_l1_regression=False,
-         use_diffusion=False,
-     )
-
-     pt_vla, _, _, _, _ = initialize_model(pt_vla_config, only_pt=True)
-
-     # Snapshot the parameters to check the change before and after interpolation
-     model_sd = pt_vla.state_dict()
-     before_interp_sd = {k: v.clone() for k, v in model_sd.items() if v.dtype.is_floating_point}
-
-     with torch.no_grad():
-         pt_params = dict(pt_vla.named_parameters())
-         for name, diff in feature_vector_dict.items():
-             if name in pt_params:
-                 pt_param = pt_params[name]
-                 diff = diff.to(pt_param.device)
-                 pt_param.add_(diff, alpha=cfg.feature_vector_weight)
-
-     # Check after interpolation
-     diffs_after = []
-     for name, before_tensor in before_interp_sd.items():
-         after_tensor = model_sd[name]
-         difference = (after_tensor - before_tensor).float().norm().item()
-         diffs_after.append(difference)
-
-     print(f"[DEBUG] post-interp (SF -> interp): mean={sum(diffs_after)/len(diffs_after):.6f}, "
-           f"max={max(diffs_after):.6f}, num_tensors={len(diffs_after)}")
-
-     #########################################################
-     return pt_vla
-
- @draccus.wrap()
- def main(cfg: GenerateConfig):
-     if not os.path.exists(cfg.vector_save_path):
-         feature_vector_dict = generate_feature_vector(cfg)
-         torch.save(feature_vector_dict, cfg.vector_save_path)
-     else:
-         print(f"Feature vector already exists at {cfg.vector_save_path}")
-     initialized_pt_vla = interpolate_feature_vector(cfg)
-     os.makedirs(cfg.initialized_pt_vla_path, exist_ok=True)
-     initialized_pt_vla.save_pretrained(cfg.initialized_pt_vla_path)
-
- if __name__ == "__main__":
-     main()
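In equation form, `generate_feature_vector` and `interpolate_feature_vector` together implement the merge described in the README, with the scalar weight set by `--feature_vector_weight`:

```latex
v = \theta_{\text{task}} - \theta_{\text{ref}},
\qquad
\theta_{\text{init}} = \theta_{\text{pt}} + \lambda\, v
```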
 
capvector-oft/capvector/interpolate.sh DELETED
@@ -1,26 +0,0 @@
- TASK=spatial  # or object / goal / 10 / 90
- VERSION=21.4
- PT_CKPT="checkpoints/openvla_base"
- TASK_MODEL_CHECKPOINT="checkpoints/task_models/SF_${TASK}"
- REFERENCE_MODEL_CHECKPOINT="checkpoints/reference_models/openvla_oft_libero_${TASK}"
- VECTOR_SAVE_PATH="checkpoints/feature_vectors/feature_vector_with_SF_${TASK}_v${VERSION}.pth"
- INITIALIZED_PT_VLA_PATH="checkpoints/initialized_pt_vla/initailized_openvla_with_SF_${TASK}_v${VERSION}"
- TASK_SUITE_NAME="libero_${TASK}"
-
- python interpolate.py \
-   --pretrained_checkpoint "$TASK_MODEL_CHECKPOINT" \
-   --original_pretrained_checkpoint "$REFERENCE_MODEL_CHECKPOINT" \
-   --vector_save_path "$VECTOR_SAVE_PATH" \
-   --initialized_pt_vla_path $INITIALIZED_PT_VLA_PATH \
-   --pt_ckpt $PT_CKPT \
-   --feature_vector_weight 0.5 \
-   --task_suite_name $TASK_SUITE_NAME
-
- # The rsync below copies everything except the VLA backbone weights (e.g., processor
- # and tokenizer files) from the base checkpoint, so that the initialized model is complete.
-
- rsync -av \
-   --ignore-existing \
-   --exclude='*.safetensors' \
-   --exclude='*.back.*' \
-   $PT_CKPT/ \
-   $INITIALIZED_PT_VLA_PATH/
 
capvector-oft/capvector/interpolate_robotwin.py DELETED
@@ -1,247 +0,0 @@
1
- """
2
- This is for extracting feature vector from the openvla-oft model and interpolating it with the original openvla model.
3
- """
4
-
5
-
6
- import os
7
- import json
8
- import logging
9
-
10
- import sys
11
- from collections import deque
12
- from dataclasses import dataclass
13
- from enum import Enum
14
- from pathlib import Path
15
- from typing import Optional, Union
16
- from PIL import Image
17
-
18
- import draccus
19
- import numpy as np
20
- from tqdm import tqdm
21
- import torch
22
- import copy
23
-
24
- import wandb
25
-
26
- REPO_ROOT = Path(__file__).resolve().parents[1]
27
- if str(REPO_ROOT) not in sys.path:
28
- sys.path.append(str(REPO_ROOT))
29
- from experiments.robot.openvla_utils import (
30
- get_action_head,
31
- get_noisy_action_projector,
32
- get_processor,
33
- get_proprio_projector,
34
- resize_image_for_policy,
35
- )
36
- from experiments.robot.robot_utils import (
37
- DATE_TIME,
38
- get_action,
39
- get_image_resize_size,
40
- get_model,
41
- invert_gripper_action,
42
- normalize_gripper_action,
43
- set_seed_everywhere,
44
- )
45
- from experiments.robot.libero.run_libero_eval import check_unnorm_key
46
- from prismatic.vla.constants import NUM_ACTIONS_CHUNK
47
- from prismatic.vla.constants import PROPRIO_DIM
48
-
49
- # Set up logging
50
- logging.basicConfig(
51
- level=logging.INFO,
52
- format="%(asctime)s [%(levelname)s] %(message)s",
53
- handlers=[logging.StreamHandler()],
54
- )
55
- logger = logging.getLogger(__name__)
56
-
57
-
58
- @dataclass
59
- class GenerateConfig:
60
- # fmt: off
61
-
62
- #################################################################################################################
63
- # Model-specific parameters
64
- #################################################################################################################
65
- model_family: str = "openvla" # Model family
66
- #the task-specific model after sf fine-tuning
67
- pretrained_checkpoint: Union[str, Path] = "checkpoints/task_model" # Task-specific checkpoint path
68
- #the task-specific model after oft fine-tuning
69
- original_pretrained_checkpoint: Union[str, Path] = "checkpoints/reference_model" # Reference checkpoint path
70
- #feature vector is the difference between the two models, which represents the spatial features
71
- vector_save_path: Union[str, Path] = "checkpoints/feature_vectors/feature_vector.pth"
72
- #the pt vla model initialized with the feature vector, named rule: initailized_{pt_ckpt}_with_{task-specific model name}_${task name on libero}
73
- initialized_pt_vla_path: Union[str, Path] = "checkpoints/initialized_pt_vla"
74
- #the original pretrained openvla model
75
- pt_ckpt: Union[str, Path] = "checkpoints/openvla_base"
76
- #the weight of the feature vector when initializing the pt vla model
77
- feature_vector_weight: float = 1 # Weight of feature vector for interpolation
78
-
79
- use_l1_regression: bool = True # If True, uses continuous action head with L1 regression objective
80
- use_diffusion: bool = False # If True, uses continuous action head with diffusion modeling objective (DDIM)
81
- num_diffusion_steps_train: int = 50 # (When `diffusion==True`) Number of diffusion steps used for training
82
- num_diffusion_steps_inference: int = 50 # (When `diffusion==True`) Number of diffusion steps used for inference
83
- use_film: bool = False # If True, uses FiLM to infuse language inputs into visual features
84
- num_images_in_input: int = 3 # Number of images in the VLA input (default: 1)
85
- use_proprio: bool = True # Whether to include proprio state in input
86
-
87
- center_crop: bool = True # Center crop? (if trained w/ random crop image aug)
88
- num_open_loop_steps: int = 8 # Number of actions to execute open-loop before requerying policy
89
-
90
- lora_rank: int = 32 # Rank of LoRA weight matrix (MAKE SURE THIS MATCHES TRAINING!)
91
-
92
- unnorm_key: Union[str, Path] = "" # Action un-normalization key
93
-
94
- load_in_8bit: bool = False # (For OpenVLA only) Load with 8-bit quantization
95
- load_in_4bit: bool = False # (For OpenVLA only) Load with 4-bit quantization
96
-
97
- #################################################################################################################
98
- # LIBERO environment-specific parameters
99
- #################################################################################################################
100
- task_suite_name: str = "de" # Task suite
101
- num_steps_wait: int = 10 # Number of steps to wait for objects to stabilize in sim
102
- num_trials_per_task: int = 50 # Number of rollouts per task
103
- initial_states_path: str = "DEFAULT" # "DEFAULT", or path to initial states JSON file
104
- env_img_res: int = 256 # Resolution for environment images (not policy input resolution)
105
-
106
- #################################################################################################################
107
- # Utils
108
- #################################################################################################################
109
- run_id_note: Optional[str] = None # Extra note to add to end of run ID for logging
110
-     local_log_dir: str = "./experiments/logs"        # Local directory for eval logs
-
-     use_wandb: bool = False                           # Whether to also log results in Weights & Biases
-     wandb_entity: str = "your-wandb-entity"           # Name of WandB entity
-     wandb_project: str = "your-wandb-project"         # Name of WandB project
-
-     seed: int = 7                                     # Random Seed (for reproducibility)
-
- def validate_config(cfg: GenerateConfig) -> None:
-     """Validate configuration parameters."""
-     assert cfg.pretrained_checkpoint is not None, "pretrained_checkpoint must not be None!"
-
-     if "image_aug" in str(cfg.pretrained_checkpoint):
-         assert cfg.center_crop, "Expecting `center_crop==True` because model was trained with image augmentations!"
-
-     assert not (cfg.load_in_8bit and cfg.load_in_4bit), "Cannot use both 8-bit and 4-bit quantization!"
-
-     # Validate task suite
-     # assert cfg.task_suite_name in [suite.value for suite in TaskSuite], f"Invalid task suite: {cfg.task_suite_name}"
-
- def initialize_model(cfg: GenerateConfig, only_pt: bool = False):  # Load action_head and noisy_action_projector separately
-     """Initialize model and associated components."""
-     # Load model
-     model = get_model(cfg)
-
-     # Load proprio projector if needed
-     proprio_projector = None
-     if cfg.use_proprio:
-         proprio_projector = get_proprio_projector(
-             cfg,
-             model.llm_dim,
-             proprio_dim=PROPRIO_DIM,  # Set the proprio dim according to the target robot
-         )
-
-     # Load action head if needed
-     action_head = None
-     if cfg.use_l1_regression or cfg.use_diffusion:
-         action_head = get_action_head(cfg, model.llm_dim)
-
-     # Load noisy action projector if using diffusion
-     noisy_action_projector = None
-     if cfg.use_diffusion:
-         noisy_action_projector = get_noisy_action_projector(cfg, model.llm_dim)
-
-     # Get OpenVLA processor if needed
-     processor = None
-     if not only_pt:
-         if cfg.model_family == "openvla":
-             processor = get_processor(cfg)
-             # check_unnorm_key(cfg, model)
-
-     return model, action_head, proprio_projector, noisy_action_projector, processor
-
- # @draccus.wrap()
- def generate_feature_vector(cfg: GenerateConfig):
-     """Generate a feature vector (parameter differences) between two task-specific models."""
-     # Validate configuration
-
-     # Set random seed
-     set_seed_everywhere(cfg.seed)
-
-     # Initialize model and components
-     model, action_head, proprio_projector, noisy_action_projector, processor = initialize_model(cfg)
-
-     original_config = GenerateConfig(
-         pretrained_checkpoint=cfg.original_pretrained_checkpoint,
-         task_suite_name=cfg.task_suite_name,
-     )
-
-     original_model, original_action_head, original_proprio_projector, original_noisy_action_projector, original_processor = initialize_model(original_config)
-     # Note: the action_head and noisy_action_projector modules are not interpolated
-     assert len(model.state_dict()) == len(original_model.state_dict())
-     feature_vector_dict = {}
-     total = len(original_model.state_dict())
-     for name, original_model_param in tqdm(original_model.named_parameters(), total=total):
-         model_param = model.state_dict()[name]
-         feature_vector_dict[name] = (model_param - original_model_param).detach().cpu()
-
-     return feature_vector_dict
-
- # @draccus.wrap()
- def interpolate_feature_vector(cfg: GenerateConfig):
-     """Interpolate the feature vector into a pretrained VLA."""
-     feature_vector_dict = torch.load(cfg.vector_save_path)
-
-     pt_vla_config = GenerateConfig(
-         pretrained_checkpoint=cfg.pt_ckpt,
-         original_pretrained_checkpoint=cfg.original_pretrained_checkpoint,
-         vector_save_path=cfg.vector_save_path,
-         initialized_pt_vla_path=cfg.initialized_pt_vla_path,
-         feature_vector_weight=cfg.feature_vector_weight,
-         pt_ckpt=cfg.pt_ckpt,
-         task_suite_name=cfg.task_suite_name,
-         use_proprio=False,
-         use_l1_regression=False,
-         use_diffusion=False,
-     )
-
-     pt_vla, _, _, _, _ = initialize_model(pt_vla_config, only_pt=True)
-
-     # Copy the SF parameters so we can check how much they change during interpolation
-     model_sd = pt_vla.state_dict()
-     before_interp_sd = {k: v.clone() for k, v in model_sd.items() if v.dtype.is_floating_point}
-
-     with torch.no_grad():
-         pt_params = dict(pt_vla.named_parameters())
-         for name, diff in feature_vector_dict.items():
-             if name in pt_params:
-                 pt_param = pt_params[name]
-                 diff = diff.to(pt_param.device)
-                 pt_param.add_(diff, alpha=cfg.feature_vector_weight)
-
-     # Check parameter changes after interpolation
-     diffs_after = []
-     for name, before_tensor in before_interp_sd.items():
-         after_tensor = model_sd[name]
-         difference = (after_tensor - before_tensor).float().norm().item()
-         diffs_after.append(difference)
-
-     print(f"[DEBUG] post-interp (SF -> interp): mean={sum(diffs_after)/len(diffs_after):.6f}, "
-           f"max={max(diffs_after):.6f}, num_tensors={len(diffs_after)}")
-
-     #########################################################
-     return pt_vla
-
- @draccus.wrap()
- def main(cfg: GenerateConfig):
-     if not os.path.exists(cfg.vector_save_path):
-         feature_vector_dict = generate_feature_vector(cfg)
-         torch.save(feature_vector_dict, cfg.vector_save_path)
-     else:
-         print(f"Feature vector already exists at {cfg.vector_save_path}")
-     initialized_pt_vla = interpolate_feature_vector(cfg)
-     os.makedirs(cfg.initialized_pt_vla_path, exist_ok=True)
-     initialized_pt_vla.save_pretrained(cfg.initialized_pt_vla_path)
-
- if __name__ == "__main__":
-     main()
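
The removed script above builds a capability vector as a parameter-wise difference between a task-adapted checkpoint and its base, then adds that vector back into a pretrained VLA with a scalar weight. A minimal sketch of the arithmetic, assuming plain state dicts with matching keys (the helper names are illustrative, not from the repo):

    import torch

    def make_vector(task_sd, base_sd):
        # delta = theta_task - theta_base, kept on CPU like feature_vector_dict above
        return {k: (task_sd[k] - base_sd[k]).detach().cpu() for k in base_sd}

    def apply_vector(target_sd, vector, weight=1.0):
        # theta_new = theta_target + weight * delta; keys absent from the vector pass through
        return {k: (v + weight * vector[k].to(v.device)) if k in vector else v
                for k, v in target_sd.items()}

With weight = 1.0 this reproduces the task checkpoint on shared parameters; intermediate weights interpolate between the pretrained and task-adapted models.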
 
capvector-oft/capvector/tools/check_model_config.py DELETED
@@ -1,23 +0,0 @@
- # This is for checking the completeness of the model parameters.
- import argparse
-
- import torch
-
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("checkpoint_path", help="Path to the feature vector checkpoint (.pth)")
-     args = parser.parse_args()
-
-     fv = torch.load(args.checkpoint_path, map_location="cpu")
-
-     print("num_tensors:", len(fv))
-     nz = 0
-     for _, value in fv.items():
-         if value.abs().sum().item() != 0:
-             nz += 1
-     print("nonzero_tensors:", nz)
-
-
- if __name__ == "__main__":
-     main()
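
One caveat about the count above: the exact `!= 0` comparison treats any floating-point residue as a non-zero tensor. If a tolerance is preferred, a one-line variant (the threshold here is an illustrative choice, not from the repo):

    nz = sum(1 for value in fv.values() if value.abs().sum().item() > 1e-12)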
 
capvector-oft/capvector/tools/compute_lora_diff.py DELETED
@@ -1,36 +0,0 @@
- # This is for computing the difference between the base model and the target model.
- from safetensors.torch import load_file, save_file
- import torch
- import argparse
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--base", required=True)
-     parser.add_argument("--target", required=True)
-     parser.add_argument("--out", default="lora_diff.safetensors")
-     args = parser.parse_args()
-
-     base = load_file(args.base)
-     target = load_file(args.target)
-
-     diff = {}
-
-     print("=== Key Comparison ===")
-     only_in_base = set(base) - set(target)
-     only_in_target = set(target) - set(base)
-
-     print("Only in base:", list(only_in_base)[:10])
-     print("Only in target:", list(only_in_target)[:10])
-
-     for k in target:
-         if k in base:
-             diff[k] = target[k] - base[k]
-         else:
-             # Keep the new parameters
-             diff[k] = target[k].clone()
-
-     save_file(diff, args.out)
-     print(f"\nSaved diff to: {args.out}")
-
- if __name__ == "__main__":
-     main()
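
The saved diff can later be added back onto another base adapter to transfer the update, which is the inverse of the subtraction above. A minimal sketch under that assumption (the function name, `alpha` scaling, and paths are illustrative, not part of the repo):

    from safetensors.torch import load_file, save_file

    def apply_lora_diff(base_path, diff_path, out_path, alpha=1.0):
        base = load_file(base_path)
        diff = load_file(diff_path)
        merged = {}
        for k, d in diff.items():
            # Shared keys: base + alpha * diff; keys only in the diff are copied as-is
            merged[k] = base[k] + alpha * d if k in base else d.clone()
        # Keys only in the base pass through unchanged
        for k, v in base.items():
            merged.setdefault(k, v)
        save_file(merged, out_path)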
 
capvector-oft/capvector/tools/compute_lora_diff.sh DELETED
@@ -1,8 +0,0 @@
- BASE_ADAPTER="checkpoints/reference_models/openvla_oft_libero_spatial/lora_adapter/adapter_model.safetensors"
- TARGET_ADAPTER="checkpoints/task_models/SF_spatial/lora_adapter/adapter_model.safetensors"
- OUTPUT_DIFF="checkpoints/lora_diff/sf_150000_steps_spatial_adapter_diff.safetensors"
-
- python compute_lora_diff.py \
-     --base "$BASE_ADAPTER" \
-     --target "$TARGET_ADAPTER" \
-     --out "$OUTPUT_DIFF"
 
capvector-oft/capvector/tools/vector_analyze.py DELETED
@@ -1,153 +0,0 @@
- # This is for analyzing a feature vector of the model and finding out which layers have the largest absolute values.
- import argparse
- import csv
- import os
- import re
- from collections import OrderedDict, defaultdict
-
- import matplotlib.pyplot as plt
- import torch
-
-
- LAYER_PREFIX = "language_model.model.layers."
- NUM_LAYERS = 32
- USE_LOG_Y = True
-
-
- def pick_state_dict(obj):
-     if isinstance(obj, (OrderedDict, dict)):
-         for key in ["state_dict", "model_state_dict", "model", "net", "weights", "params"]:
-             if key in obj and isinstance(obj[key], (OrderedDict, dict)):
-                 return obj[key]
-         if any(torch.is_tensor(value) for value in obj.values()):
-             return obj
-     return None
-
-
- def aggregate_layers_abs_sum(state_dict):
-     layer_sum = defaultdict(float)
-     layer_cnt = defaultdict(int)
-     pattern = re.compile(r"^" + re.escape(LAYER_PREFIX) + r"(\d+)\.")
-
-     for name, tensor in state_dict.items():
-         if not isinstance(name, str):
-             continue
-         match = pattern.match(name)
-         if match is None or not torch.is_tensor(tensor):
-             continue
-
-         layer_id = int(match.group(1))
-         if layer_id < 0 or layer_id >= NUM_LAYERS:
-             continue
-
-         value = tensor.detach()
-         if value.is_cuda:
-             value = value.cpu()
-
-         value = value.to(torch.float64)
-         layer_sum[layer_id] += value.abs().sum().item()
-         layer_cnt[layer_id] += 1
-
-     for layer_id in range(NUM_LAYERS):
-         layer_sum[layer_id] = float(layer_sum.get(layer_id, 0.0))
-         layer_cnt[layer_id] = int(layer_cnt.get(layer_id, 0))
-
-     return layer_sum, layer_cnt
-
-
- def save_layer_csv(layer_sum, layer_cnt, path):
-     output_dir = os.path.dirname(path)
-     if output_dir:
-         os.makedirs(output_dir, exist_ok=True)
-     with open(path, "w", newline="") as file_obj:
-         writer = csv.DictWriter(file_obj, fieldnames=["layer_id", "abs_sum", "num_tensors"])
-         writer.writeheader()
-         for layer_id in range(NUM_LAYERS):
-             writer.writerow(
-                 {
-                     "layer_id": layer_id,
-                     "abs_sum": f"{layer_sum[layer_id]:.12e}",
-                     "num_tensors": layer_cnt[layer_id],
-                 }
-             )
-
-
- def plot_line(xs, ys, out_png, title):
-     ys_plot = ys[:]
-     if USE_LOG_Y:
-         min_pos = min([value for value in ys_plot if value > 0], default=1e-300)
-         eps = min_pos * 1e-6 if min_pos > 0 else 1e-300
-         ys_plot = [value if value > 0 else eps for value in ys_plot]
-
-     plt.figure(figsize=(12, 4.5))
-     plt.plot(xs, ys_plot, marker="o", linewidth=1.5)
-     plt.xlabel("Layer id")
-     plt.ylabel("abs_sum (all params in layer)")
-     plt.title(title)
-     plt.grid(True, which="both", linestyle="--", linewidth=0.5, alpha=0.5)
-     if USE_LOG_Y:
-         plt.yscale("log")
-     plt.tight_layout()
-     plt.savefig(out_png, dpi=200)
-     plt.close()
-
-
- def plot_bar(xs, ys, out_png, title):
-     ys_plot = ys[:]
-     if USE_LOG_Y:
-         min_pos = min([value for value in ys_plot if value > 0], default=1e-300)
-         eps = min_pos * 1e-6 if min_pos > 0 else 1e-300
-         ys_plot = [value if value > 0 else eps for value in ys_plot]
-
-     plt.figure(figsize=(12, 4.5))
-     plt.bar(xs, ys_plot)
-     plt.xlabel("Layer id")
-     plt.ylabel("abs_sum (all params in layer)")
-     plt.title(title)
-     plt.grid(True, which="both", axis="y", linestyle="--", linewidth=0.5, alpha=0.5)
-     if USE_LOG_Y:
-         plt.yscale("log")
-     plt.tight_layout()
-     plt.savefig(out_png, dpi=200)
-     plt.close()
-
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("checkpoint_path", help="Path to the feature vector checkpoint (.pth)")
-     args = parser.parse_args()
-
-     base = os.path.splitext(args.checkpoint_path)[0]
-     out_csv = base + "_language_model_layers_abs_sum.csv"
-     out_png_line = base + "_language_model_layers_abs_sum_line.png"
-     out_png_bar = base + "_language_model_layers_abs_sum_bar.png"
-
-     ckpt = torch.load(args.checkpoint_path, map_location="cpu")
-     state_dict = pick_state_dict(ckpt)
-
-     if state_dict is None:
-         print("Not a state_dict-like dict. Type:", type(ckpt))
-         if isinstance(ckpt, dict):
-             print("Top-level keys:", list(ckpt.keys())[:50])
-         raise SystemExit(1)
-
-     layer_sum, layer_cnt = aggregate_layers_abs_sum(state_dict)
-     save_layer_csv(layer_sum, layer_cnt, out_csv)
-     print(f"Saved CSV: {out_csv}")
-
-     xs = list(range(NUM_LAYERS))
-     ys = [layer_sum[i] for i in xs]
-     plot_line(xs, ys, out_png_line, f"{LAYER_PREFIX}*: abs_sum per layer")
-     plot_bar(xs, ys, out_png_bar, f"{LAYER_PREFIX}*: abs_sum per layer")
-
-     print(f"Saved plot: {out_png_line}")
-     print(f"Saved plot: {out_png_bar}")
-
-     top = sorted(((i, layer_sum[i], layer_cnt[i]) for i in xs), key=lambda item: item[1], reverse=True)[:5]
-     print("Top-5 layers by abs_sum:")
-     for layer_id, abs_sum, tensor_count in top:
-         print(f"  layer {layer_id:02d}: abs_sum={abs_sum:.6e}, tensors={tensor_count}")
-
-
- if __name__ == "__main__":
-     main()
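
For intuition on the aggregation above: the regex maps each parameter name to its transformer layer id, so every tensor in a layer contributes to that layer's abs_sum. A toy check, assuming Llama-style key names:

    import re

    pattern = re.compile(r"^language_model\.model\.layers\.(\d+)\.")
    match = pattern.match("language_model.model.layers.12.self_attn.q_proj.weight")
    print(int(match.group(1)))  # -> 12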
 
capvector-oft/capvector/tools/vector_regularize.py DELETED
@@ -1,75 +0,0 @@
- # Used to regularize feature vectors by first computing the absolute-sum of each parameter and then performing normalization
-
- import argparse
- from collections import OrderedDict
-
- import torch
-
-
- def pick_state_dict(obj):
-     """Extract state_dict from a checkpoint-like object"""
-     if isinstance(obj, (OrderedDict, dict)):
-         for k in ["state_dict", "model_state_dict", "model", "net", "weights", "params"]:
-             if k in obj and isinstance(obj[k], (OrderedDict, dict)):
-                 return obj[k]
-         if any(torch.is_tensor(v) for v in obj.values()):
-             return obj
-     return None
-
-
- def calculate_total_abs_sum(state_dict):
-     """Compute the sum of absolute values over all parameters"""
-     total_sum = 0.0
-     param_count = 0
-
-     for name, tensor in state_dict.items():
-         if not torch.is_tensor(tensor):
-             continue
-
-         x = tensor.detach()
-         if x.is_cuda:
-             x = x.cpu()
-
-         # Use float64 to ensure numerical precision
-         x = x.to(torch.float64)
-         abs_sum = x.abs().sum().item()
-         total_sum += abs_sum
-         param_count += 1
-
-         print(f"{name}: {abs_sum:.12e} (shape: {list(x.shape)}, numel: {x.numel()})")
-
-     return total_sum, param_count
-
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("checkpoint_path", help="Path to the feature vector checkpoint (.pth)")
-     args = parser.parse_args()
-
-     print(f"Loading checkpoint: {args.checkpoint_path}")
-     ckpt = torch.load(args.checkpoint_path, map_location="cpu")
-     sd = pick_state_dict(ckpt)
-
-     if sd is None:
-         print("Error: failed to extract state_dict from checkpoint")
-         print(f"Checkpoint type: {type(ckpt)}")
-         if isinstance(ckpt, dict):
-             print(f"Top-level keys: {list(ckpt.keys())[:20]}")
-         raise SystemExit(1)
-
-     print(f"\nFound {len(sd)} parameters\n")
-     print("=" * 80)
-     print("Absolute-sum of each parameter:")
-     print("=" * 80)
-
-     total_abs_sum, param_count = calculate_total_abs_sum(sd)
-
-     print("=" * 80)
-     print("\nSummary:")
-     print(f"  Total number of parameters: {param_count}")
-     print(f"  Sum of absolute values of all parameters: {total_abs_sum:.12e}")
-     print(f"  Sum of absolute values of all parameters (scientific notation): {total_abs_sum:.6e}")
-
-
- if __name__ == "__main__":
-     main()
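
The header comment mentions normalization, but the script above only reports the absolute sums. A hedged sketch of what that step could look like, assuming the goal is to rescale a feature vector so its total L1 mass matches a target value (the function name and target are illustrative, not from the repo):

    import torch

    def normalize_vector(vector, target_abs_sum):
        # Rescale every tensor by target / total so the vector's total abs-sum equals the target
        total = sum(t.detach().to(torch.float64).abs().sum().item()
                    for t in vector.values() if torch.is_tensor(t))
        scale = target_abs_sum / total
        return {k: (t * scale if torch.is_tensor(t) else t) for k, t in vector.items()}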
 
capvector-oft/experiments/robot/aloha/aloha_utils.py DELETED
@@ -1,85 +0,0 @@
- """Utils for evaluating policies in real-world ALOHA environments."""
-
- import os
-
- import imageio
- import numpy as np
- from PIL import Image
-
- from experiments.robot.aloha.real_env import make_real_env
- from experiments.robot.robot_utils import (
-     DATE,
-     DATE_TIME,
- )
-
-
- def get_next_task_label(task_label):
-     """Prompt the user to input the next task."""
-     if task_label == "":
-         user_input = ""
-         while user_input == "":
-             user_input = input("Enter the task name: ")
-         task_label = user_input
-     else:
-         user_input = input("Enter the task name (or leave blank to repeat the previous task): ")
-         if user_input == "":
-             pass  # Do nothing -> Let task_label stay the same
-         else:
-             task_label = user_input
-     print(f"Task: {task_label}")
-     return task_label
-
-
- def get_aloha_env():
-     """Initializes and returns the ALOHA environment."""
-     env = make_real_env(init_node=True)
-     return env
-
-
- def resize_image_for_preprocessing(img):
-     """
-     Takes a numpy array corresponding to a single image and resizes it to 256x256, exactly as done
-     in the ALOHA data preprocessing script, which is used before converting the dataset to RLDS.
-     """
-     ALOHA_PREPROCESS_SIZE = 256
-     img = np.array(
-         Image.fromarray(img).resize((ALOHA_PREPROCESS_SIZE, ALOHA_PREPROCESS_SIZE), resample=Image.BICUBIC)
-     )  # BICUBIC is default; specify explicitly to make it clear
-     return img
-
-
- def get_aloha_image(obs):
-     """Extracts third-person image from observations and preprocesses it."""
-     # obs: dm_env._environment.TimeStep
-     img = obs.observation["images"]["cam_high"]
-     img = resize_image_for_preprocessing(img)
-     return img
-
-
- def get_aloha_wrist_images(obs):
-     """Extracts both wrist camera images from observations and preprocesses them."""
-     # obs: dm_env._environment.TimeStep
-     left_wrist_img = obs.observation["images"]["cam_left_wrist"]
-     right_wrist_img = obs.observation["images"]["cam_right_wrist"]
-     left_wrist_img = resize_image_for_preprocessing(left_wrist_img)
-     right_wrist_img = resize_image_for_preprocessing(right_wrist_img)
-     return left_wrist_img, right_wrist_img
-
-
- def save_rollout_video(rollout_images, idx, success, task_description, log_file=None, notes=None):
-     """Saves an MP4 replay of an episode."""
-     rollout_dir = f"./rollouts/{DATE}"
-     os.makedirs(rollout_dir, exist_ok=True)
-     processed_task_description = task_description.lower().replace(" ", "_").replace("\n", "_").replace(".", "_")[:50]
-     filetag = f"{rollout_dir}/{DATE_TIME}--openvla_oft--episode={idx}--success={success}--task={processed_task_description}"
-     if notes is not None:
-         filetag += f"--{notes}"
-     mp4_path = f"{filetag}.mp4"
-     video_writer = imageio.get_writer(mp4_path, fps=25)
-     for img in rollout_images:
-         video_writer.append_data(img)
-     video_writer.close()
-     print(f"Saved rollout MP4 at path {mp4_path}")
-     if log_file is not None:
-         log_file.write(f"Saved rollout MP4 at path {mp4_path}\n")
-     return mp4_path
 
capvector-oft/experiments/robot/aloha/constants.py DELETED
@@ -1,100 +0,0 @@
- ### Task parameters
-
- DATA_DIR = '/scr2/moojink/data/aloha1/'
- TASK_CONFIGS = {
-     # fold shorts
-     'fold_shorts': {
-         'dataset_dir': DATA_DIR + '/fold_shorts',
-         'num_episodes': 20,
-         'episode_len': 1000,
-         'camera_names': ['cam_high', 'cam_left_wrist', 'cam_right_wrist']
-     },
-     # fold shirt
-     'fold_shirt': {
-         'dataset_dir': DATA_DIR + '/fold_shirt',
-         'num_episodes': 30,
-         'episode_len': 1250,
-         'camera_names': ['cam_high', 'cam_left_wrist', 'cam_right_wrist']
-     },
-     # scoop X into bowl
-     'scoop_raisins_into_bowl': {
-         'dataset_dir': DATA_DIR + '/scoop_raisins_into_bowl',
-         'num_episodes': 15,
-         'episode_len': 900,
-         'camera_names': ['cam_high', 'cam_left_wrist', 'cam_right_wrist']
-     },
-     'scoop_almonds_and_green_M&Ms_into_bowl': {
-         'dataset_dir': DATA_DIR + '/scoop_almonds_and_green_M&Ms_into_bowl',
-         'num_episodes': 15,
-         'episode_len': 900,
-         'camera_names': ['cam_high', 'cam_left_wrist', 'cam_right_wrist']
-     },
-     'scoop_pretzels_into_bowl': {
-         'dataset_dir': DATA_DIR + '/scoop_pretzels_into_bowl',
-         'num_episodes': 15,
-         'episode_len': 900,
-         'camera_names': ['cam_high', 'cam_left_wrist', 'cam_right_wrist']
-     },
-     # put X into pot
-     'put_red_pepper_into_pot': {
-         'dataset_dir': DATA_DIR + '/put_red_pepper_into_pot',
-         'num_episodes': 100,
-         'episode_len': 400,
-         'camera_names': ['cam_high', 'cam_low', 'cam_left_wrist', 'cam_right_wrist']
-     },
-     'put_yellow_corn_into_pot': {
-         'dataset_dir': DATA_DIR + '/put_yellow_corn_into_pot',
-         'num_episodes': 100,
-         'episode_len': 400,
-         'camera_names': ['cam_high', 'cam_low', 'cam_left_wrist', 'cam_right_wrist']
-     },
-     'put_green_pepper_into_pot': {
-         'dataset_dir': DATA_DIR + '/put_green_pepper_into_pot',
-         'num_episodes': 100,
-         'episode_len': 400,
-         'camera_names': ['cam_high', 'cam_low', 'cam_left_wrist', 'cam_right_wrist']
-     },
- }
-
- ### ALOHA fixed constants
- DT = 0.04  # 1 / 0.04 -> 25 Hz
- JOINT_NAMES = ["waist", "shoulder", "elbow", "forearm_roll", "wrist_angle", "wrist_rotate"]
- START_ARM_POSE = [0, -0.96, 1.16, 0, -0.3, 0, 0.02239, -0.02239, 0, -0.96, 1.16, 0, -0.3, 0, 0.02239, -0.02239]
-
- # Left finger position limits (qpos[7]), right_finger = -1 * left_finger
- MASTER_GRIPPER_POSITION_OPEN = 0.02417
- MASTER_GRIPPER_POSITION_CLOSE = 0.01244
- PUPPET_GRIPPER_POSITION_OPEN = 0.05800
- PUPPET_GRIPPER_POSITION_CLOSE = 0.01844
-
- # Gripper joint limits (qpos[6])
- MASTER_GRIPPER_JOINT_OPEN = 0.3083    # For ALOHA 1
- MASTER_GRIPPER_JOINT_CLOSE = -0.6842  # For ALOHA 1
- # MASTER_GRIPPER_JOINT_OPEN = -0.8    # For ALOHA 2
- # MASTER_GRIPPER_JOINT_CLOSE = -1.65  # For ALOHA 2
- PUPPET_GRIPPER_JOINT_OPEN = 1.4910
- PUPPET_GRIPPER_JOINT_CLOSE = -0.6213
-
- ############################ Helper functions ############################
-
- MASTER_GRIPPER_POSITION_NORMALIZE_FN = lambda x: (x - MASTER_GRIPPER_POSITION_CLOSE) / (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE)
- PUPPET_GRIPPER_POSITION_NORMALIZE_FN = lambda x: (x - PUPPET_GRIPPER_POSITION_CLOSE) / (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE)
- MASTER_GRIPPER_POSITION_UNNORMALIZE_FN = lambda x: x * (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE) + MASTER_GRIPPER_POSITION_CLOSE
- PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN = lambda x: x * (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE) + PUPPET_GRIPPER_POSITION_CLOSE
- MASTER2PUPPET_POSITION_FN = lambda x: PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN(MASTER_GRIPPER_POSITION_NORMALIZE_FN(x))
-
- MASTER_GRIPPER_JOINT_NORMALIZE_FN = lambda x: (x - MASTER_GRIPPER_JOINT_CLOSE) / (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE)
- PUPPET_GRIPPER_JOINT_NORMALIZE_FN = lambda x: (x - PUPPET_GRIPPER_JOINT_CLOSE) / (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE)
- MASTER_GRIPPER_JOINT_UNNORMALIZE_FN = lambda x: x * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE) + MASTER_GRIPPER_JOINT_CLOSE
- PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN = lambda x: x * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE) + PUPPET_GRIPPER_JOINT_CLOSE
- MASTER2PUPPET_JOINT_FN = lambda x: PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(MASTER_GRIPPER_JOINT_NORMALIZE_FN(x))
-
- MASTER_GRIPPER_VELOCITY_NORMALIZE_FN = lambda x: x / (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE)
- PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN = lambda x: x / (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE)
-
- MASTER_POS2JOINT = lambda x: MASTER_GRIPPER_POSITION_NORMALIZE_FN(x) * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE) + MASTER_GRIPPER_JOINT_CLOSE
- MASTER_JOINT2POS = lambda x: MASTER_GRIPPER_POSITION_UNNORMALIZE_FN((x - MASTER_GRIPPER_JOINT_CLOSE) / (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE))
- PUPPET_POS2JOINT = lambda x: PUPPET_GRIPPER_POSITION_NORMALIZE_FN(x) * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE) + PUPPET_GRIPPER_JOINT_CLOSE
- PUPPET_JOINT2POS = lambda x: PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN((x - PUPPET_GRIPPER_JOINT_CLOSE) / (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE))
-
- MASTER_GRIPPER_JOINT_MID = (MASTER_GRIPPER_JOINT_OPEN + MASTER_GRIPPER_JOINT_CLOSE) / 2
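
As a worked example of the conversion lambdas above: MASTER2PUPPET_JOINT_FN first normalizes a master joint reading into [0, 1], then unnormalizes it into the puppet's joint range. Plugging in the ALOHA 1 constants at the master's midpoint:

    x = MASTER_GRIPPER_JOINT_MID                   # (0.3083 + -0.6842) / 2 = -0.18795 rad
    x_norm = MASTER_GRIPPER_JOINT_NORMALIZE_FN(x)  # 0.5
    x_puppet = MASTER2PUPPET_JOINT_FN(x)           # 0.5 * (1.4910 - -0.6213) + -0.6213 ≈ 0.4349 rad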
 
capvector-oft/experiments/robot/aloha/preprocess_split_aloha_data.py DELETED
@@ -1,260 +0,0 @@
- """
- Preprocesses ALOHA dataset(s) and splits them into train/val sets.
-
- Preprocessing includes downsizing images from 480x640 to 256x256.
- Splits happen at the episode level (not step level), which means that
- an episode is treated as an atomic unit that entirely goes to either
- the train set or the val set.
-
- Original ALOHA data layout:
-     /PATH/TO/DATASET/dataset_name/
-         - episode_0.hdf5
-         - episode_1.hdf5
-         - ...
-         - episode_N.hdf5
-
- Preprocessed data layout (after running this script):
-     /PATH/TO/PREPROCESSED_DATASETS/dataset_name/
-         - train/
-             - episode_0.hdf5
-             - episode_1.hdf5
-             - ...
-             - episode_M.hdf5
-         - val/
-             - episode_0.hdf5
-             - episode_1.hdf5
-             - ...
-             - episode_K.hdf5
-
- where N > M > K
-
- Example usage:
-     # "put X into pot" task
-     python experiments/robot/aloha/preprocess_split_aloha_data.py \
-         --dataset_path /scr/moojink/data/aloha1_raw/put_green_pepper_into_pot/ \
-         --out_base_dir /scr/moojink/data/aloha1_preprocessed/ \
-         --percent_val 0.05 && \
-     python experiments/robot/aloha/preprocess_split_aloha_data.py \
-         --dataset_path /scr/moojink/data/aloha1_raw/put_red_pepper_into_pot/ \
-         --out_base_dir /scr/moojink/data/aloha1_preprocessed/ \
-         --percent_val 0.05 && \
-     python experiments/robot/aloha/preprocess_split_aloha_data.py \
-         --dataset_path /scr/moojink/data/aloha1_raw/put_yellow_corn_into_pot/ \
-         --out_base_dir /scr/moojink/data/aloha1_preprocessed/ \
-         --percent_val 0.05
- """
-
- import argparse
- import glob
- import os
- import random
-
- import h5py
- import numpy as np
- from PIL import Image
- from tqdm import tqdm
-
-
- def load_hdf5(demo_path):
-     """Loads a single episode."""
-     if not os.path.isfile(demo_path):
-         print(f"Dataset does not exist at \n{demo_path}\n")
-         exit()
-
-     print(f"Loading {demo_path}...")
-     with h5py.File(demo_path, "r") as root:
-         is_sim = root.attrs["sim"]
-         qpos = root["/observations/qpos"][()]
-         qvel = root["/observations/qvel"][()]
-         effort = root["/observations/effort"][()]
-         action = root["/action"][()]
-         image_dict = dict()
-         for cam_name in root["/observations/images/"].keys():
-             image_dict[cam_name] = root[f"/observations/images/{cam_name}"][()]
-     print(f"Loading episode complete: {demo_path}")
-
-     return qpos, qvel, effort, action, image_dict, is_sim
-
-
- def load_and_preprocess_all_episodes(demo_paths, out_dataset_dir):
-     """
-     Loads and preprocesses all episodes.
-     Resizes all images in one episode before loading the next, to reduce memory usage.
-     """
-     cam_names = ["cam_high", "cam_left_wrist", "cam_right_wrist"]
-     idx = 0
-     for demo in tqdm(demo_paths):
-         qpos, qvel, effort, action, image_dict, is_sim = load_hdf5(demo)
-         # Save non-image info
-         episode_len = image_dict["cam_high"].shape[0]
-         # Resize all images
-         print("Resizing images in episode...")
-         for k in cam_names:
-             resized_images = []
-             for i in range(episode_len):
-                 resized_images.append(
-                     np.array(
-                         Image.fromarray(image_dict[k][i]).resize(
-                             (args.img_resize_size, args.img_resize_size), resample=Image.BICUBIC
-                         )
-                     )  # BICUBIC is default; specify explicitly to make it clear
-                 )
-             image_dict[k] = np.stack(resized_images)
-         print("Resizing images in episode complete!")
-         # Save preprocessed episode
-         data_dict = dict(
-             qpos=qpos,
-             qvel=qvel,
-             effort=effort,
-             action=action,
-             image_dict=image_dict,
-             is_sim=is_sim,
-         )
-         save_new_hdf5(out_dataset_dir, data_dict, idx)
-         idx += 1
-
-
- def randomly_split(full_qpos, full_qvel, full_effort, full_action, full_image_dict, percent_val):
-     """Randomly splits the dataset into train and validation sets."""
-     # Create a list of episode indices
-     num_episodes_total = len(full_qpos)
-     indices = list(range(num_episodes_total))
-     # Shuffle the episode indices
-     random.shuffle(indices)
-     # Create new lists using the shuffled indices
-     shuffled_qpos = [full_qpos[idx] for idx in indices]
-     shuffled_qvel = [full_qvel[idx] for idx in indices]
-     shuffled_effort = [full_effort[idx] for idx in indices]
-     shuffled_action = [full_action[idx] for idx in indices]
-     shuffled_image_dict = {
-         "cam_high": [],
-         "cam_left_wrist": [],
-         "cam_right_wrist": [],
-     }
-     for k in full_image_dict.keys():
-         shuffled_image_dict[k] = [full_image_dict[k][idx] for idx in indices]
-     # Split into train and val sets
-     num_episodes_val = int(num_episodes_total * percent_val)
-     print(f"Total # episodes: {num_episodes_total}; using {num_episodes_val} ({percent_val:.2%}) for val set")
-     num_episodes_train = num_episodes_total - num_episodes_val
-     train_dict = dict(
-         qpos=shuffled_qpos[:num_episodes_train],
-         qvel=shuffled_qvel[:num_episodes_train],
-         effort=shuffled_effort[:num_episodes_train],
-         action=shuffled_action[:num_episodes_train],
-         image_dict=dict(
-             cam_high=shuffled_image_dict["cam_high"][:num_episodes_train],
-             cam_left_wrist=shuffled_image_dict["cam_left_wrist"][:num_episodes_train],
-             cam_right_wrist=shuffled_image_dict["cam_right_wrist"][:num_episodes_train],
-         ),
-     )
-     val_dict = dict(
-         qpos=shuffled_qpos[num_episodes_train:],
-         qvel=shuffled_qvel[num_episodes_train:],
-         effort=shuffled_effort[num_episodes_train:],
-         action=shuffled_action[num_episodes_train:],
-         image_dict=dict(
-             cam_high=shuffled_image_dict["cam_high"][num_episodes_train:],
-             cam_left_wrist=shuffled_image_dict["cam_left_wrist"][num_episodes_train:],
-             cam_right_wrist=shuffled_image_dict["cam_right_wrist"][num_episodes_train:],
-         ),
-     )
-     return train_dict, val_dict
-
-
- def save_new_hdf5(out_dataset_dir, data_dict, episode_idx):
-     """Saves an HDF5 file for a new episode."""
-     camera_names = data_dict["image_dict"].keys()
-     H, W, C = data_dict["image_dict"]["cam_high"][0].shape
-     out_path = os.path.join(out_dataset_dir, f"episode_{episode_idx}.hdf5")
-     # Save HDF5 with the same structure as the original demos
-     with h5py.File(
-         out_path, "w", rdcc_nbytes=1024**2 * 2
-     ) as root:  # Magic constant for rdcc_nbytes comes from the ALOHA codebase
-         episode_len = data_dict["qpos"].shape[0]
-         root.attrs["sim"] = data_dict["is_sim"]
-         obs = root.create_group("observations")
-         _ = obs.create_dataset("qpos", (episode_len, 14))
-         _ = obs.create_dataset("qvel", (episode_len, 14))
-         _ = obs.create_dataset("effort", (episode_len, 14))
-         root["/observations/qpos"][...] = data_dict["qpos"]
-         root["/observations/qvel"][...] = data_dict["qvel"]
-         root["/observations/effort"][...] = data_dict["effort"]
-         image = obs.create_group("images")
-         for cam_name in camera_names:
-             _ = image.create_dataset(
-                 cam_name,
-                 (episode_len, H, W, C),
-                 dtype="uint8",
-                 chunks=(1, H, W, C),
-             )
-             root[f"/observations/images/{cam_name}"][...] = data_dict["image_dict"][cam_name]
-         _ = root.create_dataset("action", (episode_len, 14))
-         root["/action"][...] = data_dict["action"]
-         # Compute and save *relative* actions as well
-         actions = data_dict["action"]
-         relative_actions = np.zeros_like(actions)
-         relative_actions[:-1] = actions[1:] - actions[:-1]  # Relative actions are the changes in joint pos
-         relative_actions[-1] = relative_actions[-2]  # Just copy the second-to-last action for the last action
-         _ = root.create_dataset("relative_action", (episode_len, 14))
-         root["/relative_action"][...] = relative_actions
-     print(f"Saved dataset: {out_path}")
-
-
- def main(args):
-     # Create directory to save preprocessed dataset (if it doesn't exist already)
-     os.makedirs(args.out_base_dir, exist_ok=True)
-     out_dataset_dir = os.path.join(args.out_base_dir, os.path.basename(args.dataset_path.rstrip("/")))
-     os.makedirs(out_dataset_dir, exist_ok=True)
-     # Get list of filepaths of all episodes
-     all_demo_paths = glob.glob(os.path.join(args.dataset_path, "*.hdf5"))  # List of HDF5 filepaths
-     all_demo_paths.sort()
-     # Create a list of episode indices
-     num_episodes_total = len(all_demo_paths)
-     indices = list(range(num_episodes_total))
-     # Shuffle the episode indices
-     random.shuffle(indices)
-     # Split into train and val sets
-     num_episodes_val = int(num_episodes_total * args.percent_val)
-     print(f"Total # episodes: {num_episodes_total}; using {num_episodes_val} ({args.percent_val:.2%}) for val set")
-     num_episodes_train = num_episodes_total - num_episodes_val
-     train_indices = indices[:num_episodes_train]
-     val_indices = indices[num_episodes_train:]
-     train_demo_paths = [all_demo_paths[i] for i in train_indices]
-     val_demo_paths = [all_demo_paths[i] for i in val_indices]
-     # Preprocess all episodes and save the result
-     out_dataset_dir_train = os.path.join(out_dataset_dir, "train")
-     out_dataset_dir_val = os.path.join(out_dataset_dir, "val")
-     os.makedirs(out_dataset_dir_train, exist_ok=True)
-     os.makedirs(out_dataset_dir_val, exist_ok=True)
-     load_and_preprocess_all_episodes(train_demo_paths, out_dataset_dir_train)
-     load_and_preprocess_all_episodes(val_demo_paths, out_dataset_dir_val)
-
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument(
-         "--dataset_path",
-         required=True,
-         help="Path to raw ALOHA dataset directory. Example: /PATH/TO/USER/data/aloha_raw/put_green_pepper_into_pot/",
-     )
-     parser.add_argument(
-         "--out_base_dir",
-         required=True,
-         help="Path to directory in which to save preprocessed dataset. Example: /PATH/TO/USER/data/aloha_preprocessed/",
-     )
-     parser.add_argument(
-         "--percent_val",
-         type=float,
-         help="Fraction of the dataset to use as the validation set (measured in episodes, not steps).",
-         default=0.05,
-     )
-     parser.add_argument(
-         "--img_resize_size",
-         type=int,
-         help="Size to resize images to. Final images will be square (img_resize_size x img_resize_size pixels).",
-         default=256,
-     )
-     args = parser.parse_args()
-
-     main(args)
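
A small sanity check for the output format, assuming an episode file written by the script above (the path is illustrative): cumulatively applying relative_action should recover the absolute actions for every step except the last, whose relative action is copied from the second-to-last.

    import h5py
    import numpy as np

    with h5py.File("episode_0.hdf5", "r") as f:
        action = f["/action"][()]
        rel = f["/relative_action"][()]

    # action[t + 1] == action[t] + rel[t] for all but the final step
    assert np.allclose(action[1:], action[:-1] + rel[:-1])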
 
capvector-oft/experiments/robot/aloha/real_env.py DELETED
@@ -1,213 +0,0 @@
- import time
- import numpy as np
- import collections
- import matplotlib.pyplot as plt
- import dm_env
-
- from experiments.robot.aloha.constants import DT, START_ARM_POSE, MASTER_GRIPPER_JOINT_NORMALIZE_FN, PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN
- from experiments.robot.aloha.constants import PUPPET_GRIPPER_POSITION_NORMALIZE_FN, PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN
- from experiments.robot.aloha.constants import PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE
- from experiments.robot.aloha.robot_utils import Recorder, ImageRecorder
- from experiments.robot.aloha.robot_utils import setup_master_bot, setup_puppet_bot, move_arms, move_grippers
- from interbotix_xs_modules.arm import InterbotixManipulatorXS
- from interbotix_xs_msgs.msg import JointSingleCommand
-
- import IPython
- e = IPython.embed
-
-
- class RealEnv:
-     """
-     Environment for real-robot bimanual manipulation.
-     Action space:      [left_arm_qpos (6),            # absolute joint position
-                         left_gripper_positions (1),   # normalized gripper position (0: close, 1: open)
-                         right_arm_qpos (6),           # absolute joint position
-                         right_gripper_positions (1)]  # normalized gripper position (0: close, 1: open)
-
-     Observation space: {"qpos": Concat[left_arm_qpos (6),          # absolute joint position
-                                        left_gripper_position (1),  # normalized gripper position (0: close, 1: open)
-                                        right_arm_qpos (6),         # absolute joint position
-                                        right_gripper_qpos (1)],    # normalized gripper position (0: close, 1: open)
-                         "qvel": Concat[left_arm_qvel (6),          # absolute joint velocity (rad)
-                                        left_gripper_velocity (1),  # normalized gripper velocity (pos: opening, neg: closing)
-                                        right_arm_qvel (6),         # absolute joint velocity (rad)
-                                        right_gripper_qvel (1)],    # normalized gripper velocity (pos: opening, neg: closing)
-                         "images": {"cam_high": (480x640x3),          # h, w, c, dtype='uint8'
-                                    "cam_low": (480x640x3),           # h, w, c, dtype='uint8'
-                                    "cam_left_wrist": (480x640x3),    # h, w, c, dtype='uint8'
-                                    "cam_right_wrist": (480x640x3)}}  # h, w, c, dtype='uint8'
-     """
-
-     def __init__(self, init_node, setup_robots=True):
-         self.puppet_bot_left = InterbotixManipulatorXS(robot_model="vx300s", group_name="arm", gripper_name="gripper",
-                                                        robot_name='puppet_left', init_node=init_node)
-         self.puppet_bot_right = InterbotixManipulatorXS(robot_model="vx300s", group_name="arm", gripper_name="gripper",
-                                                         robot_name='puppet_right', init_node=False)
-         if setup_robots:
-             self.setup_robots()
-
-         self.recorder_left = Recorder('left', init_node=False)
-         self.recorder_right = Recorder('right', init_node=False)
-         self.image_recorder = ImageRecorder(init_node=False)
-         self.gripper_command = JointSingleCommand(name="gripper")
-
-     def setup_robots(self):
-         setup_puppet_bot(self.puppet_bot_left)
-         setup_puppet_bot(self.puppet_bot_right)
-
-     def get_qpos(self):
-         left_qpos_raw = self.recorder_left.qpos
-         right_qpos_raw = self.recorder_right.qpos
-         left_arm_qpos = left_qpos_raw[:6]
-         right_arm_qpos = right_qpos_raw[:6]
-         left_gripper_qpos = [PUPPET_GRIPPER_POSITION_NORMALIZE_FN(left_qpos_raw[7])]  # this is a position, not a joint angle
-         right_gripper_qpos = [PUPPET_GRIPPER_POSITION_NORMALIZE_FN(right_qpos_raw[7])]  # this is a position, not a joint angle
-         return np.concatenate([left_arm_qpos, left_gripper_qpos, right_arm_qpos, right_gripper_qpos])
-
-     def get_qvel(self):
-         left_qvel_raw = self.recorder_left.qvel
-         right_qvel_raw = self.recorder_right.qvel
-         left_arm_qvel = left_qvel_raw[:6]
-         right_arm_qvel = right_qvel_raw[:6]
-         left_gripper_qvel = [PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN(left_qvel_raw[7])]
-         right_gripper_qvel = [PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN(right_qvel_raw[7])]
-         return np.concatenate([left_arm_qvel, left_gripper_qvel, right_arm_qvel, right_gripper_qvel])
-
-     def get_effort(self):
-         left_effort_raw = self.recorder_left.effort
-         right_effort_raw = self.recorder_right.effort
-         left_robot_effort = left_effort_raw[:7]
-         right_robot_effort = right_effort_raw[:7]
-         return np.concatenate([left_robot_effort, right_robot_effort])
-
-     def get_images(self):
-         return self.image_recorder.get_images()
-
-     def set_gripper_pose(self, left_gripper_desired_pos_normalized, right_gripper_desired_pos_normalized):
-         left_gripper_desired_joint = PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(left_gripper_desired_pos_normalized)
-         self.gripper_command.cmd = left_gripper_desired_joint
-         self.puppet_bot_left.gripper.core.pub_single.publish(self.gripper_command)
-
-         right_gripper_desired_joint = PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(right_gripper_desired_pos_normalized)
-         self.gripper_command.cmd = right_gripper_desired_joint
-         self.puppet_bot_right.gripper.core.pub_single.publish(self.gripper_command)
-
-     def _reset_joints(self):
-         reset_position = START_ARM_POSE[:6]
-         move_arms([self.puppet_bot_left, self.puppet_bot_right], [reset_position, reset_position], move_time=1)
-
-     def _reset_gripper(self):
-         """Do position resets: first open both grippers, then close them."""
-         move_grippers([self.puppet_bot_left, self.puppet_bot_right], [PUPPET_GRIPPER_JOINT_OPEN] * 2, move_time=0.5)
-         move_grippers([self.puppet_bot_left, self.puppet_bot_right], [PUPPET_GRIPPER_JOINT_CLOSE] * 2, move_time=1)
-
-     def _get_obs(self):
-         obs = collections.OrderedDict()
-         obs['qpos'] = self.get_qpos()
-         obs['qvel'] = self.get_qvel()
-         obs['effort'] = self.get_effort()
-         obs['images'] = self.get_images()
-         return obs
-
-     def get_observation(self, t=0):
-         step_type = dm_env.StepType.FIRST if t == 0 else dm_env.StepType.MID
-         return dm_env.TimeStep(
-             step_type=step_type,
-             reward=self.get_reward(),
-             discount=None,
-             observation=self._get_obs()
-         )
-
-     def get_reward(self):
-         return 0
-
-     def reset(self, fake=False):
-         if not fake:
-             # Reboot puppet robot gripper motors
-             self.puppet_bot_left.dxl.robot_reboot_motors("single", "gripper", True)
-             self.puppet_bot_right.dxl.robot_reboot_motors("single", "gripper", True)
-             self._reset_joints()
-             self._reset_gripper()
-         return dm_env.TimeStep(
-             step_type=dm_env.StepType.FIRST,
-             reward=self.get_reward(),
-             discount=None,
-             observation=self._get_obs())
-
-     def step(self, action):
-         state_len = int(len(action) / 2)
-         left_action = action[:state_len]
-         right_action = action[state_len:]
-         self.puppet_bot_left.arm.set_joint_positions(left_action[:6], blocking=False)
-         self.puppet_bot_right.arm.set_joint_positions(right_action[:6], blocking=False)
-         self.set_gripper_pose(left_action[-1], right_action[-1])
-         time.sleep(DT)
-         return dm_env.TimeStep(
-             step_type=dm_env.StepType.MID,
-             reward=self.get_reward(),
-             discount=None,
-             observation=self._get_obs())
-
-
- def get_action(master_bot_left, master_bot_right):
-     action = np.zeros(14)  # 6 joints + 1 gripper, for two arms
-     # Arm actions
-     action[:6] = master_bot_left.dxl.joint_states.position[:6]
-     action[7:7+6] = master_bot_right.dxl.joint_states.position[:6]
-     # Gripper actions
-     action[6] = MASTER_GRIPPER_JOINT_NORMALIZE_FN(master_bot_left.dxl.joint_states.position[6])
-     action[7+6] = MASTER_GRIPPER_JOINT_NORMALIZE_FN(master_bot_right.dxl.joint_states.position[6])
-
-     return action
-
-
- def make_real_env(init_node, setup_robots=True):
-     env = RealEnv(init_node, setup_robots)
-     return env
-
-
- def test_real_teleop():
-     """
-     Tests bimanual teleoperation and shows image observations onscreen.
-     It first reads joint poses from both master arms, then uses them as actions to step the environment.
-     The environment returns full observations including images.
-
-     An alternative approach is to have separate scripts for teleoperation and observation recording.
-     This script will result in higher-fidelity (obs, action) pairs.
-     """
-
-     onscreen_render = True
-     render_cam = 'cam_left_wrist'
-
-     # Source of data
-     master_bot_left = InterbotixManipulatorXS(robot_model="wx250s", group_name="arm", gripper_name="gripper",
-                                               robot_name='master_left', init_node=True)
-     master_bot_right = InterbotixManipulatorXS(robot_model="wx250s", group_name="arm", gripper_name="gripper",
-                                                robot_name='master_right', init_node=False)
-     setup_master_bot(master_bot_left)
-     setup_master_bot(master_bot_right)
-
-     # Set up the environment
-     env = make_real_env(init_node=False)
-     ts = env.reset(fake=True)
-     episode = [ts]
-     # Set up visualization
-     if onscreen_render:
-         ax = plt.subplot()
-         plt_img = ax.imshow(ts.observation['images'][render_cam])
-         plt.ion()
-
-     for t in range(1000):
-         action = get_action(master_bot_left, master_bot_right)
-         ts = env.step(action)
-         episode.append(ts)
-
-         if onscreen_render:
-             plt_img.set_data(ts.observation['images'][render_cam])
-             plt.pause(DT)
-         else:
-             time.sleep(DT)
-
-
- if __name__ == '__main__':
-     test_real_teleop()
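
For reference, the action documented in the RealEnv docstring is a flat 14-dimensional vector. A tiny sketch constructing one (the joint targets are illustrative placeholders):

    import numpy as np

    left_arm_qpos = np.zeros(6)    # absolute joint targets (rad), left arm
    right_arm_qpos = np.zeros(6)   # absolute joint targets (rad), right arm
    action = np.zeros(14)
    action[0:6] = left_arm_qpos
    action[6] = 1.0                # left gripper, normalized (0: close, 1: open)
    action[7:13] = right_arm_qpos
    action[13] = 0.0               # right gripper, normalized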
 
capvector-oft/experiments/robot/aloha/requirements_aloha.txt DELETED
@@ -1,26 +0,0 @@
- numpy<2
- draccus
- torchvision
- torch
- pyquaternion
- pyyaml
- rospkg
- pexpect
- mujoco==2.3.7
- dm_control==1.0.14
- opencv-python
- matplotlib
- einops
- packaging
- h5py
- traitlets
- ipdb
- IPython
- modern_robotics
- Pillow
- termcolor
- imageio[ffmpeg]
- uvicorn
- fastapi
- requests
- json_numpy
 
capvector-oft/experiments/robot/aloha/robot_utils.py DELETED
@@ -1,187 +0,0 @@
- import numpy as np
- import time
- from experiments.robot.aloha.constants import DT
- from interbotix_xs_msgs.msg import JointSingleCommand
-
- import IPython
- e = IPython.embed
-
-
- class ImageRecorder:
-     def __init__(self, init_node=True, is_debug=False):
-         from collections import deque
-         import rospy
-         from cv_bridge import CvBridge
-         from sensor_msgs.msg import Image
-         self.is_debug = is_debug
-         self.bridge = CvBridge()
-         self.camera_names = ['cam_high', 'cam_low', 'cam_left_wrist', 'cam_right_wrist']
-         if init_node:
-             rospy.init_node('image_recorder', anonymous=True)
-         for cam_name in self.camera_names:
-             setattr(self, f'{cam_name}_image', None)
-             setattr(self, f'{cam_name}_secs', None)
-             setattr(self, f'{cam_name}_nsecs', None)
-             if cam_name == 'cam_high':
-                 callback_func = self.image_cb_cam_high
-             elif cam_name == 'cam_low':
-                 callback_func = self.image_cb_cam_low
-             elif cam_name == 'cam_left_wrist':
-                 callback_func = self.image_cb_cam_left_wrist
-             elif cam_name == 'cam_right_wrist':
-                 callback_func = self.image_cb_cam_right_wrist
-             else:
-                 raise NotImplementedError
-             rospy.Subscriber(f"/usb_{cam_name}/image_raw", Image, callback_func)
-             if self.is_debug:
-                 setattr(self, f'{cam_name}_timestamps', deque(maxlen=50))
-         time.sleep(0.5)
-
-     def image_cb(self, cam_name, data):
-         setattr(self, f'{cam_name}_image', self.bridge.imgmsg_to_cv2(data, desired_encoding='passthrough'))
-         setattr(self, f'{cam_name}_secs', data.header.stamp.secs)
-         setattr(self, f'{cam_name}_nsecs', data.header.stamp.nsecs)
-         # cv2.imwrite('/home/tonyzhao/Desktop/sample.jpg', cv_image)
-         if self.is_debug:
-             # Convert the stamp to seconds: secs plus nsecs scaled to seconds
-             getattr(self, f'{cam_name}_timestamps').append(data.header.stamp.secs + data.header.stamp.nsecs * 1e-9)
-
-     def image_cb_cam_high(self, data):
-         cam_name = 'cam_high'
-         return self.image_cb(cam_name, data)
-
-     def image_cb_cam_low(self, data):
-         cam_name = 'cam_low'
-         return self.image_cb(cam_name, data)
-
-     def image_cb_cam_left_wrist(self, data):
-         cam_name = 'cam_left_wrist'
-         return self.image_cb(cam_name, data)
-
-     def image_cb_cam_right_wrist(self, data):
-         cam_name = 'cam_right_wrist'
-         return self.image_cb(cam_name, data)
-
-     def get_images(self):
-         image_dict = dict()
-         for cam_name in self.camera_names:
-             image_dict[cam_name] = getattr(self, f'{cam_name}_image')
-         return image_dict
-
-     def print_diagnostics(self):
-         def dt_helper(l):
-             l = np.array(l)
-             diff = l[1:] - l[:-1]
-             return np.mean(diff)
-         for cam_name in self.camera_names:
-             image_freq = 1 / dt_helper(getattr(self, f'{cam_name}_timestamps'))
-             print(f'{cam_name} {image_freq=:.2f}')
-         print()
-
-
- class Recorder:
-     def __init__(self, side, init_node=True, is_debug=False):
-         from collections import deque
-         import rospy
-         from sensor_msgs.msg import JointState
-         from interbotix_xs_msgs.msg import JointGroupCommand, JointSingleCommand
-
-         self.secs = None
-         self.nsecs = None
-         self.qpos = None
-         self.effort = None
-         self.arm_command = None
-         self.gripper_command = None
-         self.is_debug = is_debug
-
-         if init_node:
-             rospy.init_node('recorder', anonymous=True)
-         rospy.Subscriber(f"/puppet_{side}/joint_states", JointState, self.puppet_state_cb)
-         rospy.Subscriber(f"/puppet_{side}/commands/joint_group", JointGroupCommand, self.puppet_arm_commands_cb)
-         rospy.Subscriber(f"/puppet_{side}/commands/joint_single", JointSingleCommand, self.puppet_gripper_commands_cb)
-         if self.is_debug:
-             self.joint_timestamps = deque(maxlen=50)
-             self.arm_command_timestamps = deque(maxlen=50)
-             self.gripper_command_timestamps = deque(maxlen=50)
-         time.sleep(0.1)
-
-     def puppet_state_cb(self, data):
-         self.qpos = data.position
-         self.qvel = data.velocity
-         self.effort = data.effort
-         self.data = data
-         if self.is_debug:
-             self.joint_timestamps.append(time.time())
-
-     def puppet_arm_commands_cb(self, data):
-         self.arm_command = data.cmd
-         if self.is_debug:
-             self.arm_command_timestamps.append(time.time())
-
-     def puppet_gripper_commands_cb(self, data):
-         self.gripper_command = data.cmd
-         if self.is_debug:
-             self.gripper_command_timestamps.append(time.time())
-
-     def print_diagnostics(self):
-         def dt_helper(l):
-             l = np.array(l)
-             diff = l[1:] - l[:-1]
-             return np.mean(diff)
-
-         joint_freq = 1 / dt_helper(self.joint_timestamps)
-         arm_command_freq = 1 / dt_helper(self.arm_command_timestamps)
-         gripper_command_freq = 1 / dt_helper(self.gripper_command_timestamps)
-
-         print(f'{joint_freq=:.2f}\n{arm_command_freq=:.2f}\n{gripper_command_freq=:.2f}\n')
-
-
- def get_arm_joint_positions(bot):
-     return bot.arm.core.joint_states.position[:6]
-
-
- def get_arm_gripper_positions(bot):
-     joint_position = bot.gripper.core.joint_states.position[6]
-     return joint_position
-
-
- def move_arms(bot_list, target_pose_list, move_time=1):
-     num_steps = int(move_time / DT)
-     curr_pose_list = [get_arm_joint_positions(bot) for bot in bot_list]
-     traj_list = [np.linspace(curr_pose, target_pose, num_steps) for curr_pose, target_pose in zip(curr_pose_list, target_pose_list)]
-     for t in range(num_steps):
-         for bot_id, bot in enumerate(bot_list):
-             bot.arm.set_joint_positions(traj_list[bot_id][t], blocking=False)
-         time.sleep(DT)
-
-
- def move_grippers(bot_list, target_pose_list, move_time):
-     gripper_command = JointSingleCommand(name="gripper")
-     num_steps = int(move_time / DT)
-     curr_pose_list = [get_arm_gripper_positions(bot) for bot in bot_list]
-     traj_list = [np.linspace(curr_pose, target_pose, num_steps) for curr_pose, target_pose in zip(curr_pose_list, target_pose_list)]
-     for t in range(num_steps):
-         for bot_id, bot in enumerate(bot_list):
-             gripper_command.cmd = traj_list[bot_id][t]
-             bot.gripper.core.pub_single.publish(gripper_command)
-         time.sleep(DT)
-
-
- def setup_puppet_bot(bot):
-     bot.dxl.robot_reboot_motors("single", "gripper", True)
-     bot.dxl.robot_set_operating_modes("group", "arm", "position")
-     bot.dxl.robot_set_operating_modes("single", "gripper", "current_based_position")
-     torque_on(bot)
-
-
- def setup_master_bot(bot):
-     bot.dxl.robot_set_operating_modes("group", "arm", "pwm")
-     bot.dxl.robot_set_operating_modes("single", "gripper", "current_based_position")
-     torque_off(bot)
-
-
- def set_standard_pid_gains(bot):
-     bot.dxl.robot_set_motor_registers("group", "arm", 'Position_P_Gain', 800)
-     bot.dxl.robot_set_motor_registers("group", "arm", 'Position_I_Gain', 0)
-
-
- def set_low_pid_gains(bot):
-     bot.dxl.robot_set_motor_registers("group", "arm", 'Position_P_Gain', 100)
-     bot.dxl.robot_set_motor_registers("group", "arm", 'Position_I_Gain', 0)
-
-
- def torque_off(bot):
-     bot.dxl.robot_torque_enable("group", "arm", False)
-     bot.dxl.robot_torque_enable("single", "gripper", False)
-
-
- def torque_on(bot):
-     bot.dxl.robot_torque_enable("group", "arm", True)
-     bot.dxl.robot_torque_enable("single", "gripper", True)
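
move_arms and move_grippers above both discretize a move into num_steps = move_time / DT waypoints along a straight line in joint space, commanding one waypoint per control period. A worked example at the 25 Hz rate (the joint sweep is illustrative):

    import numpy as np

    DT = 0.04                                # 25 Hz control period
    move_time = 1.0
    num_steps = int(move_time / DT)          # 25 waypoints
    traj = np.linspace(0.0, 0.5, num_steps)  # one joint sweeping 0 -> 0.5 rad
    # Each waypoint is commanded, then the loop sleeps DT before the next one.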
 
capvector-oft/experiments/robot/aloha/run_aloha_eval.py DELETED
@@ -1,385 +0,0 @@
- """
- run_aloha_eval.py
-
- Evaluates a model in a real-world ALOHA environment.
- """
-
- import logging
- import os
- import socket
- import sys
- import time
- from collections import deque
- from dataclasses import dataclass
- from pathlib import Path
- from typing import Optional, Union
-
- import draccus
- import tqdm
-
- # Append current directory so that the interpreter can find experiments.robot
- sys.path.append(".")
- from experiments.robot.aloha.aloha_utils import (
-     get_aloha_env,
-     get_aloha_image,
-     get_aloha_wrist_images,
-     get_next_task_label,
-     save_rollout_video,
- )
- from experiments.robot.openvla_utils import (
-     get_action_from_server,
-     resize_image_for_policy,
- )
- from experiments.robot.robot_utils import (
-     DATE_TIME,
-     get_image_resize_size,
-     set_seed_everywhere,
- )
-
- # Set up logging
- logging.basicConfig(
-     level=logging.INFO,
-     format="%(asctime)s [%(levelname)s] %(message)s",
-     handlers=[logging.StreamHandler()],
- )
- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class GenerateConfig:
-     # fmt: off
-
-     #################################################################################################################
-     # Model-specific parameters
-     #################################################################################################################
-     model_family: str = "openvla"                     # Model family
-
-     center_crop: bool = True                          # Center crop? (if trained w/ random crop image aug)
-     num_open_loop_steps: int = 25                     # Number of actions to execute open-loop before requerying policy
-
-     use_vla_server: bool = True                       # Whether to query remote VLA server for actions
-     vla_server_url: Union[str, Path] = ""             # Remote VLA server URL (set to 127.0.0.1 if on same machine)
-
-     #################################################################################################################
-     # ALOHA environment-specific parameters
-     #################################################################################################################
-     num_rollouts_planned: int = 50                    # Number of test rollouts
-     max_steps: int = 1500                             # Max number of steps per rollout
-     use_relative_actions: bool = False                # Whether to use relative actions (delta joint angles)
-
-     #################################################################################################################
-     # Utils
-     #################################################################################################################
-     run_id_note: Optional[str] = None                 # Extra note to add to end of run ID for logging
-     local_log_dir: str = "./experiments/logs"         # Local directory for eval logs
-
-     seed: int = 7                                     # Random Seed (for reproducibility)
-
-     # fmt: on
-
-
- def validate_config(cfg: GenerateConfig) -> None:
-     """Validate configuration parameters."""
-     assert cfg.use_vla_server, (
-         "Must use VLA server (server-client interface) to query model and get actions! Please set --use_vla_server=True"
-     )
-
-
- def setup_logging(cfg: GenerateConfig):
-     """Set up logging to file."""
-     # Create run ID
-     run_id = f"EVAL-{cfg.model_family}-{DATE_TIME}"
-     if cfg.run_id_note is not None:
-         run_id += f"--{cfg.run_id_note}"
-
-     # Set up local logging
-     os.makedirs(cfg.local_log_dir, exist_ok=True)
-     local_log_filepath = os.path.join(cfg.local_log_dir, run_id + ".txt")
-     log_file = open(local_log_filepath, "w")
-     logger.info(f"Logging to local log file: {local_log_filepath}")
-
-     return log_file, local_log_filepath, run_id
-
-
- def log_message(message: str, log_file=None):
-     """Log a message to console and optionally to a log file."""
-     print(message)
-     logger.info(message)
-     if log_file:
-         log_file.write(message + "\n")
-         log_file.flush()
-
-
- def get_server_endpoint(cfg: GenerateConfig):
-     """Get the server endpoint for remote inference."""
-     ip_address = socket.gethostbyname(cfg.vla_server_url)
-     return f"http://{ip_address}:8777/act"
-
-
- def prepare_observation(obs, resize_size):
-     """Prepare observation for policy input."""
-     # Get preprocessed images
-     img = get_aloha_image(obs)
-     left_wrist_img, right_wrist_img = get_aloha_wrist_images(obs)
-
-     # Resize images to size expected by model
-     img_resized = resize_image_for_policy(img, resize_size)
-     left_wrist_img_resized = resize_image_for_policy(left_wrist_img, resize_size)
-     right_wrist_img_resized = resize_image_for_policy(right_wrist_img, resize_size)
-
-     # Prepare observations dict
-     observation = {
-         "full_image": img_resized,
-         "left_wrist_image": left_wrist_img_resized,
-         "right_wrist_image": right_wrist_img_resized,
-         "state": obs.observation["qpos"],
-     }
-
-     return observation, img_resized, left_wrist_img_resized, right_wrist_img_resized
-
-
- def run_episode(
-     cfg: GenerateConfig,
-     env,
-     task_description: str,
-     server_endpoint: str,
-     resize_size,
-     log_file=None,
- ):
-     """Run a single episode in the ALOHA environment."""
-     # Define control frequency
-     STEP_DURATION_IN_SEC = 1.0 / 25.0
-
-     # Reset environment
-     obs = env.reset()
-
-     # Initialize action queue
-     action_queue = deque(maxlen=cfg.num_open_loop_steps)
-
-     # Setup
-     t = 0
-     curr_state = None
-     replay_images = []
-     replay_images_resized = []
-     replay_images_left_wrist_resized = []
-     replay_images_right_wrist_resized = []
-
-     log_message("Prepare the scene, and then press Enter to begin...", log_file)
-     input()
-
-     # Reset environment again to fetch first timestep observation
-     obs = env.reset()
-
-     # Fetch initial robot state (but sleep first so that the robot stops moving)
-     time.sleep(2)
-     curr_state = env.get_qpos()
-
-     episode_start_time = time.time()
-     total_model_query_time = 0.0
-
-     try:
-         while t < cfg.max_steps:
-             # Get step start time (used to compute how much to sleep between steps)
-             step_start_time = time.time()
-
-             # Get observation
-             obs = env.get_observation(t=t)
-
-             # Save raw high camera image for replay video
-             replay_images.append(obs.observation["images"]["cam_high"])
-
-             # If action queue is empty, requery model
-             if len(action_queue) == 0:
-                 # Prepare observation
-                 observation, img_resized, left_wrist_resized, right_wrist_resized = prepare_observation(obs, resize_size)
-                 observation["instruction"] = task_description
-
-                 # Save processed images for replay
-                 replay_images_resized.append(img_resized)
-                 replay_images_left_wrist_resized.append(left_wrist_resized)
-                 replay_images_right_wrist_resized.append(right_wrist_resized)
-
-                 # Query model to get action
-                 log_message("Requerying model...", log_file)
-                 model_query_start_time = time.time()
-                 actions = get_action_from_server(observation, server_endpoint)
-                 actions = actions[: cfg.num_open_loop_steps]
-                 total_model_query_time += time.time() - model_query_start_time
-                 action_queue.extend(actions)
-
-             # Get action from queue
-             action = action_queue.popleft()
-             log_message("-----------------------------------------------------", log_file)
-             log_message(f"t: {t}", log_file)
-             log_message(f"action: {action}", log_file)
-
-             # Execute action in environment
-             if cfg.use_relative_actions:
-                 # Get absolute joint angles from relative action
-                 rel_action = action
-                 target_state = curr_state + rel_action
-                 obs = env.step(target_state.tolist())
-                 # Update current state (assume it is the commanded target state)
-                 curr_state = target_state
-             else:
-                 obs = env.step(action.tolist())
-             t += 1
-
-             # Sleep until next timestep
-             step_elapsed_time = time.time() - step_start_time
-             if step_elapsed_time < STEP_DURATION_IN_SEC:
-                 time_to_sleep = STEP_DURATION_IN_SEC - step_elapsed_time
-                 log_message(f"Sleeping {time_to_sleep:.3f} sec...", log_file)
-                 time.sleep(time_to_sleep)
-
-     except (KeyboardInterrupt, Exception) as e:
-         if isinstance(e, KeyboardInterrupt):
-             log_message("\nCaught KeyboardInterrupt: Terminating episode early.", log_file)
-         else:
-             log_message(f"\nCaught exception: {e}", log_file)
-
-     episode_end_time = time.time()
242
-
243
- # Get success feedback from user
244
- user_input = input("Success? Enter 'y' or 'n': ")
245
- success = True if user_input.lower() == "y" else False
246
-
247
- # Calculate episode statistics
248
- episode_stats = {
249
- "success": success,
250
- "total_steps": t,
251
- "model_query_time": total_model_query_time,
252
- "episode_duration": episode_end_time - episode_start_time,
253
- }
254
-
255
- return (
256
- episode_stats,
257
- replay_images,
258
- replay_images_resized,
259
- replay_images_left_wrist_resized,
260
- replay_images_right_wrist_resized,
261
- )
262
-
263
-
264
- def save_episode_videos(
265
- replay_images,
266
- replay_images_resized,
267
- replay_images_left_wrist,
268
- replay_images_right_wrist,
269
- episode_idx,
270
- success,
271
- task_description,
272
- log_file=None,
273
- ):
274
- """Save videos of the episode from different camera angles."""
275
- # Save main replay video
276
- save_rollout_video(replay_images, episode_idx, success=success, task_description=task_description, log_file=log_file)
277
-
278
- # Save processed view videos
279
- save_rollout_video(
280
- replay_images_resized,
281
- episode_idx,
282
- success=success,
283
- task_description=task_description,
284
- log_file=log_file,
285
- notes="resized",
286
- )
287
- save_rollout_video(
288
- replay_images_left_wrist,
289
- episode_idx,
290
- success=success,
291
- task_description=task_description,
292
- log_file=log_file,
293
- notes="left_wrist_resized",
294
- )
295
- save_rollout_video(
296
- replay_images_right_wrist,
297
- episode_idx,
298
- success=success,
299
- task_description=task_description,
300
- log_file=log_file,
301
- notes="right_wrist_resized",
302
- )
303
-
304
-
305
- @draccus.wrap()
306
- def eval_aloha(cfg: GenerateConfig) -> None:
307
- """Main function to evaluate a trained policy in a real-world ALOHA environment."""
308
- # Validate configuration
309
- validate_config(cfg)
310
-
311
- # Set random seed
312
- set_seed_everywhere(cfg.seed)
313
-
314
- # Setup logging
315
- log_file, local_log_filepath, run_id = setup_logging(cfg)
316
-
317
- # Get expected image dimensions
318
- resize_size = get_image_resize_size(cfg)
319
-
320
- # Get ALOHA environment
321
- env = get_aloha_env()
322
-
323
- # Get server endpoint for remote inference
324
- server_endpoint = get_server_endpoint(cfg)
325
-
326
- # Initialize task description
327
- task_description = ""
328
-
329
- # Start evaluation
330
- num_rollouts_completed, total_successes = 0, 0
331
-
332
- for episode_idx in tqdm.tqdm(range(cfg.num_rollouts_planned)):
333
- # Get task description from user
334
- task_description = get_next_task_label(task_description)
335
- log_message(f"\nTask: {task_description}", log_file)
336
-
337
- log_message(f"Starting episode {num_rollouts_completed + 1}...", log_file)
338
-
339
- # Run episode
340
- episode_stats, replay_images, replay_images_resized, replay_images_left_wrist, replay_images_right_wrist = (
341
- run_episode(cfg, env, task_description, server_endpoint, resize_size, log_file)
342
- )
343
-
344
- # Update counters
345
- num_rollouts_completed += 1
346
- if episode_stats["success"]:
347
- total_successes += 1
348
-
349
- # Save videos
350
- save_episode_videos(
351
- replay_images,
352
- replay_images_resized,
353
- replay_images_left_wrist,
354
- replay_images_right_wrist,
355
- num_rollouts_completed,
356
- episode_stats["success"],
357
- task_description,
358
- log_file,
359
- )
360
-
361
- # Log results
362
- log_message(f"Success: {episode_stats['success']}", log_file)
363
- log_message(f"# episodes completed so far: {num_rollouts_completed}", log_file)
364
- log_message(f"# successes: {total_successes} ({total_successes / num_rollouts_completed * 100:.1f}%)", log_file)
365
- log_message(f"Total model query time: {episode_stats['model_query_time']:.2f} sec", log_file)
366
- log_message(f"Total episode elapsed time: {episode_stats['episode_duration']:.2f} sec", log_file)
367
-
368
- # Calculate final success rate
369
- final_success_rate = float(total_successes) / float(num_rollouts_completed) if num_rollouts_completed > 0 else 0
370
-
371
- # Log final results
372
- log_message("\nFinal results:", log_file)
373
- log_message(f"Total episodes: {num_rollouts_completed}", log_file)
374
- log_message(f"Total successes: {total_successes}", log_file)
375
- log_message(f"Overall success rate: {final_success_rate:.4f} ({final_success_rate * 100:.1f}%)", log_file)
376
-
377
- # Close log file
378
- if log_file:
379
- log_file.close()
380
-
381
- return final_success_rate
382
-
383
-
384
- if __name__ == "__main__":
385
- eval_aloha()
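For reference, the episode loop above follows a standard open-loop action-chunking pattern: query the policy once, execute a chunk of actions, and requery only when the queue is empty. A minimal self-contained sketch of that pattern follows; `query_policy` is a hypothetical stand-in for the remote VLA server call (`get_action_from_server`), and the 14-dimensional action (bimanual ALOHA joint targets) is an assumption for illustration.

from collections import deque

import numpy as np

NUM_OPEN_LOOP_STEPS = 25  # mirrors cfg.num_open_loop_steps above

def query_policy(observation):
    # Hypothetical stand-in: the real script POSTs the observation to the
    # server's /act endpoint and receives a chunk of future actions.
    return np.zeros((NUM_OPEN_LOOP_STEPS, 14))  # assumed 14-DoF bimanual actions

action_queue = deque(maxlen=NUM_OPEN_LOOP_STEPS)
for t in range(100):
    if len(action_queue) == 0:  # chunk exhausted -> requery the policy
        action_queue.extend(query_policy({"t": t}))
    action = action_queue.popleft()
    # env.step(action.tolist()) would execute the action here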
capvector-oft/experiments/robot/libero/libero_requirements.txt DELETED
@@ -1,6 +0,0 @@
- imageio[ffmpeg]
- robosuite==1.4.1
- bddl
- easydict
- cloudpickle
- gym
capvector-oft/experiments/robot/libero/libero_utils.py DELETED
@@ -1,87 +0,0 @@
- """Utils for evaluating policies in LIBERO simulation environments."""
-
- import math
- import os
-
- import imageio
- import numpy as np
- import tensorflow as tf
- from libero.libero import get_libero_path
- from libero.libero.envs import OffScreenRenderEnv
-
- from experiments.robot.robot_utils import (
-     DATE,
-     DATE_TIME,
- )
-
-
- def get_libero_env(task, model_family, resolution=256):
-     """Initializes and returns the LIBERO environment, along with the task description."""
-     task_description = task.language
-     task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
-     env_args = {"bddl_file_name": task_bddl_file, "camera_heights": resolution, "camera_widths": resolution}
-     env = OffScreenRenderEnv(**env_args)
-     env.seed(0)  # IMPORTANT: seed seems to affect object positions even when using fixed initial state
-     return env, task_description
-
-
- def get_libero_dummy_action(model_family: str):
-     """Get dummy/no-op action, used to roll out the simulation while the robot does nothing."""
-     return [0, 0, 0, 0, 0, 0, -1]
-
-
- def get_libero_image(obs):
-     """Extracts third-person image from observations and preprocesses it."""
-     img = obs["agentview_image"]
-     img = img[::-1, ::-1]  # IMPORTANT: rotate 180 degrees to match train preprocessing
-     return img
-
-
- def get_libero_wrist_image(obs):
-     """Extracts wrist camera image from observations and preprocesses it."""
-     img = obs["robot0_eye_in_hand_image"]
-     img = img[::-1, ::-1]  # IMPORTANT: rotate 180 degrees to match train preprocessing
-     return img
-
-
- def save_rollout_video(rollout_images, idx, success, task_description, log_file=None):
-     """Saves an MP4 replay of an episode."""
-     rollout_dir = f"./rollouts/{DATE}"
-     os.makedirs(rollout_dir, exist_ok=True)
-     processed_task_description = task_description.lower().replace(" ", "_").replace("\n", "_").replace(".", "_")[:50]
-     mp4_path = f"{rollout_dir}/{DATE_TIME}--openvla_oft--episode={idx}--success={success}--task={processed_task_description}.mp4"
-     video_writer = imageio.get_writer(mp4_path, fps=30)
-     for img in rollout_images:
-         video_writer.append_data(img)
-     video_writer.close()
-     print(f"Saved rollout MP4 at path {mp4_path}")
-     if log_file is not None:
-         log_file.write(f"Saved rollout MP4 at path {mp4_path}\n")
-     return mp4_path
-
-
- def quat2axisangle(quat):
-     """
-     Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
-
-     Converts quaternion to axis-angle format.
-     Returns a unit vector direction scaled by its angle in radians.
-
-     Args:
-         quat (np.array): (x,y,z,w) vec4 float angles
-
-     Returns:
-         np.array: (ax,ay,az) axis-angle exponential coordinates
-     """
-     # clip quaternion
-     if quat[3] > 1.0:
-         quat[3] = 1.0
-     elif quat[3] < -1.0:
-         quat[3] = -1.0
-
-     den = np.sqrt(1.0 - quat[3] * quat[3])
-     if math.isclose(den, 0.0):
-         # This is (close to) a zero degree rotation, immediately return
-         return np.zeros(3)
-
-     return (quat[:3] * 2.0 * math.acos(quat[3])) / den
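As a quick sanity check of quat2axisangle above (toy values, assuming the function defined in this file is in scope): a 90-degree rotation about the z-axis is the quaternion (x, y, z, w) = (0, 0, sin 45°, cos 45°) and should map to the axis-angle vector (0, 0, pi/2).

import math

import numpy as np

quat = np.array([0.0, 0.0, math.sin(math.pi / 4), math.cos(math.pi / 4)])
axis_angle = quat2axisangle(quat)  # the function defined above
assert np.allclose(axis_angle, [0.0, 0.0, math.pi / 2])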
capvector-oft/experiments/robot/libero/regenerate_libero_dataset.py DELETED
@@ -1,249 +0,0 @@
- """
- Regenerates a LIBERO dataset (HDF5 files) by replaying demonstrations in the environments.
-
- Notes:
-     - We save image observations at 256x256px resolution (instead of 128x128).
-     - We filter out transitions with "no-op" (zero) actions that do not change the robot's state.
-     - We filter out unsuccessful demonstrations.
-     - In the LIBERO HDF5 data -> RLDS data conversion (not shown here), we rotate the images by
-       180 degrees because we observe that the environments return images that are upside down
-       on our platform.
-
- Usage:
-     python experiments/robot/libero/regenerate_libero_dataset.py \
-         --libero_task_suite [ libero_spatial | libero_object | libero_goal | libero_10 ] \
-         --libero_raw_data_dir <PATH TO RAW HDF5 DATASET DIR> \
-         --libero_target_dir <PATH TO TARGET DIR>
-
- Example (LIBERO-Spatial):
-     python experiments/robot/libero/regenerate_libero_dataset.py \
-         --libero_task_suite libero_spatial \
-         --libero_raw_data_dir ./LIBERO/libero/datasets/libero_spatial \
-         --libero_target_dir ./LIBERO/libero/datasets/libero_spatial_no_noops
-
- """
-
- import argparse
- import json
- import os
- import time
-
- import h5py
- import numpy as np
- import robosuite.utils.transform_utils as T
- import tqdm
- from libero.libero import benchmark
-
- from experiments.robot.libero.libero_utils import (
-     get_libero_dummy_action,
-     get_libero_env,
- )
-
-
- IMAGE_RESOLUTION = 256
-
-
- def is_noop(action, prev_action=None, threshold=1e-4):
-     """
-     Returns whether an action is a no-op action.
-
-     A no-op action satisfies two criteria:
-         (1) All action dimensions, except for the last one (gripper action), are near zero.
-         (2) The gripper action is equal to the previous timestep's gripper action.
-
-     Explanation of (2):
-         Naively filtering out actions with just criterion (1) is not good because you will
-         remove actions where the robot is staying still but opening/closing its gripper.
-         So you also need to consider the current state (by checking the previous timestep's
-         gripper action as a proxy) to determine whether the action really is a no-op.
-     """
-     # Special case: Previous action is None if this is the first action in the episode
-     # Then we only care about criterion (1)
-     if prev_action is None:
-         return np.linalg.norm(action[:-1]) < threshold
-
-     # Normal case: Check both criteria (1) and (2)
-     gripper_action = action[-1]
-     prev_gripper_action = prev_action[-1]
-     return np.linalg.norm(action[:-1]) < threshold and gripper_action == prev_gripper_action
-
-
- def main(args):
-     print(f"Regenerating {args.libero_task_suite} dataset!")
-
-     # Create target directory
-     if os.path.isdir(args.libero_target_dir):
-         user_input = input(f"Target directory already exists at path: {args.libero_target_dir}\nEnter 'y' to overwrite the directory, or anything else to exit: ")
-         if user_input != 'y':
-             exit()
-     os.makedirs(args.libero_target_dir, exist_ok=True)
-
-     # Prepare JSON file to record success/failure and initial states per episode
-     metainfo_json_dict = {}
-     metainfo_json_out_path = f"./experiments/robot/libero/{args.libero_task_suite}_metainfo.json"
-     with open(metainfo_json_out_path, "w") as f:
-         # Just test that we can write to this file (we overwrite it later)
-         json.dump(metainfo_json_dict, f)
-
-     # Get task suite
-     benchmark_dict = benchmark.get_benchmark_dict()
-     task_suite = benchmark_dict[args.libero_task_suite]()
-     num_tasks_in_suite = task_suite.n_tasks
-
-     # Setup
-     num_replays = 0
-     num_success = 0
-     num_noops = 0
-
-     for task_id in tqdm.tqdm(range(num_tasks_in_suite)):
-         # Get task in suite
-         task = task_suite.get_task(task_id)
-         env, task_description = get_libero_env(task, "llava", resolution=IMAGE_RESOLUTION)
-
-         # Get dataset for task
-         orig_data_path = os.path.join(args.libero_raw_data_dir, f"{task.name}_demo.hdf5")
-         assert os.path.exists(orig_data_path), f"Cannot find raw data file {orig_data_path}."
-         orig_data_file = h5py.File(orig_data_path, "r")
-         orig_data = orig_data_file["data"]
-
-         # Create new HDF5 file for regenerated demos
-         new_data_path = os.path.join(args.libero_target_dir, f"{task.name}_demo.hdf5")
-         new_data_file = h5py.File(new_data_path, "w")
-         grp = new_data_file.create_group("data")
-
-         for i in range(len(orig_data.keys())):
-             # Get demo data
-             demo_data = orig_data[f"demo_{i}"]
-             orig_actions = demo_data["actions"][()]
-             orig_states = demo_data["states"][()]
-
-             # Reset environment, set initial state, and wait a few steps for environment to settle
-             env.reset()
-             env.set_init_state(orig_states[0])
-             for _ in range(10):
-                 obs, reward, done, info = env.step(get_libero_dummy_action("llava"))
-
-             # Set up new data lists
-             states = []
-             actions = []
-             ee_states = []
-             gripper_states = []
-             joint_states = []
-             robot_states = []
-             agentview_images = []
-             eye_in_hand_images = []
-
-             # Replay original demo actions in environment and record observations
-             for _, action in enumerate(orig_actions):
-                 # Skip transitions with no-op actions
-                 prev_action = actions[-1] if len(actions) > 0 else None
-                 if is_noop(action, prev_action):
-                     print(f"\tSkipping no-op action: {action}")
-                     num_noops += 1
-                     continue
-
-                 if states == []:
-                     # In the first timestep, since we're using the original initial state to initialize the environment,
-                     # copy the initial state (first state in episode) over from the original HDF5 to the new one
-                     states.append(orig_states[0])
-                     robot_states.append(demo_data["robot_states"][0])
-                 else:
-                     # For all other timesteps, get state from environment and record it
-                     states.append(env.sim.get_state().flatten())
-                     robot_states.append(
-                         np.concatenate([obs["robot0_gripper_qpos"], obs["robot0_eef_pos"], obs["robot0_eef_quat"]])
-                     )
-
-                 # Record original action (from demo)
-                 actions.append(action)
-
-                 # Record data returned by environment
-                 if "robot0_gripper_qpos" in obs:
-                     gripper_states.append(obs["robot0_gripper_qpos"])
-                 joint_states.append(obs["robot0_joint_pos"])
-                 ee_states.append(
-                     np.hstack(
-                         (
-                             obs["robot0_eef_pos"],
-                             T.quat2axisangle(obs["robot0_eef_quat"]),
-                         )
-                     )
-                 )
-                 agentview_images.append(obs["agentview_image"])
-                 eye_in_hand_images.append(obs["robot0_eye_in_hand_image"])
-
-                 # Execute demo action in environment
-                 obs, reward, done, info = env.step(action.tolist())
-
-             # At end of episode, save replayed trajectories to new HDF5 files (only keep successes)
-             if done:
-                 dones = np.zeros(len(actions)).astype(np.uint8)
-                 dones[-1] = 1
-                 rewards = np.zeros(len(actions)).astype(np.uint8)
-                 rewards[-1] = 1
-                 assert len(actions) == len(agentview_images)
-
-                 ep_data_grp = grp.create_group(f"demo_{i}")
-                 obs_grp = ep_data_grp.create_group("obs")
-                 obs_grp.create_dataset("gripper_states", data=np.stack(gripper_states, axis=0))
-                 obs_grp.create_dataset("joint_states", data=np.stack(joint_states, axis=0))
-                 obs_grp.create_dataset("ee_states", data=np.stack(ee_states, axis=0))
-                 obs_grp.create_dataset("ee_pos", data=np.stack(ee_states, axis=0)[:, :3])
-                 obs_grp.create_dataset("ee_ori", data=np.stack(ee_states, axis=0)[:, 3:])
-                 obs_grp.create_dataset("agentview_rgb", data=np.stack(agentview_images, axis=0))
-                 obs_grp.create_dataset("eye_in_hand_rgb", data=np.stack(eye_in_hand_images, axis=0))
-                 ep_data_grp.create_dataset("actions", data=actions)
-                 ep_data_grp.create_dataset("states", data=np.stack(states))
-                 ep_data_grp.create_dataset("robot_states", data=np.stack(robot_states, axis=0))
-                 ep_data_grp.create_dataset("rewards", data=rewards)
-                 ep_data_grp.create_dataset("dones", data=dones)
-
-                 num_success += 1
-
-             num_replays += 1
-
-             # Record success/failure and initial environment state in metainfo dict
-             task_key = task_description.replace(" ", "_")
-             episode_key = f"demo_{i}"
-             if task_key not in metainfo_json_dict:
-                 metainfo_json_dict[task_key] = {}
-             if episode_key not in metainfo_json_dict[task_key]:
-                 metainfo_json_dict[task_key][episode_key] = {}
-             metainfo_json_dict[task_key][episode_key]["success"] = bool(done)
-             metainfo_json_dict[task_key][episode_key]["initial_state"] = orig_states[0].tolist()
-
-             # Write metainfo dict to JSON file
-             # (We repeatedly overwrite, rather than doing this once at the end, just in case the script crashes midway)
-             with open(metainfo_json_out_path, "w") as f:
-                 json.dump(metainfo_json_dict, f, indent=2)
-
-             # Count total number of successful replays so far
-             print(
-                 f"Total # episodes replayed: {num_replays}, Total # successes: {num_success} ({num_success / num_replays * 100:.1f} %)"
-             )
-
-             # Report total number of no-op actions filtered out so far
-             print(f" Total # no-op actions filtered out: {num_noops}")
-
-         # Close HDF5 files
-         orig_data_file.close()
-         new_data_file.close()
-         print(f"Saved regenerated demos for task '{task_description}' at: {new_data_path}")
-
-     print(f"Dataset regeneration complete! Saved new dataset at: {args.libero_target_dir}")
-     print(f"Saved metainfo JSON at: {metainfo_json_out_path}")
-
-
- if __name__ == "__main__":
-     # Parse command-line arguments
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--libero_task_suite", type=str, choices=["libero_spatial", "libero_object", "libero_goal", "libero_10", "libero_90"],
-                         help="LIBERO task suite. Example: libero_spatial", required=True)
-     parser.add_argument("--libero_raw_data_dir", type=str,
-                         help="Path to directory containing raw HDF5 dataset. Example: ./LIBERO/libero/datasets/libero_spatial", required=True)
-     parser.add_argument("--libero_target_dir", type=str,
-                         help="Path to regenerated dataset directory. Example: ./LIBERO/libero/datasets/libero_spatial_no_noops", required=True)
-     args = parser.parse_args()
-
-     # Start data regeneration
-     main(args)
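The two filtering criteria in is_noop above can be seen on toy 7-DoF actions (6 pose deltas plus gripper), assuming the function defined in this file is in scope: a zero-motion action with an unchanged gripper is filtered out, while the same zero motion with a toggled gripper is kept.

import numpy as np

prev = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0])
still_same_grip = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0])
still_new_grip = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0])

assert is_noop(still_same_grip, prev)      # no motion, gripper unchanged -> filtered out
assert not is_noop(still_new_grip, prev)   # no motion, but gripper toggles -> kept
assert is_noop(still_same_grip, None)      # first action: only criterion (1) applies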
capvector-oft/experiments/robot/libero/run_libero_eval.py DELETED
@@ -1,540 +0,0 @@
- """
- run_libero_eval.py
-
- Evaluates a trained policy in a LIBERO simulation benchmark task suite.
- """
-
- import json
- import logging
- import os
- import sys
- from collections import deque
- from dataclasses import dataclass
- from enum import Enum
- from pathlib import Path
- from typing import Optional, Union
-
- import draccus
- import numpy as np
- import tqdm
- from libero.libero import benchmark
-
- import wandb
-
- # Append current directory so that interpreter can find experiments.robot
- sys.path.append("../..")
- from experiments.robot.libero.libero_utils import (
-     get_libero_dummy_action,
-     get_libero_env,
-     get_libero_image,
-     get_libero_wrist_image,
-     quat2axisangle,
-     save_rollout_video,
- )
- from experiments.robot.openvla_utils import (
-     get_action_head,
-     get_noisy_action_projector,
-     get_processor,
-     get_proprio_projector,
-     resize_image_for_policy,
- )
- from experiments.robot.robot_utils import (
-     DATE_TIME,
-     get_action,
-     get_image_resize_size,
-     get_model,
-     invert_gripper_action,
-     normalize_gripper_action,
-     set_seed_everywhere,
- )
- from prismatic.vla.constants import NUM_ACTIONS_CHUNK
-
-
- # import debugpy
- # try:
- #     debugpy.listen(("localhost", 9501))
- #     print("Waiting for debugger attach")
- #     debugpy.wait_for_client()
- # except Exception as e:
- #     pass
-
-
- # Define task suite constants
- class TaskSuite(str, Enum):
-     LIBERO_SPATIAL = "libero_spatial"
-     LIBERO_OBJECT = "libero_object"
-     LIBERO_GOAL = "libero_goal"
-     LIBERO_10 = "libero_10"
-     LIBERO_90 = "libero_90"
-
-
- # Define max steps for each task suite
- TASK_MAX_STEPS = {
-     TaskSuite.LIBERO_SPATIAL: 220,  # longest training demo has 193 steps
-     TaskSuite.LIBERO_OBJECT: 280,   # longest training demo has 254 steps
-     TaskSuite.LIBERO_GOAL: 300,     # longest training demo has 270 steps
-     TaskSuite.LIBERO_10: 520,       # longest training demo has 505 steps
-     TaskSuite.LIBERO_90: 400,       # longest training demo has 373 steps
- }
-
-
- # Set up logging
- logging.basicConfig(
-     level=logging.INFO,
-     format="%(asctime)s [%(levelname)s] %(message)s",
-     handlers=[logging.StreamHandler()],
- )
- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class GenerateConfig:
-     # fmt: off
-
-     #################################################################################################################
-     # Model-specific parameters
-     #################################################################################################################
-     model_family: str = "openvla"                    # Model family
-     pretrained_checkpoint: Union[str, Path] = ""     # Pretrained checkpoint path
-
-     use_l1_regression: bool = True                   # If True, uses continuous action head with L1 regression objective
-     use_diffusion: bool = False                      # If True, uses continuous action head with diffusion modeling objective (DDIM)
-     num_diffusion_steps_train: int = 50              # (When `use_diffusion==True`) Number of diffusion steps used for training
-     num_diffusion_steps_inference: int = 50          # (When `use_diffusion==True`) Number of diffusion steps used for inference
-     use_film: bool = False                           # If True, uses FiLM to infuse language inputs into visual features
-     num_images_in_input: int = 2                     # Number of images in the VLA input
-     use_proprio: bool = True                         # Whether to include proprio state in input
-
-     center_crop: bool = True                         # Center crop? (if trained w/ random crop image aug)
-     num_open_loop_steps: int = 8                     # Number of actions to execute open-loop before requerying policy
-
-     lora_rank: int = 32                              # Rank of LoRA weight matrix (MAKE SURE THIS MATCHES TRAINING!)
-
-     unnorm_key: Union[str, Path] = ""                # Action un-normalization key
-
-     load_in_8bit: bool = False                       # (For OpenVLA only) Load with 8-bit quantization
-     load_in_4bit: bool = False                       # (For OpenVLA only) Load with 4-bit quantization
-
-     #################################################################################################################
-     # LIBERO environment-specific parameters
-     #################################################################################################################
-     task_suite_name: str = TaskSuite.LIBERO_SPATIAL  # Task suite
-     num_steps_wait: int = 10                         # Number of steps to wait for objects to stabilize in sim
-     num_trials_per_task: int = 50                    # Number of rollouts per task
-     initial_states_path: str = "DEFAULT"             # "DEFAULT", or path to initial states JSON file
-     env_img_res: int = 256                           # Resolution for environment images (not policy input resolution)
-
-     #################################################################################################################
-     # Utils
-     #################################################################################################################
-     run_id_note: Optional[str] = None                # Extra note to add to end of run ID for logging
-     local_log_dir: str = "./experiments/logs"        # Local directory for eval logs
-
-     use_wandb: bool = False                          # Whether to also log results in Weights & Biases
-     wandb_entity: str = "your-wandb-entity"          # Name of WandB entity
-     wandb_project: str = "your-wandb-project"        # Name of WandB project
-
-     seed: int = 7                                    # Random Seed (for reproducibility)
-
-     # fmt: on
-
-
- def validate_config(cfg: GenerateConfig) -> None:
-     """Validate configuration parameters."""
-     assert cfg.pretrained_checkpoint is not None, "pretrained_checkpoint must not be None!"
-
-     if "image_aug" in str(cfg.pretrained_checkpoint):
-         assert cfg.center_crop, "Expecting `center_crop==True` because model was trained with image augmentations!"
-
-     assert not (cfg.load_in_8bit and cfg.load_in_4bit), "Cannot use both 8-bit and 4-bit quantization!"
-
-     # Validate task suite
-     assert cfg.task_suite_name in [suite.value for suite in TaskSuite], f"Invalid task suite: {cfg.task_suite_name}"
-
-
- def initialize_model(cfg: GenerateConfig):
-     """Initialize model and associated components."""
-     # Load model
-     model = get_model(cfg)
-
-     # Load proprio projector if needed
-     proprio_projector = None
-     if cfg.use_proprio:
-         proprio_projector = get_proprio_projector(
-             cfg,
-             model.llm_dim,
-             proprio_dim=8,  # 8-dimensional proprio for LIBERO
-         )
-
-     # Load action head if needed
-     action_head = None
-     if cfg.use_l1_regression or cfg.use_diffusion:
-         action_head = get_action_head(cfg, model.llm_dim)
-
-     # Load noisy action projector if using diffusion
-     noisy_action_projector = None
-     if cfg.use_diffusion:
-         noisy_action_projector = get_noisy_action_projector(cfg, model.llm_dim)
-
-     # Get OpenVLA processor if needed
-     processor = None
-     if cfg.model_family == "openvla":
-         processor = get_processor(cfg)
-         check_unnorm_key(cfg, model)
-
-     return model, action_head, proprio_projector, noisy_action_projector, processor
-
-
- def check_unnorm_key(cfg: GenerateConfig, model) -> None:
-     """Check that the model contains the action un-normalization key."""
-     # Initialize unnorm_key
-     unnorm_key = cfg.task_suite_name
-
-     # In some cases, the key must be manually modified (e.g. after training on a modified version of the dataset
-     # with the suffix "_no_noops" in the dataset name)
-     if unnorm_key not in model.norm_stats and f"{unnorm_key}_no_noops" in model.norm_stats:
-         unnorm_key = f"{unnorm_key}_no_noops"
-
-     assert unnorm_key in model.norm_stats, f"Action un-norm key {unnorm_key} not found in VLA `norm_stats`!"
-
-     # Set the unnorm_key in cfg
-     cfg.unnorm_key = unnorm_key
-
-
- def setup_logging(cfg: GenerateConfig):
-     """Set up logging to file and optionally to wandb."""
-     # Create run ID
-     run_id = f"EVAL-{cfg.task_suite_name}-{cfg.model_family}-{DATE_TIME}"
-     if cfg.run_id_note is not None:
-         run_id += f"--{cfg.run_id_note}"
-
-     # Set up local logging
-     os.makedirs(cfg.local_log_dir, exist_ok=True)
-     local_log_filepath = os.path.join(cfg.local_log_dir, run_id + ".txt")
-     log_file = open(local_log_filepath, "w")
-     logger.info(f"Logging to local log file: {local_log_filepath}")
-
-     # Initialize Weights & Biases logging if enabled
-     if cfg.use_wandb:
-         wandb.init(
-             entity=cfg.wandb_entity,
-             project=cfg.wandb_project,
-             name=run_id,
-         )
-
-     return log_file, local_log_filepath, run_id
-
-
- def log_message(message: str, log_file=None):
-     """Log a message to console and optionally to a log file."""
-     logger.info(message)
-     if log_file:
-         log_file.write(message + "\n")
-         log_file.flush()
-
-
- def load_initial_states(cfg: GenerateConfig, task_suite, task_id: int, log_file=None):
-     """Load initial states for the given task."""
-     # Get default initial states
-     initial_states = task_suite.get_task_init_states(task_id)
-
-     # If using custom initial states, load them from file
-     if cfg.initial_states_path != "DEFAULT":
-         with open(cfg.initial_states_path, "r") as f:
-             all_initial_states = json.load(f)
-         log_message(f"Using initial states from {cfg.initial_states_path}", log_file)
-         return initial_states, all_initial_states
-     else:
-         log_message("Using default initial states", log_file)
-         return initial_states, None
-
-
- def prepare_observation(obs, resize_size):
-     """Prepare observation for policy input."""
-     # Get preprocessed images
-     img = get_libero_image(obs)
-     wrist_img = get_libero_wrist_image(obs)
-
-     # Resize images to size expected by model
-     img_resized = resize_image_for_policy(img, resize_size)
-     wrist_img_resized = resize_image_for_policy(wrist_img, resize_size)
-
-     # Prepare observations dict
-     observation = {
-         "full_image": img_resized,
-         "wrist_image": wrist_img_resized,
-         "state": np.concatenate(
-             (obs["robot0_eef_pos"], quat2axisangle(obs["robot0_eef_quat"]), obs["robot0_gripper_qpos"])
-         ),
-     }
-
-     return observation, img  # Return both processed observation and original image for replay
-
-
- def process_action(action, model_family):
-     """Process action before sending to environment."""
-     # Normalize gripper action [0,1] -> [-1,+1] because the environment expects the latter
-     action = normalize_gripper_action(action, binarize=True)
-
-     # [OpenVLA] The dataloader flips the sign of the gripper action to align with other datasets
-     # (0 = close, 1 = open), so flip it back (-1 = open, +1 = close) before executing the action
-     if model_family == "openvla":
-         action = invert_gripper_action(action)
-
-     return action
-
-
- def run_episode(
-     cfg: GenerateConfig,
-     env,
-     task_description: str,
-     model,
-     resize_size,
-     processor=None,
-     action_head=None,
-     proprio_projector=None,
-     noisy_action_projector=None,
-     initial_state=None,
-     log_file=None,
- ):
-     """Run a single episode in the environment."""
-     # Reset environment
-     env.reset()
-
-     # Set initial state if provided
-     if initial_state is not None:
-         obs = env.set_init_state(initial_state)
-     else:
-         obs = env.get_observation()
-
-     # Initialize action queue
-     if cfg.num_open_loop_steps != NUM_ACTIONS_CHUNK:
-         print(f"WARNING: cfg.num_open_loop_steps ({cfg.num_open_loop_steps}) does not match the NUM_ACTIONS_CHUNK "
-               f"({NUM_ACTIONS_CHUNK}) constant defined in prismatic.vla.constants! For best performance (in terms of "
-               "both speed and success rate), we recommend executing the full action chunk.")
-     action_queue = deque(maxlen=cfg.num_open_loop_steps)
-
-     # Setup
-     t = 0
-     replay_images = []
-     max_steps = TASK_MAX_STEPS[cfg.task_suite_name]
-
-     # Run episode
-     success = False
-     try:
-         while t < max_steps + cfg.num_steps_wait:
-             # Do nothing for the first few timesteps to let objects stabilize
-             if t < cfg.num_steps_wait:
-                 obs, reward, done, info = env.step(get_libero_dummy_action(cfg.model_family))
-                 t += 1
-                 continue
-
-             # Prepare observation
-             observation, img = prepare_observation(obs, resize_size)
-             replay_images.append(img)
-
-             # If action queue is empty, requery model
-             if len(action_queue) == 0:
-                 # Query model to get action
-                 actions = get_action(
-                     cfg,
-                     model,
-                     observation,
-                     task_description,
-                     processor=processor,
-                     action_head=action_head,
-                     proprio_projector=proprio_projector,
-                     noisy_action_projector=noisy_action_projector,
-                     use_film=cfg.use_film,
-                 )
-                 action_queue.extend(actions)
-
-             # Get action from queue
-             action = action_queue.popleft()
-
-             # Process action
-             action = process_action(action, cfg.model_family)
-
-             # Execute action in environment
-             obs, reward, done, info = env.step(action.tolist())
-             if done:
-                 success = True
-                 break
-             t += 1
-
-     except Exception as e:
-         log_message(f"Episode error: {e}", log_file)
-
-     return success, replay_images
-
-
- def run_task(
-     cfg: GenerateConfig,
-     task_suite,
-     task_id: int,
-     model,
-     resize_size,
-     processor=None,
-     action_head=None,
-     proprio_projector=None,
-     noisy_action_projector=None,
-     total_episodes=0,
-     total_successes=0,
-     log_file=None,
- ):
-     """Run evaluation for a single task."""
-     # Get task
-     task = task_suite.get_task(task_id)
-
-     # Get initial states
-     initial_states, all_initial_states = load_initial_states(cfg, task_suite, task_id, log_file)
-
-     # Initialize environment and get task description
-     env, task_description = get_libero_env(task, cfg.model_family, resolution=cfg.env_img_res)
-
-     # Start episodes
-     task_episodes, task_successes = 0, 0
-     for episode_idx in tqdm.tqdm(range(cfg.num_trials_per_task)):
-         log_message(f"\nTask: {task_description}", log_file)
-
-         # Handle initial state
-         if cfg.initial_states_path == "DEFAULT":
-             # Use default initial state
-             initial_state = initial_states[episode_idx]
-         else:
-             # Get keys for fetching initial episode state from JSON
-             initial_states_task_key = task_description.replace(" ", "_")
-             episode_key = f"demo_{episode_idx}"
-
-             # Skip episode if expert demonstration failed to complete the task
-             if not all_initial_states[initial_states_task_key][episode_key]["success"]:
-                 log_message(f"Skipping task {task_id} episode {episode_idx} due to failed expert demo!", log_file)
-                 continue
-
-             # Get initial state
-             initial_state = np.array(all_initial_states[initial_states_task_key][episode_key]["initial_state"])
-
-         log_message(f"Starting episode {task_episodes + 1}...", log_file)
-
-         # Run episode
-         success, replay_images = run_episode(
-             cfg,
-             env,
-             task_description,
-             model,
-             resize_size,
-             processor,
-             action_head,
-             proprio_projector,
-             noisy_action_projector,
-             initial_state,
-             log_file,
-         )
-
-         # Update counters
-         task_episodes += 1
-         total_episodes += 1
-         if success:
-             task_successes += 1
-             total_successes += 1
-
-         # Save replay video
-         save_rollout_video(
-             replay_images, total_episodes, success=success, task_description=task_description, log_file=log_file
-         )
-
-         # Log results
-         log_message(f"Success: {success}", log_file)
-         log_message(f"# episodes completed so far: {total_episodes}", log_file)
-         log_message(f"# successes: {total_successes} ({total_successes / total_episodes * 100:.1f}%)", log_file)
-
-     # Log task results
-     task_success_rate = float(task_successes) / float(task_episodes) if task_episodes > 0 else 0
-     total_success_rate = float(total_successes) / float(total_episodes) if total_episodes > 0 else 0
-
-     log_message(f"Current task success rate: {task_success_rate}", log_file)
-     log_message(f"Current total success rate: {total_success_rate}", log_file)
-
-     # Log to wandb if enabled
-     if cfg.use_wandb:
-         wandb.log(
-             {
-                 f"success_rate/{task_description}": task_success_rate,
-                 f"num_episodes/{task_description}": task_episodes,
-             }
-         )
-
-     return total_episodes, total_successes
-
-
- @draccus.wrap()
- def eval_libero(cfg: GenerateConfig) -> float:
-     """Main function to evaluate a trained policy on LIBERO benchmark tasks."""
-     # Validate configuration
-     validate_config(cfg)
-
-     # Set random seed
-     set_seed_everywhere(cfg.seed)
-
-     # Initialize model and components
-     model, action_head, proprio_projector, noisy_action_projector, processor = initialize_model(cfg)
-
-     # Get expected image dimensions
-     resize_size = get_image_resize_size(cfg)
-
-     # Setup logging
-     log_file, local_log_filepath, run_id = setup_logging(cfg)
-
-     # Initialize LIBERO task suite
-     benchmark_dict = benchmark.get_benchmark_dict()
-     task_suite = benchmark_dict[cfg.task_suite_name]()
-     num_tasks = task_suite.n_tasks
-
-     log_message(f"Task suite: {cfg.task_suite_name}", log_file)
-
-     # Start evaluation
-     total_episodes, total_successes = 0, 0
-     for task_id in tqdm.tqdm(range(num_tasks)):
-         total_episodes, total_successes = run_task(
-             cfg,
-             task_suite,
-             task_id,
-             model,
-             resize_size,
-             processor,
-             action_head,
-             proprio_projector,
-             noisy_action_projector,
-             total_episodes,
-             total_successes,
-             log_file,
-         )
-
-     # Calculate final success rate
-     final_success_rate = float(total_successes) / float(total_episodes) if total_episodes > 0 else 0
-
-     # Log final results
-     log_message("Final results:", log_file)
-     log_message(f"Total episodes: {total_episodes}", log_file)
-     log_message(f"Total successes: {total_successes}", log_file)
-     log_message(f"Overall success rate: {final_success_rate:.4f} ({final_success_rate * 100:.1f}%)", log_file)
-
-     # Log to wandb if enabled
-     if cfg.use_wandb:
-         wandb.log(
-             {
-                 "success_rate/total": final_success_rate,
-                 "num_episodes/total": total_episodes,
-             }
-         )
-         wandb.save(local_log_filepath)
-
-     # Close log file
-     if log_file:
-         log_file.close()
-
-     return final_success_rate
-
-
- if __name__ == "__main__":
-     eval_libero()
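The gripper post-processing in process_action above relies on two helpers from experiments.robot.robot_utils that are not shown in this diff. The sketch below only illustrates their assumed behavior (a [0, 1] -> [-1, +1] remap with optional binarization, and a sign flip); it is not the actual implementation, so the function names carry a `_sketch` suffix.

import numpy as np

def normalize_gripper_action_sketch(action, binarize=True):
    # Assumed behavior: remap the last (gripper) dimension from [0, 1] to [-1, +1]
    action = np.asarray(action, dtype=float).copy()
    action[-1] = action[-1] * 2.0 - 1.0
    if binarize:
        action[-1] = 1.0 if action[-1] >= 0.0 else -1.0  # snap to exactly open/closed
    return action

def invert_gripper_action_sketch(action):
    # Assumed behavior: flip the gripper sign convention (-1 = open, +1 = close)
    action = np.asarray(action, dtype=float).copy()
    action[-1] *= -1.0
    return action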
capvector-oft/experiments/robot/libero/sample_libero_spatial_observation.pkl DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:326db6c78dd0a9d91c11f05af03b93fa3095338ee3cb5a5eb15adf3d87eb0109
- size 301501
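The three lines above are a Git LFS pointer, not the pickle itself: the repository stores only the file's SHA-256 and byte size, and LFS serves the real content. If the actual file is needed, it can be fetched with huggingface_hub; the repo_id below is a placeholder, not the real repository name.

from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="<user>/<this-repo>",  # placeholder: substitute the actual Hub repo id
    filename="capvector-oft/experiments/robot/libero/sample_libero_spatial_observation.pkl",
)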
capvector-oft/experiments/robot/openvla_utils.py DELETED
@@ -1,818 +0,0 @@
1
- """Utils for evaluating OpenVLA or fine-tuned OpenVLA policies."""
2
-
3
- import filecmp
4
- import json
5
- import os
6
- import shutil
7
- import time
8
- from datetime import datetime
9
- from pathlib import Path
10
- from typing import Any, Dict, List, Optional, Tuple, Union
11
-
12
- import json_numpy
13
- import numpy as np
14
- import requests
15
- import tensorflow as tf
16
- import torch
17
- from huggingface_hub import HfApi, hf_hub_download
18
- from PIL import Image
19
- from transformers import AutoConfig, AutoImageProcessor, AutoModelForVision2Seq, AutoProcessor
20
-
21
- # Apply JSON numpy patch for serialization
22
- json_numpy.patch()
23
-
24
- from prismatic.extern.hf.configuration_prismatic import OpenVLAConfig
25
- from prismatic.extern.hf.modeling_prismatic import OpenVLAForActionPrediction
26
- from prismatic.extern.hf.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
27
- from prismatic.models.action_heads import DiffusionActionHead, L1RegressionActionHead
28
- from prismatic.models.film_vit_wrapper import FiLMedPrismaticVisionBackbone
29
- from prismatic.models.projectors import NoisyActionProjector, ProprioProjector
30
- from prismatic.vla.constants import (
31
- ACTION_DIM,
32
- ACTION_PROPRIO_NORMALIZATION_TYPE,
33
- )
34
- from prismatic.vla.datasets.rlds.utils.data_utils import NormalizationType
35
-
36
- # Initialize important constants
37
- DATE = time.strftime("%Y_%m_%d")
38
- DATE_TIME = time.strftime("%Y_%m_%d-%H_%M_%S")
39
- DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
40
- OPENVLA_IMAGE_SIZE = 224 # Standard image size expected by OpenVLA
41
-
42
- # Configure NumPy print settings
43
- np.set_printoptions(formatter={"float": lambda x: "{0:0.3f}".format(x)})
44
-
45
-
46
- def model_is_on_hf_hub(model_path: str) -> bool:
47
- """Checks whether a model path points to a model on Hugging Face Hub."""
48
- # If the API call below runs without error, the model is on the hub
49
- try:
50
- HfApi().model_info(model_path)
51
- return True
52
- except Exception:
53
- return False
54
-
55
-
56
- def update_auto_map(pretrained_checkpoint: str) -> None:
57
- """
58
- Update the AutoMap configuration in the checkpoint config.json file.
59
-
60
- This loads the config.json file inside the checkpoint directory and overwrites
61
- the AutoConfig and AutoModelForVision2Seq fields to use OpenVLA-specific classes.
62
-
63
- Args:
64
- pretrained_checkpoint: Path to the checkpoint directory
65
- """
66
- if not os.path.isdir(pretrained_checkpoint):
67
- return
68
-
69
- config_path = os.path.join(pretrained_checkpoint, "config.json")
70
- if not os.path.exists(config_path):
71
- print(f"Warning: No config.json found at {config_path}")
72
- return
73
-
74
- # Create timestamped backup
75
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
76
- backup_path = os.path.join(pretrained_checkpoint, f"config.json.back.{timestamp}")
77
- shutil.copy2(config_path, backup_path)
78
- print(f"Created backup of original config at: {os.path.abspath(backup_path)}")
79
-
80
- # Read and update the config
81
- with open(config_path, "r") as f:
82
- config = json.load(f)
83
-
84
- config["auto_map"] = {
85
- "AutoConfig": "configuration_prismatic.OpenVLAConfig",
86
- "AutoModelForVision2Seq": "modeling_prismatic.OpenVLAForActionPrediction",
87
- }
88
-
89
- # Write back the updated config
90
- with open(config_path, "w") as f:
91
- json.dump(config, f, indent=2)
92
-
93
- print(f"Updated config.json at: {os.path.abspath(config_path)}")
94
- print("Changes made:")
95
- print(' - Set AutoConfig to "configuration_prismatic.OpenVLAConfig"')
96
- print(' - Set AutoModelForVision2Seq to "modeling_prismatic.OpenVLAForActionPrediction"')
97
-
98
-
99
- def check_identical_files(path1: Union[str, Path], path2: Union[str, Path]) -> bool:
100
- """
101
- Check if two files are identical in content.
102
-
103
- Args:
104
- path1: Path to the first file
105
- path2: Path to the second file
106
-
107
- Returns:
108
- bool: True if files are identical, False otherwise
109
- """
110
- path1, path2 = Path(path1), Path(path2)
111
-
112
- # First check if file sizes match
113
- if path1.stat().st_size != path2.stat().st_size:
114
- return False
115
-
116
- # Check if contents match
117
- return filecmp.cmp(path1, path2, shallow=False)
118
-
119
-
120
- def _handle_file_sync(curr_filepath: str, checkpoint_filepath: str, file_type: str) -> None:
121
- """
122
- Handle syncing of files between current directory and checkpoint.
123
-
124
- Creates backups if files exist but differ, and copies current versions to checkpoint.
125
-
126
- Args:
127
- curr_filepath: Path to the current file version
128
- checkpoint_filepath: Path where the file should be in the checkpoint
129
- file_type: Description of the file type for logging
130
- """
131
- if os.path.exists(checkpoint_filepath):
132
- # Check if existing files are identical
133
- match = check_identical_files(curr_filepath, checkpoint_filepath)
134
-
135
- if not match:
136
- print(
137
- "\n------------------------------------------------------------------------------------------------\n"
138
- f"Found mismatch between:\n"
139
- f"Current: {curr_filepath}\n"
140
- f"Checkpoint: {checkpoint_filepath}\n"
141
- )
142
-
143
- # Create timestamped backup
144
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
145
- backup_path = f"{checkpoint_filepath}.back.{timestamp}"
146
- shutil.copy2(checkpoint_filepath, backup_path)
147
- print(f"Created backup of original checkpoint file at: {os.path.abspath(backup_path)}")
148
-
149
- # Copy current version to checkpoint directory
150
- shutil.copy2(curr_filepath, checkpoint_filepath)
151
- print(f"Copied current version to checkpoint at: {os.path.abspath(checkpoint_filepath)}")
152
- print(
153
- f"Changes complete. The checkpoint will now use the current version of {file_type}"
154
- "\n------------------------------------------------------------------------------------------------\n"
155
- )
156
- else:
157
- # If file doesn't exist in checkpoint directory, copy it
158
- shutil.copy2(curr_filepath, checkpoint_filepath)
159
- print(
160
- "\n------------------------------------------------------------------------------------------------\n"
161
- f"No {file_type} found in checkpoint directory.\n"
162
- f"Copied current version from: {curr_filepath}\n"
163
- f"To checkpoint location: {os.path.abspath(checkpoint_filepath)}"
164
- "\n------------------------------------------------------------------------------------------------\n"
165
- )
166
-
167
-
168
- def check_model_logic_mismatch(pretrained_checkpoint: str) -> None:
169
- """
170
- Check and sync model logic files between current code and checkpoint.
171
-
172
- Handles the relationship between current and checkpoint versions of both
173
- modeling_prismatic.py and configuration_prismatic.py:
174
- - If checkpoint file exists and differs: creates backup and copies current version
175
- - If checkpoint file doesn't exist: copies current version
176
-
177
- Args:
178
- pretrained_checkpoint: Path to the checkpoint directory
179
- """
180
- if not os.path.isdir(pretrained_checkpoint):
181
- return
182
-
183
- # Find current files
184
- curr_files = {"modeling_prismatic.py": None, "configuration_prismatic.py": None}
185
-
186
- for root, _, files in os.walk("./prismatic/"):
187
- for filename in curr_files.keys():
188
- if filename in files and curr_files[filename] is None:
189
- curr_files[filename] = os.path.join(root, filename)
190
-
191
- # Check and handle each file
192
- for filename, curr_filepath in curr_files.items():
193
- if curr_filepath is None:
194
- print(f"WARNING: `{filename}` is not found anywhere in the current directory.")
195
- continue
196
-
197
- checkpoint_filepath = os.path.join(pretrained_checkpoint, filename)
198
- _handle_file_sync(curr_filepath, checkpoint_filepath, filename)
199
-
200
-
201
- def find_checkpoint_file(pretrained_checkpoint: str, file_pattern: str) -> str:
202
- """
203
- Find a specific checkpoint file matching a pattern.
204
-
205
- Args:
206
- pretrained_checkpoint: Path to the checkpoint directory
207
- file_pattern: String pattern to match in filenames
208
-
209
- Returns:
210
- str: Path to the matching checkpoint file
211
-
212
- Raises:
213
- AssertionError: If no files or multiple files match the pattern
214
- """
215
- assert os.path.isdir(pretrained_checkpoint), f"Checkpoint path must be a directory: {pretrained_checkpoint}"
216
-
217
- checkpoint_files = []
218
- for filename in os.listdir(pretrained_checkpoint):
219
- if file_pattern in filename and "checkpoint" in filename:
220
- full_path = os.path.join(pretrained_checkpoint, filename)
221
- checkpoint_files.append(full_path)
222
-
223
- assert len(checkpoint_files) == 1, (
224
- f"Expected exactly 1 {file_pattern} checkpoint but found {len(checkpoint_files)} in directory: {pretrained_checkpoint}"
225
- )
226
-
227
- return checkpoint_files[0]
228
-
229
-
230
- def load_component_state_dict(checkpoint_path: str) -> Dict[str, torch.Tensor]:
231
- """
232
- Load a component's state dict from checkpoint and handle DDP prefix if present.
233
-
234
- Args:
235
- checkpoint_path: Path to the checkpoint file
236
-
237
- Returns:
238
- Dict: The processed state dictionary for loading
239
- """
240
- state_dict = torch.load(checkpoint_path, weights_only=True)
241
-
242
- # If the component was trained with DDP, elements in the state dict have prefix "module." which we must remove
243
- new_state_dict = {}
244
- for k, v in state_dict.items():
245
- if k.startswith("module."):
246
- new_state_dict[k[7:]] = v
247
- else:
248
- new_state_dict[k] = v
249
-
250
- return new_state_dict
251
-
252
-
253
- def get_vla(cfg: Any) -> torch.nn.Module:
254
- """
255
- Load and initialize the VLA model from checkpoint.
256
-
257
- Args:
258
- cfg: Configuration object
259
-
260
- Returns:
261
- torch.nn.Module: The initialized VLA model
262
- """
263
- print("Instantiating pretrained VLA policy...")
264
-
265
- # If loading a locally stored pretrained checkpoint, check whether config or model files
266
- # need to be synced so that any changes the user makes to the VLA modeling code will
267
- # actually go into effect
268
- # If loading a pretrained checkpoint from Hugging Face Hub, we just assume that the policy
269
- # will be used as is, with its original modeling logic
270
- if not model_is_on_hf_hub(cfg.pretrained_checkpoint):
271
- # Register OpenVLA model to HF Auto Classes (not needed if the model is on HF Hub)
272
- AutoConfig.register("openvla", OpenVLAConfig)
273
- AutoImageProcessor.register(OpenVLAConfig, PrismaticImageProcessor)
274
- AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
275
- AutoModelForVision2Seq.register(OpenVLAConfig, OpenVLAForActionPrediction)
276
-
277
- # Update config.json and sync model files
278
- update_auto_map(cfg.pretrained_checkpoint)
279
- check_model_logic_mismatch(cfg.pretrained_checkpoint)
280
-
281
- # Load the model
282
- vla = AutoModelForVision2Seq.from_pretrained(
283
- cfg.pretrained_checkpoint,
284
- # attn_implementation="flash_attention_2",
285
- torch_dtype=torch.bfloat16,
286
- load_in_8bit=cfg.load_in_8bit,
287
- load_in_4bit=cfg.load_in_4bit,
288
- low_cpu_mem_usage=True,
289
- trust_remote_code=True,
290
- )
291
-
292
- # If using FiLM, wrap the vision backbone to allow for infusion of language inputs
293
- if cfg.use_film:
294
- vla = _apply_film_to_vla(vla, cfg)
295
-
296
- # Set number of images in model input
297
- vla.vision_backbone.set_num_images_in_input(cfg.num_images_in_input)
298
-
299
- vla.eval()
300
-
301
- # Move model to device if not using quantization
302
- if not cfg.load_in_8bit and not cfg.load_in_4bit:
303
- vla = vla.to(DEVICE)
304
-
305
- # Load dataset stats for action normalization
306
- _load_dataset_stats(vla, cfg.pretrained_checkpoint)
307
-
308
- return vla
309
-
310
-
311
- def _apply_film_to_vla(vla: torch.nn.Module, cfg: Any) -> torch.nn.Module:
312
- """
313
- Apply FiLM (Feature-wise Linear Modulation) to the VLA vision backbone.
314
-
315
- Args:
316
- vla: The VLA model
317
- cfg: Configuration object with model parameters
318
-
319
- Returns:
320
- torch.nn.Module: VLA model with FiLM applied
321
- """
322
- from peft import LoraConfig, get_peft_model
323
-
324
- # Apply LoRA configuration
325
- lora_config = LoraConfig(
326
- r=cfg.lora_rank,
327
- lora_alpha=min(cfg.lora_rank, 16),
328
- lora_dropout=0.0,
329
- target_modules="all-linear",
330
- init_lora_weights="gaussian",
331
- )
332
- vla = get_peft_model(vla, lora_config)
333
-
334
- # Create and apply FiLMed vision backbone
335
- new_vision_backbone = FiLMedPrismaticVisionBackbone(
336
- vision_backbone=vla.vision_backbone, llm_dim=vla.llm_dim,
337
- )
338
- vla.model.vision_backbone = new_vision_backbone
339
-
340
- # Load vision backbone checkpoint
341
- checkpoint_path = find_checkpoint_file(cfg.pretrained_checkpoint, "vision_backbone")
342
- state_dict = torch.load(checkpoint_path, weights_only=True)
343
- vla.model.vision_backbone.load_state_dict(state_dict)
344
-
345
- # Use the model component instead of wrapper and convert to bfloat16
346
- vla = vla.model
347
- vla.vision_backbone = vla.vision_backbone.to(torch.bfloat16)
348
-
349
- return vla
350
-
351
-
352
- def _load_dataset_stats(vla: torch.nn.Module, checkpoint_path: str) -> None:
353
- """
354
- Load dataset statistics used during training for action normalization.
355
-
356
- Args:
357
- vla: The VLA model
358
- checkpoint_path: Path to the checkpoint directory
359
- """
360
- if model_is_on_hf_hub(checkpoint_path):
361
- # Download dataset stats directly from HF Hub
362
- dataset_statistics_path = hf_hub_download(
363
- repo_id=checkpoint_path,
364
- filename="dataset_statistics.json",
365
- )
366
- else:
367
- dataset_statistics_path = os.path.join(checkpoint_path, "dataset_statistics.json")
368
- if os.path.isfile(dataset_statistics_path):
369
- with open(dataset_statistics_path, "r") as f:
370
- norm_stats = json.load(f)
371
- vla.norm_stats = norm_stats
372
- else:
373
- print(
374
- "WARNING: No local dataset_statistics.json file found for current checkpoint.\n"
375
- "You can ignore this if you are loading the base VLA (i.e. not fine-tuned) checkpoint."
376
- "Otherwise, you may run into errors when trying to call `predict_action()` due to an absent `unnorm_key`."
377
- )
378
-
379
-
380
- def get_processor(cfg: Any) -> AutoProcessor:
381
- """
382
- Get the VLA model's Hugging Face processor.
383
-
384
- Args:
385
- cfg: Configuration object with model parameters
386
-
387
- Returns:
388
- AutoProcessor: The model's processor
389
- """
390
- return AutoProcessor.from_pretrained(cfg.pretrained_checkpoint, trust_remote_code=True)
391
-
392
-
393
- def get_proprio_projector(cfg: Any, llm_dim: int, proprio_dim: int) -> ProprioProjector:
394
- """
395
- Get proprioception projector for the VLA model.
396
-
397
- Args:
398
- cfg: Configuration object with model parameters
399
- llm_dim: Dimension of the language model
400
- proprio_dim: Dimension of proprioception data
401
-
402
- Returns:
403
- ProprioProjector: The initialized proprio projector
404
- """
405
- # Initialize projector and move to device
406
- proprio_projector = ProprioProjector(
407
- llm_dim=llm_dim,
408
- proprio_dim=proprio_dim,
409
- ).to(DEVICE)
410
- proprio_projector = proprio_projector.to(torch.bfloat16).to(DEVICE)
411
- proprio_projector.eval()
412
-
413
- # Find and load checkpoint (may be on Hugging Face Hub or stored locally)
414
- if model_is_on_hf_hub(cfg.pretrained_checkpoint):
415
- model_path_to_proprio_projector_name = {
416
- "moojink/openvla-7b-oft-finetuned-libero-spatial": "proprio_projector--150000_checkpoint.pt",
417
- "moojink/openvla-7b-oft-finetuned-libero-object": "proprio_projector--150000_checkpoint.pt",
418
- "moojink/openvla-7b-oft-finetuned-libero-goal": "proprio_projector--50000_checkpoint.pt",
419
- "moojink/openvla-7b-oft-finetuned-libero-10": "proprio_projector--150000_checkpoint.pt",
420
- "moojink/openvla-7b-oft-finetuned-libero-spatial-object-goal-10": "proprio_projector--300000_checkpoint.pt",
421
- }
422
- if cfg.pretrained_checkpoint not in model_path_to_proprio_projector_name.keys():
423
- raise ValueError("Unsupported HF Hub pretrained checkpoint found!")
424
- # Download proprio projector directly from HF Hub
425
- proprio_projector_path = hf_hub_download(
426
- repo_id=cfg.pretrained_checkpoint, filename=model_path_to_proprio_projector_name[cfg.pretrained_checkpoint]
427
- )
428
- state_dict = load_component_state_dict(proprio_projector_path)
429
- proprio_projector.load_state_dict(state_dict)
430
- else:
431
- checkpoint_path = find_checkpoint_file(cfg.pretrained_checkpoint, "proprio_projector")
432
- state_dict = load_component_state_dict(checkpoint_path)
433
- proprio_projector.load_state_dict(state_dict)
434
-
435
- return proprio_projector
436
-
437
-
438
- def get_noisy_action_projector(cfg: Any, llm_dim: int) -> NoisyActionProjector:
439
- """
440
- Get noisy action projector for diffusion-based action prediction.
441
-
442
- Args:
443
- cfg: Configuration object with model parameters
444
- llm_dim: Dimension of the language model
445
-
446
- Returns:
447
- NoisyActionProjector: The initialized noisy action projector
448
- """
449
- # Initialize projector and move to device
450
- noisy_action_projector = NoisyActionProjector(
451
- llm_dim=llm_dim,
452
- ).to(DEVICE)
453
- noisy_action_projector = noisy_action_projector.to(torch.bfloat16).to(DEVICE)
454
- noisy_action_projector.eval()
455
-
456
- # Find and load checkpoint
457
- checkpoint_path = find_checkpoint_file(cfg.pretrained_checkpoint, "noisy_action_projector")
458
- state_dict = load_component_state_dict(checkpoint_path)
459
- noisy_action_projector.load_state_dict(state_dict)
460
-
461
- return noisy_action_projector
462
-
463
-
464
- def get_action_head(cfg: Any, llm_dim: int) -> Union[L1RegressionActionHead, DiffusionActionHead]:
465
- """
466
- Get action head for continuous value prediction.
467
-
468
- Args:
469
- cfg: Configuration object with model parameters
470
- llm_dim: Dimension of the language model
471
-
472
- Returns:
473
- Union[L1RegressionActionHead, DiffusionActionHead]: The initialized action head
474
-
475
- Raises:
476
- AssertionError: If both L1 regression and diffusion are specified
477
- """
478
- assert not (cfg.use_l1_regression and cfg.use_diffusion), "Cannot use both L1 regression and diffusion action head!"
479
-
480
- # Initialize appropriate action head based on configuration
481
- if cfg.use_l1_regression:
482
- action_head = L1RegressionActionHead(input_dim=llm_dim, hidden_dim=llm_dim, action_dim=ACTION_DIM)
483
- elif cfg.use_diffusion:
484
- action_head = DiffusionActionHead(
485
- input_dim=llm_dim, hidden_dim=llm_dim, action_dim=ACTION_DIM, num_diffusion_steps_train=cfg.num_diffusion_steps_train
486
- )
487
- # Set number of diffusion steps for inference
488
- action_head.noise_scheduler.set_timesteps(cfg.num_diffusion_steps_inference)
489
- else:
490
- raise ValueError("Either use_l1_regression or use_diffusion must be True")
491
-
492
- action_head = action_head.to(torch.bfloat16).to(DEVICE)
493
- action_head.eval()
494
-
495
- # Find and load checkpoint (may be on Hugging Face Hub or stored locally)
496
- if model_is_on_hf_hub(cfg.pretrained_checkpoint):
497
- model_path_to_action_head_name = {
498
- "moojink/openvla-7b-oft-finetuned-libero-spatial": "action_head--150000_checkpoint.pt",
499
- "moojink/openvla-7b-oft-finetuned-libero-object": "action_head--150000_checkpoint.pt",
500
- "moojink/openvla-7b-oft-finetuned-libero-goal": "action_head--50000_checkpoint.pt",
501
- "moojink/openvla-7b-oft-finetuned-libero-10": "action_head--150000_checkpoint.pt",
502
- "moojink/openvla-7b-oft-finetuned-libero-spatial-object-goal-10": "action_head--300000_checkpoint.pt",
503
- }
504
- if cfg.pretrained_checkpoint not in model_path_to_action_head_name.keys():
505
- raise ValueError("Unsupported HF Hub pretrained checkpoint found!")
506
- # Download action head checkpoint directly from HF Hub
507
- action_head_path = hf_hub_download(
508
- repo_id=cfg.pretrained_checkpoint, filename=model_path_to_action_head_name[cfg.pretrained_checkpoint]
509
- )
510
- state_dict = load_component_state_dict(action_head_path)
511
- action_head.load_state_dict(state_dict)
512
- else:
513
- checkpoint_path = find_checkpoint_file(cfg.pretrained_checkpoint, "action_head")
514
- state_dict = load_component_state_dict(checkpoint_path)
515
- action_head.load_state_dict(state_dict)
516
-
517
- return action_head
518
-
519
-
520
- def resize_image_for_policy(img: np.ndarray, resize_size: Union[int, Tuple[int, int]]) -> np.ndarray:
521
- """
522
- Resize an image to match the policy's expected input size.
523
-
524
- Uses the same resizing scheme as in the training data pipeline for distribution matching.
525
-
526
- Args:
527
- img: Numpy array containing the image
528
- resize_size: Target size as int (square) or (height, width) tuple
529
-
530
- Returns:
531
- np.ndarray: The resized image
532
- """
533
- assert isinstance(resize_size, int) or isinstance(resize_size, tuple)
534
- if isinstance(resize_size, int):
535
- resize_size = (resize_size, resize_size)
536
-
537
- # Resize using the same pipeline as in RLDS dataset builder
538
- img = tf.image.encode_jpeg(img) # Encode as JPEG
539
- img = tf.io.decode_image(img, expand_animations=False, dtype=tf.uint8) # Decode back
540
- img = tf.image.resize(img, resize_size, method="lanczos3", antialias=True)
541
- img = tf.cast(tf.clip_by_value(tf.round(img), 0, 255), tf.uint8)
542
-
543
- return img.numpy()
544
-
545
-
546
- def crop_and_resize(image: tf.Tensor, crop_scale: float, batch_size: int) -> tf.Tensor:
547
- """
548
- Center-crop an image and resize it back to original dimensions.
549
-
550
- Uses the same logic as in the training data pipeline for distribution matching.
551
-
552
- Args:
553
- image: TF Tensor of shape (batch_size, H, W, C) or (H, W, C) with values in [0,1]
554
- crop_scale: Area of center crop relative to original image
555
- batch_size: Batch size
556
-
557
- Returns:
558
- tf.Tensor: The cropped and resized image
559
- """
560
- # Handle 3D inputs by adding batch dimension if needed
561
- assert image.shape.ndims in (3, 4), "Image must be 3D or 4D tensor"
562
- expanded_dims = False
563
- if image.shape.ndims == 3:
564
- image = tf.expand_dims(image, axis=0)
565
- expanded_dims = True
566
-
567
- # Calculate crop dimensions (note: we use sqrt(crop_scale) for h/w)
568
- new_heights = tf.reshape(tf.clip_by_value(tf.sqrt(crop_scale), 0, 1), shape=(batch_size,))
569
- new_widths = tf.reshape(tf.clip_by_value(tf.sqrt(crop_scale), 0, 1), shape=(batch_size,))
570
-
571
- # Create bounding box for the crop
572
- height_offsets = (1 - new_heights) / 2
573
- width_offsets = (1 - new_widths) / 2
574
- bounding_boxes = tf.stack(
575
- [
576
- height_offsets,
577
- width_offsets,
578
- height_offsets + new_heights,
579
- width_offsets + new_widths,
580
- ],
581
- axis=1,
582
- )
583
-
584
- # Apply crop and resize
585
- image = tf.image.crop_and_resize(
586
- image, bounding_boxes, tf.range(batch_size), (OPENVLA_IMAGE_SIZE, OPENVLA_IMAGE_SIZE)
587
- )
588
-
589
- # Remove batch dimension if it was added
590
- if expanded_dims:
591
- image = image[0]
592
-
593
- return image
594
-
595
-
596
- def center_crop_image(image: Union[np.ndarray, Image.Image]) -> Image.Image:
597
- """
598
- Center crop an image to match training data distribution.
599
-
600
- Args:
601
- image: Input image (PIL or numpy array)
602
-
603
- Returns:
604
- Image.Image: Cropped PIL Image
605
- """
606
- batch_size = 1
607
- crop_scale = 0.9
608
-
609
- # Convert to TF Tensor if needed
610
- if not isinstance(image, tf.Tensor):
611
- image = tf.convert_to_tensor(np.array(image))
612
-
613
- orig_dtype = image.dtype
614
-
615
- # Convert to float32 in range [0,1]
616
- image = tf.image.convert_image_dtype(image, tf.float32)
617
-
618
- # Apply center crop and resize
619
- image = crop_and_resize(image, crop_scale, batch_size)
620
-
621
- # Convert back to original data type
622
- image = tf.clip_by_value(image, 0, 1)
623
- image = tf.image.convert_image_dtype(image, orig_dtype, saturate=True)
624
-
625
- # Convert to PIL Image
626
- return Image.fromarray(image.numpy()).convert("RGB")
627
-
628
-
629
- def check_image_format(image: Any) -> None:
630
- """
631
- Validate input image format.
632
-
633
- Args:
634
- image: Image to check
635
-
636
- Raises:
637
- AssertionError: If image format is invalid
638
- """
639
- is_numpy_array = isinstance(image, np.ndarray)
640
- has_correct_shape = is_numpy_array and len(image.shape) == 3 and image.shape[-1] == 3
641
- has_correct_dtype = is_numpy_array and image.dtype == np.uint8
642
-
643
- assert is_numpy_array and has_correct_shape and has_correct_dtype, (
644
- "Incorrect image format detected! Make sure that the input image is a "
645
- "numpy array with shape (H, W, 3) and dtype np.uint8!"
646
- )
647
-
648
-
649
- def normalize_proprio(proprio: np.ndarray, norm_stats: Dict[str, Any]) -> np.ndarray:
650
- """
651
- Normalize proprioception data to match training distribution.
652
-
653
- Args:
654
- proprio: Raw proprioception data
655
- norm_stats: Normalization statistics
656
-
657
- Returns:
658
- np.ndarray: Normalized proprioception data
659
- """
660
- if ACTION_PROPRIO_NORMALIZATION_TYPE == NormalizationType.BOUNDS:
661
- mask = norm_stats.get("mask", np.ones_like(norm_stats["min"], dtype=bool))
662
- proprio_high, proprio_low = np.array(norm_stats["max"]), np.array(norm_stats["min"])
663
- elif ACTION_PROPRIO_NORMALIZATION_TYPE == NormalizationType.BOUNDS_Q99:
664
- mask = norm_stats.get("mask", np.ones_like(norm_stats["q01"], dtype=bool))
665
- proprio_high, proprio_low = np.array(norm_stats["q99"]), np.array(norm_stats["q01"])
666
- else:
667
- raise ValueError("Unsupported action/proprio normalization type detected!")
668
-
669
- normalized_proprio = np.clip(
670
- np.where(
671
- mask,
672
- 2 * (proprio - proprio_low) / (proprio_high - proprio_low + 1e-8) - 1,
673
- proprio,
674
- ),
675
- a_min=-1.0,
676
- a_max=1.0,
677
- )
678
-
679
- return normalized_proprio
680
-
681
-
682
- def prepare_images_for_vla(images: List[np.ndarray], cfg: Any) -> List[Image.Image]:
683
- """
684
- Prepare images for VLA input by resizing and cropping as needed.
685
-
686
- Args:
687
- images: List of input images as numpy arrays
688
- cfg: Configuration object with parameters
689
-
690
- Returns:
691
- List[Image.Image]: Processed images ready for the model
692
- """
693
- processed_images = []
694
-
695
- for image in images:
696
- # Validate format
697
- check_image_format(image)
698
-
699
- # Resize if needed
700
- if image.shape != (OPENVLA_IMAGE_SIZE, OPENVLA_IMAGE_SIZE, 3):
701
- image = resize_image_for_policy(image, OPENVLA_IMAGE_SIZE)
702
-
703
- # Convert to PIL image
704
- pil_image = Image.fromarray(image).convert("RGB")
705
-
706
- # Apply center crop if configured
707
- if cfg.center_crop:
708
- pil_image = center_crop_image(pil_image)
709
-
710
- processed_images.append(pil_image)
711
-
712
- return processed_images
713
-
714
-
715
- def get_vla_action(
716
- cfg: Any,
717
- vla: torch.nn.Module,
718
- processor: Any,
719
- obs: Dict[str, Any],
720
- task_label: str,
721
- action_head: Optional[torch.nn.Module] = None,
722
- proprio_projector: Optional[torch.nn.Module] = None,
723
- noisy_action_projector: Optional[torch.nn.Module] = None,
724
- use_film: bool = False,
725
- ) -> List[np.ndarray]:
726
- """
727
- Generate action predictions with the VLA policy.
728
-
729
- Args:
730
- cfg: Configuration object with parameters
731
- vla: The VLA model
732
- processor: Model processor for inputs
733
- obs: Observation dictionary
734
- task_label: Text description of the task
735
- action_head: Optional action head for continuous actions
736
- proprio_projector: Optional proprioception projector
737
- noisy_action_projector: Optional noisy action projector for diffusion
738
- use_film: Whether to use FiLM
739
-
740
- Returns:
741
- List[np.ndarray]: Predicted actions
742
- """
743
- with torch.inference_mode():
744
-
745
- # Collect all input images
746
- all_images = [obs["full_image"]]
747
- if cfg.num_images_in_input > 1:
748
- all_images.extend([obs[k] for k in obs.keys() if "wrist" in k])
749
-
750
- # Process images
751
- all_images = prepare_images_for_vla(all_images, cfg)
752
-
753
- # Extract primary image and additional images
754
- primary_image = all_images.pop(0)
755
-
756
- # Build VLA prompt
757
- prompt = f"In: What action should the robot take to {task_label.lower()}?\nOut:"
758
-
759
- # Process primary image
760
- inputs = processor(prompt, primary_image).to(DEVICE, dtype=torch.bfloat16)
761
-
762
- # Process additional wrist images if any
763
- if all_images:
764
- all_wrist_inputs = [
765
- processor(prompt, image_wrist).to(DEVICE, dtype=torch.bfloat16) for image_wrist in all_images
766
- ]
767
- # Concatenate all images
768
- primary_pixel_values = inputs["pixel_values"]
769
- all_wrist_pixel_values = [wrist_inputs["pixel_values"] for wrist_inputs in all_wrist_inputs]
770
- inputs["pixel_values"] = torch.cat([primary_pixel_values] + all_wrist_pixel_values, dim=1)
771
-
772
- # Process proprioception data if used
773
- proprio = None
774
- if cfg.use_proprio:
775
- proprio = obs["state"]
776
- proprio_norm_stats = vla.norm_stats[cfg.unnorm_key]["proprio"]
777
- obs["state"] = normalize_proprio(proprio, proprio_norm_stats)
778
- proprio = obs["state"]
779
-
780
- # Generate action
781
- if action_head is None:
782
- # Standard VLA output (single-image inputs, discrete actions)
783
- action, _ = vla.predict_action(**inputs, unnorm_key=cfg.unnorm_key, do_sample=False)
784
- else:
785
- # Custom action head for continuous actions
786
- action, _ = vla.predict_action(
787
- **inputs,
788
- unnorm_key=cfg.unnorm_key,
789
- do_sample=False,
790
- proprio=proprio,
791
- proprio_projector=proprio_projector,
792
- noisy_action_projector=noisy_action_projector,
793
- action_head=action_head,
794
- use_film=use_film,
795
- )
796
-
797
- # Return action chunk as list of actions
798
- return [action[i] for i in range(len(action))]
799
-
800
-
801
- def get_action_from_server(
802
- observation: Dict[str, Any], server_endpoint: str = "http://0.0.0.0:8777/act"
803
- ) -> Dict[str, Any]:
804
- """
805
- Get VLA action from remote inference server.
806
-
807
- Args:
808
- observation: Observation data to send to server
809
- server_endpoint: URL of the inference server
810
-
811
- Returns:
812
- Dict[str, Any]: Action response from server
813
- """
814
- response = requests.post(
815
- server_endpoint,
816
- json=observation,
817
- )
818
- return response.json()
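
For reference, the helpers in the removed openvla_utils.py composed into a single evaluation step roughly as sketched below. This is a minimal, hedged sketch rather than repository code: the cfg fields are assumed from the signatures above, proprio_dim=8 and the unnorm_key value are illustrative, and observation construction is environment-specific.

from types import SimpleNamespace
import numpy as np

# Hypothetical config; every field is one referenced by the helpers above.
cfg = SimpleNamespace(
    pretrained_checkpoint="moojink/openvla-7b-oft-finetuned-libero-spatial",
    load_in_8bit=False, load_in_4bit=False,
    use_film=False, num_images_in_input=1, center_crop=True,
    use_l1_regression=True, use_diffusion=False,
    use_proprio=True, unnorm_key="libero_spatial",  # unnorm_key value is an assumption
)

vla = get_vla(cfg)              # load policy + dataset stats
processor = get_processor(cfg)  # matching Hugging Face processor

# Optional components, depending on the fine-tuning recipe
action_head = get_action_head(cfg, llm_dim=vla.llm_dim)
proprio_projector = get_proprio_projector(cfg, llm_dim=vla.llm_dim, proprio_dim=8)

# One control step: observation dict -> chunk of continuous actions
obs = {"full_image": np.zeros((256, 256, 3), dtype=np.uint8), "state": np.zeros(8)}
actions = get_vla_action(
    cfg, vla, processor, obs, task_label="pick up the cup",
    action_head=action_head, proprio_projector=proprio_projector,
)
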
capvector-oft/experiments/robot/robot_utils.py DELETED
@@ -1,199 +0,0 @@
1
- """Utils for evaluating robot policies in various environments."""
2
-
3
- import os
4
- import random
5
- import time
6
- from typing import Any, Dict, List, Optional, Union
7
-
8
- import numpy as np
9
- import torch
10
-
11
- from experiments.robot.openvla_utils import (
12
- get_vla,
13
- get_vla_action,
14
- )
15
-
16
- # Initialize important constants
17
- ACTION_DIM = 7
18
- DATE = time.strftime("%Y_%m_%d")
19
- DATE_TIME = time.strftime("%Y_%m_%d-%H_%M_%S")
20
- DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
21
-
22
- # Configure NumPy print settings
23
- np.set_printoptions(formatter={"float": lambda x: "{0:0.3f}".format(x)})
24
-
25
- # Initialize system prompt for OpenVLA v0.1
26
- OPENVLA_V01_SYSTEM_PROMPT = (
27
- "A chat between a curious user and an artificial intelligence assistant. "
28
- "The assistant gives helpful, detailed, and polite answers to the user's questions."
29
- )
30
-
31
- # Model image size configuration
32
- MODEL_IMAGE_SIZES = {
33
- "openvla": 224,
34
- # Add other models as needed
35
- }
36
-
37
-
38
- def set_seed_everywhere(seed: int) -> None:
39
- """
40
- Set random seed for all random number generators for reproducibility.
41
-
42
- Args:
43
- seed: The random seed to use
44
- """
45
- torch.manual_seed(seed)
46
- torch.cuda.manual_seed_all(seed)
47
- np.random.seed(seed)
48
- random.seed(seed)
49
- torch.backends.cudnn.deterministic = True
50
- torch.backends.cudnn.benchmark = False
51
- os.environ["PYTHONHASHSEED"] = str(seed)
52
-
53
-
54
- def get_model(cfg: Any, wrap_diffusion_policy_for_droid: bool = False) -> torch.nn.Module:
55
- """
56
- Load and initialize model for evaluation based on configuration.
57
-
58
- Args:
59
- cfg: Configuration object with model parameters
60
- wrap_diffusion_policy_for_droid: Whether to wrap diffusion policy for DROID
61
-
62
- Returns:
63
- torch.nn.Module: The loaded model
64
-
65
- Raises:
66
- ValueError: If model family is not supported
67
- """
68
- if cfg.model_family == "openvla":
69
- model = get_vla(cfg)
70
- else:
71
- raise ValueError(f"Unsupported model family: {cfg.model_family}")
72
-
73
- print(f"Loaded model: {type(model)}")
74
- return model
75
-
76
-
77
- def get_image_resize_size(cfg: Any) -> Union[int, tuple]:
78
- """
79
- Get image resize dimensions for a specific model.
80
-
81
- If returned value is an int, the resized image will be a square.
82
- If returned value is a tuple, the resized image will be a rectangle.
83
-
84
- Args:
85
- cfg: Configuration object with model parameters
86
-
87
- Returns:
88
- Union[int, tuple]: Image resize dimensions
89
-
90
- Raises:
91
- ValueError: If model family is not supported
92
- """
93
- if cfg.model_family not in MODEL_IMAGE_SIZES:
94
- raise ValueError(f"Unsupported model family: {cfg.model_family}")
95
-
96
- return MODEL_IMAGE_SIZES[cfg.model_family]
97
-
98
-
99
- def get_action(
100
- cfg: Any,
101
- model: torch.nn.Module,
102
- obs: Dict[str, Any],
103
- task_label: str,
104
- processor: Optional[Any] = None,
105
- action_head: Optional[torch.nn.Module] = None,
106
- proprio_projector: Optional[torch.nn.Module] = None,
107
- noisy_action_projector: Optional[torch.nn.Module] = None,
108
- use_film: bool = False,
109
- ) -> Union[List[np.ndarray], np.ndarray]:
110
- """
111
- Query the model to get action predictions.
112
-
113
- Args:
114
- cfg: Configuration object with model parameters
115
- model: The loaded model
116
- obs: Observation dictionary
117
- task_label: Text description of the task
118
- processor: Model processor for inputs
119
- action_head: Optional action head for continuous actions
120
- proprio_projector: Optional proprioception projector
121
- noisy_action_projector: Optional noisy action projector for diffusion
122
- use_film: Whether to use FiLM
123
-
124
- Returns:
125
- Union[List[np.ndarray], np.ndarray]: Predicted actions
126
-
127
- Raises:
128
- ValueError: If model family is not supported
129
- """
130
- with torch.no_grad():
131
- if cfg.model_family == "openvla":
132
- action = get_vla_action(
133
- cfg=cfg,
134
- vla=model,
135
- processor=processor,
136
- obs=obs,
137
- task_label=task_label,
138
- action_head=action_head,
139
- proprio_projector=proprio_projector,
140
- noisy_action_projector=noisy_action_projector,
141
- use_film=use_film,
142
- )
143
- else:
144
- raise ValueError(f"Unsupported model family: {cfg.model_family}")
145
-
146
- return action
147
-
148
-
149
- def normalize_gripper_action(action: np.ndarray, binarize: bool = True) -> np.ndarray:
150
- """
151
- Normalize gripper action from [0,1] to [-1,+1] range.
152
-
153
- This is necessary for some environments because the dataset wrapper
154
- standardizes gripper actions to [0,1]. Note that unlike the other action
155
- dimensions, the gripper action is not normalized to [-1,+1] by default.
156
-
157
- Normalization formula: y = 2 * (x - orig_low) / (orig_high - orig_low) - 1
158
-
159
- Args:
160
- action: Action array with gripper action in the last dimension
161
- binarize: Whether to binarize gripper action to -1 or +1
162
-
163
- Returns:
164
- np.ndarray: Action array with normalized gripper action
165
- """
166
- # Create a copy to avoid modifying the original
167
- normalized_action = action.copy()
168
-
169
- # Normalize the last action dimension to [-1,+1]
170
- orig_low, orig_high = 0.0, 1.0
171
- normalized_action[..., -1] = 2 * (normalized_action[..., -1] - orig_low) / (orig_high - orig_low) - 1
172
-
173
- if binarize:
174
- # Binarize to -1 or +1
175
- normalized_action[..., -1] = np.sign(normalized_action[..., -1])
176
-
177
- return normalized_action
178
-
179
-
180
- def invert_gripper_action(action: np.ndarray) -> np.ndarray:
181
- """
182
- Flip the sign of the gripper action (last dimension of action vector).
183
-
184
- This is necessary for environments where -1 = open, +1 = close, since
185
- the RLDS dataloader aligns gripper actions such that 0 = close, 1 = open.
186
-
187
- Args:
188
- action: Action array with gripper action in the last dimension
189
-
190
- Returns:
191
- np.ndarray: Action array with inverted gripper action
192
- """
193
- # Create a copy to avoid modifying the original
194
- inverted_action = action.copy()
195
-
196
- # Invert the gripper action
197
- inverted_action[..., -1] *= -1.0
198
-
199
- return inverted_action
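
To make the two gripper helpers above concrete (a small sketch, not repository code): normalize_gripper_action applies y = 2x - 1 to the last action dimension, so a dataset gripper value of 0.8 maps to 0.6 and binarization snaps it to +1; invert_gripper_action then flips the sign for environments that use the opposite open/close convention.

import numpy as np

# 7-DoF action: 6 pose deltas + gripper in [0, 1] (RLDS convention: 0 = close, 1 = open)
action = np.array([0.01, -0.02, 0.0, 0.0, 0.0, 0.0, 0.8])

a = normalize_gripper_action(action, binarize=True)
print(a[-1])  # 2 * 0.8 - 1 = 0.6, binarized to +1.0

a = invert_gripper_action(a)
print(a[-1])  # -1.0 (open, for environments where -1 = open and +1 = close)
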
capvector-oft/prismatic/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .models import available_model_names, available_models, get_model_description, load
 
 
capvector-oft/prismatic/conf/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- from .datasets import DatasetConfig, DatasetRegistry
2
- from .models import ModelConfig, ModelRegistry
3
- from .vla import VLAConfig, VLARegistry
 
 
capvector-oft/prismatic/conf/datasets.py DELETED
@@ -1,133 +0,0 @@
1
- """
2
- datasets.py
3
-
4
- Draccus Dataclass Definition for a DatasetConfig object, with various registered subclasses for each dataset variant
5
- and processing scheme. A given dataset variant (e.g., `llava-lightning`) configures the following attributes:
6
- - Dataset Variant (Identifier) --> e.g., "llava-v15"
7
- - Align Stage Dataset Components (annotations, images)
8
- - Finetune Stage Dataset Components (annotations, images)
9
- - Dataset Root Directory (Path)
10
- """
11
-
12
- from dataclasses import dataclass
13
- from enum import Enum, unique
14
- from pathlib import Path
15
- from typing import Tuple
16
-
17
- from draccus import ChoiceRegistry
18
-
19
-
20
- @dataclass
21
- class DatasetConfig(ChoiceRegistry):
22
- # fmt: off
23
- dataset_id: str # Unique ID that fully specifies a dataset variant
24
-
25
- # Dataset Components for each Stage in < align | finetune >
26
- align_stage_components: Tuple[Path, Path] # Path to annotation file and images directory for `align` stage
27
- finetune_stage_components: Tuple[Path, Path] # Path to annotation file and images directory for `finetune` stage
28
-
29
- dataset_root_dir: Path # Path to dataset root directory; others paths are relative to root
30
- # fmt: on
31
-
32
-
33
- # [Reproduction] LLaVa-v15 (exact dataset used in all public LLaVa-v15 models)
34
- @dataclass
35
- class LLaVa_V15_Config(DatasetConfig):
36
- dataset_id: str = "llava-v15"
37
-
38
- align_stage_components: Tuple[Path, Path] = (
39
- Path("download/llava-laion-cc-sbu-558k/chat.json"),
40
- Path("download/llava-laion-cc-sbu-558k/"),
41
- )
42
- finetune_stage_components: Tuple[Path, Path] = (
43
- Path("download/llava-v1.5-instruct/llava_v1_5_mix665k.json"),
44
- Path("download/llava-v1.5-instruct/"),
45
- )
46
- dataset_root_dir: Path = Path("/mnt/fsx/skaramcheti/datasets/prismatic-vlms")
47
-
48
-
49
- # [Multimodal-Only] LLava-v15 WITHOUT the Language-Only ShareGPT Data (No Co-Training)
50
- @dataclass
51
- class LLaVa_Multimodal_Only_Config(DatasetConfig):
52
- dataset_id: str = "llava-multimodal"
53
-
54
- align_stage_components: Tuple[Path, Path] = (
55
- Path("download/llava-laion-cc-sbu-558k/chat.json"),
56
- Path("download/llava-laion-cc-sbu-558k/"),
57
- )
58
- finetune_stage_components: Tuple[Path, Path] = (
59
- Path("download/llava-v1.5-instruct/llava_v1_5_stripped625k.json"),
60
- Path("download/llava-v1.5-instruct/"),
61
- )
62
- dataset_root_dir: Path = Path("/mnt/fsx/skaramcheti/datasets/prismatic-vlms")
63
-
64
-
65
- # LLaVa-v15 + LVIS-Instruct-4V
66
- @dataclass
67
- class LLaVa_LVIS4V_Config(DatasetConfig):
68
- dataset_id: str = "llava-lvis4v"
69
-
70
- align_stage_components: Tuple[Path, Path] = (
71
- Path("download/llava-laion-cc-sbu-558k/chat.json"),
72
- Path("download/llava-laion-cc-sbu-558k/"),
73
- )
74
- finetune_stage_components: Tuple[Path, Path] = (
75
- Path("download/llava-v1.5-instruct/llava_v1_5_lvis4v_mix888k.json"),
76
- Path("download/llava-v1.5-instruct/"),
77
- )
78
- dataset_root_dir: Path = Path("/mnt/fsx/skaramcheti/datasets/prismatic-vlms")
79
-
80
-
81
- # LLaVa-v15 + LRV-Instruct
82
- @dataclass
83
- class LLaVa_LRV_Config(DatasetConfig):
84
- dataset_id: str = "llava-lrv"
85
-
86
- align_stage_components: Tuple[Path, Path] = (
87
- Path("download/llava-laion-cc-sbu-558k/chat.json"),
88
- Path("download/llava-laion-cc-sbu-558k/"),
89
- )
90
- finetune_stage_components: Tuple[Path, Path] = (
91
- Path("download/llava-v1.5-instruct/llava_v1_5_lrv_mix1008k.json"),
92
- Path("download/llava-v1.5-instruct/"),
93
- )
94
- dataset_root_dir: Path = Path("/mnt/fsx/skaramcheti/datasets/prismatic-vlms")
95
-
96
-
97
- # LLaVa-v15 + LVIS-Instruct-4V + LRV-Instruct
98
- @dataclass
99
- class LLaVa_LVIS4V_LRV_Config(DatasetConfig):
100
- dataset_id: str = "llava-lvis4v-lrv"
101
-
102
- align_stage_components: Tuple[Path, Path] = (
103
- Path("download/llava-laion-cc-sbu-558k/chat.json"),
104
- Path("download/llava-laion-cc-sbu-558k/"),
105
- )
106
- finetune_stage_components: Tuple[Path, Path] = (
107
- Path("download/llava-v1.5-instruct/llava_v1_5_lvis4v_lrv_mix1231k.json"),
108
- Path("download/llava-v1.5-instruct/"),
109
- )
110
- dataset_root_dir: Path = Path("/mnt/fsx/skaramcheti/datasets/prismatic-vlms")
111
-
112
-
113
- # === Define a Dataset Registry Enum for Reference & Validation =>> all *new* datasets must be added here! ===
114
- @unique
115
- class DatasetRegistry(Enum):
116
- # === LLaVa v1.5 ===
117
- LLAVA_V15 = LLaVa_V15_Config
118
-
119
- LLAVA_MULTIMODAL_ONLY = LLaVa_Multimodal_Only_Config
120
-
121
- LLAVA_LVIS4V = LLaVa_LVIS4V_Config
122
- LLAVA_LRV = LLaVa_LRV_Config
123
-
124
- LLAVA_LVIS4V_LRV = LLaVa_LVIS4V_LRV_Config
125
-
126
- @property
127
- def dataset_id(self) -> str:
128
- return self.value.dataset_id
129
-
130
-
131
- # Register Datasets in Choice Registry
132
- for dataset_variant in DatasetRegistry:
133
- DatasetConfig.register_subclass(dataset_variant.dataset_id, dataset_variant.value)
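
To illustrate how the registry above was meant to be consumed (a minimal sketch; only names defined in this file are used, and draccus is assumed to be installed):

# Each DatasetRegistry member wraps a dataclass; instantiating it yields the defaults.
cfg = DatasetRegistry.LLAVA_V15.value()
print(cfg.dataset_id)                    # "llava-v15"
print(cfg.finetune_stage_components[0])  # download/llava-v1.5-instruct/llava_v1_5_mix665k.json

# The register_subclass loop at the bottom is what lets draccus resolve a config
# from its string ID (e.g., `--dataset.type "llava-lvis4v"` on the command line).
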
capvector-oft/prismatic/conf/models.py DELETED
@@ -1,584 +0,0 @@
1
- """
2
- models.py
3
-
4
- Draccus Dataclass Definition for a ModelConfig object, with various registered subclasses for each model family and
5
- variant thereof. A given model variant configures the following attributes:
6
- - Pretrained Visual Representation (e.g., OpenAI CLIP ViT-L/14) + Pretrained LLM Backbone (e.g., LLaMa-2 7B)
7
- - VLM Configuration + Parameters (e.g., MLP Projector, Image Preprocessing, etc.)
8
- - [Optional] Stage 1 (`align`) Optimization Hyperparameters
9
- - Stage 2 (`finetune`) Optimization Hyperparameters
10
- """
11
-
12
- from dataclasses import dataclass
13
- from enum import Enum, unique
14
- from typing import Optional
15
-
16
- from draccus import ChoiceRegistry
17
-
18
-
19
- @dataclass
20
- class ModelConfig(ChoiceRegistry):
21
- # fmt: off
22
- model_id: str # Unique Model ID that fully specifies a given variant
23
- arch_specifier: str # Architecture specifier string (e.g., "gelu-mlp")
24
-
25
- # Pretrained Backbones
26
- vision_backbone_id: str # Pretrained Visual Featurizer (from TIMM) to load
27
- llm_backbone_id: str # Pretrained LLM (from HF Transformers) to load
28
-
29
- # Backbone Parameters
30
- image_resize_strategy: str # Resizing strategy in < crop | letterbox | corner-pad >
31
- llm_max_length: int # Maximum context length for LLM (can be < than max!)
32
-
33
- # === Multi-Stage Optimization Hyperparameters ===
34
- # By default, we assume an AdamW optimizer with FSDP (Gradient Sharding or Full Sharding depending on stage)
35
-
36
- # Align Stage Optimization Parameters
37
- align_epochs: int # Epochs to Run (in case `max_steps` is not specified)
38
- align_max_steps: Optional[int] # [Optional] Max Gradient Steps (overrides epochs)
39
- align_global_batch_size: int # Global Batch Size (divided across processes)
40
- align_per_device_batch_size: int # Per-Device Batch Size (per-process)
41
- # => # of accumulation steps is auto-computed
42
-
43
- align_learning_rate: float # Peak Learning Rate (lr_scheduler sets warmup/decay)
44
- align_weight_decay: float # Weight Decay for AdamW Optimizer
45
- align_max_grad_norm: float # Max Grad Norm (for global gradient clipping)
46
- align_lr_scheduler_type: str # LR Scheduler (default: "linear-warmup+cosine-decay")
47
- align_warmup_ratio: float # Fraction of total steps to warmup
48
-
49
- align_train_strategy: str # Align Train Strategy (default: "fsdp-shard-grad-op")
50
-
51
- # Finetune Stage Optimization Parameters
52
- finetune_epochs: int # Epochs to Run (in case `max_steps` is not specified)
53
- finetune_max_steps: Optional[int] # [Optional] Max Gradient Steps (overrides epochs)
54
- finetune_global_batch_size: int # Global Batch Size (divided across processes)
55
- finetune_per_device_batch_size: int # Per-Device Batch Size (per-process)
56
- # => # of accumulation steps is auto-computed
57
-
58
- finetune_learning_rate: float # Peak Learning Rate (lr_scheduler sets warmup/decay)
59
- finetune_weight_decay: float # Weight Decay for AdamW Optimizer
60
- finetune_max_grad_norm: float # Max Grad Norm (for global gradient clipping)
61
- finetune_lr_scheduler_type: str # LR Scheduler (default: "linear-warmup+cosine-decay")
62
- finetune_warmup_ratio: float # Fraction of total steps to warmup
63
-
64
- finetune_train_strategy: str # Finetune Train Strategy (default: "fsdp-full-shard")
65
-
66
- # Enable Gradient/Activation Checkpointing (for the LLM Backbone)
67
- enable_gradient_checkpointing: bool = True
68
-
69
- # Enable Traditional Mixed Precision Training via Torch Native AMP (`autocast`)
70
- enable_mixed_precision_training: bool = True # Whether to enable mixed precision training
71
- reduce_in_full_precision: bool = False # Whether to run gradient reduction in FP32
72
-
73
- # fmt: on
74
-
75
-
76
- # === LLaVa v1.5 Reproduction - Fully Specified Configurations ===
77
- @dataclass
78
- class LLaVa_v15_Reproduction_7B(ModelConfig):
79
- model_id: str = "reproduction-llava-v15+7b"
80
- arch_specifier: str = "gelu-mlp"
81
-
82
- vision_backbone_id: str = "clip-vit-l-336px"
83
- llm_backbone_id: str = "vicuna-v15-7b"
84
-
85
- image_resize_strategy: str = "letterbox"
86
- llm_max_length: int = 2048
87
-
88
- # Align Stage Optimization Parameters
89
- align_epochs: int = 1
90
- align_max_steps: Optional[int] = None
91
- align_global_batch_size: int = 256
92
- align_per_device_batch_size: int = 16
93
-
94
- align_learning_rate: float = 1e-3
95
- align_weight_decay: float = 0.0
96
- align_max_grad_norm: float = 1.0
97
- align_lr_scheduler_type: str = "linear-warmup+cosine-decay"
98
- align_warmup_ratio: float = 0.03
99
-
100
- align_train_strategy: str = "fsdp-shard-grad-op"
101
-
102
- # Finetune Stage Optimization Parameters
103
- finetune_epochs: int = 1
104
- finetune_max_steps: Optional[int] = None
105
- finetune_global_batch_size: int = 128
106
- finetune_per_device_batch_size: int = 16
107
-
108
- finetune_learning_rate: float = 2e-5
109
- finetune_weight_decay: float = 0.1
110
- finetune_max_grad_norm: float = 1.0
111
- finetune_lr_scheduler_type: str = "linear-warmup+cosine-decay"
112
- finetune_warmup_ratio: float = 0.03
113
-
114
- finetune_train_strategy: str = "fsdp-full-shard"
115
-
116
-
117
- @dataclass
118
- class LLaVa_v15_Reproduction_13B(LLaVa_v15_Reproduction_7B):
119
- model_id: str = "reproduction-llava-v15+13b"
120
- llm_backbone_id: str = "vicuna-v15-13b"
121
-
122
-
123
- # === Section 4.1 :: Optimization Procedure ===
124
-
125
-
126
- # Section 4.1A :: 🚀 --> Necessity of Multi-Stage Training
127
- @dataclass
128
- class Exp_7B_One_Stage(LLaVa_v15_Reproduction_7B):
129
- model_id: str = "one-stage+7b"
130
- arch_specifier: str = "no-align+gelu-mlp"
131
-
132
-
133
- @dataclass
134
- class Exp_13B_One_Stage(LLaVa_v15_Reproduction_13B):
135
- model_id: str = "one-stage+13b"
136
- arch_specifier: str = "no-align+gelu-mlp"
137
-
138
-
139
- # Section 4.1B :: 🛠️ --> Full Finetuning through Visual Backbones
140
- # =>> Note :: Run with `--stage full-finetune`
141
- @dataclass
142
- class Exp_7B_Full_Finetune_Multi_Stage(LLaVa_v15_Reproduction_7B):
143
- model_id: str = "full-ft-multi-stage+7b"
144
-
145
-
146
- @dataclass
147
- class Exp_7B_Full_Finetune_One_Stage(Exp_7B_One_Stage):
148
- model_id: str = "full-ft-one-stage+7b"
149
-
150
-
151
- # === Section 4.2 :: Image Processing and Visual Representations ===
152
-
153
-
154
- # Section 4.2A :: 📸 --> Choosing a Pretrained Representation
155
- @dataclass
156
- class Exp_7B_IN1K_ViT_L_p16_224px(Exp_7B_One_Stage):
157
- model_id: str = "in1k-224px+7b"
158
- vision_backbone_id: str = "in1k-vit-l"
159
-
160
-
161
- @dataclass
162
- class Exp_7B_DINOv2_ViT_L_p14_224px(Exp_7B_One_Stage):
163
- model_id: str = "dinov2-224px+7b"
164
- vision_backbone_id: str = "dinov2-vit-l"
165
-
166
-
167
- @dataclass
168
- class Exp_7B_CLIP_ViT_L_p14_224px(Exp_7B_One_Stage):
169
- model_id: str = "clip-224px+7b"
170
- vision_backbone_id: str = "clip-vit-l"
171
-
172
-
173
- @dataclass
174
- class Exp_7B_SigLIP_ViT_SO_p14_224px(Exp_7B_One_Stage):
175
- model_id: str = "siglip-224px+7b"
176
- vision_backbone_id: str = "siglip-vit-so400m"
177
-
178
-
179
- # Section 4.2B :: 📐 --> Choosing an Image Preprocessing Strategy
180
- @dataclass
181
- class Exp_7B_CLIP_ViT_L_p14_336px_Resize_Crop(Exp_7B_One_Stage):
182
- model_id: str = "clip-336px-resize-crop+7b"
183
- image_resize_strategy: str = "resize-crop"
184
-
185
-
186
- @dataclass
187
- class Exp_7B_CLIP_ViT_L_p14_336px_Resize_Naive(Exp_7B_One_Stage):
188
- model_id: str = "clip-336px-resize-naive+7b"
189
- image_resize_strategy: str = "resize-naive"
190
-
191
-
192
- @dataclass
193
- class Exp_7B_SigLIP_ViT_SO_p14_384px_Letterbox(Exp_7B_One_Stage):
194
- model_id: str = "siglip-384px-letterbox+7b"
195
- vision_backbone_id: str = "siglip-vit-so400m-384px"
196
- image_resize_strategy: str = "letterbox"
197
-
198
-
199
- @dataclass
200
- class Exp_7B_SigLIP_ViT_SO_p14_384px_Resize_Crop(Exp_7B_One_Stage):
201
- model_id: str = "siglip-384px-resize-crop+7b"
202
- vision_backbone_id: str = "siglip-vit-so400m-384px"
203
- image_resize_strategy: str = "resize-crop"
204
-
205
-
206
- @dataclass
207
- class Exp_7B_SigLIP_ViT_SO_p14_384px_Resize_Naive(Exp_7B_One_Stage):
208
- model_id: str = "siglip-384px-resize-naive+7b"
209
- vision_backbone_id: str = "siglip-vit-so400m-384px"
210
- image_resize_strategy: str = "resize-naive"
211
-
212
-
213
- # Section 4.2D :: 🥞 --> Stacking/Ensembling Visual Representations
214
- @dataclass
215
- class Exp_7B_DINOCLIP_ViT_L_p14_336px_Letterbox(Exp_7B_One_Stage):
216
- model_id: str = "dinoclip-336px-letterbox+7b"
217
- vision_backbone_id: str = "dinoclip-vit-l-336px"
218
- image_resize_strategy: str = "letterbox"
219
- arch_specifier: str = "no-align+fused-gelu-mlp"
220
-
221
-
222
- @dataclass
223
- class Exp_7B_DINOCLIP_ViT_L_p14_336px_Resize_Naive(Exp_7B_One_Stage):
224
- model_id: str = "dinoclip-336px-resize-naive+7b"
225
- vision_backbone_id: str = "dinoclip-vit-l-336px"
226
- image_resize_strategy: str = "resize-naive"
227
- arch_specifier: str = "no-align+fused-gelu-mlp"
228
-
229
-
230
- @dataclass
231
- class Exp_7B_DINOSigLIP_ViT_L_p14_384px_Letterbox(Exp_7B_One_Stage):
232
- model_id: str = "dinosiglip-384px-letterbox+7b"
233
- vision_backbone_id: str = "dinosiglip-vit-so-384px"
234
- image_resize_strategy: str = "letterbox"
235
- arch_specifier: str = "no-align+fused-gelu-mlp"
236
-
237
-
238
- @dataclass
239
- class Exp_7B_DINOSigLIP_ViT_L_p14_384px_Resize_Naive(Exp_7B_One_Stage):
240
- model_id: str = "dinosiglip-384px-resize-naive+7b"
241
- vision_backbone_id: str = "dinosiglip-vit-so-384px"
242
- image_resize_strategy: str = "resize-naive"
243
- arch_specifier: str = "no-align+fused-gelu-mlp"
244
-
245
-
246
- # === Section 4.3 :: Language Models ===
247
-
248
-
249
- # Section 4.3A :: 📝 --> Base vs. Instruct-Tuned (Chat) LLMs
250
- @dataclass
251
- class Exp_7B_Llama2(Exp_7B_One_Stage):
252
- model_id: str = "llama2+7b"
253
- llm_backbone_id: str = "llama2-7b-pure"
254
-
255
-
256
- @dataclass
257
- class Exp_13B_Llama2(Exp_13B_One_Stage):
258
- model_id: str = "llama2+13b"
259
- llm_backbone_id: str = "llama2-13b-pure"
260
-
261
-
262
- # ~ Additional LLM Backbones :: LLaMa-2 Chat, Mistral v0.1, Mistral v0.1 Instruct, Phi-2 ~
263
- @dataclass
264
- class Ext_Exp_7B_Llama2_Chat(Exp_7B_One_Stage):
265
- model_id: str = "llama2-chat+7b"
266
- llm_backbone_id: str = "llama2-7b-chat"
267
-
268
-
269
- @dataclass
270
- class Ext_Exp_13B_Llama2_Chat(Exp_13B_One_Stage):
271
- model_id: str = "llama2-chat+13b"
272
- llm_backbone_id: str = "llama2-13b-chat"
273
-
274
-
275
- @dataclass
276
- class Ext_Exp_7B_Mistral_V1(Exp_7B_One_Stage):
277
- model_id: str = "mistral-v0.1+7b"
278
- llm_backbone_id: str = "mistral-v0.1-7b-pure"
279
-
280
-
281
- @dataclass
282
- class Ext_Exp_7B_Mistral_Instruct_V1(Exp_7B_One_Stage):
283
- model_id: str = "mistral-instruct-v0.1+7b"
284
- llm_backbone_id: str = "mistral-v0.1-7b-instruct"
285
-
286
-
287
- @dataclass
288
- class Ext_Exp_3B_Phi_2(Exp_7B_One_Stage):
289
- model_id: str = "phi-2+3b"
290
- llm_backbone_id: str = "phi-2-3b"
291
-
292
-
293
- # Section 4.3B :: ✌️ --> Co-training on Language-only Data
294
- # =>> Note :: Run with `--dataset.type "llava-multimodal" (multimodal data only / no co-training)
295
- @dataclass
296
- class Exp_7B_Vicuna_No_Cotraining(Exp_7B_One_Stage):
297
- model_id: str = "vicuna-no-cotraining+7b"
298
-
299
-
300
- @dataclass
301
- class Exp_7B_Llama2_No_Cotraining(Exp_7B_One_Stage):
302
- model_id: str = "llama2-no-cotraining+7b"
303
- llm_backbone_id: str = "llama2-7b-pure"
304
-
305
-
306
- # === Section 4.4 :: Scaling Properties - Train Time & Data ===
307
-
308
-
309
- # Section 4.4A :: ⏰ --> Scaling Train Time
310
- @dataclass
311
- class Exp_7B_1p25_Epochs(Exp_7B_One_Stage):
312
- model_id: str = "train-1.25-epochs+7b"
313
- finetune_max_steps: int = 6500
314
-
315
-
316
- @dataclass
317
- class Exp_7B_1p5_Epochs(Exp_7B_One_Stage):
318
- model_id: str = "train-1.5-epochs+7b"
319
- finetune_max_steps: int = 7800
320
-
321
-
322
- @dataclass
323
- class Exp_7B_2_Epochs(Exp_7B_One_Stage):
324
- model_id: str = "train-2-epochs+7b"
325
- finetune_epochs: int = 2
326
-
327
-
328
- @dataclass
329
- class Exp_7B_3_Epochs(Exp_7B_One_Stage):
330
- model_id: str = "train-3-epochs+7b"
331
- finetune_epochs: int = 3
332
-
333
-
334
- # Section 4.4B :: 📚 --> Scaling Data
335
- # =>> Note :: Run with `--dataset.type "llava-lvis4v"`
336
- @dataclass
337
- class Exp_7B_LLaVa_LVIS4V(Exp_7B_One_Stage):
338
- model_id: str = "llava-lvis4v+7b"
339
-
340
-
341
- # =>> Note :: Run with `--dataset.type "llava-lrv"`
342
- @dataclass
343
- class Exp_7B_LLaVa_LRV(Exp_7B_One_Stage):
344
- model_id: str = "llava-lrv+7b"
345
-
346
-
347
- # =>> Note :: Run with `--dataset.type "llava-lvis4v-lrv"`
348
- @dataclass
349
- class Exp_7B_LLaVa_LVIS4V_LRV(Exp_7B_One_Stage):
350
- model_id: str = "llava-lvis4v-lrv+7b"
351
-
352
-
353
- # === Section 5 :: Prisms ===
354
-
355
-
356
- # Prism-CLIP
357
- @dataclass
358
- class Prism_7B_CLIP_Controlled(Exp_7B_One_Stage):
359
- model_id: str = "prism-clip-controlled+7b"
360
- vision_backbone_id: str = "clip-vit-l-336px"
361
- image_resize_strategy: str = "resize-naive"
362
- llm_backbone_id: str = "llama2-7b-pure"
363
-
364
-
365
- @dataclass
366
- class Prism_13B_CLIP_Controlled(Exp_13B_One_Stage):
367
- model_id: str = "prism-clip-controlled+13b"
368
- vision_backbone_id: str = "clip-vit-l-336px"
369
- image_resize_strategy: str = "resize-naive"
370
- llm_backbone_id: str = "llama2-13b-pure"
371
-
372
-
373
- # =>> Note :: Run with `--dataset.type "llava-lvis4v-lrv"`
374
- @dataclass
375
- class Prism_7B_CLIP(Exp_7B_One_Stage):
376
- model_id: str = "prism-clip+7b"
377
- vision_backbone_id: str = "clip-vit-l-336px"
378
- image_resize_strategy: str = "resize-naive"
379
- llm_backbone_id: str = "llama2-7b-pure"
380
- finetune_epochs: int = 2
381
-
382
-
383
- # =>> Note :: Run with `--dataset.type "llava-lvis4v-lrv"`
384
- @dataclass
385
- class Prism_13B_CLIP(Exp_13B_One_Stage):
386
- model_id: str = "prism-clip+13b"
387
- vision_backbone_id: str = "clip-vit-l-336px"
388
- image_resize_strategy: str = "resize-naive"
389
- llm_backbone_id: str = "llama2-13b-pure"
390
- finetune_epochs: int = 2
391
-
392
-
393
- # Prism-SigLIP
394
- @dataclass
395
- class Prism_7B_SigLIP_Controlled(Exp_7B_One_Stage):
396
- model_id: str = "prism-siglip-controlled+7b"
397
- vision_backbone_id: str = "siglip-vit-so400m-384px"
398
- image_resize_strategy: str = "resize-naive"
399
- llm_backbone_id: str = "llama2-7b-pure"
400
-
401
-
402
- @dataclass
403
- class Prism_13B_SigLIP_Controlled(Exp_13B_One_Stage):
404
- model_id: str = "prism-siglip-controlled+13b"
405
- vision_backbone_id: str = "siglip-vit-so400m-384px"
406
- image_resize_strategy: str = "resize-naive"
407
- llm_backbone_id: str = "llama2-13b-pure"
408
-
409
-
410
- # =>> Note :: Run with `--dataset.type "llava-lvis4v-lrv"`
411
- @dataclass
412
- class Prism_7B_SigLIP(Exp_7B_One_Stage):
413
- model_id: str = "prism-siglip+7b"
414
- vision_backbone_id: str = "siglip-vit-so400m-384px"
415
- image_resize_strategy: str = "resize-naive"
416
- llm_backbone_id: str = "llama2-7b-pure"
417
- finetune_epochs: int = 2
418
-
419
-
420
- # =>> Note :: Run with `--dataset.type "llava-lvis4v-lrv"`
421
- @dataclass
422
- class Prism_13B_SigLIP(Exp_13B_One_Stage):
423
- model_id: str = "prism-siglip+13b"
424
- vision_backbone_id: str = "clip-vit-l-336px"
425
- image_resize_strategy: str = "resize-naive"
426
- llm_backbone_id: str = "llama2-13b-pure"
427
- finetune_epochs: int = 2
428
-
429
-
430
- # Prism-DINOSigLIP
431
- @dataclass
432
- class Prism_7B_DINOSigLIP_Controlled(Exp_7B_One_Stage):
433
- model_id: str = "prism-dinosiglip-controlled+7b"
434
- vision_backbone_id: str = "dinosiglip-vit-so-384px"
435
- image_resize_strategy: str = "resize-naive"
436
- llm_backbone_id: str = "llama2-7b-pure"
437
- arch_specifier: str = "no-align+fused-gelu-mlp"
438
-
439
-
440
- @dataclass
441
- class Prism_13B_DINOSigLIP_Controlled(Exp_13B_One_Stage):
442
- model_id: str = "prism-dinosiglip-controlled+13b"
443
- vision_backbone_id: str = "dinosiglip-vit-so-384px"
444
- image_resize_strategy: str = "resize-naive"
445
- llm_backbone_id: str = "llama2-13b-pure"
446
- arch_specifier: str = "no-align+fused-gelu-mlp"
447
-
448
-
449
- # =>> Note :: Run with `--dataset.type "llava-lvis4v-lrv"`
450
- @dataclass
451
- class Prism_7B_DINOSigLIP(Exp_7B_One_Stage):
452
- model_id: str = "prism-dinosiglip+7b"
453
- vision_backbone_id: str = "dinosiglip-vit-so-384px"
454
- image_resize_strategy: str = "resize-naive"
455
- llm_backbone_id: str = "llama2-7b-pure"
456
- arch_specifier: str = "no-align+fused-gelu-mlp"
457
- finetune_epochs: int = 2
458
-
459
-
460
- # =>> Note :: Run with `--dataset.type "llava-lvis4v-lrv"`
461
- @dataclass
462
- class Prism_13B_DINOSigLIP(Exp_13B_One_Stage):
463
- model_id: str = "prism-dinosiglip+13b"
464
- vision_backbone_id: str = "dinosiglip-vit-so-384px"
465
- image_resize_strategy: str = "resize-naive"
466
- llm_backbone_id: str = "llama2-13b-pure"
467
- arch_specifier: str = "no-align+fused-gelu-mlp"
468
- finetune_epochs: int = 2
469
-
470
-
471
- # [Inference-Optimized] 224px Prisms
472
- @dataclass
473
- class Opt_7B_DINOSigLIP_ViT_SO_p14_224px_Resize_Naive(Exp_7B_One_Stage):
474
- model_id: str = "dinosiglip-224px-resize-naive+7b"
475
- vision_backbone_id: str = "dinosiglip-vit-so-224px"
476
- image_resize_strategy: str = "resize-naive"
477
- arch_specifier: str = "no-align+fused-gelu-mlp"
478
-
479
-
480
- @dataclass
481
- class Prism_7B_DINOSigLIP_224px_Controlled(Exp_7B_One_Stage):
482
- model_id: str = "prism-dinosiglip-224px-controlled+7b"
483
- vision_backbone_id: str = "dinosiglip-vit-so-224px"
484
- image_resize_strategy: str = "resize-naive"
485
- llm_backbone_id: str = "llama2-7b-pure"
486
- arch_specifier: str = "no-align+fused-gelu-mlp"
487
-
488
-
489
- # =>> Note :: Run with `--dataset.type "llava-lvis4v-lrv"`
490
- @dataclass
491
- class Prism_7B_DINOSigLIP_224px(Exp_7B_One_Stage):
492
- model_id: str = "prism-dinosiglip-224px+7b"
493
- vision_backbone_id: str = "dinosiglip-vit-so-224px"
494
- image_resize_strategy: str = "resize-naive"
495
- llm_backbone_id: str = "llama2-7b-pure"
496
- arch_specifier: str = "no-align+fused-gelu-mlp"
497
- finetune_epochs: int = 2
498
-
499
-
500
- # === Define a Model Registry Enum for Reference & Validation ===
501
- @unique
502
- class ModelRegistry(Enum):
503
- # === LLaVa v1.5 Base Reproductions ===
504
- REPRODUCTION_7B = LLaVa_v15_Reproduction_7B
505
- REPRODUCTION_13B = LLaVa_v15_Reproduction_13B
506
-
507
- # === Section 4.1 :: Optimization Procedure ===
508
- EXP_ONE_STAGE_7B = Exp_7B_One_Stage
509
- EXP_ONE_STAGE_13B = Exp_13B_One_Stage
510
-
511
- EXP_FULL_FT_MULTI_STAGE = Exp_7B_Full_Finetune_Multi_Stage
512
- EXP_FULL_FT_ONE_STAGE = Exp_7B_Full_Finetune_One_Stage
513
-
514
- # === Section 4.2 :: Image Processing and Visual Representations ===
515
- EXP_IN1K_224PX = Exp_7B_IN1K_ViT_L_p16_224px
516
- EXP_DINOV2_224PX = Exp_7B_DINOv2_ViT_L_p14_224px
517
- EXP_CLIP_224PX = Exp_7B_CLIP_ViT_L_p14_224px
518
- EXP_SIGLIP_224PX = Exp_7B_SigLIP_ViT_SO_p14_224px
519
-
520
- EXP_CLIP_336PX_RESIZE_CROP = Exp_7B_CLIP_ViT_L_p14_336px_Resize_Crop
521
- EXP_CLIP_336PX_RESIZE_NAIVE = Exp_7B_CLIP_ViT_L_p14_336px_Resize_Naive
522
- EXP_SIGLIP_384PX_LETTERBOX = Exp_7B_SigLIP_ViT_SO_p14_384px_Letterbox
523
- EXP_SIGLIP_384PX_RESIZE_CROP = Exp_7B_SigLIP_ViT_SO_p14_384px_Resize_Crop
524
- EXP_SIGLIP_384PX_RESIZE_NAIVE = Exp_7B_SigLIP_ViT_SO_p14_384px_Resize_Naive
525
-
526
- EXP_DINOCLIP_336PX_LETTERBOX = Exp_7B_DINOCLIP_ViT_L_p14_336px_Letterbox
527
- EXP_DINOCLIP_336PX_RESIZE_NAIVE = Exp_7B_DINOCLIP_ViT_L_p14_336px_Resize_Naive
528
- EXP_DINOSIGLIP_384PX_LETTERBOX = Exp_7B_DINOSigLIP_ViT_L_p14_384px_Letterbox
529
- EXP_DINOSIGLIP_384PX_RESIZE_NAIVE = Exp_7B_DINOSigLIP_ViT_L_p14_384px_Resize_Naive
530
-
531
- # === Section 4.3 :: Language Models ===
532
- EXP_LLAMA2_7B = Exp_7B_Llama2
533
- EXP_LLAMA2_13B = Exp_13B_Llama2
534
-
535
- # ~ Additional LLM Backbone Experiments :: LLaMa-2 Chat, Mistral v0.1, Mistral v0.1 Instruct ~
536
- EXT_EXP_LLAMA2_CHAT_7B = Ext_Exp_7B_Llama2_Chat
537
- EXT_EXP_LLAMA2_CHAT_13B = Ext_Exp_13B_Llama2_Chat
538
- EXT_EXP_MISTRAL_V1_7B = Ext_Exp_7B_Mistral_V1
539
- EXT_EXP_MISTRAL_INSTRUCT_V1_7B = Ext_Exp_7B_Mistral_Instruct_V1
540
- EXT_EXP_PHI_2_3B = Ext_Exp_3B_Phi_2
541
-
542
- # Cotraining w/ Unimodal Data
543
- EXP_VICUNA_NO_COTRAINING_7B = Exp_7B_Vicuna_No_Cotraining
544
- EXP_LLAMA2_NO_COTRAINING_7B = Exp_7B_Llama2_No_Cotraining
545
-
546
- # === Section 4.4 :: Scaling Properties - Train Time & Data ===
547
- EXP_1P25_EPOCHS = Exp_7B_1p25_Epochs
548
- EXP_1P5_EPOCHS = Exp_7B_1p5_Epochs
549
- EXP_2_EPOCHS = Exp_7B_2_Epochs
550
- EXP_3_EPOCHS = Exp_7B_3_Epochs
551
-
552
- EXP_LLAVA_LVIS4V = Exp_7B_LLaVa_LVIS4V
553
- EXP_LLAVA_LRV = Exp_7B_LLaVa_LRV
554
- EXP_LLAVA_LVIS4V_LRV = Exp_7B_LLaVa_LVIS4V_LRV
555
-
556
- # === Section 5 :: Prisms ===
557
- PRISM_CLIP_CONTROLLED_7B = Prism_7B_CLIP_Controlled
558
- PRISM_CLIP_CONTROLLED_13B = Prism_13B_CLIP_Controlled
559
- PRISM_CLIP_7B = Prism_7B_CLIP
560
- PRISM_CLIP_13B = Prism_13B_CLIP
561
-
562
- PRISM_SIGLIP_CONTROLLED_7B = Prism_7B_SigLIP_Controlled
563
- PRISM_SIGLIP_CONTROLLED_13B = Prism_13B_SigLIP_Controlled
564
- PRISM_SIGLIP_7B = Prism_7B_SigLIP
565
- PRISM_SIGLIP_13B = Prism_13B_SigLIP
566
-
567
- PRISM_DINOSIGLIP_CONTROLLED_7B = Prism_7B_DINOSigLIP_Controlled
568
- PRISM_DINOSIGLIP_CONTROLLED_13B = Prism_13B_DINOSigLIP_Controlled
569
- PRISM_DINOSIGLIP_7B = Prism_7B_DINOSigLIP
570
- PRISM_DINOSIGLIP_13B = Prism_13B_DINOSigLIP
571
-
572
- # === Inference Optimized :: 224px Prisms ===
573
- OPT_DINOSIGLIP_224PX_RESIZE_NAIVE = Opt_7B_DINOSigLIP_ViT_SO_p14_224px_Resize_Naive
574
- PRISM_DINOSIGLIP_224PX_CONTROLLED_7B = Prism_7B_DINOSigLIP_224px_Controlled
575
- PRISM_DINOSIGLIP_224PX_7B = Prism_7B_DINOSigLIP_224px
576
-
577
- @property
578
- def model_id(self) -> str:
579
- return self.value.model_id
580
-
581
-
582
- # Register Models in Choice Registry
583
- for model_variant in ModelRegistry:
584
- ModelConfig.register_subclass(model_variant.model_id, model_variant.value)
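
The variants above lean on dataclass inheritance: each experiment subclass overrides only the fields that differ from its parent, and the ChoiceRegistry loop makes every variant resolvable by its string model_id. A minimal sketch (only names defined in this file):

# Exp_7B_One_Stage inherits all defaults from LLaVa_v15_Reproduction_7B
# and overrides just model_id and arch_specifier.
base = LLaVa_v15_Reproduction_7B()
one_stage = Exp_7B_One_Stage()

assert base.llm_backbone_id == one_stage.llm_backbone_id == "vicuna-v15-7b"
assert base.arch_specifier == "gelu-mlp"
assert one_stage.arch_specifier == "no-align+gelu-mlp"

# Registry members expose their ID without instantiation:
print(ModelRegistry.PRISM_DINOSIGLIP_7B.model_id)  # "prism-dinosiglip+7b"
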
capvector-oft/prismatic/conf/vla.py DELETED
@@ -1,235 +0,0 @@
- """
- vla.py
-
- Draccus Dataclass Definition for a VLAConfig object, with various registered subclasses for each VLA experiment and
- model configuration thereof. A given VLA model (`policy`) configures the following attributes:
-     - Data Mixture (e.g., Bridge, OXE_MAGIC_SOUP, etc.)
-     - Base VLM from Prismatic Registry (e.g., `prism-dinosiglip+7b`)
-     - VLA Model Architecture / Parameters (e.g., freeze vision encoder, last layer finetuning)
-     - Training / Optimization Hyperparameters
- """
-
- from dataclasses import dataclass
- from enum import Enum, unique
- from pathlib import Path
- from typing import Optional, Union
-
- from draccus import ChoiceRegistry
-
-
- @dataclass
- class VLAConfig(ChoiceRegistry):
-     # fmt: off
-     vla_id: str                                     # Unique VLA Policy ID that fully specifies a configuration variant
-     base_vlm: Union[str, Path]                      # Base VLM as ID/Path to Run Directory (e.g., `prism-dinosiglip+7b`)
-     freeze_vision_backbone: bool                    # Freeze Vision Backbone Parameters (akin to pretraining)
-     freeze_llm_backbone: bool                       # Freeze LLM Backbone parameters
-     unfreeze_last_llm_layer: bool                   # Unfreeze final layer of LLM (only takes effect if LLM is frozen)
-
-     # Data Mixture Parameters
-     data_mix: str                                   # Open-X Embodiment Dataset =>> Unique Mixture ID (e.g., `bridge`)
-     shuffle_buffer_size: int                        # Size of Shuffle Buffer (100K for Bridge, 1M for OXE)
-
-     # Optimization Parameters
-     epochs: int                                     # Epochs to Run (in case `max_steps` is not specified)
-     max_steps: Optional[int]                        # [Optional] Max Gradient Steps to Run (overrides `epochs`)
-
-     expected_world_size: int                        # Expected # of GPUs =>> allows us to gate training on hardware
-     global_batch_size: int                          # Global Batch Size (divided across processes / world size)
-     per_device_batch_size: int                      # Per-Device Batch Size (per-process / individual GPU)
-                                                     #   =>> # of accumulation steps is auto-computed
-
-     learning_rate: float                            # Peak Learning Rate (`lr_scheduler_type` sets warmup/decay)
-     weight_decay: float                             # Weight Decay for AdamW Optimizer
-     max_grad_norm: float                            # Max Grad Norm (for global gradient clipping)
-     lr_scheduler_type: str                          # LR Scheduler (usually: "constant" | "linear-warmup+cosine-decay")
-     warmup_ratio: float                             # Fraction of Steps to Warmup (for warmup LR schedulers)
-
-     train_strategy: str                             # Train Strategy (default "fsdp-full-shard")
-
-     # Enable Gradient/Activation Checkpointing (for the LLM Backbone)
-     enable_gradient_checkpointing: bool = True      # Enable Gradient/Activation Checkpointing during Training
-
-     # Mixed Precision Training via Torch Native AMP (`autocast`)
-     enable_mixed_precision_training: bool = True    # Enable Traditional BF16 Mixed Precision
-     reduce_in_full_precision: bool = True           # Accumulate/Reduce All-Gather Gradients in FP32 Full Precision
-
-     # fmt: on
-
-
- # === OpenVLA Training Configurations ===
-
-
- # = [8 GPU] Fast Iteration =>> SigLIP 224px + Bridge =
- @dataclass
- class Exp_SigLIP_224px_Bridge(VLAConfig):
-     vla_id: str = "siglip-224px+mx-bridge"
-     base_vlm: Union[str, Path] = "siglip-224px+7b"
-
-     freeze_vision_backbone: bool = False
-     freeze_llm_backbone: bool = False
-     unfreeze_last_llm_layer: bool = False
-
-     # Data Mixture Parameters
-     data_mix: str = "bridge"
-     shuffle_buffer_size: int = 256_000
-
-     # Optimization Parameters
-     epochs: int = 1000
-     max_steps: Optional[int] = None
-
-     expected_world_size: int = 8
-     global_batch_size: int = 256
-     per_device_batch_size: int = 32
-
-     learning_rate: float = 2e-5
-     weight_decay: float = 0.0
-     max_grad_norm: float = 1.0
-     lr_scheduler_type: str = "constant"
-     warmup_ratio: float = 0.0
-
-     train_strategy: str = "fsdp-full-shard"
-
-
- # = [8 GPU] SigLIP 224px Frozen Vision Backbone + Bridge =
- @dataclass
- class Exp_FreezeVIT_SigLIP_224px_Bridge(Exp_SigLIP_224px_Bridge):
-     vla_id: str = "siglip-224px-icy+mx-bridge"
-     base_vlm: Union[str, Path] = "siglip-224px+7b"
-     freeze_vision_backbone: bool = True
-
-
- # = [8 GPU] Fast Iteration =>> DINO-SigLIP 224px + Bridge =
- @dataclass
- class Exp_DinoSigLIP_224px_Bridge(Exp_SigLIP_224px_Bridge):
-     vla_id: str = "prism-dinosiglip-224px+mx-bridge"
-     base_vlm: Union[str, Path] = "prism-dinosiglip-224px+7b"
-
-     data_mix: str = "bridge"
-
-
- # = [64 GPU] SigLIP 224px + OXE Magic Soup =
- @dataclass
- class Exp_SigLIP_224px_OXE_Magic_Soup(Exp_SigLIP_224px_Bridge):
-     vla_id: str = "siglip-224px+mx-oxe-magic-soup"
-     base_vlm: Union[str, Path] = "siglip-224px+7b"
-
-     data_mix: str = "oxe_magic_soup"
-
-     expected_world_size: int = 64
-     global_batch_size: int = 2048
-     per_device_batch_size: int = 32
-
-
- # = [64 GPU] DINO-SigLIP 224px + OXE Magic Soup++ =
- @dataclass
- class Exp_DinoSigLIP_224px_OXE_Magic_Soup_Plus(Exp_SigLIP_224px_Bridge):
-     vla_id: str = "prism-dinosiglip-224px+mx-oxe-magic-soup-plus"
-     base_vlm: Union[str, Path] = "prism-dinosiglip-224px+7b"
-
-     # Note =>> We adopt two stages, training on a mixture including DROID for 70% of training, before resampling!
-     # data_mix: str = "oxe_magic_soup_plus"
-     data_mix: str = "oxe_magic_soup_plus_minus"
-
-     expected_world_size: int = 64
-     global_batch_size: int = 2048
-     per_device_batch_size: int = 32
-
-
- # === OpenVLA Fine-tuning Configurations ===
-
-
- # = [8 GPU] SigLIP 224px + T-DROID =
- @dataclass
- class Exp_SigLIP_224px_TDROID_CarrotInBowl(Exp_SigLIP_224px_Bridge):
-     vla_id: str = "siglip-224px+mx-tdroid_carrot_in_bowl"
-     base_vlm: Union[str, Path] = "siglip-224px+7b"
-
-     data_mix: str = "tdroid_carrot_in_bowl"
-
-
- @dataclass
- class Exp_SigLIP_224px_TDROID_PourCornInPot(Exp_SigLIP_224px_Bridge):
-     vla_id: str = "siglip-224px+mx-tdroid_pour_corn_in_pot"
-     base_vlm: Union[str, Path] = "siglip-224px+7b"
-
-     data_mix: str = "tdroid_pour_corn_in_pot"
-
-
- # = [8 GPU] SigLIP 224px + T-DROID -- Partial Finetuning =
- @dataclass
- class Exp_SigLIP_224px_Icy_TDROID_CarrotInBowl(Exp_SigLIP_224px_Bridge):
-     vla_id: str = "siglip-224px-icy+mx-tdroid_carrot_in_bowl"
-     base_vlm: Union[str, Path] = "siglip-224px+7b"
-     freeze_vision_backbone: bool = True
-     freeze_llm_backbone: bool = False
-
-     data_mix: str = "tdroid_carrot_in_bowl"
-
-
- @dataclass
- class Exp_SigLIP_224px_LastLayer_TDROID_CarrotInBowl(Exp_SigLIP_224px_Bridge):
-     vla_id: str = "siglip-224px-last_layer+mx-tdroid_carrot_in_bowl"
-     base_vlm: Union[str, Path] = "siglip-224px+7b"
-     freeze_vision_backbone: bool = True
-     freeze_llm_backbone: bool = True
-     unfreeze_last_llm_layer: bool = True
-
-     data_mix: str = "tdroid_carrot_in_bowl"
-
-
- @dataclass
- class Exp_SigLIP_224px_Sandwich_TDROID_CarrotInBowl(Exp_SigLIP_224px_Bridge):
-     vla_id: str = "siglip-224px-sandwich+mx-tdroid_carrot_in_bowl"
-     base_vlm: Union[str, Path] = "siglip-224px+7b"
-     freeze_vision_backbone: bool = False
-     freeze_llm_backbone: bool = True
-     unfreeze_last_llm_layer: bool = True
-
-     data_mix: str = "tdroid_carrot_in_bowl"
-
-
- # === [8 GPU] SigLIP 224px + FrankaWipe ===
- @dataclass
- class Exp_SigLIP_224px_Droid_Wipe(Exp_SigLIP_224px_Bridge):
-     vla_id: str = "siglip-224px+mx-droid_wipe"
-     base_vlm: Union[str, Path] = "siglip-224px+7b"
-
-     data_mix: str = "droid_wipe"
-
-
- # === Define a VLA Registry Enum for Reference & Validation ===
- @unique
- class VLARegistry(Enum):
-     # Sanity Check Configurations =>> BridgeV2
-     SIGLIP_224PX_MX_BRIDGE = Exp_SigLIP_224px_Bridge
-     DINOSIGLIP_224PX_MX_BRIDGE = Exp_DinoSigLIP_224px_Bridge
-
-     # SigLIP Frozen Backbone Experiment
-     FREEZE_SIGLIP_224PX_MX_BRIDGE = Exp_FreezeVIT_SigLIP_224px_Bridge
-
-     # [OpenVLA v0.1 7B] SigLIP 224px + OXE Magic Soup
-     SIGLIP_224PX_MX_OXE_MAGIC_SOUP = Exp_SigLIP_224px_OXE_Magic_Soup
-
-     # [OpenVLA 7B] DINO + SigLIP 224px + OXE Magic Soup++
-     DINOSIGLIP_224PX_MX_OXE_MAGIC_SOUP_PLUS = Exp_DinoSigLIP_224px_OXE_Magic_Soup_Plus
-
-     # === TDROID Fine-tuning Configs ===
-     SIGLIP_224PX_MX_TDROID_CARROT_IN_BOWL = Exp_SigLIP_224px_TDROID_CarrotInBowl
-     SIGLIP_224PX_MX_TDROID_POUR_CORN_IN_POT = Exp_SigLIP_224px_TDROID_PourCornInPot
-
-     SIGLIP_224PX_ICY_MX_TDROID_CARROT_IN_BOWL = Exp_SigLIP_224px_Icy_TDROID_CarrotInBowl
-     SIGLIP_224PX_LASTLAYER_MX_TDROID_CARROT_IN_BOWL = Exp_SigLIP_224px_LastLayer_TDROID_CarrotInBowl
-     SIGLIP_224PX_SANDWICH_MX_TDROID_CARROT_IN_BOWL = Exp_SigLIP_224px_Sandwich_TDROID_CarrotInBowl
-
-     # === DROID Fine-tuning Configs ===
-     SIGLIP_224PX_MX_DROID_WIPE = Exp_SigLIP_224px_Droid_Wipe
-
-     @property
-     def vla_id(self) -> str:
-         return self.value.vla_id
-
-
- # Register VLAs in Choice Registry
- for vla_variant in VLARegistry:
-     VLAConfig.register_subclass(vla_variant.vla_id, vla_variant.value)
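
For orientation, each experiment class above only overrides fields of `VLAConfig`; everything else inherits from `Exp_SigLIP_224px_Bridge`. A minimal sketch of reading a registered config and the gradient-accumulation arithmetic implied by its batch-size fields (the actual computation lives in the training strategy; this only illustrates the relationship stated in the `per_device_batch_size` comment):

    from prismatic.conf.vla import VLARegistry

    cfg = VLARegistry.SIGLIP_224PX_MX_OXE_MAGIC_SOUP.value()
    assert cfg.vla_id == "siglip-224px+mx-oxe-magic-soup"

    # The global batch is split across `expected_world_size` GPUs; whatever does
    # not fit per device is made up with gradient accumulation steps.
    grad_accumulation_steps = cfg.global_batch_size // (cfg.per_device_batch_size * cfg.expected_world_size)
    print(grad_accumulation_steps)  # 2048 // (32 * 64) = 1
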
capvector-oft/prismatic/extern/__init__.py DELETED
File without changes
capvector-oft/prismatic/extern/hf/__init__.py DELETED
File without changes
capvector-oft/prismatic/extern/hf/configuration_prismatic.py DELETED
@@ -1,140 +0,0 @@
- """
- configuration_prismatic.py
-
- HuggingFace-style configuration definition for Prismatic VLMs, inheriting from `transformers.PretrainedConfig`.
- Default configuration specifies `siglip-224px+7b`.
- """
-
- from typing import Any, Dict, List, Optional
-
- from transformers import PretrainedConfig
- from transformers.models.auto import CONFIG_MAPPING
-
- # === Utilities for Mapping Prismatic names to HF names ===
- # fmt: off
- VISION_BACKBONE_TO_RESOLUTION: Dict[str, List[int]] = {
-     "clip-vit-l": [224], "siglip-vit-so400m": [224], "dinov2-vit-l": [224], "in1k-vit-l": [224],
-
-     "clip-vit-l-336px": [336],
-     "siglip-vit-so400m-384px": [384],
-
-     "dinoclip-vit-l-336px": [336, 336],
-     "dinosiglip-vit-so-224px": [224, 224],
-     "dinosiglip-vit-so-384px": [384, 384],
- }
- VISION_BACKBONE_TO_TIMM_ID: Dict[str, List[str]] = {
-     "clip-vit-l": ["vit_large_patch14_clip_224.openai"],
-     "clip-vit-l-336px": ["vit_large_patch14_clip_336.openai"],
-
-     "dinov2-vit-l": ["vit_large_patch14_reg4_dinov2.lvd142m"],
-     "in1k-vit-l": ["vit_large_patch16_224.augreg_in21k_ft_in1k"],
-
-     "siglip-vit-so400m": ["vit_so400m_patch14_siglip_224"],
-     "siglip-vit-so400m-384px": ["vit_so400m_patch14_siglip_384"],
-
-     "dinoclip-vit-l-336px": ["vit_large_patch14_reg4_dinov2.lvd142m", "vit_large_patch14_clip_336.openai"],
-     "dinosiglip-vit-so-224px": ["vit_large_patch14_reg4_dinov2.lvd142m", "vit_so400m_patch14_siglip_224"],
-     "dinosiglip-vit-so-384px": ["vit_large_patch14_reg4_dinov2.lvd142m", "vit_so400m_patch14_siglip_384"],
- }
- TIMM_OVERRIDE_ACT_LAYER: Dict[str, List[Optional[str]]] = {
-     "clip-vit-l": ["quick_gelu"], "clip-vit-l-336px": ["quick_gelu"],
-     "dinov2-vit-l": [None], "in1k-vit-l": [None],
-     "siglip-vit-so400m": [None], "siglip-vit-so400m-384px": [None],
-     "dinoclip-vit-l-336px": [None, "quick_gelu"],
-     "dinosiglip-vit-so-224px": [None, None], "dinosiglip-vit-so-384px": [None, None]
- }
-
- LLM_BACKBONE_TO_HF_PATH = {
-     "llama2-7b-pure": "meta-llama/Llama-2-7b-hf", "llama2-13b-pure": "meta-llama/Llama-2-13b-hf",
-     "llama2-7b-chat": "meta-llama/Llama-2-7b-chat-hf", "llama2-13b-chat": "meta-llama/Llama-2-13b-chat-hf",
-
-     "vicuna-v15-7b": "lmsys/vicuna-7b-v1.5", "vicuna-v15-13b": "lmsys/vicuna-13b-v1.5",
-
-     "mistral-v0.1-7b-pure": "mistralai/Mistral-7B-v0.1",
-     "mistral-v0.1-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.1",
-
-     "phi-2-3b": "microsoft/phi-2",
- }
- LLM_BACKBONE_TO_HF_METACLASS = {
-     "llama2-7b-pure": "llama", "llama2-13b-pure": "llama", "llama2-7b-chat": "llama", "llama2-13b-chat": "llama",
-     "vicuna-v15-7b": "llama", "vicuna-v15-13b": "llama",
-
-     "mistral-v0.1-7b-pure": "mistral", "mistral-v0.1-7b-instruct": "mistral",
-
-     "phi-2-3b": "phi",
- }
-
- VALID_VISION_BACKBONES = set(VISION_BACKBONE_TO_RESOLUTION.keys())
- VALID_LLM_BACKBONES = set(LLM_BACKBONE_TO_HF_PATH)
- # fmt: on
-
-
- class PrismaticConfig(PretrainedConfig):
-     model_type: str = "prismatic"
-     is_composition: bool = False
-
-     def __init__(
-         self,
-         vision_backbone_id: str = "siglip-vit-so400m",
-         llm_backbone_id: str = "vicuna-v15-7b",
-         arch_specifier: str = "no-align+gelu-mlp",
-         use_fused_vision_backbone: Optional[bool] = None,
-         image_resize_strategy: str = "letterbox",
-         text_config: Optional[Dict[str, Any]] = None,
-         llm_max_length: int = 2048,
-         pad_token_id: int = 32000,
-         pad_to_multiple_of: int = 64,
-         output_projector_states: bool = False,
-         **kwargs: str,
-     ) -> None:
-         if vision_backbone_id not in VALID_VISION_BACKBONES:
-             raise ValueError(f"Vision backbone `{vision_backbone_id}` not in {VALID_VISION_BACKBONES = }")
-
-         if llm_backbone_id not in VALID_LLM_BACKBONES:
-             raise ValueError(f"LLM backbone `{llm_backbone_id}` not in {VALID_LLM_BACKBONES = }")
-
-         # Set Prismatic Configuration Fields
-         self.vision_backbone_id = vision_backbone_id
-         self.llm_backbone_id = llm_backbone_id
-         self.arch_specifier = arch_specifier
-         self.output_projector_states = output_projector_states
-
-         # [Contract] All vision backbone parameters are lists =>> supports fused backbones with different preprocessing
-         self.use_fused_vision_backbone = (
-             use_fused_vision_backbone
-             if use_fused_vision_backbone is not None
-             else any(self.vision_backbone_id.startswith(v) for v in ["dinoclip", "dinosiglip"])
-         )
-
-         self.timm_model_ids = VISION_BACKBONE_TO_TIMM_ID[self.vision_backbone_id]
-         self.timm_override_act_layers = TIMM_OVERRIDE_ACT_LAYER[self.vision_backbone_id]
-         self.image_sizes = VISION_BACKBONE_TO_RESOLUTION[self.vision_backbone_id]
-         self.image_resize_strategy = image_resize_strategy
-
-         self.hf_llm_id = LLM_BACKBONE_TO_HF_PATH[self.llm_backbone_id]
-         self.llm_max_length = llm_max_length
-         self.pad_token_id, self.pad_to_multiple_of = pad_token_id, pad_to_multiple_of
-
-         # [IMPORTANT] HF Utilities actually look for a `text_config` field... we need to use that specific naming!
-         self.text_config = (
-             CONFIG_MAPPING[LLM_BACKBONE_TO_HF_METACLASS[self.llm_backbone_id]](**text_config)
-             if text_config is not None
-             else CONFIG_MAPPING[LLM_BACKBONE_TO_HF_METACLASS[self.llm_backbone_id]]()
-         )
-
-         # Dispatch **kwargs to super() =>> note that `pad_token_id` collides, so we pass it in here as well...
-         super().__init__(pad_token_id=pad_token_id, **kwargs)
-
-
- class OpenVLAConfig(PrismaticConfig):
-     model_type: str = "openvla"
-
-     def __init__(
-         self,
-         norm_stats: Optional[Dict[str, Dict[str, Dict[str, Dict[str, List[float]]]]]] = None,
-         n_action_bins: int = 256,
-         **kwargs: str,
-     ) -> None:
-         self.norm_stats, self.n_action_bins = norm_stats, n_action_bins
-
-         super().__init__(**kwargs)
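
As a quick illustration of the derived fields above: `use_fused_vision_backbone` is inferred from the backbone-ID prefix when not passed explicitly, and the TIMM IDs and resolutions are looked up from the mapping tables. A minimal sketch (assumes `transformers` is installed and the module is importable at this path):

    from prismatic.extern.hf.configuration_prismatic import PrismaticConfig

    cfg = PrismaticConfig(vision_backbone_id="dinosiglip-vit-so-224px", llm_backbone_id="llama2-7b-pure")
    assert cfg.use_fused_vision_backbone   # inferred from the "dinosiglip" prefix
    print(cfg.timm_model_ids)              # ["vit_large_patch14_reg4_dinov2.lvd142m", "vit_so400m_patch14_siglip_224"]
    print(cfg.image_sizes)                 # [224, 224]
    print(cfg.hf_llm_id)                   # "meta-llama/Llama-2-7b-hf"
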
capvector-oft/prismatic/extern/hf/modeling_prismatic.py DELETED
@@ -1,1085 +0,0 @@
- """
- modeling_prismatic.py
-
- Core HuggingFace-style PrismaticPreTrainedModel and PrismaticForConditionalGeneration class definitions.
- Inherits from the default `transformers.PreTrainedModel`. Meant to be standalone and self-contained,
- while exactly replicating the logic in `prismatic.models.vlms.prismatic.py`.
- """
-
- import logging
- from dataclasses import dataclass
- from functools import partial
- from typing import Any, Callable, ClassVar, Dict, List, Optional, Tuple, Union
-
- import numpy as np
- import timm
- import tokenizers
- import torch
- import torch.nn as nn
- import transformers
- from timm.models.vision_transformer import LayerScale
- from transformers import AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
- from transformers.modeling_outputs import ModelOutput
-
- from prismatic.training.train_utils import (
-     get_current_action_mask,
-     get_next_actions_mask,
- )
- from prismatic.vla.constants import (
-     ACTION_DIM,
-     ACTION_PROPRIO_NORMALIZATION_TYPE,
-     ACTION_TOKEN_BEGIN_IDX,
-     IGNORE_INDEX,
-     NUM_ACTIONS_CHUNK,
-     STOP_INDEX,
-     NormalizationType,
- )
-
- from .configuration_prismatic import OpenVLAConfig, PrismaticConfig
-
- # Set up logger
- logger = logging.getLogger(__name__)
-
-
- # === Utility Functions for Monkey-Patching ===
- def unpack_tuple(fn: Callable[[Any], Tuple[Any]]) -> Callable[[Any], Any]:
-     def wrapper(*args: Any, **kwargs: Any) -> Any:
-         result = fn(*args, **kwargs)
-         return result[0] if isinstance(result, tuple) else result
-
-     return wrapper
-
-
- # HF Transformers overwrites parameters with names containing `gamma`; we're going to patch VisionBackbone.LayerScale.
- #   =>> TIMM :: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L109
- #   =>> Transformers :: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L3960
- def _ls_new_forward(self, x: torch.Tensor) -> torch.Tensor:
-     return x.mul_(self.scale_factor) if self.inplace else x * self.scale_factor
-
-
- def ls_apply_patch(ls_module: LayerScale):
-     ls_module.scale_factor = nn.Parameter(ls_module.gamma.clone())
-     ls_module.forward = _ls_new_forward.__get__(ls_module, LayerScale)
-     del ls_module.gamma
-
-
- # === Prismatic Vision Backbone (nn.Module) Definitions (w/ Fused Backbone Support) ===
- class PrismaticVisionBackbone(nn.Module):
-     """
-     Vision backbone for Prismatic models that handles image feature extraction.
-
-     Supports both single backbone (e.g., SigLIP) and fused backbone (e.g., SigLIP + DINOv2) configurations.
-     For fused backbones, features from both models are concatenated along the feature dimension.
-     """
-
-     def __init__(
-         self,
-         use_fused_vision_backbone: bool,
-         image_sizes: List[int],
-         timm_model_ids: List[str],
-         timm_override_act_layers: List[Optional[str]],
-     ) -> None:
-         """
-         Initialize the vision backbone.
-
-         Args:
-             use_fused_vision_backbone: Whether to use two backbones and fuse their features
-             image_sizes: List of image sizes for each backbone
-             timm_model_ids: List of TIMM model IDs to use for each backbone
-             timm_override_act_layers: List of activation layer overrides for each backbone
-         """
-         super().__init__()
-         self.use_fused_vision_backbone = use_fused_vision_backbone
-         self.num_images_in_input = 1  # Default value, can be overridden later
-
-         # Validate number of (fused) vision backbones
-         if len(timm_model_ids) > 2:
-             raise ValueError("Prismatic models only support up to 2 (fused) vision backbones!")
-
-         # Create primary featurizer
-         self.featurizer = self._create_featurizer(
-             model_id=timm_model_ids[0], img_size=image_sizes[0], act_layer=timm_override_act_layers[0]
-         )
-         self.embed_dim = self.featurizer.embed_dim
-
-         # Create secondary featurizer if using fused backbone
-         if self.use_fused_vision_backbone:
-             self.fused_featurizer = self._create_featurizer(
-                 model_id=timm_model_ids[1], img_size=image_sizes[1], act_layer=timm_override_act_layers[1]
-             )
-             self.embed_dim += self.fused_featurizer.embed_dim
-
-         # Patch LayerScale modules for HF compatibility
-         self._patch_layer_scales()
-
-     def _create_featurizer(self, model_id: str, img_size: int, act_layer: Optional[str]) -> nn.Module:
-         """
-         Create a TIMM-based featurizer model with appropriate configurations.
-
-         Args:
-             model_id: The TIMM model ID to load
-             img_size: Input image size for the model
-             act_layer: Override for the activation layer type
-
-         Returns:
-             A configured featurizer model
-         """
-         featurizer = timm.create_model(
-             model_id,
-             pretrained=False,
-             num_classes=0,
-             img_size=img_size,
-             act_layer=act_layer,
-         )
-
-         # Monkey-patch the forward function to extract the second-to-last layer features
-         num_blocks = len(featurizer.blocks)
-         featurizer.forward = unpack_tuple(partial(featurizer.get_intermediate_layers, n={num_blocks - 2}))
-
-         return featurizer
-
-     def _patch_layer_scales(self) -> None:
-         """
-         Patch all LayerScale modules to be compatible with HF's parameter naming.
-
-         HF Transformers overwrites parameters with names containing 'gamma',
-         so we need to rename and modify the forward method.
-         """
-         # Patch primary featurizer
-         for module in self.featurizer.modules():
-             if isinstance(module, LayerScale):
-                 ls_apply_patch(module)
-
-         # Patch secondary featurizer if it exists
-         if self.use_fused_vision_backbone:
-             for module in self.fused_featurizer.modules():
-                 if isinstance(module, LayerScale):
-                     ls_apply_patch(module)
-
-     def get_num_patches(self) -> int:
-         """
-         Returns the number of vision patches output by the vision backbone.
-
-         Returns:
-             Number of patches per image
-         """
-         return self.featurizer.patch_embed.num_patches
-
-     def get_num_images_in_input(self) -> int:
-         """
-         Returns the number of input images for the vision backbone.
-
-         Returns:
-             Number of images expected in the input
-         """
-         return self.num_images_in_input
-
-     def set_num_images_in_input(self, num_images_in_input: int) -> None:
-         """
-         Sets the number of input images for the vision backbone.
-
-         Args:
-             num_images_in_input: Number of images to expect in the input
-         """
-         self.num_images_in_input = num_images_in_input
-
-     def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
-         """
-         Implements the forward pass for the vision backbone.
-
-         If `self.use_fused_vision_backbone == True`, uses both SigLIP and DINOv2 transformers to extract visual features
-         (otherwise uses SigLIP only). Allows multi-image inputs (but only for fused vision backbone).
-
-         Args:
-             pixel_values (torch.Tensor): Pixels for input image(s), (B, C, H, W).
-         """
-         if self.num_images_in_input == 1:
-             if not self.use_fused_vision_backbone:
-                 return self.featurizer(pixel_values)
-
-             # Split `pixel_values :: [bsz, 2 * 3, resolution, resolution]` =>> featurize =>> channel stack
-             img, img_fused = torch.split(pixel_values, [3, 3], dim=1)
-             patches, patches_fused = self.featurizer(img), self.fused_featurizer(img_fused)
-
-             return torch.cat([patches, patches_fused], dim=2)
-
-         else:
-             assert self.use_fused_vision_backbone, "Multi-image inputs require using fused backbone!"
-
-             # Split `pixel_values` into individual images (each with 6 channels: 3 for SigLIP + 3 for DINOv2)
-             images = torch.split(pixel_values, [6] * self.num_images_in_input, dim=1)
-
-             # Process each image and collect patches
-             all_patches = []
-             for img in images:
-                 # Split each image further into two stacks of channels (each with 3 channels)
-                 img_regular, img_fused = torch.split(img, [3, 3], dim=1)
-
-                 # Get patches from both SigLIP and DINOv2 vision transformers
-                 patches = self.featurizer(img_regular)
-                 patches_fused = self.fused_featurizer(img_fused)
-
-                 # Concatenate SigLIP and DINOv2 patches along the hidden dimension
-                 combined_patches = torch.cat([patches, patches_fused], dim=2)
-                 all_patches.append(combined_patches)
-
-             # Concatenate all patches along the patch dimension
-             return torch.cat(all_patches, dim=1)
-
-
- # === Prismatic Projector (nn.Module) Definitions ===
- class PrismaticProjector(nn.Module):
-     def __init__(self, use_fused_vision_backbone: bool, vision_dim: int, llm_dim: int) -> None:
-         super().__init__()
-         self.use_fused_vision_backbone = use_fused_vision_backbone
-         self.vision_dim, self.llm_dim = vision_dim, llm_dim
-
-         # Switch on `use_fused_vision_backbone` =>> use slightly different MLPs and projection factors!
-         if not self.use_fused_vision_backbone:
-             self.fc1 = nn.Linear(self.vision_dim, self.llm_dim, bias=True)
-             self.fc2 = nn.Linear(self.llm_dim, self.llm_dim, bias=True)
-             self.act_fn1 = nn.GELU()
-         else:
-             initial_projection_dim = 4 * vision_dim
-             self.fc1 = nn.Linear(self.vision_dim, initial_projection_dim, bias=True)
-             self.fc2 = nn.Linear(initial_projection_dim, self.llm_dim, bias=True)
-             self.fc3 = nn.Linear(self.llm_dim, self.llm_dim, bias=True)
-             self.act_fn1 = nn.GELU()
-             self.act_fn2 = nn.GELU()
-
-     def forward(self, img_patches: torch.Tensor) -> torch.Tensor:
-         if not self.use_fused_vision_backbone:
-             projected_features = self.fc1(img_patches)
-             projected_features = self.act_fn1(projected_features)
-             projected_features = self.fc2(projected_features)
-         else:
-             projected_features = self.fc1(img_patches)
-             projected_features = self.act_fn1(projected_features)
-             projected_features = self.fc2(projected_features)
-             projected_features = self.act_fn2(projected_features)
-             projected_features = self.fc3(projected_features)
-
-         return projected_features
-
-
- # === Main HF Class Definitions ===
- @dataclass
- class PrismaticCausalLMOutputWithPast(ModelOutput):
-     """Base class for Prismatic causal (visually-conditioned) language model outputs; also exposes visual features."""
-
-     loss: Optional[torch.FloatTensor] = None
-     logits: torch.FloatTensor = None
-     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-     hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
-     attentions: Optional[Tuple[torch.FloatTensor]] = None
-
-     # Additions for VLMs
-     projector_features: Optional[torch.FloatTensor] = None
-
-
- class PrismaticPreTrainedModel(PreTrainedModel):
-     config_class: PretrainedConfig = PrismaticConfig
-     base_model_prefix: str = "model"
-     supports_gradient_checkpointing: bool = True
-
-     _no_split_modules: ClassVar[List[str]] = ["PrismaticProjector"]
-     _skip_keys_device_placement: str = "past_key_values"
-     _supports_flash_attn_2: bool = True
-
-     def _init_weights(self, module: nn.Module) -> None:
-         # Important :: this HF ported version is *not* meant for training from scratch; only inference and fine-tuning!
-         #   => As such, this init_weights code is not correct; if training VLMs from scratch, use the main codebase at
-         #      https://github.com/TRI-ML/prismatic-vlms
-         std = (
-             self.config.initializer_range
-             if hasattr(self.config, "initializer_range")
-             else self.config.text_config.initializer_range
-         )
-
-         if hasattr(module, "class_embedding"):
-             module.class_embedding.data.normal_(mean=0.0, std=std)
-
-         if isinstance(module, (nn.Linear, nn.Conv2d)):
-             module.weight.data.normal_(mean=0.0, std=std)
-             if module.bias is not None:
-                 module.bias.data.zero_()
-         elif isinstance(module, nn.Embedding):
-             module.weight.data.normal_(mean=0.0, std=std)
-             if module.padding_idx is not None:
-                 module.weight.data[module.padding_idx].zero_()
-
-     @property
-     def _supports_sdpa(self) -> bool:
-         """Check LLM supports SDPA Attention"""
-         return self.language_model._supports_sdpa
-
-
- class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):
-     def __init__(self, config: PrismaticConfig) -> None:
-         super().__init__(config)
-
-         # [Validation] Lightweight Validate on `config` Fields + Dependency Versions
-         if config.use_fused_vision_backbone is None:
-             raise ValueError("Missing config field `use_fused_vision_backbone`")
-
-         if timm.__version__ not in {"0.9.10", "0.9.11", "0.9.12", "0.9.16"}:
-             raise NotImplementedError(
-                 "TIMM Version must be >= 0.9.10 and < 1.0.0 (breaking); please raise a GitHub Issue "
-                 "if you urgently need support for latest TIMM versions."
-             )
-
-         if (transformers.__version__ != "4.40.1") or (tokenizers.__version__ != "0.19.1"):
-             logger.warning(
-                 f"Expected `transformers==4.40.1` and `tokenizers==0.19.1` but got "
-                 f"`transformers=={transformers.__version__}` and `tokenizers=={tokenizers.__version__}`; "
-                 f"there might be inference-time regressions due to dependency changes. If in doubt, please "
-                 f"use the above versions."
-             )
-
-         # Instantiate PrismaticVisionBackbone (w/ Potential Fused Backbone)
-         self.vision_backbone = PrismaticVisionBackbone(
-             config.use_fused_vision_backbone, config.image_sizes, config.timm_model_ids, config.timm_override_act_layers
-         )
-
-         # Create Multimodal Projector
-         self.projector = PrismaticProjector(
-             config.use_fused_vision_backbone,
-             vision_dim=self.vision_backbone.embed_dim,
-             llm_dim=config.text_config.hidden_size,
-         )
-
-         # Instantiate LLM Backbone
-         self.language_model = AutoModelForCausalLM.from_config(
-             config.text_config, attn_implementation=config._attn_implementation
-         )
-         self.vocab_size = config.text_config.vocab_size
-         self.pad_token_id = config.pad_token_id
-         self.llm_dim = config.text_config.hidden_size
-
-         # HF Boilerplate =>> initializes weights via `_init_weights()` and sets gradient checkpointing
-         self.post_init()
-
-     # === `PreTrainedModel` Boilerplate ===
-     def get_input_embeddings(self) -> nn.Module:
-         return self.language_model.get_input_embeddings()
-
-     def set_input_embeddings(self, value: nn.Module) -> None:
-         self.language_model.set_input_embeddings(value)
-
-     def get_output_embeddings(self) -> nn.Module:
-         return self.language_model.get_output_embeddings()
-
-     def set_output_embeddings(self, new_embeddings: nn.Module) -> None:
-         self.language_model.set_output_embeddings(new_embeddings)
-
-     def get_decoder(self) -> nn.Module:
-         return self.language_model.get_decoder()
-
-     def set_decoder(self, decoder: nn.Module) -> None:
-         self.language_model.set_decoder(decoder)
-
-     def tie_weights(self) -> None:
-         self.language_model.tie_weights()  # Note: `Llama-2` and `Mistral` don't tie weights (no-op)
-
-     def resize_token_embeddings(
-         self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
-     ) -> nn.Embedding:
-         updated_embeddings = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
-
-         # Update config/instance variables
-         self.config.text_config.vocab_size = updated_embeddings.num_embeddings
-         self.vocab_size = updated_embeddings.num_embeddings
-
-         return updated_embeddings
-
-     def _replace_input_embeddings(self, input_embeddings, all_actions_mask, noisy_action_features):
-         """
-         Replace embeddings in input_embeddings at positions where all_actions_mask is True
-         with embeddings from noisy_action_features, using vectorized operations.
-
-         Args:
-             input_embeddings: Tensor of shape (B, S, D)
-             all_actions_mask: Boolean tensor of shape (B, S)
-             noisy_action_features: Tensor of shape (B, K, D) where K is the number of True values in mask per sample
-
-         Returns:
-             Modified input_embeddings tensor
-         """
-         # Clone input to avoid modifying the original tensor
-         new_input_embeddings = input_embeddings.clone()
-
-         # Create a tensor with the same shape of input_embeddings to hold the noisy action features
-         repositioned_noisy_action_features = torch.zeros_like(input_embeddings)
-
-         # Create batch indices for splicing
-         batch_indices = torch.arange(input_embeddings.shape[0], device=input_embeddings.device)
-         batch_indices = batch_indices.unsqueeze(1).expand(-1, noisy_action_features.shape[1])
-
-         # Get indices where mask is True for each sample
-         masked_indices = torch.stack([torch.where(mask)[0] for mask in all_actions_mask])
-
-         # Move the noisy action features into their correct positions
-         repositioned_noisy_action_features[batch_indices, masked_indices] = noisy_action_features
-
-         # Combine original input embeddings and noisy action embeddings using the mask
-         new_input_embeddings = torch.where(
-             all_actions_mask.unsqueeze(-1), repositioned_noisy_action_features, new_input_embeddings
-         )
-
-         return new_input_embeddings
-
-     def _process_action_masks(self, labels):
-         """Helper to get action masks from labels"""
-         current_action_mask = get_current_action_mask(labels)
-         next_actions_mask = get_next_actions_mask(labels)
-         all_actions_mask = current_action_mask | next_actions_mask  # (B, seq_len)
-         return all_actions_mask
-
-     def _process_vision_features(self, pixel_values, language_embeddings=None, use_film=False):
-         """Process vision features with optional FiLM conditioning"""
-         if use_film:
-             # FiLM: Infuse language inputs into visual features
-             patch_features = self.vision_backbone(pixel_values, language_embeddings)  # (bsz, 256 * num_images, D)
-         else:
-             patch_features = self.vision_backbone(pixel_values)  # (bsz, 256 * num_images, D)
-
-         # Project patch embeddings into language embedding space
-         return self.projector(patch_features)
-
-     def _process_proprio_features(self, projected_patch_embeddings, proprio, proprio_projector):
-         """Process proprioceptive features and append to vision features"""
-         if proprio_projector is not None and proprio is not None:
-             # projected_patch_embeddings: (bsz, num_patches * num_images, llm_dim)
-             # proprio: (bsz, proprio_dim) or (proprio_dim,)
-             proprio = proprio.reshape(projected_patch_embeddings.shape[0], -1)  # (bsz, proprio_dim)
-             proprio_features = proprio_projector(proprio)  # (bsz, llm_dim)
-             proprio_features = proprio_features.unsqueeze(dim=1)  # (bsz, 1, llm_dim)
-             # For simplicity, just append proprio token to the end of projected vision patch tokens
-             return torch.cat((projected_patch_embeddings, proprio_features), dim=1)
-         return projected_patch_embeddings
-
-     def _build_multimodal_attention(self, input_embeddings, projected_patch_embeddings, attention_mask):
-         """Build multimodal embeddings and attention mask"""
-         # Update attention mask
-         projected_patch_attention_mask = None
-         if attention_mask is not None:
-             projected_patch_attention_mask = torch.full(
-                 (projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]),
-                 fill_value=True,
-                 dtype=attention_mask.dtype,
-                 device=attention_mask.device,
-             )
-
-         # Build multimodal embeddings & attention mask; insert embeddings after <BOS> token (1:)
-         multimodal_embeddings = torch.cat(
-             [input_embeddings[:, :1, :], projected_patch_embeddings, input_embeddings[:, 1:, :]], dim=1
-         )
-
-         multimodal_attention_mask = None
-         if attention_mask is not None:
-             multimodal_attention_mask = torch.cat(
-                 [attention_mask[:, :1], projected_patch_attention_mask, attention_mask[:, 1:]], dim=1
-             )
-
-         return multimodal_embeddings, multimodal_attention_mask
-
-     def _build_multimodal_labels(self, labels, projected_patch_embeddings):
-         """Build multimodal labels with IGNORE_INDEX for patch embeddings"""
-         if labels is not None:
-             projected_patch_labels = torch.full(
-                 (projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]),
-                 fill_value=IGNORE_INDEX,
-                 dtype=labels.dtype,
-                 device=labels.device,
-             )
-             return torch.cat([labels[:, :1], projected_patch_labels, labels[:, 1:]], dim=1)
-         return None
-
-     # === Core Prismatic VLM `forward()` Logic ===
-     def forward(
-         self,
-         input_ids: Optional[torch.LongTensor] = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         pixel_values: Optional[torch.FloatTensor] = None,
-         labels: Optional[torch.LongTensor] = None,
-         inputs_embeds: Optional[torch.FloatTensor] = None,
-         past_key_values: Optional[List[torch.FloatTensor]] = None,
-         use_cache: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         output_projector_features: Optional[bool] = None,
-         return_dict: Optional[bool] = None,
-         proprio=None,
-         proprio_projector=None,
-         noisy_actions=None,
-         noisy_action_projector=None,
-         diffusion_timestep_embeddings=None,
-         use_film: bool = False,
-     ) -> Union[Tuple, PrismaticCausalLMOutputWithPast]:
-         """Run a forward pass through the VLM, returning a PrismaticCausalLMOutputWithPast instance."""
-         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-         output_hidden_states = (
-             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-         )
-         output_projector_features = output_projector_features if output_projector_features is not None else False
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         # Respect `use_cache` only if not training (even if `gradient_checkpointing` is off)
-         use_cache = use_cache and not self.training
-
-         # Instantiate Placeholder for Projector Features
-         projected_patch_embeddings = None
-
-         # === Handle Generation with Cache (`input_ids.shape[1] == 1`) =>> requires `past_keys_values` ===
-         if input_ids.shape[1] == 1:
-             assert input_ids.shape[0] == 1, "Generation is only currently supported for batch size of 1!"
-             assert past_key_values is not None, "You must provide `past_key_values` during cached generation!"
-             assert labels is None, "Unexpected key `labels` provided during cached generation!"
-
-             language_model_output = self.language_model(
-                 input_ids=input_ids,
-                 attention_mask=None,
-                 position_ids=None,
-                 past_key_values=past_key_values,
-                 inputs_embeds=None,
-                 labels=None,
-                 use_cache=use_cache,
-                 output_attentions=output_attentions,
-                 output_hidden_states=output_hidden_states,
-                 return_dict=return_dict,
-             )
-
-         # === Handle Unimodal Forward ===
-         elif pixel_values is None:
-             assert (input_ids is not None) and (inputs_embeds is None), "Missing `input_ids` in language-only forward!"
-             assert past_key_values is None, "Unexpected key `past_key_values` provided during language-only forward!"
-
-             language_model_output = self.language_model(
-                 input_ids=input_ids,
-                 attention_mask=attention_mask,
-                 position_ids=None,
-                 past_key_values=None,
-                 inputs_embeds=None,
-                 labels=labels,
-                 use_cache=use_cache,
-                 output_attentions=output_attentions,
-                 output_hidden_states=output_hidden_states,
-                 return_dict=return_dict,
-             )
-
-         # === Handle Multimodal Forward ===
-         elif (input_ids.shape[0] == pixel_values.shape[0]) or (inputs_embeds.shape[0] == pixel_values.shape[0]):
-             assert past_key_values is None, "Unexpected key `past_key_values` provided during multimodal forward!"
-
-             # Get input embeddings (from language model embeddings)
-             input_embeddings = self.get_input_embeddings()(input_ids)  # (B, seq_len, D)
-
-             # Extract action masks
-             all_actions_mask = self._process_action_masks(labels)
-
-             # Extract the language portion of the input embeddings (i.e. remove the action tokens portion)
-             language_embeddings = input_embeddings[~all_actions_mask].reshape(
-                 input_embeddings.shape[0], -1, input_embeddings.shape[2]
-             )  # (B, lang_seq_len, llm_dim)
-
-             # Get visual features
-             projected_patch_embeddings = self._process_vision_features(pixel_values, language_embeddings, use_film)
-
-             # Add proprioceptive state if provided
-             projected_patch_embeddings = self._process_proprio_features(
-                 projected_patch_embeddings, proprio, proprio_projector
-             )
-
-             # [Diffusion] Add diffusion timestep embedding if provided
-             if diffusion_timestep_embeddings is not None:
-                 # For simplicity, just append diffusion timestep embedding to the end of projected vision patch tokens
-                 projected_patch_embeddings = torch.cat(
-                     (projected_patch_embeddings, diffusion_timestep_embeddings), dim=1
-                 )
-
-             # Process action embeddings
-             if noisy_actions is not None:
-                 # Get mask corresponding to all action tokens
-                 all_actions_mask = self._process_action_masks(labels)
-
-                 # Reshape noisy actions into individual action tokens
-                 # noisy_actions: (B, chunk_len, action_dim) -> (B, chunk_len * action_dim, 1)
-                 B = noisy_actions.shape[0]
-                 noisy_actions = noisy_actions.reshape(B, -1).unsqueeze(-1)
-
-                 # Project noisy action tokens into language model embedding space
-                 noisy_action_features = noisy_action_projector(noisy_actions)  # (B, chunk_len * action_dim, llm_dim)
-
-                 # Replace embeddings of the action tokens with noisy action embeddings
-                 input_embeddings = self._replace_input_embeddings(
-                     input_embeddings, all_actions_mask, noisy_action_features
-                 )
-             else:
-                 # Replace the embeddings of the action tokens with zeros
-                 # (Later on, the positional embeddings will be added to them)
-                 all_actions_mask = all_actions_mask.unsqueeze(-1)  # (B, seq_len, 1)
-                 input_embeddings = input_embeddings * ~all_actions_mask
-
-             # Build multimodal embeddings & attention mask
-             multimodal_embeddings, multimodal_attention_mask = self._build_multimodal_attention(
-                 input_embeddings, projected_patch_embeddings, attention_mask
-             )
-
-             # Build labels for multimodal sequence if needed
-             multimodal_labels = self._build_multimodal_labels(labels, projected_patch_embeddings)
-
-             # Dispatch to language model
-             language_model_output = self.language_model(
-                 input_ids=None,
-                 attention_mask=multimodal_attention_mask,
-                 position_ids=None,
-                 past_key_values=None,
-                 inputs_embeds=multimodal_embeddings,
-                 labels=multimodal_labels,
-                 use_cache=use_cache,
-                 output_attentions=output_attentions,
-                 output_hidden_states=output_hidden_states,
-                 return_dict=return_dict,
-             )
-
-         # === Otherwise =>> Assume Invalid! ===
-         elif (input_ids.shape[0] != pixel_values.shape[0]) or (inputs_embeds.shape[0] != pixel_values.shape[0]):
-             raise ValueError("Non-homogeneous batch of (text, image) input -- forward() does not support mixed batches!")
-
-         else:
-             raise ValueError(
-                 "Invalid PrismaticForConditionalGeneration `forward()` call with provided arguments:\n"
-                 f"=> `input_ids` = {input_ids is not None}\n"
-                 f"=> `attention_mask` = {attention_mask is not None}\n"
-                 f"=> `pixel_values` = {pixel_values is not None}\n"
-                 f"=> `labels` = {labels is not None}\n"
-                 f"=> `input_embeds` = {inputs_embeds is not None}\n"
-                 f"=> `past_key_values` = {past_key_values is not None}\n"
-                 f"=> `use_cache` = {use_cache}"
-             )
-
-         # Unpack `language_model_output` and return PrismaticCausalLMOutputWithPast (or tuple if not `return_dict`)
-         if not return_dict:
-             if output_projector_features and (projected_patch_embeddings is not None):
-                 return *language_model_output, projected_patch_embeddings
-
-             return language_model_output
-
-         return PrismaticCausalLMOutputWithPast(
-             loss=language_model_output.loss,
-             logits=language_model_output.logits,
-             past_key_values=language_model_output.past_key_values,
-             hidden_states=language_model_output.hidden_states,
-             attentions=language_model_output.attentions,
-             projector_features=projected_patch_embeddings,
-         )
-
-     # === GenerationMixin Methods ===
-     def prepare_inputs_for_generation(
-         self,
-         input_ids: Optional[torch.Tensor] = None,
-         past_key_values: Optional[List[torch.FloatTensor]] = None,
-         inputs_embeds: Optional[torch.FloatTensor] = None,
-         pixel_values: Optional[torch.FloatTensor] = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         **kwargs: str,
-     ) -> Dict[str, torch.Tensor]:
-         """Borrowed from `LlamaForCausalLM` and simplified for batch size = 1; mirrors original PrismaticVLM logic."""
-         if ((input_ids is not None) and (input_ids.shape[0] > 1)) or (
-             (inputs_embeds is not None) and (inputs_embeds.shape[0] > 1)
-         ):
-             raise ValueError("Generation with batch size > 1 is not currently supported!")
-
-         # Handle `past_key_values` (cache) =>> assume `input_ids` just has unprocessed tokens
-         if past_key_values is not None:
-             input_ids = input_ids[:, -1:]
-
-         # If `input_embeds` are passed, we only want to use them in the 1st generation step
-         if inputs_embeds is not None and past_key_values is None:
-             model_inputs = {"input_embeds": inputs_embeds}
-         else:
-             model_inputs = {"input_ids": input_ids}
-
-         # Make sure `pixel_values` are preserved in `model_inputs`
-         model_inputs.update(
-             {
-                 "attention_mask": attention_mask,
-                 "pixel_values": pixel_values,
-                 "past_key_values": past_key_values,
-                 "use_cache": kwargs.get("use_cache"),
-             }
-         )
-
-         return model_inputs
-
-     # Defer to Language Model (all handle this differently, with different return types)
-     def _reorder_cache(self, *args, **kwargs) -> Any:
-         return self.language_model._reorder_cache(*args, **kwargs)
-
-
- class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
-     config_class: PretrainedConfig = OpenVLAConfig
-
-     def __init__(self, config: OpenVLAConfig) -> None:
-         super().__init__(config)
-         self.norm_stats = config.norm_stats
-
-         # Compute action bins
-         self.bins = np.linspace(-1, 1, config.n_action_bins)
-         self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2.0
-
-         # Compute vocab size for de-tokenization -- revert added "multiple of"
-         self.vocab_size = self.config.text_config.vocab_size - self.config.pad_to_multiple_of
-
-     def _prepare_input_for_action_prediction(self, input_ids, attention_mask):
-         """Prepares input for action prediction by adding necessary tokens"""
-         # Add (ACTION_DIM * NUM_ACTIONS_CHUNK) placeholder tokens to input_ids to simulate action tokens
-         placeholder_action_token_ids = (
-             torch.ones((input_ids.shape[0], ACTION_DIM * NUM_ACTIONS_CHUNK)).to(input_ids.device).to(input_ids.dtype)
-         )
-         input_ids = torch.cat([input_ids, placeholder_action_token_ids], dim=-1)
-
-         # Add stop token to sequence (needed in non-causal bi-directional self-attention, as it appears at train time)
-         stop_token_id = torch.ones((input_ids.shape[0], 1)).to(input_ids.device).to(input_ids.dtype) * STOP_INDEX
-         input_ids = torch.cat([input_ids, stop_token_id], dim=-1)
-
-         # Extend the attention mask to fit the new shape of input
-         # Note: Only batch size == 1 supported right now
-         mask_extension = (
-             torch.ones((attention_mask.shape[0], input_ids.shape[-1] - attention_mask.shape[-1]))
-             .to(attention_mask.device)
-             .to(attention_mask.dtype)
-         )
-         attention_mask = torch.cat([attention_mask, mask_extension], dim=-1)
-
-         return input_ids, attention_mask
-
-     def _prepare_labels_for_action_prediction(self, labels, input_ids):
-         """Creates labels tensor for action prediction if not provided"""
-         # Extend labels tensor with fake action labels
-         ARBITRARY_ACTION_TOKEN_IDX = ACTION_TOKEN_BEGIN_IDX + 1
-         labels_extension = (
-             torch.ones((labels.shape[0], input_ids.shape[-1] - labels.shape[-1])).to(labels.device).to(labels.dtype)
-             * ARBITRARY_ACTION_TOKEN_IDX
-         )
-         labels = torch.cat([labels, labels_extension], dim=-1)
-
-         # Replace last label token with stop token
-         labels[:, -1] = STOP_INDEX
-
-         return labels
-
-     def _unnormalize_actions(self, normalized_actions, unnorm_key=None):
-         """Unnormalize actions using dataset statistics"""
-         action_norm_stats = self.get_action_stats(unnorm_key)
-
-         if ACTION_PROPRIO_NORMALIZATION_TYPE == NormalizationType.BOUNDS:
-             mask = action_norm_stats.get("mask", np.ones_like(action_norm_stats["min"], dtype=bool))
-             action_high, action_low = np.array(action_norm_stats["max"]), np.array(action_norm_stats["min"])
-         elif ACTION_PROPRIO_NORMALIZATION_TYPE == NormalizationType.BOUNDS_Q99:
-             mask = action_norm_stats.get("mask", np.ones_like(action_norm_stats["q01"], dtype=bool))
-             action_high, action_low = np.array(action_norm_stats["q99"]), np.array(action_norm_stats["q01"])
-         else:
-             raise ValueError("Unsupported action/proprio normalization type detected!")
-
-         actions = np.where(
-             mask,
-             0.5 * (normalized_actions + 1) * (action_high - action_low + 1e-8) + action_low,
-             normalized_actions,
-         )
-
-         return actions
-
-     def _run_diffusion_prediction(
-         self,
-         input_embeddings,
-         all_actions_mask,
-         noise,
-         action_head,
-         projected_patch_embeddings,
-         labels,
-         attention_mask,
-         NUM_PATCHES,
-         NUM_PROMPT_TOKENS,
-         noisy_action_projector,
-     ):
-         """Run diffusion-based action prediction"""
-         # Clone embedding for reuse in each timestep
-         orig_projected_patch_embeddings = projected_patch_embeddings.clone()
-         curr_noisy_actions = noise
-
-         # Reverse diffusion: Iteratively denoise to generate action prediction
-         for t in action_head.noise_scheduler.timesteps:
-             # Get diffusion model's noise prediction (conditioned on VLA latent embedding, current noisy action
-             # embedding, and diffusion timestep embedding)
-             timesteps = torch.Tensor([t]).to(labels.device)
-             diffusion_timestep_embeddings = (
-                 action_head.time_encoder(timesteps).to(curr_noisy_actions.dtype).to(curr_noisy_actions.device)
-             )  # (B, llm_dim)
-             diffusion_timestep_embeddings = diffusion_timestep_embeddings.unsqueeze(1)  # (B, 1, llm_dim)
-
-             # [Diffusion] Replace the embeddings of the action tokens with noisy actions
-             # (Later on, the positional embeddings will be added to them)
-
-             # For simplicity, append diffusion timestep embedding to the end of projected vision tokens
-             projected_patch_embeddings = torch.cat(
-                 (orig_projected_patch_embeddings, diffusion_timestep_embeddings), dim=1
-             )
-
-             # Reshape and project noisy actions into language embedding space
-             B = curr_noisy_actions.shape[0]
-             orig_curr_noisy_actions_shape = curr_noisy_actions.shape
-             curr_noisy_actions = curr_noisy_actions.reshape(B, -1).unsqueeze(-1)
-             noisy_action_features = noisy_action_projector(curr_noisy_actions)
-             curr_noisy_actions = curr_noisy_actions.reshape(orig_curr_noisy_actions_shape)
-
-             # Replace action token embeddings with noisy action embeddings
-             input_embeddings = self._replace_input_embeddings(
-                 input_embeddings.clone(), all_actions_mask, noisy_action_features
-             )
-
-             # Build multimodal embeddings and attention mask
-             multimodal_embeddings, multimodal_attention_mask = self._build_multimodal_attention(
-                 input_embeddings, projected_patch_embeddings, attention_mask
-             )
-
-             # Forward pass through language model
-             language_model_output = self.language_model(
-                 input_ids=None,
-                 attention_mask=multimodal_attention_mask,
-                 position_ids=None,
-                 past_key_values=None,
-                 inputs_embeds=multimodal_embeddings,
-                 labels=None,
-                 use_cache=None,
-                 output_attentions=False,
-                 output_hidden_states=True,
-                 return_dict=True,
-             )
-
-             # Extract hidden states for action portion of response
-             last_hidden_states = language_model_output.hidden_states[-1]  # (B, seq_len, D)
-             actions_hidden_states = last_hidden_states[
-                 :,
-                 NUM_PATCHES + NUM_PROMPT_TOKENS : NUM_PATCHES + NUM_PROMPT_TOKENS + ACTION_DIM * NUM_ACTIONS_CHUNK,
-                 :,
-             ]  # (B, act_chunk_len, D)
-
-             # Predict noise and update noisy actions: x_t -> x_{t-1}
-             noise_pred = action_head.predict_noise(actions_hidden_states)
-             curr_noisy_actions = action_head.noise_scheduler.step(noise_pred, t, curr_noisy_actions).prev_sample
-
-         curr_noisy_actions = curr_noisy_actions.reshape(NUM_ACTIONS_CHUNK, ACTION_DIM)
-
-         # Return final actions
-         return curr_noisy_actions.float().cpu().detach().numpy(), actions_hidden_states
-
-     def _regression_or_discrete_prediction(
-         self,
-         input_embeddings,
-         all_actions_mask,
-         projected_patch_embeddings,
-         attention_mask,
-         labels,
-         NUM_PATCHES,
-         NUM_PROMPT_TOKENS,
-         action_head=None,
-     ):
-         """Run L1 regression-based continuous action prediction or discrete action tokens prediction."""
-         # Zero out action token embeddings
-         all_actions_mask = all_actions_mask.unsqueeze(-1)  # (B, seq_len, 1)
-         input_embeddings = input_embeddings * ~all_actions_mask
-
-         # Build multimodal embeddings and attention mask
-         multimodal_embeddings, multimodal_attention_mask = self._build_multimodal_attention(
-             input_embeddings, projected_patch_embeddings, attention_mask
-         )
-
-         # Forward pass through language model
-         language_model_output = self.language_model(
-             input_ids=None,
-             attention_mask=multimodal_attention_mask,
-             position_ids=None,
-             past_key_values=None,
-             inputs_embeds=multimodal_embeddings,
-             labels=None,
-             use_cache=None,
-             output_attentions=False,
-             output_hidden_states=True,
-             return_dict=True,
-         )
-
-         # Extract hidden states for action tokens
-         last_hidden_states = language_model_output.hidden_states[-1]  # (B, seq_len, D)
-         actions_hidden_states = last_hidden_states[
-             :,
-             NUM_PATCHES + NUM_PROMPT_TOKENS : NUM_PATCHES + NUM_PROMPT_TOKENS + ACTION_DIM * NUM_ACTIONS_CHUNK,
-             :,
-         ]  # (B, act_chunk_len, D)
-
-         # Handle different prediction methods
-         if action_head is not None:
-             # L1 regression prediction
-             normalized_actions = action_head.predict_action(actions_hidden_states)
-             normalized_actions = normalized_actions.reshape(NUM_ACTIONS_CHUNK, ACTION_DIM)
-             normalized_actions = normalized_actions.float().cpu().detach().numpy()
-         else:
-             # Discrete token-based prediction
-             predicted_action_token_ids = (
-                 language_model_output.logits[
-                     :,
-                     NUM_PATCHES + NUM_PROMPT_TOKENS : NUM_PATCHES + NUM_PROMPT_TOKENS + ACTION_DIM * NUM_ACTIONS_CHUNK,
-                 ]
-                 .argmax(dim=2)
-                 .cpu()
-                 .numpy()
-             )
-             discretized_actions = self.vocab_size - predicted_action_token_ids
-             discretized_actions = np.clip(discretized_actions - 1, a_min=0, a_max=self.bin_centers.shape[0] - 1)
-             normalized_actions = self.bin_centers[discretized_actions]
-             normalized_actions = normalized_actions.reshape(NUM_ACTIONS_CHUNK, ACTION_DIM)
-
-         return normalized_actions, actions_hidden_states
-
-     def predict_action(
-         self,
-         input_ids: Optional[torch.LongTensor] = None,
-         unnorm_key: Optional[str] = None,
-         proprio=None,
-         proprio_projector=None,
-         action_head=None,
-         noisy_action_projector=None,
-         use_film: bool = False,
-         **kwargs: str,
-     ) -> np.ndarray:
-         """Predict actions from input sequence, with options for different prediction methods.
-
-         Args:
-             input_ids: Input token ids
-             unnorm_key: Key for unnormalization statistics
-             proprio: Proprioceptive features
-             proprio_projector: Projector for proprioceptive features
-             action_head: Optional head for L1 regression or diffusion-based prediction
-             noisy_action_projector: Projector for noisy actions in diffusion-based prediction
-             use_film: Whether to use FiLM conditioning
-             **kwargs: Additional arguments including pixel_values and attention_mask
-
-         Returns:
-             Tuple of (unnormalized_actions, action_hidden_states)
-         """
-         # If the special empty token ('') does not already appear after the colon (':') token in the prompt
-         # (after "OUT:" or "ASSISTANT:"), insert it to match the inputs seen at training time
-         if not torch.all(input_ids[:, -1] == 29871):
-             input_ids = torch.cat(
-                 (input_ids, torch.unsqueeze(torch.Tensor([29871]).long(), dim=0).to(input_ids.device)), dim=1
-             )
-
-         pixel_values = kwargs["pixel_values"]
-         attention_mask = kwargs["attention_mask"]
-
-         # Create fake labels tensor (needed for action mask)
-         labels = input_ids.clone()
-         labels[:] = IGNORE_INDEX
-
-         # Get number of tokens in prompt (excluding the start token)
-         NUM_PROMPT_TOKENS = input_ids.shape[-1] - 1  # Subtract action tokens and stop token
-
-         # Prepare inputs by adding necessary tokens
-         input_ids, attention_mask = self._prepare_input_for_action_prediction(input_ids, attention_mask)
-
-         # Update labels tensor for action mask computation later
-         labels = self._prepare_labels_for_action_prediction(labels, input_ids)
-
-         # Get input embeddings and action masks
-         input_embeddings = self.get_input_embeddings()(input_ids)
-         all_actions_mask = self._process_action_masks(labels)
-
-         # Extract language embeddings
-         language_embeddings = input_embeddings[~all_actions_mask].reshape(
-             input_embeddings.shape[0], -1, input_embeddings.shape[2]
-         )
-
-         # Process vision features
-         projected_patch_embeddings = self._process_vision_features(pixel_values, language_embeddings, use_film)
-
-         # Add proprioceptive features if provided
-         use_proprio = proprio_projector is not None and proprio is not None
-         if use_proprio:
-             proprio = torch.Tensor(proprio).to(projected_patch_embeddings.device, dtype=projected_patch_embeddings.dtype)
-             projected_patch_embeddings = self._process_proprio_features(
-                 projected_patch_embeddings, proprio, proprio_projector
-             )
-
-         # Use diffusion if provided, otherwise use regression or discrete prediction
-         use_diffusion = noisy_action_projector is not None and hasattr(action_head, "noise_scheduler")
-
-         # Calculate number of patches (including proprio token and/or diffusion timestep embedding if present)
-         NUM_PATCHES = self.vision_backbone.get_num_patches() * self.vision_backbone.get_num_images_in_input()
-         if use_proprio:
-             NUM_PATCHES += 1
-         if use_diffusion:
-             NUM_PATCHES += 1
-
-         if use_diffusion:
-             # Sample random noise with shape equal to output action, used as the starting state for reverse diffusion
-             noise = torch.randn(
-                 size=(1, NUM_ACTIONS_CHUNK, ACTION_DIM), device=input_embeddings.device, dtype=input_embeddings.dtype
-             )
-
-             # Run diffusion-based prediction
-             normalized_actions, actions_hidden_states = self._run_diffusion_prediction(
-                 input_embeddings,
-                 all_actions_mask,
-                 noise,
-                 action_head,
-                 projected_patch_embeddings,
-                 labels,
-                 attention_mask,
-                 NUM_PATCHES,
-                 NUM_PROMPT_TOKENS,
-                 noisy_action_projector,
-             )
-         else:
-             # Run regression or discrete token-based prediction
-             normalized_actions, actions_hidden_states = self._regression_or_discrete_prediction(
1045
- input_embeddings,
1046
- all_actions_mask,
1047
- projected_patch_embeddings,
1048
- attention_mask,
1049
- labels,
1050
- NUM_PATCHES,
1051
- NUM_PROMPT_TOKENS,
1052
- action_head,
1053
- )
1054
-
1055
- # Unnormalize predicted actions
1056
- actions = self._unnormalize_actions(normalized_actions, unnorm_key)
1057
-
1058
- return actions, actions_hidden_states
1059
-
1060
- @staticmethod
1061
- def _check_unnorm_key(norm_stats: Dict[str, Dict[str, Any]], unnorm_key: Optional[str]) -> str:
1062
- """Validate and resolve the unnormalization key for action statistics"""
1063
- if unnorm_key is None:
1064
- assert len(norm_stats) == 1, (
1065
- f"Your model was trained on more than one dataset, "
1066
- f"please pass a `unnorm_key` from the following options to choose the statistics "
1067
- f"used for un-normalizing actions: {norm_stats.keys()}"
1068
- )
1069
- unnorm_key = next(iter(norm_stats.keys()))
1070
-
1071
- assert unnorm_key in norm_stats, (
1072
- f"The `unnorm_key` you chose is not in the set of available dataset statistics, "
1073
- f"please choose from: {norm_stats.keys()}"
1074
- )
1075
- return unnorm_key
1076
-
1077
- def get_action_dim(self, unnorm_key: Optional[str] = None) -> int:
1078
- """Get the dimensionality of the policy's action space."""
1079
- unnorm_key = self._check_unnorm_key(self.norm_stats, unnorm_key)
1080
- return len(self.norm_stats[unnorm_key]["action"]["min"])
1081
-
1082
- def get_action_stats(self, unnorm_key: Optional[str] = None) -> Dict[str, Any]:
1083
- """Get all the logged statistics for the given dataset."""
1084
- unnorm_key = self._check_unnorm_key(self.norm_stats, unnorm_key)
1085
- return self.norm_stats[unnorm_key]["action"]
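
For orientation, here is a minimal, hedged sketch of how the `predict_action` entry point above is typically driven at inference time. The checkpoint id, image path, and `unnorm_key` below are placeholders rather than values taken from this repo, and the AutoClass loading pattern assumes the checkpoint registers these classes via `trust_remote_code`:

import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

CKPT = "openvla/openvla-7b"  # placeholder checkpoint id
processor = AutoProcessor.from_pretrained(CKPT, trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(CKPT, torch_dtype=torch.bfloat16, trust_remote_code=True).to("cuda")

image = Image.open("obs.png")  # placeholder camera frame
prompt = "In: What action should the robot take to pick up the cup?\nOut:"
inputs = processor(prompt, image).to("cuda", dtype=torch.bfloat16)

# predict_action consumes input_ids plus pixel_values / attention_mask via **kwargs and
# returns (unnormalized action chunk, action hidden states)
actions, _ = vla.predict_action(**inputs, unnorm_key="bridge_orig")  # unnorm_key is a placeholder
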
capvector-oft/prismatic/extern/hf/processing_prismatic.py DELETED
@@ -1,252 +0,0 @@
- """
- processing_prismatic.py
-
- HuggingFace-style preprocessor definitions for Prismatic VLMs, inheriting from `ProcessorMixin`. Default configuration
- specifies `siglip-224px+7b`.
- """
-
- from typing import Any, ClassVar, List, Optional, Tuple, Union
-
- import timm.data
- import torch
- import torchvision.transforms.functional as TVF
- from PIL import Image
- from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
- from transformers import PreTrainedTokenizerBase
- from transformers.image_processing_utils import BatchFeature, ImageProcessingMixin
- from transformers.processing_utils import ProcessorMixin
- from transformers.tokenization_utils import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
- from transformers.utils import TensorType
-
-
- # === Image Processing ===
- def letterbox_pad_transform(image: Image.Image, padding_fill_value: Tuple[int, int, int]) -> Image.Image:
-     """Given a PIL.Image, pad to square by adding a symmetric border around the height/width."""
-     (w, h), max_wh = image.size, max(image.size)
-     horizontal_pad, vertical_pad = int((max_wh - w) / 2), int((max_wh - h) / 2)
-     padding = (horizontal_pad, vertical_pad, horizontal_pad, vertical_pad)
-
-     return TVF.pad(image, padding, fill=padding_fill_value, padding_mode="constant")
-
-
- class PrismaticImageProcessor(ImageProcessingMixin):
-     model_input_names: ClassVar[List[str]] = ["pixel_values"]
-
-     def __init__(
-         self,
-         use_fused_vision_backbone: bool = False,
-         image_resize_strategy: str = "letterbox",
-         input_sizes: Optional[List[Tuple[int, int, int]]] = None,
-         interpolations: Optional[List[str]] = None,
-         means: Optional[List[Tuple[float, float, float]]] = None,
-         stds: Optional[List[Tuple[float, float, float]]] = None,
-         **kwargs: str,
-     ) -> None:
-         """
-         Initialize a PrismaticImageProcessor as a wrapper around a torchvision transform; this transform will be
-         created by TIMM, and edited to follow our custom `image_resize_strategy` logic.
-         @param use_fused_vision_backbone: Boolean indicating single or fused (dual) vision backbone
-         @param image_resize_strategy: Prismatic image resize strategy in < resize-naive | resize-crop | letterbox >
-         @param input_sizes: [TIMM :: `data_cfg`] Input image size as tuple (channels, width, height)
-         @param interpolations: [TIMM :: `data_cfg`] Interpolation as string (default: "bicubic")
-         @param means: [TIMM :: `data_cfg`] Normalization mean as float tuple (or two-tuple if `fused_backbone`)
-         @param stds: [TIMM :: `data_cfg`] Normalization std as float tuple (or two-tuple if `fused_backbone`)
-         """
-         self.use_fused_vision_backbone = use_fused_vision_backbone
-         self.image_resize_strategy = image_resize_strategy
-
-         # Handle `None` default values
-         input_sizes = [(3, 224, 224)] if input_sizes is None else input_sizes
-         means = [(0.5, 0.5, 0.5)] if means is None else means
-         stds = [(0.5, 0.5, 0.5)] if stds is None else stds
-
-         # TIMM `data_cfg` Parameters
-         self.input_sizes, self.interpolations, self.means, self.stds = input_sizes, interpolations, means, stds
-
-         # Grab torchvision transforms via TIMM =>> need to parse for specific "functional" transform values!
-         self.tvf_resize_params, self.tvf_crop_params, self.tvf_normalize_params = [], [], []
-         self.tvf_do_letterbox, self.tvf_letterbox_fill = False, None
-
-         for idx in range(len(input_sizes)):
-             transform = timm.data.create_transform(
-                 input_size=self.input_sizes[idx],
-                 interpolation=self.interpolations[idx],
-                 mean=self.means[idx],
-                 std=self.stds[idx],
-                 crop_pct=1.0,        # Set to 1.0 to ignore cropping (initial Resize sets `input_size`)
-                 crop_mode="center",  # Default crop mode -- no-op when `crop_pct == 1.0`
-                 is_training=False,   # No image augmentations when loading the transform!
-             )
-
-             # [Validation] Ensure appropriate transform structure, expected sizes
-             if not (
-                 isinstance(transform, Compose)
-                 and (len(transform.transforms) == 4)
-                 and isinstance(transform.transforms[0], Resize)
-                 and isinstance(transform.transforms[1], CenterCrop)
-                 and isinstance(transform.transforms[2], ToTensor)
-                 and isinstance(transform.transforms[3], Normalize)
-                 and (transform.transforms[0].size == self.input_sizes[idx][-1])
-                 and (transform.transforms[1].size == self.input_sizes[idx][-2:])
-             ):
-                 raise ValueError(f"Unexpected TIMM image transformation structure/sizes: `{transform}`")
-
-             # HF Image Processors *must* be JSON-serializable; as such, cannot have torchvision transforms as attributes.
-             # => Instead, we're going to parse the transform and call "torchvision.transforms.functional" (`tvf`)
-             resize_t, crop_t, norm_t = transform.transforms[0], transform.transforms[1], transform.transforms[3]
-             self.tvf_resize_params.append(
-                 {
-                     "size": resize_t.size,
-                     "interpolation": TVF.pil_modes_mapping[resize_t.interpolation],
-                     "max_size": None,
-                     "antialias": True,
-                 }
-             )
-             self.tvf_crop_params.append({"output_size": crop_t.size})
-             self.tvf_normalize_params.append(
-                 {
-                     "mean": norm_t.mean.float().numpy().tolist(),
-                     "std": norm_t.std.float().numpy().tolist(),
-                     "inplace": False,
-                 }
-             )
-             self.tvf_do_letterbox, self.tvf_letterbox_fill = False, None
-
-             # Handle Prismatic `image_resize_strategy`
-             if self.image_resize_strategy == "resize-naive":
-                 self.tvf_resize_params[idx]["size"] = (resize_t.size, resize_t.size)
-             elif self.image_resize_strategy == "letterbox":
-                 self.tvf_do_letterbox, self.tvf_letterbox_fill = True, tuple([int(x * 255) for x in self.means[idx]])
-             elif self.image_resize_strategy == "resize-crop":
-                 pass
-             else:
-                 raise ValueError(f"Image resize strategy `{self.image_resize_strategy}` is not supported!")
-
-         # Dispatch **kwargs to super()
-         super().__init__(**kwargs)
-
-     def apply_transform(self, img: Image.Image) -> torch.Tensor:
-         """Apply `functional` variant of TIMM's Transform = Compose([Resize -> CenterCrop -> ToTensor -> Normalize])"""
-         if self.tvf_do_letterbox:
-             img = letterbox_pad_transform(img, self.tvf_letterbox_fill)
-
-         # [Contract] Fused Backbones expect "channel-stacked" inputs; we'll unpack on the model side!
-         imgs_t = []
-         for idx in range(len(self.input_sizes)):
-             img_idx = TVF.resize(img, **self.tvf_resize_params[idx])
-             img_idx = TVF.center_crop(img_idx, **self.tvf_crop_params[idx])
-             img_idx_t = TVF.to_tensor(img_idx)
-             img_idx_t = TVF.normalize(img_idx_t, **self.tvf_normalize_params[idx])
-             imgs_t.append(img_idx_t)
-
-         # [Contract] `imgs_t` is a list of Tensors of shape [3, input_size, input_size]; stack along dim = 0
-         img_t = torch.vstack(imgs_t)
-
-         return img_t
-
-     def preprocess(
-         self,
-         images: Union[Image.Image, List[Image.Image]],
-         return_tensors: Optional[Union[str, TensorType]] = None,
-         **_: str,
-     ) -> BatchFeature:
-         """
-         Preprocess an image (or batch of images); note that unlike the `transformers :: BaseImageProcessor` we
-         explicitly only handle PIL.Image.Image instances for simplicity.
-         @param images: A (batch of) PIL.Image.Image instance(s) to preprocess.
-         @param return_tensors: BatchFeature default Tensor format (e.g., "pt" for torch); if None, returns np.ndarray
-         @return: Instance of `transformers :: BatchFeature` with a single key "pixel_values"
-         """
-         if not isinstance(images, list):
-             images = [images]
-
-         # Apply `self.apply_transform` to each image (will return list of torch.Tensors); stack into "batched" Tensor
-         pixel_values = torch.stack([self.apply_transform(img.convert("RGB")) for img in images])
-
-         # Return BatchFeature =>> note that for compatibility, constructor expects Dict[str, np.ndarray], so we convert
-         return BatchFeature(data={"pixel_values": pixel_values.float().numpy()}, tensor_type=return_tensors)
-
-     def __call__(self, images: Union[Image.Image, List[Image.Image]], **kwargs) -> BatchFeature:
-         return self.preprocess(images, **kwargs)
-
-
- # === PrismaticProcessor =>> Wraps both ImageProcessor and Tokenizer ===
- # =>> https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava/processing_llava.py
- class PrismaticProcessor(ProcessorMixin):
-     attributes: ClassVar[List[str]] = ["image_processor", "tokenizer"]
-     image_processor_class: str = "AutoImageProcessor"
-     tokenizer_class: str = "AutoTokenizer"
-
-     def __init__(
-         self,
-         image_processor: Optional[ImageProcessingMixin] = None,
-         tokenizer: Optional[PreTrainedTokenizerBase] = None,
-     ) -> None:
-         super().__init__(image_processor, tokenizer)
-
-     def __call__(
-         self,
-         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
-         images: Union[Image.Image, List[Image.Image]],
-         padding: Union[bool, str, PaddingStrategy] = False,
-         truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
-         max_length: Optional[int] = None,
-         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
-     ) -> BatchFeature:
-         """
-         Preprocess a given (batch) of text/images for a Prismatic VLM; forwards text to the underlying LLM's tokenizer,
-         forwards images to PrismaticImageProcessor.
-         @param text: The (batch) of text to encode; must be a string or list of strings.
-         @param images: A (batch of) PIL.Image.Image instance(s) to preprocess.
-         @param padding: Sequence padding strategy (if multiple specified) in < True = "longest" | "max_length" | False >
-         @param truncation: Truncation strategy for the output sequences; requires `max_length` to be specified
-         @param max_length: Maximum length (in tokens) to truncate
-         @param return_tensors: Type of return tensors (usually "pt" or TensorType.PYTORCH)
-         @return: BatchFeature with keys for `input_ids`, `attention_mask` and `pixel_values`.
-         """
-         pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"]
-         text_inputs = self.tokenizer(
-             text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
-         )
-
-         # [Validate] Need same number of images and text inputs!
-         if pixel_values.shape[0] != text_inputs.input_ids.shape[0]:
-             raise ValueError("Batch is malformed; expected same number of images and text inputs!")
-
-         return BatchFeature(data={**text_inputs, "pixel_values": pixel_values})
-
-     # === Tokenizer Dispatch Utilities =>> check `PreTrainedTokenizerBase` for documentation ===
-     def batch_decode(
-         self,
-         sequences: Union[List[int], List[List[int]], torch.Tensor, Any],  # `Any` = np.ndarray | tf.Tensor
-         skip_special_tokens: bool = False,
-         clean_up_tokenization_spaces: Optional[bool] = None,
-         **kwargs: str,
-     ) -> List[str]:
-         return self.tokenizer.batch_decode(
-             sequences=sequences,
-             skip_special_tokens=skip_special_tokens,
-             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-             **kwargs,
-         )
-
-     def decode(
-         self,
-         token_ids: Union[int, List[int], torch.Tensor, Any],  # `Any` = np.ndarray | tf.Tensor
-         skip_special_tokens: bool = False,
-         clean_up_tokenization_spaces: Optional[bool] = None,
-         **kwargs: str,
-     ) -> str:
-         return self.tokenizer.decode(
-             token_ids=token_ids,
-             skip_special_tokens=skip_special_tokens,
-             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-             **kwargs,
-         )
-
-     @property
-     def model_input_names(self) -> List[str]:
-         tokenizer_input_names = self.tokenizer.model_input_names
-         image_processor_input_names = self.image_processor.model_input_names
-
-         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
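
A hedged usage sketch for the two processor classes above. The Llama-2 hub path (gated) and the image path are placeholders; note that `interpolations` has no `None` default in `__init__`, so it is passed explicitly here to avoid indexing into `None`:

from PIL import Image
from transformers import AutoTokenizer

image_processor = PrismaticImageProcessor(image_resize_strategy="letterbox", interpolations=["bicubic"])
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # gated; placeholder
processor = PrismaticProcessor(image_processor=image_processor, tokenizer=tokenizer)

batch = processor("In: What is in this image?\nOut:", Image.open("example.png"))
print(batch["input_ids"].shape, batch["pixel_values"].shape)  # e.g., (1, L) and (1, 3, 224, 224)
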
capvector-oft/prismatic/models/__init__.py DELETED
@@ -1,2 +0,0 @@
- from .load import available_model_names, available_models, get_model_description, load, load_vla
- from .materialize import get_llm_backbone_and_tokenizer, get_vision_backbone_and_transform, get_vlm
capvector-oft/prismatic/models/action_heads.py DELETED
@@ -1,211 +0,0 @@
- """Implementations of various action heads, which serve as alternatives to VLM sequential token prediction."""
-
- import math
-
- import numpy as np
- import torch
- import torch.nn as nn
- from diffusers.schedulers.scheduling_ddim import DDIMScheduler
- from prismatic.vla.constants import ACTION_DIM, ACTION_TOKEN_BEGIN_IDX, IGNORE_INDEX, NUM_ACTIONS_CHUNK, PROPRIO_DIM, STOP_INDEX
-
-
- class SinusoidalPositionalEncoding(nn.Module):
-     """
-     Sine- and cosine-based positional encoding that produces embeddings of a batch of timesteps.
-
-     For example, at train time, the input might be a batch of 32 randomly sampled diffusion timesteps -> shape (32,)
-     Then the output would be a batch of 32 timestep embeddings -> shape (32, D)
-
-     Adapted from: https://github.com/real-stanford/diffusion_policy/blob/main/diffusion_policy/model/diffusion/positional_embedding.py
-     """
-
-     def __init__(self, dim):
-         super().__init__()
-         self.dim = dim  # dimensionality of the positional encoding
-
-     def forward(self, x):
-         # x: (batch_size,)
-         device = x.device
-         assert self.dim % 2 == 0, f"# dimensions must be even but got {self.dim}"
-         half_dim = self.dim // 2
-         exponent = torch.arange(half_dim, device=device) * -math.log(10000) / (half_dim - 1)  # shape: (D/2,)
-         emb = torch.exp(exponent)  # shape: (D/2,)
-         emb = x[:, None] * emb[None, :]  # shape: (batch_size, 1) * (1, D/2) -> (batch_size, D/2)
-         emb = torch.cat((emb.sin(), emb.cos()), dim=-1)  # shape: (batch_size, D)
-         return emb
-
-
- class MLPResNetBlock(nn.Module):
-     """One MLP ResNet block with a residual connection."""
-     def __init__(self, dim):
-         super().__init__()
-         self.dim = dim
-         self.ffn = nn.Sequential(  # feedforward network, similar to the ones in Transformers
-             nn.LayerNorm(dim),
-             nn.Linear(dim, dim),
-             nn.ReLU(),
-         )
-
-     def forward(self, x):
-         # x: (batch_size, hidden_dim)
-         # We follow the module ordering of "Pre-Layer Normalization" feedforward networks in Transformers as
-         # described here: https://arxiv.org/pdf/2002.04745.pdf
-         identity = x
-         x = self.ffn(x)
-         x = x + identity
-         return x
-
-
- class MLPResNet(nn.Module):
-     """MLP with residual connection blocks."""
-     def __init__(self, num_blocks, input_dim, hidden_dim, output_dim):
-         super().__init__()
-         self.layer_norm1 = nn.LayerNorm(input_dim)
-         self.fc1 = nn.Linear(input_dim, hidden_dim)
-         self.relu = nn.ReLU()
-         self.mlp_resnet_blocks = nn.ModuleList()
-         for _ in range(num_blocks):
-             self.mlp_resnet_blocks.append(MLPResNetBlock(dim=hidden_dim))
-         self.layer_norm2 = nn.LayerNorm(hidden_dim)
-         self.fc2 = nn.Linear(hidden_dim, output_dim)
-
-     def forward(self, x):
-         # x: (batch_size, input_dim)
-         x = self.layer_norm1(x)  # shape: (batch_size, input_dim)
-         x = self.fc1(x)  # shape: (batch_size, hidden_dim)
-         x = self.relu(x)  # shape: (batch_size, hidden_dim)
-         for block in self.mlp_resnet_blocks:
-             x = block(x)  # shape: (batch_size, hidden_dim)
-         x = self.layer_norm2(x)  # shape: (batch_size, hidden_dim)
-         x = self.fc2(x)  # shape: (batch_size, output_dim)
-         return x
-
-
- class L1RegressionActionHead(nn.Module):
-     """Simple MLP-based action head that generates continuous actions via L1 regression."""
-     def __init__(
-         self,
-         input_dim=4096,
-         hidden_dim=4096,
-         action_dim=7,
-     ):
-         super().__init__()
-         self.action_dim = action_dim
-         self.model = MLPResNet(
-             num_blocks=2, input_dim=input_dim * ACTION_DIM, hidden_dim=hidden_dim, output_dim=action_dim
-         )
-
-     def predict_action(self, actions_hidden_states):
-         # actions_hidden_states: last hidden states of Transformer corresponding to action tokens in sequence
-         # - shape: (batch_size, chunk_len * action_dim, hidden_dim)
-         batch_size = actions_hidden_states.shape[0]
-         device = actions_hidden_states.device
-         rearranged_actions_hidden_states = actions_hidden_states.reshape(batch_size, NUM_ACTIONS_CHUNK, -1)
-         action = self.model(rearranged_actions_hidden_states)
-         return action
-
-
- class NoisePredictionModel(nn.Module):
-     """
-     Diffusion noise prediction model that takes an observation embedding (which fuses the
-     noisy action, diffusion timestep, and image-language observation embeddings) and
-     outputs a noise prediction.
-     """
-
-     def __init__(
-         self,
-         transformer_hidden_dim,  # Transformer hidden embedding size
-         hidden_dim,              # MLP hidden size
-         action_dim=7,            # action dimensionality
-     ):
-         super().__init__()
-         self.mlp_resnet = MLPResNet(
-             num_blocks=2,
-             input_dim=transformer_hidden_dim,
-             hidden_dim=hidden_dim,
-             output_dim=action_dim,
-         )
-
-     def forward(
-         self,
-         obs,
-     ):
-         # obs: observation embeddings to condition the generation on
-         # - shape: (batch_size, chunk_len, rearranged_hidden_dim=action_dim*hidden_dim)
-         #
-         # output: predicted noise
-         # - shape: (batch_size, chunk_len, action_dim)
-         output = self.mlp_resnet(obs)
-         return output
-
-
- class DiffusionActionHead(nn.Module):
-     """
-     Simple MLP-based action head that generates continuous actions via conditional denoising diffusion process.
-
-     Loosely inspired by: https://github.com/real-stanford/diffusion_policy/blob/main/diffusion_policy/model/diffusion/transformer_for_diffusion.py
-     """
-
-     def __init__(
-         self,
-         input_dim=4096,
-         hidden_dim=4096,
-         action_dim=7,
-         num_diffusion_steps_train=50,
-     ):
-         super().__init__()
-         self.action_dim = action_dim
-         self.noise_predictor = NoisePredictionModel(
-             transformer_hidden_dim=hidden_dim * ACTION_DIM, hidden_dim=hidden_dim, action_dim=action_dim
-         )
-         self.num_diffusion_steps_train = num_diffusion_steps_train
-         self.noise_scheduler = DDIMScheduler(num_train_timesteps=num_diffusion_steps_train, beta_schedule="squaredcos_cap_v2")
-         self.time_encoder = SinusoidalPositionalEncoding(dim=hidden_dim)
-
-     def sample_noisy_actions(self, ground_truth_actions):
-         """
-         Samples noise and applies noise to ground-truth actions to produce noisy actions, which are
-         used as input in the noise prediction network. Returns noise, noisy actions, and the
-         corresponding diffusion timestep embeddings.
-         """
-         # ground_truth_actions: ground-truth actions
-         # - shape: (batch_size, chunk_len, action_dim)
-         batch_size = ground_truth_actions.shape[0]
-         device = ground_truth_actions.device
-         # Sample random noise with shape equal to actions, used for closed-form forward diffusion.
-         noise = torch.randn(size=(batch_size, NUM_ACTIONS_CHUNK, ACTION_DIM), device=device, dtype=ground_truth_actions.dtype)  # (B, chunk_len, action_dim)
-         # Sample random diffusion timesteps (one for each action in batch).
-         timesteps = torch.randint(
-             low=0, high=self.noise_scheduler.config.num_train_timesteps, size=(batch_size,), device=device
-         )
-         # Add noise to clean actions according to the magnitude at each diffusion timestep via
-         # closed-form forward diffusion.
-         noisy_actions = self.noise_scheduler.add_noise(ground_truth_actions, noise, timesteps)  # (B, chunk_len, action_dim)
-
-         # Get diffusion timestep embeddings as well
-         diffusion_timestep_embeddings = self.time_encoder(timesteps).to(noisy_actions.dtype).to(noisy_actions.device)  # (B, llm_dim)
-         diffusion_timestep_embeddings = diffusion_timestep_embeddings.unsqueeze(1)  # (B, 1, llm_dim)
-
-         return_dict = dict(
-             noise=noise,
-             noisy_actions=noisy_actions,
-             diffusion_timestep_embeddings=diffusion_timestep_embeddings,
-         )
-
-         return return_dict
-
-     def predict_noise(self, actions_hidden_states):
-         """
-         Given a batch of last hidden Transformer layer embeddings (which fuse the vision-language observation embeddings,
-         noisy action embeddings, and diffusion timestep embedding), predicts the noise applied to the actions.
-         """
-         # actions_hidden_states: last hidden states of Transformer corresponding to action tokens in sequence
-         # - shape: (batch_size, chunk_len * action_dim, hidden_dim)
-         batch_size = actions_hidden_states.shape[0]
-         device = actions_hidden_states.device
-         rearranged_actions_hidden_states = actions_hidden_states.reshape(batch_size, NUM_ACTIONS_CHUNK, -1)  # (batch_size, chunk_len, action_dim * hidden_dim)
-         # Get diffusion model's noise prediction.
-         noise_pred = self.noise_predictor(rearranged_actions_hidden_states)
-         return noise_pred
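
As a shape check, a hedged sketch of driving `L1RegressionActionHead` with dummy hidden states. `ACTION_DIM` and `NUM_ACTIONS_CHUNK` must match the values in `prismatic.vla.constants`; the 7 and 8 used below are illustrative assumptions, not values confirmed by this repo:

import torch

B, HIDDEN = 2, 4096
ACTION_DIM_EX, NUM_ACTIONS_CHUNK_EX = 7, 8  # illustrative; the real values come from prismatic.vla.constants

head = L1RegressionActionHead(input_dim=HIDDEN, hidden_dim=HIDDEN, action_dim=ACTION_DIM_EX)

# Last-layer LLM states at the NUM_ACTIONS_CHUNK * ACTION_DIM action-token positions
hidden_states = torch.randn(B, NUM_ACTIONS_CHUNK_EX * ACTION_DIM_EX, HIDDEN)
actions = head.predict_action(hidden_states)  # internally reshaped to (B, chunk_len, action_dim * hidden_dim)
print(actions.shape)  # torch.Size([2, 8, 7]) -- one 7-DoF action per chunk step
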
capvector-oft/prismatic/models/backbones/__init__.py DELETED
File without changes
capvector-oft/prismatic/models/backbones/llm/__init__.py DELETED
@@ -1,4 +0,0 @@
- from .base_llm import LLMBackbone
- from .llama2 import LLaMa2LLMBackbone
- from .mistral import MistralLLMBackbone
- from .phi import PhiLLMBackbone
capvector-oft/prismatic/models/backbones/llm/base_llm.py DELETED
@@ -1,223 +0,0 @@
- """
- base_llm.py
-
- Abstract class definition of a large (autoregressive) language model backbone (LLM), with full annotations of class
- methods, utility functions, and initialization logic.
-
- We also define the generic HFCausalLLMBackbone class here, providing a default interface for loading any HF
- AutoModelForCausalLM (e.g., LlamaForCausalLM). In general, we make the assumption that any given LLM backbone implements
- the AutoModelForCausalLM API (though we may add Seq2Seq models in the future).
-
- We make this assumption to keep the LLM handling in this codebase relatively lightweight, and to inherit all the nice HF
- utilities around different types of decoding/generation strategies.
- """
-
- import warnings
- from abc import ABC, abstractmethod
- from functools import partial
- from typing import Callable, List, Optional, Sequence, Type
-
- import torch
- import torch.nn as nn
- from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
- from transformers import AutoConfig, AutoTokenizer, PreTrainedModel, PreTrainedTokenizerBase
- from transformers.modeling_outputs import CausalLMOutputWithPast
-
- from prismatic.models.backbones.llm.prompting import PromptBuilder
- from prismatic.overwatch import initialize_overwatch
-
- # Suppress HF Deprecation Warnings
- warnings.filterwarnings("ignore", category=FutureWarning)
-
- # Initialize Overwatch =>> Wraps `logging.Logger`
- overwatch = initialize_overwatch(__name__)
-
-
- # === Abstract Base Class for arbitrary HF LLM Backbones ===
- class LLMBackbone(nn.Module, ABC):
-     def __init__(self, llm_backbone_id: str) -> None:
-         super().__init__()
-         self.identifier = llm_backbone_id
-
-         # Instance attributes for an LLM Backbone
-         self.llm: PreTrainedModel = None
-         self.tokenizer: PreTrainedTokenizerBase = None
-
-     def get_tokenizer(self) -> PreTrainedTokenizerBase:
-         return self.tokenizer
-
-     @abstractmethod
-     def get_fsdp_wrapping_policy(self) -> Callable: ...
-
-     @abstractmethod
-     def enable_gradient_checkpointing(self) -> None: ...
-
-     @abstractmethod
-     def forward(
-         self,
-         input_ids: Optional[torch.LongTensor] = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         position_ids: Optional[torch.LongTensor] = None,
-         past_key_values: Optional[List[torch.FloatTensor]] = None,
-         inputs_embeds: Optional[torch.FloatTensor] = None,
-         labels: Optional[torch.LongTensor] = None,
-         use_cache: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         return_dict: Optional[bool] = None,
-     ) -> CausalLMOutputWithPast:
-         """Run a forward pass through the LLM given targets (labels), returning the scalar Cross-Entropy Loss"""
-         raise NotImplementedError
-
-     @abstractmethod
-     def embed_input_ids(self, input_ids: torch.LongTensor) -> torch.Tensor: ...
-
-     @property
-     @abstractmethod
-     def prompt_builder_fn(self) -> Type[PromptBuilder]: ...
-
-     @property
-     @abstractmethod
-     def transformer_layer_cls(self) -> Type[nn.Module]: ...
-
-     @property
-     @abstractmethod
-     def half_precision_dtype(self) -> torch.dtype: ...
-
-     @property
-     @abstractmethod
-     def last_layer_finetune_modules(self) -> Sequence[nn.Module]: ...
-
-     @property
-     def embed_dim(self) -> int:
-         return self.llm.config.hidden_size
-
-     @property
-     def pad_token_id(self) -> int:
-         return self.tokenizer.pad_token_id
-
-
- # === Abstract Base Class for Arbitrary HF Causal LLMs ===
- class HFCausalLLMBackbone(LLMBackbone, ABC):
-     def __init__(
-         self,
-         llm_backbone_id: str,
-         llm_family: str,
-         llm_cls: Type[PreTrainedModel],
-         hf_hub_path: str,
-         llm_max_length: int = 2048,
-         hf_token: Optional[str] = None,
-         inference_mode: bool = False,
-         use_flash_attention_2: bool = False,
-     ) -> None:
-         super().__init__(llm_backbone_id)
-         self.llm_family = llm_family
-         self.llm_max_length = llm_max_length
-         self.inference_mode = inference_mode
-
-         # Initialize LLM (downloading from HF Hub if necessary) --> `llm_cls` is the actual {Model}ForCausalLM class!
-         # => Note: We're eschewing use of the AutoModel API so that we can be more explicit about LLM-specific details
-         if not self.inference_mode:
-             overwatch.info(f"Loading [bold]{llm_family}[/] LLM from [underline]`{hf_hub_path}`[/]", ctx_level=1)
-             self.llm = llm_cls.from_pretrained(
-                 hf_hub_path,
-                 token=hf_token,
-                 use_flash_attention_2=use_flash_attention_2 if not self.inference_mode else False,
-                 # The following parameters are set to prevent `UserWarnings` from HF; we want greedy decoding!
-                 do_sample=False,
-                 temperature=1.0,
-                 top_p=1.0,
-             )
-
-         # [Contract] `inference_mode` means we're loading from a pretrained checkpoint; no need to load base weights!
-         else:
-             overwatch.info(f"Building empty [bold]{llm_family}[/] LLM from [underline]`{hf_hub_path}`[/]", ctx_level=1)
-             llm_config = AutoConfig.from_pretrained(hf_hub_path, token=hf_token)
-             self.llm = llm_cls._from_config(llm_config)
-
-         # Lightweight Handling (with extended explanation) for setting some LLM Parameters
-         # => Set `decoder.use_cache = False` --> incompatible with gradient checkpointing (+ training in general)
-         #
-         #    Reference: https://discuss.huggingface.co/t/what-is-the-purpose-of-use-cache-in-decoder/958
-         self.llm.config.use_cache = False if not self.inference_mode else True
-
-         # => Turns out that when gradient checkpointing is on and the underlying LLM has no "trainable" parameters
-         #    (requires_grad is False), backprop will fail; setting `enable_input_require_grads()` registers a new
-         #    forward hook that fixes this =>> also totally safe for the "full finetuning" setting!
-         if not self.inference_mode:
-             self.llm.enable_input_require_grads()
-
-         # Load (Fast) Tokenizer
-         overwatch.info(f"Loading [bold]{llm_family}[/] (Fast) Tokenizer via the AutoTokenizer API", ctx_level=1)
-         self.tokenizer = AutoTokenizer.from_pretrained(
-             hf_hub_path, model_max_length=self.llm_max_length, token=hf_token, padding_side="right"
-         )
-
-         # Validation =>> Our VLM logic currently operates under the assumption that the tokenization of a new input
-         #                starts with a <BOS> token unless `add_special_tokens = False`; for these models, we empirically
-         #                find that adding image patches *after* the BOS leads to much better performance.
-         #
-         # As a result we explicitly validate that a tokenizer conforms to the expected behavior; if you're reading this
-         # line, it's probably because you're adding a new LLM with a different tokenizer behavior. If so, feel free to
-         # override the `SPECIAL_CASES` set below, but make sure to make the appropriate changes in the `datasets.py`
-         # and VLM `forward()` logic!
-         SPECIAL_CASES = {
-             # Phi-2 Tokenizer doesn't add any BOS tokens by default, and sets BOS == EOS == "<|endoftext|>"
-             # =>> We'll prepend BOS to first input (to play nicely with image token insertion logic; verified that
-             #     this works well with base LLM generation.
-             # =>> Like Llama-2 Tokenizers -- we'll add a special PAD token for training purposes.
-             "phi-2-3b",
-         }
-         if self.identifier in SPECIAL_CASES:
-             return
-
-         # Note =>> this assert should hold for all Llama-derived tokenizers (`LlamaTokenizerFast` ==> includes Mistral!)
-         assert (self.tokenizer("Test 123", add_special_tokens=True).input_ids[0] == self.tokenizer.bos_token_id) and (
-             self.tokenizer("Test 123", add_special_tokens=False).input_ids[0] != self.tokenizer.bos_token_id
-         ), (
-             f"Default Tokenizer of type `{type(self.tokenizer)}` does not automatically prefix inputs with BOS token!\n"
-             "Please read the comment in `base_llm.py` for more information!"
-         )
-
-     def get_fsdp_wrapping_policy(self) -> Callable:
-         """Return a `transformer_auto_wrap_policy` where we wrap each instance of `self.transformer_layer_cls`"""
-         transformer_block_policy = partial(
-             transformer_auto_wrap_policy, transformer_layer_cls={self.transformer_layer_cls}
-         )
-
-         return transformer_block_policy
-
-     def enable_gradient_checkpointing(self) -> None:
-         """Dispatch to underlying LLM instance's `gradient_checkpointing_enable`; defined for all `PretrainedModel`."""
-         self.llm.gradient_checkpointing_enable()
-
-     def embed_input_ids(self, input_ids: torch.LongTensor) -> torch.Tensor:
-         return self.llm.get_input_embeddings()(input_ids)
-
-     # [Contract] Should match the `forward` call of the underlying `llm` instance!
-     def forward(
-         self,
-         input_ids: Optional[torch.LongTensor] = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         position_ids: Optional[torch.LongTensor] = None,
-         past_key_values: Optional[List[torch.FloatTensor]] = None,
-         inputs_embeds: Optional[torch.FloatTensor] = None,
-         labels: Optional[torch.LongTensor] = None,
-         use_cache: Optional[bool] = None,
-         output_attentions: Optional[bool] = None,
-         output_hidden_states: Optional[bool] = None,
-         return_dict: Optional[bool] = None,
-     ) -> CausalLMOutputWithPast:
-         output: CausalLMOutputWithPast = self.llm(
-             input_ids=input_ids,
-             attention_mask=attention_mask,
-             position_ids=position_ids,
-             past_key_values=past_key_values,
-             inputs_embeds=inputs_embeds,
-             labels=labels,
-             use_cache=use_cache,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             return_dict=return_dict,
-         )
-         return output
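
The policy returned by `get_fsdp_wrapping_policy` plugs directly into PyTorch FSDP. A hedged sketch, assuming `torch.distributed` is already initialized and `backbone` is an instance of a concrete subclass (neither is set up here):

from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

policy = backbone.get_fsdp_wrapping_policy()  # wraps every `transformer_layer_cls` block
sharded_llm = FSDP(backbone.llm, auto_wrap_policy=policy)
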
capvector-oft/prismatic/models/backbones/llm/llama2.py DELETED
@@ -1,102 +0,0 @@
- """
- llama2.py
-
- Class definition for all LLMs derived from LlamaForCausalLM.
- """
-
- from typing import Optional, Sequence, Type
-
- import torch
- from torch import nn as nn
- from transformers import LlamaForCausalLM
- from transformers.models.llama.modeling_llama import LlamaDecoderLayer
-
- from prismatic.models.backbones.llm.base_llm import HFCausalLLMBackbone
- from prismatic.models.backbones.llm.prompting import (
-     LLaMa2ChatPromptBuilder,
-     PromptBuilder,
-     PurePromptBuilder,
-     VicunaV15ChatPromptBuilder,
- )
-
- # Registry =>> Support LLaMa-2 Models (from HF Transformers)
- # fmt: off
- LLAMA2_MODELS = {
-     # === Pure Meta LLaMa-2 (non-instruct/chat-tuned) Models ===
-     "llama2-7b-pure": {
-         "llm_family": "llama2", "llm_cls": LlamaForCausalLM, "hf_hub_path": "meta-llama/Llama-2-7b-hf"
-     },
-
-     "llama2-13b-pure": {
-         "llm_family": "llama2", "llm_cls": LlamaForCausalLM, "hf_hub_path": "meta-llama/Llama-2-13b-hf"
-     },
-
-     # === Meta LLaMa-2 Chat Models ===
-     "llama2-7b-chat": {
-         "llm_family": "llama2", "llm_cls": LlamaForCausalLM, "hf_hub_path": "meta-llama/Llama-2-7b-chat-hf"
-     },
-
-     "llama2-13b-chat": {
-         "llm_family": "llama2", "llm_cls": LlamaForCausalLM, "hf_hub_path": "meta-llama/Llama-2-13b-chat-hf"
-     },
-
-     # === Vicuna v1.5 Chat Models ===
-     "vicuna-v15-7b": {
-         "llm_family": "llama2", "llm_cls": LlamaForCausalLM, "hf_hub_path": "lmsys/vicuna-7b-v1.5"
-     },
-
-     "vicuna-v15-13b": {
-         "llm_family": "llama2", "llm_cls": LlamaForCausalLM, "hf_hub_path": "lmsys/vicuna-13b-v1.5"
-     },
- }
- # fmt: on
-
-
- class LLaMa2LLMBackbone(HFCausalLLMBackbone):
-     def __init__(
-         self,
-         llm_backbone_id: str,
-         llm_max_length: int = 2048,
-         hf_token: Optional[str] = None,
-         inference_mode: bool = False,
-         use_flash_attention_2: bool = True,
-     ) -> None:
-         super().__init__(
-             llm_backbone_id,
-             llm_max_length=llm_max_length,
-             hf_token=hf_token,
-             inference_mode=inference_mode,
-             use_flash_attention_2=use_flash_attention_2,
-             **LLAMA2_MODELS[llm_backbone_id],
-         )
-
-         # [Special Case] LLaMa-2 PAD Token Handling --> for clarity, we add an extra token (and resize)
-         self.tokenizer.add_special_tokens({"pad_token": "<PAD>"})
-         self.llm.config.pad_token_id = self.tokenizer.pad_token_id
-         self.llm.resize_token_embeddings(len(self.tokenizer), pad_to_multiple_of=64)
-
-     @property
-     def prompt_builder_fn(self) -> Type[PromptBuilder]:
-         if self.identifier.startswith("llama2-") and self.identifier.endswith("-pure"):
-             return PurePromptBuilder
-
-         elif self.identifier.startswith("llama2-") and self.identifier.endswith("-chat"):
-             return LLaMa2ChatPromptBuilder
-
-         elif self.identifier.startswith("vicuna"):
-             return VicunaV15ChatPromptBuilder
-
-         raise ValueError(f"No PromptBuilder defined for LLM Backbone `{self.identifier}`")
-
-     @property
-     def transformer_layer_cls(self) -> Type[nn.Module]:
-         return LlamaDecoderLayer
-
-     @property
-     def half_precision_dtype(self) -> torch.dtype:
-         """LLaMa-2 was trained in BF16; see https://huggingface.co/docs/transformers/main/model_doc/llama2."""
-         return torch.bfloat16
-
-     @property
-     def last_layer_finetune_modules(self) -> Sequence[nn.Module]:
-         return (self.llm.model.embed_tokens, self.llm.model.layers[-1], self.llm.lm_head)
capvector-oft/prismatic/models/backbones/llm/mistral.py DELETED
@@ -1,72 +0,0 @@
- """
- mistral.py
-
- Class definition for all LLMs derived from MistralForCausalLM.
- """
-
- from typing import Optional, Type
-
- import torch
- from torch import nn as nn
- from transformers import MistralForCausalLM
- from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
-
- from prismatic.models.backbones.llm.base_llm import HFCausalLLMBackbone
- from prismatic.models.backbones.llm.prompting import MistralInstructPromptBuilder, PromptBuilder, PurePromptBuilder
-
- # Registry =>> Support Mistral Models (from HF Transformers)
- # fmt: off
- MISTRAL_MODELS = {
-     # === Base Mistral v0.1 ===
-     "mistral-v0.1-7b-pure": {
-         "llm_family": "mistral", "llm_cls": MistralForCausalLM, "hf_hub_path": "mistralai/Mistral-7B-v0.1"
-     },
-
-     # === Mistral Instruct v0.1 ===
-     "mistral-v0.1-7b-instruct": {
-         "llm_family": "mistral", "llm_cls": MistralForCausalLM, "hf_hub_path": "mistralai/Mistral-7B-Instruct-v0.1"
-     }
- }
- # fmt: on
-
-
- class MistralLLMBackbone(HFCausalLLMBackbone):
-     def __init__(
-         self,
-         llm_backbone_id: str,
-         llm_max_length: int = 2048,
-         hf_token: Optional[str] = None,
-         inference_mode: bool = False,
-         use_flash_attention_2: bool = True,
-     ) -> None:
-         super().__init__(
-             llm_backbone_id,
-             llm_max_length=llm_max_length,
-             hf_token=hf_token,
-             inference_mode=inference_mode,
-             use_flash_attention_2=use_flash_attention_2,
-             **MISTRAL_MODELS[llm_backbone_id],
-         )
-
-         # [Special Case] Mistral PAD Token Handling --> for clarity, we add an extra token (and resize)
-         self.tokenizer.add_special_tokens({"pad_token": "<PAD>"})
-         self.llm.config.pad_token_id = self.tokenizer.pad_token_id
-         self.llm.resize_token_embeddings(len(self.tokenizer), pad_to_multiple_of=64)
-
-     @property
-     def prompt_builder_fn(self) -> Type[PromptBuilder]:
-         if self.identifier.endswith("-pure"):
-             return PurePromptBuilder
-
-         elif self.identifier.endswith("-instruct"):
-             return MistralInstructPromptBuilder
-
-         raise ValueError(f"No PromptBuilder defined for LLM Backbone `{self.identifier}`")
-
-     @property
-     def transformer_layer_cls(self) -> Type[nn.Module]:
-         return MistralDecoderLayer
-
-     @property
-     def half_precision_dtype(self) -> torch.dtype:
-         return torch.bfloat16
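
Taken together with `llama2.py`, the registries above support simple id-based dispatch; a hedged sketch follows (the helper name is hypothetical and not part of this repo):

def build_llm_backbone(llm_backbone_id: str, **kwargs):
    # Dispatch on the registry keys defined in llama2.py / mistral.py
    if llm_backbone_id in LLAMA2_MODELS:
        return LLaMa2LLMBackbone(llm_backbone_id, **kwargs)
    if llm_backbone_id in MISTRAL_MODELS:
        return MistralLLMBackbone(llm_backbone_id, **kwargs)
    raise ValueError(f"Unknown LLM backbone `{llm_backbone_id}`")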