JosephBai commited on
Commit
857c2e9
·
verified ·
1 Parent(s): 74d1c6f

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. .gitattributes +71 -0
  3. Dev/.DS_Store +0 -0
  4. Dev/.history/examples/run_openvla_oft_rl_vlac_20250926003154.sh +109 -0
  5. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928021537.sh +109 -0
  6. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928101936.sh +109 -0
  7. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928110056.sh +109 -0
  8. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115107.sh +109 -0
  9. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115109.sh +109 -0
  10. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175228.sh +109 -0
  11. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175432.sh +109 -0
  12. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175459.sh +109 -0
  13. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230226.sh +109 -0
  14. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230315.sh +109 -0
  15. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230435.sh +109 -0
  16. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928234553.sh +109 -0
  17. Dev/.history/examples/run_openvla_oft_rl_vlac_20250929122641.sh +109 -0
  18. Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124054.sh +109 -0
  19. Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124057.sh +109 -0
  20. Dev/.history/examples/run_openvla_oft_rl_vlac_20250929130229.sh +109 -0
  21. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223735.sh +109 -0
  22. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223952.sh +109 -0
  23. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224119.sh +109 -0
  24. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224233.sh +109 -0
  25. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224326.sh +109 -0
  26. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114711.sh +109 -0
  27. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114715.sh +109 -0
  28. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114806.sh +109 -0
  29. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114816.sh +109 -0
  30. Dev/.history/launch_vlac_service_20251002114022.py +0 -0
  31. Dev/.history/launch_vlac_service_20251002114026.py +23 -0
  32. Dev/.history/setup_verl_20250930114055.sh +0 -0
  33. Dev/.history/setup_verl_20250930114105.sh +32 -0
  34. Dev/.history/setup_vlac_20250930114110.sh +0 -0
  35. Dev/.history/setup_vlac_20250930114358.sh +6 -0
  36. Dev/.history/setup_vlac_20250930120731.sh +6 -0
  37. Dev/.history/testing/evaluate_test_demo_values_20251008150855.py +422 -0
  38. Dev/.history/testing/evaluate_test_demo_values_20251008150925.py +422 -0
  39. Dev/.history/testing/evaluate_test_demo_values_20251008151015.py +422 -0
  40. Dev/.history/testing/evaluate_test_demo_values_20251008151156.py +422 -0
  41. Dev/.history/testing/evaluate_test_demo_values_20251008151427.py +465 -0
  42. Dev/.history/testing/evaluate_test_demo_values_20251008151542.py +466 -0
  43. Dev/.history/testing/evaluate_test_demo_values_20251008151723.py +466 -0
  44. Dev/.history/testing/evaluate_test_demo_values_20251008151816.py +465 -0
  45. Dev/.history/testing/evaluate_test_demo_values_20251008152522.py +477 -0
  46. Dev/.history/testing/evaluate_test_demo_values_20251008152534.py +491 -0
  47. Dev/.history/testing/evaluate_test_demo_values_20251008152548.py +519 -0
  48. Dev/.history/testing/evaluate_test_demo_values_20251008152620.py +683 -0
  49. Dev/.history/testing/evaluate_test_demo_values_20251008152700.py +784 -0
  50. Dev/.history/testing/evaluate_test_demo_values_20251008152727.py +784 -0
.DS_Store ADDED
Binary file (12.3 kB). View file
 
.gitattributes CHANGED
@@ -33,3 +33,74 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Dev/evo_vlac/examples/videos/pick-bowl-ref.mov filter=lfs diff=lfs merge=lfs -text
37
+ Dev/evo_vlac/examples/videos/pick-bowl-test.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
39
+ Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
40
+ Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
41
+ Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove_value_distribution.png filter=lfs diff=lfs merge=lfs -text
42
+ Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
43
+ Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
44
+ Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
45
+ Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
46
+ Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
47
+ Dev/testing/evaluation_results_all_tasks/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy_value_distribution.png filter=lfs diff=lfs merge=lfs -text
48
+ Dev/testing/evaluation_results_all_tasks/aggregate_statistics.png filter=lfs diff=lfs merge=lfs -text
49
+ Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
50
+ Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
51
+ Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
52
+ Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove_value_distribution.png filter=lfs diff=lfs merge=lfs -text
53
+ Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
54
+ Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
55
+ Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
56
+ Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
57
+ Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
58
+ Dev/testing/evaluation_results_all_tasks_2frms/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy_value_distribution.png filter=lfs diff=lfs merge=lfs -text
59
+ Dev/testing/evaluation_results_all_tasks_2frms/aggregate_statistics.png filter=lfs diff=lfs merge=lfs -text
60
+ Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
61
+ Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
62
+ Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
63
+ Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove_value_distribution.png filter=lfs diff=lfs merge=lfs -text
64
+ Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
65
+ Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
66
+ Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
67
+ Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
68
+ Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
69
+ Dev/testing/evaluation_results_all_tasks_8frms/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy_value_distribution.png filter=lfs diff=lfs merge=lfs -text
70
+ Dev/testing/evaluation_results_all_tasks_8frms/aggregate_statistics.png filter=lfs diff=lfs merge=lfs -text
71
+ Dev/testing/success_rate_across_trials.png filter=lfs diff=lfs merge=lfs -text
72
+ Dev/visual_prompting/task_1_demo_with_traj.png filter=lfs diff=lfs merge=lfs -text
73
+ Release/docs/assets/method_overview.png filter=lfs diff=lfs merge=lfs -text
74
+ Release/docs/assets/qualitative.png filter=lfs diff=lfs merge=lfs -text
75
+ Release/docs/assets/teaser.png filter=lfs diff=lfs merge=lfs -text
76
+ Release/reward_model/evo_vlac/examples/videos/pick-bowl-ref.mov filter=lfs diff=lfs merge=lfs -text
77
+ Release/reward_model/evo_vlac/examples/videos/pick-bowl-test.mp4 filter=lfs diff=lfs merge=lfs -text
78
+ Reward/Robo-Dopamine/assets/eval.png filter=lfs diff=lfs merge=lfs -text
79
+ Reward/Robo-Dopamine/assets/example_backward.png filter=lfs diff=lfs merge=lfs -text
80
+ Reward/Robo-Dopamine/assets/example_forward.png filter=lfs diff=lfs merge=lfs -text
81
+ Reward/Robo-Dopamine/assets/example_incremental.png filter=lfs diff=lfs merge=lfs -text
82
+ Reward/Robo-Dopamine/assets/method.png filter=lfs diff=lfs merge=lfs -text
83
+ Reward/Robo-Dopamine/assets/teasor.png filter=lfs diff=lfs merge=lfs -text
84
+ Reward/Robo-Dopamine/assets/vsi.png filter=lfs diff=lfs merge=lfs -text
85
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_001/cam_high.mp4 filter=lfs diff=lfs merge=lfs -text
86
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_001/cam_left_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
87
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_001/cam_right_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
88
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_002/cam_high.mp4 filter=lfs diff=lfs merge=lfs -text
89
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_002/cam_left_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
90
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_002/cam_right_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
91
+ Reward/Robo-Dopamine/examples/demo_table/cam_high.mp4 filter=lfs diff=lfs merge=lfs -text
92
+ Reward/Robo-Dopamine/examples/demo_table/cam_left_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
93
+ Reward/Robo-Dopamine/examples/demo_table/cam_right_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
94
+ Reward/Robo-Dopamine/examples/demo_table/goal_image.png filter=lfs diff=lfs merge=lfs -text
95
+ Reward/VLAC/data/VLAC_EAI.pdf filter=lfs diff=lfs merge=lfs -text
96
+ Reward/VLAC/data/framework.png filter=lfs diff=lfs merge=lfs -text
97
+ Reward/VLAC/data/title_banner-2.gif filter=lfs diff=lfs merge=lfs -text
98
+ Reward/VLAC/evo_vlac/examples/videos/pick-bowl-ref.mov filter=lfs diff=lfs merge=lfs -text
99
+ Reward/VLAC/evo_vlac/examples/videos/pick-bowl-test.mp4 filter=lfs diff=lfs merge=lfs -text
100
+ Reward/robometer/assets/robometer.jpg filter=lfs diff=lfs merge=lfs -text
101
+ Reward/robometer/scripts/example_videos/soar_put_green_stick_in_brown_bowl_rewards_progress_success.png filter=lfs diff=lfs merge=lfs -text
102
+ arxiv/arxiv.pdf filter=lfs diff=lfs merge=lfs -text
103
+ arxiv/fig/fig1.pdf filter=lfs diff=lfs merge=lfs -text
104
+ arxiv/fig/mismatch.pdf filter=lfs diff=lfs merge=lfs -text
105
+ arxiv/fig/qualitative.pdf filter=lfs diff=lfs merge=lfs -text
106
+ arxiv/fig/ttt_vla_main.pdf filter=lfs diff=lfs merge=lfs -text
Dev/.DS_Store ADDED
Binary file (10.2 kB). View file
 
Dev/.history/examples/run_openvla_oft_rl_vlac_20250926003154.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928021537.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=2 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928101936.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=8 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928110056.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=8 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115107.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=8 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=4 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115109.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=8 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=4 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175228.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=8 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=4 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175432.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175459.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230226.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230315.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230435.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928234553.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929122641.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=5 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124054.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124057.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929130229.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223735.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223952.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224119.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224233.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224326.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114711.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114715.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114806.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114816.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/launch_vlac_service_20251002114022.py ADDED
File without changes
Dev/.history/launch_vlac_service_20251002114026.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import torch
4
+
5
+ def launch_servers(base_port=8111):
6
+ num_gpus = torch.cuda.device_count()
7
+ processes = []
8
+ for gpu_id in range(num_gpus):
9
+ port = base_port + gpu_id
10
+ env = os.environ.copy()
11
+ env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
12
+ cmd = [
13
+ "python", "vlac_service.py", # 你写的 FastAPI 代码文件
14
+ "--port", str(port)
15
+ ]
16
+ print(f"Launching GPU {gpu_id} on port {port}")
17
+ p = subprocess.Popen(cmd, env=env)
18
+ processes.append(p)
19
+ for p in processes:
20
+ p.wait()
21
+
22
+ if __name__ == "__main__":
23
+ launch_servers()
Dev/.history/setup_verl_20250930114055.sh ADDED
File without changes
Dev/.history/setup_verl_20250930114105.sh ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ conda create -n verl python==3.10
2
+ conda activate verl
3
+
4
+ cd /mnt/bn/vgfm2/test_dit/zechen/RL_Playground/verl
5
+ pip install --no-deps -e .
6
+
7
+ cd ../../openvla-oft/
8
+ pip install -e .
9
+
10
+ cd LIBERO
11
+ pip install -e .
12
+
13
+ cd ..
14
+ pip install -r experiments/robot/libero/libero_requirements.txt
15
+
16
+ pip install packaging ninja
17
+ ninja --version; echo $?
18
+
19
+ pip install git+https://github.com/NICTA/pyairports.git
20
+
21
+ cd ../SimpleVLA-RL
22
+ pip install -r req.txt
23
+ pip uninstall torch torchvision torchaudio
24
+
25
+ pip3 install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121
26
+ pip install transformers@git+https://github.com/moojink/transformers-openvla-oft.git
27
+
28
+ pip uninstall flash_attn
29
+ pip install "flash-attn==2.5.5" --no-build-isolation --no-cache-dir
30
+
31
+ conda install -c conda-forge libegl-devel
32
+ sudo apt install libosmesa6 libosmesa6-dev
Dev/.history/setup_vlac_20250930114110.sh ADDED
File without changes
Dev/.history/setup_vlac_20250930114358.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ conda create -n vlac python==3.10
2
+ conda activate vlac
3
+
4
+ pip install ms-swift==3.3 transformers==4.51.0 peft==0.15.2 opencv-python loguru
5
+ pip3 install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121
6
+ pip install "flash-attn==2.5.5" --no-build-isolation --no-cache-dir
Dev/.history/setup_vlac_20250930120731.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ conda create -n vlac python==3.10
2
+ conda activate vlac
3
+
4
+ pip install ms-swift==3.3 transformers==4.51.0 peft==0.15.2 opencv-python loguru timm
5
+ pip3 install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121
6
+ pip install "flash-attn==2.5.5" --no-build-isolation --no-cache-dir
Dev/.history/testing/evaluate_test_demo_values_20251008150855.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import sys
26
+ import time
27
+ from io import BytesIO
28
+ from pathlib import Path
29
+ from typing import Dict, List, Optional
30
+
31
+ import matplotlib.pyplot as plt
32
+ import numpy as np
33
+ import requests
34
+ from PIL import Image
35
+ from tqdm import tqdm
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Helpers
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
+ def read_manifest(manifest_path: Path) -> Dict:
43
+ """Read the test demo manifest JSON file."""
44
+ if not manifest_path.is_file():
45
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
46
+
47
+ with manifest_path.open("r", encoding="utf-8") as f:
48
+ manifest_data = json.load(f)
49
+
50
+ # Convert relative paths to absolute paths
51
+ manifest_dir = manifest_path.parent
52
+ for demo in manifest_data.get("demos", []):
53
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
54
+
55
+ return manifest_data
56
+
57
+
58
+ def image_to_base64(path: Path) -> str:
59
+ """Convert an image file to base64 encoded JPEG."""
60
+ with Image.open(path) as img:
61
+ img = img.convert("RGB")
62
+ buffer = BytesIO()
63
+ img.save(buffer, format="JPEG", quality=95)
64
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
65
+
66
+
67
+ def encode_images(paths: List[str]) -> List[str]:
68
+ """Encode a list of image paths to base64."""
69
+ return [image_to_base64(Path(p)) for p in paths]
70
+
71
+
72
+ def call_trajectory_critic(
73
+ session: requests.Session,
74
+ base_url: str,
75
+ task: str,
76
+ frames_b64: List[str],
77
+ reference_b64: Optional[List[str]],
78
+ timeout: float,
79
+ ) -> Dict:
80
+ """Call the VLAC trajectory-critic endpoint."""
81
+ payload = {
82
+ "task": task,
83
+ "frames": frames_b64,
84
+ "reference": reference_b64,
85
+ "ref_num": len(reference_b64 or []),
86
+ "skip": 1,
87
+ "batch_size": min(len(frames_b64), 8),
88
+ "think": False,
89
+ "return_video": False,
90
+ }
91
+ start = time.time()
92
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
93
+ resp.raise_for_status()
94
+ result = resp.json()
95
+ result["latency_sec"] = time.time() - start
96
+ return result
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Evaluation
101
+ # ---------------------------------------------------------------------------
102
+
103
+
104
+ def evaluate_demos(
105
+ manifest_data: Dict,
106
+ base_url: str,
107
+ timeout: float,
108
+ use_reference: bool = False,
109
+ ) -> Dict[str, any]:
110
+ """Evaluate all demos and collect value statistics."""
111
+ session = requests.Session()
112
+ task_name = manifest_data.get("task_name", "")
113
+ demos = manifest_data.get("demos", [])
114
+
115
+ results = []
116
+ failed_demos = []
117
+
118
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
119
+ print(f"Task: {task_name}")
120
+ print(f"Use reference: {use_reference}\n")
121
+
122
+ for demo in tqdm(demos, desc="Processing demos"):
123
+ demo_name = demo["demo_name"]
124
+ frame_paths = demo["frame_paths"]
125
+
126
+ try:
127
+ # Encode frames
128
+ frames_b64 = encode_images(frame_paths)
129
+
130
+ # For now, no reference trajectory (can be added later)
131
+ reference_b64 = None
132
+
133
+ # Call VLAC service
134
+ result = call_trajectory_critic(
135
+ session=session,
136
+ base_url=base_url,
137
+ task=task_name,
138
+ frames_b64=frames_b64,
139
+ reference_b64=reference_b64,
140
+ timeout=timeout,
141
+ )
142
+
143
+ # Extract values
144
+ value_list = result.get("value_list", [])
145
+ if not value_list:
146
+ print(f"\n[warn] No values returned for demo {demo_name}")
147
+ failed_demos.append(demo_name)
148
+ continue
149
+
150
+ # Record results
151
+ demo_result = {
152
+ "demo_name": demo_name,
153
+ "total_frames": demo["total_frames"],
154
+ "success_index": demo["success_index"],
155
+ "num_sampled_frames": len(frame_paths),
156
+ "value_list": value_list,
157
+ "last_value": value_list[-1], # The critical value for success frame
158
+ "mean_value": float(np.mean(value_list)),
159
+ "std_value": float(np.std(value_list)),
160
+ "latency_sec": result.get("latency_sec", 0.0),
161
+ }
162
+ results.append(demo_result)
163
+
164
+ except requests.RequestException as exc:
165
+ print(f"\n[error] Request failed for demo {demo_name}: {exc}")
166
+ failed_demos.append(demo_name)
167
+ except Exception as exc:
168
+ print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
169
+ failed_demos.append(demo_name)
170
+
171
+ return {
172
+ "task_name": task_name,
173
+ "total_demos": len(demos),
174
+ "successful_evals": len(results),
175
+ "failed_demos": failed_demos,
176
+ "results": results,
177
+ }
178
+
179
+
180
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
181
+ """Compute summary statistics from evaluation results."""
182
+ results = evaluation_results["results"]
183
+ if not results:
184
+ return {}
185
+
186
+ last_values = [r["last_value"] for r in results]
187
+ mean_values = [r["mean_value"] for r in results]
188
+ latencies = [r["latency_sec"] for r in results]
189
+
190
+ stats = {
191
+ "last_value_mean": float(np.mean(last_values)),
192
+ "last_value_std": float(np.std(last_values)),
193
+ "last_value_min": float(np.min(last_values)),
194
+ "last_value_max": float(np.max(last_values)),
195
+ "last_value_median": float(np.median(last_values)),
196
+ "last_value_q25": float(np.percentile(last_values, 25)),
197
+ "last_value_q75": float(np.percentile(last_values, 75)),
198
+ "mean_latency": float(np.mean(latencies)),
199
+ "total_evaluated": len(results),
200
+ }
201
+
202
+ # Count how many demos have last_value >= various thresholds
203
+ for threshold in [80, 85, 90, 95, 100]:
204
+ count = sum(1 for v in last_values if v >= threshold)
205
+ stats[f"count_above_{threshold}"] = count
206
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
207
+
208
+ return stats
209
+
210
+
211
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
212
+ """Create visualization plots for value distribution."""
213
+ results = evaluation_results["results"]
214
+ if not results:
215
+ print("No results to plot")
216
+ return
217
+
218
+ task_name = evaluation_results["task_name"]
219
+ last_values = [r["last_value"] for r in results]
220
+
221
+ # Create figure with multiple subplots
222
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
223
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
224
+
225
+ # 1. Histogram of last values
226
+ ax1 = axes[0, 0]
227
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
228
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
229
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
230
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
231
+ ax1.set_ylabel('Frequency', fontsize=12)
232
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
233
+ ax1.legend()
234
+ ax1.grid(True, alpha=0.3)
235
+
236
+ # 2. Box plot of last values
237
+ ax2 = axes[0, 1]
238
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
239
+ for patch in box_data['boxes']:
240
+ patch.set_facecolor('lightblue')
241
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
242
+ ax2.set_ylabel('Value', fontsize=12)
243
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
244
+ ax2.legend()
245
+ ax2.grid(True, alpha=0.3, axis='y')
246
+
247
+ # 3. Value progression across demos
248
+ ax3 = axes[1, 0]
249
+ demo_indices = range(len(results))
250
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
251
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
252
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
253
+ ax3.set_xlabel('Demo Index', fontsize=12)
254
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
255
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
256
+ ax3.legend()
257
+ ax3.grid(True, alpha=0.3)
258
+
259
+ # 4. Cumulative distribution
260
+ ax4 = axes[1, 1]
261
+ sorted_values = np.sort(last_values)
262
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
263
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
264
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
265
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
266
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
267
+ ax4.set_title('Cumulative Distribution', fontsize=14)
268
+ ax4.legend()
269
+ ax4.grid(True, alpha=0.3)
270
+
271
+ plt.tight_layout()
272
+
273
+ # Save the plot
274
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
275
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
276
+ print(f"\nPlot saved to: {plot_path}")
277
+
278
+ # Also save a PDF version
279
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
280
+ plt.savefig(pdf_path, bbox_inches='tight')
281
+ print(f"PDF saved to: {pdf_path}")
282
+
283
+ plt.close()
284
+
285
+
286
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
287
+ """Save evaluation results and statistics to JSON files."""
288
+ task_name = evaluation_results["task_name"]
289
+
290
+ # Save detailed results
291
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
292
+ with results_path.open("w", encoding="utf-8") as f:
293
+ json.dump(evaluation_results, f, indent=2)
294
+ print(f"\nDetailed results saved to: {results_path}")
295
+
296
+ # Save summary statistics
297
+ stats_path = output_dir / f"{task_name}_statistics.json"
298
+ with stats_path.open("w", encoding="utf-8") as f:
299
+ json.dump(statistics, f, indent=2)
300
+ print(f"Statistics saved to: {stats_path}")
301
+
302
+
303
+ # ---------------------------------------------------------------------------
304
+ # CLI
305
+ # ---------------------------------------------------------------------------
306
+
307
+
308
+ def parse_args() -> argparse.Namespace:
309
+ parser = argparse.ArgumentParser(
310
+ description="Evaluate value estimation for test demonstrations"
311
+ )
312
+ parser.add_argument(
313
+ "--manifest-path",
314
+ type=Path,
315
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
316
+ help="Path to the test manifest JSON file",
317
+ )
318
+ parser.add_argument(
319
+ "--output-dir",
320
+ type=Path,
321
+ default="evaluation_results",
322
+ help="Directory to save evaluation results and plots",
323
+ )
324
+ parser.add_argument(
325
+ "--base-url",
326
+ default="http://localhost:8111",
327
+ help="VLAC service base URL (default: http://localhost:8111)",
328
+ )
329
+ parser.add_argument(
330
+ "--timeout",
331
+ type=float,
332
+ default=30.0,
333
+ help="HTTP request timeout in seconds (default: 30.0)",
334
+ )
335
+ parser.add_argument(
336
+ "--use-reference",
337
+ action="store_true",
338
+ help="Use reference trajectory (if available)",
339
+ )
340
+ return parser.parse_args()
341
+
342
+
343
+ def main() -> int:
344
+ args = parse_args()
345
+
346
+ # Read manifest
347
+ try:
348
+ manifest_data = read_manifest(args.manifest_path)
349
+ except FileNotFoundError as exc:
350
+ print(f"Error: {exc}")
351
+ return 1
352
+
353
+ # Create output directory
354
+ output_dir = args.output_dir.expanduser()
355
+ output_dir.mkdir(parents=True, exist_ok=True)
356
+
357
+ # Run evaluation
358
+ print("=" * 80)
359
+ print("VLAC Value Estimation Evaluation")
360
+ print("=" * 80)
361
+
362
+ evaluation_results = evaluate_demos(
363
+ manifest_data=manifest_data,
364
+ base_url=args.base_url,
365
+ timeout=args.timeout,
366
+ use_reference=args.use_reference,
367
+ )
368
+
369
+ # Compute statistics
370
+ statistics = compute_statistics(evaluation_results)
371
+
372
+ # Print summary
373
+ print("\n" + "=" * 80)
374
+ print("EVALUATION SUMMARY")
375
+ print("=" * 80)
376
+ print(f"Task: {evaluation_results['task_name']}")
377
+ print(f"Total demos: {evaluation_results['total_demos']}")
378
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
379
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
380
+
381
+ if statistics:
382
+ print("\n" + "-" * 80)
383
+ print("SUCCESS FRAME VALUE STATISTICS")
384
+ print("-" * 80)
385
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
386
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
387
+ print(f"Median: {statistics['last_value_median']:.2f}")
388
+ print(f"Min: {statistics['last_value_min']:.2f}")
389
+ print(f"Max: {statistics['last_value_max']:.2f}")
390
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
391
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
392
+
393
+ print("\n" + "-" * 80)
394
+ print("THRESHOLD ANALYSIS")
395
+ print("-" * 80)
396
+ for threshold in [80, 85, 90, 95, 100]:
397
+ count = statistics[f"count_above_{threshold}"]
398
+ percent = statistics[f"percent_above_{threshold}"]
399
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
400
+
401
+ print("\n" + "-" * 80)
402
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
403
+ print("-" * 80)
404
+
405
+ # Save results
406
+ save_results(evaluation_results, statistics, output_dir)
407
+
408
+ # Create plots
409
+ if evaluation_results["results"]:
410
+ plot_value_distribution(evaluation_results, output_dir)
411
+ else:
412
+ print("\nNo successful evaluations to plot.")
413
+
414
+ print("\n" + "=" * 80)
415
+ print("EVALUATION COMPLETE")
416
+ print("=" * 80)
417
+
418
+ return 0
419
+
420
+
421
+ if __name__ == "__main__":
422
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008150925.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import sys
26
+ import time
27
+ from io import BytesIO
28
+ from pathlib import Path
29
+ from typing import Dict, List, Optional
30
+
31
+ import matplotlib.pyplot as plt
32
+ import numpy as np
33
+ import requests
34
+ from PIL import Image
35
+ from tqdm import tqdm
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Helpers
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
+ def read_manifest(manifest_path: Path) -> Dict:
43
+ """Read the test demo manifest JSON file."""
44
+ if not manifest_path.is_file():
45
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
46
+
47
+ with manifest_path.open("r", encoding="utf-8") as f:
48
+ manifest_data = json.load(f)
49
+
50
+ # Convert relative paths to absolute paths
51
+ manifest_dir = manifest_path.parent
52
+ for demo in manifest_data.get("demos", []):
53
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
54
+
55
+ return manifest_data
56
+
57
+
58
+ def image_to_base64(path: Path) -> str:
59
+ """Convert an image file to base64 encoded JPEG."""
60
+ with Image.open(path) as img:
61
+ img = img.convert("RGB")
62
+ buffer = BytesIO()
63
+ img.save(buffer, format="JPEG", quality=95)
64
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
65
+
66
+
67
+ def encode_images(paths: List[str]) -> List[str]:
68
+ """Encode a list of image paths to base64."""
69
+ return [image_to_base64(Path(p)) for p in paths]
70
+
71
+
72
+ def call_trajectory_critic(
73
+ session: requests.Session,
74
+ base_url: str,
75
+ task: str,
76
+ frames_b64: List[str],
77
+ reference_b64: Optional[List[str]],
78
+ timeout: float,
79
+ ) -> Dict:
80
+ """Call the VLAC trajectory-critic endpoint."""
81
+ payload = {
82
+ "task": task,
83
+ "frames": frames_b64,
84
+ "reference": reference_b64,
85
+ "ref_num": len(reference_b64 or []),
86
+ "skip": 1,
87
+ "batch_size": min(len(frames_b64), 8),
88
+ "think": False,
89
+ "return_video": False,
90
+ }
91
+ start = time.time()
92
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
93
+ resp.raise_for_status()
94
+ result = resp.json()
95
+ result["latency_sec"] = time.time() - start
96
+ return result
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Evaluation
101
+ # ---------------------------------------------------------------------------
102
+
103
+
104
+ def evaluate_demos(
105
+ manifest_data: Dict,
106
+ base_url: str,
107
+ timeout: float,
108
+ use_reference: bool = False,
109
+ ) -> Dict[str, any]:
110
+ """Evaluate all demos and collect value statistics."""
111
+ session = requests.Session()
112
+ task_name = manifest_data.get("task_name", "")
113
+ demos = manifest_data.get("demos", [])
114
+
115
+ results = []
116
+ failed_demos = []
117
+
118
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
119
+ print(f"Task: {task_name}")
120
+ print(f"Use reference: {use_reference}\n")
121
+
122
+ for demo in tqdm(demos, desc="Processing demos"):
123
+ demo_name = demo["demo_name"]
124
+ frame_paths = demo["frame_paths"]
125
+
126
+ try:
127
+ # Encode frames
128
+ frames_b64 = encode_images(frame_paths)
129
+
130
+ # For now, no reference trajectory (can be added later)
131
+ reference_b64 = None
132
+
133
+ # Call VLAC service
134
+ result = call_trajectory_critic(
135
+ session=session,
136
+ base_url=base_url,
137
+ task=task_name,
138
+ frames_b64=frames_b64,
139
+ reference_b64=reference_b64,
140
+ timeout=timeout,
141
+ )
142
+
143
+ # Extract values
144
+ value_list = result.get("value_list", [])
145
+ if not value_list:
146
+ print(f"\n[warn] No values returned for demo {demo_name}")
147
+ failed_demos.append(demo_name)
148
+ continue
149
+
150
+ # Record results
151
+ demo_result = {
152
+ "demo_name": demo_name,
153
+ "total_frames": demo["total_frames"],
154
+ "success_index": demo["success_index"],
155
+ "num_sampled_frames": len(frame_paths),
156
+ "value_list": value_list,
157
+ "last_value": value_list[-1], # The critical value for success frame
158
+ "mean_value": float(np.mean(value_list)),
159
+ "std_value": float(np.std(value_list)),
160
+ "latency_sec": result.get("latency_sec", 0.0),
161
+ }
162
+ results.append(demo_result)
163
+
164
+ except requests.RequestException as exc:
165
+ print(f"\n[error] Request failed for demo {demo_name}: {exc}")
166
+ failed_demos.append(demo_name)
167
+ except Exception as exc:
168
+ print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
169
+ failed_demos.append(demo_name)
170
+
171
+ return {
172
+ "task_name": task_name,
173
+ "total_demos": len(demos),
174
+ "successful_evals": len(results),
175
+ "failed_demos": failed_demos,
176
+ "results": results,
177
+ }
178
+
179
+
180
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
181
+ """Compute summary statistics from evaluation results."""
182
+ results = evaluation_results["results"]
183
+ if not results:
184
+ return {}
185
+
186
+ last_values = [r["last_value"] for r in results]
187
+ mean_values = [r["mean_value"] for r in results]
188
+ latencies = [r["latency_sec"] for r in results]
189
+
190
+ stats = {
191
+ "last_value_mean": float(np.mean(last_values)),
192
+ "last_value_std": float(np.std(last_values)),
193
+ "last_value_min": float(np.min(last_values)),
194
+ "last_value_max": float(np.max(last_values)),
195
+ "last_value_median": float(np.median(last_values)),
196
+ "last_value_q25": float(np.percentile(last_values, 25)),
197
+ "last_value_q75": float(np.percentile(last_values, 75)),
198
+ "mean_latency": float(np.mean(latencies)),
199
+ "total_evaluated": len(results),
200
+ }
201
+
202
+ # Count how many demos have last_value >= various thresholds
203
+ for threshold in [80, 85, 90, 95, 100]:
204
+ count = sum(1 for v in last_values if v >= threshold)
205
+ stats[f"count_above_{threshold}"] = count
206
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
207
+
208
+ return stats
209
+
210
+
211
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
212
+ """Create visualization plots for value distribution."""
213
+ results = evaluation_results["results"]
214
+ if not results:
215
+ print("No results to plot")
216
+ return
217
+
218
+ task_name = evaluation_results["task_name"]
219
+ last_values = [r["last_value"] for r in results]
220
+
221
+ # Create figure with multiple subplots
222
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
223
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
224
+
225
+ # 1. Histogram of last values
226
+ ax1 = axes[0, 0]
227
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
228
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
229
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
230
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
231
+ ax1.set_ylabel('Frequency', fontsize=12)
232
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
233
+ ax1.legend()
234
+ ax1.grid(True, alpha=0.3)
235
+
236
+ # 2. Box plot of last values
237
+ ax2 = axes[0, 1]
238
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
239
+ for patch in box_data['boxes']:
240
+ patch.set_facecolor('lightblue')
241
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
242
+ ax2.set_ylabel('Value', fontsize=12)
243
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
244
+ ax2.legend()
245
+ ax2.grid(True, alpha=0.3, axis='y')
246
+
247
+ # 3. Value progression across demos
248
+ ax3 = axes[1, 0]
249
+ demo_indices = range(len(results))
250
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
251
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
252
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
253
+ ax3.set_xlabel('Demo Index', fontsize=12)
254
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
255
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
256
+ ax3.legend()
257
+ ax3.grid(True, alpha=0.3)
258
+
259
+ # 4. Cumulative distribution
260
+ ax4 = axes[1, 1]
261
+ sorted_values = np.sort(last_values)
262
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
263
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
264
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
265
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
266
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
267
+ ax4.set_title('Cumulative Distribution', fontsize=14)
268
+ ax4.legend()
269
+ ax4.grid(True, alpha=0.3)
270
+
271
+ plt.tight_layout()
272
+
273
+ # Save the plot
274
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
275
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
276
+ print(f"\nPlot saved to: {plot_path}")
277
+
278
+ # Also save a PDF version
279
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
280
+ plt.savefig(pdf_path, bbox_inches='tight')
281
+ print(f"PDF saved to: {pdf_path}")
282
+
283
+ plt.close()
284
+
285
+
286
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
287
+ """Save evaluation results and statistics to JSON files."""
288
+ task_name = evaluation_results["task_name"]
289
+
290
+ # Save detailed results
291
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
292
+ with results_path.open("w", encoding="utf-8") as f:
293
+ json.dump(evaluation_results, f, indent=2)
294
+ print(f"\nDetailed results saved to: {results_path}")
295
+
296
+ # Save summary statistics
297
+ stats_path = output_dir / f"{task_name}_statistics.json"
298
+ with stats_path.open("w", encoding="utf-8") as f:
299
+ json.dump(statistics, f, indent=2)
300
+ print(f"Statistics saved to: {stats_path}")
301
+
302
+
303
+ # ---------------------------------------------------------------------------
304
+ # CLI
305
+ # ---------------------------------------------------------------------------
306
+
307
+
308
+ def parse_args() -> argparse.Namespace:
309
+ parser = argparse.ArgumentParser(
310
+ description="Evaluate value estimation for test demonstrations"
311
+ )
312
+ parser.add_argument(
313
+ "--manifest-path",
314
+ type=Path,
315
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
316
+ help="Path to the test manifest JSON file",
317
+ )
318
+ parser.add_argument(
319
+ "--output-dir",
320
+ type=Path,
321
+ default="evaluation_results",
322
+ help="Directory to save evaluation results and plots",
323
+ )
324
+ parser.add_argument(
325
+ "--base-url",
326
+ default="http://localhost:8111",
327
+ help="VLAC service base URL (default: http://localhost:8111)",
328
+ )
329
+ parser.add_argument(
330
+ "--timeout",
331
+ type=float,
332
+ default=30.0,
333
+ help="HTTP request timeout in seconds (default: 30.0)",
334
+ )
335
+ parser.add_argument(
336
+ "--use-reference",
337
+ action="store_true",
338
+ help="Use reference trajectory (if available)",
339
+ )
340
+ return parser.parse_args()
341
+
342
+
343
+ def main() -> int:
344
+ args = parse_args()
345
+
346
+ # Read manifest
347
+ try:
348
+ manifest_data = read_manifest(args.manifest_path)
349
+ except FileNotFoundError as exc:
350
+ print(f"Error: {exc}")
351
+ return 1
352
+
353
+ # Create output directory
354
+ output_dir = args.output_dir.expanduser()
355
+ output_dir.mkdir(parents=True, exist_ok=True)
356
+
357
+ # Run evaluation
358
+ print("=" * 80)
359
+ print("VLAC Value Estimation Evaluation")
360
+ print("=" * 80)
361
+
362
+ evaluation_results = evaluate_demos(
363
+ manifest_data=manifest_data,
364
+ base_url=args.base_url,
365
+ timeout=args.timeout,
366
+ use_reference=args.use_reference,
367
+ )
368
+
369
+ # Compute statistics
370
+ statistics = compute_statistics(evaluation_results)
371
+
372
+ # Print summary
373
+ print("\n" + "=" * 80)
374
+ print("EVALUATION SUMMARY")
375
+ print("=" * 80)
376
+ print(f"Task: {evaluation_results['task_name']}")
377
+ print(f"Total demos: {evaluation_results['total_demos']}")
378
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
379
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
380
+
381
+ if statistics:
382
+ print("\n" + "-" * 80)
383
+ print("SUCCESS FRAME VALUE STATISTICS")
384
+ print("-" * 80)
385
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
386
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
387
+ print(f"Median: {statistics['last_value_median']:.2f}")
388
+ print(f"Min: {statistics['last_value_min']:.2f}")
389
+ print(f"Max: {statistics['last_value_max']:.2f}")
390
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
391
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
392
+
393
+ print("\n" + "-" * 80)
394
+ print("THRESHOLD ANALYSIS")
395
+ print("-" * 80)
396
+ for threshold in [80, 85, 90, 95, 100]:
397
+ count = statistics[f"count_above_{threshold}"]
398
+ percent = statistics[f"percent_above_{threshold}"]
399
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
400
+
401
+ print("\n" + "-" * 80)
402
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
403
+ print("-" * 80)
404
+
405
+ # Save results
406
+ save_results(evaluation_results, statistics, output_dir)
407
+
408
+ # Create plots
409
+ if evaluation_results["results"]:
410
+ plot_value_distribution(evaluation_results, output_dir)
411
+ else:
412
+ print("\nNo successful evaluations to plot.")
413
+
414
+ print("\n" + "=" * 80)
415
+ print("EVALUATION COMPLETE")
416
+ print("=" * 80)
417
+
418
+ return 0
419
+
420
+
421
+ if __name__ == "__main__":
422
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151015.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import sys
26
+ import time
27
+ from io import BytesIO
28
+ from pathlib import Path
29
+ from typing import Dict, List, Optional
30
+
31
+ import matplotlib.pyplot as plt
32
+ import numpy as np
33
+ import requests
34
+ from PIL import Image
35
+ from tqdm import tqdm
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Helpers
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
+ def read_manifest(manifest_path: Path) -> Dict:
43
+ """Read the test demo manifest JSON file."""
44
+ if not manifest_path.is_file():
45
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
46
+
47
+ with manifest_path.open("r", encoding="utf-8") as f:
48
+ manifest_data = json.load(f)
49
+
50
+ # Convert relative paths to absolute paths
51
+ manifest_dir = manifest_path.parent
52
+ for demo in manifest_data.get("demos", []):
53
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
54
+
55
+ return manifest_data
56
+
57
+
58
def image_to_base64(path: Path) -> str:
    """Read the image at *path* and return it as a base64-encoded JPEG string."""
    with Image.open(path) as img:
        rgb = img.convert("RGB")
        sink = BytesIO()
        # Re-encode as JPEG (quality 95) regardless of the source format.
        rgb.save(sink, format="JPEG", quality=95)
    return base64.b64encode(sink.getvalue()).decode("utf-8")
65
+
66
+
67
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file named in *paths*, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
70
+
71
+
72
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the service's JSON response augmented with a ``latency_sec``
    field holding the wall-clock round-trip time.

    Raises:
        requests.HTTPError: if the service responds with an error status.
    """
    reference = reference_b64 or []
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference),
        "skip": 1,
        # Cap the service-side batch size at 8 frames.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started
    return payload
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Evaluation
101
+ # ---------------------------------------------------------------------------
102
+
103
+
104
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, Any]:
    """Run the VLAC trajectory critic over every demo in the manifest.

    Fix: the return annotation previously used the builtin ``any`` function
    (``Dict[str, any]``) instead of ``typing.Any``.

    Args:
        manifest_data: Parsed manifest with ``task_name`` and ``demos`` keys.
        base_url: Base URL of the VLAC service.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: Accepted for CLI parity but currently unused — the
            reference trajectory below is always ``None``. TODO: wire up.

    Returns:
        Summary dict with per-demo results and the list of failed demo names.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            # Encode frames for transport.
            frames_b64 = encode_images(frame_paths)

            # For now, no reference trajectory (can be added later).
            reference_b64 = None

            # Query the VLAC service for this trajectory.
            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            # A response without values is treated as a failed evaluation.
            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            # Record per-demo summary; the last value corresponds to the
            # success frame and is the quantity this script analyzes.
            demo_result = {
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            }
            results.append(demo_result)

        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:  # Keep going on unexpected per-demo errors.
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
178
+
179
+
180
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize the per-demo success-frame values.

    Fix: removed ``mean_values``, which was computed but never used.

    Args:
        evaluation_results: Output of ``evaluate_demos`` (must contain a
            ``results`` list).

    Returns:
        Summary statistics keyed by name, or an empty dict when there are
        no successful evaluations.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [r["last_value"] for r in results]
    latencies = [r["latency_sec"] for r in results]

    stats = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Fraction of demos whose success-frame value clears each threshold.
    for threshold in [80, 85, 90, 95, 100]:
        count = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = count
        stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)

    return stats
209
+
210
+
211
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Render histogram/box/scatter/CDF views of the success-frame values.

    Saves the 2x2 figure both as a 300-dpi PNG and as a PDF under
    *output_dir*, then closes the figure.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    last_values = [r["last_value"] for r in results]
    mean_val = np.mean(last_values)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    ax_hist, ax_box = axes[0, 0], axes[0, 1]
    ax_scatter, ax_cdf = axes[1, 0], axes[1, 1]

    # Panel 1: histogram of success-frame values.
    ax_hist.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax_hist.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax_hist.axvline(mean_val, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_val:.1f})')
    ax_hist.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax_hist.set_ylabel('Frequency', fontsize=12)
    ax_hist.set_title('Distribution of Success Frame Values', fontsize=14)
    ax_hist.legend()
    ax_hist.grid(True, alpha=0.3)

    # Panel 2: box plot.
    box_artists = ax_box.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_artists['boxes']:
        patch.set_facecolor('lightblue')
    ax_box.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax_box.set_ylabel('Value', fontsize=12)
    ax_box.set_title('Success Frame Value Distribution', fontsize=14)
    ax_box.legend()
    ax_box.grid(True, alpha=0.3, axis='y')

    # Panel 3: per-demo scatter of the last values.
    ax_scatter.scatter(range(len(results)), last_values, alpha=0.6, s=50, c='steelblue')
    ax_scatter.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax_scatter.axhline(mean_val, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_val:.1f})')
    ax_scatter.set_xlabel('Demo Index', fontsize=12)
    ax_scatter.set_ylabel('Last Frame Value', fontsize=12)
    ax_scatter.set_title('Success Frame Values Across Demos', fontsize=14)
    ax_scatter.legend()
    ax_scatter.grid(True, alpha=0.3)

    # Panel 4: empirical cumulative distribution.
    ordered = np.sort(last_values)
    cumulative = np.arange(1, len(ordered) + 1) / len(ordered) * 100
    ax_cdf.plot(ordered, cumulative, linewidth=2, color='steelblue')
    ax_cdf.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax_cdf.set_xlabel('Success Frame Value', fontsize=12)
    ax_cdf.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax_cdf.set_title('Cumulative Distribution', fontsize=14)
    ax_cdf.legend()
    ax_cdf.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save as high-resolution PNG.
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # And as a vector PDF for publication use.
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    plt.close()
284
+
285
+
286
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write the detailed results and the summary statistics as JSON files.

    Files are named ``<task>_evaluation_results.json`` and
    ``<task>_statistics.json`` inside *output_dir*.
    """
    task_name = evaluation_results["task_name"]

    # Full per-demo results.
    results_path = output_dir / f"{task_name}_evaluation_results.json"
    with results_path.open("w", encoding="utf-8") as sink:
        json.dump(evaluation_results, sink, indent=2)
    print(f"\nDetailed results saved to: {results_path}")

    # Aggregate statistics.
    stats_path = output_dir / f"{task_name}_statistics.json"
    with stats_path.open("w", encoding="utf-8") as sink:
        json.dump(statistics, sink, indent=2)
    print(f"Statistics saved to: {stats_path}")
301
+
302
+
303
+ # ---------------------------------------------------------------------------
304
+ # CLI
305
+ # ---------------------------------------------------------------------------
306
+
307
+
308
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface for this script.

    Note: argparse applies ``type=Path`` to string defaults as well, so the
    default manifest/output paths arrive as ``Path`` objects.
    """
    cli = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )
    cli.add_argument(
        "--manifest-path",
        type=Path,
        default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
        help="Path to the test manifest JSON file",
    )
    cli.add_argument(
        "--output-dir",
        type=Path,
        default="evaluation_results",
        help="Directory to save evaluation results and plots",
    )
    cli.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    cli.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    cli.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )
    return cli.parse_args()
341
+
342
+
343
def main() -> int:
    """Entry point: load the manifest, evaluate all demos, report and save.

    Returns:
        Process exit status: 0 on success, 1 when the manifest is missing.
    """
    args = parse_args()

    # Read manifest; a missing file is the only error handled gracefully here.
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    # Create output directory (expanduser resolves a leading "~").
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Run evaluation
    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is empty ({}) when no demo was evaluated successfully,
    # so the detailed sections below are skipped in that case.
    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0
419
+
420
+
421
if __name__ == "__main__":
    # Propagate main()'s exit status to the shell.
    sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151156.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import sys
26
+ import time
27
+ from io import BytesIO
28
+ from pathlib import Path
29
+ from typing import Dict, List, Optional
30
+
31
+ import matplotlib.pyplot as plt
32
+ import numpy as np
33
+ import requests
34
+ from PIL import Image
35
+ from tqdm import tqdm
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Helpers
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
+ def read_manifest(manifest_path: Path) -> Dict:
43
+ """Read the test demo manifest JSON file."""
44
+ if not manifest_path.is_file():
45
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
46
+
47
+ with manifest_path.open("r", encoding="utf-8") as f:
48
+ manifest_data = json.load(f)
49
+
50
+ # Convert relative paths to absolute paths
51
+ manifest_dir = manifest_path.parent
52
+ for demo in manifest_data.get("demos", []):
53
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
54
+
55
+ return manifest_data
56
+
57
+
58
+ def image_to_base64(path: Path) -> str:
59
+ """Convert an image file to base64 encoded JPEG."""
60
+ with Image.open(path) as img:
61
+ img = img.convert("RGB")
62
+ buffer = BytesIO()
63
+ img.save(buffer, format="JPEG", quality=95)
64
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
65
+
66
+
67
+ def encode_images(paths: List[str]) -> List[str]:
68
+ """Encode a list of image paths to base64."""
69
+ return [image_to_base64(Path(p)) for p in paths]
70
+
71
+
72
+ def call_trajectory_critic(
73
+ session: requests.Session,
74
+ base_url: str,
75
+ task: str,
76
+ frames_b64: List[str],
77
+ reference_b64: Optional[List[str]],
78
+ timeout: float,
79
+ ) -> Dict:
80
+ """Call the VLAC trajectory-critic endpoint."""
81
+ payload = {
82
+ "task": task,
83
+ "frames": frames_b64,
84
+ "reference": reference_b64,
85
+ "ref_num": len(reference_b64 or []),
86
+ "skip": 1,
87
+ "batch_size": min(len(frames_b64), 8),
88
+ "think": False,
89
+ "return_video": False,
90
+ }
91
+ start = time.time()
92
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
93
+ resp.raise_for_status()
94
+ result = resp.json()
95
+ result["latency_sec"] = time.time() - start
96
+ return result
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Evaluation
101
+ # ---------------------------------------------------------------------------
102
+
103
+
104
+ def evaluate_demos(
105
+ manifest_data: Dict,
106
+ base_url: str,
107
+ timeout: float,
108
+ use_reference: bool = False,
109
+ ) -> Dict[str, any]:
110
+ """Evaluate all demos and collect value statistics."""
111
+ session = requests.Session()
112
+ task_name = manifest_data.get("task_name", "")
113
+ demos = manifest_data.get("demos", [])
114
+
115
+ results = []
116
+ failed_demos = []
117
+
118
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
119
+ print(f"Task: {task_name}")
120
+ print(f"Use reference: {use_reference}\n")
121
+
122
+ for demo in tqdm(demos, desc="Processing demos"):
123
+ demo_name = demo["demo_name"]
124
+ frame_paths = demo["frame_paths"]
125
+
126
+ try:
127
+ # Encode frames
128
+ frames_b64 = encode_images(frame_paths)
129
+
130
+ # For now, no reference trajectory (can be added later)
131
+ reference_b64 = None
132
+
133
+ # Call VLAC service
134
+ result = call_trajectory_critic(
135
+ session=session,
136
+ base_url=base_url,
137
+ task=task_name,
138
+ frames_b64=frames_b64,
139
+ reference_b64=reference_b64,
140
+ timeout=timeout,
141
+ )
142
+
143
+ # Extract values
144
+ value_list = result.get("value_list", [])
145
+ if not value_list:
146
+ print(f"\n[warn] No values returned for demo {demo_name}")
147
+ failed_demos.append(demo_name)
148
+ continue
149
+
150
+ # Record results
151
+ demo_result = {
152
+ "demo_name": demo_name,
153
+ "total_frames": demo["total_frames"],
154
+ "success_index": demo["success_index"],
155
+ "num_sampled_frames": len(frame_paths),
156
+ "value_list": value_list,
157
+ "last_value": value_list[-1], # The critical value for success frame
158
+ "mean_value": float(np.mean(value_list)),
159
+ "std_value": float(np.std(value_list)),
160
+ "latency_sec": result.get("latency_sec", 0.0),
161
+ }
162
+ results.append(demo_result)
163
+
164
+ except requests.RequestException as exc:
165
+ print(f"\n[error] Request failed for demo {demo_name}: {exc}")
166
+ failed_demos.append(demo_name)
167
+ except Exception as exc:
168
+ print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
169
+ failed_demos.append(demo_name)
170
+
171
+ return {
172
+ "task_name": task_name,
173
+ "total_demos": len(demos),
174
+ "successful_evals": len(results),
175
+ "failed_demos": failed_demos,
176
+ "results": results,
177
+ }
178
+
179
+
180
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
181
+ """Compute summary statistics from evaluation results."""
182
+ results = evaluation_results["results"]
183
+ if not results:
184
+ return {}
185
+
186
+ last_values = [r["last_value"] for r in results]
187
+ mean_values = [r["mean_value"] for r in results]
188
+ latencies = [r["latency_sec"] for r in results]
189
+
190
+ stats = {
191
+ "last_value_mean": float(np.mean(last_values)),
192
+ "last_value_std": float(np.std(last_values)),
193
+ "last_value_min": float(np.min(last_values)),
194
+ "last_value_max": float(np.max(last_values)),
195
+ "last_value_median": float(np.median(last_values)),
196
+ "last_value_q25": float(np.percentile(last_values, 25)),
197
+ "last_value_q75": float(np.percentile(last_values, 75)),
198
+ "mean_latency": float(np.mean(latencies)),
199
+ "total_evaluated": len(results),
200
+ }
201
+
202
+ # Count how many demos have last_value >= various thresholds
203
+ for threshold in [80, 85, 90, 95, 100]:
204
+ count = sum(1 for v in last_values if v >= threshold)
205
+ stats[f"count_above_{threshold}"] = count
206
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
207
+
208
+ return stats
209
+
210
+
211
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
212
+ """Create visualization plots for value distribution."""
213
+ results = evaluation_results["results"]
214
+ if not results:
215
+ print("No results to plot")
216
+ return
217
+
218
+ task_name = evaluation_results["task_name"]
219
+ last_values = [r["last_value"] for r in results]
220
+
221
+ # Create figure with multiple subplots
222
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
223
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
224
+
225
+ # 1. Histogram of last values
226
+ ax1 = axes[0, 0]
227
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
228
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
229
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
230
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
231
+ ax1.set_ylabel('Frequency', fontsize=12)
232
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
233
+ ax1.legend()
234
+ ax1.grid(True, alpha=0.3)
235
+
236
+ # 2. Box plot of last values
237
+ ax2 = axes[0, 1]
238
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
239
+ for patch in box_data['boxes']:
240
+ patch.set_facecolor('lightblue')
241
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
242
+ ax2.set_ylabel('Value', fontsize=12)
243
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
244
+ ax2.legend()
245
+ ax2.grid(True, alpha=0.3, axis='y')
246
+
247
+ # 3. Value progression across demos
248
+ ax3 = axes[1, 0]
249
+ demo_indices = range(len(results))
250
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
251
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
252
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
253
+ ax3.set_xlabel('Demo Index', fontsize=12)
254
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
255
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
256
+ ax3.legend()
257
+ ax3.grid(True, alpha=0.3)
258
+
259
+ # 4. Cumulative distribution
260
+ ax4 = axes[1, 1]
261
+ sorted_values = np.sort(last_values)
262
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
263
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
264
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
265
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
266
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
267
+ ax4.set_title('Cumulative Distribution', fontsize=14)
268
+ ax4.legend()
269
+ ax4.grid(True, alpha=0.3)
270
+
271
+ plt.tight_layout()
272
+
273
+ # Save the plot
274
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
275
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
276
+ print(f"\nPlot saved to: {plot_path}")
277
+
278
+ # Also save a PDF version
279
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
280
+ plt.savefig(pdf_path, bbox_inches='tight')
281
+ print(f"PDF saved to: {pdf_path}")
282
+
283
+ plt.close()
284
+
285
+
286
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
287
+ """Save evaluation results and statistics to JSON files."""
288
+ task_name = evaluation_results["task_name"]
289
+
290
+ # Save detailed results
291
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
292
+ with results_path.open("w", encoding="utf-8") as f:
293
+ json.dump(evaluation_results, f, indent=2)
294
+ print(f"\nDetailed results saved to: {results_path}")
295
+
296
+ # Save summary statistics
297
+ stats_path = output_dir / f"{task_name}_statistics.json"
298
+ with stats_path.open("w", encoding="utf-8") as f:
299
+ json.dump(statistics, f, indent=2)
300
+ print(f"Statistics saved to: {stats_path}")
301
+
302
+
303
+ # ---------------------------------------------------------------------------
304
+ # CLI
305
+ # ---------------------------------------------------------------------------
306
+
307
+
308
+ def parse_args() -> argparse.Namespace:
309
+ parser = argparse.ArgumentParser(
310
+ description="Evaluate value estimation for test demonstrations"
311
+ )
312
+ parser.add_argument(
313
+ "--manifest-path",
314
+ type=Path,
315
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
316
+ help="Path to the test manifest JSON file",
317
+ )
318
+ parser.add_argument(
319
+ "--output-dir",
320
+ type=Path,
321
+ default="evaluation_results",
322
+ help="Directory to save evaluation results and plots",
323
+ )
324
+ parser.add_argument(
325
+ "--base-url",
326
+ default="http://localhost:8111",
327
+ help="VLAC service base URL (default: http://localhost:8111)",
328
+ )
329
+ parser.add_argument(
330
+ "--timeout",
331
+ type=float,
332
+ default=30.0,
333
+ help="HTTP request timeout in seconds (default: 30.0)",
334
+ )
335
+ parser.add_argument(
336
+ "--use-reference",
337
+ action="store_true",
338
+ help="Use reference trajectory (if available)",
339
+ )
340
+ return parser.parse_args()
341
+
342
+
343
+ def main() -> int:
344
+ args = parse_args()
345
+
346
+ # Read manifest
347
+ try:
348
+ manifest_data = read_manifest(args.manifest_path)
349
+ except FileNotFoundError as exc:
350
+ print(f"Error: {exc}")
351
+ return 1
352
+
353
+ # Create output directory
354
+ output_dir = args.output_dir.expanduser()
355
+ output_dir.mkdir(parents=True, exist_ok=True)
356
+
357
+ # Run evaluation
358
+ print("=" * 80)
359
+ print("VLAC Value Estimation Evaluation")
360
+ print("=" * 80)
361
+
362
+ evaluation_results = evaluate_demos(
363
+ manifest_data=manifest_data,
364
+ base_url=args.base_url,
365
+ timeout=args.timeout,
366
+ use_reference=args.use_reference,
367
+ )
368
+
369
+ # Compute statistics
370
+ statistics = compute_statistics(evaluation_results)
371
+
372
+ # Print summary
373
+ print("\n" + "=" * 80)
374
+ print("EVALUATION SUMMARY")
375
+ print("=" * 80)
376
+ print(f"Task: {evaluation_results['task_name']}")
377
+ print(f"Total demos: {evaluation_results['total_demos']}")
378
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
379
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
380
+
381
+ if statistics:
382
+ print("\n" + "-" * 80)
383
+ print("SUCCESS FRAME VALUE STATISTICS")
384
+ print("-" * 80)
385
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
386
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
387
+ print(f"Median: {statistics['last_value_median']:.2f}")
388
+ print(f"Min: {statistics['last_value_min']:.2f}")
389
+ print(f"Max: {statistics['last_value_max']:.2f}")
390
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
391
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
392
+
393
+ print("\n" + "-" * 80)
394
+ print("THRESHOLD ANALYSIS")
395
+ print("-" * 80)
396
+ for threshold in [80, 85, 90, 95, 100]:
397
+ count = statistics[f"count_above_{threshold}"]
398
+ percent = statistics[f"percent_above_{threshold}"]
399
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
400
+
401
+ print("\n" + "-" * 80)
402
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
403
+ print("-" * 80)
404
+
405
+ # Save results
406
+ save_results(evaluation_results, statistics, output_dir)
407
+
408
+ # Create plots
409
+ if evaluation_results["results"]:
410
+ plot_value_distribution(evaluation_results, output_dir)
411
+ else:
412
+ print("\nNo successful evaluations to plot.")
413
+
414
+ print("\n" + "=" * 80)
415
+ print("EVALUATION COMPLETE")
416
+ print("=" * 80)
417
+
418
+ return 0
419
+
420
+
421
+ if __name__ == "__main__":
422
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151427.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import os
26
+ import glob
27
+ import sys
28
+ import time
29
+ from io import BytesIO
30
+ from pathlib import Path
31
+ from typing import Dict, List, Optional
32
+
33
+ import matplotlib.pyplot as plt
34
+ import numpy as np
35
+ import requests
36
+ from PIL import Image
37
+ from tqdm import tqdm
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Helpers
41
+ # ---------------------------------------------------------------------------
42
+
43
def sample_fixed_interval_frames(image_list: list, num_frames: int) -> list:
    """Sample ``num_frames`` entries from ``image_list`` at equal intervals.

    The first and last entries are always included when the list has more
    than one element.

    NOTE(review): the ``num_frames == 3`` branch always takes ``image_list[1]``
    as the middle frame rather than the true midpoint — confirm intended.

    Raises:
        ValueError: if ``image_list`` is empty.
    """
    # sample num_frames frames from image_list
    # sample with equal interval while also ensuring the first and the last frames are included
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    elif len(image_list) == 1:
        # Only one frame available: repeat it num_frames times.
        return [image_list[0]] * num_frames
    elif num_frames == 2:
        # First and last frame only.
        return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
    elif num_frames == 3:
        # First, second, and last frame (see NOTE in the docstring).
        return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
    else:
        # Evenly spaced indices, inclusive of both endpoints.
        total_frames = len(image_list)
        indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
        sampled_frames = [image_list[i] for i in indices]
        return sampled_frames
59
+
60
+
61
# Number of reference frames sampled per task for the trajectory critic.
num_frames_for_reference = 8
# Hard-coded, machine-local root of the single expert demo frames.
# NOTE(review): this block runs at import time; if a task directory is missing,
# glob returns [] and sample_fixed_interval_frames raises ValueError on import.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
# The ten LIBERO-10 task names; each maps to a "<task>_demo" frame directory.
libero_10_task_list = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
]
# task name -> list of reference frames held as numpy arrays (NOT base64 strings).
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
    ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
    # Lexicographic sort; assumes zero-padded frame filenames — TODO confirm.
    ref_frm_file_list.sort()
    reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
    # presumably (H, W, C) RGB arrays; grayscale PNGs would yield (H, W) — verify.
    reference_frames_temp = [np.array(Image.open(frame)) for frame in reference_frames_temp]
    reference_frames_dict[task_name] = reference_frames_temp
83
+
84
+
85
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and absolutize its per-demo frame paths.

    Frame paths in the manifest are stored relative to the manifest's own
    directory; they are rewritten in place to strings rooted there.

    Raises:
        FileNotFoundError: if ``manifest_path`` does not point at a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    manifest_data = json.loads(manifest_path.read_text(encoding="utf-8"))

    base_dir = manifest_path.parent
    for demo in manifest_data.get("demos", []):
        demo["frame_paths"] = [str(base_dir / rel) for rel in demo["frame_paths"]]

    return manifest_data
99
+
100
+
101
def image_to_base64(path: Path) -> str:
    """Read an image from disk and return it as a base64-encoded JPEG string."""
    buffer = BytesIO()
    with Image.open(path) as img:
        # Force RGB so JPEG encoding also works for palette/RGBA sources.
        img.convert("RGB").save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
108
+
109
+
110
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in ``paths``, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
113
+
114
+
115
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the decoded JSON response with an added ``latency_sec`` field.
    Raises ``requests.HTTPError`` on non-2xx responses (via raise_for_status).
    """
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # Cap the critic batch size at 8 frames per forward pass.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    started_at = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started_at
    return payload
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Evaluation
144
+ # ---------------------------------------------------------------------------
145
+
146
+
147
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, any]:
    """Run the VLAC trajectory critic over every demo in the manifest.

    Args:
        manifest_data: Parsed manifest with ``task_name`` and ``demos`` entries.
        base_url: Root URL of the VLAC service.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: Currently not consulted; the reference frames for the
            manifest's task are always sent.  NOTE(review): wire this flag up
            or remove it.

    Returns:
        Summary dict with per-demo results and the list of failed demo names.
    """

    def _frame_to_b64(frame) -> str:
        # Serialize one reference frame (a numpy image array) to base64 JPEG,
        # matching the encoding used for the demo frames themselves.
        buffer = BytesIO()
        Image.fromarray(frame).convert("RGB").save(buffer, format="JPEG", quality=95)
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            # Encode frames
            frames_b64 = encode_images(frame_paths)

            # BUG FIX: reference_frames_dict holds numpy arrays, which are not
            # JSON-serializable and made every request raise.  Encode them to
            # base64 JPEG strings like the demo frames before sending.
            reference_b64 = [_frame_to_b64(f) for f in reference_frames_dict[task_name]]

            # Call VLAC service
            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            # Extract values
            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            # Record results
            demo_result = {
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            }
            results.append(demo_result)

        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
221
+
222
+
223
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize the last-frame (success) values across all evaluated demos.

    Returns an empty dict when there are no successful evaluations.  Besides
    central moments and quartiles, it reports, for each threshold in
    {80, 85, 90, 95, 100}, how many (and what fraction of) demos reached it.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [r["last_value"] for r in results]
    latencies = [r["latency_sec"] for r in results]

    stats = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Count how many demos have last_value >= various thresholds
    # (removed an unused `mean_values` local from the original).
    for threshold in [80, 85, 90, 95, 100]:
        count = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = count
        stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)

    return stats
252
+
253
+
254
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create visualization plots for value distribution.

    Renders a 2x2 panel (histogram, box plot, per-demo scatter, cumulative
    distribution) of the last-frame "success" values and writes both a PNG
    and a PDF named after the task into ``output_dir``.  No-op (with a
    message) when there are no successful evaluations.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    # One value per demo: the critic's estimate at the final (success) frame.
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values (red dashed line marks the ideal value 100)
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos (scatter, in manifest order)
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution (empirical CDF in percent)
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Close the figure to release matplotlib's global state/memory.
    plt.close()
327
+
328
+
329
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write per-demo results and summary statistics as JSON under ``output_dir``."""
    task_name = evaluation_results["task_name"]

    # Detailed per-demo results
    results_path = output_dir / f"{task_name}_evaluation_results.json"
    results_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {results_path}")

    # Summary statistics
    stats_path = output_dir / f"{task_name}_statistics.json"
    stats_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {stats_path}")
344
+
345
+
346
+ # ---------------------------------------------------------------------------
347
+ # CLI
348
+ # ---------------------------------------------------------------------------
349
+
350
+
351
def parse_args() -> argparse.Namespace:
    """Build and parse the command-line interface for this evaluation script."""
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )
    parser.add_argument(
        "--manifest-path",
        type=Path,
        default=(
            "toy_test_demos_LIBERO_10/"
            "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/"
            "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json"
        ),
        help="Path to the test manifest JSON file",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default="evaluation_results",
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )
    return parser.parse_args()
384
+
385
+
386
def main() -> int:
    """Run the full evaluation pipeline.

    Reads the manifest, queries the VLAC service for every demo, prints a
    textual summary, saves JSON results/statistics, and writes plots.

    Returns:
        Process exit code: 0 on success, 1 when the manifest is missing.
    """
    args = parse_args()

    # Read manifest
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Run evaluation
    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is {} when nothing evaluated successfully; skip details then.
    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0


if __name__ == "__main__":
    sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151542.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import os
26
+ import glob
27
+ import sys
28
+ import time
29
+ from io import BytesIO
30
+ from pathlib import Path
31
+ from typing import Dict, List, Optional
32
+
33
+ import matplotlib.pyplot as plt
34
+ import numpy as np
35
+ import requests
36
+ from PIL import Image
37
+ from tqdm import tqdm
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Helpers
41
+ # ---------------------------------------------------------------------------
42
+
43
def sample_fixed_interval_frames(image_list, num_frames):
    """Pick ``num_frames`` entries from ``image_list`` at (roughly) equal intervals.

    The first and last entries are always included; a single-element list is
    simply repeated ``num_frames`` times.

    Raises:
        ValueError: if ``image_list`` is empty.
    """
    if not image_list:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        return [image_list[0]] * num_frames
    if num_frames == 2:
        # num_frames // 2 == 1 here, so this yields [first, last].
        half = num_frames // 2
        return [image_list[0]] * half + [image_list[-1]] * half
    if num_frames == 3:
        # NOTE(review): the middle slot reuses image_list[1] (the second frame),
        # not the temporal midpoint — kept as-is to preserve behavior.
        return [image_list[0]] + [image_list[1]] * (num_frames - 2) + [image_list[-1]]
    last_index = len(image_list) - 1
    picks = np.linspace(start=0, stop=last_index, num=num_frames, dtype=int)
    return [image_list[i] for i in picks]
59
+
60
+
61
# Number of reference frames sampled per task for the trajectory critic.
num_frames_for_reference = 8
# Hard-coded, machine-local root of the single expert demo frames.
# NOTE(review): this block runs at import time; if a task directory is missing,
# glob returns [] and sample_fixed_interval_frames raises ValueError on import.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
# The ten LIBERO-10 task names; each maps to a "<task>_demo" frame directory.
libero_10_task_list = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
]
# task name -> list of reference frames held as numpy arrays (NOT base64 strings).
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
    ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
    # Lexicographic sort; assumes zero-padded frame filenames — TODO confirm.
    ref_frm_file_list.sort()
    reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
    # presumably (H, W, C) RGB arrays; grayscale PNGs would yield (H, W) — verify.
    reference_frames_temp = [np.array(Image.open(frame)) for frame in reference_frames_temp]
    reference_frames_dict[task_name] = reference_frames_temp
83
+
84
+
85
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and absolutize its per-demo frame paths.

    Frame paths in the manifest are stored relative to the manifest's own
    directory; they are rewritten in place to strings rooted there.

    Raises:
        FileNotFoundError: if ``manifest_path`` does not point at a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    manifest_data = json.loads(manifest_path.read_text(encoding="utf-8"))

    base_dir = manifest_path.parent
    for demo in manifest_data.get("demos", []):
        demo["frame_paths"] = [str(base_dir / rel) for rel in demo["frame_paths"]]

    return manifest_data
99
+
100
+
101
def image_to_base64(path: Path) -> str:
    """Read an image from disk and return it as a base64-encoded JPEG string."""
    buffer = BytesIO()
    with Image.open(path) as img:
        # Force RGB so JPEG encoding also works for palette/RGBA sources.
        img.convert("RGB").save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
108
+
109
+
110
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in ``paths``, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
113
+
114
+
115
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the decoded JSON response with an added ``latency_sec`` field.
    Raises ``requests.HTTPError`` on non-2xx responses (via raise_for_status).
    """
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # Cap the critic batch size at 8 frames per forward pass.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    started_at = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started_at
    return payload
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Evaluation
144
+ # ---------------------------------------------------------------------------
145
+
146
+
147
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, any]:
    """Run the VLAC trajectory critic over every demo in the manifest.

    Args:
        manifest_data: Parsed manifest with ``task_name`` and ``demos`` entries.
        base_url: Root URL of the VLAC service.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: Currently not consulted; the reference frames for the
            manifest's task are always sent.  NOTE(review): wire this flag up
            or remove it.

    Returns:
        Summary dict with per-demo results and the list of failed demo names.
    """

    def _frame_to_b64(frame) -> str:
        # Serialize one reference frame (a numpy image array) to base64 JPEG,
        # matching the encoding used for the demo frames themselves.
        buffer = BytesIO()
        Image.fromarray(frame).convert("RGB").save(buffer, format="JPEG", quality=95)
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            # Encode frames
            frames_b64 = encode_images(frame_paths)

            print(f"Using reference frames for task {task_name}")
            # BUG FIX: reference_frames_dict holds numpy arrays, which are not
            # JSON-serializable and made every request raise.  Encode them to
            # base64 JPEG strings like the demo frames before sending.
            reference_b64 = [_frame_to_b64(f) for f in reference_frames_dict[task_name]]

            # Call VLAC service
            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            # Extract values
            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            # Record results
            demo_result = {
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            }
            results.append(demo_result)

        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
222
+
223
+
224
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize the last-frame (success) values across all evaluated demos.

    Returns an empty dict when there are no successful evaluations.  Besides
    central moments and quartiles, it reports, for each threshold in
    {80, 85, 90, 95, 100}, how many (and what fraction of) demos reached it.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [r["last_value"] for r in results]
    latencies = [r["latency_sec"] for r in results]

    stats = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Count how many demos have last_value >= various thresholds
    # (removed an unused `mean_values` local from the original).
    for threshold in [80, 85, 90, 95, 100]:
        count = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = count
        stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)

    return stats
253
+
254
+
255
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create visualization plots for value distribution.

    Renders a 2x2 panel (histogram, box plot, per-demo scatter, cumulative
    distribution) of the last-frame "success" values and writes both a PNG
    and a PDF named after the task into ``output_dir``.  No-op (with a
    message) when there are no successful evaluations.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    # One value per demo: the critic's estimate at the final (success) frame.
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values (red dashed line marks the ideal value 100)
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos (scatter, in manifest order)
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution (empirical CDF in percent)
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Close the figure to release matplotlib's global state/memory.
    plt.close()
328
+
329
+
330
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write per-demo results and summary statistics as JSON under ``output_dir``."""
    task_name = evaluation_results["task_name"]

    # Detailed per-demo results
    results_path = output_dir / f"{task_name}_evaluation_results.json"
    results_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {results_path}")

    # Summary statistics
    stats_path = output_dir / f"{task_name}_statistics.json"
    stats_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {stats_path}")
345
+
346
+
347
+ # ---------------------------------------------------------------------------
348
+ # CLI
349
+ # ---------------------------------------------------------------------------
350
+
351
+
352
def parse_args() -> argparse.Namespace:
    """Build and parse the command-line interface for this evaluation script."""
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )
    parser.add_argument(
        "--manifest-path",
        type=Path,
        default=(
            "toy_test_demos_LIBERO_10/"
            "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/"
            "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json"
        ),
        help="Path to the test manifest JSON file",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default="evaluation_results",
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )
    return parser.parse_args()
385
+
386
+
387
def main() -> int:
    """Run the full evaluation pipeline.

    Reads the manifest, queries the VLAC service for every demo, prints a
    textual summary, saves JSON results/statistics, and writes plots.

    Returns:
        Process exit code: 0 on success, 1 when the manifest is missing.
    """
    args = parse_args()

    # Read manifest
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Run evaluation
    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is {} when nothing evaluated successfully; skip details then.
    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0


if __name__ == "__main__":
    sys.exit(main())
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151723.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import os
26
+ import glob
27
+ import sys
28
+ import time
29
+ from io import BytesIO
30
+ from pathlib import Path
31
+ from typing import Dict, List, Optional
32
+
33
+ import matplotlib.pyplot as plt
34
+ import numpy as np
35
+ import requests
36
+ from PIL import Image
37
+ from tqdm import tqdm
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Helpers
41
+ # ---------------------------------------------------------------------------
42
+
43
+ def sample_fixed_interval_frames(image_list, num_frames):
44
+ # sample num_frames frames from image_list
45
+ # sample with equal interval while also ensuring the first and the last frames are included
46
+ if len(image_list) == 0:
47
+ raise ValueError("image_list is empty")
48
+ elif len(image_list) == 1:
49
+ return [image_list[0]] * num_frames
50
+ elif num_frames == 2:
51
+ return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
52
+ elif num_frames == 3:
53
+ return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
54
+ else:
55
+ total_frames = len(image_list)
56
+ indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
57
+ sampled_frames = [image_list[i] for i in indices]
58
+ return sampled_frames
59
+
60
+
61
+ num_frames_for_reference = 8
62
+ ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
63
+ libero_10_task_list = [
64
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
65
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
66
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
67
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
68
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
69
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
70
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
71
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
72
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
73
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
74
+ ]
75
+ reference_frames_dict = {}
76
+ for task_name in libero_10_task_list:
77
+ ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
78
+ ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
79
+ ref_frm_file_list.sort()
80
+ reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
81
+ reference_frames_temp = [np.array(Image.open(frame)) for frame in reference_frames_temp]
82
+ reference_frames_dict[task_name] = reference_frames_temp
83
+
84
+
85
+ def read_manifest(manifest_path: Path) -> Dict:
86
+ """Read the test demo manifest JSON file."""
87
+ if not manifest_path.is_file():
88
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
89
+
90
+ with manifest_path.open("r", encoding="utf-8") as f:
91
+ manifest_data = json.load(f)
92
+
93
+ # Convert relative paths to absolute paths
94
+ manifest_dir = manifest_path.parent
95
+ for demo in manifest_data.get("demos", []):
96
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
97
+
98
+ return manifest_data
99
+
100
+
101
+ def image_to_base64(path: Path) -> str:
102
+ """Convert an image file to base64 encoded JPEG."""
103
+ with Image.open(path) as img:
104
+ img = img.convert("RGB")
105
+ buffer = BytesIO()
106
+ img.save(buffer, format="JPEG", quality=95)
107
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
108
+
109
+
110
+ def encode_images(paths: List[str]) -> List[str]:
111
+ """Encode a list of image paths to base64."""
112
+ return [image_to_base64(Path(p)) for p in paths]
113
+
114
+
115
+ def call_trajectory_critic(
116
+ session: requests.Session,
117
+ base_url: str,
118
+ task: str,
119
+ frames_b64: List[str],
120
+ reference_b64: Optional[List[str]],
121
+ timeout: float,
122
+ ) -> Dict:
123
+ """Call the VLAC trajectory-critic endpoint."""
124
+ payload = {
125
+ "task": task,
126
+ "frames": frames_b64,
127
+ "reference": reference_b64,
128
+ "ref_num": len(reference_b64 or []),
129
+ "skip": 1,
130
+ "batch_size": min(len(frames_b64), 8),
131
+ "think": False,
132
+ "return_video": False,
133
+ }
134
+ start = time.time()
135
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
136
+ resp.raise_for_status()
137
+ result = resp.json()
138
+ result["latency_sec"] = time.time() - start
139
+ return result
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Evaluation
144
+ # ---------------------------------------------------------------------------
145
+
146
+
147
+ def evaluate_demos(
148
+ manifest_data: Dict,
149
+ base_url: str,
150
+ timeout: float,
151
+ use_reference: bool = False,
152
+ ) -> Dict[str, any]:
153
+ """Evaluate all demos and collect value statistics."""
154
+ session = requests.Session()
155
+ task_name = manifest_data.get("task_name", "")
156
+ demos = manifest_data.get("demos", [])
157
+
158
+ results = []
159
+ failed_demos = []
160
+
161
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
162
+ print(f"Task: {task_name}")
163
+ print(f"Use reference: {use_reference}\n")
164
+
165
+ for demo in tqdm(demos, desc="Processing demos"):
166
+ demo_name = demo["demo_name"]
167
+ frame_paths = demo["frame_paths"]
168
+
169
+ # try:
170
+ # Encode frames
171
+ frames_b64 = encode_images(frame_paths)
172
+
173
+ # For now, no reference trajectory (can be added later)
174
+ print(f"Using reference frames for task {task_name}")
175
+ reference_b64 = reference_frames_dict[task_name]
176
+
177
+ # Call VLAC service
178
+ result = call_trajectory_critic(
179
+ session=session,
180
+ base_url=base_url,
181
+ task=task_name,
182
+ frames_b64=frames_b64,
183
+ reference_b64=reference_b64,
184
+ timeout=timeout,
185
+ )
186
+
187
+ # Extract values
188
+ value_list = result.get("value_list", [])
189
+ if not value_list:
190
+ print(f"\n[warn] No values returned for demo {demo_name}")
191
+ failed_demos.append(demo_name)
192
+ continue
193
+
194
+ # Record results
195
+ demo_result = {
196
+ "demo_name": demo_name,
197
+ "total_frames": demo["total_frames"],
198
+ "success_index": demo["success_index"],
199
+ "num_sampled_frames": len(frame_paths),
200
+ "value_list": value_list,
201
+ "last_value": value_list[-1], # The critical value for success frame
202
+ "mean_value": float(np.mean(value_list)),
203
+ "std_value": float(np.std(value_list)),
204
+ "latency_sec": result.get("latency_sec", 0.0),
205
+ }
206
+ results.append(demo_result)
207
+
208
+ # except requests.RequestException as exc:
209
+ # print(f"\n[error] Request failed for demo {demo_name}: {exc}")
210
+ # failed_demos.append(demo_name)
211
+ # except Exception as exc:
212
+ # print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
213
+ # failed_demos.append(demo_name)
214
+
215
+ return {
216
+ "task_name": task_name,
217
+ "total_demos": len(demos),
218
+ "successful_evals": len(results),
219
+ "failed_demos": failed_demos,
220
+ "results": results,
221
+ }
222
+
223
+
224
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
225
+ """Compute summary statistics from evaluation results."""
226
+ results = evaluation_results["results"]
227
+ if not results:
228
+ return {}
229
+
230
+ last_values = [r["last_value"] for r in results]
231
+ mean_values = [r["mean_value"] for r in results]
232
+ latencies = [r["latency_sec"] for r in results]
233
+
234
+ stats = {
235
+ "last_value_mean": float(np.mean(last_values)),
236
+ "last_value_std": float(np.std(last_values)),
237
+ "last_value_min": float(np.min(last_values)),
238
+ "last_value_max": float(np.max(last_values)),
239
+ "last_value_median": float(np.median(last_values)),
240
+ "last_value_q25": float(np.percentile(last_values, 25)),
241
+ "last_value_q75": float(np.percentile(last_values, 75)),
242
+ "mean_latency": float(np.mean(latencies)),
243
+ "total_evaluated": len(results),
244
+ }
245
+
246
+ # Count how many demos have last_value >= various thresholds
247
+ for threshold in [80, 85, 90, 95, 100]:
248
+ count = sum(1 for v in last_values if v >= threshold)
249
+ stats[f"count_above_{threshold}"] = count
250
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
251
+
252
+ return stats
253
+
254
+
255
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
256
+ """Create visualization plots for value distribution."""
257
+ results = evaluation_results["results"]
258
+ if not results:
259
+ print("No results to plot")
260
+ return
261
+
262
+ task_name = evaluation_results["task_name"]
263
+ last_values = [r["last_value"] for r in results]
264
+
265
+ # Create figure with multiple subplots
266
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
267
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
268
+
269
+ # 1. Histogram of last values
270
+ ax1 = axes[0, 0]
271
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
272
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
273
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
274
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
275
+ ax1.set_ylabel('Frequency', fontsize=12)
276
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
277
+ ax1.legend()
278
+ ax1.grid(True, alpha=0.3)
279
+
280
+ # 2. Box plot of last values
281
+ ax2 = axes[0, 1]
282
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
283
+ for patch in box_data['boxes']:
284
+ patch.set_facecolor('lightblue')
285
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
286
+ ax2.set_ylabel('Value', fontsize=12)
287
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
288
+ ax2.legend()
289
+ ax2.grid(True, alpha=0.3, axis='y')
290
+
291
+ # 3. Value progression across demos
292
+ ax3 = axes[1, 0]
293
+ demo_indices = range(len(results))
294
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
295
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
296
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
297
+ ax3.set_xlabel('Demo Index', fontsize=12)
298
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
299
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
300
+ ax3.legend()
301
+ ax3.grid(True, alpha=0.3)
302
+
303
+ # 4. Cumulative distribution
304
+ ax4 = axes[1, 1]
305
+ sorted_values = np.sort(last_values)
306
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
307
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
308
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
309
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
310
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
311
+ ax4.set_title('Cumulative Distribution', fontsize=14)
312
+ ax4.legend()
313
+ ax4.grid(True, alpha=0.3)
314
+
315
+ plt.tight_layout()
316
+
317
+ # Save the plot
318
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
319
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
320
+ print(f"\nPlot saved to: {plot_path}")
321
+
322
+ # Also save a PDF version
323
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
324
+ plt.savefig(pdf_path, bbox_inches='tight')
325
+ print(f"PDF saved to: {pdf_path}")
326
+
327
+ plt.close()
328
+
329
+
330
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
331
+ """Save evaluation results and statistics to JSON files."""
332
+ task_name = evaluation_results["task_name"]
333
+
334
+ # Save detailed results
335
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
336
+ with results_path.open("w", encoding="utf-8") as f:
337
+ json.dump(evaluation_results, f, indent=2)
338
+ print(f"\nDetailed results saved to: {results_path}")
339
+
340
+ # Save summary statistics
341
+ stats_path = output_dir / f"{task_name}_statistics.json"
342
+ with stats_path.open("w", encoding="utf-8") as f:
343
+ json.dump(statistics, f, indent=2)
344
+ print(f"Statistics saved to: {stats_path}")
345
+
346
+
347
+ # ---------------------------------------------------------------------------
348
+ # CLI
349
+ # ---------------------------------------------------------------------------
350
+
351
+
352
+ def parse_args() -> argparse.Namespace:
353
+ parser = argparse.ArgumentParser(
354
+ description="Evaluate value estimation for test demonstrations"
355
+ )
356
+ parser.add_argument(
357
+ "--manifest-path",
358
+ type=Path,
359
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
360
+ help="Path to the test manifest JSON file",
361
+ )
362
+ parser.add_argument(
363
+ "--output-dir",
364
+ type=Path,
365
+ default="evaluation_results",
366
+ help="Directory to save evaluation results and plots",
367
+ )
368
+ parser.add_argument(
369
+ "--base-url",
370
+ default="http://localhost:8111",
371
+ help="VLAC service base URL (default: http://localhost:8111)",
372
+ )
373
+ parser.add_argument(
374
+ "--timeout",
375
+ type=float,
376
+ default=30.0,
377
+ help="HTTP request timeout in seconds (default: 30.0)",
378
+ )
379
+ parser.add_argument(
380
+ "--use-reference",
381
+ action="store_true",
382
+ help="Use reference trajectory (if available)",
383
+ )
384
+ return parser.parse_args()
385
+
386
+
387
+ def main() -> int:
388
+ args = parse_args()
389
+
390
+ # Read manifest
391
+ try:
392
+ manifest_data = read_manifest(args.manifest_path)
393
+ except FileNotFoundError as exc:
394
+ print(f"Error: {exc}")
395
+ return 1
396
+
397
+ # Create output directory
398
+ output_dir = args.output_dir.expanduser()
399
+ output_dir.mkdir(parents=True, exist_ok=True)
400
+
401
+ # Run evaluation
402
+ print("=" * 80)
403
+ print("VLAC Value Estimation Evaluation")
404
+ print("=" * 80)
405
+
406
+ evaluation_results = evaluate_demos(
407
+ manifest_data=manifest_data,
408
+ base_url=args.base_url,
409
+ timeout=args.timeout,
410
+ use_reference=args.use_reference,
411
+ )
412
+
413
+ # Compute statistics
414
+ statistics = compute_statistics(evaluation_results)
415
+
416
+ # Print summary
417
+ print("\n" + "=" * 80)
418
+ print("EVALUATION SUMMARY")
419
+ print("=" * 80)
420
+ print(f"Task: {evaluation_results['task_name']}")
421
+ print(f"Total demos: {evaluation_results['total_demos']}")
422
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
423
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
424
+
425
+ if statistics:
426
+ print("\n" + "-" * 80)
427
+ print("SUCCESS FRAME VALUE STATISTICS")
428
+ print("-" * 80)
429
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
430
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
431
+ print(f"Median: {statistics['last_value_median']:.2f}")
432
+ print(f"Min: {statistics['last_value_min']:.2f}")
433
+ print(f"Max: {statistics['last_value_max']:.2f}")
434
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
435
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
436
+
437
+ print("\n" + "-" * 80)
438
+ print("THRESHOLD ANALYSIS")
439
+ print("-" * 80)
440
+ for threshold in [80, 85, 90, 95, 100]:
441
+ count = statistics[f"count_above_{threshold}"]
442
+ percent = statistics[f"percent_above_{threshold}"]
443
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
444
+
445
+ print("\n" + "-" * 80)
446
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
447
+ print("-" * 80)
448
+
449
+ # Save results
450
+ save_results(evaluation_results, statistics, output_dir)
451
+
452
+ # Create plots
453
+ if evaluation_results["results"]:
454
+ plot_value_distribution(evaluation_results, output_dir)
455
+ else:
456
+ print("\nNo successful evaluations to plot.")
457
+
458
+ print("\n" + "=" * 80)
459
+ print("EVALUATION COMPLETE")
460
+ print("=" * 80)
461
+
462
+ return 0
463
+
464
+
465
+ if __name__ == "__main__":
466
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151816.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import os
26
+ import glob
27
+ import sys
28
+ import time
29
+ from io import BytesIO
30
+ from pathlib import Path
31
+ from typing import Dict, List, Optional
32
+
33
+ import matplotlib.pyplot as plt
34
+ import numpy as np
35
+ import requests
36
+ from PIL import Image
37
+ from tqdm import tqdm
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Helpers
41
+ # ---------------------------------------------------------------------------
42
+
43
+ def sample_fixed_interval_frames(image_list, num_frames):
44
+ # sample num_frames frames from image_list
45
+ # sample with equal interval while also ensuring the first and the last frames are included
46
+ if len(image_list) == 0:
47
+ raise ValueError("image_list is empty")
48
+ elif len(image_list) == 1:
49
+ return [image_list[0]] * num_frames
50
+ elif num_frames == 2:
51
+ return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
52
+ elif num_frames == 3:
53
+ return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
54
+ else:
55
+ total_frames = len(image_list)
56
+ indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
57
+ sampled_frames = [image_list[i] for i in indices]
58
+ return sampled_frames
59
+
60
+
61
+ num_frames_for_reference = 8
62
+ ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
63
+ libero_10_task_list = [
64
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
65
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
66
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
67
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
68
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
69
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
70
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
71
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
72
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
73
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
74
+ ]
75
+ reference_frames_dict = {}
76
+ for task_name in libero_10_task_list:
77
+ ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
78
+ ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
79
+ ref_frm_file_list.sort()
80
+ reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
81
+ reference_frames_dict[task_name] = reference_frames_temp
82
+
83
+
84
+ def read_manifest(manifest_path: Path) -> Dict:
85
+ """Read the test demo manifest JSON file."""
86
+ if not manifest_path.is_file():
87
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
88
+
89
+ with manifest_path.open("r", encoding="utf-8") as f:
90
+ manifest_data = json.load(f)
91
+
92
+ # Convert relative paths to absolute paths
93
+ manifest_dir = manifest_path.parent
94
+ for demo in manifest_data.get("demos", []):
95
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
96
+
97
+ return manifest_data
98
+
99
+
100
+ def image_to_base64(path: Path) -> str:
101
+ """Convert an image file to base64 encoded JPEG."""
102
+ with Image.open(path) as img:
103
+ img = img.convert("RGB")
104
+ buffer = BytesIO()
105
+ img.save(buffer, format="JPEG", quality=95)
106
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
107
+
108
+
109
+ def encode_images(paths: List[str]) -> List[str]:
110
+ """Encode a list of image paths to base64."""
111
+ return [image_to_base64(Path(p)) for p in paths]
112
+
113
+
114
+ def call_trajectory_critic(
115
+ session: requests.Session,
116
+ base_url: str,
117
+ task: str,
118
+ frames_b64: List[str],
119
+ reference_b64: Optional[List[str]],
120
+ timeout: float,
121
+ ) -> Dict:
122
+ """Call the VLAC trajectory-critic endpoint."""
123
+ payload = {
124
+ "task": task,
125
+ "frames": frames_b64,
126
+ "reference": reference_b64,
127
+ "ref_num": len(reference_b64 or []),
128
+ "skip": 1,
129
+ "batch_size": min(len(frames_b64), 8),
130
+ "think": False,
131
+ "return_video": False,
132
+ }
133
+ start = time.time()
134
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
135
+ resp.raise_for_status()
136
+ result = resp.json()
137
+ result["latency_sec"] = time.time() - start
138
+ return result
139
+
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Evaluation
143
+ # ---------------------------------------------------------------------------
144
+
145
+
146
+ def evaluate_demos(
147
+ manifest_data: Dict,
148
+ base_url: str,
149
+ timeout: float,
150
+ use_reference: bool = False,
151
+ ) -> Dict[str, any]:
152
+ """Evaluate all demos and collect value statistics."""
153
+ session = requests.Session()
154
+ task_name = manifest_data.get("task_name", "")
155
+ demos = manifest_data.get("demos", [])
156
+
157
+ results = []
158
+ failed_demos = []
159
+
160
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
161
+ print(f"Task: {task_name}")
162
+ print(f"Use reference: {use_reference}\n")
163
+
164
+ for demo in tqdm(demos, desc="Processing demos"):
165
+ demo_name = demo["demo_name"]
166
+ frame_paths = demo["frame_paths"]
167
+
168
+ # try:
169
+ # Encode frames
170
+ frames_b64 = encode_images(frame_paths)
171
+
172
+ # For now, no reference trajectory (can be added later)
173
+ print(f"Using reference frames for task {task_name}")
174
+ reference_b64 = encode_images(reference_frames_dict[task_name])
175
+
176
+ # Call VLAC service
177
+ result = call_trajectory_critic(
178
+ session=session,
179
+ base_url=base_url,
180
+ task=task_name,
181
+ frames_b64=frames_b64,
182
+ reference_b64=reference_b64,
183
+ timeout=timeout,
184
+ )
185
+
186
+ # Extract values
187
+ value_list = result.get("value_list", [])
188
+ if not value_list:
189
+ print(f"\n[warn] No values returned for demo {demo_name}")
190
+ failed_demos.append(demo_name)
191
+ continue
192
+
193
+ # Record results
194
+ demo_result = {
195
+ "demo_name": demo_name,
196
+ "total_frames": demo["total_frames"],
197
+ "success_index": demo["success_index"],
198
+ "num_sampled_frames": len(frame_paths),
199
+ "value_list": value_list,
200
+ "last_value": value_list[-1], # The critical value for success frame
201
+ "mean_value": float(np.mean(value_list)),
202
+ "std_value": float(np.std(value_list)),
203
+ "latency_sec": result.get("latency_sec", 0.0),
204
+ }
205
+ results.append(demo_result)
206
+
207
+ # except requests.RequestException as exc:
208
+ # print(f"\n[error] Request failed for demo {demo_name}: {exc}")
209
+ # failed_demos.append(demo_name)
210
+ # except Exception as exc:
211
+ # print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
212
+ # failed_demos.append(demo_name)
213
+
214
+ return {
215
+ "task_name": task_name,
216
+ "total_demos": len(demos),
217
+ "successful_evals": len(results),
218
+ "failed_demos": failed_demos,
219
+ "results": results,
220
+ }
221
+
222
+
223
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
224
+ """Compute summary statistics from evaluation results."""
225
+ results = evaluation_results["results"]
226
+ if not results:
227
+ return {}
228
+
229
+ last_values = [r["last_value"] for r in results]
230
+ mean_values = [r["mean_value"] for r in results]
231
+ latencies = [r["latency_sec"] for r in results]
232
+
233
+ stats = {
234
+ "last_value_mean": float(np.mean(last_values)),
235
+ "last_value_std": float(np.std(last_values)),
236
+ "last_value_min": float(np.min(last_values)),
237
+ "last_value_max": float(np.max(last_values)),
238
+ "last_value_median": float(np.median(last_values)),
239
+ "last_value_q25": float(np.percentile(last_values, 25)),
240
+ "last_value_q75": float(np.percentile(last_values, 75)),
241
+ "mean_latency": float(np.mean(latencies)),
242
+ "total_evaluated": len(results),
243
+ }
244
+
245
+ # Count how many demos have last_value >= various thresholds
246
+ for threshold in [80, 85, 90, 95, 100]:
247
+ count = sum(1 for v in last_values if v >= threshold)
248
+ stats[f"count_above_{threshold}"] = count
249
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
250
+
251
+ return stats
252
+
253
+
254
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
255
+ """Create visualization plots for value distribution."""
256
+ results = evaluation_results["results"]
257
+ if not results:
258
+ print("No results to plot")
259
+ return
260
+
261
+ task_name = evaluation_results["task_name"]
262
+ last_values = [r["last_value"] for r in results]
263
+
264
+ # Create figure with multiple subplots
265
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
266
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
267
+
268
+ # 1. Histogram of last values
269
+ ax1 = axes[0, 0]
270
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
271
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
272
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
273
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
274
+ ax1.set_ylabel('Frequency', fontsize=12)
275
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
276
+ ax1.legend()
277
+ ax1.grid(True, alpha=0.3)
278
+
279
+ # 2. Box plot of last values
280
+ ax2 = axes[0, 1]
281
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
282
+ for patch in box_data['boxes']:
283
+ patch.set_facecolor('lightblue')
284
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
285
+ ax2.set_ylabel('Value', fontsize=12)
286
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
287
+ ax2.legend()
288
+ ax2.grid(True, alpha=0.3, axis='y')
289
+
290
+ # 3. Value progression across demos
291
+ ax3 = axes[1, 0]
292
+ demo_indices = range(len(results))
293
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
294
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
295
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
296
+ ax3.set_xlabel('Demo Index', fontsize=12)
297
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
298
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
299
+ ax3.legend()
300
+ ax3.grid(True, alpha=0.3)
301
+
302
+ # 4. Cumulative distribution
303
+ ax4 = axes[1, 1]
304
+ sorted_values = np.sort(last_values)
305
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
306
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
307
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
308
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
309
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
310
+ ax4.set_title('Cumulative Distribution', fontsize=14)
311
+ ax4.legend()
312
+ ax4.grid(True, alpha=0.3)
313
+
314
+ plt.tight_layout()
315
+
316
+ # Save the plot
317
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
318
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
319
+ print(f"\nPlot saved to: {plot_path}")
320
+
321
+ # Also save a PDF version
322
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
323
+ plt.savefig(pdf_path, bbox_inches='tight')
324
+ print(f"PDF saved to: {pdf_path}")
325
+
326
+ plt.close()
327
+
328
+
329
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
330
+ """Save evaluation results and statistics to JSON files."""
331
+ task_name = evaluation_results["task_name"]
332
+
333
+ # Save detailed results
334
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
335
+ with results_path.open("w", encoding="utf-8") as f:
336
+ json.dump(evaluation_results, f, indent=2)
337
+ print(f"\nDetailed results saved to: {results_path}")
338
+
339
+ # Save summary statistics
340
+ stats_path = output_dir / f"{task_name}_statistics.json"
341
+ with stats_path.open("w", encoding="utf-8") as f:
342
+ json.dump(statistics, f, indent=2)
343
+ print(f"Statistics saved to: {stats_path}")
344
+
345
+
346
+ # ---------------------------------------------------------------------------
347
+ # CLI
348
+ # ---------------------------------------------------------------------------
349
+
350
+
351
+ def parse_args() -> argparse.Namespace:
352
+ parser = argparse.ArgumentParser(
353
+ description="Evaluate value estimation for test demonstrations"
354
+ )
355
+ parser.add_argument(
356
+ "--manifest-path",
357
+ type=Path,
358
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
359
+ help="Path to the test manifest JSON file",
360
+ )
361
+ parser.add_argument(
362
+ "--output-dir",
363
+ type=Path,
364
+ default="evaluation_results",
365
+ help="Directory to save evaluation results and plots",
366
+ )
367
+ parser.add_argument(
368
+ "--base-url",
369
+ default="http://localhost:8111",
370
+ help="VLAC service base URL (default: http://localhost:8111)",
371
+ )
372
+ parser.add_argument(
373
+ "--timeout",
374
+ type=float,
375
+ default=30.0,
376
+ help="HTTP request timeout in seconds (default: 30.0)",
377
+ )
378
+ parser.add_argument(
379
+ "--use-reference",
380
+ action="store_true",
381
+ help="Use reference trajectory (if available)",
382
+ )
383
+ return parser.parse_args()
384
+
385
+
386
+ def main() -> int:
387
+ args = parse_args()
388
+
389
+ # Read manifest
390
+ try:
391
+ manifest_data = read_manifest(args.manifest_path)
392
+ except FileNotFoundError as exc:
393
+ print(f"Error: {exc}")
394
+ return 1
395
+
396
+ # Create output directory
397
+ output_dir = args.output_dir.expanduser()
398
+ output_dir.mkdir(parents=True, exist_ok=True)
399
+
400
+ # Run evaluation
401
+ print("=" * 80)
402
+ print("VLAC Value Estimation Evaluation")
403
+ print("=" * 80)
404
+
405
+ evaluation_results = evaluate_demos(
406
+ manifest_data=manifest_data,
407
+ base_url=args.base_url,
408
+ timeout=args.timeout,
409
+ use_reference=args.use_reference,
410
+ )
411
+
412
+ # Compute statistics
413
+ statistics = compute_statistics(evaluation_results)
414
+
415
+ # Print summary
416
+ print("\n" + "=" * 80)
417
+ print("EVALUATION SUMMARY")
418
+ print("=" * 80)
419
+ print(f"Task: {evaluation_results['task_name']}")
420
+ print(f"Total demos: {evaluation_results['total_demos']}")
421
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
422
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
423
+
424
+ if statistics:
425
+ print("\n" + "-" * 80)
426
+ print("SUCCESS FRAME VALUE STATISTICS")
427
+ print("-" * 80)
428
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
429
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
430
+ print(f"Median: {statistics['last_value_median']:.2f}")
431
+ print(f"Min: {statistics['last_value_min']:.2f}")
432
+ print(f"Max: {statistics['last_value_max']:.2f}")
433
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
434
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
435
+
436
+ print("\n" + "-" * 80)
437
+ print("THRESHOLD ANALYSIS")
438
+ print("-" * 80)
439
+ for threshold in [80, 85, 90, 95, 100]:
440
+ count = statistics[f"count_above_{threshold}"]
441
+ percent = statistics[f"percent_above_{threshold}"]
442
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
443
+
444
+ print("\n" + "-" * 80)
445
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
446
+ print("-" * 80)
447
+
448
+ # Save results
449
+ save_results(evaluation_results, statistics, output_dir)
450
+
451
+ # Create plots
452
+ if evaluation_results["results"]:
453
+ plot_value_distribution(evaluation_results, output_dir)
454
+ else:
455
+ print("\nNo successful evaluations to plot.")
456
+
457
+ print("\n" + "=" * 80)
458
+ print("EVALUATION COMPLETE")
459
+ print("=" * 80)
460
+
461
+ return 0
462
+
463
+
464
+ if __name__ == "__main__":
465
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152522.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # Helpers
53
+ # ---------------------------------------------------------------------------
54
+
55
+ def sample_fixed_interval_frames(image_list, num_frames):
56
+ # sample num_frames frames from image_list
57
+ # sample with equal interval while also ensuring the first and the last frames are included
58
+ if len(image_list) == 0:
59
+ raise ValueError("image_list is empty")
60
+ elif len(image_list) == 1:
61
+ return [image_list[0]] * num_frames
62
+ elif num_frames == 2:
63
+ return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
64
+ elif num_frames == 3:
65
+ return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
66
+ else:
67
+ total_frames = len(image_list)
68
+ indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
69
+ sampled_frames = [image_list[i] for i in indices]
70
+ return sampled_frames
71
+
72
+
73
+ num_frames_for_reference = 8
74
+ ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
75
+ libero_10_task_list = [
76
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
77
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
78
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
79
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
80
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
81
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
82
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
83
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
84
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
85
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
86
+ ]
87
+ reference_frames_dict = {}
88
+ for task_name in libero_10_task_list:
89
+ ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
90
+ ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
91
+ ref_frm_file_list.sort()
92
+ reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
93
+ reference_frames_dict[task_name] = reference_frames_temp
94
+
95
+
96
+ def read_manifest(manifest_path: Path) -> Dict:
97
+ """Read the test demo manifest JSON file."""
98
+ if not manifest_path.is_file():
99
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
100
+
101
+ with manifest_path.open("r", encoding="utf-8") as f:
102
+ manifest_data = json.load(f)
103
+
104
+ # Convert relative paths to absolute paths
105
+ manifest_dir = manifest_path.parent
106
+ for demo in manifest_data.get("demos", []):
107
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
108
+
109
+ return manifest_data
110
+
111
+
112
+ def image_to_base64(path: Path) -> str:
113
+ """Convert an image file to base64 encoded JPEG."""
114
+ with Image.open(path) as img:
115
+ img = img.convert("RGB")
116
+ buffer = BytesIO()
117
+ img.save(buffer, format="JPEG", quality=95)
118
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
119
+
120
+
121
+ def encode_images(paths: List[str]) -> List[str]:
122
+ """Encode a list of image paths to base64."""
123
+ return [image_to_base64(Path(p)) for p in paths]
124
+
125
+
126
+ def call_trajectory_critic(
127
+ session: requests.Session,
128
+ base_url: str,
129
+ task: str,
130
+ frames_b64: List[str],
131
+ reference_b64: Optional[List[str]],
132
+ timeout: float,
133
+ ) -> Dict:
134
+ """Call the VLAC trajectory-critic endpoint."""
135
+ payload = {
136
+ "task": task,
137
+ "frames": frames_b64,
138
+ "reference": reference_b64,
139
+ "ref_num": len(reference_b64 or []),
140
+ "skip": 1,
141
+ "batch_size": min(len(frames_b64), 8),
142
+ "think": False,
143
+ "return_video": False,
144
+ }
145
+ start = time.time()
146
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
147
+ resp.raise_for_status()
148
+ result = resp.json()
149
+ result["latency_sec"] = time.time() - start
150
+ return result
151
+
152
+
153
+ # ---------------------------------------------------------------------------
154
+ # Evaluation
155
+ # ---------------------------------------------------------------------------
156
+
157
+
158
+ def evaluate_demos(
159
+ manifest_data: Dict,
160
+ base_url: str,
161
+ timeout: float,
162
+ use_reference: bool = False,
163
+ ) -> Dict[str, any]:
164
+ """Evaluate all demos and collect value statistics."""
165
+ session = requests.Session()
166
+ task_name = manifest_data.get("task_name", "")
167
+ demos = manifest_data.get("demos", [])
168
+
169
+ results = []
170
+ failed_demos = []
171
+
172
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
173
+ print(f"Task: {task_name}")
174
+ print(f"Use reference: {use_reference}\n")
175
+
176
+ for demo in tqdm(demos, desc="Processing demos"):
177
+ demo_name = demo["demo_name"]
178
+ frame_paths = demo["frame_paths"]
179
+
180
+ # try:
181
+ # Encode frames
182
+ frames_b64 = encode_images(frame_paths)
183
+
184
+ # For now, no reference trajectory (can be added later)
185
+ print(f"Using reference frames for task {task_name}")
186
+ reference_b64 = encode_images(reference_frames_dict[task_name])
187
+
188
+ # Call VLAC service
189
+ result = call_trajectory_critic(
190
+ session=session,
191
+ base_url=base_url,
192
+ task=task_name,
193
+ frames_b64=frames_b64,
194
+ reference_b64=reference_b64,
195
+ timeout=timeout,
196
+ )
197
+
198
+ # Extract values
199
+ value_list = result.get("value_list", [])
200
+ if not value_list:
201
+ print(f"\n[warn] No values returned for demo {demo_name}")
202
+ failed_demos.append(demo_name)
203
+ continue
204
+
205
+ # Record results
206
+ demo_result = {
207
+ "demo_name": demo_name,
208
+ "total_frames": demo["total_frames"],
209
+ "success_index": demo["success_index"],
210
+ "num_sampled_frames": len(frame_paths),
211
+ "value_list": value_list,
212
+ "last_value": value_list[-1], # The critical value for success frame
213
+ "mean_value": float(np.mean(value_list)),
214
+ "std_value": float(np.std(value_list)),
215
+ "latency_sec": result.get("latency_sec", 0.0),
216
+ }
217
+ results.append(demo_result)
218
+
219
+ # except requests.RequestException as exc:
220
+ # print(f"\n[error] Request failed for demo {demo_name}: {exc}")
221
+ # failed_demos.append(demo_name)
222
+ # except Exception as exc:
223
+ # print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
224
+ # failed_demos.append(demo_name)
225
+
226
+ return {
227
+ "task_name": task_name,
228
+ "total_demos": len(demos),
229
+ "successful_evals": len(results),
230
+ "failed_demos": failed_demos,
231
+ "results": results,
232
+ }
233
+
234
+
235
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
236
+ """Compute summary statistics from evaluation results."""
237
+ results = evaluation_results["results"]
238
+ if not results:
239
+ return {}
240
+
241
+ last_values = [r["last_value"] for r in results]
242
+ mean_values = [r["mean_value"] for r in results]
243
+ latencies = [r["latency_sec"] for r in results]
244
+
245
+ stats = {
246
+ "last_value_mean": float(np.mean(last_values)),
247
+ "last_value_std": float(np.std(last_values)),
248
+ "last_value_min": float(np.min(last_values)),
249
+ "last_value_max": float(np.max(last_values)),
250
+ "last_value_median": float(np.median(last_values)),
251
+ "last_value_q25": float(np.percentile(last_values, 25)),
252
+ "last_value_q75": float(np.percentile(last_values, 75)),
253
+ "mean_latency": float(np.mean(latencies)),
254
+ "total_evaluated": len(results),
255
+ }
256
+
257
+ # Count how many demos have last_value >= various thresholds
258
+ for threshold in [80, 85, 90, 95, 100]:
259
+ count = sum(1 for v in last_values if v >= threshold)
260
+ stats[f"count_above_{threshold}"] = count
261
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
262
+
263
+ return stats
264
+
265
+
266
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
267
+ """Create visualization plots for value distribution."""
268
+ results = evaluation_results["results"]
269
+ if not results:
270
+ print("No results to plot")
271
+ return
272
+
273
+ task_name = evaluation_results["task_name"]
274
+ last_values = [r["last_value"] for r in results]
275
+
276
+ # Create figure with multiple subplots
277
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
278
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
279
+
280
+ # 1. Histogram of last values
281
+ ax1 = axes[0, 0]
282
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
283
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
284
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
285
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
286
+ ax1.set_ylabel('Frequency', fontsize=12)
287
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
288
+ ax1.legend()
289
+ ax1.grid(True, alpha=0.3)
290
+
291
+ # 2. Box plot of last values
292
+ ax2 = axes[0, 1]
293
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
294
+ for patch in box_data['boxes']:
295
+ patch.set_facecolor('lightblue')
296
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
297
+ ax2.set_ylabel('Value', fontsize=12)
298
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
299
+ ax2.legend()
300
+ ax2.grid(True, alpha=0.3, axis='y')
301
+
302
+ # 3. Value progression across demos
303
+ ax3 = axes[1, 0]
304
+ demo_indices = range(len(results))
305
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
306
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
307
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
308
+ ax3.set_xlabel('Demo Index', fontsize=12)
309
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
310
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
311
+ ax3.legend()
312
+ ax3.grid(True, alpha=0.3)
313
+
314
+ # 4. Cumulative distribution
315
+ ax4 = axes[1, 1]
316
+ sorted_values = np.sort(last_values)
317
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
318
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
319
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
320
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
321
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
322
+ ax4.set_title('Cumulative Distribution', fontsize=14)
323
+ ax4.legend()
324
+ ax4.grid(True, alpha=0.3)
325
+
326
+ plt.tight_layout()
327
+
328
+ # Save the plot
329
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
330
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
331
+ print(f"\nPlot saved to: {plot_path}")
332
+
333
+ # Also save a PDF version
334
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
335
+ plt.savefig(pdf_path, bbox_inches='tight')
336
+ print(f"PDF saved to: {pdf_path}")
337
+
338
+ plt.close()
339
+
340
+
341
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
342
+ """Save evaluation results and statistics to JSON files."""
343
+ task_name = evaluation_results["task_name"]
344
+
345
+ # Save detailed results
346
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
347
+ with results_path.open("w", encoding="utf-8") as f:
348
+ json.dump(evaluation_results, f, indent=2)
349
+ print(f"\nDetailed results saved to: {results_path}")
350
+
351
+ # Save summary statistics
352
+ stats_path = output_dir / f"{task_name}_statistics.json"
353
+ with stats_path.open("w", encoding="utf-8") as f:
354
+ json.dump(statistics, f, indent=2)
355
+ print(f"Statistics saved to: {stats_path}")
356
+
357
+
358
+ # ---------------------------------------------------------------------------
359
+ # CLI
360
+ # ---------------------------------------------------------------------------
361
+
362
+
363
+ def parse_args() -> argparse.Namespace:
364
+ parser = argparse.ArgumentParser(
365
+ description="Evaluate value estimation for test demonstrations"
366
+ )
367
+ parser.add_argument(
368
+ "--manifest-path",
369
+ type=Path,
370
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
371
+ help="Path to the test manifest JSON file",
372
+ )
373
+ parser.add_argument(
374
+ "--output-dir",
375
+ type=Path,
376
+ default="evaluation_results",
377
+ help="Directory to save evaluation results and plots",
378
+ )
379
+ parser.add_argument(
380
+ "--base-url",
381
+ default="http://localhost:8111",
382
+ help="VLAC service base URL (default: http://localhost:8111)",
383
+ )
384
+ parser.add_argument(
385
+ "--timeout",
386
+ type=float,
387
+ default=30.0,
388
+ help="HTTP request timeout in seconds (default: 30.0)",
389
+ )
390
+ parser.add_argument(
391
+ "--use-reference",
392
+ action="store_true",
393
+ help="Use reference trajectory (if available)",
394
+ )
395
+ return parser.parse_args()
396
+
397
+
398
+ def main() -> int:
399
+ args = parse_args()
400
+
401
+ # Read manifest
402
+ try:
403
+ manifest_data = read_manifest(args.manifest_path)
404
+ except FileNotFoundError as exc:
405
+ print(f"Error: {exc}")
406
+ return 1
407
+
408
+ # Create output directory
409
+ output_dir = args.output_dir.expanduser()
410
+ output_dir.mkdir(parents=True, exist_ok=True)
411
+
412
+ # Run evaluation
413
+ print("=" * 80)
414
+ print("VLAC Value Estimation Evaluation")
415
+ print("=" * 80)
416
+
417
+ evaluation_results = evaluate_demos(
418
+ manifest_data=manifest_data,
419
+ base_url=args.base_url,
420
+ timeout=args.timeout,
421
+ use_reference=args.use_reference,
422
+ )
423
+
424
+ # Compute statistics
425
+ statistics = compute_statistics(evaluation_results)
426
+
427
+ # Print summary
428
+ print("\n" + "=" * 80)
429
+ print("EVALUATION SUMMARY")
430
+ print("=" * 80)
431
+ print(f"Task: {evaluation_results['task_name']}")
432
+ print(f"Total demos: {evaluation_results['total_demos']}")
433
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
434
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
435
+
436
+ if statistics:
437
+ print("\n" + "-" * 80)
438
+ print("SUCCESS FRAME VALUE STATISTICS")
439
+ print("-" * 80)
440
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
441
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
442
+ print(f"Median: {statistics['last_value_median']:.2f}")
443
+ print(f"Min: {statistics['last_value_min']:.2f}")
444
+ print(f"Max: {statistics['last_value_max']:.2f}")
445
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
446
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
447
+
448
+ print("\n" + "-" * 80)
449
+ print("THRESHOLD ANALYSIS")
450
+ print("-" * 80)
451
+ for threshold in [80, 85, 90, 95, 100]:
452
+ count = statistics[f"count_above_{threshold}"]
453
+ percent = statistics[f"percent_above_{threshold}"]
454
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
455
+
456
+ print("\n" + "-" * 80)
457
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
458
+ print("-" * 80)
459
+
460
+ # Save results
461
+ save_results(evaluation_results, statistics, output_dir)
462
+
463
+ # Create plots
464
+ if evaluation_results["results"]:
465
+ plot_value_distribution(evaluation_results, output_dir)
466
+ else:
467
+ print("\nNo successful evaluations to plot.")
468
+
469
+ print("\n" + "=" * 80)
470
+ print("EVALUATION COMPLETE")
471
+ print("=" * 80)
472
+
473
+ return 0
474
+
475
+
476
+ if __name__ == "__main__":
477
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152534.py ADDED
@@ -0,0 +1,491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
+ # LIBERO-10 task list
52
+ LIBERO_10_TASKS = [
53
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
54
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
55
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
56
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
57
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
58
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
59
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
60
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
61
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
62
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
63
+ ]
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Helpers
67
+ # ---------------------------------------------------------------------------
68
+
69
+ def sample_fixed_interval_frames(image_list, num_frames):
70
+ # sample num_frames frames from image_list
71
+ # sample with equal interval while also ensuring the first and the last frames are included
72
+ if len(image_list) == 0:
73
+ raise ValueError("image_list is empty")
74
+ elif len(image_list) == 1:
75
+ return [image_list[0]] * num_frames
76
+ elif num_frames == 2:
77
+ return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
78
+ elif num_frames == 3:
79
+ return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
80
+ else:
81
+ total_frames = len(image_list)
82
+ indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
83
+ sampled_frames = [image_list[i] for i in indices]
84
+ return sampled_frames
85
+
86
+
87
+ num_frames_for_reference = 8
88
+ ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
89
+ libero_10_task_list = [
90
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
91
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
92
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
93
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
94
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
95
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
96
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
97
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
98
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
99
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
100
+ ]
101
+ reference_frames_dict = {}
102
+ for task_name in libero_10_task_list:
103
+ ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
104
+ ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
105
+ ref_frm_file_list.sort()
106
+ reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
107
+ reference_frames_dict[task_name] = reference_frames_temp
108
+
109
+
110
+ def read_manifest(manifest_path: Path) -> Dict:
111
+ """Read the test demo manifest JSON file."""
112
+ if not manifest_path.is_file():
113
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
114
+
115
+ with manifest_path.open("r", encoding="utf-8") as f:
116
+ manifest_data = json.load(f)
117
+
118
+ # Convert relative paths to absolute paths
119
+ manifest_dir = manifest_path.parent
120
+ for demo in manifest_data.get("demos", []):
121
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
122
+
123
+ return manifest_data
124
+
125
+
126
+ def image_to_base64(path: Path) -> str:
127
+ """Convert an image file to base64 encoded JPEG."""
128
+ with Image.open(path) as img:
129
+ img = img.convert("RGB")
130
+ buffer = BytesIO()
131
+ img.save(buffer, format="JPEG", quality=95)
132
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
133
+
134
+
135
+ def encode_images(paths: List[str]) -> List[str]:
136
+ """Encode a list of image paths to base64."""
137
+ return [image_to_base64(Path(p)) for p in paths]
138
+
139
+
140
+ def call_trajectory_critic(
141
+ session: requests.Session,
142
+ base_url: str,
143
+ task: str,
144
+ frames_b64: List[str],
145
+ reference_b64: Optional[List[str]],
146
+ timeout: float,
147
+ ) -> Dict:
148
+ """Call the VLAC trajectory-critic endpoint."""
149
+ payload = {
150
+ "task": task,
151
+ "frames": frames_b64,
152
+ "reference": reference_b64,
153
+ "ref_num": len(reference_b64 or []),
154
+ "skip": 1,
155
+ "batch_size": min(len(frames_b64), 8),
156
+ "think": False,
157
+ "return_video": False,
158
+ }
159
+ start = time.time()
160
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
161
+ resp.raise_for_status()
162
+ result = resp.json()
163
+ result["latency_sec"] = time.time() - start
164
+ return result
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Evaluation
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
+ def evaluate_demos(
173
+ manifest_data: Dict,
174
+ base_url: str,
175
+ timeout: float,
176
+ use_reference: bool = False,
177
+ ) -> Dict[str, any]:
178
+ """Evaluate all demos and collect value statistics."""
179
+ session = requests.Session()
180
+ task_name = manifest_data.get("task_name", "")
181
+ demos = manifest_data.get("demos", [])
182
+
183
+ results = []
184
+ failed_demos = []
185
+
186
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
187
+ print(f"Task: {task_name}")
188
+ print(f"Use reference: {use_reference}\n")
189
+
190
+ for demo in tqdm(demos, desc="Processing demos"):
191
+ demo_name = demo["demo_name"]
192
+ frame_paths = demo["frame_paths"]
193
+
194
+ # try:
195
+ # Encode frames
196
+ frames_b64 = encode_images(frame_paths)
197
+
198
+ # For now, no reference trajectory (can be added later)
199
+ print(f"Using reference frames for task {task_name}")
200
+ reference_b64 = encode_images(reference_frames_dict[task_name])
201
+
202
+ # Call VLAC service
203
+ result = call_trajectory_critic(
204
+ session=session,
205
+ base_url=base_url,
206
+ task=task_name,
207
+ frames_b64=frames_b64,
208
+ reference_b64=reference_b64,
209
+ timeout=timeout,
210
+ )
211
+
212
+ # Extract values
213
+ value_list = result.get("value_list", [])
214
+ if not value_list:
215
+ print(f"\n[warn] No values returned for demo {demo_name}")
216
+ failed_demos.append(demo_name)
217
+ continue
218
+
219
+ # Record results
220
+ demo_result = {
221
+ "demo_name": demo_name,
222
+ "total_frames": demo["total_frames"],
223
+ "success_index": demo["success_index"],
224
+ "num_sampled_frames": len(frame_paths),
225
+ "value_list": value_list,
226
+ "last_value": value_list[-1], # The critical value for success frame
227
+ "mean_value": float(np.mean(value_list)),
228
+ "std_value": float(np.std(value_list)),
229
+ "latency_sec": result.get("latency_sec", 0.0),
230
+ }
231
+ results.append(demo_result)
232
+
233
+ # except requests.RequestException as exc:
234
+ # print(f"\n[error] Request failed for demo {demo_name}: {exc}")
235
+ # failed_demos.append(demo_name)
236
+ # except Exception as exc:
237
+ # print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
238
+ # failed_demos.append(demo_name)
239
+
240
+ return {
241
+ "task_name": task_name,
242
+ "total_demos": len(demos),
243
+ "successful_evals": len(results),
244
+ "failed_demos": failed_demos,
245
+ "results": results,
246
+ }
247
+
248
+
249
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
250
+ """Compute summary statistics from evaluation results."""
251
+ results = evaluation_results["results"]
252
+ if not results:
253
+ return {}
254
+
255
+ last_values = [r["last_value"] for r in results]
256
+ mean_values = [r["mean_value"] for r in results]
257
+ latencies = [r["latency_sec"] for r in results]
258
+
259
+ stats = {
260
+ "last_value_mean": float(np.mean(last_values)),
261
+ "last_value_std": float(np.std(last_values)),
262
+ "last_value_min": float(np.min(last_values)),
263
+ "last_value_max": float(np.max(last_values)),
264
+ "last_value_median": float(np.median(last_values)),
265
+ "last_value_q25": float(np.percentile(last_values, 25)),
266
+ "last_value_q75": float(np.percentile(last_values, 75)),
267
+ "mean_latency": float(np.mean(latencies)),
268
+ "total_evaluated": len(results),
269
+ }
270
+
271
+ # Count how many demos have last_value >= various thresholds
272
+ for threshold in [80, 85, 90, 95, 100]:
273
+ count = sum(1 for v in last_values if v >= threshold)
274
+ stats[f"count_above_{threshold}"] = count
275
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
276
+
277
+ return stats
278
+
279
+
280
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
281
+ """Create visualization plots for value distribution."""
282
+ results = evaluation_results["results"]
283
+ if not results:
284
+ print("No results to plot")
285
+ return
286
+
287
+ task_name = evaluation_results["task_name"]
288
+ last_values = [r["last_value"] for r in results]
289
+
290
+ # Create figure with multiple subplots
291
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
292
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
293
+
294
+ # 1. Histogram of last values
295
+ ax1 = axes[0, 0]
296
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
297
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
298
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
299
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
300
+ ax1.set_ylabel('Frequency', fontsize=12)
301
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
302
+ ax1.legend()
303
+ ax1.grid(True, alpha=0.3)
304
+
305
+ # 2. Box plot of last values
306
+ ax2 = axes[0, 1]
307
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
308
+ for patch in box_data['boxes']:
309
+ patch.set_facecolor('lightblue')
310
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
311
+ ax2.set_ylabel('Value', fontsize=12)
312
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
313
+ ax2.legend()
314
+ ax2.grid(True, alpha=0.3, axis='y')
315
+
316
+ # 3. Value progression across demos
317
+ ax3 = axes[1, 0]
318
+ demo_indices = range(len(results))
319
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
320
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
321
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
322
+ ax3.set_xlabel('Demo Index', fontsize=12)
323
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
324
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
325
+ ax3.legend()
326
+ ax3.grid(True, alpha=0.3)
327
+
328
+ # 4. Cumulative distribution
329
+ ax4 = axes[1, 1]
330
+ sorted_values = np.sort(last_values)
331
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
332
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
333
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
334
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
335
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
336
+ ax4.set_title('Cumulative Distribution', fontsize=14)
337
+ ax4.legend()
338
+ ax4.grid(True, alpha=0.3)
339
+
340
+ plt.tight_layout()
341
+
342
+ # Save the plot
343
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
344
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
345
+ print(f"\nPlot saved to: {plot_path}")
346
+
347
+ # Also save a PDF version
348
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
349
+ plt.savefig(pdf_path, bbox_inches='tight')
350
+ print(f"PDF saved to: {pdf_path}")
351
+
352
+ plt.close()
353
+
354
+
355
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
356
+ """Save evaluation results and statistics to JSON files."""
357
+ task_name = evaluation_results["task_name"]
358
+
359
+ # Save detailed results
360
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
361
+ with results_path.open("w", encoding="utf-8") as f:
362
+ json.dump(evaluation_results, f, indent=2)
363
+ print(f"\nDetailed results saved to: {results_path}")
364
+
365
+ # Save summary statistics
366
+ stats_path = output_dir / f"{task_name}_statistics.json"
367
+ with stats_path.open("w", encoding="utf-8") as f:
368
+ json.dump(statistics, f, indent=2)
369
+ print(f"Statistics saved to: {stats_path}")
370
+
371
+
372
+ # ---------------------------------------------------------------------------
373
+ # CLI
374
+ # ---------------------------------------------------------------------------
375
+
376
+
377
+ def parse_args() -> argparse.Namespace:
378
+ parser = argparse.ArgumentParser(
379
+ description="Evaluate value estimation for test demonstrations"
380
+ )
381
+ parser.add_argument(
382
+ "--manifest-path",
383
+ type=Path,
384
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
385
+ help="Path to the test manifest JSON file",
386
+ )
387
+ parser.add_argument(
388
+ "--output-dir",
389
+ type=Path,
390
+ default="evaluation_results",
391
+ help="Directory to save evaluation results and plots",
392
+ )
393
+ parser.add_argument(
394
+ "--base-url",
395
+ default="http://localhost:8111",
396
+ help="VLAC service base URL (default: http://localhost:8111)",
397
+ )
398
+ parser.add_argument(
399
+ "--timeout",
400
+ type=float,
401
+ default=30.0,
402
+ help="HTTP request timeout in seconds (default: 30.0)",
403
+ )
404
+ parser.add_argument(
405
+ "--use-reference",
406
+ action="store_true",
407
+ help="Use reference trajectory (if available)",
408
+ )
409
+ return parser.parse_args()
410
+
411
+
412
+ def main() -> int:
413
+ args = parse_args()
414
+
415
+ # Read manifest
416
+ try:
417
+ manifest_data = read_manifest(args.manifest_path)
418
+ except FileNotFoundError as exc:
419
+ print(f"Error: {exc}")
420
+ return 1
421
+
422
+ # Create output directory
423
+ output_dir = args.output_dir.expanduser()
424
+ output_dir.mkdir(parents=True, exist_ok=True)
425
+
426
+ # Run evaluation
427
+ print("=" * 80)
428
+ print("VLAC Value Estimation Evaluation")
429
+ print("=" * 80)
430
+
431
+ evaluation_results = evaluate_demos(
432
+ manifest_data=manifest_data,
433
+ base_url=args.base_url,
434
+ timeout=args.timeout,
435
+ use_reference=args.use_reference,
436
+ )
437
+
438
+ # Compute statistics
439
+ statistics = compute_statistics(evaluation_results)
440
+
441
+ # Print summary
442
+ print("\n" + "=" * 80)
443
+ print("EVALUATION SUMMARY")
444
+ print("=" * 80)
445
+ print(f"Task: {evaluation_results['task_name']}")
446
+ print(f"Total demos: {evaluation_results['total_demos']}")
447
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
448
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
449
+
450
+ if statistics:
451
+ print("\n" + "-" * 80)
452
+ print("SUCCESS FRAME VALUE STATISTICS")
453
+ print("-" * 80)
454
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
455
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
456
+ print(f"Median: {statistics['last_value_median']:.2f}")
457
+ print(f"Min: {statistics['last_value_min']:.2f}")
458
+ print(f"Max: {statistics['last_value_max']:.2f}")
459
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
460
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
461
+
462
+ print("\n" + "-" * 80)
463
+ print("THRESHOLD ANALYSIS")
464
+ print("-" * 80)
465
+ for threshold in [80, 85, 90, 95, 100]:
466
+ count = statistics[f"count_above_{threshold}"]
467
+ percent = statistics[f"percent_above_{threshold}"]
468
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
469
+
470
+ print("\n" + "-" * 80)
471
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
472
+ print("-" * 80)
473
+
474
+ # Save results
475
+ save_results(evaluation_results, statistics, output_dir)
476
+
477
+ # Create plots
478
+ if evaluation_results["results"]:
479
+ plot_value_distribution(evaluation_results, output_dir)
480
+ else:
481
+ print("\nNo successful evaluations to plot.")
482
+
483
+ print("\n" + "=" * 80)
484
+ print("EVALUATION COMPLETE")
485
+ print("=" * 80)
486
+
487
+ return 0
488
+
489
+
490
+ if __name__ == "__main__":
491
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152548.py ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
+ # LIBERO-10 task list
52
+ LIBERO_10_TASKS = [
53
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
54
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
55
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
56
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
57
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
58
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
59
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
60
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
61
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
62
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
63
+ ]
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Helpers
67
+ # ---------------------------------------------------------------------------
68
+
69
+ def sample_fixed_interval_frames(image_list, num_frames):
70
+ # sample num_frames frames from image_list
71
+ # sample with equal interval while also ensuring the first and the last frames are included
72
+ if len(image_list) == 0:
73
+ raise ValueError("image_list is empty")
74
+ elif len(image_list) == 1:
75
+ return [image_list[0]] * num_frames
76
+ elif num_frames == 2:
77
+ return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
78
+ elif num_frames == 3:
79
+ return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
80
+ else:
81
+ total_frames = len(image_list)
82
+ indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
83
+ sampled_frames = [image_list[i] for i in indices]
84
+ return sampled_frames
85
+
86
+
87
+ num_frames_for_reference = 8
88
+ ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
89
+ libero_10_task_list = [
90
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
91
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
92
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
93
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
94
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
95
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
96
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
97
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
98
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
99
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
100
+ ]
101
+ reference_frames_dict = {}
102
+ for task_name in libero_10_task_list:
103
+ ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
104
+ ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
105
+ ref_frm_file_list.sort()
106
+ reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
107
+ reference_frames_dict[task_name] = reference_frames_temp
108
+
109
+
110
+ def read_manifest(manifest_path: Path) -> Dict:
111
+ """Read the test demo manifest JSON file."""
112
+ if not manifest_path.is_file():
113
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
114
+
115
+ with manifest_path.open("r", encoding="utf-8") as f:
116
+ manifest_data = json.load(f)
117
+
118
+ # Convert relative paths to absolute paths
119
+ manifest_dir = manifest_path.parent
120
+ for demo in manifest_data.get("demos", []):
121
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
122
+
123
+ return manifest_data
124
+
125
+
126
+ def image_to_base64(path: Path) -> str:
127
+ """Convert an image file to base64 encoded JPEG."""
128
+ with Image.open(path) as img:
129
+ img = img.convert("RGB")
130
+ buffer = BytesIO()
131
+ img.save(buffer, format="JPEG", quality=95)
132
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
133
+
134
+
135
+ def encode_images(paths: List[str]) -> List[str]:
136
+ """Encode a list of image paths to base64."""
137
+ return [image_to_base64(Path(p)) for p in paths]
138
+
139
+
140
+ def call_trajectory_critic(
141
+ session: requests.Session,
142
+ base_url: str,
143
+ task: str,
144
+ frames_b64: List[str],
145
+ reference_b64: Optional[List[str]],
146
+ timeout: float,
147
+ ) -> Dict:
148
+ """Call the VLAC trajectory-critic endpoint."""
149
+ payload = {
150
+ "task": task,
151
+ "frames": frames_b64,
152
+ "reference": reference_b64,
153
+ "ref_num": len(reference_b64 or []),
154
+ "skip": 1,
155
+ "batch_size": min(len(frames_b64), 8),
156
+ "think": False,
157
+ "return_video": False,
158
+ }
159
+ start = time.time()
160
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
161
+ resp.raise_for_status()
162
+ result = resp.json()
163
+ result["latency_sec"] = time.time() - start
164
+ return result
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Evaluation
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
+ def evaluate_demos(
173
+ manifest_data: Dict,
174
+ base_url: str,
175
+ timeout: float,
176
+ use_reference: bool = False,
177
+ ) -> Dict[str, any]:
178
+ """Evaluate all demos and collect value statistics."""
179
+ session = requests.Session()
180
+ task_name = manifest_data.get("task_name", "")
181
+ demos = manifest_data.get("demos", [])
182
+
183
+ results = []
184
+ failed_demos = []
185
+
186
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
187
+ print(f"Task: {task_name}")
188
+ print(f"Use reference: {use_reference}\n")
189
+
190
+ for demo in tqdm(demos, desc="Processing demos"):
191
+ demo_name = demo["demo_name"]
192
+ frame_paths = demo["frame_paths"]
193
+
194
+ # try:
195
+ # Encode frames
196
+ frames_b64 = encode_images(frame_paths)
197
+
198
+ # For now, no reference trajectory (can be added later)
199
+ print(f"Using reference frames for task {task_name}")
200
+ reference_b64 = encode_images(reference_frames_dict[task_name])
201
+
202
+ # Call VLAC service
203
+ result = call_trajectory_critic(
204
+ session=session,
205
+ base_url=base_url,
206
+ task=task_name,
207
+ frames_b64=frames_b64,
208
+ reference_b64=reference_b64,
209
+ timeout=timeout,
210
+ )
211
+
212
+ # Extract values
213
+ value_list = result.get("value_list", [])
214
+ if not value_list:
215
+ print(f"\n[warn] No values returned for demo {demo_name}")
216
+ failed_demos.append(demo_name)
217
+ continue
218
+
219
+ # Record results
220
+ demo_result = {
221
+ "demo_name": demo_name,
222
+ "total_frames": demo["total_frames"],
223
+ "success_index": demo["success_index"],
224
+ "num_sampled_frames": len(frame_paths),
225
+ "value_list": value_list,
226
+ "last_value": value_list[-1], # The critical value for success frame
227
+ "mean_value": float(np.mean(value_list)),
228
+ "std_value": float(np.std(value_list)),
229
+ "latency_sec": result.get("latency_sec", 0.0),
230
+ }
231
+ results.append(demo_result)
232
+
233
+ # except requests.RequestException as exc:
234
+ # print(f"\n[error] Request failed for demo {demo_name}: {exc}")
235
+ # failed_demos.append(demo_name)
236
+ # except Exception as exc:
237
+ # print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
238
+ # failed_demos.append(demo_name)
239
+
240
+ return {
241
+ "task_name": task_name,
242
+ "total_demos": len(demos),
243
+ "successful_evals": len(results),
244
+ "failed_demos": failed_demos,
245
+ "results": results,
246
+ }
247
+
248
+
249
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
250
+ """Compute summary statistics from evaluation results."""
251
+ results = evaluation_results["results"]
252
+ if not results:
253
+ return {}
254
+
255
+ last_values = [r["last_value"] for r in results]
256
+ mean_values = [r["mean_value"] for r in results]
257
+ latencies = [r["latency_sec"] for r in results]
258
+
259
+ stats = {
260
+ "last_value_mean": float(np.mean(last_values)),
261
+ "last_value_std": float(np.std(last_values)),
262
+ "last_value_min": float(np.min(last_values)),
263
+ "last_value_max": float(np.max(last_values)),
264
+ "last_value_median": float(np.median(last_values)),
265
+ "last_value_q25": float(np.percentile(last_values, 25)),
266
+ "last_value_q75": float(np.percentile(last_values, 75)),
267
+ "mean_latency": float(np.mean(latencies)),
268
+ "total_evaluated": len(results),
269
+ }
270
+
271
+ # Count how many demos have last_value >= various thresholds
272
+ for threshold in [80, 85, 90, 95, 100]:
273
+ count = sum(1 for v in last_values if v >= threshold)
274
+ stats[f"count_above_{threshold}"] = count
275
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
276
+
277
+ return stats
278
+
279
+
280
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
281
+ """Create visualization plots for value distribution."""
282
+ results = evaluation_results["results"]
283
+ if not results:
284
+ print("No results to plot")
285
+ return
286
+
287
+ task_name = evaluation_results["task_name"]
288
+ last_values = [r["last_value"] for r in results]
289
+
290
+ # Create figure with multiple subplots
291
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
292
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
293
+
294
+ # 1. Histogram of last values
295
+ ax1 = axes[0, 0]
296
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
297
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
298
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
299
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
300
+ ax1.set_ylabel('Frequency', fontsize=12)
301
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
302
+ ax1.legend()
303
+ ax1.grid(True, alpha=0.3)
304
+
305
+ # 2. Box plot of last values
306
+ ax2 = axes[0, 1]
307
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
308
+ for patch in box_data['boxes']:
309
+ patch.set_facecolor('lightblue')
310
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
311
+ ax2.set_ylabel('Value', fontsize=12)
312
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
313
+ ax2.legend()
314
+ ax2.grid(True, alpha=0.3, axis='y')
315
+
316
+ # 3. Value progression across demos
317
+ ax3 = axes[1, 0]
318
+ demo_indices = range(len(results))
319
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
320
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
321
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
322
+ ax3.set_xlabel('Demo Index', fontsize=12)
323
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
324
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
325
+ ax3.legend()
326
+ ax3.grid(True, alpha=0.3)
327
+
328
+ # 4. Cumulative distribution
329
+ ax4 = axes[1, 1]
330
+ sorted_values = np.sort(last_values)
331
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
332
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
333
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
334
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
335
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
336
+ ax4.set_title('Cumulative Distribution', fontsize=14)
337
+ ax4.legend()
338
+ ax4.grid(True, alpha=0.3)
339
+
340
+ plt.tight_layout()
341
+
342
+ # Save the plot
343
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
344
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
345
+ print(f"\nPlot saved to: {plot_path}")
346
+
347
+ # Also save a PDF version
348
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
349
+ plt.savefig(pdf_path, bbox_inches='tight')
350
+ print(f"PDF saved to: {pdf_path}")
351
+
352
+ plt.close()
353
+
354
+
355
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
356
+ """Save evaluation results and statistics to JSON files."""
357
+ task_name = evaluation_results["task_name"]
358
+
359
+ # Save detailed results
360
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
361
+ with results_path.open("w", encoding="utf-8") as f:
362
+ json.dump(evaluation_results, f, indent=2)
363
+ print(f"\nDetailed results saved to: {results_path}")
364
+
365
+ # Save summary statistics
366
+ stats_path = output_dir / f"{task_name}_statistics.json"
367
+ with stats_path.open("w", encoding="utf-8") as f:
368
+ json.dump(statistics, f, indent=2)
369
+ print(f"Statistics saved to: {stats_path}")
370
+
371
+
372
+ # ---------------------------------------------------------------------------
373
+ # CLI
374
+ # ---------------------------------------------------------------------------
375
+
376
+
377
+ def parse_args() -> argparse.Namespace:
378
+ parser = argparse.ArgumentParser(
379
+ description="Evaluate value estimation for test demonstrations"
380
+ )
381
+
382
+ # Mode selection
383
+ parser.add_argument(
384
+ "--process-all-tasks",
385
+ action="store_true",
386
+ help="Process all LIBERO-10 tasks"
387
+ )
388
+
389
+ # Arguments for processing all tasks
390
+ parser.add_argument(
391
+ "--manifests-root",
392
+ type=Path,
393
+ help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)"
394
+ )
395
+
396
+ # Arguments for processing a single task
397
+ parser.add_argument(
398
+ "--manifest-path",
399
+ type=Path,
400
+ help="Path to the test manifest JSON file (for single task mode)",
401
+ )
402
+
403
+ # Common arguments
404
+ parser.add_argument(
405
+ "--output-dir",
406
+ type=Path,
407
+ default="evaluation_results",
408
+ help="Directory to save evaluation results and plots",
409
+ )
410
+ parser.add_argument(
411
+ "--base-url",
412
+ default="http://localhost:8111",
413
+ help="VLAC service base URL (default: http://localhost:8111)",
414
+ )
415
+ parser.add_argument(
416
+ "--timeout",
417
+ type=float,
418
+ default=30.0,
419
+ help="HTTP request timeout in seconds (default: 30.0)",
420
+ )
421
+ parser.add_argument(
422
+ "--use-reference",
423
+ action="store_true",
424
+ help="Use reference trajectory (if available)",
425
+ )
426
+
427
+ args = parser.parse_args()
428
+
429
+ # Validate arguments
430
+ if args.process_all_tasks:
431
+ if not args.manifests_root:
432
+ parser.error("--manifests-root is required when using --process-all-tasks")
433
+ else:
434
+ if not args.manifest_path:
435
+ parser.error("--manifest-path is required for single task mode")
436
+
437
+ return args
438
+
439
+
440
+ def main() -> int:
441
+ args = parse_args()
442
+
443
+ # Read manifest
444
+ try:
445
+ manifest_data = read_manifest(args.manifest_path)
446
+ except FileNotFoundError as exc:
447
+ print(f"Error: {exc}")
448
+ return 1
449
+
450
+ # Create output directory
451
+ output_dir = args.output_dir.expanduser()
452
+ output_dir.mkdir(parents=True, exist_ok=True)
453
+
454
+ # Run evaluation
455
+ print("=" * 80)
456
+ print("VLAC Value Estimation Evaluation")
457
+ print("=" * 80)
458
+
459
+ evaluation_results = evaluate_demos(
460
+ manifest_data=manifest_data,
461
+ base_url=args.base_url,
462
+ timeout=args.timeout,
463
+ use_reference=args.use_reference,
464
+ )
465
+
466
+ # Compute statistics
467
+ statistics = compute_statistics(evaluation_results)
468
+
469
+ # Print summary
470
+ print("\n" + "=" * 80)
471
+ print("EVALUATION SUMMARY")
472
+ print("=" * 80)
473
+ print(f"Task: {evaluation_results['task_name']}")
474
+ print(f"Total demos: {evaluation_results['total_demos']}")
475
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
476
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
477
+
478
+ if statistics:
479
+ print("\n" + "-" * 80)
480
+ print("SUCCESS FRAME VALUE STATISTICS")
481
+ print("-" * 80)
482
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
483
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
484
+ print(f"Median: {statistics['last_value_median']:.2f}")
485
+ print(f"Min: {statistics['last_value_min']:.2f}")
486
+ print(f"Max: {statistics['last_value_max']:.2f}")
487
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
488
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
489
+
490
+ print("\n" + "-" * 80)
491
+ print("THRESHOLD ANALYSIS")
492
+ print("-" * 80)
493
+ for threshold in [80, 85, 90, 95, 100]:
494
+ count = statistics[f"count_above_{threshold}"]
495
+ percent = statistics[f"percent_above_{threshold}"]
496
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
497
+
498
+ print("\n" + "-" * 80)
499
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
500
+ print("-" * 80)
501
+
502
+ # Save results
503
+ save_results(evaluation_results, statistics, output_dir)
504
+
505
+ # Create plots
506
+ if evaluation_results["results"]:
507
+ plot_value_distribution(evaluation_results, output_dir)
508
+ else:
509
+ print("\nNo successful evaluations to plot.")
510
+
511
+ print("\n" + "=" * 80)
512
+ print("EVALUATION COMPLETE")
513
+ print("=" * 80)
514
+
515
+ return 0
516
+
517
+
518
+ if __name__ == "__main__":
519
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152620.py ADDED
@@ -0,0 +1,683 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
+ # LIBERO-10 task list
52
# LIBERO-10 task list
# Canonical task identifiers for the LIBERO-10 benchmark suite. Used to locate
# per-task manifest directories and reference-frame folders on disk; the names
# must match the directory naming produced by the data-preparation scripts.
LIBERO_10_TASKS = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
]
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Helpers
67
+ # ---------------------------------------------------------------------------
68
+
69
def sample_fixed_interval_frames(image_list, num_frames):
    """Pick ``num_frames`` evenly spaced items from ``image_list``.

    The first and last items are always included. A single-item input is
    repeated to the requested length; the num_frames == 2 / 3 cases use the
    historical fixed patterns (first+last, and first+second+last).

    Raises:
        ValueError: if ``image_list`` is empty.
    """
    total = len(image_list)
    if total == 0:
        raise ValueError("image_list is empty")
    if total == 1:
        # Only one frame available: repeat it to the requested length.
        return [image_list[0]] * num_frames
    if num_frames == 2:
        half = num_frames // 2
        return [image_list[0]] * half + [image_list[-1]] * half
    if num_frames == 3:
        # Historical pattern: first, second, last (not the geometric middle).
        return [image_list[0]] + [image_list[1]] * (num_frames - 2) + [image_list[-1]]
    # General case: evenly spaced integer indices covering both endpoints.
    picks = np.linspace(start=0, stop=total - 1, num=num_frames, dtype=int)
    return [image_list[idx] for idx in picks]
85
+
86
+
87
# --- Reference trajectory frames (loaded eagerly at import time) -------------
# Number of frames sampled from each expert demo to serve as the reference.
num_frames_for_reference = 8
# NOTE(review): hard-coded, machine-specific path — import fails with a
# ValueError (empty frame list) on hosts where it does not exist; consider
# making it configurable via CLI or environment variable.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
# Consistency fix: reuse the module-level LIBERO_10_TASKS constant instead of
# maintaining a second, duplicated copy of the exact same task list.
libero_10_task_list = LIBERO_10_TASKS
# Maps task name -> list of `num_frames_for_reference` reference frame paths.
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name + "_demo")
    # sorted() relies on lexicographic frame naming to preserve temporal order.
    ref_frm_file_list = sorted(glob.glob(os.path.join(ref_frm_task_dir, "*.png")))
    reference_frames_dict[task_name] = sample_fixed_interval_frames(
        ref_frm_file_list, num_frames_for_reference
    )
108
+
109
+
110
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest JSON file and absolutize its frame paths.

    Frame paths inside the manifest are stored relative to the manifest's own
    directory; they are rewritten in place to absolute path strings.

    Raises:
        FileNotFoundError: if ``manifest_path`` does not point to a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Anchor every relative frame path at the manifest's directory.
    base_dir = manifest_path.parent
    for entry in manifest.get("demos", []):
        entry["frame_paths"] = [str(base_dir / rel) for rel in entry["frame_paths"]]

    return manifest
124
+
125
+
126
def image_to_base64(path: Path) -> str:
    """Return the image at ``path`` re-encoded as a base64 JPEG string.

    The image is converted to RGB first so sources with alpha channels or
    palettes can be written as JPEG.
    """
    buffer = BytesIO()
    with Image.open(path) as img:
        img.convert("RGB").save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
133
+
134
+
135
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in ``paths``, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
138
+
139
+
140
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Args:
        session: Reused HTTP session (connection pooling across demos).
        base_url: Service root, with or without a trailing slash.
        task: Natural-language task description.
        frames_b64: Base64-encoded trajectory frames.
        reference_b64: Optional base64-encoded reference frames, or None.
        timeout: Per-request timeout in seconds.

    Returns:
        The parsed JSON response, augmented with a ``latency_sec`` field.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    reference = reference_b64 or []
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference),
        "skip": 1,
        # Never request more than 8 frames per service-side batch.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started
    return payload
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Evaluation
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, object]:
    """Run the VLAC trajectory critic over every demo in ``manifest_data``.

    Args:
        manifest_data: Parsed manifest with "task_name" and "demos" entries
            (frame paths already absolutized by read_manifest).
        base_url: VLAC service base URL.
        timeout: Per-request timeout in seconds.
        use_reference: When True, attach the task's expert reference frames.
            Bug fix: this flag was previously accepted but ignored — the
            reference trajectory was always sent unconditionally.

    Returns:
        Dict with task name, demo counts, failed demo names, and a per-demo
        list of value statistics ("last_value" is the success-frame value).
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            frames_b64 = encode_images(frame_paths)

            # Honor the use_reference flag, and guard against tasks missing
            # from reference_frames_dict (previously a KeyError).
            reference_b64 = None
            if use_reference:
                if task_name in reference_frames_dict:
                    print(f"Using reference frames for task {task_name}")
                    reference_b64 = encode_images(reference_frames_dict[task_name])
                else:
                    print(f"\n[warn] No reference frames for task {task_name}")

            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            results.append({
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # critical: success-frame value (ideally 100)
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            })

        # Bug fix: this exception handling was commented out, so a single
        # network hiccup aborted the entire evaluation sweep and failed
        # demos were never recorded for request errors.
        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:  # keep the sweep going on unexpected errors
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
247
+
248
+
249
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize per-demo evaluation results into aggregate statistics.

    Returns an empty dict when no demos were successfully evaluated.
    """
    per_demo = evaluation_results["results"]
    if not per_demo:
        return {}

    last_values = np.asarray([entry["last_value"] for entry in per_demo], dtype=float)
    latencies = [entry["latency_sec"] for entry in per_demo]

    summary = {
        "last_value_mean": float(last_values.mean()),
        "last_value_std": float(last_values.std()),
        "last_value_min": float(last_values.min()),
        "last_value_max": float(last_values.max()),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(per_demo),
    }

    # Fraction of demos whose success-frame value clears each threshold.
    for threshold in (80, 85, 90, 95, 100):
        hits = int((last_values >= threshold).sum())
        summary[f"count_above_{threshold}"] = hits
        summary[f"percent_above_{threshold}"] = float(hits / len(per_demo) * 100)

    return summary
278
+
279
+
280
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create a 2x2 figure visualizing the success-frame value distribution.

    Panels: histogram, box plot, per-demo scatter, and cumulative
    distribution of the last-frame ("success") values. The figure is saved
    to ``output_dir`` as both PNG (300 dpi) and PDF, named after the task.
    No-ops (with a message) when there are no successful results.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    # NOTE(review): the `labels=` kwarg is deprecated in newer matplotlib
    # (renamed to `tick_labels`) — confirm against the pinned version.
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Release the figure to avoid accumulating pyplot state across tasks.
    plt.close()
353
+
354
+
355
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write per-demo results and summary statistics as JSON in ``output_dir``.

    Produces ``<task>_evaluation_results.json`` (detailed per-demo data) and
    ``<task>_statistics.json`` (aggregate summary).
    """
    task_name = evaluation_results["task_name"]

    # Detailed per-demo results.
    detail_path = output_dir / f"{task_name}_evaluation_results.json"
    detail_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {detail_path}")

    # Aggregate summary statistics.
    summary_path = output_dir / f"{task_name}_statistics.json"
    summary_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {summary_path}")
370
+
371
+
372
def find_manifest_file(manifests_root: Path, task_name: str) -> Optional[Path]:
    """Return the first existing manifest path for ``task_name``, or None.

    Probes the naming patterns produced by the data-preparation scripts, in
    order of preference.
    """
    candidates = (
        manifests_root / task_name / f"{task_name}_test_manifest.json",
        manifests_root / task_name / "test_manifest.json",
        manifests_root / f"{task_name}_test_manifest.json",
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)
389
+
390
+
391
def evaluate_single_task(
    manifest_path: Path,
    output_dir: Path,
    base_url: str,
    timeout: float,
    use_reference: bool,
) -> Optional[Dict]:
    """Evaluate one task's manifest end-to-end and persist its outputs.

    Reads the manifest, runs the VLAC evaluation over all demos, computes
    summary statistics, prints a short report, and writes JSON results and
    plots into a per-task subdirectory of ``output_dir``.

    Returns:
        Dict with "task_name", "evaluation_results" and "statistics", or
        None if the manifest could not be read.
    """
    try:
        manifest_data = read_manifest(manifest_path)
    except FileNotFoundError as exc:
        print(f"Error reading manifest: {exc}")
        return None

    task_name = manifest_data.get("task_name", "unknown")

    print(f"\n{'='*80}")
    print(f"Evaluating task: {task_name}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*80}")

    # Run evaluation
    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=base_url,
        timeout=timeout,
        use_reference=use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "-" * 80)
    print("TASK EVALUATION SUMMARY")
    print("-" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is {} when no demo evaluated successfully.
    if statistics:
        print(f"\nMean success value: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Values >= 90: {statistics.get('count_above_90', 0)} ({statistics.get('percent_above_90', 0):.1f}%)")

    # Save results into a per-task subdirectory.
    task_output_dir = output_dir / task_name
    task_output_dir.mkdir(parents=True, exist_ok=True)
    save_results(evaluation_results, statistics, task_output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, task_output_dir)

    return {
        "task_name": task_name,
        "evaluation_results": evaluation_results,
        "statistics": statistics,
    }
456
+
457
+
458
def plot_aggregate_statistics(all_task_results: List[Dict], output_dir: Path) -> None:
    """Create a 2x2 figure of cross-task summary statistics.

    Panels: per-task mean values (bar), distribution of task-level means
    (histogram), per-task medians (bar), and per-task standard deviations
    (bar). X-axis tick labels are task indices (1..N), not task names.
    Saved to ``output_dir`` as aggregate_statistics.png / .pdf. No-ops when
    ``all_task_results`` is empty.

    Each entry of ``all_task_results`` must carry "task_name" and a
    non-empty "statistics" dict (as produced by evaluate_single_task).
    """
    if not all_task_results:
        return

    # Extract data
    task_names = [r["task_name"] for r in all_task_results]
    mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
    median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
    std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle("VLAC Value Estimation - Aggregate Statistics Across All Tasks", fontsize=16, fontweight='bold')

    # 1. Mean values per task
    ax1 = axes[0, 0]
    bars = ax1.bar(range(len(task_names)), mean_values, color='steelblue', alpha=0.7)
    ax1.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axhline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Overall Mean ({np.mean(mean_values):.1f})')
    ax1.set_xlabel('Task', fontsize=12)
    ax1.set_ylabel('Mean Success Value', fontsize=12)
    ax1.set_title('Mean Success Frame Values by Task', fontsize=14)
    ax1.set_xticks(range(len(task_names)))
    ax1.set_xticklabels(range(1, len(task_names) + 1))
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # 2. Distribution of mean values
    ax2 = axes[0, 1]
    ax2.hist(mean_values, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
    ax2.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.axvline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(mean_values):.1f})')
    ax2.set_xlabel('Mean Success Value', fontsize=12)
    ax2.set_ylabel('Frequency (Tasks)', fontsize=12)
    ax2.set_title('Distribution of Task-Level Mean Values', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Median values per task
    ax3 = axes[1, 0]
    bars = ax3.bar(range(len(task_names)), median_values, color='coral', alpha=0.7)
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.median(median_values), color='green', linestyle='-', linewidth=2, label=f'Overall Median ({np.median(median_values):.1f})')
    ax3.set_xlabel('Task', fontsize=12)
    ax3.set_ylabel('Median Success Value', fontsize=12)
    ax3.set_title('Median Success Frame Values by Task', fontsize=14)
    ax3.set_xticks(range(len(task_names)))
    ax3.set_xticklabels(range(1, len(task_names) + 1))
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')

    # 4. Std deviation per task
    ax4 = axes[1, 1]
    bars = ax4.bar(range(len(task_names)), std_values, color='orange', alpha=0.7)
    ax4.axhline(np.mean(std_values), color='green', linestyle='-', linewidth=2, label=f'Mean Std ({np.mean(std_values):.1f})')
    ax4.set_xlabel('Task', fontsize=12)
    ax4.set_ylabel('Standard Deviation', fontsize=12)
    ax4.set_title('Variability in Success Values by Task', fontsize=14)
    ax4.set_xticks(range(len(task_names)))
    ax4.set_xticklabels(range(1, len(task_names) + 1))
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    # Save plots
    plot_path = output_dir / "aggregate_statistics.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nAggregate plot saved to: {plot_path}")

    pdf_path = output_dir / "aggregate_statistics.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"Aggregate PDF saved to: {pdf_path}")

    # Release the figure to avoid accumulating pyplot state.
    plt.close()
534
+
535
+
536
+ # ---------------------------------------------------------------------------
537
+ # CLI
538
+ # ---------------------------------------------------------------------------
539
+
540
+
541
def parse_args() -> argparse.Namespace:
    """Build the CLI, parse sys.argv, and validate mode-specific arguments.

    Two modes: ``--process-all-tasks`` (requires ``--manifests-root``) or
    single-task mode (requires ``--manifest-path``).
    """
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )

    # Mode selection: sweep every LIBERO-10 task, or evaluate one manifest.
    parser.add_argument("--process-all-tasks", action="store_true",
                        help="Process all LIBERO-10 tasks")
    parser.add_argument("--manifests-root", type=Path,
                        help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)")
    parser.add_argument("--manifest-path", type=Path,
                        help="Path to the test manifest JSON file (for single task mode)")

    # Options shared by both modes.
    parser.add_argument("--output-dir", type=Path, default="evaluation_results",
                        help="Directory to save evaluation results and plots")
    parser.add_argument("--base-url", default="http://localhost:8111",
                        help="VLAC service base URL (default: http://localhost:8111)")
    parser.add_argument("--timeout", type=float, default=30.0,
                        help="HTTP request timeout in seconds (default: 30.0)")
    parser.add_argument("--use-reference", action="store_true",
                        help="Use reference trajectory (if available)")

    args = parser.parse_args()

    # Cross-argument validation: each mode has exactly one required path.
    if args.process_all_tasks and not args.manifests_root:
        parser.error("--manifests-root is required when using --process-all-tasks")
    if not args.process_all_tasks and not args.manifest_path:
        parser.error("--manifest-path is required for single task mode")

    return args
602
+
603
+
604
def main() -> int:
    """CLI entry point: run the all-tasks sweep or a single-task evaluation.

    Bug fix: ``--process-all-tasks`` was parsed and validated by parse_args
    but never handled here — main() unconditionally called
    read_manifest(args.manifest_path) and crashed with manifest_path=None,
    leaving find_manifest_file / evaluate_single_task /
    plot_aggregate_statistics as dead code.

    Returns:
        Process exit code (0 on success, 1 when the manifest cannot be read).
    """
    args = parse_args()

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    if args.process_all_tasks:
        # ---- All-tasks mode ----
        all_task_results = []
        for task_name in LIBERO_10_TASKS:
            manifest_path = find_manifest_file(args.manifests_root, task_name)
            if manifest_path is None:
                print(f"[warn] No manifest found for task: {task_name} (skipping)")
                continue
            task_result = evaluate_single_task(
                manifest_path=manifest_path,
                output_dir=output_dir,
                base_url=args.base_url,
                timeout=args.timeout,
                use_reference=args.use_reference,
            )
            # Only tasks with non-empty statistics contribute to aggregates.
            if task_result is not None and task_result["statistics"]:
                all_task_results.append(task_result)

        if all_task_results:
            plot_aggregate_statistics(all_task_results, output_dir)
        else:
            print("\nNo successful task evaluations to aggregate.")

        print("\n" + "=" * 80)
        print("EVALUATION COMPLETE")
        print("=" * 80)
        return 0

    # ---- Single-task mode (original behavior) ----
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0


if __name__ == "__main__":
    sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152700.py ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
+ # LIBERO-10 task list
52
+ LIBERO_10_TASKS = [
53
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
54
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
55
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
56
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
57
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
58
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
59
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
60
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
61
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
62
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
63
+ ]
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Helpers
67
+ # ---------------------------------------------------------------------------
68
+
69
def sample_fixed_interval_frames(image_list, num_frames):
    """Sample ``num_frames`` frames from ``image_list`` at equal intervals.

    The first and last frames are always included.  A single-frame list is
    repeated ``num_frames`` times.

    Args:
        image_list: Sequence of frames (any objects, e.g. file paths).
        num_frames: Number of frames to return (must be >= 1).

    Returns:
        List of ``num_frames`` elements drawn from ``image_list``.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        # Only one frame available: repeat it.
        return [image_list[0]] * num_frames
    # Evenly spaced indices; linspace includes both endpoints, which also
    # covers num_frames == 2 and 3.  (The previous num_frames == 3 branch
    # wrongly returned index 1 instead of the middle frame.)
    total_frames = len(image_list)
    indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
    return [image_list[i] for i in indices]
85
+
86
+
87
# Number of frames sampled from each expert demo to form the reference trajectory.
num_frames_for_reference = 8
# Root directory holding one "<task>_demo" folder of PNG frames per task.
# NOTE(review): hard-coded development path — adjust for your machine.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
# Backward-compatible alias: reuse the LIBERO_10_TASKS constant defined above
# instead of duplicating the same ten task names verbatim.
libero_10_task_list = LIBERO_10_TASKS
# task_name -> list of reference frame paths, sampled at fixed intervals.
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name + "_demo")
    ref_frm_file_list = sorted(glob.glob(os.path.join(ref_frm_task_dir, "*.png")))
    if not ref_frm_file_list:
        # Previously an empty/missing demo folder crashed the whole module at
        # import time (ValueError from sample_fixed_interval_frames).  Warn
        # and continue so the script remains importable.
        print(f"[warn] no reference frames found for task {task_name} in {ref_frm_task_dir}")
        continue
    reference_frames_dict[task_name] = sample_fixed_interval_frames(
        ref_frm_file_list, num_frames_for_reference
    )
108
+
109
+
110
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and resolve its frame paths.

    Frame paths stored in the manifest are relative to the manifest's own
    directory; they are rewritten in place to paths rooted at that directory.

    Raises:
        FileNotFoundError: If ``manifest_path`` does not point to a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    data = json.loads(manifest_path.read_text(encoding="utf-8"))

    base_dir = manifest_path.parent
    for demo in data.get("demos", []):
        resolved = []
        for rel_path in demo["frame_paths"]:
            resolved.append(str(base_dir / rel_path))
        demo["frame_paths"] = resolved

    return data
124
+
125
+
126
def image_to_base64(path: Path) -> str:
    """Read the image at *path* and return it as a base64 JPEG string.

    The image is converted to RGB and re-encoded as JPEG (quality 95)
    before encoding, regardless of its on-disk format.
    """
    with Image.open(path) as img:
        rgb = img.convert("RGB")
        buffer = BytesIO()
        rgb.save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
133
+
134
+
135
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image in *paths*, preserving order."""
    encoded = []
    for p in paths:
        encoded.append(image_to_base64(Path(p)))
    return encoded
138
+
139
+
140
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the service's JSON response augmented with a ``latency_sec``
    field measuring the round-trip time.  Raises ``requests.HTTPError``
    (via ``raise_for_status``) on a non-2xx response.
    """
    payload = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # Batch at most 8 frames per forward pass on the service side.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started = time.time()
    response = session.post(endpoint, json=payload, timeout=timeout)
    response.raise_for_status()
    result = response.json()
    result["latency_sec"] = time.time() - started
    return result
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Evaluation
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, object]:
    """Evaluate all demos in a manifest via the VLAC trajectory critic.

    Fixes over the previous version: the ``use_reference`` flag is now
    honored (reference frames were unconditionally sent before), reference
    frames are encoded once per task instead of once per demo, and the
    per-demo exception handling that had been commented out is restored so
    a single failed request no longer aborts the whole run.

    Args:
        manifest_data: Parsed manifest (see ``read_manifest``).
        base_url: VLAC service base URL.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: If True, send the task's reference trajectory
            (from ``reference_frames_dict``) along with each demo.

    Returns:
        Dict with task name, demo counts, per-demo results, and the names
        of demos that failed to evaluate.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    # Encode the reference trajectory once; it is identical for every demo.
    reference_b64 = None
    if use_reference and task_name in reference_frames_dict:
        print(f"Using reference frames for task {task_name}")
        reference_b64 = encode_images(reference_frames_dict[task_name])

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            frames_b64 = encode_images(frame_paths)

            # Call VLAC service
            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            # Extract values
            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            # Record results
            demo_result = {
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            }
            results.append(demo_result)

        except requests.RequestException as exc:
            # Keep going: one bad request should not abort the whole run.
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except (KeyError, OSError, ValueError) as exc:
            # Malformed demo entry or unreadable frame file.
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
247
+
248
+
249
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarise per-demo evaluation results into aggregate statistics.

    Produces location/spread statistics of the success-frame values plus
    threshold counts.  Returns an empty dict when there are no results.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [entry["last_value"] for entry in results]
    latencies = [entry["latency_sec"] for entry in results]

    summary = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # How many demos reach each value threshold (the ideal value is 100).
    num_values = len(last_values)
    for threshold in (80, 85, 90, 95, 100):
        hits = sum(1 for value in last_values if value >= threshold)
        summary[f"count_above_{threshold}"] = hits
        summary[f"percent_above_{threshold}"] = float(hits / num_values * 100)

    return summary
278
+
279
+
280
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create visualization plots for value distribution.

    Renders a 2x2 figure (histogram, box plot, per-demo scatter, cumulative
    distribution) of the success-frame values and writes it to
    ``<output_dir>/<task_name>_value_distribution.{png,pdf}``.
    Prints a message and returns without plotting when there are no results.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    # One value per demo: the critic's score for the final (success) frame.
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    # Red dashed line marks the ideal success value; green solid line the observed mean.
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    # Empirical CDF expressed as a percentage of demos at or below each value.
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()
353
+
354
+
355
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write per-demo results and summary statistics as JSON into *output_dir*.

    Produces ``<task_name>_evaluation_results.json`` and
    ``<task_name>_statistics.json``.
    """
    task_name = evaluation_results["task_name"]

    # Full per-demo results.
    results_path = output_dir / f"{task_name}_evaluation_results.json"
    results_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {results_path}")

    # Aggregate summary statistics.
    stats_path = output_dir / f"{task_name}_statistics.json"
    stats_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {stats_path}")
370
+
371
+
372
def find_manifest_file(manifests_root: Path, task_name: str) -> Optional[Path]:
    """Locate the test manifest for *task_name* under *manifests_root*.

    Probes the naming patterns commonly produced by the preparation
    scripts, in priority order, and returns the first existing path or
    None when no manifest is found.
    """
    candidates = (
        manifests_root / task_name / f"{task_name}_test_manifest.json",
        manifests_root / task_name / "test_manifest.json",
        manifests_root / f"{task_name}_test_manifest.json",
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)
389
+
390
+
391
def evaluate_single_task(
    manifest_path: Path,
    output_dir: Path,
    base_url: str,
    timeout: float,
    use_reference: bool,
) -> Optional[Dict]:
    """Evaluate a single task and return the statistics.

    Reads the manifest, scores every demo via the VLAC service, prints a
    summary, and writes JSON results plus distribution plots under
    ``output_dir / task_name``.

    Returns:
        Dictionary with evaluation results and statistics, or None if failed
    """
    try:
        manifest_data = read_manifest(manifest_path)
    except FileNotFoundError as exc:
        print(f"Error reading manifest: {exc}")
        return None

    task_name = manifest_data.get("task_name", "unknown")

    print(f"\n{'='*80}")
    print(f"Evaluating task: {task_name}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*80}")

    # Run evaluation
    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=base_url,
        timeout=timeout,
        use_reference=use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "-" * 80)
    print("TASK EVALUATION SUMMARY")
    print("-" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    if statistics:
        print(f"\nMean success value: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Values >= 90: {statistics.get('count_above_90', 0)} ({statistics.get('percent_above_90', 0):.1f}%)")

    # Save results
    # Each task gets its own subdirectory of output_dir.
    task_output_dir = output_dir / task_name
    task_output_dir.mkdir(parents=True, exist_ok=True)
    save_results(evaluation_results, statistics, task_output_dir)

    # Create plots
    # Only meaningful when at least one demo was evaluated successfully.
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, task_output_dir)

    return {
        "task_name": task_name,
        "evaluation_results": evaluation_results,
        "statistics": statistics,
    }
456
+
457
+
458
def plot_aggregate_statistics(all_task_results: List[Dict], output_dir: Path) -> None:
    """Create aggregate plots across all tasks.

    Renders a 2x2 figure (mean per task, distribution of means, median per
    task, std per task) and writes it to
    ``<output_dir>/aggregate_statistics.{png,pdf}``.  No-op for empty input.
    """
    if not all_task_results:
        return

    # Extract data
    task_names = [r["task_name"] for r in all_task_results]
    mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
    median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
    std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle("VLAC Value Estimation - Aggregate Statistics Across All Tasks", fontsize=16, fontweight='bold')

    # 1. Mean values per task
    ax1 = axes[0, 0]
    bars = ax1.bar(range(len(task_names)), mean_values, color='steelblue', alpha=0.7)
    ax1.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axhline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Overall Mean ({np.mean(mean_values):.1f})')
    ax1.set_xlabel('Task', fontsize=12)
    ax1.set_ylabel('Mean Success Value', fontsize=12)
    ax1.set_title('Mean Success Frame Values by Task', fontsize=14)
    ax1.set_xticks(range(len(task_names)))
    # Tasks are labelled 1..N on the axis; full names would not fit.
    ax1.set_xticklabels(range(1, len(task_names) + 1))
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # 2. Distribution of mean values
    ax2 = axes[0, 1]
    ax2.hist(mean_values, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
    ax2.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.axvline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(mean_values):.1f})')
    ax2.set_xlabel('Mean Success Value', fontsize=12)
    ax2.set_ylabel('Frequency (Tasks)', fontsize=12)
    ax2.set_title('Distribution of Task-Level Mean Values', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Median values per task
    ax3 = axes[1, 0]
    bars = ax3.bar(range(len(task_names)), median_values, color='coral', alpha=0.7)
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.median(median_values), color='green', linestyle='-', linewidth=2, label=f'Overall Median ({np.median(median_values):.1f})')
    ax3.set_xlabel('Task', fontsize=12)
    ax3.set_ylabel('Median Success Value', fontsize=12)
    ax3.set_title('Median Success Frame Values by Task', fontsize=14)
    ax3.set_xticks(range(len(task_names)))
    ax3.set_xticklabels(range(1, len(task_names) + 1))
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')

    # 4. Std deviation per task
    ax4 = axes[1, 1]
    bars = ax4.bar(range(len(task_names)), std_values, color='orange', alpha=0.7)
    ax4.axhline(np.mean(std_values), color='green', linestyle='-', linewidth=2, label=f'Mean Std ({np.mean(std_values):.1f})')
    ax4.set_xlabel('Task', fontsize=12)
    ax4.set_ylabel('Standard Deviation', fontsize=12)
    ax4.set_title('Variability in Success Values by Task', fontsize=14)
    ax4.set_xticks(range(len(task_names)))
    ax4.set_xticklabels(range(1, len(task_names) + 1))
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    # Save plots
    plot_path = output_dir / "aggregate_statistics.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nAggregate plot saved to: {plot_path}")

    pdf_path = output_dir / "aggregate_statistics.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"Aggregate PDF saved to: {pdf_path}")

    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()
534
+
535
+
536
+ # ---------------------------------------------------------------------------
537
+ # CLI
538
+ # ---------------------------------------------------------------------------
539
+
540
+
541
def parse_args() -> argparse.Namespace:
    """Parse and validate command-line arguments.

    Fix: ``--output-dir`` previously defaulted to the *string*
    ``"evaluation_results"``; argparse applies ``type=Path`` only to values
    supplied on the command line, so ``main()``'s
    ``args.output_dir.expanduser()`` crashed with AttributeError whenever
    the flag was omitted.  The default is now a ``Path``.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )

    # Mode selection
    parser.add_argument(
        "--process-all-tasks",
        action="store_true",
        help="Process all LIBERO-10 tasks"
    )

    # Arguments for processing all tasks
    parser.add_argument(
        "--manifests-root",
        type=Path,
        help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)"
    )

    # Arguments for processing a single task
    parser.add_argument(
        "--manifest-path",
        type=Path,
        help="Path to the test manifest JSON file (for single task mode)",
    )

    # Common arguments
    parser.add_argument(
        "--output-dir",
        type=Path,
        # Must be a Path: argparse does not run `type` on default values.
        default=Path("evaluation_results"),
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )

    args = parser.parse_args()

    # Validate arguments: each mode has one required path argument.
    if args.process_all_tasks:
        if not args.manifests_root:
            parser.error("--manifests-root is required when using --process-all-tasks")
    else:
        if not args.manifest_path:
            parser.error("--manifest-path is required for single task mode")

    return args
602
+
603
+
604
def main() -> int:
    """Command-line entry point.

    Runs either all LIBERO-10 tasks (``--process-all-tasks``) or a single
    task (``--manifest-path``), writing per-task results/plots and, in the
    all-tasks mode, aggregate statistics and plots.

    Returns:
        Process exit code: 0 on success, 1 on failure.
    """
    args = parse_args()

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.process_all_tasks:
        # Process all LIBERO-10 tasks
        manifests_root = args.manifests_root.expanduser()

        if not manifests_root.exists():
            print(f"Error: Manifests root directory not found: {manifests_root}")
            return 1

        print("=" * 80)
        print("EVALUATING ALL LIBERO-10 TASKS")
        print("=" * 80)
        print(f"Manifests root: {manifests_root}")
        print(f"Output directory: {output_dir}")
        print(f"Base URL: {args.base_url}")
        print(f"Total tasks to evaluate: {len(LIBERO_10_TASKS)}")
        print("=" * 80)

        successful_tasks = []
        failed_tasks = []
        # Per-task dicts returned by evaluate_single_task (results + statistics).
        all_task_results = []

        for idx, task_name in enumerate(LIBERO_10_TASKS, 1):
            print(f"\n[{idx}/{len(LIBERO_10_TASKS)}] Processing: {task_name}")

            # Find manifest file
            manifest_path = find_manifest_file(manifests_root, task_name)
            if manifest_path is None:
                print(f" [ERROR] Manifest file not found for task: {task_name}")
                failed_tasks.append(task_name)
                continue

            # Evaluate the task
            result = evaluate_single_task(
                manifest_path=manifest_path,
                output_dir=output_dir,
                base_url=args.base_url,
                timeout=args.timeout,
                use_reference=args.use_reference,
            )

            if result:
                successful_tasks.append(task_name)
                all_task_results.append(result)
            else:
                failed_tasks.append(task_name)

        # Print overall summary
        print("\n" + "=" * 80)
        print("EVALUATION COMPLETE - ALL TASKS")
        print("=" * 80)
        print(f"Successfully evaluated: {len(successful_tasks)}/{len(LIBERO_10_TASKS)} tasks")
        print(f"Failed: {len(failed_tasks)}/{len(LIBERO_10_TASKS)} tasks")

        if failed_tasks:
            print("\nFailed tasks:")
            for task in failed_tasks:
                print(f" - {task}")

        # Compute and display aggregate statistics
        if all_task_results:
            print("\n" + "=" * 80)
            print("AGGREGATE STATISTICS ACROSS ALL TASKS")
            print("=" * 80)

            all_mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
            all_median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
            all_std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

            print(f"\nOverall mean of task means: {np.mean(all_mean_values):.2f} ± {np.std(all_mean_values):.2f}")
            print(f"Overall median of task medians: {np.median(all_median_values):.2f}")
            print(f"Average std deviation: {np.mean(all_std_values):.2f}")

            print(f"\nBest performing task: {all_task_results[np.argmax(all_mean_values)]['task_name']} ({max(all_mean_values):.2f})")
            print(f"Worst performing task: {all_task_results[np.argmin(all_mean_values)]['task_name']} ({min(all_mean_values):.2f})")

            # Save aggregate statistics
            aggregate_stats = {
                "total_tasks": len(LIBERO_10_TASKS),
                "successful_tasks": len(successful_tasks),
                "failed_tasks": len(failed_tasks),
                "overall_mean_of_means": float(np.mean(all_mean_values)),
                "overall_std_of_means": float(np.std(all_mean_values)),
                "overall_median_of_medians": float(np.median(all_median_values)),
                "average_std_deviation": float(np.mean(all_std_values)),
                "best_task": all_task_results[np.argmax(all_mean_values)]['task_name'],
                "best_task_mean_value": float(max(all_mean_values)),
                "worst_task": all_task_results[np.argmin(all_mean_values)]['task_name'],
                "worst_task_mean_value": float(min(all_mean_values)),
                "task_results": [
                    {
                        "task_name": r["task_name"],
                        "mean_value": r["statistics"]["last_value_mean"],
                        "median_value": r["statistics"]["last_value_median"],
                        "std_value": r["statistics"]["last_value_std"],
                    }
                    for r in all_task_results
                ]
            }

            aggregate_path = output_dir / "aggregate_statistics.json"
            with aggregate_path.open("w", encoding="utf-8") as f:
                json.dump(aggregate_stats, f, indent=2)
            print(f"\nAggregate statistics saved to: {aggregate_path}")

            # Create aggregate plots
            plot_aggregate_statistics(all_task_results, output_dir)

            print("\n" + "=" * 80)
            print(f"All results saved to: {output_dir}")
            print("=" * 80)

    else:
        # Process a single task
        print("=" * 80)
        print("VLAC Value Estimation Evaluation - Single Task")
        print("=" * 80)

        result = evaluate_single_task(
            manifest_path=args.manifest_path.expanduser(),
            output_dir=output_dir,
            base_url=args.base_url,
            timeout=args.timeout,
            use_reference=args.use_reference,
        )

        if not result:
            print("\nEvaluation failed!")
            return 1

        # Print detailed statistics for single task
        statistics = result["statistics"]
        evaluation_results = result["evaluation_results"]

        print("\n" + "=" * 80)
        print("DETAILED EVALUATION SUMMARY")
        print("=" * 80)
        print(f"Task: {evaluation_results['task_name']}")
        print(f"Total demos: {evaluation_results['total_demos']}")
        print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
        print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

        if statistics:
            print("\n" + "-" * 80)
            print("SUCCESS FRAME VALUE STATISTICS")
            print("-" * 80)
            print(f"Mean: {statistics['last_value_mean']:.2f}")
            print(f"Std Dev: {statistics['last_value_std']:.2f}")
            print(f"Median: {statistics['last_value_median']:.2f}")
            print(f"Min: {statistics['last_value_min']:.2f}")
            print(f"Max: {statistics['last_value_max']:.2f}")
            print(f"Q25: {statistics['last_value_q25']:.2f}")
            print(f"Q75: {statistics['last_value_q75']:.2f}")

            print("\n" + "-" * 80)
            print("THRESHOLD ANALYSIS")
            print("-" * 80)
            # Mirrors the thresholds computed in compute_statistics.
            for threshold in [80, 85, 90, 95, 100]:
                count = statistics[f"count_above_{threshold}"]
                percent = statistics[f"percent_above_{threshold}"]
                print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

            print("\n" + "-" * 80)
            print(f"Mean latency: {statistics['mean_latency']:.2f}s")
            print("-" * 80)

        print("\n" + "=" * 80)
        print("EVALUATION COMPLETE")
        print("=" * 80)

    return 0
781
+
782
+
783
+ if __name__ == "__main__":
784
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152727.py ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
# LIBERO-10 task list
# Canonical names of the ten LIBERO-10 benchmark tasks.  These strings are
# used to build on-disk paths (manifest directories and "<task>_demo" frame
# folders), so they must match the dataset's folder names exactly.
LIBERO_10_TASKS = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
]
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Helpers
67
+ # ---------------------------------------------------------------------------
68
+
69
def sample_fixed_interval_frames(image_list, num_frames):
    """Sample ``num_frames`` items from ``image_list`` at equal intervals.

    The first and last elements are always included when ``num_frames >= 2``.
    If the list holds a single element, it is repeated ``num_frames`` times.

    Args:
        image_list: Ordered sequence of frames (or frame paths).
        num_frames: Number of frames to return.

    Returns:
        List of ``num_frames`` elements drawn from ``image_list``.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        # Only one frame available: repeat it to reach the requested length.
        return [image_list[0]] * num_frames
    # np.linspace with both endpoints included handles every num_frames
    # uniformly: for 2 it yields [first, last], for 3 [first, middle, last].
    # (The previous special case for num_frames == 3 incorrectly used index 1
    # as the "middle" frame regardless of sequence length.)
    total_frames = len(image_list)
    indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
    return [image_list[i] for i in indices]
85
+
86
+
87
# ---------------------------------------------------------------------------
# Reference-frame preloading (runs at module import time)
# ---------------------------------------------------------------------------

# Number of frames sampled from each expert demo as the reference trajectory.
num_frames_for_reference = 8

# Root directory of the single-expert-demo frames.  Overridable via env var so
# the script is usable outside the original author's machine; the default is
# kept for backward compatibility.
ref_frm_root_dir = os.environ.get(
    "LIBERO_REF_FRM_ROOT",
    "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10",
)

# Reuse the canonical task list instead of a second hand-copied list that
# could silently drift out of sync with LIBERO_10_TASKS.
libero_10_task_list = list(LIBERO_10_TASKS)

# task_name -> list of num_frames_for_reference frame paths sampled at fixed
# intervals from that task's expert demo.  Tasks whose frame directory is
# missing or empty are skipped with a warning instead of crashing the whole
# script at import time (previously an empty glob raised ValueError here).
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name + "_demo")
    ref_frm_file_list = sorted(glob.glob(os.path.join(ref_frm_task_dir, "*.png")))
    if not ref_frm_file_list:
        print(f"[warn] No reference frames found for task {task_name} in {ref_frm_task_dir}")
        continue
    reference_frames_dict[task_name] = sample_fixed_interval_frames(
        ref_frm_file_list, num_frames_for_reference
    )
108
+
109
+
110
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and resolve its frame paths.

    Frame paths stored in the manifest are relative to the manifest's own
    directory; they are rewritten to absolute paths before returning.

    Raises:
        FileNotFoundError: If ``manifest_path`` does not point to a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    with manifest_path.open("r", encoding="utf-8") as handle:
        data = json.load(handle)

    base_dir = manifest_path.parent
    for entry in data.get("demos", []):
        entry["frame_paths"] = [str(base_dir / rel) for rel in entry["frame_paths"]]

    return data
124
+
125
+
126
def image_to_base64(path: Path) -> str:
    """Load an image, re-encode it as RGB JPEG (quality 95), return base64 text."""
    out = BytesIO()
    with Image.open(path) as img:
        img.convert("RGB").save(out, format="JPEG", quality=95)
    return base64.b64encode(out.getvalue()).decode("utf-8")
133
+
134
+
135
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image in *paths*, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
138
+
139
+
140
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the decoded JSON response, augmented with a ``latency_sec`` field
    holding the round-trip wall-clock time.

    Raises:
        requests.HTTPError: On a non-2xx response.
    """
    reference = reference_b64 or []
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference),
        "skip": 1,
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started_at = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started_at
    return payload
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Evaluation
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, Any]:
    """Evaluate all demos in a manifest and collect value statistics.

    Args:
        manifest_data: Parsed manifest (see ``read_manifest``).
        base_url: VLAC service base URL.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: When True, send the preloaded expert reference frames
            for the task along with every demo.  (Previously this flag was
            printed but ignored: the reference was always sent.)

    Returns:
        Summary dict with per-demo results and the list of failed demo names.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    # Encode the reference trajectory once per task (it is loop-invariant;
    # previously it was re-encoded for every demo).
    reference_b64 = None
    if use_reference:
        reference_paths = reference_frames_dict.get(task_name)
        if reference_paths:
            print(f"Using reference frames for task {task_name}")
            reference_b64 = encode_images(reference_paths)
        else:
            print(f"[warn] No reference frames available for task {task_name}")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            frames_b64 = encode_images(frame_paths)

            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            results.append({
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            })
        except requests.RequestException as exc:
            # One unreachable/flaky request must not abort the whole run;
            # record the failure and keep going.
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
247
+
248
+
249
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize success-frame values and request latencies across demos.

    Returns an empty dict when no demos were successfully evaluated.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [entry["last_value"] for entry in results]
    latencies = [entry["latency_sec"] for entry in results]

    summary = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Fraction of demos whose success-frame value clears each threshold.
    for threshold in (80, 85, 90, 95, 100):
        hits = sum(1 for value in last_values if value >= threshold)
        summary[f"count_above_{threshold}"] = hits
        summary[f"percent_above_{threshold}"] = float(hits / len(last_values) * 100)

    return summary
278
+
279
+
280
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create visualization plots for value distribution.

    Renders a 2x2 figure (histogram, box plot, per-demo scatter, cumulative
    distribution) of the success-frame values for one task and saves it under
    *output_dir* as both PNG (300 dpi) and PDF, named after the task.
    Does nothing (after printing a note) when there are no results.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    # Each demo's last-frame value: the value at the designated success frame.
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values, with reference lines at the target (100)
    #    and the observed mean.
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos (scatter of value vs. demo index)
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution (empirical CDF, expressed in percent)
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Release the figure to avoid accumulating open figures across tasks.
    plt.close()
353
+
354
+
355
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write per-task detailed results and summary statistics as JSON files.

    Files are named ``<task>_evaluation_results.json`` and
    ``<task>_statistics.json`` inside *output_dir*.
    """
    task_name = evaluation_results["task_name"]

    detailed_path = output_dir / f"{task_name}_evaluation_results.json"
    with detailed_path.open("w", encoding="utf-8") as handle:
        json.dump(evaluation_results, handle, indent=2)
    print(f"\nDetailed results saved to: {detailed_path}")

    summary_path = output_dir / f"{task_name}_statistics.json"
    with summary_path.open("w", encoding="utf-8") as handle:
        json.dump(statistics, handle, indent=2)
    print(f"Statistics saved to: {summary_path}")
370
+
371
+
372
def find_manifest_file(manifests_root: Path, task_name: str) -> Optional[Path]:
    """Locate the manifest JSON for *task_name* under *manifests_root*.

    Probes the directory layouts commonly produced by the export scripts, in
    priority order, and returns the first existing file (or None).
    """
    candidates = (
        manifests_root / task_name / f"{task_name}_test_manifest.json",
        manifests_root / task_name / "test_manifest.json",
        manifests_root / f"{task_name}_test_manifest.json",
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)
389
+
390
+
391
def evaluate_single_task(
    manifest_path: Path,
    output_dir: Path,
    base_url: str,
    timeout: float,
    use_reference: bool,
) -> Optional[Dict]:
    """Evaluate a single task and return the statistics.

    Reads the manifest, runs the VLAC evaluation over every demo, computes
    summary statistics, and writes results and plots under
    ``output_dir/<task_name>/``.

    Returns:
        Dictionary with evaluation results and statistics, or None if failed
    """
    try:
        manifest_data = read_manifest(manifest_path)
    except FileNotFoundError as exc:
        # A missing manifest fails this task only; callers decide what to do.
        print(f"Error reading manifest: {exc}")
        return None

    task_name = manifest_data.get("task_name", "unknown")

    print(f"\n{'='*80}")
    print(f"Evaluating task: {task_name}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*80}")

    # Run evaluation
    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=base_url,
        timeout=timeout,
        use_reference=use_reference,
    )

    # Compute statistics (empty dict when no demo was evaluated successfully)
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "-" * 80)
    print("TASK EVALUATION SUMMARY")
    print("-" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    if statistics:
        print(f"\nMean success value: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Values >= 90: {statistics.get('count_above_90', 0)} ({statistics.get('percent_above_90', 0):.1f}%)")

    # Save results into a per-task subdirectory of output_dir
    task_output_dir = output_dir / task_name
    task_output_dir.mkdir(parents=True, exist_ok=True)
    save_results(evaluation_results, statistics, task_output_dir)

    # Create plots only when at least one demo produced values
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, task_output_dir)

    return {
        "task_name": task_name,
        "evaluation_results": evaluation_results,
        "statistics": statistics,
    }
456
+
457
+
458
def plot_aggregate_statistics(all_task_results: List[Dict], output_dir: Path) -> None:
    """Create aggregate plots across all tasks.

    Renders a 2x2 figure (per-task means, histogram of task means, per-task
    medians, per-task standard deviations) and saves it to *output_dir* as
    both PNG (300 dpi) and PDF.  No-op when *all_task_results* is empty.
    """
    if not all_task_results:
        return

    # Extract per-task summary statistics (one entry per task)
    task_names = [r["task_name"] for r in all_task_results]
    mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
    median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
    std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle("VLAC Value Estimation - Aggregate Statistics Across All Tasks", fontsize=16, fontweight='bold')

    # 1. Mean values per task (bars indexed 1..N; names omitted for width)
    ax1 = axes[0, 0]
    bars = ax1.bar(range(len(task_names)), mean_values, color='steelblue', alpha=0.7)
    ax1.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axhline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Overall Mean ({np.mean(mean_values):.1f})')
    ax1.set_xlabel('Task', fontsize=12)
    ax1.set_ylabel('Mean Success Value', fontsize=12)
    ax1.set_title('Mean Success Frame Values by Task', fontsize=14)
    ax1.set_xticks(range(len(task_names)))
    ax1.set_xticklabels(range(1, len(task_names) + 1))
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # 2. Distribution of mean values across tasks
    ax2 = axes[0, 1]
    ax2.hist(mean_values, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
    ax2.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.axvline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(mean_values):.1f})')
    ax2.set_xlabel('Mean Success Value', fontsize=12)
    ax2.set_ylabel('Frequency (Tasks)', fontsize=12)
    ax2.set_title('Distribution of Task-Level Mean Values', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Median values per task
    ax3 = axes[1, 0]
    bars = ax3.bar(range(len(task_names)), median_values, color='coral', alpha=0.7)
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.median(median_values), color='green', linestyle='-', linewidth=2, label=f'Overall Median ({np.median(median_values):.1f})')
    ax3.set_xlabel('Task', fontsize=12)
    ax3.set_ylabel('Median Success Value', fontsize=12)
    ax3.set_title('Median Success Frame Values by Task', fontsize=14)
    ax3.set_xticks(range(len(task_names)))
    ax3.set_xticklabels(range(1, len(task_names) + 1))
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')

    # 4. Std deviation per task (spread of success values within each task)
    ax4 = axes[1, 1]
    bars = ax4.bar(range(len(task_names)), std_values, color='orange', alpha=0.7)
    ax4.axhline(np.mean(std_values), color='green', linestyle='-', linewidth=2, label=f'Mean Std ({np.mean(std_values):.1f})')
    ax4.set_xlabel('Task', fontsize=12)
    ax4.set_ylabel('Standard Deviation', fontsize=12)
    ax4.set_title('Variability in Success Values by Task', fontsize=14)
    ax4.set_xticks(range(len(task_names)))
    ax4.set_xticklabels(range(1, len(task_names) + 1))
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    # Save plots
    plot_path = output_dir / "aggregate_statistics.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nAggregate plot saved to: {plot_path}")

    pdf_path = output_dir / "aggregate_statistics.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"Aggregate PDF saved to: {pdf_path}")

    # Release the figure to avoid leaking open figures
    plt.close()
534
+
535
+
536
+ # ---------------------------------------------------------------------------
537
+ # CLI
538
+ # ---------------------------------------------------------------------------
539
+
540
+
541
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface, validating mode flags."""
    cli = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )

    # Mode selection: batch over all LIBERO-10 tasks vs. a single manifest.
    cli.add_argument(
        "--process-all-tasks",
        action="store_true",
        help="Process all LIBERO-10 tasks",
    )

    # Arguments for processing all tasks
    cli.add_argument(
        "--manifests-root",
        type=Path,
        help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)",
    )

    # Arguments for processing a single task
    cli.add_argument(
        "--manifest-path",
        type=Path,
        help="Path to the test manifest JSON file (for single task mode)",
    )

    # Common arguments
    cli.add_argument(
        "--output-dir",
        type=Path,
        default="evaluation_results",
        help="Directory to save evaluation results and plots",
    )
    cli.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    cli.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    cli.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )

    parsed = cli.parse_args()

    # Each mode has one mandatory path argument; fail fast via argparse's
    # own error reporting (exits with usage message).
    if parsed.process_all_tasks and not parsed.manifests_root:
        cli.error("--manifests-root is required when using --process-all-tasks")
    if not parsed.process_all_tasks and not parsed.manifest_path:
        cli.error("--manifest-path is required for single task mode")

    return parsed
602
+
603
+
604
def main() -> int:
    """Script entry point.

    Dispatches between two modes: evaluating every LIBERO-10 task under a
    manifests root (``--process-all-tasks``), or a single task from one
    manifest path.  Returns a process exit code (0 on success, 1 on failure).
    """
    args = parse_args()

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.process_all_tasks:
        # Process all LIBERO-10 tasks
        manifests_root = args.manifests_root.expanduser()

        if not manifests_root.exists():
            print(f"Error: Manifests root directory not found: {manifests_root}")
            return 1

        print("=" * 80)
        print("EVALUATING ALL LIBERO-10 TASKS")
        print("=" * 80)
        print(f"Manifests root: {manifests_root}")
        print(f"Output directory: {output_dir}")
        print(f"Base URL: {args.base_url}")
        print(f"Total tasks to evaluate: {len(LIBERO_10_TASKS)}")
        print("=" * 80)

        successful_tasks = []
        failed_tasks = []
        all_task_results = []

        for idx, task_name in enumerate(LIBERO_10_TASKS, 1):
            print(f"\n[{idx}/{len(LIBERO_10_TASKS)}] Processing: {task_name}")

            # Find manifest file; a missing manifest fails this task only.
            manifest_path = find_manifest_file(manifests_root, task_name)
            if manifest_path is None:
                print(f" [ERROR] Manifest file not found for task: {task_name}")
                failed_tasks.append(task_name)
                continue

            # Evaluate the task (writes per-task results/plots itself)
            result = evaluate_single_task(
                manifest_path=manifest_path,
                output_dir=output_dir,
                base_url=args.base_url,
                timeout=args.timeout,
                use_reference=args.use_reference,
            )

            if result:
                successful_tasks.append(task_name)
                all_task_results.append(result)
            else:
                failed_tasks.append(task_name)

        # Print overall summary
        print("\n" + "=" * 80)
        print("EVALUATION COMPLETE - ALL TASKS")
        print("=" * 80)
        print(f"Successfully evaluated: {len(successful_tasks)}/{len(LIBERO_10_TASKS)} tasks")
        print(f"Failed: {len(failed_tasks)}/{len(LIBERO_10_TASKS)} tasks")

        if failed_tasks:
            print("\nFailed tasks:")
            for task in failed_tasks:
                print(f" - {task}")

        # Compute and display aggregate statistics
        if all_task_results:
            print("\n" + "=" * 80)
            print("AGGREGATE STATISTICS ACROSS ALL TASKS")
            print("=" * 80)

            all_mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
            all_median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
            all_std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

            print(f"\nOverall mean of task means: {np.mean(all_mean_values):.2f} ± {np.std(all_mean_values):.2f}")
            print(f"Overall median of task medians: {np.median(all_median_values):.2f}")
            print(f"Average std deviation: {np.mean(all_std_values):.2f}")

            print(f"\nBest performing task: {all_task_results[np.argmax(all_mean_values)]['task_name']} ({max(all_mean_values):.2f})")
            print(f"Worst performing task: {all_task_results[np.argmin(all_mean_values)]['task_name']} ({min(all_mean_values):.2f})")

            # Save aggregate statistics alongside per-task outputs
            aggregate_stats = {
                "total_tasks": len(LIBERO_10_TASKS),
                "successful_tasks": len(successful_tasks),
                "failed_tasks": len(failed_tasks),
                "overall_mean_of_means": float(np.mean(all_mean_values)),
                "overall_std_of_means": float(np.std(all_mean_values)),
                "overall_median_of_medians": float(np.median(all_median_values)),
                "average_std_deviation": float(np.mean(all_std_values)),
                "best_task": all_task_results[np.argmax(all_mean_values)]['task_name'],
                "best_task_mean_value": float(max(all_mean_values)),
                "worst_task": all_task_results[np.argmin(all_mean_values)]['task_name'],
                "worst_task_mean_value": float(min(all_mean_values)),
                "task_results": [
                    {
                        "task_name": r["task_name"],
                        "mean_value": r["statistics"]["last_value_mean"],
                        "median_value": r["statistics"]["last_value_median"],
                        "std_value": r["statistics"]["last_value_std"],
                    }
                    for r in all_task_results
                ]
            }

            aggregate_path = output_dir / "aggregate_statistics.json"
            with aggregate_path.open("w", encoding="utf-8") as f:
                json.dump(aggregate_stats, f, indent=2)
            print(f"\nAggregate statistics saved to: {aggregate_path}")

            # Create aggregate plots
            plot_aggregate_statistics(all_task_results, output_dir)

            print("\n" + "=" * 80)
            print(f"All results saved to: {output_dir}")
            print("=" * 80)

    else:
        # Process a single task
        print("=" * 80)
        print("VLAC Value Estimation Evaluation - Single Task")
        print("=" * 80)

        result = evaluate_single_task(
            manifest_path=args.manifest_path.expanduser(),
            output_dir=output_dir,
            base_url=args.base_url,
            timeout=args.timeout,
            use_reference=args.use_reference,
        )

        if not result:
            print("\nEvaluation failed!")
            return 1

        # Print detailed statistics for single task
        statistics = result["statistics"]
        evaluation_results = result["evaluation_results"]

        print("\n" + "=" * 80)
        print("DETAILED EVALUATION SUMMARY")
        print("=" * 80)
        print(f"Task: {evaluation_results['task_name']}")
        print(f"Total demos: {evaluation_results['total_demos']}")
        print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
        print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

        if statistics:
            print("\n" + "-" * 80)
            print("SUCCESS FRAME VALUE STATISTICS")
            print("-" * 80)
            print(f"Mean: {statistics['last_value_mean']:.2f}")
            print(f"Std Dev: {statistics['last_value_std']:.2f}")
            print(f"Median: {statistics['last_value_median']:.2f}")
            print(f"Min: {statistics['last_value_min']:.2f}")
            print(f"Max: {statistics['last_value_max']:.2f}")
            print(f"Q25: {statistics['last_value_q25']:.2f}")
            print(f"Q75: {statistics['last_value_q75']:.2f}")

            print("\n" + "-" * 80)
            print("THRESHOLD ANALYSIS")
            print("-" * 80)
            for threshold in [80, 85, 90, 95, 100]:
                count = statistics[f"count_above_{threshold}"]
                percent = statistics[f"percent_above_{threshold}"]
                print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

            print("\n" + "-" * 80)
            print(f"Mean latency: {statistics['mean_latency']:.2f}s")
            print("-" * 80)

        print("\n" + "=" * 80)
        print("EVALUATION COMPLETE")
        print("=" * 80)

    return 0
781
+
782
+
783
if __name__ == "__main__":
    # Propagate main()'s status code so shell callers can detect failures.
    sys.exit(main())