JosephBai commited on
Commit
857c2e9
·
verified ·
1 Parent(s): 74d1c6f

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. .gitattributes +71 -0
  3. Dev/.DS_Store +0 -0
  4. Dev/.history/examples/run_openvla_oft_rl_vlac_20250926003154.sh +109 -0
  5. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928021537.sh +109 -0
  6. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928101936.sh +109 -0
  7. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928110056.sh +109 -0
  8. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115107.sh +109 -0
  9. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115109.sh +109 -0
  10. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175228.sh +109 -0
  11. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175432.sh +109 -0
  12. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175459.sh +109 -0
  13. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230226.sh +109 -0
  14. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230315.sh +109 -0
  15. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230435.sh +109 -0
  16. Dev/.history/examples/run_openvla_oft_rl_vlac_20250928234553.sh +109 -0
  17. Dev/.history/examples/run_openvla_oft_rl_vlac_20250929122641.sh +109 -0
  18. Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124054.sh +109 -0
  19. Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124057.sh +109 -0
  20. Dev/.history/examples/run_openvla_oft_rl_vlac_20250929130229.sh +109 -0
  21. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223735.sh +109 -0
  22. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223952.sh +109 -0
  23. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224119.sh +109 -0
  24. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224233.sh +109 -0
  25. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224326.sh +109 -0
  26. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114711.sh +109 -0
  27. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114715.sh +109 -0
  28. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114806.sh +109 -0
  29. Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114816.sh +109 -0
  30. Dev/.history/launch_vlac_service_20251002114022.py +0 -0
  31. Dev/.history/launch_vlac_service_20251002114026.py +23 -0
  32. Dev/.history/setup_verl_20250930114055.sh +0 -0
  33. Dev/.history/setup_verl_20250930114105.sh +32 -0
  34. Dev/.history/setup_vlac_20250930114110.sh +0 -0
  35. Dev/.history/setup_vlac_20250930114358.sh +6 -0
  36. Dev/.history/setup_vlac_20250930120731.sh +6 -0
  37. Dev/.history/testing/evaluate_test_demo_values_20251008150855.py +422 -0
  38. Dev/.history/testing/evaluate_test_demo_values_20251008150925.py +422 -0
  39. Dev/.history/testing/evaluate_test_demo_values_20251008151015.py +422 -0
  40. Dev/.history/testing/evaluate_test_demo_values_20251008151156.py +422 -0
  41. Dev/.history/testing/evaluate_test_demo_values_20251008151427.py +465 -0
  42. Dev/.history/testing/evaluate_test_demo_values_20251008151542.py +466 -0
  43. Dev/.history/testing/evaluate_test_demo_values_20251008151723.py +466 -0
  44. Dev/.history/testing/evaluate_test_demo_values_20251008151816.py +465 -0
  45. Dev/.history/testing/evaluate_test_demo_values_20251008152522.py +477 -0
  46. Dev/.history/testing/evaluate_test_demo_values_20251008152534.py +491 -0
  47. Dev/.history/testing/evaluate_test_demo_values_20251008152548.py +519 -0
  48. Dev/.history/testing/evaluate_test_demo_values_20251008152620.py +683 -0
  49. Dev/.history/testing/evaluate_test_demo_values_20251008152700.py +784 -0
  50. Dev/.history/testing/evaluate_test_demo_values_20251008152727.py +784 -0
.DS_Store ADDED
Binary file (12.3 kB). View file
 
.gitattributes CHANGED
@@ -33,3 +33,74 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Dev/evo_vlac/examples/videos/pick-bowl-ref.mov filter=lfs diff=lfs merge=lfs -text
37
+ Dev/evo_vlac/examples/videos/pick-bowl-test.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
39
+ Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
40
+ Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
41
+ Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove_value_distribution.png filter=lfs diff=lfs merge=lfs -text
42
+ Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
43
+ Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
44
+ Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
45
+ Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
46
+ Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
47
+ Dev/testing/evaluation_results_all_tasks/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy_value_distribution.png filter=lfs diff=lfs merge=lfs -text
48
+ Dev/testing/evaluation_results_all_tasks/aggregate_statistics.png filter=lfs diff=lfs merge=lfs -text
49
+ Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
50
+ Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
51
+ Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
52
+ Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove_value_distribution.png filter=lfs diff=lfs merge=lfs -text
53
+ Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
54
+ Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
55
+ Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
56
+ Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
57
+ Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
58
+ Dev/testing/evaluation_results_all_tasks_2frms/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy_value_distribution.png filter=lfs diff=lfs merge=lfs -text
59
+ Dev/testing/evaluation_results_all_tasks_2frms/aggregate_statistics.png filter=lfs diff=lfs merge=lfs -text
60
+ Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
61
+ Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
62
+ Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
63
+ Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove_value_distribution.png filter=lfs diff=lfs merge=lfs -text
64
+ Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
65
+ Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
66
+ Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
67
+ Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
68
+ Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
69
+ Dev/testing/evaluation_results_all_tasks_8frms/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy_value_distribution.png filter=lfs diff=lfs merge=lfs -text
70
+ Dev/testing/evaluation_results_all_tasks_8frms/aggregate_statistics.png filter=lfs diff=lfs merge=lfs -text
71
+ Dev/testing/success_rate_across_trials.png filter=lfs diff=lfs merge=lfs -text
72
+ Dev/visual_prompting/task_1_demo_with_traj.png filter=lfs diff=lfs merge=lfs -text
73
+ Release/docs/assets/method_overview.png filter=lfs diff=lfs merge=lfs -text
74
+ Release/docs/assets/qualitative.png filter=lfs diff=lfs merge=lfs -text
75
+ Release/docs/assets/teaser.png filter=lfs diff=lfs merge=lfs -text
76
+ Release/reward_model/evo_vlac/examples/videos/pick-bowl-ref.mov filter=lfs diff=lfs merge=lfs -text
77
+ Release/reward_model/evo_vlac/examples/videos/pick-bowl-test.mp4 filter=lfs diff=lfs merge=lfs -text
78
+ Reward/Robo-Dopamine/assets/eval.png filter=lfs diff=lfs merge=lfs -text
79
+ Reward/Robo-Dopamine/assets/example_backward.png filter=lfs diff=lfs merge=lfs -text
80
+ Reward/Robo-Dopamine/assets/example_forward.png filter=lfs diff=lfs merge=lfs -text
81
+ Reward/Robo-Dopamine/assets/example_incremental.png filter=lfs diff=lfs merge=lfs -text
82
+ Reward/Robo-Dopamine/assets/method.png filter=lfs diff=lfs merge=lfs -text
83
+ Reward/Robo-Dopamine/assets/teasor.png filter=lfs diff=lfs merge=lfs -text
84
+ Reward/Robo-Dopamine/assets/vsi.png filter=lfs diff=lfs merge=lfs -text
85
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_001/cam_high.mp4 filter=lfs diff=lfs merge=lfs -text
86
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_001/cam_left_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
87
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_001/cam_right_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
88
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_002/cam_high.mp4 filter=lfs diff=lfs merge=lfs -text
89
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_002/cam_left_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
90
+ Reward/Robo-Dopamine/dataset/example_raw_data/episode_002/cam_right_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
91
+ Reward/Robo-Dopamine/examples/demo_table/cam_high.mp4 filter=lfs diff=lfs merge=lfs -text
92
+ Reward/Robo-Dopamine/examples/demo_table/cam_left_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
93
+ Reward/Robo-Dopamine/examples/demo_table/cam_right_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
94
+ Reward/Robo-Dopamine/examples/demo_table/goal_image.png filter=lfs diff=lfs merge=lfs -text
95
+ Reward/VLAC/data/VLAC_EAI.pdf filter=lfs diff=lfs merge=lfs -text
96
+ Reward/VLAC/data/framework.png filter=lfs diff=lfs merge=lfs -text
97
+ Reward/VLAC/data/title_banner-2.gif filter=lfs diff=lfs merge=lfs -text
98
+ Reward/VLAC/evo_vlac/examples/videos/pick-bowl-ref.mov filter=lfs diff=lfs merge=lfs -text
99
+ Reward/VLAC/evo_vlac/examples/videos/pick-bowl-test.mp4 filter=lfs diff=lfs merge=lfs -text
100
+ Reward/robometer/assets/robometer.jpg filter=lfs diff=lfs merge=lfs -text
101
+ Reward/robometer/scripts/example_videos/soar_put_green_stick_in_brown_bowl_rewards_progress_success.png filter=lfs diff=lfs merge=lfs -text
102
+ arxiv/arxiv.pdf filter=lfs diff=lfs merge=lfs -text
103
+ arxiv/fig/fig1.pdf filter=lfs diff=lfs merge=lfs -text
104
+ arxiv/fig/mismatch.pdf filter=lfs diff=lfs merge=lfs -text
105
+ arxiv/fig/qualitative.pdf filter=lfs diff=lfs merge=lfs -text
106
+ arxiv/fig/ttt_vla_main.pdf filter=lfs diff=lfs merge=lfs -text
Dev/.DS_Store ADDED
Binary file (10.2 kB). View file
 
Dev/.history/examples/run_openvla_oft_rl_vlac_20250926003154.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928021537.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=2 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928101936.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=8 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928110056.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=8 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115107.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=8 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=4 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115109.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=8 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=4 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175228.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=8 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=4 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175432.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175459.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230226.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230315.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230435.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928234553.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=25 \
100
+ trainer.test_freq=4 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929122641.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=5 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124054.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124057.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929130229.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223735.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=False \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223952.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=1 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224119.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224233.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224326.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114711.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114715.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=True \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114806.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114816.sh ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # OpenVLA-OFT RL Training with VLAC Integration
4
+ # Based on run_openvla_oft_rl.sh but with VLAC service integration
5
+
6
+ set -x
7
+
8
+ export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
9
+ export PYOPENGL_PLATFORM="egl"
10
+
11
+ export NCCL_DEBUG=WARN
12
+ export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
13
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
14
+ export TOKENIZERS_PARALLELISM=true
15
+ export CUDA_LAUNCH_BLOCKING=1
16
+ export TORCH_USE_CUDA_DSA=1
17
+
18
+ # VLAC Service Configuration
19
+ export VLAC_SERVICE_URL="http://localhost:8111"
20
+
21
+ # Before starting training, make sure VLAC service is running:
22
+ # python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
23
+
24
+ PROJECT_NAME='SimpleVLA-RL-VLAC'
25
+ EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
26
+
27
+ # For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
28
+ SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
29
+ CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
30
+ # DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
31
+ DATASET_NAME="libero_10"
32
+ VLA_NAME="openvla-oft"
33
+ NUM_GPUS=8
34
+ # If you want to use 2*8 GPU to RL. Set NUM_NODES=2
35
+ NUM_NODES=1
36
+ ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
37
+
38
+ HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
39
+ data.task_suite_name=$DATASET_NAME \
40
+ data.num_trials_per_task=50 \
41
+ data.n_samples=8 \
42
+ data.filter_accuracy=True \
43
+ data.accuracy_lower_bound=0.1 \
44
+ data.accuracy_upper_bound=0.9 \
45
+ data.oversample_factor=1 \
46
+ data.train_batch_size=64 \
47
+ data.val_batch_size=496 \
48
+ data.max_prompt_length=256 \
49
+ data.max_response_length=128 \
50
+ actor_rollout_ref.model.path=$SFT_MODEL_PATH \
51
+ actor_rollout_ref.model.vla=$VLA_NAME \
52
+ actor_rollout_ref.model.action_token_len=7 \
53
+ actor_rollout_ref.model.action_chunks_len=8 \
54
+ actor_rollout_ref.actor.optim.lr=5e-6 \
55
+ actor_rollout_ref.actor.optim.warmup_style=constant \
56
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
57
+ actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
58
+ actor_rollout_ref.actor.use_dynamic_bsz=False \
59
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
60
+ actor_rollout_ref.actor.fsdp_config.grad_offload=True \
61
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
62
+ actor_rollout_ref.actor.grad_clip=1 \
63
+ actor_rollout_ref.actor.clip_ratio_high=0.28 \
64
+ actor_rollout_ref.actor.clip_ratio_low=0.2 \
65
+ actor_rollout_ref.actor.num_images_in_input=1 \
66
+ actor_rollout_ref.actor.traj_mini_batch_size=16 \
67
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
68
+ actor_rollout_ref.model.use_remove_padding=False \
69
+ actor_rollout_ref.actor.entropy_coeff=0. \
70
+ actor_rollout_ref.rollout.num_images_in_input=1 \
71
+ actor_rollout_ref.rollout.val_micro_batch_size=8 \
72
+ actor_rollout_ref.rollout.temperature=1.6 \
73
+ actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
74
+ actor_rollout_ref.rollout.micro_batch_size=1 \
75
+ actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
76
+ actor_rollout_ref.rollout.model_family=openvla \
77
+ actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
78
+ actor_rollout_ref.rollout.num_steps_wait=10 \
79
+ actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
80
+ actor_rollout_ref.rollout.center_crop=True \
81
+ actor_rollout_ref.rollout.max_prompt_length=512 \
82
+ actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
83
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
84
+ actor_rollout_ref.rollout.name=hf \
85
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
86
+ \
87
+ +actor_rollout_ref.rollout.use_vlac=true \
88
+ +actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
89
+ \
90
+ actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
91
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
92
+ algorithm.kl_ctrl.kl_coef=0.00 \
93
+ trainer.logger=['console','wandb'] \
94
+ trainer.project_name=$PROJECT_NAME \
95
+ trainer.experiment_name=$EXPERIMENT_NAME \
96
+ trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
97
+ trainer.n_gpus_per_node=$NUM_GPUS \
98
+ trainer.nnodes=$NUM_NODES \
99
+ trainer.save_freq=10 \
100
+ trainer.test_freq=2 \
101
+ trainer.total_epochs=100 \
102
+ trainer.val_only=False \
103
+ trainer.val_before_train=False \
104
+ trainer.val_use_vlac=False \
105
+ algorithm.adv_estimator=grpo \
106
+ algorithm.adv_params.verifier_gamma=1.0 \
107
+ algorithm.adv_params.reward_model_gamma=1.0 \
108
+ trainer.runtime_env=$ALIGN_PATH \
109
+ trainer.wandb_mode=online
Dev/.history/launch_vlac_service_20251002114022.py ADDED
File without changes
Dev/.history/launch_vlac_service_20251002114026.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import torch
4
+
5
+ def launch_servers(base_port=8111):
6
+ num_gpus = torch.cuda.device_count()
7
+ processes = []
8
+ for gpu_id in range(num_gpus):
9
+ port = base_port + gpu_id
10
+ env = os.environ.copy()
11
+ env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
12
+ cmd = [
13
+ "python", "vlac_service.py", # 你写的 FastAPI 代码文件
14
+ "--port", str(port)
15
+ ]
16
+ print(f"Launching GPU {gpu_id} on port {port}")
17
+ p = subprocess.Popen(cmd, env=env)
18
+ processes.append(p)
19
+ for p in processes:
20
+ p.wait()
21
+
22
+ if __name__ == "__main__":
23
+ launch_servers()
Dev/.history/setup_verl_20250930114055.sh ADDED
File without changes
Dev/.history/setup_verl_20250930114105.sh ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ conda create -n verl python==3.10
2
+ conda activate verl
3
+
4
+ cd /mnt/bn/vgfm2/test_dit/zechen/RL_Playground/verl
5
+ pip install --no-deps -e .
6
+
7
+ cd ../../openvla-oft/
8
+ pip install -e .
9
+
10
+ cd LIBERO
11
+ pip install -e .
12
+
13
+ cd ..
14
+ pip install -r experiments/robot/libero/libero_requirements.txt
15
+
16
+ pip install packaging ninja
17
+ ninja --version; echo $?
18
+
19
+ pip install git+https://github.com/NICTA/pyairports.git
20
+
21
+ cd ../SimpleVLA-RL
22
+ pip install -r req.txt
23
+ pip uninstall torch torchvision torchaudio
24
+
25
+ pip3 install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121
26
+ pip install transformers@git+https://github.com/moojink/transformers-openvla-oft.git
27
+
28
+ pip uninstall flash_attn
29
+ pip install "flash-attn==2.5.5" --no-build-isolation --no-cache-dir
30
+
31
+ conda install -c conda-forge libegl-devel
32
+ sudo apt install libosmesa6 libosmesa6-dev
Dev/.history/setup_vlac_20250930114110.sh ADDED
File without changes
Dev/.history/setup_vlac_20250930114358.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ conda create -n vlac python==3.10
2
+ conda activate vlac
3
+
4
+ pip install ms-swift==3.3 transformers==4.51.0 peft==0.15.2 opencv-python loguru
5
+ pip3 install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121
6
+ pip install "flash-attn==2.5.5" --no-build-isolation --no-cache-dir
Dev/.history/setup_vlac_20250930120731.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ conda create -n vlac python==3.10
2
+ conda activate vlac
3
+
4
+ pip install ms-swift==3.3 transformers==4.51.0 peft==0.15.2 opencv-python loguru timm
5
+ pip3 install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121
6
+ pip install "flash-attn==2.5.5" --no-build-isolation --no-cache-dir
Dev/.history/testing/evaluate_test_demo_values_20251008150855.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import sys
26
+ import time
27
+ from io import BytesIO
28
+ from pathlib import Path
29
+ from typing import Dict, List, Optional
30
+
31
+ import matplotlib.pyplot as plt
32
+ import numpy as np
33
+ import requests
34
+ from PIL import Image
35
+ from tqdm import tqdm
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Helpers
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
+ def read_manifest(manifest_path: Path) -> Dict:
43
+ """Read the test demo manifest JSON file."""
44
+ if not manifest_path.is_file():
45
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
46
+
47
+ with manifest_path.open("r", encoding="utf-8") as f:
48
+ manifest_data = json.load(f)
49
+
50
+ # Convert relative paths to absolute paths
51
+ manifest_dir = manifest_path.parent
52
+ for demo in manifest_data.get("demos", []):
53
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
54
+
55
+ return manifest_data
56
+
57
+
58
+ def image_to_base64(path: Path) -> str:
59
+ """Convert an image file to base64 encoded JPEG."""
60
+ with Image.open(path) as img:
61
+ img = img.convert("RGB")
62
+ buffer = BytesIO()
63
+ img.save(buffer, format="JPEG", quality=95)
64
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
65
+
66
+
67
+ def encode_images(paths: List[str]) -> List[str]:
68
+ """Encode a list of image paths to base64."""
69
+ return [image_to_base64(Path(p)) for p in paths]
70
+
71
+
72
+ def call_trajectory_critic(
73
+ session: requests.Session,
74
+ base_url: str,
75
+ task: str,
76
+ frames_b64: List[str],
77
+ reference_b64: Optional[List[str]],
78
+ timeout: float,
79
+ ) -> Dict:
80
+ """Call the VLAC trajectory-critic endpoint."""
81
+ payload = {
82
+ "task": task,
83
+ "frames": frames_b64,
84
+ "reference": reference_b64,
85
+ "ref_num": len(reference_b64 or []),
86
+ "skip": 1,
87
+ "batch_size": min(len(frames_b64), 8),
88
+ "think": False,
89
+ "return_video": False,
90
+ }
91
+ start = time.time()
92
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
93
+ resp.raise_for_status()
94
+ result = resp.json()
95
+ result["latency_sec"] = time.time() - start
96
+ return result
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Evaluation
101
+ # ---------------------------------------------------------------------------
102
+
103
+
104
+ def evaluate_demos(
105
+ manifest_data: Dict,
106
+ base_url: str,
107
+ timeout: float,
108
+ use_reference: bool = False,
109
+ ) -> Dict[str, any]:
110
+ """Evaluate all demos and collect value statistics."""
111
+ session = requests.Session()
112
+ task_name = manifest_data.get("task_name", "")
113
+ demos = manifest_data.get("demos", [])
114
+
115
+ results = []
116
+ failed_demos = []
117
+
118
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
119
+ print(f"Task: {task_name}")
120
+ print(f"Use reference: {use_reference}\n")
121
+
122
+ for demo in tqdm(demos, desc="Processing demos"):
123
+ demo_name = demo["demo_name"]
124
+ frame_paths = demo["frame_paths"]
125
+
126
+ try:
127
+ # Encode frames
128
+ frames_b64 = encode_images(frame_paths)
129
+
130
+ # For now, no reference trajectory (can be added later)
131
+ reference_b64 = None
132
+
133
+ # Call VLAC service
134
+ result = call_trajectory_critic(
135
+ session=session,
136
+ base_url=base_url,
137
+ task=task_name,
138
+ frames_b64=frames_b64,
139
+ reference_b64=reference_b64,
140
+ timeout=timeout,
141
+ )
142
+
143
+ # Extract values
144
+ value_list = result.get("value_list", [])
145
+ if not value_list:
146
+ print(f"\n[warn] No values returned for demo {demo_name}")
147
+ failed_demos.append(demo_name)
148
+ continue
149
+
150
+ # Record results
151
+ demo_result = {
152
+ "demo_name": demo_name,
153
+ "total_frames": demo["total_frames"],
154
+ "success_index": demo["success_index"],
155
+ "num_sampled_frames": len(frame_paths),
156
+ "value_list": value_list,
157
+ "last_value": value_list[-1], # The critical value for success frame
158
+ "mean_value": float(np.mean(value_list)),
159
+ "std_value": float(np.std(value_list)),
160
+ "latency_sec": result.get("latency_sec", 0.0),
161
+ }
162
+ results.append(demo_result)
163
+
164
+ except requests.RequestException as exc:
165
+ print(f"\n[error] Request failed for demo {demo_name}: {exc}")
166
+ failed_demos.append(demo_name)
167
+ except Exception as exc:
168
+ print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
169
+ failed_demos.append(demo_name)
170
+
171
+ return {
172
+ "task_name": task_name,
173
+ "total_demos": len(demos),
174
+ "successful_evals": len(results),
175
+ "failed_demos": failed_demos,
176
+ "results": results,
177
+ }
178
+
179
+
180
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
181
+ """Compute summary statistics from evaluation results."""
182
+ results = evaluation_results["results"]
183
+ if not results:
184
+ return {}
185
+
186
+ last_values = [r["last_value"] for r in results]
187
+ mean_values = [r["mean_value"] for r in results]
188
+ latencies = [r["latency_sec"] for r in results]
189
+
190
+ stats = {
191
+ "last_value_mean": float(np.mean(last_values)),
192
+ "last_value_std": float(np.std(last_values)),
193
+ "last_value_min": float(np.min(last_values)),
194
+ "last_value_max": float(np.max(last_values)),
195
+ "last_value_median": float(np.median(last_values)),
196
+ "last_value_q25": float(np.percentile(last_values, 25)),
197
+ "last_value_q75": float(np.percentile(last_values, 75)),
198
+ "mean_latency": float(np.mean(latencies)),
199
+ "total_evaluated": len(results),
200
+ }
201
+
202
+ # Count how many demos have last_value >= various thresholds
203
+ for threshold in [80, 85, 90, 95, 100]:
204
+ count = sum(1 for v in last_values if v >= threshold)
205
+ stats[f"count_above_{threshold}"] = count
206
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
207
+
208
+ return stats
209
+
210
+
211
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
212
+ """Create visualization plots for value distribution."""
213
+ results = evaluation_results["results"]
214
+ if not results:
215
+ print("No results to plot")
216
+ return
217
+
218
+ task_name = evaluation_results["task_name"]
219
+ last_values = [r["last_value"] for r in results]
220
+
221
+ # Create figure with multiple subplots
222
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
223
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
224
+
225
+ # 1. Histogram of last values
226
+ ax1 = axes[0, 0]
227
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
228
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
229
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
230
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
231
+ ax1.set_ylabel('Frequency', fontsize=12)
232
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
233
+ ax1.legend()
234
+ ax1.grid(True, alpha=0.3)
235
+
236
+ # 2. Box plot of last values
237
+ ax2 = axes[0, 1]
238
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
239
+ for patch in box_data['boxes']:
240
+ patch.set_facecolor('lightblue')
241
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
242
+ ax2.set_ylabel('Value', fontsize=12)
243
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
244
+ ax2.legend()
245
+ ax2.grid(True, alpha=0.3, axis='y')
246
+
247
+ # 3. Value progression across demos
248
+ ax3 = axes[1, 0]
249
+ demo_indices = range(len(results))
250
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
251
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
252
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
253
+ ax3.set_xlabel('Demo Index', fontsize=12)
254
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
255
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
256
+ ax3.legend()
257
+ ax3.grid(True, alpha=0.3)
258
+
259
+ # 4. Cumulative distribution
260
+ ax4 = axes[1, 1]
261
+ sorted_values = np.sort(last_values)
262
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
263
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
264
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
265
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
266
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
267
+ ax4.set_title('Cumulative Distribution', fontsize=14)
268
+ ax4.legend()
269
+ ax4.grid(True, alpha=0.3)
270
+
271
+ plt.tight_layout()
272
+
273
+ # Save the plot
274
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
275
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
276
+ print(f"\nPlot saved to: {plot_path}")
277
+
278
+ # Also save a PDF version
279
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
280
+ plt.savefig(pdf_path, bbox_inches='tight')
281
+ print(f"PDF saved to: {pdf_path}")
282
+
283
+ plt.close()
284
+
285
+
286
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
287
+ """Save evaluation results and statistics to JSON files."""
288
+ task_name = evaluation_results["task_name"]
289
+
290
+ # Save detailed results
291
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
292
+ with results_path.open("w", encoding="utf-8") as f:
293
+ json.dump(evaluation_results, f, indent=2)
294
+ print(f"\nDetailed results saved to: {results_path}")
295
+
296
+ # Save summary statistics
297
+ stats_path = output_dir / f"{task_name}_statistics.json"
298
+ with stats_path.open("w", encoding="utf-8") as f:
299
+ json.dump(statistics, f, indent=2)
300
+ print(f"Statistics saved to: {stats_path}")
301
+
302
+
303
+ # ---------------------------------------------------------------------------
304
+ # CLI
305
+ # ---------------------------------------------------------------------------
306
+
307
+
308
+ def parse_args() -> argparse.Namespace:
309
+ parser = argparse.ArgumentParser(
310
+ description="Evaluate value estimation for test demonstrations"
311
+ )
312
+ parser.add_argument(
313
+ "--manifest-path",
314
+ type=Path,
315
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
316
+ help="Path to the test manifest JSON file",
317
+ )
318
+ parser.add_argument(
319
+ "--output-dir",
320
+ type=Path,
321
+ default="evaluation_results",
322
+ help="Directory to save evaluation results and plots",
323
+ )
324
+ parser.add_argument(
325
+ "--base-url",
326
+ default="http://localhost:8111",
327
+ help="VLAC service base URL (default: http://localhost:8111)",
328
+ )
329
+ parser.add_argument(
330
+ "--timeout",
331
+ type=float,
332
+ default=30.0,
333
+ help="HTTP request timeout in seconds (default: 30.0)",
334
+ )
335
+ parser.add_argument(
336
+ "--use-reference",
337
+ action="store_true",
338
+ help="Use reference trajectory (if available)",
339
+ )
340
+ return parser.parse_args()
341
+
342
+
343
+ def main() -> int:
344
+ args = parse_args()
345
+
346
+ # Read manifest
347
+ try:
348
+ manifest_data = read_manifest(args.manifest_path)
349
+ except FileNotFoundError as exc:
350
+ print(f"Error: {exc}")
351
+ return 1
352
+
353
+ # Create output directory
354
+ output_dir = args.output_dir.expanduser()
355
+ output_dir.mkdir(parents=True, exist_ok=True)
356
+
357
+ # Run evaluation
358
+ print("=" * 80)
359
+ print("VLAC Value Estimation Evaluation")
360
+ print("=" * 80)
361
+
362
+ evaluation_results = evaluate_demos(
363
+ manifest_data=manifest_data,
364
+ base_url=args.base_url,
365
+ timeout=args.timeout,
366
+ use_reference=args.use_reference,
367
+ )
368
+
369
+ # Compute statistics
370
+ statistics = compute_statistics(evaluation_results)
371
+
372
+ # Print summary
373
+ print("\n" + "=" * 80)
374
+ print("EVALUATION SUMMARY")
375
+ print("=" * 80)
376
+ print(f"Task: {evaluation_results['task_name']}")
377
+ print(f"Total demos: {evaluation_results['total_demos']}")
378
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
379
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
380
+
381
+ if statistics:
382
+ print("\n" + "-" * 80)
383
+ print("SUCCESS FRAME VALUE STATISTICS")
384
+ print("-" * 80)
385
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
386
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
387
+ print(f"Median: {statistics['last_value_median']:.2f}")
388
+ print(f"Min: {statistics['last_value_min']:.2f}")
389
+ print(f"Max: {statistics['last_value_max']:.2f}")
390
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
391
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
392
+
393
+ print("\n" + "-" * 80)
394
+ print("THRESHOLD ANALYSIS")
395
+ print("-" * 80)
396
+ for threshold in [80, 85, 90, 95, 100]:
397
+ count = statistics[f"count_above_{threshold}"]
398
+ percent = statistics[f"percent_above_{threshold}"]
399
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
400
+
401
+ print("\n" + "-" * 80)
402
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
403
+ print("-" * 80)
404
+
405
+ # Save results
406
+ save_results(evaluation_results, statistics, output_dir)
407
+
408
+ # Create plots
409
+ if evaluation_results["results"]:
410
+ plot_value_distribution(evaluation_results, output_dir)
411
+ else:
412
+ print("\nNo successful evaluations to plot.")
413
+
414
+ print("\n" + "=" * 80)
415
+ print("EVALUATION COMPLETE")
416
+ print("=" * 80)
417
+
418
+ return 0
419
+
420
+
421
+ if __name__ == "__main__":
422
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008150925.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import sys
26
+ import time
27
+ from io import BytesIO
28
+ from pathlib import Path
29
+ from typing import Dict, List, Optional
30
+
31
+ import matplotlib.pyplot as plt
32
+ import numpy as np
33
+ import requests
34
+ from PIL import Image
35
+ from tqdm import tqdm
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Helpers
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
+ def read_manifest(manifest_path: Path) -> Dict:
43
+ """Read the test demo manifest JSON file."""
44
+ if not manifest_path.is_file():
45
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
46
+
47
+ with manifest_path.open("r", encoding="utf-8") as f:
48
+ manifest_data = json.load(f)
49
+
50
+ # Convert relative paths to absolute paths
51
+ manifest_dir = manifest_path.parent
52
+ for demo in manifest_data.get("demos", []):
53
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
54
+
55
+ return manifest_data
56
+
57
+
58
+ def image_to_base64(path: Path) -> str:
59
+ """Convert an image file to base64 encoded JPEG."""
60
+ with Image.open(path) as img:
61
+ img = img.convert("RGB")
62
+ buffer = BytesIO()
63
+ img.save(buffer, format="JPEG", quality=95)
64
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
65
+
66
+
67
+ def encode_images(paths: List[str]) -> List[str]:
68
+ """Encode a list of image paths to base64."""
69
+ return [image_to_base64(Path(p)) for p in paths]
70
+
71
+
72
+ def call_trajectory_critic(
73
+ session: requests.Session,
74
+ base_url: str,
75
+ task: str,
76
+ frames_b64: List[str],
77
+ reference_b64: Optional[List[str]],
78
+ timeout: float,
79
+ ) -> Dict:
80
+ """Call the VLAC trajectory-critic endpoint."""
81
+ payload = {
82
+ "task": task,
83
+ "frames": frames_b64,
84
+ "reference": reference_b64,
85
+ "ref_num": len(reference_b64 or []),
86
+ "skip": 1,
87
+ "batch_size": min(len(frames_b64), 8),
88
+ "think": False,
89
+ "return_video": False,
90
+ }
91
+ start = time.time()
92
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
93
+ resp.raise_for_status()
94
+ result = resp.json()
95
+ result["latency_sec"] = time.time() - start
96
+ return result
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Evaluation
101
+ # ---------------------------------------------------------------------------
102
+
103
+
104
+ def evaluate_demos(
105
+ manifest_data: Dict,
106
+ base_url: str,
107
+ timeout: float,
108
+ use_reference: bool = False,
109
+ ) -> Dict[str, any]:
110
+ """Evaluate all demos and collect value statistics."""
111
+ session = requests.Session()
112
+ task_name = manifest_data.get("task_name", "")
113
+ demos = manifest_data.get("demos", [])
114
+
115
+ results = []
116
+ failed_demos = []
117
+
118
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
119
+ print(f"Task: {task_name}")
120
+ print(f"Use reference: {use_reference}\n")
121
+
122
+ for demo in tqdm(demos, desc="Processing demos"):
123
+ demo_name = demo["demo_name"]
124
+ frame_paths = demo["frame_paths"]
125
+
126
+ try:
127
+ # Encode frames
128
+ frames_b64 = encode_images(frame_paths)
129
+
130
+ # For now, no reference trajectory (can be added later)
131
+ reference_b64 = None
132
+
133
+ # Call VLAC service
134
+ result = call_trajectory_critic(
135
+ session=session,
136
+ base_url=base_url,
137
+ task=task_name,
138
+ frames_b64=frames_b64,
139
+ reference_b64=reference_b64,
140
+ timeout=timeout,
141
+ )
142
+
143
+ # Extract values
144
+ value_list = result.get("value_list", [])
145
+ if not value_list:
146
+ print(f"\n[warn] No values returned for demo {demo_name}")
147
+ failed_demos.append(demo_name)
148
+ continue
149
+
150
+ # Record results
151
+ demo_result = {
152
+ "demo_name": demo_name,
153
+ "total_frames": demo["total_frames"],
154
+ "success_index": demo["success_index"],
155
+ "num_sampled_frames": len(frame_paths),
156
+ "value_list": value_list,
157
+ "last_value": value_list[-1], # The critical value for success frame
158
+ "mean_value": float(np.mean(value_list)),
159
+ "std_value": float(np.std(value_list)),
160
+ "latency_sec": result.get("latency_sec", 0.0),
161
+ }
162
+ results.append(demo_result)
163
+
164
+ except requests.RequestException as exc:
165
+ print(f"\n[error] Request failed for demo {demo_name}: {exc}")
166
+ failed_demos.append(demo_name)
167
+ except Exception as exc:
168
+ print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
169
+ failed_demos.append(demo_name)
170
+
171
+ return {
172
+ "task_name": task_name,
173
+ "total_demos": len(demos),
174
+ "successful_evals": len(results),
175
+ "failed_demos": failed_demos,
176
+ "results": results,
177
+ }
178
+
179
+
180
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
181
+ """Compute summary statistics from evaluation results."""
182
+ results = evaluation_results["results"]
183
+ if not results:
184
+ return {}
185
+
186
+ last_values = [r["last_value"] for r in results]
187
+ mean_values = [r["mean_value"] for r in results]
188
+ latencies = [r["latency_sec"] for r in results]
189
+
190
+ stats = {
191
+ "last_value_mean": float(np.mean(last_values)),
192
+ "last_value_std": float(np.std(last_values)),
193
+ "last_value_min": float(np.min(last_values)),
194
+ "last_value_max": float(np.max(last_values)),
195
+ "last_value_median": float(np.median(last_values)),
196
+ "last_value_q25": float(np.percentile(last_values, 25)),
197
+ "last_value_q75": float(np.percentile(last_values, 75)),
198
+ "mean_latency": float(np.mean(latencies)),
199
+ "total_evaluated": len(results),
200
+ }
201
+
202
+ # Count how many demos have last_value >= various thresholds
203
+ for threshold in [80, 85, 90, 95, 100]:
204
+ count = sum(1 for v in last_values if v >= threshold)
205
+ stats[f"count_above_{threshold}"] = count
206
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
207
+
208
+ return stats
209
+
210
+
211
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
212
+ """Create visualization plots for value distribution."""
213
+ results = evaluation_results["results"]
214
+ if not results:
215
+ print("No results to plot")
216
+ return
217
+
218
+ task_name = evaluation_results["task_name"]
219
+ last_values = [r["last_value"] for r in results]
220
+
221
+ # Create figure with multiple subplots
222
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
223
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
224
+
225
+ # 1. Histogram of last values
226
+ ax1 = axes[0, 0]
227
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
228
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
229
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
230
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
231
+ ax1.set_ylabel('Frequency', fontsize=12)
232
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
233
+ ax1.legend()
234
+ ax1.grid(True, alpha=0.3)
235
+
236
+ # 2. Box plot of last values
237
+ ax2 = axes[0, 1]
238
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
239
+ for patch in box_data['boxes']:
240
+ patch.set_facecolor('lightblue')
241
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
242
+ ax2.set_ylabel('Value', fontsize=12)
243
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
244
+ ax2.legend()
245
+ ax2.grid(True, alpha=0.3, axis='y')
246
+
247
+ # 3. Value progression across demos
248
+ ax3 = axes[1, 0]
249
+ demo_indices = range(len(results))
250
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
251
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
252
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
253
+ ax3.set_xlabel('Demo Index', fontsize=12)
254
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
255
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
256
+ ax3.legend()
257
+ ax3.grid(True, alpha=0.3)
258
+
259
+ # 4. Cumulative distribution
260
+ ax4 = axes[1, 1]
261
+ sorted_values = np.sort(last_values)
262
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
263
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
264
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
265
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
266
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
267
+ ax4.set_title('Cumulative Distribution', fontsize=14)
268
+ ax4.legend()
269
+ ax4.grid(True, alpha=0.3)
270
+
271
+ plt.tight_layout()
272
+
273
+ # Save the plot
274
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
275
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
276
+ print(f"\nPlot saved to: {plot_path}")
277
+
278
+ # Also save a PDF version
279
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
280
+ plt.savefig(pdf_path, bbox_inches='tight')
281
+ print(f"PDF saved to: {pdf_path}")
282
+
283
+ plt.close()
284
+
285
+
286
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
287
+ """Save evaluation results and statistics to JSON files."""
288
+ task_name = evaluation_results["task_name"]
289
+
290
+ # Save detailed results
291
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
292
+ with results_path.open("w", encoding="utf-8") as f:
293
+ json.dump(evaluation_results, f, indent=2)
294
+ print(f"\nDetailed results saved to: {results_path}")
295
+
296
+ # Save summary statistics
297
+ stats_path = output_dir / f"{task_name}_statistics.json"
298
+ with stats_path.open("w", encoding="utf-8") as f:
299
+ json.dump(statistics, f, indent=2)
300
+ print(f"Statistics saved to: {stats_path}")
301
+
302
+
303
+ # ---------------------------------------------------------------------------
304
+ # CLI
305
+ # ---------------------------------------------------------------------------
306
+
307
+
308
+ def parse_args() -> argparse.Namespace:
309
+ parser = argparse.ArgumentParser(
310
+ description="Evaluate value estimation for test demonstrations"
311
+ )
312
+ parser.add_argument(
313
+ "--manifest-path",
314
+ type=Path,
315
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
316
+ help="Path to the test manifest JSON file",
317
+ )
318
+ parser.add_argument(
319
+ "--output-dir",
320
+ type=Path,
321
+ default="evaluation_results",
322
+ help="Directory to save evaluation results and plots",
323
+ )
324
+ parser.add_argument(
325
+ "--base-url",
326
+ default="http://localhost:8111",
327
+ help="VLAC service base URL (default: http://localhost:8111)",
328
+ )
329
+ parser.add_argument(
330
+ "--timeout",
331
+ type=float,
332
+ default=30.0,
333
+ help="HTTP request timeout in seconds (default: 30.0)",
334
+ )
335
+ parser.add_argument(
336
+ "--use-reference",
337
+ action="store_true",
338
+ help="Use reference trajectory (if available)",
339
+ )
340
+ return parser.parse_args()
341
+
342
+
343
+ def main() -> int:
344
+ args = parse_args()
345
+
346
+ # Read manifest
347
+ try:
348
+ manifest_data = read_manifest(args.manifest_path)
349
+ except FileNotFoundError as exc:
350
+ print(f"Error: {exc}")
351
+ return 1
352
+
353
+ # Create output directory
354
+ output_dir = args.output_dir.expanduser()
355
+ output_dir.mkdir(parents=True, exist_ok=True)
356
+
357
+ # Run evaluation
358
+ print("=" * 80)
359
+ print("VLAC Value Estimation Evaluation")
360
+ print("=" * 80)
361
+
362
+ evaluation_results = evaluate_demos(
363
+ manifest_data=manifest_data,
364
+ base_url=args.base_url,
365
+ timeout=args.timeout,
366
+ use_reference=args.use_reference,
367
+ )
368
+
369
+ # Compute statistics
370
+ statistics = compute_statistics(evaluation_results)
371
+
372
+ # Print summary
373
+ print("\n" + "=" * 80)
374
+ print("EVALUATION SUMMARY")
375
+ print("=" * 80)
376
+ print(f"Task: {evaluation_results['task_name']}")
377
+ print(f"Total demos: {evaluation_results['total_demos']}")
378
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
379
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
380
+
381
+ if statistics:
382
+ print("\n" + "-" * 80)
383
+ print("SUCCESS FRAME VALUE STATISTICS")
384
+ print("-" * 80)
385
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
386
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
387
+ print(f"Median: {statistics['last_value_median']:.2f}")
388
+ print(f"Min: {statistics['last_value_min']:.2f}")
389
+ print(f"Max: {statistics['last_value_max']:.2f}")
390
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
391
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
392
+
393
+ print("\n" + "-" * 80)
394
+ print("THRESHOLD ANALYSIS")
395
+ print("-" * 80)
396
+ for threshold in [80, 85, 90, 95, 100]:
397
+ count = statistics[f"count_above_{threshold}"]
398
+ percent = statistics[f"percent_above_{threshold}"]
399
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
400
+
401
+ print("\n" + "-" * 80)
402
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
403
+ print("-" * 80)
404
+
405
+ # Save results
406
+ save_results(evaluation_results, statistics, output_dir)
407
+
408
+ # Create plots
409
+ if evaluation_results["results"]:
410
+ plot_value_distribution(evaluation_results, output_dir)
411
+ else:
412
+ print("\nNo successful evaluations to plot.")
413
+
414
+ print("\n" + "=" * 80)
415
+ print("EVALUATION COMPLETE")
416
+ print("=" * 80)
417
+
418
+ return 0
419
+
420
+
421
+ if __name__ == "__main__":
422
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151015.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import sys
26
+ import time
27
+ from io import BytesIO
28
+ from pathlib import Path
29
+ from typing import Dict, List, Optional
30
+
31
+ import matplotlib.pyplot as plt
32
+ import numpy as np
33
+ import requests
34
+ from PIL import Image
35
+ from tqdm import tqdm
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Helpers
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
+ def read_manifest(manifest_path: Path) -> Dict:
43
+ """Read the test demo manifest JSON file."""
44
+ if not manifest_path.is_file():
45
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
46
+
47
+ with manifest_path.open("r", encoding="utf-8") as f:
48
+ manifest_data = json.load(f)
49
+
50
+ # Convert relative paths to absolute paths
51
+ manifest_dir = manifest_path.parent
52
+ for demo in manifest_data.get("demos", []):
53
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
54
+
55
+ return manifest_data
56
+
57
+
58
def image_to_base64(path: Path) -> str:
    """Read the image at *path* and return it as a base64-encoded JPEG string."""
    with Image.open(path) as img:
        rgb = img.convert("RGB")
        sink = BytesIO()
        # Re-encode as JPEG (quality 95) regardless of the source format.
        rgb.save(sink, format="JPEG", quality=95)
    return base64.b64encode(sink.getvalue()).decode("utf-8")
65
+
66
+
67
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file named in *paths*, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
70
+
71
+
72
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the service's JSON response augmented with a ``latency_sec``
    field holding the wall-clock round-trip time.

    Raises:
        requests.HTTPError: if the service responds with an error status.
    """
    reference = reference_b64 or []
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference),
        "skip": 1,
        # Cap the service-side batch size at 8 frames.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started
    return payload
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Evaluation
101
+ # ---------------------------------------------------------------------------
102
+
103
+
104
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, Any]:
    """Run the VLAC trajectory critic over every demo in the manifest.

    Fix: the return annotation previously used the builtin ``any`` function
    (``Dict[str, any]``) instead of ``typing.Any``.

    Args:
        manifest_data: Parsed manifest with ``task_name`` and ``demos`` keys.
        base_url: Base URL of the VLAC service.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: Accepted for CLI parity but currently unused — the
            reference trajectory below is always ``None``. TODO: wire up.

    Returns:
        Summary dict with per-demo results and the list of failed demo names.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            # Encode frames for transport.
            frames_b64 = encode_images(frame_paths)

            # For now, no reference trajectory (can be added later).
            reference_b64 = None

            # Query the VLAC service for this trajectory.
            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            # A response without values is treated as a failed evaluation.
            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            # Record per-demo summary; the last value corresponds to the
            # success frame and is the quantity this script analyzes.
            demo_result = {
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            }
            results.append(demo_result)

        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:  # Keep going on unexpected per-demo errors.
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
178
+
179
+
180
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize the per-demo success-frame values.

    Fix: removed ``mean_values``, which was computed but never used.

    Args:
        evaluation_results: Output of ``evaluate_demos`` (must contain a
            ``results`` list).

    Returns:
        Summary statistics keyed by name, or an empty dict when there are
        no successful evaluations.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [r["last_value"] for r in results]
    latencies = [r["latency_sec"] for r in results]

    stats = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Fraction of demos whose success-frame value clears each threshold.
    for threshold in [80, 85, 90, 95, 100]:
        count = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = count
        stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)

    return stats
209
+
210
+
211
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Render histogram/box/scatter/CDF views of the success-frame values.

    Saves the 2x2 figure both as a 300-dpi PNG and as a PDF under
    *output_dir*, then closes the figure.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    last_values = [r["last_value"] for r in results]
    mean_val = np.mean(last_values)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    ax_hist, ax_box = axes[0, 0], axes[0, 1]
    ax_scatter, ax_cdf = axes[1, 0], axes[1, 1]

    # Panel 1: histogram of success-frame values.
    ax_hist.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax_hist.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax_hist.axvline(mean_val, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_val:.1f})')
    ax_hist.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax_hist.set_ylabel('Frequency', fontsize=12)
    ax_hist.set_title('Distribution of Success Frame Values', fontsize=14)
    ax_hist.legend()
    ax_hist.grid(True, alpha=0.3)

    # Panel 2: box plot.
    box_artists = ax_box.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_artists['boxes']:
        patch.set_facecolor('lightblue')
    ax_box.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax_box.set_ylabel('Value', fontsize=12)
    ax_box.set_title('Success Frame Value Distribution', fontsize=14)
    ax_box.legend()
    ax_box.grid(True, alpha=0.3, axis='y')

    # Panel 3: per-demo scatter of the last values.
    ax_scatter.scatter(range(len(results)), last_values, alpha=0.6, s=50, c='steelblue')
    ax_scatter.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax_scatter.axhline(mean_val, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_val:.1f})')
    ax_scatter.set_xlabel('Demo Index', fontsize=12)
    ax_scatter.set_ylabel('Last Frame Value', fontsize=12)
    ax_scatter.set_title('Success Frame Values Across Demos', fontsize=14)
    ax_scatter.legend()
    ax_scatter.grid(True, alpha=0.3)

    # Panel 4: empirical cumulative distribution.
    ordered = np.sort(last_values)
    cumulative = np.arange(1, len(ordered) + 1) / len(ordered) * 100
    ax_cdf.plot(ordered, cumulative, linewidth=2, color='steelblue')
    ax_cdf.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax_cdf.set_xlabel('Success Frame Value', fontsize=12)
    ax_cdf.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax_cdf.set_title('Cumulative Distribution', fontsize=14)
    ax_cdf.legend()
    ax_cdf.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save as high-resolution PNG.
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # And as a vector PDF for publication use.
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    plt.close()
284
+
285
+
286
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write the detailed results and the summary statistics as JSON files.

    Files are named ``<task>_evaluation_results.json`` and
    ``<task>_statistics.json`` inside *output_dir*.
    """
    task_name = evaluation_results["task_name"]

    # Full per-demo results.
    results_path = output_dir / f"{task_name}_evaluation_results.json"
    with results_path.open("w", encoding="utf-8") as sink:
        json.dump(evaluation_results, sink, indent=2)
    print(f"\nDetailed results saved to: {results_path}")

    # Aggregate statistics.
    stats_path = output_dir / f"{task_name}_statistics.json"
    with stats_path.open("w", encoding="utf-8") as sink:
        json.dump(statistics, sink, indent=2)
    print(f"Statistics saved to: {stats_path}")
301
+
302
+
303
+ # ---------------------------------------------------------------------------
304
+ # CLI
305
+ # ---------------------------------------------------------------------------
306
+
307
+
308
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface for this script.

    Note: argparse applies ``type=Path`` to string defaults as well, so the
    default manifest/output paths arrive as ``Path`` objects.
    """
    cli = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )
    cli.add_argument(
        "--manifest-path",
        type=Path,
        default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
        help="Path to the test manifest JSON file",
    )
    cli.add_argument(
        "--output-dir",
        type=Path,
        default="evaluation_results",
        help="Directory to save evaluation results and plots",
    )
    cli.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    cli.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    cli.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )
    return cli.parse_args()
341
+
342
+
343
def main() -> int:
    """Entry point: load the manifest, evaluate all demos, report and save.

    Returns:
        Process exit status: 0 on success, 1 when the manifest is missing.
    """
    args = parse_args()

    # Read manifest; a missing file is the only error handled gracefully here.
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    # Create output directory (expanduser resolves a leading "~").
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Run evaluation
    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is empty ({}) when no demo was evaluated successfully,
    # so the detailed sections below are skipped in that case.
    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0
419
+
420
+
421
if __name__ == "__main__":
    # Propagate main()'s exit status to the shell.
    sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151156.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import sys
26
+ import time
27
+ from io import BytesIO
28
+ from pathlib import Path
29
+ from typing import Dict, List, Optional
30
+
31
+ import matplotlib.pyplot as plt
32
+ import numpy as np
33
+ import requests
34
+ from PIL import Image
35
+ from tqdm import tqdm
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Helpers
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
+ def read_manifest(manifest_path: Path) -> Dict:
43
+ """Read the test demo manifest JSON file."""
44
+ if not manifest_path.is_file():
45
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
46
+
47
+ with manifest_path.open("r", encoding="utf-8") as f:
48
+ manifest_data = json.load(f)
49
+
50
+ # Convert relative paths to absolute paths
51
+ manifest_dir = manifest_path.parent
52
+ for demo in manifest_data.get("demos", []):
53
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
54
+
55
+ return manifest_data
56
+
57
+
58
+ def image_to_base64(path: Path) -> str:
59
+ """Convert an image file to base64 encoded JPEG."""
60
+ with Image.open(path) as img:
61
+ img = img.convert("RGB")
62
+ buffer = BytesIO()
63
+ img.save(buffer, format="JPEG", quality=95)
64
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
65
+
66
+
67
+ def encode_images(paths: List[str]) -> List[str]:
68
+ """Encode a list of image paths to base64."""
69
+ return [image_to_base64(Path(p)) for p in paths]
70
+
71
+
72
+ def call_trajectory_critic(
73
+ session: requests.Session,
74
+ base_url: str,
75
+ task: str,
76
+ frames_b64: List[str],
77
+ reference_b64: Optional[List[str]],
78
+ timeout: float,
79
+ ) -> Dict:
80
+ """Call the VLAC trajectory-critic endpoint."""
81
+ payload = {
82
+ "task": task,
83
+ "frames": frames_b64,
84
+ "reference": reference_b64,
85
+ "ref_num": len(reference_b64 or []),
86
+ "skip": 1,
87
+ "batch_size": min(len(frames_b64), 8),
88
+ "think": False,
89
+ "return_video": False,
90
+ }
91
+ start = time.time()
92
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
93
+ resp.raise_for_status()
94
+ result = resp.json()
95
+ result["latency_sec"] = time.time() - start
96
+ return result
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Evaluation
101
+ # ---------------------------------------------------------------------------
102
+
103
+
104
+ def evaluate_demos(
105
+ manifest_data: Dict,
106
+ base_url: str,
107
+ timeout: float,
108
+ use_reference: bool = False,
109
+ ) -> Dict[str, any]:
110
+ """Evaluate all demos and collect value statistics."""
111
+ session = requests.Session()
112
+ task_name = manifest_data.get("task_name", "")
113
+ demos = manifest_data.get("demos", [])
114
+
115
+ results = []
116
+ failed_demos = []
117
+
118
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
119
+ print(f"Task: {task_name}")
120
+ print(f"Use reference: {use_reference}\n")
121
+
122
+ for demo in tqdm(demos, desc="Processing demos"):
123
+ demo_name = demo["demo_name"]
124
+ frame_paths = demo["frame_paths"]
125
+
126
+ try:
127
+ # Encode frames
128
+ frames_b64 = encode_images(frame_paths)
129
+
130
+ # For now, no reference trajectory (can be added later)
131
+ reference_b64 = None
132
+
133
+ # Call VLAC service
134
+ result = call_trajectory_critic(
135
+ session=session,
136
+ base_url=base_url,
137
+ task=task_name,
138
+ frames_b64=frames_b64,
139
+ reference_b64=reference_b64,
140
+ timeout=timeout,
141
+ )
142
+
143
+ # Extract values
144
+ value_list = result.get("value_list", [])
145
+ if not value_list:
146
+ print(f"\n[warn] No values returned for demo {demo_name}")
147
+ failed_demos.append(demo_name)
148
+ continue
149
+
150
+ # Record results
151
+ demo_result = {
152
+ "demo_name": demo_name,
153
+ "total_frames": demo["total_frames"],
154
+ "success_index": demo["success_index"],
155
+ "num_sampled_frames": len(frame_paths),
156
+ "value_list": value_list,
157
+ "last_value": value_list[-1], # The critical value for success frame
158
+ "mean_value": float(np.mean(value_list)),
159
+ "std_value": float(np.std(value_list)),
160
+ "latency_sec": result.get("latency_sec", 0.0),
161
+ }
162
+ results.append(demo_result)
163
+
164
+ except requests.RequestException as exc:
165
+ print(f"\n[error] Request failed for demo {demo_name}: {exc}")
166
+ failed_demos.append(demo_name)
167
+ except Exception as exc:
168
+ print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
169
+ failed_demos.append(demo_name)
170
+
171
+ return {
172
+ "task_name": task_name,
173
+ "total_demos": len(demos),
174
+ "successful_evals": len(results),
175
+ "failed_demos": failed_demos,
176
+ "results": results,
177
+ }
178
+
179
+
180
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
181
+ """Compute summary statistics from evaluation results."""
182
+ results = evaluation_results["results"]
183
+ if not results:
184
+ return {}
185
+
186
+ last_values = [r["last_value"] for r in results]
187
+ mean_values = [r["mean_value"] for r in results]
188
+ latencies = [r["latency_sec"] for r in results]
189
+
190
+ stats = {
191
+ "last_value_mean": float(np.mean(last_values)),
192
+ "last_value_std": float(np.std(last_values)),
193
+ "last_value_min": float(np.min(last_values)),
194
+ "last_value_max": float(np.max(last_values)),
195
+ "last_value_median": float(np.median(last_values)),
196
+ "last_value_q25": float(np.percentile(last_values, 25)),
197
+ "last_value_q75": float(np.percentile(last_values, 75)),
198
+ "mean_latency": float(np.mean(latencies)),
199
+ "total_evaluated": len(results),
200
+ }
201
+
202
+ # Count how many demos have last_value >= various thresholds
203
+ for threshold in [80, 85, 90, 95, 100]:
204
+ count = sum(1 for v in last_values if v >= threshold)
205
+ stats[f"count_above_{threshold}"] = count
206
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
207
+
208
+ return stats
209
+
210
+
211
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
212
+ """Create visualization plots for value distribution."""
213
+ results = evaluation_results["results"]
214
+ if not results:
215
+ print("No results to plot")
216
+ return
217
+
218
+ task_name = evaluation_results["task_name"]
219
+ last_values = [r["last_value"] for r in results]
220
+
221
+ # Create figure with multiple subplots
222
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
223
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
224
+
225
+ # 1. Histogram of last values
226
+ ax1 = axes[0, 0]
227
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
228
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
229
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
230
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
231
+ ax1.set_ylabel('Frequency', fontsize=12)
232
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
233
+ ax1.legend()
234
+ ax1.grid(True, alpha=0.3)
235
+
236
+ # 2. Box plot of last values
237
+ ax2 = axes[0, 1]
238
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
239
+ for patch in box_data['boxes']:
240
+ patch.set_facecolor('lightblue')
241
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
242
+ ax2.set_ylabel('Value', fontsize=12)
243
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
244
+ ax2.legend()
245
+ ax2.grid(True, alpha=0.3, axis='y')
246
+
247
+ # 3. Value progression across demos
248
+ ax3 = axes[1, 0]
249
+ demo_indices = range(len(results))
250
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
251
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
252
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
253
+ ax3.set_xlabel('Demo Index', fontsize=12)
254
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
255
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
256
+ ax3.legend()
257
+ ax3.grid(True, alpha=0.3)
258
+
259
+ # 4. Cumulative distribution
260
+ ax4 = axes[1, 1]
261
+ sorted_values = np.sort(last_values)
262
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
263
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
264
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
265
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
266
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
267
+ ax4.set_title('Cumulative Distribution', fontsize=14)
268
+ ax4.legend()
269
+ ax4.grid(True, alpha=0.3)
270
+
271
+ plt.tight_layout()
272
+
273
+ # Save the plot
274
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
275
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
276
+ print(f"\nPlot saved to: {plot_path}")
277
+
278
+ # Also save a PDF version
279
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
280
+ plt.savefig(pdf_path, bbox_inches='tight')
281
+ print(f"PDF saved to: {pdf_path}")
282
+
283
+ plt.close()
284
+
285
+
286
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
287
+ """Save evaluation results and statistics to JSON files."""
288
+ task_name = evaluation_results["task_name"]
289
+
290
+ # Save detailed results
291
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
292
+ with results_path.open("w", encoding="utf-8") as f:
293
+ json.dump(evaluation_results, f, indent=2)
294
+ print(f"\nDetailed results saved to: {results_path}")
295
+
296
+ # Save summary statistics
297
+ stats_path = output_dir / f"{task_name}_statistics.json"
298
+ with stats_path.open("w", encoding="utf-8") as f:
299
+ json.dump(statistics, f, indent=2)
300
+ print(f"Statistics saved to: {stats_path}")
301
+
302
+
303
+ # ---------------------------------------------------------------------------
304
+ # CLI
305
+ # ---------------------------------------------------------------------------
306
+
307
+
308
+ def parse_args() -> argparse.Namespace:
309
+ parser = argparse.ArgumentParser(
310
+ description="Evaluate value estimation for test demonstrations"
311
+ )
312
+ parser.add_argument(
313
+ "--manifest-path",
314
+ type=Path,
315
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
316
+ help="Path to the test manifest JSON file",
317
+ )
318
+ parser.add_argument(
319
+ "--output-dir",
320
+ type=Path,
321
+ default="evaluation_results",
322
+ help="Directory to save evaluation results and plots",
323
+ )
324
+ parser.add_argument(
325
+ "--base-url",
326
+ default="http://localhost:8111",
327
+ help="VLAC service base URL (default: http://localhost:8111)",
328
+ )
329
+ parser.add_argument(
330
+ "--timeout",
331
+ type=float,
332
+ default=30.0,
333
+ help="HTTP request timeout in seconds (default: 30.0)",
334
+ )
335
+ parser.add_argument(
336
+ "--use-reference",
337
+ action="store_true",
338
+ help="Use reference trajectory (if available)",
339
+ )
340
+ return parser.parse_args()
341
+
342
+
343
+ def main() -> int:
344
+ args = parse_args()
345
+
346
+ # Read manifest
347
+ try:
348
+ manifest_data = read_manifest(args.manifest_path)
349
+ except FileNotFoundError as exc:
350
+ print(f"Error: {exc}")
351
+ return 1
352
+
353
+ # Create output directory
354
+ output_dir = args.output_dir.expanduser()
355
+ output_dir.mkdir(parents=True, exist_ok=True)
356
+
357
+ # Run evaluation
358
+ print("=" * 80)
359
+ print("VLAC Value Estimation Evaluation")
360
+ print("=" * 80)
361
+
362
+ evaluation_results = evaluate_demos(
363
+ manifest_data=manifest_data,
364
+ base_url=args.base_url,
365
+ timeout=args.timeout,
366
+ use_reference=args.use_reference,
367
+ )
368
+
369
+ # Compute statistics
370
+ statistics = compute_statistics(evaluation_results)
371
+
372
+ # Print summary
373
+ print("\n" + "=" * 80)
374
+ print("EVALUATION SUMMARY")
375
+ print("=" * 80)
376
+ print(f"Task: {evaluation_results['task_name']}")
377
+ print(f"Total demos: {evaluation_results['total_demos']}")
378
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
379
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
380
+
381
+ if statistics:
382
+ print("\n" + "-" * 80)
383
+ print("SUCCESS FRAME VALUE STATISTICS")
384
+ print("-" * 80)
385
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
386
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
387
+ print(f"Median: {statistics['last_value_median']:.2f}")
388
+ print(f"Min: {statistics['last_value_min']:.2f}")
389
+ print(f"Max: {statistics['last_value_max']:.2f}")
390
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
391
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
392
+
393
+ print("\n" + "-" * 80)
394
+ print("THRESHOLD ANALYSIS")
395
+ print("-" * 80)
396
+ for threshold in [80, 85, 90, 95, 100]:
397
+ count = statistics[f"count_above_{threshold}"]
398
+ percent = statistics[f"percent_above_{threshold}"]
399
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
400
+
401
+ print("\n" + "-" * 80)
402
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
403
+ print("-" * 80)
404
+
405
+ # Save results
406
+ save_results(evaluation_results, statistics, output_dir)
407
+
408
+ # Create plots
409
+ if evaluation_results["results"]:
410
+ plot_value_distribution(evaluation_results, output_dir)
411
+ else:
412
+ print("\nNo successful evaluations to plot.")
413
+
414
+ print("\n" + "=" * 80)
415
+ print("EVALUATION COMPLETE")
416
+ print("=" * 80)
417
+
418
+ return 0
419
+
420
+
421
+ if __name__ == "__main__":
422
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151427.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import os
26
+ import glob
27
+ import sys
28
+ import time
29
+ from io import BytesIO
30
+ from pathlib import Path
31
+ from typing import Dict, List, Optional
32
+
33
+ import matplotlib.pyplot as plt
34
+ import numpy as np
35
+ import requests
36
+ from PIL import Image
37
+ from tqdm import tqdm
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Helpers
41
+ # ---------------------------------------------------------------------------
42
+
43
def sample_fixed_interval_frames(image_list: list, num_frames: int) -> list:
    """Sample ``num_frames`` entries from ``image_list`` at equal intervals.

    The first and last entries are always included when the list has more
    than one element.

    NOTE(review): the ``num_frames == 3`` branch always takes ``image_list[1]``
    as the middle frame rather than the true midpoint — confirm intended.

    Raises:
        ValueError: if ``image_list`` is empty.
    """
    # sample num_frames frames from image_list
    # sample with equal interval while also ensuring the first and the last frames are included
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    elif len(image_list) == 1:
        # Only one frame available: repeat it num_frames times.
        return [image_list[0]] * num_frames
    elif num_frames == 2:
        # First and last frame only.
        return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
    elif num_frames == 3:
        # First, second, and last frame (see NOTE in the docstring).
        return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
    else:
        # Evenly spaced indices, inclusive of both endpoints.
        total_frames = len(image_list)
        indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
        sampled_frames = [image_list[i] for i in indices]
        return sampled_frames
59
+
60
+
61
# Number of reference frames sampled per task for the trajectory critic.
num_frames_for_reference = 8
# Hard-coded, machine-local root of the single expert demo frames.
# NOTE(review): this block runs at import time; if a task directory is missing,
# glob returns [] and sample_fixed_interval_frames raises ValueError on import.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
# The ten LIBERO-10 task names; each maps to a "<task>_demo" frame directory.
libero_10_task_list = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
]
# task name -> list of reference frames held as numpy arrays (NOT base64 strings).
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
    ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
    # Lexicographic sort; assumes zero-padded frame filenames — TODO confirm.
    ref_frm_file_list.sort()
    reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
    # presumably (H, W, C) RGB arrays; grayscale PNGs would yield (H, W) — verify.
    reference_frames_temp = [np.array(Image.open(frame)) for frame in reference_frames_temp]
    reference_frames_dict[task_name] = reference_frames_temp
83
+
84
+
85
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and absolutize its per-demo frame paths.

    Frame paths in the manifest are stored relative to the manifest's own
    directory; they are rewritten in place to strings rooted there.

    Raises:
        FileNotFoundError: if ``manifest_path`` does not point at a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    manifest_data = json.loads(manifest_path.read_text(encoding="utf-8"))

    base_dir = manifest_path.parent
    for demo in manifest_data.get("demos", []):
        demo["frame_paths"] = [str(base_dir / rel) for rel in demo["frame_paths"]]

    return manifest_data
99
+
100
+
101
def image_to_base64(path: Path) -> str:
    """Read an image from disk and return it as a base64-encoded JPEG string."""
    buffer = BytesIO()
    with Image.open(path) as img:
        # Force RGB so JPEG encoding also works for palette/RGBA sources.
        img.convert("RGB").save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
108
+
109
+
110
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in ``paths``, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
113
+
114
+
115
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the decoded JSON response with an added ``latency_sec`` field.
    Raises ``requests.HTTPError`` on non-2xx responses (via raise_for_status).
    """
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # Cap the critic batch size at 8 frames per forward pass.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    started_at = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started_at
    return payload
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Evaluation
144
+ # ---------------------------------------------------------------------------
145
+
146
+
147
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, any]:
    """Run the VLAC trajectory critic over every demo in the manifest.

    Args:
        manifest_data: Parsed manifest with ``task_name`` and ``demos`` entries.
        base_url: Root URL of the VLAC service.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: Currently not consulted; the reference frames for the
            manifest's task are always sent.  NOTE(review): wire this flag up
            or remove it.

    Returns:
        Summary dict with per-demo results and the list of failed demo names.
    """

    def _frame_to_b64(frame) -> str:
        # Serialize one reference frame (a numpy image array) to base64 JPEG,
        # matching the encoding used for the demo frames themselves.
        buffer = BytesIO()
        Image.fromarray(frame).convert("RGB").save(buffer, format="JPEG", quality=95)
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            # Encode frames
            frames_b64 = encode_images(frame_paths)

            # BUG FIX: reference_frames_dict holds numpy arrays, which are not
            # JSON-serializable and made every request raise.  Encode them to
            # base64 JPEG strings like the demo frames before sending.
            reference_b64 = [_frame_to_b64(f) for f in reference_frames_dict[task_name]]

            # Call VLAC service
            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            # Extract values
            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            # Record results
            demo_result = {
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            }
            results.append(demo_result)

        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
221
+
222
+
223
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize the last-frame (success) values across all evaluated demos.

    Returns an empty dict when there are no successful evaluations.  Besides
    central moments and quartiles, it reports, for each threshold in
    {80, 85, 90, 95, 100}, how many (and what fraction of) demos reached it.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [r["last_value"] for r in results]
    latencies = [r["latency_sec"] for r in results]

    stats = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Count how many demos have last_value >= various thresholds
    # (removed an unused `mean_values` local from the original).
    for threshold in [80, 85, 90, 95, 100]:
        count = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = count
        stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)

    return stats
252
+
253
+
254
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create visualization plots for value distribution.

    Renders a 2x2 panel (histogram, box plot, per-demo scatter, cumulative
    distribution) of the last-frame "success" values and writes both a PNG
    and a PDF named after the task into ``output_dir``.  No-op (with a
    message) when there are no successful evaluations.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    # One value per demo: the critic's estimate at the final (success) frame.
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values (red dashed line marks the ideal value 100)
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos (scatter, in manifest order)
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution (empirical CDF in percent)
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Close the figure to release matplotlib's global state/memory.
    plt.close()
327
+
328
+
329
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write per-demo results and summary statistics as JSON under ``output_dir``."""
    task_name = evaluation_results["task_name"]

    # Detailed per-demo results
    results_path = output_dir / f"{task_name}_evaluation_results.json"
    results_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {results_path}")

    # Summary statistics
    stats_path = output_dir / f"{task_name}_statistics.json"
    stats_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {stats_path}")
344
+
345
+
346
+ # ---------------------------------------------------------------------------
347
+ # CLI
348
+ # ---------------------------------------------------------------------------
349
+
350
+
351
def parse_args() -> argparse.Namespace:
    """Build and parse the command-line interface for this evaluation script."""
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )
    parser.add_argument(
        "--manifest-path",
        type=Path,
        default=(
            "toy_test_demos_LIBERO_10/"
            "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/"
            "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json"
        ),
        help="Path to the test manifest JSON file",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default="evaluation_results",
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )
    return parser.parse_args()
384
+
385
+
386
def main() -> int:
    """Run the full evaluation pipeline.

    Reads the manifest, queries the VLAC service for every demo, prints a
    textual summary, saves JSON results/statistics, and writes plots.

    Returns:
        Process exit code: 0 on success, 1 when the manifest is missing.
    """
    args = parse_args()

    # Read manifest
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Run evaluation
    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is {} when nothing evaluated successfully; skip details then.
    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0


if __name__ == "__main__":
    sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151542.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import os
26
+ import glob
27
+ import sys
28
+ import time
29
+ from io import BytesIO
30
+ from pathlib import Path
31
+ from typing import Dict, List, Optional
32
+
33
+ import matplotlib.pyplot as plt
34
+ import numpy as np
35
+ import requests
36
+ from PIL import Image
37
+ from tqdm import tqdm
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Helpers
41
+ # ---------------------------------------------------------------------------
42
+
43
def sample_fixed_interval_frames(image_list, num_frames):
    """Pick ``num_frames`` entries from ``image_list`` at (roughly) equal intervals.

    The first and last entries are always included; a single-element list is
    simply repeated ``num_frames`` times.

    Raises:
        ValueError: if ``image_list`` is empty.
    """
    if not image_list:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        return [image_list[0]] * num_frames
    if num_frames == 2:
        # num_frames // 2 == 1 here, so this yields [first, last].
        half = num_frames // 2
        return [image_list[0]] * half + [image_list[-1]] * half
    if num_frames == 3:
        # NOTE(review): the middle slot reuses image_list[1] (the second frame),
        # not the temporal midpoint — kept as-is to preserve behavior.
        return [image_list[0]] + [image_list[1]] * (num_frames - 2) + [image_list[-1]]
    last_index = len(image_list) - 1
    picks = np.linspace(start=0, stop=last_index, num=num_frames, dtype=int)
    return [image_list[i] for i in picks]
59
+
60
+
61
# Number of reference frames sampled per task for the trajectory critic.
num_frames_for_reference = 8
# Hard-coded, machine-local root of the single expert demo frames.
# NOTE(review): this block runs at import time; if a task directory is missing,
# glob returns [] and sample_fixed_interval_frames raises ValueError on import.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
# The ten LIBERO-10 task names; each maps to a "<task>_demo" frame directory.
libero_10_task_list = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
]
# task name -> list of reference frames held as numpy arrays (NOT base64 strings).
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
    ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
    # Lexicographic sort; assumes zero-padded frame filenames — TODO confirm.
    ref_frm_file_list.sort()
    reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
    # presumably (H, W, C) RGB arrays; grayscale PNGs would yield (H, W) — verify.
    reference_frames_temp = [np.array(Image.open(frame)) for frame in reference_frames_temp]
    reference_frames_dict[task_name] = reference_frames_temp
83
+
84
+
85
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and absolutize its per-demo frame paths.

    Frame paths in the manifest are stored relative to the manifest's own
    directory; they are rewritten in place to strings rooted there.

    Raises:
        FileNotFoundError: if ``manifest_path`` does not point at a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    manifest_data = json.loads(manifest_path.read_text(encoding="utf-8"))

    base_dir = manifest_path.parent
    for demo in manifest_data.get("demos", []):
        demo["frame_paths"] = [str(base_dir / rel) for rel in demo["frame_paths"]]

    return manifest_data
99
+
100
+
101
def image_to_base64(path: Path) -> str:
    """Read an image from disk and return it as a base64-encoded JPEG string."""
    buffer = BytesIO()
    with Image.open(path) as img:
        # Force RGB so JPEG encoding also works for palette/RGBA sources.
        img.convert("RGB").save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
108
+
109
+
110
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in ``paths``, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
113
+
114
+
115
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the decoded JSON response with an added ``latency_sec`` field.
    Raises ``requests.HTTPError`` on non-2xx responses (via raise_for_status).
    """
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # Cap the critic batch size at 8 frames per forward pass.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    started_at = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started_at
    return payload
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Evaluation
144
+ # ---------------------------------------------------------------------------
145
+
146
+
147
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, any]:
    """Run the VLAC trajectory critic over every demo in the manifest.

    Args:
        manifest_data: Parsed manifest with ``task_name`` and ``demos`` entries.
        base_url: Root URL of the VLAC service.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: Currently not consulted; the reference frames for the
            manifest's task are always sent.  NOTE(review): wire this flag up
            or remove it.

    Returns:
        Summary dict with per-demo results and the list of failed demo names.
    """

    def _frame_to_b64(frame) -> str:
        # Serialize one reference frame (a numpy image array) to base64 JPEG,
        # matching the encoding used for the demo frames themselves.
        buffer = BytesIO()
        Image.fromarray(frame).convert("RGB").save(buffer, format="JPEG", quality=95)
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            # Encode frames
            frames_b64 = encode_images(frame_paths)

            print(f"Using reference frames for task {task_name}")
            # BUG FIX: reference_frames_dict holds numpy arrays, which are not
            # JSON-serializable and made every request raise.  Encode them to
            # base64 JPEG strings like the demo frames before sending.
            reference_b64 = [_frame_to_b64(f) for f in reference_frames_dict[task_name]]

            # Call VLAC service
            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            # Extract values
            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            # Record results
            demo_result = {
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            }
            results.append(demo_result)

        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
222
+
223
+
224
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize the last-frame (success) values across all evaluated demos.

    Returns an empty dict when there are no successful evaluations.  Besides
    central moments and quartiles, it reports, for each threshold in
    {80, 85, 90, 95, 100}, how many (and what fraction of) demos reached it.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [r["last_value"] for r in results]
    latencies = [r["latency_sec"] for r in results]

    stats = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Count how many demos have last_value >= various thresholds
    # (removed an unused `mean_values` local from the original).
    for threshold in [80, 85, 90, 95, 100]:
        count = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = count
        stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)

    return stats
253
+
254
+
255
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create visualization plots for value distribution.

    Renders a 2x2 panel (histogram, box plot, per-demo scatter, cumulative
    distribution) of the last-frame "success" values and writes both a PNG
    and a PDF named after the task into ``output_dir``.  No-op (with a
    message) when there are no successful evaluations.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    # One value per demo: the critic's estimate at the final (success) frame.
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values (red dashed line marks the ideal value 100)
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos (scatter, in manifest order)
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution (empirical CDF in percent)
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Close the figure to release matplotlib's global state/memory.
    plt.close()
328
+
329
+
330
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write per-demo results and summary statistics as JSON under ``output_dir``."""
    task_name = evaluation_results["task_name"]

    # Detailed per-demo results
    results_path = output_dir / f"{task_name}_evaluation_results.json"
    results_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {results_path}")

    # Summary statistics
    stats_path = output_dir / f"{task_name}_statistics.json"
    stats_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {stats_path}")
345
+
346
+
347
+ # ---------------------------------------------------------------------------
348
+ # CLI
349
+ # ---------------------------------------------------------------------------
350
+
351
+
352
def parse_args() -> argparse.Namespace:
    """Build and parse the command-line interface for this evaluation script."""
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )
    parser.add_argument(
        "--manifest-path",
        type=Path,
        default=(
            "toy_test_demos_LIBERO_10/"
            "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/"
            "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json"
        ),
        help="Path to the test manifest JSON file",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default="evaluation_results",
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )
    return parser.parse_args()
385
+
386
+
387
def main() -> int:
    """Run the full evaluation pipeline.

    Reads the manifest, queries the VLAC service for every demo, prints a
    textual summary, saves JSON results/statistics, and writes plots.

    Returns:
        Process exit code: 0 on success, 1 when the manifest is missing.
    """
    args = parse_args()

    # Read manifest
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Run evaluation
    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is {} when nothing evaluated successfully; skip details then.
    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0


if __name__ == "__main__":
    sys.exit(main())
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151723.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import os
26
+ import glob
27
+ import sys
28
+ import time
29
+ from io import BytesIO
30
+ from pathlib import Path
31
+ from typing import Dict, List, Optional
32
+
33
+ import matplotlib.pyplot as plt
34
+ import numpy as np
35
+ import requests
36
+ from PIL import Image
37
+ from tqdm import tqdm
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Helpers
41
+ # ---------------------------------------------------------------------------
42
+
43
+ def sample_fixed_interval_frames(image_list, num_frames):
44
+ # sample num_frames frames from image_list
45
+ # sample with equal interval while also ensuring the first and the last frames are included
46
+ if len(image_list) == 0:
47
+ raise ValueError("image_list is empty")
48
+ elif len(image_list) == 1:
49
+ return [image_list[0]] * num_frames
50
+ elif num_frames == 2:
51
+ return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
52
+ elif num_frames == 3:
53
+ return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
54
+ else:
55
+ total_frames = len(image_list)
56
+ indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
57
+ sampled_frames = [image_list[i] for i in indices]
58
+ return sampled_frames
59
+
60
+
61
+ num_frames_for_reference = 8
62
+ ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
63
+ libero_10_task_list = [
64
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
65
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
66
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
67
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
68
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
69
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
70
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
71
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
72
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
73
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
74
+ ]
75
+ reference_frames_dict = {}
76
+ for task_name in libero_10_task_list:
77
+ ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
78
+ ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
79
+ ref_frm_file_list.sort()
80
+ reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
81
+ reference_frames_temp = [np.array(Image.open(frame)) for frame in reference_frames_temp]
82
+ reference_frames_dict[task_name] = reference_frames_temp
83
+
84
+
85
+ def read_manifest(manifest_path: Path) -> Dict:
86
+ """Read the test demo manifest JSON file."""
87
+ if not manifest_path.is_file():
88
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
89
+
90
+ with manifest_path.open("r", encoding="utf-8") as f:
91
+ manifest_data = json.load(f)
92
+
93
+ # Convert relative paths to absolute paths
94
+ manifest_dir = manifest_path.parent
95
+ for demo in manifest_data.get("demos", []):
96
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
97
+
98
+ return manifest_data
99
+
100
+
101
+ def image_to_base64(path: Path) -> str:
102
+ """Convert an image file to base64 encoded JPEG."""
103
+ with Image.open(path) as img:
104
+ img = img.convert("RGB")
105
+ buffer = BytesIO()
106
+ img.save(buffer, format="JPEG", quality=95)
107
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
108
+
109
+
110
+ def encode_images(paths: List[str]) -> List[str]:
111
+ """Encode a list of image paths to base64."""
112
+ return [image_to_base64(Path(p)) for p in paths]
113
+
114
+
115
+ def call_trajectory_critic(
116
+ session: requests.Session,
117
+ base_url: str,
118
+ task: str,
119
+ frames_b64: List[str],
120
+ reference_b64: Optional[List[str]],
121
+ timeout: float,
122
+ ) -> Dict:
123
+ """Call the VLAC trajectory-critic endpoint."""
124
+ payload = {
125
+ "task": task,
126
+ "frames": frames_b64,
127
+ "reference": reference_b64,
128
+ "ref_num": len(reference_b64 or []),
129
+ "skip": 1,
130
+ "batch_size": min(len(frames_b64), 8),
131
+ "think": False,
132
+ "return_video": False,
133
+ }
134
+ start = time.time()
135
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
136
+ resp.raise_for_status()
137
+ result = resp.json()
138
+ result["latency_sec"] = time.time() - start
139
+ return result
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Evaluation
144
+ # ---------------------------------------------------------------------------
145
+
146
+
147
+ def evaluate_demos(
148
+ manifest_data: Dict,
149
+ base_url: str,
150
+ timeout: float,
151
+ use_reference: bool = False,
152
+ ) -> Dict[str, any]:
153
+ """Evaluate all demos and collect value statistics."""
154
+ session = requests.Session()
155
+ task_name = manifest_data.get("task_name", "")
156
+ demos = manifest_data.get("demos", [])
157
+
158
+ results = []
159
+ failed_demos = []
160
+
161
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
162
+ print(f"Task: {task_name}")
163
+ print(f"Use reference: {use_reference}\n")
164
+
165
+ for demo in tqdm(demos, desc="Processing demos"):
166
+ demo_name = demo["demo_name"]
167
+ frame_paths = demo["frame_paths"]
168
+
169
+ # try:
170
+ # Encode frames
171
+ frames_b64 = encode_images(frame_paths)
172
+
173
+ # For now, no reference trajectory (can be added later)
174
+ print(f"Using reference frames for task {task_name}")
175
+ reference_b64 = reference_frames_dict[task_name]
176
+
177
+ # Call VLAC service
178
+ result = call_trajectory_critic(
179
+ session=session,
180
+ base_url=base_url,
181
+ task=task_name,
182
+ frames_b64=frames_b64,
183
+ reference_b64=reference_b64,
184
+ timeout=timeout,
185
+ )
186
+
187
+ # Extract values
188
+ value_list = result.get("value_list", [])
189
+ if not value_list:
190
+ print(f"\n[warn] No values returned for demo {demo_name}")
191
+ failed_demos.append(demo_name)
192
+ continue
193
+
194
+ # Record results
195
+ demo_result = {
196
+ "demo_name": demo_name,
197
+ "total_frames": demo["total_frames"],
198
+ "success_index": demo["success_index"],
199
+ "num_sampled_frames": len(frame_paths),
200
+ "value_list": value_list,
201
+ "last_value": value_list[-1], # The critical value for success frame
202
+ "mean_value": float(np.mean(value_list)),
203
+ "std_value": float(np.std(value_list)),
204
+ "latency_sec": result.get("latency_sec", 0.0),
205
+ }
206
+ results.append(demo_result)
207
+
208
+ # except requests.RequestException as exc:
209
+ # print(f"\n[error] Request failed for demo {demo_name}: {exc}")
210
+ # failed_demos.append(demo_name)
211
+ # except Exception as exc:
212
+ # print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
213
+ # failed_demos.append(demo_name)
214
+
215
+ return {
216
+ "task_name": task_name,
217
+ "total_demos": len(demos),
218
+ "successful_evals": len(results),
219
+ "failed_demos": failed_demos,
220
+ "results": results,
221
+ }
222
+
223
+
224
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
225
+ """Compute summary statistics from evaluation results."""
226
+ results = evaluation_results["results"]
227
+ if not results:
228
+ return {}
229
+
230
+ last_values = [r["last_value"] for r in results]
231
+ mean_values = [r["mean_value"] for r in results]
232
+ latencies = [r["latency_sec"] for r in results]
233
+
234
+ stats = {
235
+ "last_value_mean": float(np.mean(last_values)),
236
+ "last_value_std": float(np.std(last_values)),
237
+ "last_value_min": float(np.min(last_values)),
238
+ "last_value_max": float(np.max(last_values)),
239
+ "last_value_median": float(np.median(last_values)),
240
+ "last_value_q25": float(np.percentile(last_values, 25)),
241
+ "last_value_q75": float(np.percentile(last_values, 75)),
242
+ "mean_latency": float(np.mean(latencies)),
243
+ "total_evaluated": len(results),
244
+ }
245
+
246
+ # Count how many demos have last_value >= various thresholds
247
+ for threshold in [80, 85, 90, 95, 100]:
248
+ count = sum(1 for v in last_values if v >= threshold)
249
+ stats[f"count_above_{threshold}"] = count
250
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
251
+
252
+ return stats
253
+
254
+
255
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
256
+ """Create visualization plots for value distribution."""
257
+ results = evaluation_results["results"]
258
+ if not results:
259
+ print("No results to plot")
260
+ return
261
+
262
+ task_name = evaluation_results["task_name"]
263
+ last_values = [r["last_value"] for r in results]
264
+
265
+ # Create figure with multiple subplots
266
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
267
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
268
+
269
+ # 1. Histogram of last values
270
+ ax1 = axes[0, 0]
271
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
272
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
273
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
274
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
275
+ ax1.set_ylabel('Frequency', fontsize=12)
276
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
277
+ ax1.legend()
278
+ ax1.grid(True, alpha=0.3)
279
+
280
+ # 2. Box plot of last values
281
+ ax2 = axes[0, 1]
282
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
283
+ for patch in box_data['boxes']:
284
+ patch.set_facecolor('lightblue')
285
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
286
+ ax2.set_ylabel('Value', fontsize=12)
287
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
288
+ ax2.legend()
289
+ ax2.grid(True, alpha=0.3, axis='y')
290
+
291
+ # 3. Value progression across demos
292
+ ax3 = axes[1, 0]
293
+ demo_indices = range(len(results))
294
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
295
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
296
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
297
+ ax3.set_xlabel('Demo Index', fontsize=12)
298
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
299
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
300
+ ax3.legend()
301
+ ax3.grid(True, alpha=0.3)
302
+
303
+ # 4. Cumulative distribution
304
+ ax4 = axes[1, 1]
305
+ sorted_values = np.sort(last_values)
306
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
307
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
308
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
309
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
310
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
311
+ ax4.set_title('Cumulative Distribution', fontsize=14)
312
+ ax4.legend()
313
+ ax4.grid(True, alpha=0.3)
314
+
315
+ plt.tight_layout()
316
+
317
+ # Save the plot
318
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
319
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
320
+ print(f"\nPlot saved to: {plot_path}")
321
+
322
+ # Also save a PDF version
323
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
324
+ plt.savefig(pdf_path, bbox_inches='tight')
325
+ print(f"PDF saved to: {pdf_path}")
326
+
327
+ plt.close()
328
+
329
+
330
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
331
+ """Save evaluation results and statistics to JSON files."""
332
+ task_name = evaluation_results["task_name"]
333
+
334
+ # Save detailed results
335
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
336
+ with results_path.open("w", encoding="utf-8") as f:
337
+ json.dump(evaluation_results, f, indent=2)
338
+ print(f"\nDetailed results saved to: {results_path}")
339
+
340
+ # Save summary statistics
341
+ stats_path = output_dir / f"{task_name}_statistics.json"
342
+ with stats_path.open("w", encoding="utf-8") as f:
343
+ json.dump(statistics, f, indent=2)
344
+ print(f"Statistics saved to: {stats_path}")
345
+
346
+
347
+ # ---------------------------------------------------------------------------
348
+ # CLI
349
+ # ---------------------------------------------------------------------------
350
+
351
+
352
+ def parse_args() -> argparse.Namespace:
353
+ parser = argparse.ArgumentParser(
354
+ description="Evaluate value estimation for test demonstrations"
355
+ )
356
+ parser.add_argument(
357
+ "--manifest-path",
358
+ type=Path,
359
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
360
+ help="Path to the test manifest JSON file",
361
+ )
362
+ parser.add_argument(
363
+ "--output-dir",
364
+ type=Path,
365
+ default="evaluation_results",
366
+ help="Directory to save evaluation results and plots",
367
+ )
368
+ parser.add_argument(
369
+ "--base-url",
370
+ default="http://localhost:8111",
371
+ help="VLAC service base URL (default: http://localhost:8111)",
372
+ )
373
+ parser.add_argument(
374
+ "--timeout",
375
+ type=float,
376
+ default=30.0,
377
+ help="HTTP request timeout in seconds (default: 30.0)",
378
+ )
379
+ parser.add_argument(
380
+ "--use-reference",
381
+ action="store_true",
382
+ help="Use reference trajectory (if available)",
383
+ )
384
+ return parser.parse_args()
385
+
386
+
387
+ def main() -> int:
388
+ args = parse_args()
389
+
390
+ # Read manifest
391
+ try:
392
+ manifest_data = read_manifest(args.manifest_path)
393
+ except FileNotFoundError as exc:
394
+ print(f"Error: {exc}")
395
+ return 1
396
+
397
+ # Create output directory
398
+ output_dir = args.output_dir.expanduser()
399
+ output_dir.mkdir(parents=True, exist_ok=True)
400
+
401
+ # Run evaluation
402
+ print("=" * 80)
403
+ print("VLAC Value Estimation Evaluation")
404
+ print("=" * 80)
405
+
406
+ evaluation_results = evaluate_demos(
407
+ manifest_data=manifest_data,
408
+ base_url=args.base_url,
409
+ timeout=args.timeout,
410
+ use_reference=args.use_reference,
411
+ )
412
+
413
+ # Compute statistics
414
+ statistics = compute_statistics(evaluation_results)
415
+
416
+ # Print summary
417
+ print("\n" + "=" * 80)
418
+ print("EVALUATION SUMMARY")
419
+ print("=" * 80)
420
+ print(f"Task: {evaluation_results['task_name']}")
421
+ print(f"Total demos: {evaluation_results['total_demos']}")
422
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
423
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
424
+
425
+ if statistics:
426
+ print("\n" + "-" * 80)
427
+ print("SUCCESS FRAME VALUE STATISTICS")
428
+ print("-" * 80)
429
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
430
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
431
+ print(f"Median: {statistics['last_value_median']:.2f}")
432
+ print(f"Min: {statistics['last_value_min']:.2f}")
433
+ print(f"Max: {statistics['last_value_max']:.2f}")
434
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
435
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
436
+
437
+ print("\n" + "-" * 80)
438
+ print("THRESHOLD ANALYSIS")
439
+ print("-" * 80)
440
+ for threshold in [80, 85, 90, 95, 100]:
441
+ count = statistics[f"count_above_{threshold}"]
442
+ percent = statistics[f"percent_above_{threshold}"]
443
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
444
+
445
+ print("\n" + "-" * 80)
446
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
447
+ print("-" * 80)
448
+
449
+ # Save results
450
+ save_results(evaluation_results, statistics, output_dir)
451
+
452
+ # Create plots
453
+ if evaluation_results["results"]:
454
+ plot_value_distribution(evaluation_results, output_dir)
455
+ else:
456
+ print("\nNo successful evaluations to plot.")
457
+
458
+ print("\n" + "=" * 80)
459
+ print("EVALUATION COMPLETE")
460
+ print("=" * 80)
461
+
462
+ return 0
463
+
464
+
465
+ if __name__ == "__main__":
466
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008151816.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
12
+
13
+ Example:
14
+ python evaluate_test_demo_values.py \
15
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
16
+ --output-dir evaluation_results \
17
+ --base-url http://localhost:8111
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import base64
24
+ import json
25
+ import os
26
+ import glob
27
+ import sys
28
+ import time
29
+ from io import BytesIO
30
+ from pathlib import Path
31
+ from typing import Dict, List, Optional
32
+
33
+ import matplotlib.pyplot as plt
34
+ import numpy as np
35
+ import requests
36
+ from PIL import Image
37
+ from tqdm import tqdm
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Helpers
41
+ # ---------------------------------------------------------------------------
42
+
43
+ def sample_fixed_interval_frames(image_list, num_frames):
44
+ # sample num_frames frames from image_list
45
+ # sample with equal interval while also ensuring the first and the last frames are included
46
+ if len(image_list) == 0:
47
+ raise ValueError("image_list is empty")
48
+ elif len(image_list) == 1:
49
+ return [image_list[0]] * num_frames
50
+ elif num_frames == 2:
51
+ return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
52
+ elif num_frames == 3:
53
+ return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
54
+ else:
55
+ total_frames = len(image_list)
56
+ indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
57
+ sampled_frames = [image_list[i] for i in indices]
58
+ return sampled_frames
59
+
60
+
61
+ num_frames_for_reference = 8
62
+ ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
63
+ libero_10_task_list = [
64
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
65
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
66
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
67
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
68
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
69
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
70
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
71
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
72
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
73
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
74
+ ]
75
+ reference_frames_dict = {}
76
+ for task_name in libero_10_task_list:
77
+ ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
78
+ ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
79
+ ref_frm_file_list.sort()
80
+ reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
81
+ reference_frames_dict[task_name] = reference_frames_temp
82
+
83
+
84
+ def read_manifest(manifest_path: Path) -> Dict:
85
+ """Read the test demo manifest JSON file."""
86
+ if not manifest_path.is_file():
87
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
88
+
89
+ with manifest_path.open("r", encoding="utf-8") as f:
90
+ manifest_data = json.load(f)
91
+
92
+ # Convert relative paths to absolute paths
93
+ manifest_dir = manifest_path.parent
94
+ for demo in manifest_data.get("demos", []):
95
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
96
+
97
+ return manifest_data
98
+
99
+
100
+ def image_to_base64(path: Path) -> str:
101
+ """Convert an image file to base64 encoded JPEG."""
102
+ with Image.open(path) as img:
103
+ img = img.convert("RGB")
104
+ buffer = BytesIO()
105
+ img.save(buffer, format="JPEG", quality=95)
106
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
107
+
108
+
109
+ def encode_images(paths: List[str]) -> List[str]:
110
+ """Encode a list of image paths to base64."""
111
+ return [image_to_base64(Path(p)) for p in paths]
112
+
113
+
114
+ def call_trajectory_critic(
115
+ session: requests.Session,
116
+ base_url: str,
117
+ task: str,
118
+ frames_b64: List[str],
119
+ reference_b64: Optional[List[str]],
120
+ timeout: float,
121
+ ) -> Dict:
122
+ """Call the VLAC trajectory-critic endpoint."""
123
+ payload = {
124
+ "task": task,
125
+ "frames": frames_b64,
126
+ "reference": reference_b64,
127
+ "ref_num": len(reference_b64 or []),
128
+ "skip": 1,
129
+ "batch_size": min(len(frames_b64), 8),
130
+ "think": False,
131
+ "return_video": False,
132
+ }
133
+ start = time.time()
134
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
135
+ resp.raise_for_status()
136
+ result = resp.json()
137
+ result["latency_sec"] = time.time() - start
138
+ return result
139
+
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Evaluation
143
+ # ---------------------------------------------------------------------------
144
+
145
+
146
+ def evaluate_demos(
147
+ manifest_data: Dict,
148
+ base_url: str,
149
+ timeout: float,
150
+ use_reference: bool = False,
151
+ ) -> Dict[str, any]:
152
+ """Evaluate all demos and collect value statistics."""
153
+ session = requests.Session()
154
+ task_name = manifest_data.get("task_name", "")
155
+ demos = manifest_data.get("demos", [])
156
+
157
+ results = []
158
+ failed_demos = []
159
+
160
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
161
+ print(f"Task: {task_name}")
162
+ print(f"Use reference: {use_reference}\n")
163
+
164
+ for demo in tqdm(demos, desc="Processing demos"):
165
+ demo_name = demo["demo_name"]
166
+ frame_paths = demo["frame_paths"]
167
+
168
+ # try:
169
+ # Encode frames
170
+ frames_b64 = encode_images(frame_paths)
171
+
172
+ # For now, no reference trajectory (can be added later)
173
+ print(f"Using reference frames for task {task_name}")
174
+ reference_b64 = encode_images(reference_frames_dict[task_name])
175
+
176
+ # Call VLAC service
177
+ result = call_trajectory_critic(
178
+ session=session,
179
+ base_url=base_url,
180
+ task=task_name,
181
+ frames_b64=frames_b64,
182
+ reference_b64=reference_b64,
183
+ timeout=timeout,
184
+ )
185
+
186
+ # Extract values
187
+ value_list = result.get("value_list", [])
188
+ if not value_list:
189
+ print(f"\n[warn] No values returned for demo {demo_name}")
190
+ failed_demos.append(demo_name)
191
+ continue
192
+
193
+ # Record results
194
+ demo_result = {
195
+ "demo_name": demo_name,
196
+ "total_frames": demo["total_frames"],
197
+ "success_index": demo["success_index"],
198
+ "num_sampled_frames": len(frame_paths),
199
+ "value_list": value_list,
200
+ "last_value": value_list[-1], # The critical value for success frame
201
+ "mean_value": float(np.mean(value_list)),
202
+ "std_value": float(np.std(value_list)),
203
+ "latency_sec": result.get("latency_sec", 0.0),
204
+ }
205
+ results.append(demo_result)
206
+
207
+ # except requests.RequestException as exc:
208
+ # print(f"\n[error] Request failed for demo {demo_name}: {exc}")
209
+ # failed_demos.append(demo_name)
210
+ # except Exception as exc:
211
+ # print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
212
+ # failed_demos.append(demo_name)
213
+
214
+ return {
215
+ "task_name": task_name,
216
+ "total_demos": len(demos),
217
+ "successful_evals": len(results),
218
+ "failed_demos": failed_demos,
219
+ "results": results,
220
+ }
221
+
222
+
223
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
224
+ """Compute summary statistics from evaluation results."""
225
+ results = evaluation_results["results"]
226
+ if not results:
227
+ return {}
228
+
229
+ last_values = [r["last_value"] for r in results]
230
+ mean_values = [r["mean_value"] for r in results]
231
+ latencies = [r["latency_sec"] for r in results]
232
+
233
+ stats = {
234
+ "last_value_mean": float(np.mean(last_values)),
235
+ "last_value_std": float(np.std(last_values)),
236
+ "last_value_min": float(np.min(last_values)),
237
+ "last_value_max": float(np.max(last_values)),
238
+ "last_value_median": float(np.median(last_values)),
239
+ "last_value_q25": float(np.percentile(last_values, 25)),
240
+ "last_value_q75": float(np.percentile(last_values, 75)),
241
+ "mean_latency": float(np.mean(latencies)),
242
+ "total_evaluated": len(results),
243
+ }
244
+
245
+ # Count how many demos have last_value >= various thresholds
246
+ for threshold in [80, 85, 90, 95, 100]:
247
+ count = sum(1 for v in last_values if v >= threshold)
248
+ stats[f"count_above_{threshold}"] = count
249
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
250
+
251
+ return stats
252
+
253
+
254
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
255
+ """Create visualization plots for value distribution."""
256
+ results = evaluation_results["results"]
257
+ if not results:
258
+ print("No results to plot")
259
+ return
260
+
261
+ task_name = evaluation_results["task_name"]
262
+ last_values = [r["last_value"] for r in results]
263
+
264
+ # Create figure with multiple subplots
265
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
266
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
267
+
268
+ # 1. Histogram of last values
269
+ ax1 = axes[0, 0]
270
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
271
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
272
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
273
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
274
+ ax1.set_ylabel('Frequency', fontsize=12)
275
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
276
+ ax1.legend()
277
+ ax1.grid(True, alpha=0.3)
278
+
279
+ # 2. Box plot of last values
280
+ ax2 = axes[0, 1]
281
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
282
+ for patch in box_data['boxes']:
283
+ patch.set_facecolor('lightblue')
284
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
285
+ ax2.set_ylabel('Value', fontsize=12)
286
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
287
+ ax2.legend()
288
+ ax2.grid(True, alpha=0.3, axis='y')
289
+
290
+ # 3. Value progression across demos
291
+ ax3 = axes[1, 0]
292
+ demo_indices = range(len(results))
293
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
294
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
295
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
296
+ ax3.set_xlabel('Demo Index', fontsize=12)
297
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
298
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
299
+ ax3.legend()
300
+ ax3.grid(True, alpha=0.3)
301
+
302
+ # 4. Cumulative distribution
303
+ ax4 = axes[1, 1]
304
+ sorted_values = np.sort(last_values)
305
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
306
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
307
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
308
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
309
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
310
+ ax4.set_title('Cumulative Distribution', fontsize=14)
311
+ ax4.legend()
312
+ ax4.grid(True, alpha=0.3)
313
+
314
+ plt.tight_layout()
315
+
316
+ # Save the plot
317
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
318
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
319
+ print(f"\nPlot saved to: {plot_path}")
320
+
321
+ # Also save a PDF version
322
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
323
+ plt.savefig(pdf_path, bbox_inches='tight')
324
+ print(f"PDF saved to: {pdf_path}")
325
+
326
+ plt.close()
327
+
328
+
329
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
330
+ """Save evaluation results and statistics to JSON files."""
331
+ task_name = evaluation_results["task_name"]
332
+
333
+ # Save detailed results
334
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
335
+ with results_path.open("w", encoding="utf-8") as f:
336
+ json.dump(evaluation_results, f, indent=2)
337
+ print(f"\nDetailed results saved to: {results_path}")
338
+
339
+ # Save summary statistics
340
+ stats_path = output_dir / f"{task_name}_statistics.json"
341
+ with stats_path.open("w", encoding="utf-8") as f:
342
+ json.dump(statistics, f, indent=2)
343
+ print(f"Statistics saved to: {stats_path}")
344
+
345
+
346
+ # ---------------------------------------------------------------------------
347
+ # CLI
348
+ # ---------------------------------------------------------------------------
349
+
350
+
351
+ def parse_args() -> argparse.Namespace:
352
+ parser = argparse.ArgumentParser(
353
+ description="Evaluate value estimation for test demonstrations"
354
+ )
355
+ parser.add_argument(
356
+ "--manifest-path",
357
+ type=Path,
358
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
359
+ help="Path to the test manifest JSON file",
360
+ )
361
+ parser.add_argument(
362
+ "--output-dir",
363
+ type=Path,
364
+ default="evaluation_results",
365
+ help="Directory to save evaluation results and plots",
366
+ )
367
+ parser.add_argument(
368
+ "--base-url",
369
+ default="http://localhost:8111",
370
+ help="VLAC service base URL (default: http://localhost:8111)",
371
+ )
372
+ parser.add_argument(
373
+ "--timeout",
374
+ type=float,
375
+ default=30.0,
376
+ help="HTTP request timeout in seconds (default: 30.0)",
377
+ )
378
+ parser.add_argument(
379
+ "--use-reference",
380
+ action="store_true",
381
+ help="Use reference trajectory (if available)",
382
+ )
383
+ return parser.parse_args()
384
+
385
+
386
+ def main() -> int:
387
+ args = parse_args()
388
+
389
+ # Read manifest
390
+ try:
391
+ manifest_data = read_manifest(args.manifest_path)
392
+ except FileNotFoundError as exc:
393
+ print(f"Error: {exc}")
394
+ return 1
395
+
396
+ # Create output directory
397
+ output_dir = args.output_dir.expanduser()
398
+ output_dir.mkdir(parents=True, exist_ok=True)
399
+
400
+ # Run evaluation
401
+ print("=" * 80)
402
+ print("VLAC Value Estimation Evaluation")
403
+ print("=" * 80)
404
+
405
+ evaluation_results = evaluate_demos(
406
+ manifest_data=manifest_data,
407
+ base_url=args.base_url,
408
+ timeout=args.timeout,
409
+ use_reference=args.use_reference,
410
+ )
411
+
412
+ # Compute statistics
413
+ statistics = compute_statistics(evaluation_results)
414
+
415
+ # Print summary
416
+ print("\n" + "=" * 80)
417
+ print("EVALUATION SUMMARY")
418
+ print("=" * 80)
419
+ print(f"Task: {evaluation_results['task_name']}")
420
+ print(f"Total demos: {evaluation_results['total_demos']}")
421
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
422
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
423
+
424
+ if statistics:
425
+ print("\n" + "-" * 80)
426
+ print("SUCCESS FRAME VALUE STATISTICS")
427
+ print("-" * 80)
428
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
429
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
430
+ print(f"Median: {statistics['last_value_median']:.2f}")
431
+ print(f"Min: {statistics['last_value_min']:.2f}")
432
+ print(f"Max: {statistics['last_value_max']:.2f}")
433
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
434
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
435
+
436
+ print("\n" + "-" * 80)
437
+ print("THRESHOLD ANALYSIS")
438
+ print("-" * 80)
439
+ for threshold in [80, 85, 90, 95, 100]:
440
+ count = statistics[f"count_above_{threshold}"]
441
+ percent = statistics[f"percent_above_{threshold}"]
442
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
443
+
444
+ print("\n" + "-" * 80)
445
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
446
+ print("-" * 80)
447
+
448
+ # Save results
449
+ save_results(evaluation_results, statistics, output_dir)
450
+
451
+ # Create plots
452
+ if evaluation_results["results"]:
453
+ plot_value_distribution(evaluation_results, output_dir)
454
+ else:
455
+ print("\nNo successful evaluations to plot.")
456
+
457
+ print("\n" + "=" * 80)
458
+ print("EVALUATION COMPLETE")
459
+ print("=" * 80)
460
+
461
+ return 0
462
+
463
+
464
+ if __name__ == "__main__":
465
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152522.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # Helpers
53
+ # ---------------------------------------------------------------------------
54
+
55
+ def sample_fixed_interval_frames(image_list, num_frames):
56
+ # sample num_frames frames from image_list
57
+ # sample with equal interval while also ensuring the first and the last frames are included
58
+ if len(image_list) == 0:
59
+ raise ValueError("image_list is empty")
60
+ elif len(image_list) == 1:
61
+ return [image_list[0]] * num_frames
62
+ elif num_frames == 2:
63
+ return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
64
+ elif num_frames == 3:
65
+ return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
66
+ else:
67
+ total_frames = len(image_list)
68
+ indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
69
+ sampled_frames = [image_list[i] for i in indices]
70
+ return sampled_frames
71
+
72
+
73
+ num_frames_for_reference = 8
74
+ ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
75
+ libero_10_task_list = [
76
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
77
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
78
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
79
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
80
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
81
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
82
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
83
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
84
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
85
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
86
+ ]
87
+ reference_frames_dict = {}
88
+ for task_name in libero_10_task_list:
89
+ ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
90
+ ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
91
+ ref_frm_file_list.sort()
92
+ reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
93
+ reference_frames_dict[task_name] = reference_frames_temp
94
+
95
+
96
+ def read_manifest(manifest_path: Path) -> Dict:
97
+ """Read the test demo manifest JSON file."""
98
+ if not manifest_path.is_file():
99
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
100
+
101
+ with manifest_path.open("r", encoding="utf-8") as f:
102
+ manifest_data = json.load(f)
103
+
104
+ # Convert relative paths to absolute paths
105
+ manifest_dir = manifest_path.parent
106
+ for demo in manifest_data.get("demos", []):
107
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
108
+
109
+ return manifest_data
110
+
111
+
112
+ def image_to_base64(path: Path) -> str:
113
+ """Convert an image file to base64 encoded JPEG."""
114
+ with Image.open(path) as img:
115
+ img = img.convert("RGB")
116
+ buffer = BytesIO()
117
+ img.save(buffer, format="JPEG", quality=95)
118
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
119
+
120
+
121
+ def encode_images(paths: List[str]) -> List[str]:
122
+ """Encode a list of image paths to base64."""
123
+ return [image_to_base64(Path(p)) for p in paths]
124
+
125
+
126
+ def call_trajectory_critic(
127
+ session: requests.Session,
128
+ base_url: str,
129
+ task: str,
130
+ frames_b64: List[str],
131
+ reference_b64: Optional[List[str]],
132
+ timeout: float,
133
+ ) -> Dict:
134
+ """Call the VLAC trajectory-critic endpoint."""
135
+ payload = {
136
+ "task": task,
137
+ "frames": frames_b64,
138
+ "reference": reference_b64,
139
+ "ref_num": len(reference_b64 or []),
140
+ "skip": 1,
141
+ "batch_size": min(len(frames_b64), 8),
142
+ "think": False,
143
+ "return_video": False,
144
+ }
145
+ start = time.time()
146
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
147
+ resp.raise_for_status()
148
+ result = resp.json()
149
+ result["latency_sec"] = time.time() - start
150
+ return result
151
+
152
+
153
+ # ---------------------------------------------------------------------------
154
+ # Evaluation
155
+ # ---------------------------------------------------------------------------
156
+
157
+
158
+ def evaluate_demos(
159
+ manifest_data: Dict,
160
+ base_url: str,
161
+ timeout: float,
162
+ use_reference: bool = False,
163
+ ) -> Dict[str, any]:
164
+ """Evaluate all demos and collect value statistics."""
165
+ session = requests.Session()
166
+ task_name = manifest_data.get("task_name", "")
167
+ demos = manifest_data.get("demos", [])
168
+
169
+ results = []
170
+ failed_demos = []
171
+
172
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
173
+ print(f"Task: {task_name}")
174
+ print(f"Use reference: {use_reference}\n")
175
+
176
+ for demo in tqdm(demos, desc="Processing demos"):
177
+ demo_name = demo["demo_name"]
178
+ frame_paths = demo["frame_paths"]
179
+
180
+ # try:
181
+ # Encode frames
182
+ frames_b64 = encode_images(frame_paths)
183
+
184
+ # For now, no reference trajectory (can be added later)
185
+ print(f"Using reference frames for task {task_name}")
186
+ reference_b64 = encode_images(reference_frames_dict[task_name])
187
+
188
+ # Call VLAC service
189
+ result = call_trajectory_critic(
190
+ session=session,
191
+ base_url=base_url,
192
+ task=task_name,
193
+ frames_b64=frames_b64,
194
+ reference_b64=reference_b64,
195
+ timeout=timeout,
196
+ )
197
+
198
+ # Extract values
199
+ value_list = result.get("value_list", [])
200
+ if not value_list:
201
+ print(f"\n[warn] No values returned for demo {demo_name}")
202
+ failed_demos.append(demo_name)
203
+ continue
204
+
205
+ # Record results
206
+ demo_result = {
207
+ "demo_name": demo_name,
208
+ "total_frames": demo["total_frames"],
209
+ "success_index": demo["success_index"],
210
+ "num_sampled_frames": len(frame_paths),
211
+ "value_list": value_list,
212
+ "last_value": value_list[-1], # The critical value for success frame
213
+ "mean_value": float(np.mean(value_list)),
214
+ "std_value": float(np.std(value_list)),
215
+ "latency_sec": result.get("latency_sec", 0.0),
216
+ }
217
+ results.append(demo_result)
218
+
219
+ # except requests.RequestException as exc:
220
+ # print(f"\n[error] Request failed for demo {demo_name}: {exc}")
221
+ # failed_demos.append(demo_name)
222
+ # except Exception as exc:
223
+ # print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
224
+ # failed_demos.append(demo_name)
225
+
226
+ return {
227
+ "task_name": task_name,
228
+ "total_demos": len(demos),
229
+ "successful_evals": len(results),
230
+ "failed_demos": failed_demos,
231
+ "results": results,
232
+ }
233
+
234
+
235
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
236
+ """Compute summary statistics from evaluation results."""
237
+ results = evaluation_results["results"]
238
+ if not results:
239
+ return {}
240
+
241
+ last_values = [r["last_value"] for r in results]
242
+ mean_values = [r["mean_value"] for r in results]
243
+ latencies = [r["latency_sec"] for r in results]
244
+
245
+ stats = {
246
+ "last_value_mean": float(np.mean(last_values)),
247
+ "last_value_std": float(np.std(last_values)),
248
+ "last_value_min": float(np.min(last_values)),
249
+ "last_value_max": float(np.max(last_values)),
250
+ "last_value_median": float(np.median(last_values)),
251
+ "last_value_q25": float(np.percentile(last_values, 25)),
252
+ "last_value_q75": float(np.percentile(last_values, 75)),
253
+ "mean_latency": float(np.mean(latencies)),
254
+ "total_evaluated": len(results),
255
+ }
256
+
257
+ # Count how many demos have last_value >= various thresholds
258
+ for threshold in [80, 85, 90, 95, 100]:
259
+ count = sum(1 for v in last_values if v >= threshold)
260
+ stats[f"count_above_{threshold}"] = count
261
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
262
+
263
+ return stats
264
+
265
+
266
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
267
+ """Create visualization plots for value distribution."""
268
+ results = evaluation_results["results"]
269
+ if not results:
270
+ print("No results to plot")
271
+ return
272
+
273
+ task_name = evaluation_results["task_name"]
274
+ last_values = [r["last_value"] for r in results]
275
+
276
+ # Create figure with multiple subplots
277
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
278
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
279
+
280
+ # 1. Histogram of last values
281
+ ax1 = axes[0, 0]
282
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
283
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
284
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
285
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
286
+ ax1.set_ylabel('Frequency', fontsize=12)
287
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
288
+ ax1.legend()
289
+ ax1.grid(True, alpha=0.3)
290
+
291
+ # 2. Box plot of last values
292
+ ax2 = axes[0, 1]
293
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
294
+ for patch in box_data['boxes']:
295
+ patch.set_facecolor('lightblue')
296
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
297
+ ax2.set_ylabel('Value', fontsize=12)
298
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
299
+ ax2.legend()
300
+ ax2.grid(True, alpha=0.3, axis='y')
301
+
302
+ # 3. Value progression across demos
303
+ ax3 = axes[1, 0]
304
+ demo_indices = range(len(results))
305
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
306
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
307
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
308
+ ax3.set_xlabel('Demo Index', fontsize=12)
309
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
310
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
311
+ ax3.legend()
312
+ ax3.grid(True, alpha=0.3)
313
+
314
+ # 4. Cumulative distribution
315
+ ax4 = axes[1, 1]
316
+ sorted_values = np.sort(last_values)
317
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
318
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
319
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
320
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
321
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
322
+ ax4.set_title('Cumulative Distribution', fontsize=14)
323
+ ax4.legend()
324
+ ax4.grid(True, alpha=0.3)
325
+
326
+ plt.tight_layout()
327
+
328
+ # Save the plot
329
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
330
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
331
+ print(f"\nPlot saved to: {plot_path}")
332
+
333
+ # Also save a PDF version
334
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
335
+ plt.savefig(pdf_path, bbox_inches='tight')
336
+ print(f"PDF saved to: {pdf_path}")
337
+
338
+ plt.close()
339
+
340
+
341
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
342
+ """Save evaluation results and statistics to JSON files."""
343
+ task_name = evaluation_results["task_name"]
344
+
345
+ # Save detailed results
346
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
347
+ with results_path.open("w", encoding="utf-8") as f:
348
+ json.dump(evaluation_results, f, indent=2)
349
+ print(f"\nDetailed results saved to: {results_path}")
350
+
351
+ # Save summary statistics
352
+ stats_path = output_dir / f"{task_name}_statistics.json"
353
+ with stats_path.open("w", encoding="utf-8") as f:
354
+ json.dump(statistics, f, indent=2)
355
+ print(f"Statistics saved to: {stats_path}")
356
+
357
+
358
+ # ---------------------------------------------------------------------------
359
+ # CLI
360
+ # ---------------------------------------------------------------------------
361
+
362
+
363
+ def parse_args() -> argparse.Namespace:
364
+ parser = argparse.ArgumentParser(
365
+ description="Evaluate value estimation for test demonstrations"
366
+ )
367
+ parser.add_argument(
368
+ "--manifest-path",
369
+ type=Path,
370
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
371
+ help="Path to the test manifest JSON file",
372
+ )
373
+ parser.add_argument(
374
+ "--output-dir",
375
+ type=Path,
376
+ default="evaluation_results",
377
+ help="Directory to save evaluation results and plots",
378
+ )
379
+ parser.add_argument(
380
+ "--base-url",
381
+ default="http://localhost:8111",
382
+ help="VLAC service base URL (default: http://localhost:8111)",
383
+ )
384
+ parser.add_argument(
385
+ "--timeout",
386
+ type=float,
387
+ default=30.0,
388
+ help="HTTP request timeout in seconds (default: 30.0)",
389
+ )
390
+ parser.add_argument(
391
+ "--use-reference",
392
+ action="store_true",
393
+ help="Use reference trajectory (if available)",
394
+ )
395
+ return parser.parse_args()
396
+
397
+
398
+ def main() -> int:
399
+ args = parse_args()
400
+
401
+ # Read manifest
402
+ try:
403
+ manifest_data = read_manifest(args.manifest_path)
404
+ except FileNotFoundError as exc:
405
+ print(f"Error: {exc}")
406
+ return 1
407
+
408
+ # Create output directory
409
+ output_dir = args.output_dir.expanduser()
410
+ output_dir.mkdir(parents=True, exist_ok=True)
411
+
412
+ # Run evaluation
413
+ print("=" * 80)
414
+ print("VLAC Value Estimation Evaluation")
415
+ print("=" * 80)
416
+
417
+ evaluation_results = evaluate_demos(
418
+ manifest_data=manifest_data,
419
+ base_url=args.base_url,
420
+ timeout=args.timeout,
421
+ use_reference=args.use_reference,
422
+ )
423
+
424
+ # Compute statistics
425
+ statistics = compute_statistics(evaluation_results)
426
+
427
+ # Print summary
428
+ print("\n" + "=" * 80)
429
+ print("EVALUATION SUMMARY")
430
+ print("=" * 80)
431
+ print(f"Task: {evaluation_results['task_name']}")
432
+ print(f"Total demos: {evaluation_results['total_demos']}")
433
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
434
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
435
+
436
+ if statistics:
437
+ print("\n" + "-" * 80)
438
+ print("SUCCESS FRAME VALUE STATISTICS")
439
+ print("-" * 80)
440
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
441
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
442
+ print(f"Median: {statistics['last_value_median']:.2f}")
443
+ print(f"Min: {statistics['last_value_min']:.2f}")
444
+ print(f"Max: {statistics['last_value_max']:.2f}")
445
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
446
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
447
+
448
+ print("\n" + "-" * 80)
449
+ print("THRESHOLD ANALYSIS")
450
+ print("-" * 80)
451
+ for threshold in [80, 85, 90, 95, 100]:
452
+ count = statistics[f"count_above_{threshold}"]
453
+ percent = statistics[f"percent_above_{threshold}"]
454
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
455
+
456
+ print("\n" + "-" * 80)
457
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
458
+ print("-" * 80)
459
+
460
+ # Save results
461
+ save_results(evaluation_results, statistics, output_dir)
462
+
463
+ # Create plots
464
+ if evaluation_results["results"]:
465
+ plot_value_distribution(evaluation_results, output_dir)
466
+ else:
467
+ print("\nNo successful evaluations to plot.")
468
+
469
+ print("\n" + "=" * 80)
470
+ print("EVALUATION COMPLETE")
471
+ print("=" * 80)
472
+
473
+ return 0
474
+
475
+
476
+ if __name__ == "__main__":
477
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152534.py ADDED
@@ -0,0 +1,491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
+ # LIBERO-10 task list
52
+ LIBERO_10_TASKS = [
53
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
54
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
55
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
56
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
57
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
58
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
59
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
60
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
61
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
62
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
63
+ ]
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Helpers
67
+ # ---------------------------------------------------------------------------
68
+
69
+ def sample_fixed_interval_frames(image_list, num_frames):
70
+ # sample num_frames frames from image_list
71
+ # sample with equal interval while also ensuring the first and the last frames are included
72
+ if len(image_list) == 0:
73
+ raise ValueError("image_list is empty")
74
+ elif len(image_list) == 1:
75
+ return [image_list[0]] * num_frames
76
+ elif num_frames == 2:
77
+ return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
78
+ elif num_frames == 3:
79
+ return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
80
+ else:
81
+ total_frames = len(image_list)
82
+ indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
83
+ sampled_frames = [image_list[i] for i in indices]
84
+ return sampled_frames
85
+
86
+
87
+ num_frames_for_reference = 8
88
+ ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
89
+ libero_10_task_list = [
90
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
91
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
92
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
93
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
94
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
95
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
96
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
97
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
98
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
99
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
100
+ ]
101
+ reference_frames_dict = {}
102
+ for task_name in libero_10_task_list:
103
+ ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
104
+ ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
105
+ ref_frm_file_list.sort()
106
+ reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
107
+ reference_frames_dict[task_name] = reference_frames_temp
108
+
109
+
110
+ def read_manifest(manifest_path: Path) -> Dict:
111
+ """Read the test demo manifest JSON file."""
112
+ if not manifest_path.is_file():
113
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
114
+
115
+ with manifest_path.open("r", encoding="utf-8") as f:
116
+ manifest_data = json.load(f)
117
+
118
+ # Convert relative paths to absolute paths
119
+ manifest_dir = manifest_path.parent
120
+ for demo in manifest_data.get("demos", []):
121
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
122
+
123
+ return manifest_data
124
+
125
+
126
+ def image_to_base64(path: Path) -> str:
127
+ """Convert an image file to base64 encoded JPEG."""
128
+ with Image.open(path) as img:
129
+ img = img.convert("RGB")
130
+ buffer = BytesIO()
131
+ img.save(buffer, format="JPEG", quality=95)
132
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
133
+
134
+
135
+ def encode_images(paths: List[str]) -> List[str]:
136
+ """Encode a list of image paths to base64."""
137
+ return [image_to_base64(Path(p)) for p in paths]
138
+
139
+
140
+ def call_trajectory_critic(
141
+ session: requests.Session,
142
+ base_url: str,
143
+ task: str,
144
+ frames_b64: List[str],
145
+ reference_b64: Optional[List[str]],
146
+ timeout: float,
147
+ ) -> Dict:
148
+ """Call the VLAC trajectory-critic endpoint."""
149
+ payload = {
150
+ "task": task,
151
+ "frames": frames_b64,
152
+ "reference": reference_b64,
153
+ "ref_num": len(reference_b64 or []),
154
+ "skip": 1,
155
+ "batch_size": min(len(frames_b64), 8),
156
+ "think": False,
157
+ "return_video": False,
158
+ }
159
+ start = time.time()
160
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
161
+ resp.raise_for_status()
162
+ result = resp.json()
163
+ result["latency_sec"] = time.time() - start
164
+ return result
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Evaluation
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
+ def evaluate_demos(
173
+ manifest_data: Dict,
174
+ base_url: str,
175
+ timeout: float,
176
+ use_reference: bool = False,
177
+ ) -> Dict[str, any]:
178
+ """Evaluate all demos and collect value statistics."""
179
+ session = requests.Session()
180
+ task_name = manifest_data.get("task_name", "")
181
+ demos = manifest_data.get("demos", [])
182
+
183
+ results = []
184
+ failed_demos = []
185
+
186
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
187
+ print(f"Task: {task_name}")
188
+ print(f"Use reference: {use_reference}\n")
189
+
190
+ for demo in tqdm(demos, desc="Processing demos"):
191
+ demo_name = demo["demo_name"]
192
+ frame_paths = demo["frame_paths"]
193
+
194
+ # try:
195
+ # Encode frames
196
+ frames_b64 = encode_images(frame_paths)
197
+
198
+ # For now, no reference trajectory (can be added later)
199
+ print(f"Using reference frames for task {task_name}")
200
+ reference_b64 = encode_images(reference_frames_dict[task_name])
201
+
202
+ # Call VLAC service
203
+ result = call_trajectory_critic(
204
+ session=session,
205
+ base_url=base_url,
206
+ task=task_name,
207
+ frames_b64=frames_b64,
208
+ reference_b64=reference_b64,
209
+ timeout=timeout,
210
+ )
211
+
212
+ # Extract values
213
+ value_list = result.get("value_list", [])
214
+ if not value_list:
215
+ print(f"\n[warn] No values returned for demo {demo_name}")
216
+ failed_demos.append(demo_name)
217
+ continue
218
+
219
+ # Record results
220
+ demo_result = {
221
+ "demo_name": demo_name,
222
+ "total_frames": demo["total_frames"],
223
+ "success_index": demo["success_index"],
224
+ "num_sampled_frames": len(frame_paths),
225
+ "value_list": value_list,
226
+ "last_value": value_list[-1], # The critical value for success frame
227
+ "mean_value": float(np.mean(value_list)),
228
+ "std_value": float(np.std(value_list)),
229
+ "latency_sec": result.get("latency_sec", 0.0),
230
+ }
231
+ results.append(demo_result)
232
+
233
+ # except requests.RequestException as exc:
234
+ # print(f"\n[error] Request failed for demo {demo_name}: {exc}")
235
+ # failed_demos.append(demo_name)
236
+ # except Exception as exc:
237
+ # print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
238
+ # failed_demos.append(demo_name)
239
+
240
+ return {
241
+ "task_name": task_name,
242
+ "total_demos": len(demos),
243
+ "successful_evals": len(results),
244
+ "failed_demos": failed_demos,
245
+ "results": results,
246
+ }
247
+
248
+
249
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
250
+ """Compute summary statistics from evaluation results."""
251
+ results = evaluation_results["results"]
252
+ if not results:
253
+ return {}
254
+
255
+ last_values = [r["last_value"] for r in results]
256
+ mean_values = [r["mean_value"] for r in results]
257
+ latencies = [r["latency_sec"] for r in results]
258
+
259
+ stats = {
260
+ "last_value_mean": float(np.mean(last_values)),
261
+ "last_value_std": float(np.std(last_values)),
262
+ "last_value_min": float(np.min(last_values)),
263
+ "last_value_max": float(np.max(last_values)),
264
+ "last_value_median": float(np.median(last_values)),
265
+ "last_value_q25": float(np.percentile(last_values, 25)),
266
+ "last_value_q75": float(np.percentile(last_values, 75)),
267
+ "mean_latency": float(np.mean(latencies)),
268
+ "total_evaluated": len(results),
269
+ }
270
+
271
+ # Count how many demos have last_value >= various thresholds
272
+ for threshold in [80, 85, 90, 95, 100]:
273
+ count = sum(1 for v in last_values if v >= threshold)
274
+ stats[f"count_above_{threshold}"] = count
275
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
276
+
277
+ return stats
278
+
279
+
280
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
281
+ """Create visualization plots for value distribution."""
282
+ results = evaluation_results["results"]
283
+ if not results:
284
+ print("No results to plot")
285
+ return
286
+
287
+ task_name = evaluation_results["task_name"]
288
+ last_values = [r["last_value"] for r in results]
289
+
290
+ # Create figure with multiple subplots
291
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
292
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
293
+
294
+ # 1. Histogram of last values
295
+ ax1 = axes[0, 0]
296
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
297
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
298
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
299
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
300
+ ax1.set_ylabel('Frequency', fontsize=12)
301
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
302
+ ax1.legend()
303
+ ax1.grid(True, alpha=0.3)
304
+
305
+ # 2. Box plot of last values
306
+ ax2 = axes[0, 1]
307
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
308
+ for patch in box_data['boxes']:
309
+ patch.set_facecolor('lightblue')
310
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
311
+ ax2.set_ylabel('Value', fontsize=12)
312
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
313
+ ax2.legend()
314
+ ax2.grid(True, alpha=0.3, axis='y')
315
+
316
+ # 3. Value progression across demos
317
+ ax3 = axes[1, 0]
318
+ demo_indices = range(len(results))
319
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
320
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
321
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
322
+ ax3.set_xlabel('Demo Index', fontsize=12)
323
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
324
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
325
+ ax3.legend()
326
+ ax3.grid(True, alpha=0.3)
327
+
328
+ # 4. Cumulative distribution
329
+ ax4 = axes[1, 1]
330
+ sorted_values = np.sort(last_values)
331
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
332
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
333
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
334
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
335
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
336
+ ax4.set_title('Cumulative Distribution', fontsize=14)
337
+ ax4.legend()
338
+ ax4.grid(True, alpha=0.3)
339
+
340
+ plt.tight_layout()
341
+
342
+ # Save the plot
343
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
344
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
345
+ print(f"\nPlot saved to: {plot_path}")
346
+
347
+ # Also save a PDF version
348
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
349
+ plt.savefig(pdf_path, bbox_inches='tight')
350
+ print(f"PDF saved to: {pdf_path}")
351
+
352
+ plt.close()
353
+
354
+
355
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
356
+ """Save evaluation results and statistics to JSON files."""
357
+ task_name = evaluation_results["task_name"]
358
+
359
+ # Save detailed results
360
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
361
+ with results_path.open("w", encoding="utf-8") as f:
362
+ json.dump(evaluation_results, f, indent=2)
363
+ print(f"\nDetailed results saved to: {results_path}")
364
+
365
+ # Save summary statistics
366
+ stats_path = output_dir / f"{task_name}_statistics.json"
367
+ with stats_path.open("w", encoding="utf-8") as f:
368
+ json.dump(statistics, f, indent=2)
369
+ print(f"Statistics saved to: {stats_path}")
370
+
371
+
372
+ # ---------------------------------------------------------------------------
373
+ # CLI
374
+ # ---------------------------------------------------------------------------
375
+
376
+
377
+ def parse_args() -> argparse.Namespace:
378
+ parser = argparse.ArgumentParser(
379
+ description="Evaluate value estimation for test demonstrations"
380
+ )
381
+ parser.add_argument(
382
+ "--manifest-path",
383
+ type=Path,
384
+ default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
385
+ help="Path to the test manifest JSON file",
386
+ )
387
+ parser.add_argument(
388
+ "--output-dir",
389
+ type=Path,
390
+ default="evaluation_results",
391
+ help="Directory to save evaluation results and plots",
392
+ )
393
+ parser.add_argument(
394
+ "--base-url",
395
+ default="http://localhost:8111",
396
+ help="VLAC service base URL (default: http://localhost:8111)",
397
+ )
398
+ parser.add_argument(
399
+ "--timeout",
400
+ type=float,
401
+ default=30.0,
402
+ help="HTTP request timeout in seconds (default: 30.0)",
403
+ )
404
+ parser.add_argument(
405
+ "--use-reference",
406
+ action="store_true",
407
+ help="Use reference trajectory (if available)",
408
+ )
409
+ return parser.parse_args()
410
+
411
+
412
+ def main() -> int:
413
+ args = parse_args()
414
+
415
+ # Read manifest
416
+ try:
417
+ manifest_data = read_manifest(args.manifest_path)
418
+ except FileNotFoundError as exc:
419
+ print(f"Error: {exc}")
420
+ return 1
421
+
422
+ # Create output directory
423
+ output_dir = args.output_dir.expanduser()
424
+ output_dir.mkdir(parents=True, exist_ok=True)
425
+
426
+ # Run evaluation
427
+ print("=" * 80)
428
+ print("VLAC Value Estimation Evaluation")
429
+ print("=" * 80)
430
+
431
+ evaluation_results = evaluate_demos(
432
+ manifest_data=manifest_data,
433
+ base_url=args.base_url,
434
+ timeout=args.timeout,
435
+ use_reference=args.use_reference,
436
+ )
437
+
438
+ # Compute statistics
439
+ statistics = compute_statistics(evaluation_results)
440
+
441
+ # Print summary
442
+ print("\n" + "=" * 80)
443
+ print("EVALUATION SUMMARY")
444
+ print("=" * 80)
445
+ print(f"Task: {evaluation_results['task_name']}")
446
+ print(f"Total demos: {evaluation_results['total_demos']}")
447
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
448
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
449
+
450
+ if statistics:
451
+ print("\n" + "-" * 80)
452
+ print("SUCCESS FRAME VALUE STATISTICS")
453
+ print("-" * 80)
454
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
455
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
456
+ print(f"Median: {statistics['last_value_median']:.2f}")
457
+ print(f"Min: {statistics['last_value_min']:.2f}")
458
+ print(f"Max: {statistics['last_value_max']:.2f}")
459
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
460
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
461
+
462
+ print("\n" + "-" * 80)
463
+ print("THRESHOLD ANALYSIS")
464
+ print("-" * 80)
465
+ for threshold in [80, 85, 90, 95, 100]:
466
+ count = statistics[f"count_above_{threshold}"]
467
+ percent = statistics[f"percent_above_{threshold}"]
468
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
469
+
470
+ print("\n" + "-" * 80)
471
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
472
+ print("-" * 80)
473
+
474
+ # Save results
475
+ save_results(evaluation_results, statistics, output_dir)
476
+
477
+ # Create plots
478
+ if evaluation_results["results"]:
479
+ plot_value_distribution(evaluation_results, output_dir)
480
+ else:
481
+ print("\nNo successful evaluations to plot.")
482
+
483
+ print("\n" + "=" * 80)
484
+ print("EVALUATION COMPLETE")
485
+ print("=" * 80)
486
+
487
+ return 0
488
+
489
+
490
+ if __name__ == "__main__":
491
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152548.py ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
+ # LIBERO-10 task list
52
+ LIBERO_10_TASKS = [
53
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
54
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
55
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
56
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
57
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
58
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
59
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
60
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
61
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
62
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
63
+ ]
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Helpers
67
+ # ---------------------------------------------------------------------------
68
+
69
+ def sample_fixed_interval_frames(image_list, num_frames):
70
+ # sample num_frames frames from image_list
71
+ # sample with equal interval while also ensuring the first and the last frames are included
72
+ if len(image_list) == 0:
73
+ raise ValueError("image_list is empty")
74
+ elif len(image_list) == 1:
75
+ return [image_list[0]] * num_frames
76
+ elif num_frames == 2:
77
+ return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
78
+ elif num_frames == 3:
79
+ return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
80
+ else:
81
+ total_frames = len(image_list)
82
+ indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
83
+ sampled_frames = [image_list[i] for i in indices]
84
+ return sampled_frames
85
+
86
+
87
+ num_frames_for_reference = 8
88
+ ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
89
+ libero_10_task_list = [
90
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
91
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
92
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
93
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
94
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
95
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
96
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
97
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
98
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
99
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
100
+ ]
101
+ reference_frames_dict = {}
102
+ for task_name in libero_10_task_list:
103
+ ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
104
+ ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
105
+ ref_frm_file_list.sort()
106
+ reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
107
+ reference_frames_dict[task_name] = reference_frames_temp
108
+
109
+
110
+ def read_manifest(manifest_path: Path) -> Dict:
111
+ """Read the test demo manifest JSON file."""
112
+ if not manifest_path.is_file():
113
+ raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
114
+
115
+ with manifest_path.open("r", encoding="utf-8") as f:
116
+ manifest_data = json.load(f)
117
+
118
+ # Convert relative paths to absolute paths
119
+ manifest_dir = manifest_path.parent
120
+ for demo in manifest_data.get("demos", []):
121
+ demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
122
+
123
+ return manifest_data
124
+
125
+
126
+ def image_to_base64(path: Path) -> str:
127
+ """Convert an image file to base64 encoded JPEG."""
128
+ with Image.open(path) as img:
129
+ img = img.convert("RGB")
130
+ buffer = BytesIO()
131
+ img.save(buffer, format="JPEG", quality=95)
132
+ return base64.b64encode(buffer.getvalue()).decode("utf-8")
133
+
134
+
135
+ def encode_images(paths: List[str]) -> List[str]:
136
+ """Encode a list of image paths to base64."""
137
+ return [image_to_base64(Path(p)) for p in paths]
138
+
139
+
140
+ def call_trajectory_critic(
141
+ session: requests.Session,
142
+ base_url: str,
143
+ task: str,
144
+ frames_b64: List[str],
145
+ reference_b64: Optional[List[str]],
146
+ timeout: float,
147
+ ) -> Dict:
148
+ """Call the VLAC trajectory-critic endpoint."""
149
+ payload = {
150
+ "task": task,
151
+ "frames": frames_b64,
152
+ "reference": reference_b64,
153
+ "ref_num": len(reference_b64 or []),
154
+ "skip": 1,
155
+ "batch_size": min(len(frames_b64), 8),
156
+ "think": False,
157
+ "return_video": False,
158
+ }
159
+ start = time.time()
160
+ resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
161
+ resp.raise_for_status()
162
+ result = resp.json()
163
+ result["latency_sec"] = time.time() - start
164
+ return result
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Evaluation
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
+ def evaluate_demos(
173
+ manifest_data: Dict,
174
+ base_url: str,
175
+ timeout: float,
176
+ use_reference: bool = False,
177
+ ) -> Dict[str, any]:
178
+ """Evaluate all demos and collect value statistics."""
179
+ session = requests.Session()
180
+ task_name = manifest_data.get("task_name", "")
181
+ demos = manifest_data.get("demos", [])
182
+
183
+ results = []
184
+ failed_demos = []
185
+
186
+ print(f"\nEvaluating {len(demos)} test demonstrations...")
187
+ print(f"Task: {task_name}")
188
+ print(f"Use reference: {use_reference}\n")
189
+
190
+ for demo in tqdm(demos, desc="Processing demos"):
191
+ demo_name = demo["demo_name"]
192
+ frame_paths = demo["frame_paths"]
193
+
194
+ # try:
195
+ # Encode frames
196
+ frames_b64 = encode_images(frame_paths)
197
+
198
+ # For now, no reference trajectory (can be added later)
199
+ print(f"Using reference frames for task {task_name}")
200
+ reference_b64 = encode_images(reference_frames_dict[task_name])
201
+
202
+ # Call VLAC service
203
+ result = call_trajectory_critic(
204
+ session=session,
205
+ base_url=base_url,
206
+ task=task_name,
207
+ frames_b64=frames_b64,
208
+ reference_b64=reference_b64,
209
+ timeout=timeout,
210
+ )
211
+
212
+ # Extract values
213
+ value_list = result.get("value_list", [])
214
+ if not value_list:
215
+ print(f"\n[warn] No values returned for demo {demo_name}")
216
+ failed_demos.append(demo_name)
217
+ continue
218
+
219
+ # Record results
220
+ demo_result = {
221
+ "demo_name": demo_name,
222
+ "total_frames": demo["total_frames"],
223
+ "success_index": demo["success_index"],
224
+ "num_sampled_frames": len(frame_paths),
225
+ "value_list": value_list,
226
+ "last_value": value_list[-1], # The critical value for success frame
227
+ "mean_value": float(np.mean(value_list)),
228
+ "std_value": float(np.std(value_list)),
229
+ "latency_sec": result.get("latency_sec", 0.0),
230
+ }
231
+ results.append(demo_result)
232
+
233
+ # except requests.RequestException as exc:
234
+ # print(f"\n[error] Request failed for demo {demo_name}: {exc}")
235
+ # failed_demos.append(demo_name)
236
+ # except Exception as exc:
237
+ # print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
238
+ # failed_demos.append(demo_name)
239
+
240
+ return {
241
+ "task_name": task_name,
242
+ "total_demos": len(demos),
243
+ "successful_evals": len(results),
244
+ "failed_demos": failed_demos,
245
+ "results": results,
246
+ }
247
+
248
+
249
+ def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
250
+ """Compute summary statistics from evaluation results."""
251
+ results = evaluation_results["results"]
252
+ if not results:
253
+ return {}
254
+
255
+ last_values = [r["last_value"] for r in results]
256
+ mean_values = [r["mean_value"] for r in results]
257
+ latencies = [r["latency_sec"] for r in results]
258
+
259
+ stats = {
260
+ "last_value_mean": float(np.mean(last_values)),
261
+ "last_value_std": float(np.std(last_values)),
262
+ "last_value_min": float(np.min(last_values)),
263
+ "last_value_max": float(np.max(last_values)),
264
+ "last_value_median": float(np.median(last_values)),
265
+ "last_value_q25": float(np.percentile(last_values, 25)),
266
+ "last_value_q75": float(np.percentile(last_values, 75)),
267
+ "mean_latency": float(np.mean(latencies)),
268
+ "total_evaluated": len(results),
269
+ }
270
+
271
+ # Count how many demos have last_value >= various thresholds
272
+ for threshold in [80, 85, 90, 95, 100]:
273
+ count = sum(1 for v in last_values if v >= threshold)
274
+ stats[f"count_above_{threshold}"] = count
275
+ stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
276
+
277
+ return stats
278
+
279
+
280
+ def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
281
+ """Create visualization plots for value distribution."""
282
+ results = evaluation_results["results"]
283
+ if not results:
284
+ print("No results to plot")
285
+ return
286
+
287
+ task_name = evaluation_results["task_name"]
288
+ last_values = [r["last_value"] for r in results]
289
+
290
+ # Create figure with multiple subplots
291
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
292
+ fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
293
+
294
+ # 1. Histogram of last values
295
+ ax1 = axes[0, 0]
296
+ ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
297
+ ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
298
+ ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
299
+ ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
300
+ ax1.set_ylabel('Frequency', fontsize=12)
301
+ ax1.set_title('Distribution of Success Frame Values', fontsize=14)
302
+ ax1.legend()
303
+ ax1.grid(True, alpha=0.3)
304
+
305
+ # 2. Box plot of last values
306
+ ax2 = axes[0, 1]
307
+ box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
308
+ for patch in box_data['boxes']:
309
+ patch.set_facecolor('lightblue')
310
+ ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
311
+ ax2.set_ylabel('Value', fontsize=12)
312
+ ax2.set_title('Success Frame Value Distribution', fontsize=14)
313
+ ax2.legend()
314
+ ax2.grid(True, alpha=0.3, axis='y')
315
+
316
+ # 3. Value progression across demos
317
+ ax3 = axes[1, 0]
318
+ demo_indices = range(len(results))
319
+ ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
320
+ ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
321
+ ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
322
+ ax3.set_xlabel('Demo Index', fontsize=12)
323
+ ax3.set_ylabel('Last Frame Value', fontsize=12)
324
+ ax3.set_title('Success Frame Values Across Demos', fontsize=14)
325
+ ax3.legend()
326
+ ax3.grid(True, alpha=0.3)
327
+
328
+ # 4. Cumulative distribution
329
+ ax4 = axes[1, 1]
330
+ sorted_values = np.sort(last_values)
331
+ cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
332
+ ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
333
+ ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
334
+ ax4.set_xlabel('Success Frame Value', fontsize=12)
335
+ ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
336
+ ax4.set_title('Cumulative Distribution', fontsize=14)
337
+ ax4.legend()
338
+ ax4.grid(True, alpha=0.3)
339
+
340
+ plt.tight_layout()
341
+
342
+ # Save the plot
343
+ plot_path = output_dir / f"{task_name}_value_distribution.png"
344
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
345
+ print(f"\nPlot saved to: {plot_path}")
346
+
347
+ # Also save a PDF version
348
+ pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
349
+ plt.savefig(pdf_path, bbox_inches='tight')
350
+ print(f"PDF saved to: {pdf_path}")
351
+
352
+ plt.close()
353
+
354
+
355
+ def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
356
+ """Save evaluation results and statistics to JSON files."""
357
+ task_name = evaluation_results["task_name"]
358
+
359
+ # Save detailed results
360
+ results_path = output_dir / f"{task_name}_evaluation_results.json"
361
+ with results_path.open("w", encoding="utf-8") as f:
362
+ json.dump(evaluation_results, f, indent=2)
363
+ print(f"\nDetailed results saved to: {results_path}")
364
+
365
+ # Save summary statistics
366
+ stats_path = output_dir / f"{task_name}_statistics.json"
367
+ with stats_path.open("w", encoding="utf-8") as f:
368
+ json.dump(statistics, f, indent=2)
369
+ print(f"Statistics saved to: {stats_path}")
370
+
371
+
372
+ # ---------------------------------------------------------------------------
373
+ # CLI
374
+ # ---------------------------------------------------------------------------
375
+
376
+
377
+ def parse_args() -> argparse.Namespace:
378
+ parser = argparse.ArgumentParser(
379
+ description="Evaluate value estimation for test demonstrations"
380
+ )
381
+
382
+ # Mode selection
383
+ parser.add_argument(
384
+ "--process-all-tasks",
385
+ action="store_true",
386
+ help="Process all LIBERO-10 tasks"
387
+ )
388
+
389
+ # Arguments for processing all tasks
390
+ parser.add_argument(
391
+ "--manifests-root",
392
+ type=Path,
393
+ help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)"
394
+ )
395
+
396
+ # Arguments for processing a single task
397
+ parser.add_argument(
398
+ "--manifest-path",
399
+ type=Path,
400
+ help="Path to the test manifest JSON file (for single task mode)",
401
+ )
402
+
403
+ # Common arguments
404
+ parser.add_argument(
405
+ "--output-dir",
406
+ type=Path,
407
+ default="evaluation_results",
408
+ help="Directory to save evaluation results and plots",
409
+ )
410
+ parser.add_argument(
411
+ "--base-url",
412
+ default="http://localhost:8111",
413
+ help="VLAC service base URL (default: http://localhost:8111)",
414
+ )
415
+ parser.add_argument(
416
+ "--timeout",
417
+ type=float,
418
+ default=30.0,
419
+ help="HTTP request timeout in seconds (default: 30.0)",
420
+ )
421
+ parser.add_argument(
422
+ "--use-reference",
423
+ action="store_true",
424
+ help="Use reference trajectory (if available)",
425
+ )
426
+
427
+ args = parser.parse_args()
428
+
429
+ # Validate arguments
430
+ if args.process_all_tasks:
431
+ if not args.manifests_root:
432
+ parser.error("--manifests-root is required when using --process-all-tasks")
433
+ else:
434
+ if not args.manifest_path:
435
+ parser.error("--manifest-path is required for single task mode")
436
+
437
+ return args
438
+
439
+
440
+ def main() -> int:
441
+ args = parse_args()
442
+
443
+ # Read manifest
444
+ try:
445
+ manifest_data = read_manifest(args.manifest_path)
446
+ except FileNotFoundError as exc:
447
+ print(f"Error: {exc}")
448
+ return 1
449
+
450
+ # Create output directory
451
+ output_dir = args.output_dir.expanduser()
452
+ output_dir.mkdir(parents=True, exist_ok=True)
453
+
454
+ # Run evaluation
455
+ print("=" * 80)
456
+ print("VLAC Value Estimation Evaluation")
457
+ print("=" * 80)
458
+
459
+ evaluation_results = evaluate_demos(
460
+ manifest_data=manifest_data,
461
+ base_url=args.base_url,
462
+ timeout=args.timeout,
463
+ use_reference=args.use_reference,
464
+ )
465
+
466
+ # Compute statistics
467
+ statistics = compute_statistics(evaluation_results)
468
+
469
+ # Print summary
470
+ print("\n" + "=" * 80)
471
+ print("EVALUATION SUMMARY")
472
+ print("=" * 80)
473
+ print(f"Task: {evaluation_results['task_name']}")
474
+ print(f"Total demos: {evaluation_results['total_demos']}")
475
+ print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
476
+ print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
477
+
478
+ if statistics:
479
+ print("\n" + "-" * 80)
480
+ print("SUCCESS FRAME VALUE STATISTICS")
481
+ print("-" * 80)
482
+ print(f"Mean: {statistics['last_value_mean']:.2f}")
483
+ print(f"Std Dev: {statistics['last_value_std']:.2f}")
484
+ print(f"Median: {statistics['last_value_median']:.2f}")
485
+ print(f"Min: {statistics['last_value_min']:.2f}")
486
+ print(f"Max: {statistics['last_value_max']:.2f}")
487
+ print(f"Q25: {statistics['last_value_q25']:.2f}")
488
+ print(f"Q75: {statistics['last_value_q75']:.2f}")
489
+
490
+ print("\n" + "-" * 80)
491
+ print("THRESHOLD ANALYSIS")
492
+ print("-" * 80)
493
+ for threshold in [80, 85, 90, 95, 100]:
494
+ count = statistics[f"count_above_{threshold}"]
495
+ percent = statistics[f"percent_above_{threshold}"]
496
+ print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
497
+
498
+ print("\n" + "-" * 80)
499
+ print(f"Mean latency: {statistics['mean_latency']:.2f}s")
500
+ print("-" * 80)
501
+
502
+ # Save results
503
+ save_results(evaluation_results, statistics, output_dir)
504
+
505
+ # Create plots
506
+ if evaluation_results["results"]:
507
+ plot_value_distribution(evaluation_results, output_dir)
508
+ else:
509
+ print("\nNo successful evaluations to plot.")
510
+
511
+ print("\n" + "=" * 80)
512
+ print("EVALUATION COMPLETE")
513
+ print("=" * 80)
514
+
515
+ return 0
516
+
517
+
518
+ if __name__ == "__main__":
519
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152620.py ADDED
@@ -0,0 +1,683 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
+ # LIBERO-10 task list
52
# LIBERO-10 task list
# Canonical task identifiers for the LIBERO-10 benchmark suite. Used to locate
# per-task manifest directories and reference-frame folders on disk; the names
# must match the directory naming produced by the data-preparation scripts.
LIBERO_10_TASKS = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
]
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Helpers
67
+ # ---------------------------------------------------------------------------
68
+
69
def sample_fixed_interval_frames(image_list, num_frames):
    """Pick ``num_frames`` evenly spaced items from ``image_list``.

    The first and last items are always included. A single-item input is
    repeated to the requested length; the num_frames == 2 / 3 cases use the
    historical fixed patterns (first+last, and first+second+last).

    Raises:
        ValueError: if ``image_list`` is empty.
    """
    total = len(image_list)
    if total == 0:
        raise ValueError("image_list is empty")
    if total == 1:
        # Only one frame available: repeat it to the requested length.
        return [image_list[0]] * num_frames
    if num_frames == 2:
        half = num_frames // 2
        return [image_list[0]] * half + [image_list[-1]] * half
    if num_frames == 3:
        # Historical pattern: first, second, last (not the geometric middle).
        return [image_list[0]] + [image_list[1]] * (num_frames - 2) + [image_list[-1]]
    # General case: evenly spaced integer indices covering both endpoints.
    picks = np.linspace(start=0, stop=total - 1, num=num_frames, dtype=int)
    return [image_list[idx] for idx in picks]
85
+
86
+
87
# --- Reference trajectory frames (loaded eagerly at import time) -------------
# Number of frames sampled from each expert demo to serve as the reference.
num_frames_for_reference = 8
# NOTE(review): hard-coded, machine-specific path — import fails with a
# ValueError (empty frame list) on hosts where it does not exist; consider
# making it configurable via CLI or environment variable.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
# Consistency fix: reuse the module-level LIBERO_10_TASKS constant instead of
# maintaining a second, duplicated copy of the exact same task list.
libero_10_task_list = LIBERO_10_TASKS
# Maps task name -> list of `num_frames_for_reference` reference frame paths.
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name + "_demo")
    # sorted() relies on lexicographic frame naming to preserve temporal order.
    ref_frm_file_list = sorted(glob.glob(os.path.join(ref_frm_task_dir, "*.png")))
    reference_frames_dict[task_name] = sample_fixed_interval_frames(
        ref_frm_file_list, num_frames_for_reference
    )
108
+
109
+
110
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest JSON file and absolutize its frame paths.

    Frame paths inside the manifest are stored relative to the manifest's own
    directory; they are rewritten in place to absolute path strings.

    Raises:
        FileNotFoundError: if ``manifest_path`` does not point to a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Anchor every relative frame path at the manifest's directory.
    base_dir = manifest_path.parent
    for entry in manifest.get("demos", []):
        entry["frame_paths"] = [str(base_dir / rel) for rel in entry["frame_paths"]]

    return manifest
124
+
125
+
126
def image_to_base64(path: Path) -> str:
    """Return the image at ``path`` re-encoded as a base64 JPEG string.

    The image is converted to RGB first so sources with alpha channels or
    palettes can be written as JPEG.
    """
    buffer = BytesIO()
    with Image.open(path) as img:
        img.convert("RGB").save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
133
+
134
+
135
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in ``paths``, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
138
+
139
+
140
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Args:
        session: Reused HTTP session (connection pooling across demos).
        base_url: Service root, with or without a trailing slash.
        task: Natural-language task description.
        frames_b64: Base64-encoded trajectory frames.
        reference_b64: Optional base64-encoded reference frames, or None.
        timeout: Per-request timeout in seconds.

    Returns:
        The parsed JSON response, augmented with a ``latency_sec`` field.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    reference = reference_b64 or []
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference),
        "skip": 1,
        # Never request more than 8 frames per service-side batch.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started
    return payload
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Evaluation
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, object]:
    """Run the VLAC trajectory critic over every demo in ``manifest_data``.

    Args:
        manifest_data: Parsed manifest with "task_name" and "demos" entries
            (frame paths already absolutized by read_manifest).
        base_url: VLAC service base URL.
        timeout: Per-request timeout in seconds.
        use_reference: When True, attach the task's expert reference frames.
            Bug fix: this flag was previously accepted but ignored — the
            reference trajectory was always sent unconditionally.

    Returns:
        Dict with task name, demo counts, failed demo names, and a per-demo
        list of value statistics ("last_value" is the success-frame value).
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            frames_b64 = encode_images(frame_paths)

            # Honor the use_reference flag, and guard against tasks missing
            # from reference_frames_dict (previously a KeyError).
            reference_b64 = None
            if use_reference:
                if task_name in reference_frames_dict:
                    print(f"Using reference frames for task {task_name}")
                    reference_b64 = encode_images(reference_frames_dict[task_name])
                else:
                    print(f"\n[warn] No reference frames for task {task_name}")

            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            results.append({
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # critical: success-frame value (ideally 100)
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            })

        # Bug fix: this exception handling was commented out, so a single
        # network hiccup aborted the entire evaluation sweep and failed
        # demos were never recorded for request errors.
        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:  # keep the sweep going on unexpected errors
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
247
+
248
+
249
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize per-demo evaluation results into aggregate statistics.

    Returns an empty dict when no demos were successfully evaluated.
    """
    per_demo = evaluation_results["results"]
    if not per_demo:
        return {}

    last_values = np.asarray([entry["last_value"] for entry in per_demo], dtype=float)
    latencies = [entry["latency_sec"] for entry in per_demo]

    summary = {
        "last_value_mean": float(last_values.mean()),
        "last_value_std": float(last_values.std()),
        "last_value_min": float(last_values.min()),
        "last_value_max": float(last_values.max()),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(per_demo),
    }

    # Fraction of demos whose success-frame value clears each threshold.
    for threshold in (80, 85, 90, 95, 100):
        hits = int((last_values >= threshold).sum())
        summary[f"count_above_{threshold}"] = hits
        summary[f"percent_above_{threshold}"] = float(hits / len(per_demo) * 100)

    return summary
278
+
279
+
280
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create a 2x2 figure visualizing the success-frame value distribution.

    Panels: histogram, box plot, per-demo scatter, and cumulative
    distribution of the last-frame ("success") values. The figure is saved
    to ``output_dir`` as both PNG (300 dpi) and PDF, named after the task.
    No-ops (with a message) when there are no successful results.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    # NOTE(review): the `labels=` kwarg is deprecated in newer matplotlib
    # (renamed to `tick_labels`) — confirm against the pinned version.
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Release the figure to avoid accumulating pyplot state across tasks.
    plt.close()
353
+
354
+
355
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write per-demo results and summary statistics as JSON in ``output_dir``.

    Produces ``<task>_evaluation_results.json`` (detailed per-demo data) and
    ``<task>_statistics.json`` (aggregate summary).
    """
    task_name = evaluation_results["task_name"]

    # Detailed per-demo results.
    detail_path = output_dir / f"{task_name}_evaluation_results.json"
    detail_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {detail_path}")

    # Aggregate summary statistics.
    summary_path = output_dir / f"{task_name}_statistics.json"
    summary_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {summary_path}")
370
+
371
+
372
def find_manifest_file(manifests_root: Path, task_name: str) -> Optional[Path]:
    """Return the first existing manifest path for ``task_name``, or None.

    Probes the naming patterns produced by the data-preparation scripts, in
    order of preference.
    """
    candidates = (
        manifests_root / task_name / f"{task_name}_test_manifest.json",
        manifests_root / task_name / "test_manifest.json",
        manifests_root / f"{task_name}_test_manifest.json",
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)
389
+
390
+
391
def evaluate_single_task(
    manifest_path: Path,
    output_dir: Path,
    base_url: str,
    timeout: float,
    use_reference: bool,
) -> Optional[Dict]:
    """Evaluate one task's manifest end-to-end and persist its outputs.

    Reads the manifest, runs the VLAC evaluation over all demos, computes
    summary statistics, prints a short report, and writes JSON results and
    plots into a per-task subdirectory of ``output_dir``.

    Returns:
        Dict with "task_name", "evaluation_results" and "statistics", or
        None if the manifest could not be read.
    """
    try:
        manifest_data = read_manifest(manifest_path)
    except FileNotFoundError as exc:
        print(f"Error reading manifest: {exc}")
        return None

    task_name = manifest_data.get("task_name", "unknown")

    print(f"\n{'='*80}")
    print(f"Evaluating task: {task_name}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*80}")

    # Run evaluation
    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=base_url,
        timeout=timeout,
        use_reference=use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "-" * 80)
    print("TASK EVALUATION SUMMARY")
    print("-" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is {} when no demo evaluated successfully.
    if statistics:
        print(f"\nMean success value: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Values >= 90: {statistics.get('count_above_90', 0)} ({statistics.get('percent_above_90', 0):.1f}%)")

    # Save results into a per-task subdirectory.
    task_output_dir = output_dir / task_name
    task_output_dir.mkdir(parents=True, exist_ok=True)
    save_results(evaluation_results, statistics, task_output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, task_output_dir)

    return {
        "task_name": task_name,
        "evaluation_results": evaluation_results,
        "statistics": statistics,
    }
456
+
457
+
458
def plot_aggregate_statistics(all_task_results: List[Dict], output_dir: Path) -> None:
    """Create a 2x2 figure of cross-task summary statistics.

    Panels: per-task mean values (bar), distribution of task-level means
    (histogram), per-task medians (bar), and per-task standard deviations
    (bar). X-axis tick labels are task indices (1..N), not task names.
    Saved to ``output_dir`` as aggregate_statistics.png / .pdf. No-ops when
    ``all_task_results`` is empty.

    Each entry of ``all_task_results`` must carry "task_name" and a
    non-empty "statistics" dict (as produced by evaluate_single_task).
    """
    if not all_task_results:
        return

    # Extract data
    task_names = [r["task_name"] for r in all_task_results]
    mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
    median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
    std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle("VLAC Value Estimation - Aggregate Statistics Across All Tasks", fontsize=16, fontweight='bold')

    # 1. Mean values per task
    ax1 = axes[0, 0]
    bars = ax1.bar(range(len(task_names)), mean_values, color='steelblue', alpha=0.7)
    ax1.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axhline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Overall Mean ({np.mean(mean_values):.1f})')
    ax1.set_xlabel('Task', fontsize=12)
    ax1.set_ylabel('Mean Success Value', fontsize=12)
    ax1.set_title('Mean Success Frame Values by Task', fontsize=14)
    ax1.set_xticks(range(len(task_names)))
    ax1.set_xticklabels(range(1, len(task_names) + 1))
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # 2. Distribution of mean values
    ax2 = axes[0, 1]
    ax2.hist(mean_values, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
    ax2.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.axvline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(mean_values):.1f})')
    ax2.set_xlabel('Mean Success Value', fontsize=12)
    ax2.set_ylabel('Frequency (Tasks)', fontsize=12)
    ax2.set_title('Distribution of Task-Level Mean Values', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Median values per task
    ax3 = axes[1, 0]
    bars = ax3.bar(range(len(task_names)), median_values, color='coral', alpha=0.7)
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.median(median_values), color='green', linestyle='-', linewidth=2, label=f'Overall Median ({np.median(median_values):.1f})')
    ax3.set_xlabel('Task', fontsize=12)
    ax3.set_ylabel('Median Success Value', fontsize=12)
    ax3.set_title('Median Success Frame Values by Task', fontsize=14)
    ax3.set_xticks(range(len(task_names)))
    ax3.set_xticklabels(range(1, len(task_names) + 1))
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')

    # 4. Std deviation per task
    ax4 = axes[1, 1]
    bars = ax4.bar(range(len(task_names)), std_values, color='orange', alpha=0.7)
    ax4.axhline(np.mean(std_values), color='green', linestyle='-', linewidth=2, label=f'Mean Std ({np.mean(std_values):.1f})')
    ax4.set_xlabel('Task', fontsize=12)
    ax4.set_ylabel('Standard Deviation', fontsize=12)
    ax4.set_title('Variability in Success Values by Task', fontsize=14)
    ax4.set_xticks(range(len(task_names)))
    ax4.set_xticklabels(range(1, len(task_names) + 1))
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    # Save plots
    plot_path = output_dir / "aggregate_statistics.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nAggregate plot saved to: {plot_path}")

    pdf_path = output_dir / "aggregate_statistics.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"Aggregate PDF saved to: {pdf_path}")

    # Release the figure to avoid accumulating pyplot state.
    plt.close()
534
+
535
+
536
+ # ---------------------------------------------------------------------------
537
+ # CLI
538
+ # ---------------------------------------------------------------------------
539
+
540
+
541
def parse_args() -> argparse.Namespace:
    """Build the CLI, parse sys.argv, and validate mode-specific arguments.

    Two modes: ``--process-all-tasks`` (requires ``--manifests-root``) or
    single-task mode (requires ``--manifest-path``).
    """
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )

    # Mode selection: sweep every LIBERO-10 task, or evaluate one manifest.
    parser.add_argument("--process-all-tasks", action="store_true",
                        help="Process all LIBERO-10 tasks")
    parser.add_argument("--manifests-root", type=Path,
                        help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)")
    parser.add_argument("--manifest-path", type=Path,
                        help="Path to the test manifest JSON file (for single task mode)")

    # Options shared by both modes.
    parser.add_argument("--output-dir", type=Path, default="evaluation_results",
                        help="Directory to save evaluation results and plots")
    parser.add_argument("--base-url", default="http://localhost:8111",
                        help="VLAC service base URL (default: http://localhost:8111)")
    parser.add_argument("--timeout", type=float, default=30.0,
                        help="HTTP request timeout in seconds (default: 30.0)")
    parser.add_argument("--use-reference", action="store_true",
                        help="Use reference trajectory (if available)")

    args = parser.parse_args()

    # Cross-argument validation: each mode has exactly one required path.
    if args.process_all_tasks and not args.manifests_root:
        parser.error("--manifests-root is required when using --process-all-tasks")
    if not args.process_all_tasks and not args.manifest_path:
        parser.error("--manifest-path is required for single task mode")

    return args
602
+
603
+
604
def main() -> int:
    """CLI entry point: run the all-tasks sweep or a single-task evaluation.

    Bug fix: ``--process-all-tasks`` was parsed and validated by parse_args
    but never handled here — main() unconditionally called
    read_manifest(args.manifest_path) and crashed with manifest_path=None,
    leaving find_manifest_file / evaluate_single_task /
    plot_aggregate_statistics as dead code.

    Returns:
        Process exit code (0 on success, 1 when the manifest cannot be read).
    """
    args = parse_args()

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    if args.process_all_tasks:
        # ---- All-tasks mode ----
        all_task_results = []
        for task_name in LIBERO_10_TASKS:
            manifest_path = find_manifest_file(args.manifests_root, task_name)
            if manifest_path is None:
                print(f"[warn] No manifest found for task: {task_name} (skipping)")
                continue
            task_result = evaluate_single_task(
                manifest_path=manifest_path,
                output_dir=output_dir,
                base_url=args.base_url,
                timeout=args.timeout,
                use_reference=args.use_reference,
            )
            # Only tasks with non-empty statistics contribute to aggregates.
            if task_result is not None and task_result["statistics"]:
                all_task_results.append(task_result)

        if all_task_results:
            plot_aggregate_statistics(all_task_results, output_dir)
        else:
            print("\nNo successful task evaluations to aggregate.")

        print("\n" + "=" * 80)
        print("EVALUATION COMPLETE")
        print("=" * 80)
        return 0

    # ---- Single-task mode (original behavior) ----
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0


if __name__ == "__main__":
    sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152700.py ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
+ # LIBERO-10 task list
52
+ LIBERO_10_TASKS = [
53
+ "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
54
+ "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
55
+ "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
56
+ "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
57
+ "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
58
+ "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
59
+ "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
60
+ "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
61
+ "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
62
+ "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
63
+ ]
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Helpers
67
+ # ---------------------------------------------------------------------------
68
+
69
def sample_fixed_interval_frames(image_list, num_frames):
    """Sample ``num_frames`` frames from ``image_list`` at equal intervals.

    The first and last frames are always included.  A single-frame list is
    repeated ``num_frames`` times.

    Args:
        image_list: Sequence of frames (any objects, e.g. file paths).
        num_frames: Number of frames to return (must be >= 1).

    Returns:
        List of ``num_frames`` elements drawn from ``image_list``.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        # Only one frame available: repeat it.
        return [image_list[0]] * num_frames
    # Evenly spaced indices; linspace includes both endpoints, which also
    # covers num_frames == 2 and 3.  (The previous num_frames == 3 branch
    # wrongly returned index 1 instead of the middle frame.)
    total_frames = len(image_list)
    indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
    return [image_list[i] for i in indices]
85
+
86
+
87
# Number of frames sampled from each expert demo to form the reference trajectory.
num_frames_for_reference = 8
# Root directory holding one "<task>_demo" folder of PNG frames per task.
# NOTE(review): hard-coded development path — adjust for your machine.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
# Backward-compatible alias: reuse the LIBERO_10_TASKS constant defined above
# instead of duplicating the same ten task names verbatim.
libero_10_task_list = LIBERO_10_TASKS
# task_name -> list of reference frame paths, sampled at fixed intervals.
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name + "_demo")
    ref_frm_file_list = sorted(glob.glob(os.path.join(ref_frm_task_dir, "*.png")))
    if not ref_frm_file_list:
        # Previously an empty/missing demo folder crashed the whole module at
        # import time (ValueError from sample_fixed_interval_frames).  Warn
        # and continue so the script remains importable.
        print(f"[warn] no reference frames found for task {task_name} in {ref_frm_task_dir}")
        continue
    reference_frames_dict[task_name] = sample_fixed_interval_frames(
        ref_frm_file_list, num_frames_for_reference
    )
108
+
109
+
110
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and resolve its frame paths.

    Frame paths stored in the manifest are relative to the manifest's own
    directory; they are rewritten in place to paths rooted at that directory.

    Raises:
        FileNotFoundError: If ``manifest_path`` does not point to a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    data = json.loads(manifest_path.read_text(encoding="utf-8"))

    base_dir = manifest_path.parent
    for demo in data.get("demos", []):
        resolved = []
        for rel_path in demo["frame_paths"]:
            resolved.append(str(base_dir / rel_path))
        demo["frame_paths"] = resolved

    return data
124
+
125
+
126
def image_to_base64(path: Path) -> str:
    """Read the image at *path* and return it as a base64 JPEG string.

    The image is converted to RGB and re-encoded as JPEG (quality 95)
    before encoding, regardless of its on-disk format.
    """
    with Image.open(path) as img:
        rgb = img.convert("RGB")
        buffer = BytesIO()
        rgb.save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
133
+
134
+
135
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image in *paths*, preserving order."""
    encoded = []
    for p in paths:
        encoded.append(image_to_base64(Path(p)))
    return encoded
138
+
139
+
140
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the service's JSON response augmented with a ``latency_sec``
    field measuring the round-trip time.  Raises ``requests.HTTPError``
    (via ``raise_for_status``) on a non-2xx response.
    """
    payload = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # Batch at most 8 frames per forward pass on the service side.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started = time.time()
    response = session.post(endpoint, json=payload, timeout=timeout)
    response.raise_for_status()
    result = response.json()
    result["latency_sec"] = time.time() - started
    return result
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Evaluation
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, object]:
    """Evaluate all demos in a manifest via the VLAC trajectory critic.

    Fixes over the previous version: the ``use_reference`` flag is now
    honored (reference frames were unconditionally sent before), reference
    frames are encoded once per task instead of once per demo, and the
    per-demo exception handling that had been commented out is restored so
    a single failed request no longer aborts the whole run.

    Args:
        manifest_data: Parsed manifest (see ``read_manifest``).
        base_url: VLAC service base URL.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: If True, send the task's reference trajectory
            (from ``reference_frames_dict``) along with each demo.

    Returns:
        Dict with task name, demo counts, per-demo results, and the names
        of demos that failed to evaluate.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    # Encode the reference trajectory once; it is identical for every demo.
    reference_b64 = None
    if use_reference and task_name in reference_frames_dict:
        print(f"Using reference frames for task {task_name}")
        reference_b64 = encode_images(reference_frames_dict[task_name])

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            frames_b64 = encode_images(frame_paths)

            # Call VLAC service
            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            # Extract values
            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            # Record results
            demo_result = {
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            }
            results.append(demo_result)

        except requests.RequestException as exc:
            # Keep going: one bad request should not abort the whole run.
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except (KeyError, OSError, ValueError) as exc:
            # Malformed demo entry or unreadable frame file.
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
247
+
248
+
249
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarise per-demo evaluation results into aggregate statistics.

    Produces location/spread statistics of the success-frame values plus
    threshold counts.  Returns an empty dict when there are no results.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [entry["last_value"] for entry in results]
    latencies = [entry["latency_sec"] for entry in results]

    summary = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # How many demos reach each value threshold (the ideal value is 100).
    num_values = len(last_values)
    for threshold in (80, 85, 90, 95, 100):
        hits = sum(1 for value in last_values if value >= threshold)
        summary[f"count_above_{threshold}"] = hits
        summary[f"percent_above_{threshold}"] = float(hits / num_values * 100)

    return summary
278
+
279
+
280
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create visualization plots for value distribution.

    Renders a 2x2 figure (histogram, box plot, per-demo scatter, cumulative
    distribution) of the success-frame values and writes it to
    ``<output_dir>/<task_name>_value_distribution.{png,pdf}``.
    Prints a message and returns without plotting when there are no results.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    # One value per demo: the critic's score for the final (success) frame.
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    # Red dashed line marks the ideal success value; green solid line the observed mean.
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    # Empirical CDF expressed as a percentage of demos at or below each value.
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()
353
+
354
+
355
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write per-demo results and summary statistics as JSON into *output_dir*.

    Produces ``<task_name>_evaluation_results.json`` and
    ``<task_name>_statistics.json``.
    """
    task_name = evaluation_results["task_name"]

    # Full per-demo results.
    results_path = output_dir / f"{task_name}_evaluation_results.json"
    results_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {results_path}")

    # Aggregate summary statistics.
    stats_path = output_dir / f"{task_name}_statistics.json"
    stats_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {stats_path}")
370
+
371
+
372
def find_manifest_file(manifests_root: Path, task_name: str) -> Optional[Path]:
    """Locate the test manifest for *task_name* under *manifests_root*.

    Probes the naming patterns commonly produced by the preparation
    scripts, in priority order, and returns the first existing path or
    None when no manifest is found.
    """
    candidates = (
        manifests_root / task_name / f"{task_name}_test_manifest.json",
        manifests_root / task_name / "test_manifest.json",
        manifests_root / f"{task_name}_test_manifest.json",
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)
389
+
390
+
391
def evaluate_single_task(
    manifest_path: Path,
    output_dir: Path,
    base_url: str,
    timeout: float,
    use_reference: bool,
) -> Optional[Dict]:
    """Evaluate a single task and return the statistics.

    Reads the manifest, scores every demo via the VLAC service, prints a
    summary, and writes JSON results plus distribution plots under
    ``output_dir / task_name``.

    Returns:
        Dictionary with evaluation results and statistics, or None if failed
    """
    try:
        manifest_data = read_manifest(manifest_path)
    except FileNotFoundError as exc:
        print(f"Error reading manifest: {exc}")
        return None

    task_name = manifest_data.get("task_name", "unknown")

    print(f"\n{'='*80}")
    print(f"Evaluating task: {task_name}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*80}")

    # Run evaluation
    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=base_url,
        timeout=timeout,
        use_reference=use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "-" * 80)
    print("TASK EVALUATION SUMMARY")
    print("-" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    if statistics:
        print(f"\nMean success value: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Values >= 90: {statistics.get('count_above_90', 0)} ({statistics.get('percent_above_90', 0):.1f}%)")

    # Save results
    # Each task gets its own subdirectory of output_dir.
    task_output_dir = output_dir / task_name
    task_output_dir.mkdir(parents=True, exist_ok=True)
    save_results(evaluation_results, statistics, task_output_dir)

    # Create plots
    # Only meaningful when at least one demo was evaluated successfully.
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, task_output_dir)

    return {
        "task_name": task_name,
        "evaluation_results": evaluation_results,
        "statistics": statistics,
    }
456
+
457
+
458
def plot_aggregate_statistics(all_task_results: List[Dict], output_dir: Path) -> None:
    """Create aggregate plots across all tasks.

    Renders a 2x2 figure (mean per task, distribution of means, median per
    task, std per task) and writes it to
    ``<output_dir>/aggregate_statistics.{png,pdf}``.  No-op for empty input.
    """
    if not all_task_results:
        return

    # Extract data
    task_names = [r["task_name"] for r in all_task_results]
    mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
    median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
    std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle("VLAC Value Estimation - Aggregate Statistics Across All Tasks", fontsize=16, fontweight='bold')

    # 1. Mean values per task
    ax1 = axes[0, 0]
    bars = ax1.bar(range(len(task_names)), mean_values, color='steelblue', alpha=0.7)
    ax1.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axhline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Overall Mean ({np.mean(mean_values):.1f})')
    ax1.set_xlabel('Task', fontsize=12)
    ax1.set_ylabel('Mean Success Value', fontsize=12)
    ax1.set_title('Mean Success Frame Values by Task', fontsize=14)
    ax1.set_xticks(range(len(task_names)))
    # Tasks are labelled 1..N on the axis; full names would not fit.
    ax1.set_xticklabels(range(1, len(task_names) + 1))
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # 2. Distribution of mean values
    ax2 = axes[0, 1]
    ax2.hist(mean_values, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
    ax2.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.axvline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(mean_values):.1f})')
    ax2.set_xlabel('Mean Success Value', fontsize=12)
    ax2.set_ylabel('Frequency (Tasks)', fontsize=12)
    ax2.set_title('Distribution of Task-Level Mean Values', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Median values per task
    ax3 = axes[1, 0]
    bars = ax3.bar(range(len(task_names)), median_values, color='coral', alpha=0.7)
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.median(median_values), color='green', linestyle='-', linewidth=2, label=f'Overall Median ({np.median(median_values):.1f})')
    ax3.set_xlabel('Task', fontsize=12)
    ax3.set_ylabel('Median Success Value', fontsize=12)
    ax3.set_title('Median Success Frame Values by Task', fontsize=14)
    ax3.set_xticks(range(len(task_names)))
    ax3.set_xticklabels(range(1, len(task_names) + 1))
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')

    # 4. Std deviation per task
    ax4 = axes[1, 1]
    bars = ax4.bar(range(len(task_names)), std_values, color='orange', alpha=0.7)
    ax4.axhline(np.mean(std_values), color='green', linestyle='-', linewidth=2, label=f'Mean Std ({np.mean(std_values):.1f})')
    ax4.set_xlabel('Task', fontsize=12)
    ax4.set_ylabel('Standard Deviation', fontsize=12)
    ax4.set_title('Variability in Success Values by Task', fontsize=14)
    ax4.set_xticks(range(len(task_names)))
    ax4.set_xticklabels(range(1, len(task_names) + 1))
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    # Save plots
    plot_path = output_dir / "aggregate_statistics.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nAggregate plot saved to: {plot_path}")

    pdf_path = output_dir / "aggregate_statistics.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"Aggregate PDF saved to: {pdf_path}")

    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()
534
+
535
+
536
+ # ---------------------------------------------------------------------------
537
+ # CLI
538
+ # ---------------------------------------------------------------------------
539
+
540
+
541
def parse_args() -> argparse.Namespace:
    """Parse and validate command-line arguments.

    Fix: ``--output-dir`` previously defaulted to the *string*
    ``"evaluation_results"``; argparse applies ``type=Path`` only to values
    supplied on the command line, so ``main()``'s
    ``args.output_dir.expanduser()`` crashed with AttributeError whenever
    the flag was omitted.  The default is now a ``Path``.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )

    # Mode selection
    parser.add_argument(
        "--process-all-tasks",
        action="store_true",
        help="Process all LIBERO-10 tasks"
    )

    # Arguments for processing all tasks
    parser.add_argument(
        "--manifests-root",
        type=Path,
        help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)"
    )

    # Arguments for processing a single task
    parser.add_argument(
        "--manifest-path",
        type=Path,
        help="Path to the test manifest JSON file (for single task mode)",
    )

    # Common arguments
    parser.add_argument(
        "--output-dir",
        type=Path,
        # Must be a Path: argparse does not run `type` on default values.
        default=Path("evaluation_results"),
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )

    args = parser.parse_args()

    # Validate arguments: each mode has one required path argument.
    if args.process_all_tasks:
        if not args.manifests_root:
            parser.error("--manifests-root is required when using --process-all-tasks")
    else:
        if not args.manifest_path:
            parser.error("--manifest-path is required for single task mode")

    return args
602
+
603
+
604
def main() -> int:
    """Command-line entry point.

    Runs either all LIBERO-10 tasks (``--process-all-tasks``) or a single
    task (``--manifest-path``), writing per-task results/plots and, in the
    all-tasks mode, aggregate statistics and plots.

    Returns:
        Process exit code: 0 on success, 1 on failure.
    """
    args = parse_args()

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.process_all_tasks:
        # Process all LIBERO-10 tasks
        manifests_root = args.manifests_root.expanduser()

        if not manifests_root.exists():
            print(f"Error: Manifests root directory not found: {manifests_root}")
            return 1

        print("=" * 80)
        print("EVALUATING ALL LIBERO-10 TASKS")
        print("=" * 80)
        print(f"Manifests root: {manifests_root}")
        print(f"Output directory: {output_dir}")
        print(f"Base URL: {args.base_url}")
        print(f"Total tasks to evaluate: {len(LIBERO_10_TASKS)}")
        print("=" * 80)

        successful_tasks = []
        failed_tasks = []
        # Per-task dicts returned by evaluate_single_task (results + statistics).
        all_task_results = []

        for idx, task_name in enumerate(LIBERO_10_TASKS, 1):
            print(f"\n[{idx}/{len(LIBERO_10_TASKS)}] Processing: {task_name}")

            # Find manifest file
            manifest_path = find_manifest_file(manifests_root, task_name)
            if manifest_path is None:
                print(f" [ERROR] Manifest file not found for task: {task_name}")
                failed_tasks.append(task_name)
                continue

            # Evaluate the task
            result = evaluate_single_task(
                manifest_path=manifest_path,
                output_dir=output_dir,
                base_url=args.base_url,
                timeout=args.timeout,
                use_reference=args.use_reference,
            )

            if result:
                successful_tasks.append(task_name)
                all_task_results.append(result)
            else:
                failed_tasks.append(task_name)

        # Print overall summary
        print("\n" + "=" * 80)
        print("EVALUATION COMPLETE - ALL TASKS")
        print("=" * 80)
        print(f"Successfully evaluated: {len(successful_tasks)}/{len(LIBERO_10_TASKS)} tasks")
        print(f"Failed: {len(failed_tasks)}/{len(LIBERO_10_TASKS)} tasks")

        if failed_tasks:
            print("\nFailed tasks:")
            for task in failed_tasks:
                print(f" - {task}")

        # Compute and display aggregate statistics
        if all_task_results:
            print("\n" + "=" * 80)
            print("AGGREGATE STATISTICS ACROSS ALL TASKS")
            print("=" * 80)

            all_mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
            all_median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
            all_std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

            print(f"\nOverall mean of task means: {np.mean(all_mean_values):.2f} ± {np.std(all_mean_values):.2f}")
            print(f"Overall median of task medians: {np.median(all_median_values):.2f}")
            print(f"Average std deviation: {np.mean(all_std_values):.2f}")

            print(f"\nBest performing task: {all_task_results[np.argmax(all_mean_values)]['task_name']} ({max(all_mean_values):.2f})")
            print(f"Worst performing task: {all_task_results[np.argmin(all_mean_values)]['task_name']} ({min(all_mean_values):.2f})")

            # Save aggregate statistics
            aggregate_stats = {
                "total_tasks": len(LIBERO_10_TASKS),
                "successful_tasks": len(successful_tasks),
                "failed_tasks": len(failed_tasks),
                "overall_mean_of_means": float(np.mean(all_mean_values)),
                "overall_std_of_means": float(np.std(all_mean_values)),
                "overall_median_of_medians": float(np.median(all_median_values)),
                "average_std_deviation": float(np.mean(all_std_values)),
                "best_task": all_task_results[np.argmax(all_mean_values)]['task_name'],
                "best_task_mean_value": float(max(all_mean_values)),
                "worst_task": all_task_results[np.argmin(all_mean_values)]['task_name'],
                "worst_task_mean_value": float(min(all_mean_values)),
                "task_results": [
                    {
                        "task_name": r["task_name"],
                        "mean_value": r["statistics"]["last_value_mean"],
                        "median_value": r["statistics"]["last_value_median"],
                        "std_value": r["statistics"]["last_value_std"],
                    }
                    for r in all_task_results
                ]
            }

            aggregate_path = output_dir / "aggregate_statistics.json"
            with aggregate_path.open("w", encoding="utf-8") as f:
                json.dump(aggregate_stats, f, indent=2)
            print(f"\nAggregate statistics saved to: {aggregate_path}")

            # Create aggregate plots
            plot_aggregate_statistics(all_task_results, output_dir)

            print("\n" + "=" * 80)
            print(f"All results saved to: {output_dir}")
            print("=" * 80)

    else:
        # Process a single task
        print("=" * 80)
        print("VLAC Value Estimation Evaluation - Single Task")
        print("=" * 80)

        result = evaluate_single_task(
            manifest_path=args.manifest_path.expanduser(),
            output_dir=output_dir,
            base_url=args.base_url,
            timeout=args.timeout,
            use_reference=args.use_reference,
        )

        if not result:
            print("\nEvaluation failed!")
            return 1

        # Print detailed statistics for single task
        statistics = result["statistics"]
        evaluation_results = result["evaluation_results"]

        print("\n" + "=" * 80)
        print("DETAILED EVALUATION SUMMARY")
        print("=" * 80)
        print(f"Task: {evaluation_results['task_name']}")
        print(f"Total demos: {evaluation_results['total_demos']}")
        print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
        print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

        if statistics:
            print("\n" + "-" * 80)
            print("SUCCESS FRAME VALUE STATISTICS")
            print("-" * 80)
            print(f"Mean: {statistics['last_value_mean']:.2f}")
            print(f"Std Dev: {statistics['last_value_std']:.2f}")
            print(f"Median: {statistics['last_value_median']:.2f}")
            print(f"Min: {statistics['last_value_min']:.2f}")
            print(f"Max: {statistics['last_value_max']:.2f}")
            print(f"Q25: {statistics['last_value_q25']:.2f}")
            print(f"Q75: {statistics['last_value_q75']:.2f}")

            print("\n" + "-" * 80)
            print("THRESHOLD ANALYSIS")
            print("-" * 80)
            # Mirrors the thresholds computed in compute_statistics.
            for threshold in [80, 85, 90, 95, 100]:
                count = statistics[f"count_above_{threshold}"]
                percent = statistics[f"percent_above_{threshold}"]
                print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

            print("\n" + "-" * 80)
            print(f"Mean latency: {statistics['mean_latency']:.2f}s")
            print("-" * 80)

        print("\n" + "=" * 80)
        print("EVALUATION COMPLETE")
        print("=" * 80)

    return 0
781
+
782
+
783
+ if __name__ == "__main__":
784
+ sys.exit(main())
Dev/.history/testing/evaluate_test_demo_values_20251008152727.py ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
3
+
4
+ This script:
5
+ 1. Reads test demo manifests created by prepare_test_demo_single_task.py
6
+ 2. Calls the VLAC trajectory-critic service for each demo
7
+ 3. Records the last value (success frame value) - ideally should be 100
8
+ 4. Plots statistics to visualize the value distribution
9
+
10
+ Usage:
11
+ # Evaluate all LIBERO-10 tasks
12
+ python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
13
+
14
+ # Evaluate a single task
15
+ python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
16
+
17
+ Examples:
18
+ # Evaluate all LIBERO-10 tasks
19
+ python evaluate_test_demo_values.py \
20
+ --process-all-tasks \
21
+ --manifests-root toy_test_demos_LIBERO_10 \
22
+ --output-dir evaluation_results_all_tasks \
23
+ --base-url http://localhost:8111
24
+
25
+ # Evaluate a single task
26
+ python evaluate_test_demo_values.py \
27
+ --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
28
+ --output-dir evaluation_results \
29
+ --base-url http://localhost:8111
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import base64
36
+ import json
37
+ import os
38
+ import glob
39
+ import sys
40
+ import time
41
+ from io import BytesIO
42
+ from pathlib import Path
43
+ from typing import Dict, List, Optional
44
+
45
+ import matplotlib.pyplot as plt
46
+ import numpy as np
47
+ import requests
48
+ from PIL import Image
49
+ from tqdm import tqdm
50
+
51
# LIBERO-10 task list
# Canonical names of the ten LIBERO-10 benchmark tasks.  These strings are
# used to build on-disk paths (manifest directories and "<task>_demo" frame
# folders), so they must match the dataset's folder names exactly.
LIBERO_10_TASKS = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
]
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Helpers
67
+ # ---------------------------------------------------------------------------
68
+
69
def sample_fixed_interval_frames(image_list, num_frames):
    """Sample ``num_frames`` items from ``image_list`` at equal intervals.

    The first and last elements are always included when ``num_frames >= 2``.
    If the list holds a single element, it is repeated ``num_frames`` times.

    Args:
        image_list: Ordered sequence of frames (or frame paths).
        num_frames: Number of frames to return.

    Returns:
        List of ``num_frames`` elements drawn from ``image_list``.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        # Only one frame available: repeat it to reach the requested length.
        return [image_list[0]] * num_frames
    # np.linspace with both endpoints included handles every num_frames
    # uniformly: for 2 it yields [first, last], for 3 [first, middle, last].
    # (The previous special case for num_frames == 3 incorrectly used index 1
    # as the "middle" frame regardless of sequence length.)
    total_frames = len(image_list)
    indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
    return [image_list[i] for i in indices]
85
+
86
+
87
# ---------------------------------------------------------------------------
# Reference-frame preloading (runs at module import time)
# ---------------------------------------------------------------------------

# Number of frames sampled from each expert demo as the reference trajectory.
num_frames_for_reference = 8

# Root directory of the single-expert-demo frames.  Overridable via env var so
# the script is usable outside the original author's machine; the default is
# kept for backward compatibility.
ref_frm_root_dir = os.environ.get(
    "LIBERO_REF_FRM_ROOT",
    "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10",
)

# Reuse the canonical task list instead of a second hand-copied list that
# could silently drift out of sync with LIBERO_10_TASKS.
libero_10_task_list = list(LIBERO_10_TASKS)

# task_name -> list of num_frames_for_reference frame paths sampled at fixed
# intervals from that task's expert demo.  Tasks whose frame directory is
# missing or empty are skipped with a warning instead of crashing the whole
# script at import time (previously an empty glob raised ValueError here).
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name + "_demo")
    ref_frm_file_list = sorted(glob.glob(os.path.join(ref_frm_task_dir, "*.png")))
    if not ref_frm_file_list:
        print(f"[warn] No reference frames found for task {task_name} in {ref_frm_task_dir}")
        continue
    reference_frames_dict[task_name] = sample_fixed_interval_frames(
        ref_frm_file_list, num_frames_for_reference
    )
108
+
109
+
110
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and resolve its frame paths.

    Frame paths stored in the manifest are relative to the manifest's own
    directory; they are rewritten to absolute paths before returning.

    Raises:
        FileNotFoundError: If ``manifest_path`` does not point to a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    with manifest_path.open("r", encoding="utf-8") as handle:
        data = json.load(handle)

    base_dir = manifest_path.parent
    for entry in data.get("demos", []):
        entry["frame_paths"] = [str(base_dir / rel) for rel in entry["frame_paths"]]

    return data
124
+
125
+
126
def image_to_base64(path: Path) -> str:
    """Load an image, re-encode it as RGB JPEG (quality 95), return base64 text."""
    out = BytesIO()
    with Image.open(path) as img:
        img.convert("RGB").save(out, format="JPEG", quality=95)
    return base64.b64encode(out.getvalue()).decode("utf-8")
133
+
134
+
135
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image in *paths*, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
138
+
139
+
140
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the decoded JSON response, augmented with a ``latency_sec`` field
    holding the round-trip wall-clock time.

    Raises:
        requests.HTTPError: On a non-2xx response.
    """
    reference = reference_b64 or []
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference),
        "skip": 1,
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started_at = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started_at
    return payload
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Evaluation
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, Any]:
    """Evaluate all demos in a manifest and collect value statistics.

    Args:
        manifest_data: Parsed manifest (see ``read_manifest``).
        base_url: VLAC service base URL.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: When True, send the preloaded expert reference frames
            for the task along with every demo.  (Previously this flag was
            printed but ignored: the reference was always sent.)

    Returns:
        Summary dict with per-demo results and the list of failed demo names.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    # Encode the reference trajectory once per task (it is loop-invariant;
    # previously it was re-encoded for every demo).
    reference_b64 = None
    if use_reference:
        reference_paths = reference_frames_dict.get(task_name)
        if reference_paths:
            print(f"Using reference frames for task {task_name}")
            reference_b64 = encode_images(reference_paths)
        else:
            print(f"[warn] No reference frames available for task {task_name}")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            frames_b64 = encode_images(frame_paths)

            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            results.append({
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            })
        except requests.RequestException as exc:
            # One unreachable/flaky request must not abort the whole run;
            # record the failure and keep going.
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
247
+
248
+
249
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize success-frame values and request latencies across demos.

    Returns an empty dict when no demos were successfully evaluated.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [entry["last_value"] for entry in results]
    latencies = [entry["latency_sec"] for entry in results]

    summary = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Fraction of demos whose success-frame value clears each threshold.
    for threshold in (80, 85, 90, 95, 100):
        hits = sum(1 for value in last_values if value >= threshold)
        summary[f"count_above_{threshold}"] = hits
        summary[f"percent_above_{threshold}"] = float(hits / len(last_values) * 100)

    return summary
278
+
279
+
280
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create visualization plots for value distribution.

    Renders a 2x2 figure (histogram, box plot, per-demo scatter, cumulative
    distribution) of the success-frame values for one task and saves it under
    *output_dir* as both PNG (300 dpi) and PDF, named after the task.
    Does nothing (after printing a note) when there are no results.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    # Each demo's last-frame value: the value at the designated success frame.
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values, with reference lines at the target (100)
    #    and the observed mean.
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos (scatter of value vs. demo index)
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution (empirical CDF, expressed in percent)
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Release the figure to avoid accumulating open figures across tasks.
    plt.close()
353
+
354
+
355
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write per-task detailed results and summary statistics as JSON files.

    Files are named ``<task>_evaluation_results.json`` and
    ``<task>_statistics.json`` inside *output_dir*.
    """
    task_name = evaluation_results["task_name"]

    detailed_path = output_dir / f"{task_name}_evaluation_results.json"
    with detailed_path.open("w", encoding="utf-8") as handle:
        json.dump(evaluation_results, handle, indent=2)
    print(f"\nDetailed results saved to: {detailed_path}")

    summary_path = output_dir / f"{task_name}_statistics.json"
    with summary_path.open("w", encoding="utf-8") as handle:
        json.dump(statistics, handle, indent=2)
    print(f"Statistics saved to: {summary_path}")
370
+
371
+
372
def find_manifest_file(manifests_root: Path, task_name: str) -> Optional[Path]:
    """Locate the manifest JSON for *task_name* under *manifests_root*.

    Probes the directory layouts commonly produced by the export scripts, in
    priority order, and returns the first existing file (or None).
    """
    candidates = (
        manifests_root / task_name / f"{task_name}_test_manifest.json",
        manifests_root / task_name / "test_manifest.json",
        manifests_root / f"{task_name}_test_manifest.json",
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)
389
+
390
+
391
def evaluate_single_task(
    manifest_path: Path,
    output_dir: Path,
    base_url: str,
    timeout: float,
    use_reference: bool,
) -> Optional[Dict]:
    """Evaluate a single task and return the statistics.

    Reads the manifest, runs the VLAC evaluation over every demo, computes
    summary statistics, and writes results and plots under
    ``output_dir/<task_name>/``.

    Returns:
        Dictionary with evaluation results and statistics, or None if failed
    """
    try:
        manifest_data = read_manifest(manifest_path)
    except FileNotFoundError as exc:
        # A missing manifest fails this task only; callers decide what to do.
        print(f"Error reading manifest: {exc}")
        return None

    task_name = manifest_data.get("task_name", "unknown")

    print(f"\n{'='*80}")
    print(f"Evaluating task: {task_name}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*80}")

    # Run evaluation
    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=base_url,
        timeout=timeout,
        use_reference=use_reference,
    )

    # Compute statistics (empty dict when no demo was evaluated successfully)
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "-" * 80)
    print("TASK EVALUATION SUMMARY")
    print("-" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    if statistics:
        print(f"\nMean success value: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Values >= 90: {statistics.get('count_above_90', 0)} ({statistics.get('percent_above_90', 0):.1f}%)")

    # Save results into a per-task subdirectory of output_dir
    task_output_dir = output_dir / task_name
    task_output_dir.mkdir(parents=True, exist_ok=True)
    save_results(evaluation_results, statistics, task_output_dir)

    # Create plots only when at least one demo produced values
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, task_output_dir)

    return {
        "task_name": task_name,
        "evaluation_results": evaluation_results,
        "statistics": statistics,
    }
456
+
457
+
458
def plot_aggregate_statistics(all_task_results: List[Dict], output_dir: Path) -> None:
    """Create aggregate plots across all tasks.

    Renders a 2x2 figure (per-task means, histogram of task means, per-task
    medians, per-task standard deviations) and saves it to *output_dir* as
    both PNG (300 dpi) and PDF.  No-op when *all_task_results* is empty.
    """
    if not all_task_results:
        return

    # Extract per-task summary statistics (one entry per task)
    task_names = [r["task_name"] for r in all_task_results]
    mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
    median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
    std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle("VLAC Value Estimation - Aggregate Statistics Across All Tasks", fontsize=16, fontweight='bold')

    # 1. Mean values per task (bars indexed 1..N; names omitted for width)
    ax1 = axes[0, 0]
    bars = ax1.bar(range(len(task_names)), mean_values, color='steelblue', alpha=0.7)
    ax1.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axhline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Overall Mean ({np.mean(mean_values):.1f})')
    ax1.set_xlabel('Task', fontsize=12)
    ax1.set_ylabel('Mean Success Value', fontsize=12)
    ax1.set_title('Mean Success Frame Values by Task', fontsize=14)
    ax1.set_xticks(range(len(task_names)))
    ax1.set_xticklabels(range(1, len(task_names) + 1))
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # 2. Distribution of mean values across tasks
    ax2 = axes[0, 1]
    ax2.hist(mean_values, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
    ax2.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.axvline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(mean_values):.1f})')
    ax2.set_xlabel('Mean Success Value', fontsize=12)
    ax2.set_ylabel('Frequency (Tasks)', fontsize=12)
    ax2.set_title('Distribution of Task-Level Mean Values', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Median values per task
    ax3 = axes[1, 0]
    bars = ax3.bar(range(len(task_names)), median_values, color='coral', alpha=0.7)
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.median(median_values), color='green', linestyle='-', linewidth=2, label=f'Overall Median ({np.median(median_values):.1f})')
    ax3.set_xlabel('Task', fontsize=12)
    ax3.set_ylabel('Median Success Value', fontsize=12)
    ax3.set_title('Median Success Frame Values by Task', fontsize=14)
    ax3.set_xticks(range(len(task_names)))
    ax3.set_xticklabels(range(1, len(task_names) + 1))
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')

    # 4. Std deviation per task (spread of success values within each task)
    ax4 = axes[1, 1]
    bars = ax4.bar(range(len(task_names)), std_values, color='orange', alpha=0.7)
    ax4.axhline(np.mean(std_values), color='green', linestyle='-', linewidth=2, label=f'Mean Std ({np.mean(std_values):.1f})')
    ax4.set_xlabel('Task', fontsize=12)
    ax4.set_ylabel('Standard Deviation', fontsize=12)
    ax4.set_title('Variability in Success Values by Task', fontsize=14)
    ax4.set_xticks(range(len(task_names)))
    ax4.set_xticklabels(range(1, len(task_names) + 1))
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    # Save plots
    plot_path = output_dir / "aggregate_statistics.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nAggregate plot saved to: {plot_path}")

    pdf_path = output_dir / "aggregate_statistics.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"Aggregate PDF saved to: {pdf_path}")

    # Release the figure to avoid leaking open figures
    plt.close()
534
+
535
+
536
+ # ---------------------------------------------------------------------------
537
+ # CLI
538
+ # ---------------------------------------------------------------------------
539
+
540
+
541
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line interface, validating mode flags."""
    cli = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )

    # Mode selection: batch over all LIBERO-10 tasks vs. a single manifest.
    cli.add_argument(
        "--process-all-tasks",
        action="store_true",
        help="Process all LIBERO-10 tasks",
    )

    # Arguments for processing all tasks
    cli.add_argument(
        "--manifests-root",
        type=Path,
        help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)",
    )

    # Arguments for processing a single task
    cli.add_argument(
        "--manifest-path",
        type=Path,
        help="Path to the test manifest JSON file (for single task mode)",
    )

    # Common arguments
    cli.add_argument(
        "--output-dir",
        type=Path,
        default="evaluation_results",
        help="Directory to save evaluation results and plots",
    )
    cli.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    cli.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    cli.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )

    parsed = cli.parse_args()

    # Each mode has one mandatory path argument; fail fast via argparse's
    # own error reporting (exits with usage message).
    if parsed.process_all_tasks and not parsed.manifests_root:
        cli.error("--manifests-root is required when using --process-all-tasks")
    if not parsed.process_all_tasks and not parsed.manifest_path:
        cli.error("--manifest-path is required for single task mode")

    return parsed
602
+
603
+
604
def main() -> int:
    """Script entry point.

    Dispatches between two modes: evaluating every LIBERO-10 task under a
    manifests root (``--process-all-tasks``), or a single task from one
    manifest path.  Returns a process exit code (0 on success, 1 on failure).
    """
    args = parse_args()

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.process_all_tasks:
        # Process all LIBERO-10 tasks
        manifests_root = args.manifests_root.expanduser()

        if not manifests_root.exists():
            print(f"Error: Manifests root directory not found: {manifests_root}")
            return 1

        print("=" * 80)
        print("EVALUATING ALL LIBERO-10 TASKS")
        print("=" * 80)
        print(f"Manifests root: {manifests_root}")
        print(f"Output directory: {output_dir}")
        print(f"Base URL: {args.base_url}")
        print(f"Total tasks to evaluate: {len(LIBERO_10_TASKS)}")
        print("=" * 80)

        successful_tasks = []
        failed_tasks = []
        all_task_results = []

        for idx, task_name in enumerate(LIBERO_10_TASKS, 1):
            print(f"\n[{idx}/{len(LIBERO_10_TASKS)}] Processing: {task_name}")

            # Find manifest file; a missing manifest fails this task only.
            manifest_path = find_manifest_file(manifests_root, task_name)
            if manifest_path is None:
                print(f" [ERROR] Manifest file not found for task: {task_name}")
                failed_tasks.append(task_name)
                continue

            # Evaluate the task (writes per-task results/plots itself)
            result = evaluate_single_task(
                manifest_path=manifest_path,
                output_dir=output_dir,
                base_url=args.base_url,
                timeout=args.timeout,
                use_reference=args.use_reference,
            )

            if result:
                successful_tasks.append(task_name)
                all_task_results.append(result)
            else:
                failed_tasks.append(task_name)

        # Print overall summary
        print("\n" + "=" * 80)
        print("EVALUATION COMPLETE - ALL TASKS")
        print("=" * 80)
        print(f"Successfully evaluated: {len(successful_tasks)}/{len(LIBERO_10_TASKS)} tasks")
        print(f"Failed: {len(failed_tasks)}/{len(LIBERO_10_TASKS)} tasks")

        if failed_tasks:
            print("\nFailed tasks:")
            for task in failed_tasks:
                print(f" - {task}")

        # Compute and display aggregate statistics
        if all_task_results:
            print("\n" + "=" * 80)
            print("AGGREGATE STATISTICS ACROSS ALL TASKS")
            print("=" * 80)

            all_mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
            all_median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
            all_std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

            print(f"\nOverall mean of task means: {np.mean(all_mean_values):.2f} ± {np.std(all_mean_values):.2f}")
            print(f"Overall median of task medians: {np.median(all_median_values):.2f}")
            print(f"Average std deviation: {np.mean(all_std_values):.2f}")

            print(f"\nBest performing task: {all_task_results[np.argmax(all_mean_values)]['task_name']} ({max(all_mean_values):.2f})")
            print(f"Worst performing task: {all_task_results[np.argmin(all_mean_values)]['task_name']} ({min(all_mean_values):.2f})")

            # Save aggregate statistics alongside per-task outputs
            aggregate_stats = {
                "total_tasks": len(LIBERO_10_TASKS),
                "successful_tasks": len(successful_tasks),
                "failed_tasks": len(failed_tasks),
                "overall_mean_of_means": float(np.mean(all_mean_values)),
                "overall_std_of_means": float(np.std(all_mean_values)),
                "overall_median_of_medians": float(np.median(all_median_values)),
                "average_std_deviation": float(np.mean(all_std_values)),
                "best_task": all_task_results[np.argmax(all_mean_values)]['task_name'],
                "best_task_mean_value": float(max(all_mean_values)),
                "worst_task": all_task_results[np.argmin(all_mean_values)]['task_name'],
                "worst_task_mean_value": float(min(all_mean_values)),
                "task_results": [
                    {
                        "task_name": r["task_name"],
                        "mean_value": r["statistics"]["last_value_mean"],
                        "median_value": r["statistics"]["last_value_median"],
                        "std_value": r["statistics"]["last_value_std"],
                    }
                    for r in all_task_results
                ]
            }

            aggregate_path = output_dir / "aggregate_statistics.json"
            with aggregate_path.open("w", encoding="utf-8") as f:
                json.dump(aggregate_stats, f, indent=2)
            print(f"\nAggregate statistics saved to: {aggregate_path}")

            # Create aggregate plots
            plot_aggregate_statistics(all_task_results, output_dir)

            print("\n" + "=" * 80)
            print(f"All results saved to: {output_dir}")
            print("=" * 80)

    else:
        # Process a single task
        print("=" * 80)
        print("VLAC Value Estimation Evaluation - Single Task")
        print("=" * 80)

        result = evaluate_single_task(
            manifest_path=args.manifest_path.expanduser(),
            output_dir=output_dir,
            base_url=args.base_url,
            timeout=args.timeout,
            use_reference=args.use_reference,
        )

        if not result:
            print("\nEvaluation failed!")
            return 1

        # Print detailed statistics for single task
        statistics = result["statistics"]
        evaluation_results = result["evaluation_results"]

        print("\n" + "=" * 80)
        print("DETAILED EVALUATION SUMMARY")
        print("=" * 80)
        print(f"Task: {evaluation_results['task_name']}")
        print(f"Total demos: {evaluation_results['total_demos']}")
        print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
        print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

        if statistics:
            print("\n" + "-" * 80)
            print("SUCCESS FRAME VALUE STATISTICS")
            print("-" * 80)
            print(f"Mean: {statistics['last_value_mean']:.2f}")
            print(f"Std Dev: {statistics['last_value_std']:.2f}")
            print(f"Median: {statistics['last_value_median']:.2f}")
            print(f"Min: {statistics['last_value_min']:.2f}")
            print(f"Max: {statistics['last_value_max']:.2f}")
            print(f"Q25: {statistics['last_value_q25']:.2f}")
            print(f"Q75: {statistics['last_value_q75']:.2f}")

            print("\n" + "-" * 80)
            print("THRESHOLD ANALYSIS")
            print("-" * 80)
            for threshold in [80, 85, 90, 95, 100]:
                count = statistics[f"count_above_{threshold}"]
                percent = statistics[f"percent_above_{threshold}"]
                print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

            print("\n" + "-" * 80)
            print(f"Mean latency: {statistics['mean_latency']:.2f}s")
            print("-" * 80)

        print("\n" + "=" * 80)
        print("EVALUATION COMPLETE")
        print("=" * 80)

    return 0
781
+
782
+
783
if __name__ == "__main__":
    # Propagate main()'s status code so shell callers can detect failures.
    sys.exit(main())