Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .DS_Store +0 -0
- .gitattributes +71 -0
- Dev/.DS_Store +0 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250926003154.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928021537.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928101936.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928110056.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115107.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115109.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175228.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175432.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175459.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230226.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230315.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230435.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250928234553.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250929122641.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124054.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124057.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_20250929130229.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223735.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223952.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224119.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224233.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224326.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114711.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114715.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114806.sh +109 -0
- Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114816.sh +109 -0
- Dev/.history/launch_vlac_service_20251002114022.py +0 -0
- Dev/.history/launch_vlac_service_20251002114026.py +23 -0
- Dev/.history/setup_verl_20250930114055.sh +0 -0
- Dev/.history/setup_verl_20250930114105.sh +32 -0
- Dev/.history/setup_vlac_20250930114110.sh +0 -0
- Dev/.history/setup_vlac_20250930114358.sh +6 -0
- Dev/.history/setup_vlac_20250930120731.sh +6 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008150855.py +422 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008150925.py +422 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008151015.py +422 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008151156.py +422 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008151427.py +465 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008151542.py +466 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008151723.py +466 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008151816.py +465 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008152522.py +477 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008152534.py +491 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008152548.py +519 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008152620.py +683 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008152700.py +784 -0
- Dev/.history/testing/evaluate_test_demo_values_20251008152727.py +784 -0
.DS_Store
ADDED
|
Binary file (12.3 kB). View file
|
|
|
.gitattributes
CHANGED
|
@@ -33,3 +33,74 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
Dev/evo_vlac/examples/videos/pick-bowl-ref.mov filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
Dev/evo_vlac/examples/videos/pick-bowl-test.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
Dev/testing/evaluation_results_all_tasks/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
Dev/testing/evaluation_results_all_tasks/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
Dev/testing/evaluation_results_all_tasks/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
Dev/testing/evaluation_results_all_tasks/aggregate_statistics.png filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
Dev/testing/evaluation_results_all_tasks_2frms/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
Dev/testing/evaluation_results_all_tasks_2frms/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
Dev/testing/evaluation_results_all_tasks_2frms/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
Dev/testing/evaluation_results_all_tasks_2frms/aggregate_statistics.png filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it/KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it/KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
Dev/testing/evaluation_results_all_tasks_8frms/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove/KITCHEN_SCENE8_put_both_moka_pots_on_the_stove_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket/LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket/LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate/LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
Dev/testing/evaluation_results_all_tasks_8frms/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate/LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
Dev/testing/evaluation_results_all_tasks_8frms/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy/STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy_value_distribution.png filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
Dev/testing/evaluation_results_all_tasks_8frms/aggregate_statistics.png filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
Dev/testing/success_rate_across_trials.png filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
Dev/visual_prompting/task_1_demo_with_traj.png filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
Release/docs/assets/method_overview.png filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
Release/docs/assets/qualitative.png filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
Release/docs/assets/teaser.png filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
Release/reward_model/evo_vlac/examples/videos/pick-bowl-ref.mov filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
Release/reward_model/evo_vlac/examples/videos/pick-bowl-test.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
Reward/Robo-Dopamine/assets/eval.png filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
Reward/Robo-Dopamine/assets/example_backward.png filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
Reward/Robo-Dopamine/assets/example_forward.png filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
Reward/Robo-Dopamine/assets/example_incremental.png filter=lfs diff=lfs merge=lfs -text
|
| 82 |
+
Reward/Robo-Dopamine/assets/method.png filter=lfs diff=lfs merge=lfs -text
|
| 83 |
+
Reward/Robo-Dopamine/assets/teasor.png filter=lfs diff=lfs merge=lfs -text
|
| 84 |
+
Reward/Robo-Dopamine/assets/vsi.png filter=lfs diff=lfs merge=lfs -text
|
| 85 |
+
Reward/Robo-Dopamine/dataset/example_raw_data/episode_001/cam_high.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 86 |
+
Reward/Robo-Dopamine/dataset/example_raw_data/episode_001/cam_left_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 87 |
+
Reward/Robo-Dopamine/dataset/example_raw_data/episode_001/cam_right_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 88 |
+
Reward/Robo-Dopamine/dataset/example_raw_data/episode_002/cam_high.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 89 |
+
Reward/Robo-Dopamine/dataset/example_raw_data/episode_002/cam_left_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 90 |
+
Reward/Robo-Dopamine/dataset/example_raw_data/episode_002/cam_right_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 91 |
+
Reward/Robo-Dopamine/examples/demo_table/cam_high.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 92 |
+
Reward/Robo-Dopamine/examples/demo_table/cam_left_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 93 |
+
Reward/Robo-Dopamine/examples/demo_table/cam_right_wrist.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 94 |
+
Reward/Robo-Dopamine/examples/demo_table/goal_image.png filter=lfs diff=lfs merge=lfs -text
|
| 95 |
+
Reward/VLAC/data/VLAC_EAI.pdf filter=lfs diff=lfs merge=lfs -text
|
| 96 |
+
Reward/VLAC/data/framework.png filter=lfs diff=lfs merge=lfs -text
|
| 97 |
+
Reward/VLAC/data/title_banner-2.gif filter=lfs diff=lfs merge=lfs -text
|
| 98 |
+
Reward/VLAC/evo_vlac/examples/videos/pick-bowl-ref.mov filter=lfs diff=lfs merge=lfs -text
|
| 99 |
+
Reward/VLAC/evo_vlac/examples/videos/pick-bowl-test.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 100 |
+
Reward/robometer/assets/robometer.jpg filter=lfs diff=lfs merge=lfs -text
|
| 101 |
+
Reward/robometer/scripts/example_videos/soar_put_green_stick_in_brown_bowl_rewards_progress_success.png filter=lfs diff=lfs merge=lfs -text
|
| 102 |
+
arxiv/arxiv.pdf filter=lfs diff=lfs merge=lfs -text
|
| 103 |
+
arxiv/fig/fig1.pdf filter=lfs diff=lfs merge=lfs -text
|
| 104 |
+
arxiv/fig/mismatch.pdf filter=lfs diff=lfs merge=lfs -text
|
| 105 |
+
arxiv/fig/qualitative.pdf filter=lfs diff=lfs merge=lfs -text
|
| 106 |
+
arxiv/fig/ttt_vla_main.pdf filter=lfs diff=lfs merge=lfs -text
|
Dev/.DS_Store
ADDED
|
Binary file (10.2 kB). View file
|
|
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250926003154.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=False \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928021537.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=2 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=False \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928101936.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=8 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=False \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928110056.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=8 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=False \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115107.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=8 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=4 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=False \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928115109.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=8 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=4 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=False \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175228.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=8 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=4 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175432.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928175459.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230226.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230315.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928230435.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250928234553.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=25 \
|
| 100 |
+
trainer.test_freq=4 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929122641.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=5 \
|
| 100 |
+
trainer.test_freq=1 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124054.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=1 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929124057.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=1 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_20250929130229.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=1 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223735.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=False \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=1 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930223952.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=True \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=1 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224119.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=True \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=2 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224233.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=True \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=2 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20250930224326.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=True \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=2 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114711.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=True \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=2 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114715.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=True \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=2 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=True \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114806.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=True \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=2 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=False \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/examples/run_openvla_oft_rl_vlac_sparse_20251002114816.sh
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# OpenVLA-OFT RL Training with VLAC Integration
|
| 4 |
+
# Based on run_openvla_oft_rl.sh but with VLAC service integration
|
| 5 |
+
|
| 6 |
+
set -x
|
| 7 |
+
|
| 8 |
+
export MUJOCO_GL="egl" # glfw, glx, osmesa, egl
|
| 9 |
+
export PYOPENGL_PLATFORM="egl"
|
| 10 |
+
|
| 11 |
+
export NCCL_DEBUG=WARN
|
| 12 |
+
export WANDB_API_KEY='e3f637ebbcc4a90452916a3f7b209ba6dcd7ebea'
|
| 13 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 14 |
+
export TOKENIZERS_PARALLELISM=true
|
| 15 |
+
export CUDA_LAUNCH_BLOCKING=1
|
| 16 |
+
export TORCH_USE_CUDA_DSA=1
|
| 17 |
+
|
| 18 |
+
# VLAC Service Configuration
|
| 19 |
+
export VLAC_SERVICE_URL="http://localhost:8111"
|
| 20 |
+
|
| 21 |
+
# Before starting training, make sure VLAC service is running:
|
| 22 |
+
# python vlac_service.py --port 8111 --gpu-ids 0,1,2,3
|
| 23 |
+
|
| 24 |
+
PROJECT_NAME='SimpleVLA-RL-VLAC'
|
| 25 |
+
EXPERIMENT_NAME='vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16'
|
| 26 |
+
|
| 27 |
+
# For openvla-oft Libero-Long traj1 SFT or traj all SFT models can be find in https://huggingface.co/collections/Haozhan72/simplevla-rl-6833311430cd9df52aeb1f86
|
| 28 |
+
SFT_MODEL_PATH="CKPT/Openvla-oft-SFT-libero10-traj1"
|
| 29 |
+
CKPT_PATH="CKPT/vlac-libero10-sfttraj1_node1_trial-sparse-threshold-0.8-offset-16"
|
| 30 |
+
# DATASET_NAME can be libero_10 (libero_Long), libero_90, libero_spatial, libero_object, libero_goal
|
| 31 |
+
DATASET_NAME="libero_10"
|
| 32 |
+
VLA_NAME="openvla-oft"
|
| 33 |
+
NUM_GPUS=8
|
| 34 |
+
# If you want to use 2*8 GPU to RL. Set NUM_NODES=2
|
| 35 |
+
NUM_NODES=1
|
| 36 |
+
ALIGN_PATH="/home/zechen/SimpleVLA-RL/align.json"
|
| 37 |
+
|
| 38 |
+
HYDRA_FULL_ERROR=1 python -m verl.trainer.main_ppo \
|
| 39 |
+
data.task_suite_name=$DATASET_NAME \
|
| 40 |
+
data.num_trials_per_task=50 \
|
| 41 |
+
data.n_samples=8 \
|
| 42 |
+
data.filter_accuracy=True \
|
| 43 |
+
data.accuracy_lower_bound=0.1 \
|
| 44 |
+
data.accuracy_upper_bound=0.9 \
|
| 45 |
+
data.oversample_factor=1 \
|
| 46 |
+
data.train_batch_size=64 \
|
| 47 |
+
data.val_batch_size=496 \
|
| 48 |
+
data.max_prompt_length=256 \
|
| 49 |
+
data.max_response_length=128 \
|
| 50 |
+
actor_rollout_ref.model.path=$SFT_MODEL_PATH \
|
| 51 |
+
actor_rollout_ref.model.vla=$VLA_NAME \
|
| 52 |
+
actor_rollout_ref.model.action_token_len=7 \
|
| 53 |
+
actor_rollout_ref.model.action_chunks_len=8 \
|
| 54 |
+
actor_rollout_ref.actor.optim.lr=5e-6 \
|
| 55 |
+
actor_rollout_ref.actor.optim.warmup_style=constant \
|
| 56 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 57 |
+
actor_rollout_ref.actor.ppo_micro_batch_size=$NUM_GPUS \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=False \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 60 |
+
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
|
| 61 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 62 |
+
actor_rollout_ref.actor.grad_clip=1 \
|
| 63 |
+
actor_rollout_ref.actor.clip_ratio_high=0.28 \
|
| 64 |
+
actor_rollout_ref.actor.clip_ratio_low=0.2 \
|
| 65 |
+
actor_rollout_ref.actor.num_images_in_input=1 \
|
| 66 |
+
actor_rollout_ref.actor.traj_mini_batch_size=16 \
|
| 67 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=False \
|
| 68 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0. \
|
| 70 |
+
actor_rollout_ref.rollout.num_images_in_input=1 \
|
| 71 |
+
actor_rollout_ref.rollout.val_micro_batch_size=8 \
|
| 72 |
+
actor_rollout_ref.rollout.temperature=1.6 \
|
| 73 |
+
actor_rollout_ref.rollout.experiment_name=$EXPERIMENT_NAME \
|
| 74 |
+
actor_rollout_ref.rollout.micro_batch_size=1 \
|
| 75 |
+
actor_rollout_ref.rollout.unnorm_key=$DATASET_NAME \
|
| 76 |
+
actor_rollout_ref.rollout.model_family=openvla \
|
| 77 |
+
actor_rollout_ref.rollout.task_suite_name=$DATASET_NAME \
|
| 78 |
+
actor_rollout_ref.rollout.num_steps_wait=10 \
|
| 79 |
+
actor_rollout_ref.rollout.pretrained_checkpoint=$SFT_MODEL_PATH \
|
| 80 |
+
actor_rollout_ref.rollout.center_crop=True \
|
| 81 |
+
actor_rollout_ref.rollout.max_prompt_length=512 \
|
| 82 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
|
| 83 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 84 |
+
actor_rollout_ref.rollout.name=hf \
|
| 85 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 86 |
+
\
|
| 87 |
+
+actor_rollout_ref.rollout.use_vlac=true \
|
| 88 |
+
+actor_rollout_ref.rollout.vlac_service_url=$VLAC_SERVICE_URL \
|
| 89 |
+
\
|
| 90 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
|
| 91 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=0.00 \
|
| 93 |
+
trainer.logger=['console','wandb'] \
|
| 94 |
+
trainer.project_name=$PROJECT_NAME \
|
| 95 |
+
trainer.experiment_name=$EXPERIMENT_NAME \
|
| 96 |
+
trainer.default_local_dir=$CKPT_PATH/$PROJECT_NAME/$EXPERIMENT_NAME \
|
| 97 |
+
trainer.n_gpus_per_node=$NUM_GPUS \
|
| 98 |
+
trainer.nnodes=$NUM_NODES \
|
| 99 |
+
trainer.save_freq=10 \
|
| 100 |
+
trainer.test_freq=2 \
|
| 101 |
+
trainer.total_epochs=100 \
|
| 102 |
+
trainer.val_only=False \
|
| 103 |
+
trainer.val_before_train=False \
|
| 104 |
+
trainer.val_use_vlac=False \
|
| 105 |
+
algorithm.adv_estimator=grpo \
|
| 106 |
+
algorithm.adv_params.verifier_gamma=1.0 \
|
| 107 |
+
algorithm.adv_params.reward_model_gamma=1.0 \
|
| 108 |
+
trainer.runtime_env=$ALIGN_PATH \
|
| 109 |
+
trainer.wandb_mode=online
|
Dev/.history/launch_vlac_service_20251002114022.py
ADDED
|
File without changes
|
Dev/.history/launch_vlac_service_20251002114026.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import subprocess
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
def launch_servers(base_port=8111):
|
| 6 |
+
num_gpus = torch.cuda.device_count()
|
| 7 |
+
processes = []
|
| 8 |
+
for gpu_id in range(num_gpus):
|
| 9 |
+
port = base_port + gpu_id
|
| 10 |
+
env = os.environ.copy()
|
| 11 |
+
env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
|
| 12 |
+
cmd = [
|
| 13 |
+
"python", "vlac_service.py", # 你写的 FastAPI 代码文件
|
| 14 |
+
"--port", str(port)
|
| 15 |
+
]
|
| 16 |
+
print(f"Launching GPU {gpu_id} on port {port}")
|
| 17 |
+
p = subprocess.Popen(cmd, env=env)
|
| 18 |
+
processes.append(p)
|
| 19 |
+
for p in processes:
|
| 20 |
+
p.wait()
|
| 21 |
+
|
| 22 |
+
if __name__ == "__main__":
|
| 23 |
+
launch_servers()
|
Dev/.history/setup_verl_20250930114055.sh
ADDED
|
File without changes
|
Dev/.history/setup_verl_20250930114105.sh
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
conda create -n verl python==3.10
|
| 2 |
+
conda activate verl
|
| 3 |
+
|
| 4 |
+
cd /mnt/bn/vgfm2/test_dit/zechen/RL_Playground/verl
|
| 5 |
+
pip install --no-deps -e .
|
| 6 |
+
|
| 7 |
+
cd ../../openvla-oft/
|
| 8 |
+
pip install -e .
|
| 9 |
+
|
| 10 |
+
cd LIBERO
|
| 11 |
+
pip install -e .
|
| 12 |
+
|
| 13 |
+
cd ..
|
| 14 |
+
pip install -r experiments/robot/libero/libero_requirements.txt
|
| 15 |
+
|
| 16 |
+
pip install packaging ninja
|
| 17 |
+
ninja --version; echo $?
|
| 18 |
+
|
| 19 |
+
pip install git+https://github.com/NICTA/pyairports.git
|
| 20 |
+
|
| 21 |
+
cd ../SimpleVLA-RL
|
| 22 |
+
pip install -r req.txt
|
| 23 |
+
pip uninstall torch torchvision torchaudio
|
| 24 |
+
|
| 25 |
+
pip3 install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121
|
| 26 |
+
pip install transformers@git+https://github.com/moojink/transformers-openvla-oft.git
|
| 27 |
+
|
| 28 |
+
pip uninstall flash_attn
|
| 29 |
+
pip install "flash-attn==2.5.5" --no-build-isolation --no-cache-dir
|
| 30 |
+
|
| 31 |
+
conda install -c conda-forge libegl-devel
|
| 32 |
+
sudo apt install libosmesa6 libosmesa6-dev
|
Dev/.history/setup_vlac_20250930114110.sh
ADDED
|
File without changes
|
Dev/.history/setup_vlac_20250930114358.sh
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
conda create -n vlac python==3.10
|
| 2 |
+
conda activate vlac
|
| 3 |
+
|
| 4 |
+
pip install ms-swift==3.3 transformers==4.51.0 peft==0.15.2 opencv-python loguru
|
| 5 |
+
pip3 install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121
|
| 6 |
+
pip install "flash-attn==2.5.5" --no-build-isolation --no-cache-dir
|
Dev/.history/setup_vlac_20250930120731.sh
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
conda create -n vlac python==3.10
|
| 2 |
+
conda activate vlac
|
| 3 |
+
|
| 4 |
+
pip install ms-swift==3.3 transformers==4.51.0 peft==0.15.2 opencv-python loguru timm
|
| 5 |
+
pip3 install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu121
|
| 6 |
+
pip install "flash-attn==2.5.5" --no-build-isolation --no-cache-dir
|
Dev/.history/testing/evaluate_test_demo_values_20251008150855.py
ADDED
|
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 12 |
+
|
| 13 |
+
Example:
|
| 14 |
+
python evaluate_test_demo_values.py \
|
| 15 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 16 |
+
--output-dir evaluation_results \
|
| 17 |
+
--base-url http://localhost:8111
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import base64
|
| 24 |
+
import json
|
| 25 |
+
import sys
|
| 26 |
+
import time
|
| 27 |
+
from io import BytesIO
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
from typing import Dict, List, Optional
|
| 30 |
+
|
| 31 |
+
import matplotlib.pyplot as plt
|
| 32 |
+
import numpy as np
|
| 33 |
+
import requests
|
| 34 |
+
from PIL import Image
|
| 35 |
+
from tqdm import tqdm
|
| 36 |
+
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
# Helpers
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and resolve its frame paths.

    Frame paths stored in the manifest are relative to the manifest file,
    so each one is rewritten as an absolute path before returning.

    Raises:
        FileNotFoundError: if *manifest_path* does not point at a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    with manifest_path.open("r", encoding="utf-8") as handle:
        data = json.load(handle)

    # Anchor every frame path at the directory that holds the manifest.
    base_dir = manifest_path.parent
    for entry in data.get("demos", []):
        entry["frame_paths"] = [str(base_dir / rel) for rel in entry["frame_paths"]]

    return data
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def image_to_base64(path: Path) -> str:
    """Load an image, force RGB, and return its JPEG bytes base64-encoded."""
    with Image.open(path) as source:
        rgb = source.convert("RGB")
        buf = BytesIO()
        # quality=95 keeps the re-encode close to lossless for the critic.
        rgb.save(buf, format="JPEG", quality=95)
        return base64.b64encode(buf.getvalue()).decode("utf-8")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode each image file in *paths*, preserving order."""
    return [image_to_base64(image_path) for image_path in map(Path, paths)]
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the service's JSON response augmented with a ``latency_sec``
    field measuring the round-trip time. Raises ``requests.HTTPError`` on a
    non-2xx response.
    """
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # Cap the batch at 8 frames per inference call.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started
    return payload
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ---------------------------------------------------------------------------
|
| 100 |
+
# Evaluation
|
| 101 |
+
# ---------------------------------------------------------------------------
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, object]:
    """Evaluate all demos from a manifest and collect value statistics.

    Args:
        manifest_data: Parsed manifest with ``task_name`` and ``demos`` entries.
        base_url: VLAC service base URL.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: Reserved flag; reference trajectories are not wired in
            yet, so this currently only affects the printed banner.

    Returns:
        Summary dict with per-demo results and the list of failed demo names.
    """
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results: List[Dict] = []
    failed_demos: List[str] = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    # Close the session when done so pooled connections are released
    # (the original leaked the Session).
    with requests.Session() as session:
        for demo in tqdm(demos, desc="Processing demos"):
            demo_name = demo["demo_name"]
            frame_paths = demo["frame_paths"]

            try:
                frames_b64 = encode_images(frame_paths)

                # Reference trajectories are not supported yet (see use_reference).
                reference_b64 = None

                result = call_trajectory_critic(
                    session=session,
                    base_url=base_url,
                    task=task_name,
                    frames_b64=frames_b64,
                    reference_b64=reference_b64,
                    timeout=timeout,
                )

                value_list = result.get("value_list", [])
                if not value_list:
                    print(f"\n[warn] No values returned for demo {demo_name}")
                    failed_demos.append(demo_name)
                    continue

                results.append({
                    "demo_name": demo_name,
                    "total_frames": demo["total_frames"],
                    "success_index": demo["success_index"],
                    "num_sampled_frames": len(frame_paths),
                    "value_list": value_list,
                    "last_value": value_list[-1],  # value at the success frame
                    "mean_value": float(np.mean(value_list)),
                    "std_value": float(np.std(value_list)),
                    "latency_sec": result.get("latency_sec", 0.0),
                })

            except requests.RequestException as exc:
                print(f"\n[error] Request failed for demo {demo_name}: {exc}")
                failed_demos.append(demo_name)
            except Exception as exc:
                # Best-effort evaluation: keep going after a bad demo.
                print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
                failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Compute summary statistics over the per-demo evaluation results.

    Returns an empty dict when there are no successful evaluations.
    (The unused ``mean_values`` local from the original was removed.)
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [r["last_value"] for r in results]
    latencies = [r["latency_sec"] for r in results]

    stats = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Count/fraction of demos whose success-frame value clears each threshold.
    for threshold in [80, 85, 90, 95, 100]:
        count = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = count
        stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)

    return stats
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Render a 2x2 summary figure of success-frame values, saved as PNG and PDF."""
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    last_values = [r["last_value"] for r in results]
    mean_last = np.mean(last_values)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # Top-left: histogram of success-frame values.
    hist_ax = axes[0, 0]
    hist_ax.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    hist_ax.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    hist_ax.axvline(mean_last, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_last:.1f})')
    hist_ax.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    hist_ax.set_ylabel('Frequency', fontsize=12)
    hist_ax.set_title('Distribution of Success Frame Values', fontsize=14)
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Top-right: box plot of the same values.
    box_ax = axes[0, 1]
    box_artists = box_ax.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for box in box_artists['boxes']:
        box.set_facecolor('lightblue')
    box_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    box_ax.set_ylabel('Value', fontsize=12)
    box_ax.set_title('Success Frame Value Distribution', fontsize=14)
    box_ax.legend()
    box_ax.grid(True, alpha=0.3, axis='y')

    # Bottom-left: per-demo scatter.
    scatter_ax = axes[1, 0]
    scatter_ax.scatter(range(len(results)), last_values, alpha=0.6, s=50, c='steelblue')
    scatter_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    scatter_ax.axhline(mean_last, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_last:.1f})')
    scatter_ax.set_xlabel('Demo Index', fontsize=12)
    scatter_ax.set_ylabel('Last Frame Value', fontsize=12)
    scatter_ax.set_title('Success Frame Values Across Demos', fontsize=14)
    scatter_ax.legend()
    scatter_ax.grid(True, alpha=0.3)

    # Bottom-right: empirical cumulative distribution.
    cdf_ax = axes[1, 1]
    ordered = np.sort(last_values)
    cumulative = np.arange(1, len(ordered) + 1) / len(ordered) * 100
    cdf_ax.plot(ordered, cumulative, linewidth=2, color='steelblue')
    cdf_ax.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    cdf_ax.set_xlabel('Success Frame Value', fontsize=12)
    cdf_ax.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    cdf_ax.set_title('Cumulative Distribution', fontsize=14)
    cdf_ax.legend()
    cdf_ax.grid(True, alpha=0.3)

    plt.tight_layout()

    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    plt.close()
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write the detailed results and the summary statistics as JSON files."""
    task_name = evaluation_results["task_name"]

    # Detailed per-demo results.
    results_path = output_dir / f"{task_name}_evaluation_results.json"
    results_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {results_path}")

    # Aggregate statistics.
    stats_path = output_dir / f"{task_name}_statistics.json"
    stats_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {stats_path}")
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
# ---------------------------------------------------------------------------
|
| 304 |
+
# CLI
|
| 305 |
+
# ---------------------------------------------------------------------------
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line arguments for this script."""
    default_manifest = (
        "toy_test_demos_LIBERO_10/"
        "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/"
        "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json"
    )
    parser = argparse.ArgumentParser(description="Evaluate value estimation for test demonstrations")
    parser.add_argument("--manifest-path", type=Path, default=default_manifest,
                        help="Path to the test manifest JSON file")
    parser.add_argument("--output-dir", type=Path, default="evaluation_results",
                        help="Directory to save evaluation results and plots")
    parser.add_argument("--base-url", default="http://localhost:8111",
                        help="VLAC service base URL (default: http://localhost:8111)")
    parser.add_argument("--timeout", type=float, default=30.0,
                        help="HTTP request timeout in seconds (default: 30.0)")
    parser.add_argument("--use-reference", action="store_true",
                        help="Use reference trajectory (if available)")
    return parser.parse_args()
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
def main() -> int:
    """CLI entry point: load manifest, query the critic, report, and persist.

    Returns 0 on success, 1 when the manifest is missing.
    """
    args = parse_args()

    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    out_dir = args.output_dir.expanduser()
    out_dir.mkdir(parents=True, exist_ok=True)

    banner = "=" * 80
    divider = "-" * 80

    print(banner)
    print("VLAC Value Estimation Evaluation")
    print(banner)

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    statistics = compute_statistics(evaluation_results)

    print("\n" + banner)
    print("EVALUATION SUMMARY")
    print(banner)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # Statistics are empty when every demo failed.
    if statistics:
        print("\n" + divider)
        print("SUCCESS FRAME VALUE STATISTICS")
        print(divider)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + divider)
        print("THRESHOLD ANALYSIS")
        print(divider)
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + divider)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print(divider)

    save_results(evaluation_results, statistics, out_dir)

    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, out_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + banner)
    print("EVALUATION COMPLETE")
    print(banner)

    return 0
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
if __name__ == "__main__":
    # Script entry point; SystemExit carries main()'s exit code.
    raise SystemExit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008150925.py
ADDED
|
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 12 |
+
|
| 13 |
+
Example:
|
| 14 |
+
python evaluate_test_demo_values.py \
|
| 15 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 16 |
+
--output-dir evaluation_results \
|
| 17 |
+
--base-url http://localhost:8111
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import base64
|
| 24 |
+
import json
|
| 25 |
+
import sys
|
| 26 |
+
import time
|
| 27 |
+
from io import BytesIO
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
from typing import Dict, List, Optional
|
| 30 |
+
|
| 31 |
+
import matplotlib.pyplot as plt
|
| 32 |
+
import numpy as np
|
| 33 |
+
import requests
|
| 34 |
+
from PIL import Image
|
| 35 |
+
from tqdm import tqdm
|
| 36 |
+
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
# Helpers
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and resolve its frame paths.

    Frame paths stored in the manifest are relative to the manifest file,
    so each one is rewritten as an absolute path before returning.

    Raises:
        FileNotFoundError: if *manifest_path* does not point at a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    with manifest_path.open("r", encoding="utf-8") as handle:
        data = json.load(handle)

    # Anchor every frame path at the directory that holds the manifest.
    base_dir = manifest_path.parent
    for entry in data.get("demos", []):
        entry["frame_paths"] = [str(base_dir / rel) for rel in entry["frame_paths"]]

    return data
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def image_to_base64(path: Path) -> str:
    """Load an image, force RGB, and return its JPEG bytes base64-encoded."""
    with Image.open(path) as source:
        rgb = source.convert("RGB")
        buf = BytesIO()
        # quality=95 keeps the re-encode close to lossless for the critic.
        rgb.save(buf, format="JPEG", quality=95)
        return base64.b64encode(buf.getvalue()).decode("utf-8")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode each image file in *paths*, preserving order."""
    return [image_to_base64(image_path) for image_path in map(Path, paths)]
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the service's JSON response augmented with a ``latency_sec``
    field measuring the round-trip time. Raises ``requests.HTTPError`` on a
    non-2xx response.
    """
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # Cap the batch at 8 frames per inference call.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started
    return payload
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ---------------------------------------------------------------------------
|
| 100 |
+
# Evaluation
|
| 101 |
+
# ---------------------------------------------------------------------------
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, object]:
    """Evaluate all demos from a manifest and collect value statistics.

    Args:
        manifest_data: Parsed manifest with ``task_name`` and ``demos`` entries.
        base_url: VLAC service base URL.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: Reserved flag; reference trajectories are not wired in
            yet, so this currently only affects the printed banner.

    Returns:
        Summary dict with per-demo results and the list of failed demo names.
    """
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results: List[Dict] = []
    failed_demos: List[str] = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    # Close the session when done so pooled connections are released
    # (the original leaked the Session).
    with requests.Session() as session:
        for demo in tqdm(demos, desc="Processing demos"):
            demo_name = demo["demo_name"]
            frame_paths = demo["frame_paths"]

            try:
                frames_b64 = encode_images(frame_paths)

                # Reference trajectories are not supported yet (see use_reference).
                reference_b64 = None

                result = call_trajectory_critic(
                    session=session,
                    base_url=base_url,
                    task=task_name,
                    frames_b64=frames_b64,
                    reference_b64=reference_b64,
                    timeout=timeout,
                )

                value_list = result.get("value_list", [])
                if not value_list:
                    print(f"\n[warn] No values returned for demo {demo_name}")
                    failed_demos.append(demo_name)
                    continue

                results.append({
                    "demo_name": demo_name,
                    "total_frames": demo["total_frames"],
                    "success_index": demo["success_index"],
                    "num_sampled_frames": len(frame_paths),
                    "value_list": value_list,
                    "last_value": value_list[-1],  # value at the success frame
                    "mean_value": float(np.mean(value_list)),
                    "std_value": float(np.std(value_list)),
                    "latency_sec": result.get("latency_sec", 0.0),
                })

            except requests.RequestException as exc:
                print(f"\n[error] Request failed for demo {demo_name}: {exc}")
                failed_demos.append(demo_name)
            except Exception as exc:
                # Best-effort evaluation: keep going after a bad demo.
                print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
                failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Compute summary statistics over the per-demo evaluation results.

    Returns an empty dict when there are no successful evaluations.
    (The unused ``mean_values`` local from the original was removed.)
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [r["last_value"] for r in results]
    latencies = [r["latency_sec"] for r in results]

    stats = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Count/fraction of demos whose success-frame value clears each threshold.
    for threshold in [80, 85, 90, 95, 100]:
        count = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = count
        stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)

    return stats
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Render a 2x2 summary figure of success-frame values, saved as PNG and PDF."""
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    last_values = [r["last_value"] for r in results]
    mean_last = np.mean(last_values)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # Top-left: histogram of success-frame values.
    hist_ax = axes[0, 0]
    hist_ax.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    hist_ax.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    hist_ax.axvline(mean_last, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_last:.1f})')
    hist_ax.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    hist_ax.set_ylabel('Frequency', fontsize=12)
    hist_ax.set_title('Distribution of Success Frame Values', fontsize=14)
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Top-right: box plot of the same values.
    box_ax = axes[0, 1]
    box_artists = box_ax.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for box in box_artists['boxes']:
        box.set_facecolor('lightblue')
    box_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    box_ax.set_ylabel('Value', fontsize=12)
    box_ax.set_title('Success Frame Value Distribution', fontsize=14)
    box_ax.legend()
    box_ax.grid(True, alpha=0.3, axis='y')

    # Bottom-left: per-demo scatter.
    scatter_ax = axes[1, 0]
    scatter_ax.scatter(range(len(results)), last_values, alpha=0.6, s=50, c='steelblue')
    scatter_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    scatter_ax.axhline(mean_last, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_last:.1f})')
    scatter_ax.set_xlabel('Demo Index', fontsize=12)
    scatter_ax.set_ylabel('Last Frame Value', fontsize=12)
    scatter_ax.set_title('Success Frame Values Across Demos', fontsize=14)
    scatter_ax.legend()
    scatter_ax.grid(True, alpha=0.3)

    # Bottom-right: empirical cumulative distribution.
    cdf_ax = axes[1, 1]
    ordered = np.sort(last_values)
    cumulative = np.arange(1, len(ordered) + 1) / len(ordered) * 100
    cdf_ax.plot(ordered, cumulative, linewidth=2, color='steelblue')
    cdf_ax.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    cdf_ax.set_xlabel('Success Frame Value', fontsize=12)
    cdf_ax.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    cdf_ax.set_title('Cumulative Distribution', fontsize=14)
    cdf_ax.legend()
    cdf_ax.grid(True, alpha=0.3)

    plt.tight_layout()

    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    plt.close()
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write the detailed results and the summary statistics as JSON files."""
    task_name = evaluation_results["task_name"]

    # Detailed per-demo results.
    results_path = output_dir / f"{task_name}_evaluation_results.json"
    results_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {results_path}")

    # Aggregate statistics.
    stats_path = output_dir / f"{task_name}_statistics.json"
    stats_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {stats_path}")
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
# ---------------------------------------------------------------------------
|
| 304 |
+
# CLI
|
| 305 |
+
# ---------------------------------------------------------------------------
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def parse_args() -> argparse.Namespace:
|
| 309 |
+
parser = argparse.ArgumentParser(
|
| 310 |
+
description="Evaluate value estimation for test demonstrations"
|
| 311 |
+
)
|
| 312 |
+
parser.add_argument(
|
| 313 |
+
"--manifest-path",
|
| 314 |
+
type=Path,
|
| 315 |
+
default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
|
| 316 |
+
help="Path to the test manifest JSON file",
|
| 317 |
+
)
|
| 318 |
+
parser.add_argument(
|
| 319 |
+
"--output-dir",
|
| 320 |
+
type=Path,
|
| 321 |
+
default="evaluation_results",
|
| 322 |
+
help="Directory to save evaluation results and plots",
|
| 323 |
+
)
|
| 324 |
+
parser.add_argument(
|
| 325 |
+
"--base-url",
|
| 326 |
+
default="http://localhost:8111",
|
| 327 |
+
help="VLAC service base URL (default: http://localhost:8111)",
|
| 328 |
+
)
|
| 329 |
+
parser.add_argument(
|
| 330 |
+
"--timeout",
|
| 331 |
+
type=float,
|
| 332 |
+
default=30.0,
|
| 333 |
+
help="HTTP request timeout in seconds (default: 30.0)",
|
| 334 |
+
)
|
| 335 |
+
parser.add_argument(
|
| 336 |
+
"--use-reference",
|
| 337 |
+
action="store_true",
|
| 338 |
+
help="Use reference trajectory (if available)",
|
| 339 |
+
)
|
| 340 |
+
return parser.parse_args()
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
def main() -> int:
|
| 344 |
+
args = parse_args()
|
| 345 |
+
|
| 346 |
+
# Read manifest
|
| 347 |
+
try:
|
| 348 |
+
manifest_data = read_manifest(args.manifest_path)
|
| 349 |
+
except FileNotFoundError as exc:
|
| 350 |
+
print(f"Error: {exc}")
|
| 351 |
+
return 1
|
| 352 |
+
|
| 353 |
+
# Create output directory
|
| 354 |
+
output_dir = args.output_dir.expanduser()
|
| 355 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 356 |
+
|
| 357 |
+
# Run evaluation
|
| 358 |
+
print("=" * 80)
|
| 359 |
+
print("VLAC Value Estimation Evaluation")
|
| 360 |
+
print("=" * 80)
|
| 361 |
+
|
| 362 |
+
evaluation_results = evaluate_demos(
|
| 363 |
+
manifest_data=manifest_data,
|
| 364 |
+
base_url=args.base_url,
|
| 365 |
+
timeout=args.timeout,
|
| 366 |
+
use_reference=args.use_reference,
|
| 367 |
+
)
|
| 368 |
+
|
| 369 |
+
# Compute statistics
|
| 370 |
+
statistics = compute_statistics(evaluation_results)
|
| 371 |
+
|
| 372 |
+
# Print summary
|
| 373 |
+
print("\n" + "=" * 80)
|
| 374 |
+
print("EVALUATION SUMMARY")
|
| 375 |
+
print("=" * 80)
|
| 376 |
+
print(f"Task: {evaluation_results['task_name']}")
|
| 377 |
+
print(f"Total demos: {evaluation_results['total_demos']}")
|
| 378 |
+
print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
|
| 379 |
+
print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
|
| 380 |
+
|
| 381 |
+
if statistics:
|
| 382 |
+
print("\n" + "-" * 80)
|
| 383 |
+
print("SUCCESS FRAME VALUE STATISTICS")
|
| 384 |
+
print("-" * 80)
|
| 385 |
+
print(f"Mean: {statistics['last_value_mean']:.2f}")
|
| 386 |
+
print(f"Std Dev: {statistics['last_value_std']:.2f}")
|
| 387 |
+
print(f"Median: {statistics['last_value_median']:.2f}")
|
| 388 |
+
print(f"Min: {statistics['last_value_min']:.2f}")
|
| 389 |
+
print(f"Max: {statistics['last_value_max']:.2f}")
|
| 390 |
+
print(f"Q25: {statistics['last_value_q25']:.2f}")
|
| 391 |
+
print(f"Q75: {statistics['last_value_q75']:.2f}")
|
| 392 |
+
|
| 393 |
+
print("\n" + "-" * 80)
|
| 394 |
+
print("THRESHOLD ANALYSIS")
|
| 395 |
+
print("-" * 80)
|
| 396 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 397 |
+
count = statistics[f"count_above_{threshold}"]
|
| 398 |
+
percent = statistics[f"percent_above_{threshold}"]
|
| 399 |
+
print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
|
| 400 |
+
|
| 401 |
+
print("\n" + "-" * 80)
|
| 402 |
+
print(f"Mean latency: {statistics['mean_latency']:.2f}s")
|
| 403 |
+
print("-" * 80)
|
| 404 |
+
|
| 405 |
+
# Save results
|
| 406 |
+
save_results(evaluation_results, statistics, output_dir)
|
| 407 |
+
|
| 408 |
+
# Create plots
|
| 409 |
+
if evaluation_results["results"]:
|
| 410 |
+
plot_value_distribution(evaluation_results, output_dir)
|
| 411 |
+
else:
|
| 412 |
+
print("\nNo successful evaluations to plot.")
|
| 413 |
+
|
| 414 |
+
print("\n" + "=" * 80)
|
| 415 |
+
print("EVALUATION COMPLETE")
|
| 416 |
+
print("=" * 80)
|
| 417 |
+
|
| 418 |
+
return 0
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
if __name__ == "__main__":
|
| 422 |
+
sys.exit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008151015.py
ADDED
|
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 12 |
+
|
| 13 |
+
Example:
|
| 14 |
+
python evaluate_test_demo_values.py \
|
| 15 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 16 |
+
--output-dir evaluation_results \
|
| 17 |
+
--base-url http://localhost:8111
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import base64
|
| 24 |
+
import json
|
| 25 |
+
import sys
|
| 26 |
+
import time
|
| 27 |
+
from io import BytesIO
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
from typing import Dict, List, Optional
|
| 30 |
+
|
| 31 |
+
import matplotlib.pyplot as plt
|
| 32 |
+
import numpy as np
|
| 33 |
+
import requests
|
| 34 |
+
from PIL import Image
|
| 35 |
+
from tqdm import tqdm
|
| 36 |
+
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
# Helpers
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def read_manifest(manifest_path: Path) -> Dict:
|
| 43 |
+
"""Read the test demo manifest JSON file."""
|
| 44 |
+
if not manifest_path.is_file():
|
| 45 |
+
raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
|
| 46 |
+
|
| 47 |
+
with manifest_path.open("r", encoding="utf-8") as f:
|
| 48 |
+
manifest_data = json.load(f)
|
| 49 |
+
|
| 50 |
+
# Convert relative paths to absolute paths
|
| 51 |
+
manifest_dir = manifest_path.parent
|
| 52 |
+
for demo in manifest_data.get("demos", []):
|
| 53 |
+
demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
|
| 54 |
+
|
| 55 |
+
return manifest_data
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def image_to_base64(path: Path) -> str:
|
| 59 |
+
"""Convert an image file to base64 encoded JPEG."""
|
| 60 |
+
with Image.open(path) as img:
|
| 61 |
+
img = img.convert("RGB")
|
| 62 |
+
buffer = BytesIO()
|
| 63 |
+
img.save(buffer, format="JPEG", quality=95)
|
| 64 |
+
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def encode_images(paths: List[str]) -> List[str]:
|
| 68 |
+
"""Encode a list of image paths to base64."""
|
| 69 |
+
return [image_to_base64(Path(p)) for p in paths]
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def call_trajectory_critic(
|
| 73 |
+
session: requests.Session,
|
| 74 |
+
base_url: str,
|
| 75 |
+
task: str,
|
| 76 |
+
frames_b64: List[str],
|
| 77 |
+
reference_b64: Optional[List[str]],
|
| 78 |
+
timeout: float,
|
| 79 |
+
) -> Dict:
|
| 80 |
+
"""Call the VLAC trajectory-critic endpoint."""
|
| 81 |
+
payload = {
|
| 82 |
+
"task": task,
|
| 83 |
+
"frames": frames_b64,
|
| 84 |
+
"reference": reference_b64,
|
| 85 |
+
"ref_num": len(reference_b64 or []),
|
| 86 |
+
"skip": 1,
|
| 87 |
+
"batch_size": min(len(frames_b64), 8),
|
| 88 |
+
"think": False,
|
| 89 |
+
"return_video": False,
|
| 90 |
+
}
|
| 91 |
+
start = time.time()
|
| 92 |
+
resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
|
| 93 |
+
resp.raise_for_status()
|
| 94 |
+
result = resp.json()
|
| 95 |
+
result["latency_sec"] = time.time() - start
|
| 96 |
+
return result
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ---------------------------------------------------------------------------
|
| 100 |
+
# Evaluation
|
| 101 |
+
# ---------------------------------------------------------------------------
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def evaluate_demos(
|
| 105 |
+
manifest_data: Dict,
|
| 106 |
+
base_url: str,
|
| 107 |
+
timeout: float,
|
| 108 |
+
use_reference: bool = False,
|
| 109 |
+
) -> Dict[str, any]:
|
| 110 |
+
"""Evaluate all demos and collect value statistics."""
|
| 111 |
+
session = requests.Session()
|
| 112 |
+
task_name = manifest_data.get("task_name", "")
|
| 113 |
+
demos = manifest_data.get("demos", [])
|
| 114 |
+
|
| 115 |
+
results = []
|
| 116 |
+
failed_demos = []
|
| 117 |
+
|
| 118 |
+
print(f"\nEvaluating {len(demos)} test demonstrations...")
|
| 119 |
+
print(f"Task: {task_name}")
|
| 120 |
+
print(f"Use reference: {use_reference}\n")
|
| 121 |
+
|
| 122 |
+
for demo in tqdm(demos, desc="Processing demos"):
|
| 123 |
+
demo_name = demo["demo_name"]
|
| 124 |
+
frame_paths = demo["frame_paths"]
|
| 125 |
+
|
| 126 |
+
try:
|
| 127 |
+
# Encode frames
|
| 128 |
+
frames_b64 = encode_images(frame_paths)
|
| 129 |
+
|
| 130 |
+
# For now, no reference trajectory (can be added later)
|
| 131 |
+
reference_b64 = None
|
| 132 |
+
|
| 133 |
+
# Call VLAC service
|
| 134 |
+
result = call_trajectory_critic(
|
| 135 |
+
session=session,
|
| 136 |
+
base_url=base_url,
|
| 137 |
+
task=task_name,
|
| 138 |
+
frames_b64=frames_b64,
|
| 139 |
+
reference_b64=reference_b64,
|
| 140 |
+
timeout=timeout,
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
# Extract values
|
| 144 |
+
value_list = result.get("value_list", [])
|
| 145 |
+
if not value_list:
|
| 146 |
+
print(f"\n[warn] No values returned for demo {demo_name}")
|
| 147 |
+
failed_demos.append(demo_name)
|
| 148 |
+
continue
|
| 149 |
+
|
| 150 |
+
# Record results
|
| 151 |
+
demo_result = {
|
| 152 |
+
"demo_name": demo_name,
|
| 153 |
+
"total_frames": demo["total_frames"],
|
| 154 |
+
"success_index": demo["success_index"],
|
| 155 |
+
"num_sampled_frames": len(frame_paths),
|
| 156 |
+
"value_list": value_list,
|
| 157 |
+
"last_value": value_list[-1], # The critical value for success frame
|
| 158 |
+
"mean_value": float(np.mean(value_list)),
|
| 159 |
+
"std_value": float(np.std(value_list)),
|
| 160 |
+
"latency_sec": result.get("latency_sec", 0.0),
|
| 161 |
+
}
|
| 162 |
+
results.append(demo_result)
|
| 163 |
+
|
| 164 |
+
except requests.RequestException as exc:
|
| 165 |
+
print(f"\n[error] Request failed for demo {demo_name}: {exc}")
|
| 166 |
+
failed_demos.append(demo_name)
|
| 167 |
+
except Exception as exc:
|
| 168 |
+
print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
|
| 169 |
+
failed_demos.append(demo_name)
|
| 170 |
+
|
| 171 |
+
return {
|
| 172 |
+
"task_name": task_name,
|
| 173 |
+
"total_demos": len(demos),
|
| 174 |
+
"successful_evals": len(results),
|
| 175 |
+
"failed_demos": failed_demos,
|
| 176 |
+
"results": results,
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
|
| 181 |
+
"""Compute summary statistics from evaluation results."""
|
| 182 |
+
results = evaluation_results["results"]
|
| 183 |
+
if not results:
|
| 184 |
+
return {}
|
| 185 |
+
|
| 186 |
+
last_values = [r["last_value"] for r in results]
|
| 187 |
+
mean_values = [r["mean_value"] for r in results]
|
| 188 |
+
latencies = [r["latency_sec"] for r in results]
|
| 189 |
+
|
| 190 |
+
stats = {
|
| 191 |
+
"last_value_mean": float(np.mean(last_values)),
|
| 192 |
+
"last_value_std": float(np.std(last_values)),
|
| 193 |
+
"last_value_min": float(np.min(last_values)),
|
| 194 |
+
"last_value_max": float(np.max(last_values)),
|
| 195 |
+
"last_value_median": float(np.median(last_values)),
|
| 196 |
+
"last_value_q25": float(np.percentile(last_values, 25)),
|
| 197 |
+
"last_value_q75": float(np.percentile(last_values, 75)),
|
| 198 |
+
"mean_latency": float(np.mean(latencies)),
|
| 199 |
+
"total_evaluated": len(results),
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
# Count how many demos have last_value >= various thresholds
|
| 203 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 204 |
+
count = sum(1 for v in last_values if v >= threshold)
|
| 205 |
+
stats[f"count_above_{threshold}"] = count
|
| 206 |
+
stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
|
| 207 |
+
|
| 208 |
+
return stats
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
|
| 212 |
+
"""Create visualization plots for value distribution."""
|
| 213 |
+
results = evaluation_results["results"]
|
| 214 |
+
if not results:
|
| 215 |
+
print("No results to plot")
|
| 216 |
+
return
|
| 217 |
+
|
| 218 |
+
task_name = evaluation_results["task_name"]
|
| 219 |
+
last_values = [r["last_value"] for r in results]
|
| 220 |
+
|
| 221 |
+
# Create figure with multiple subplots
|
| 222 |
+
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
| 223 |
+
fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
|
| 224 |
+
|
| 225 |
+
# 1. Histogram of last values
|
| 226 |
+
ax1 = axes[0, 0]
|
| 227 |
+
ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
|
| 228 |
+
ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 229 |
+
ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 230 |
+
ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
|
| 231 |
+
ax1.set_ylabel('Frequency', fontsize=12)
|
| 232 |
+
ax1.set_title('Distribution of Success Frame Values', fontsize=14)
|
| 233 |
+
ax1.legend()
|
| 234 |
+
ax1.grid(True, alpha=0.3)
|
| 235 |
+
|
| 236 |
+
# 2. Box plot of last values
|
| 237 |
+
ax2 = axes[0, 1]
|
| 238 |
+
box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
|
| 239 |
+
for patch in box_data['boxes']:
|
| 240 |
+
patch.set_facecolor('lightblue')
|
| 241 |
+
ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 242 |
+
ax2.set_ylabel('Value', fontsize=12)
|
| 243 |
+
ax2.set_title('Success Frame Value Distribution', fontsize=14)
|
| 244 |
+
ax2.legend()
|
| 245 |
+
ax2.grid(True, alpha=0.3, axis='y')
|
| 246 |
+
|
| 247 |
+
# 3. Value progression across demos
|
| 248 |
+
ax3 = axes[1, 0]
|
| 249 |
+
demo_indices = range(len(results))
|
| 250 |
+
ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
|
| 251 |
+
ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 252 |
+
ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 253 |
+
ax3.set_xlabel('Demo Index', fontsize=12)
|
| 254 |
+
ax3.set_ylabel('Last Frame Value', fontsize=12)
|
| 255 |
+
ax3.set_title('Success Frame Values Across Demos', fontsize=14)
|
| 256 |
+
ax3.legend()
|
| 257 |
+
ax3.grid(True, alpha=0.3)
|
| 258 |
+
|
| 259 |
+
# 4. Cumulative distribution
|
| 260 |
+
ax4 = axes[1, 1]
|
| 261 |
+
sorted_values = np.sort(last_values)
|
| 262 |
+
cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
|
| 263 |
+
ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
|
| 264 |
+
ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 265 |
+
ax4.set_xlabel('Success Frame Value', fontsize=12)
|
| 266 |
+
ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
|
| 267 |
+
ax4.set_title('Cumulative Distribution', fontsize=14)
|
| 268 |
+
ax4.legend()
|
| 269 |
+
ax4.grid(True, alpha=0.3)
|
| 270 |
+
|
| 271 |
+
plt.tight_layout()
|
| 272 |
+
|
| 273 |
+
# Save the plot
|
| 274 |
+
plot_path = output_dir / f"{task_name}_value_distribution.png"
|
| 275 |
+
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
|
| 276 |
+
print(f"\nPlot saved to: {plot_path}")
|
| 277 |
+
|
| 278 |
+
# Also save a PDF version
|
| 279 |
+
pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
|
| 280 |
+
plt.savefig(pdf_path, bbox_inches='tight')
|
| 281 |
+
print(f"PDF saved to: {pdf_path}")
|
| 282 |
+
|
| 283 |
+
plt.close()
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
|
| 287 |
+
"""Save evaluation results and statistics to JSON files."""
|
| 288 |
+
task_name = evaluation_results["task_name"]
|
| 289 |
+
|
| 290 |
+
# Save detailed results
|
| 291 |
+
results_path = output_dir / f"{task_name}_evaluation_results.json"
|
| 292 |
+
with results_path.open("w", encoding="utf-8") as f:
|
| 293 |
+
json.dump(evaluation_results, f, indent=2)
|
| 294 |
+
print(f"\nDetailed results saved to: {results_path}")
|
| 295 |
+
|
| 296 |
+
# Save summary statistics
|
| 297 |
+
stats_path = output_dir / f"{task_name}_statistics.json"
|
| 298 |
+
with stats_path.open("w", encoding="utf-8") as f:
|
| 299 |
+
json.dump(statistics, f, indent=2)
|
| 300 |
+
print(f"Statistics saved to: {stats_path}")
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
# ---------------------------------------------------------------------------
|
| 304 |
+
# CLI
|
| 305 |
+
# ---------------------------------------------------------------------------
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def parse_args() -> argparse.Namespace:
|
| 309 |
+
parser = argparse.ArgumentParser(
|
| 310 |
+
description="Evaluate value estimation for test demonstrations"
|
| 311 |
+
)
|
| 312 |
+
parser.add_argument(
|
| 313 |
+
"--manifest-path",
|
| 314 |
+
type=Path,
|
| 315 |
+
default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
|
| 316 |
+
help="Path to the test manifest JSON file",
|
| 317 |
+
)
|
| 318 |
+
parser.add_argument(
|
| 319 |
+
"--output-dir",
|
| 320 |
+
type=Path,
|
| 321 |
+
default="evaluation_results",
|
| 322 |
+
help="Directory to save evaluation results and plots",
|
| 323 |
+
)
|
| 324 |
+
parser.add_argument(
|
| 325 |
+
"--base-url",
|
| 326 |
+
default="http://localhost:8111",
|
| 327 |
+
help="VLAC service base URL (default: http://localhost:8111)",
|
| 328 |
+
)
|
| 329 |
+
parser.add_argument(
|
| 330 |
+
"--timeout",
|
| 331 |
+
type=float,
|
| 332 |
+
default=30.0,
|
| 333 |
+
help="HTTP request timeout in seconds (default: 30.0)",
|
| 334 |
+
)
|
| 335 |
+
parser.add_argument(
|
| 336 |
+
"--use-reference",
|
| 337 |
+
action="store_true",
|
| 338 |
+
help="Use reference trajectory (if available)",
|
| 339 |
+
)
|
| 340 |
+
return parser.parse_args()
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
def main() -> int:
|
| 344 |
+
args = parse_args()
|
| 345 |
+
|
| 346 |
+
# Read manifest
|
| 347 |
+
try:
|
| 348 |
+
manifest_data = read_manifest(args.manifest_path)
|
| 349 |
+
except FileNotFoundError as exc:
|
| 350 |
+
print(f"Error: {exc}")
|
| 351 |
+
return 1
|
| 352 |
+
|
| 353 |
+
# Create output directory
|
| 354 |
+
output_dir = args.output_dir.expanduser()
|
| 355 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 356 |
+
|
| 357 |
+
# Run evaluation
|
| 358 |
+
print("=" * 80)
|
| 359 |
+
print("VLAC Value Estimation Evaluation")
|
| 360 |
+
print("=" * 80)
|
| 361 |
+
|
| 362 |
+
evaluation_results = evaluate_demos(
|
| 363 |
+
manifest_data=manifest_data,
|
| 364 |
+
base_url=args.base_url,
|
| 365 |
+
timeout=args.timeout,
|
| 366 |
+
use_reference=args.use_reference,
|
| 367 |
+
)
|
| 368 |
+
|
| 369 |
+
# Compute statistics
|
| 370 |
+
statistics = compute_statistics(evaluation_results)
|
| 371 |
+
|
| 372 |
+
# Print summary
|
| 373 |
+
print("\n" + "=" * 80)
|
| 374 |
+
print("EVALUATION SUMMARY")
|
| 375 |
+
print("=" * 80)
|
| 376 |
+
print(f"Task: {evaluation_results['task_name']}")
|
| 377 |
+
print(f"Total demos: {evaluation_results['total_demos']}")
|
| 378 |
+
print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
|
| 379 |
+
print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
|
| 380 |
+
|
| 381 |
+
if statistics:
|
| 382 |
+
print("\n" + "-" * 80)
|
| 383 |
+
print("SUCCESS FRAME VALUE STATISTICS")
|
| 384 |
+
print("-" * 80)
|
| 385 |
+
print(f"Mean: {statistics['last_value_mean']:.2f}")
|
| 386 |
+
print(f"Std Dev: {statistics['last_value_std']:.2f}")
|
| 387 |
+
print(f"Median: {statistics['last_value_median']:.2f}")
|
| 388 |
+
print(f"Min: {statistics['last_value_min']:.2f}")
|
| 389 |
+
print(f"Max: {statistics['last_value_max']:.2f}")
|
| 390 |
+
print(f"Q25: {statistics['last_value_q25']:.2f}")
|
| 391 |
+
print(f"Q75: {statistics['last_value_q75']:.2f}")
|
| 392 |
+
|
| 393 |
+
print("\n" + "-" * 80)
|
| 394 |
+
print("THRESHOLD ANALYSIS")
|
| 395 |
+
print("-" * 80)
|
| 396 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 397 |
+
count = statistics[f"count_above_{threshold}"]
|
| 398 |
+
percent = statistics[f"percent_above_{threshold}"]
|
| 399 |
+
print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
|
| 400 |
+
|
| 401 |
+
print("\n" + "-" * 80)
|
| 402 |
+
print(f"Mean latency: {statistics['mean_latency']:.2f}s")
|
| 403 |
+
print("-" * 80)
|
| 404 |
+
|
| 405 |
+
# Save results
|
| 406 |
+
save_results(evaluation_results, statistics, output_dir)
|
| 407 |
+
|
| 408 |
+
# Create plots
|
| 409 |
+
if evaluation_results["results"]:
|
| 410 |
+
plot_value_distribution(evaluation_results, output_dir)
|
| 411 |
+
else:
|
| 412 |
+
print("\nNo successful evaluations to plot.")
|
| 413 |
+
|
| 414 |
+
print("\n" + "=" * 80)
|
| 415 |
+
print("EVALUATION COMPLETE")
|
| 416 |
+
print("=" * 80)
|
| 417 |
+
|
| 418 |
+
return 0
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
if __name__ == "__main__":
|
| 422 |
+
sys.exit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008151156.py
ADDED
|
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 12 |
+
|
| 13 |
+
Example:
|
| 14 |
+
python evaluate_test_demo_values.py \
|
| 15 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 16 |
+
--output-dir evaluation_results \
|
| 17 |
+
--base-url http://localhost:8111
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import base64
|
| 24 |
+
import json
|
| 25 |
+
import sys
|
| 26 |
+
import time
|
| 27 |
+
from io import BytesIO
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
from typing import Dict, List, Optional
|
| 30 |
+
|
| 31 |
+
import matplotlib.pyplot as plt
|
| 32 |
+
import numpy as np
|
| 33 |
+
import requests
|
| 34 |
+
from PIL import Image
|
| 35 |
+
from tqdm import tqdm
|
| 36 |
+
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
# Helpers
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def read_manifest(manifest_path: Path) -> Dict:
|
| 43 |
+
"""Read the test demo manifest JSON file."""
|
| 44 |
+
if not manifest_path.is_file():
|
| 45 |
+
raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
|
| 46 |
+
|
| 47 |
+
with manifest_path.open("r", encoding="utf-8") as f:
|
| 48 |
+
manifest_data = json.load(f)
|
| 49 |
+
|
| 50 |
+
# Convert relative paths to absolute paths
|
| 51 |
+
manifest_dir = manifest_path.parent
|
| 52 |
+
for demo in manifest_data.get("demos", []):
|
| 53 |
+
demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
|
| 54 |
+
|
| 55 |
+
return manifest_data
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def image_to_base64(path: Path) -> str:
|
| 59 |
+
"""Convert an image file to base64 encoded JPEG."""
|
| 60 |
+
with Image.open(path) as img:
|
| 61 |
+
img = img.convert("RGB")
|
| 62 |
+
buffer = BytesIO()
|
| 63 |
+
img.save(buffer, format="JPEG", quality=95)
|
| 64 |
+
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def encode_images(paths: List[str]) -> List[str]:
|
| 68 |
+
"""Encode a list of image paths to base64."""
|
| 69 |
+
return [image_to_base64(Path(p)) for p in paths]
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def call_trajectory_critic(
|
| 73 |
+
session: requests.Session,
|
| 74 |
+
base_url: str,
|
| 75 |
+
task: str,
|
| 76 |
+
frames_b64: List[str],
|
| 77 |
+
reference_b64: Optional[List[str]],
|
| 78 |
+
timeout: float,
|
| 79 |
+
) -> Dict:
|
| 80 |
+
"""Call the VLAC trajectory-critic endpoint."""
|
| 81 |
+
payload = {
|
| 82 |
+
"task": task,
|
| 83 |
+
"frames": frames_b64,
|
| 84 |
+
"reference": reference_b64,
|
| 85 |
+
"ref_num": len(reference_b64 or []),
|
| 86 |
+
"skip": 1,
|
| 87 |
+
"batch_size": min(len(frames_b64), 8),
|
| 88 |
+
"think": False,
|
| 89 |
+
"return_video": False,
|
| 90 |
+
}
|
| 91 |
+
start = time.time()
|
| 92 |
+
resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
|
| 93 |
+
resp.raise_for_status()
|
| 94 |
+
result = resp.json()
|
| 95 |
+
result["latency_sec"] = time.time() - start
|
| 96 |
+
return result
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ---------------------------------------------------------------------------
|
| 100 |
+
# Evaluation
|
| 101 |
+
# ---------------------------------------------------------------------------
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def evaluate_demos(
|
| 105 |
+
manifest_data: Dict,
|
| 106 |
+
base_url: str,
|
| 107 |
+
timeout: float,
|
| 108 |
+
use_reference: bool = False,
|
| 109 |
+
) -> Dict[str, any]:
|
| 110 |
+
"""Evaluate all demos and collect value statistics."""
|
| 111 |
+
session = requests.Session()
|
| 112 |
+
task_name = manifest_data.get("task_name", "")
|
| 113 |
+
demos = manifest_data.get("demos", [])
|
| 114 |
+
|
| 115 |
+
results = []
|
| 116 |
+
failed_demos = []
|
| 117 |
+
|
| 118 |
+
print(f"\nEvaluating {len(demos)} test demonstrations...")
|
| 119 |
+
print(f"Task: {task_name}")
|
| 120 |
+
print(f"Use reference: {use_reference}\n")
|
| 121 |
+
|
| 122 |
+
for demo in tqdm(demos, desc="Processing demos"):
|
| 123 |
+
demo_name = demo["demo_name"]
|
| 124 |
+
frame_paths = demo["frame_paths"]
|
| 125 |
+
|
| 126 |
+
try:
|
| 127 |
+
# Encode frames
|
| 128 |
+
frames_b64 = encode_images(frame_paths)
|
| 129 |
+
|
| 130 |
+
# For now, no reference trajectory (can be added later)
|
| 131 |
+
reference_b64 = None
|
| 132 |
+
|
| 133 |
+
# Call VLAC service
|
| 134 |
+
result = call_trajectory_critic(
|
| 135 |
+
session=session,
|
| 136 |
+
base_url=base_url,
|
| 137 |
+
task=task_name,
|
| 138 |
+
frames_b64=frames_b64,
|
| 139 |
+
reference_b64=reference_b64,
|
| 140 |
+
timeout=timeout,
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
# Extract values
|
| 144 |
+
value_list = result.get("value_list", [])
|
| 145 |
+
if not value_list:
|
| 146 |
+
print(f"\n[warn] No values returned for demo {demo_name}")
|
| 147 |
+
failed_demos.append(demo_name)
|
| 148 |
+
continue
|
| 149 |
+
|
| 150 |
+
# Record results
|
| 151 |
+
demo_result = {
|
| 152 |
+
"demo_name": demo_name,
|
| 153 |
+
"total_frames": demo["total_frames"],
|
| 154 |
+
"success_index": demo["success_index"],
|
| 155 |
+
"num_sampled_frames": len(frame_paths),
|
| 156 |
+
"value_list": value_list,
|
| 157 |
+
"last_value": value_list[-1], # The critical value for success frame
|
| 158 |
+
"mean_value": float(np.mean(value_list)),
|
| 159 |
+
"std_value": float(np.std(value_list)),
|
| 160 |
+
"latency_sec": result.get("latency_sec", 0.0),
|
| 161 |
+
}
|
| 162 |
+
results.append(demo_result)
|
| 163 |
+
|
| 164 |
+
except requests.RequestException as exc:
|
| 165 |
+
print(f"\n[error] Request failed for demo {demo_name}: {exc}")
|
| 166 |
+
failed_demos.append(demo_name)
|
| 167 |
+
except Exception as exc:
|
| 168 |
+
print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
|
| 169 |
+
failed_demos.append(demo_name)
|
| 170 |
+
|
| 171 |
+
return {
|
| 172 |
+
"task_name": task_name,
|
| 173 |
+
"total_demos": len(demos),
|
| 174 |
+
"successful_evals": len(results),
|
| 175 |
+
"failed_demos": failed_demos,
|
| 176 |
+
"results": results,
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
|
| 181 |
+
"""Compute summary statistics from evaluation results."""
|
| 182 |
+
results = evaluation_results["results"]
|
| 183 |
+
if not results:
|
| 184 |
+
return {}
|
| 185 |
+
|
| 186 |
+
last_values = [r["last_value"] for r in results]
|
| 187 |
+
mean_values = [r["mean_value"] for r in results]
|
| 188 |
+
latencies = [r["latency_sec"] for r in results]
|
| 189 |
+
|
| 190 |
+
stats = {
|
| 191 |
+
"last_value_mean": float(np.mean(last_values)),
|
| 192 |
+
"last_value_std": float(np.std(last_values)),
|
| 193 |
+
"last_value_min": float(np.min(last_values)),
|
| 194 |
+
"last_value_max": float(np.max(last_values)),
|
| 195 |
+
"last_value_median": float(np.median(last_values)),
|
| 196 |
+
"last_value_q25": float(np.percentile(last_values, 25)),
|
| 197 |
+
"last_value_q75": float(np.percentile(last_values, 75)),
|
| 198 |
+
"mean_latency": float(np.mean(latencies)),
|
| 199 |
+
"total_evaluated": len(results),
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
# Count how many demos have last_value >= various thresholds
|
| 203 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 204 |
+
count = sum(1 for v in last_values if v >= threshold)
|
| 205 |
+
stats[f"count_above_{threshold}"] = count
|
| 206 |
+
stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
|
| 207 |
+
|
| 208 |
+
return stats
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
|
| 212 |
+
"""Create visualization plots for value distribution."""
|
| 213 |
+
results = evaluation_results["results"]
|
| 214 |
+
if not results:
|
| 215 |
+
print("No results to plot")
|
| 216 |
+
return
|
| 217 |
+
|
| 218 |
+
task_name = evaluation_results["task_name"]
|
| 219 |
+
last_values = [r["last_value"] for r in results]
|
| 220 |
+
|
| 221 |
+
# Create figure with multiple subplots
|
| 222 |
+
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
| 223 |
+
fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
|
| 224 |
+
|
| 225 |
+
# 1. Histogram of last values
|
| 226 |
+
ax1 = axes[0, 0]
|
| 227 |
+
ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
|
| 228 |
+
ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 229 |
+
ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 230 |
+
ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
|
| 231 |
+
ax1.set_ylabel('Frequency', fontsize=12)
|
| 232 |
+
ax1.set_title('Distribution of Success Frame Values', fontsize=14)
|
| 233 |
+
ax1.legend()
|
| 234 |
+
ax1.grid(True, alpha=0.3)
|
| 235 |
+
|
| 236 |
+
# 2. Box plot of last values
|
| 237 |
+
ax2 = axes[0, 1]
|
| 238 |
+
box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
|
| 239 |
+
for patch in box_data['boxes']:
|
| 240 |
+
patch.set_facecolor('lightblue')
|
| 241 |
+
ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 242 |
+
ax2.set_ylabel('Value', fontsize=12)
|
| 243 |
+
ax2.set_title('Success Frame Value Distribution', fontsize=14)
|
| 244 |
+
ax2.legend()
|
| 245 |
+
ax2.grid(True, alpha=0.3, axis='y')
|
| 246 |
+
|
| 247 |
+
# 3. Value progression across demos
|
| 248 |
+
ax3 = axes[1, 0]
|
| 249 |
+
demo_indices = range(len(results))
|
| 250 |
+
ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
|
| 251 |
+
ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 252 |
+
ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 253 |
+
ax3.set_xlabel('Demo Index', fontsize=12)
|
| 254 |
+
ax3.set_ylabel('Last Frame Value', fontsize=12)
|
| 255 |
+
ax3.set_title('Success Frame Values Across Demos', fontsize=14)
|
| 256 |
+
ax3.legend()
|
| 257 |
+
ax3.grid(True, alpha=0.3)
|
| 258 |
+
|
| 259 |
+
# 4. Cumulative distribution
|
| 260 |
+
ax4 = axes[1, 1]
|
| 261 |
+
sorted_values = np.sort(last_values)
|
| 262 |
+
cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
|
| 263 |
+
ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
|
| 264 |
+
ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 265 |
+
ax4.set_xlabel('Success Frame Value', fontsize=12)
|
| 266 |
+
ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
|
| 267 |
+
ax4.set_title('Cumulative Distribution', fontsize=14)
|
| 268 |
+
ax4.legend()
|
| 269 |
+
ax4.grid(True, alpha=0.3)
|
| 270 |
+
|
| 271 |
+
plt.tight_layout()
|
| 272 |
+
|
| 273 |
+
# Save the plot
|
| 274 |
+
plot_path = output_dir / f"{task_name}_value_distribution.png"
|
| 275 |
+
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
|
| 276 |
+
print(f"\nPlot saved to: {plot_path}")
|
| 277 |
+
|
| 278 |
+
# Also save a PDF version
|
| 279 |
+
pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
|
| 280 |
+
plt.savefig(pdf_path, bbox_inches='tight')
|
| 281 |
+
print(f"PDF saved to: {pdf_path}")
|
| 282 |
+
|
| 283 |
+
plt.close()
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
|
| 287 |
+
"""Save evaluation results and statistics to JSON files."""
|
| 288 |
+
task_name = evaluation_results["task_name"]
|
| 289 |
+
|
| 290 |
+
# Save detailed results
|
| 291 |
+
results_path = output_dir / f"{task_name}_evaluation_results.json"
|
| 292 |
+
with results_path.open("w", encoding="utf-8") as f:
|
| 293 |
+
json.dump(evaluation_results, f, indent=2)
|
| 294 |
+
print(f"\nDetailed results saved to: {results_path}")
|
| 295 |
+
|
| 296 |
+
# Save summary statistics
|
| 297 |
+
stats_path = output_dir / f"{task_name}_statistics.json"
|
| 298 |
+
with stats_path.open("w", encoding="utf-8") as f:
|
| 299 |
+
json.dump(statistics, f, indent=2)
|
| 300 |
+
print(f"Statistics saved to: {stats_path}")
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
# ---------------------------------------------------------------------------
|
| 304 |
+
# CLI
|
| 305 |
+
# ---------------------------------------------------------------------------
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def parse_args() -> argparse.Namespace:
|
| 309 |
+
parser = argparse.ArgumentParser(
|
| 310 |
+
description="Evaluate value estimation for test demonstrations"
|
| 311 |
+
)
|
| 312 |
+
parser.add_argument(
|
| 313 |
+
"--manifest-path",
|
| 314 |
+
type=Path,
|
| 315 |
+
default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
|
| 316 |
+
help="Path to the test manifest JSON file",
|
| 317 |
+
)
|
| 318 |
+
parser.add_argument(
|
| 319 |
+
"--output-dir",
|
| 320 |
+
type=Path,
|
| 321 |
+
default="evaluation_results",
|
| 322 |
+
help="Directory to save evaluation results and plots",
|
| 323 |
+
)
|
| 324 |
+
parser.add_argument(
|
| 325 |
+
"--base-url",
|
| 326 |
+
default="http://localhost:8111",
|
| 327 |
+
help="VLAC service base URL (default: http://localhost:8111)",
|
| 328 |
+
)
|
| 329 |
+
parser.add_argument(
|
| 330 |
+
"--timeout",
|
| 331 |
+
type=float,
|
| 332 |
+
default=30.0,
|
| 333 |
+
help="HTTP request timeout in seconds (default: 30.0)",
|
| 334 |
+
)
|
| 335 |
+
parser.add_argument(
|
| 336 |
+
"--use-reference",
|
| 337 |
+
action="store_true",
|
| 338 |
+
help="Use reference trajectory (if available)",
|
| 339 |
+
)
|
| 340 |
+
return parser.parse_args()
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
def main() -> int:
|
| 344 |
+
args = parse_args()
|
| 345 |
+
|
| 346 |
+
# Read manifest
|
| 347 |
+
try:
|
| 348 |
+
manifest_data = read_manifest(args.manifest_path)
|
| 349 |
+
except FileNotFoundError as exc:
|
| 350 |
+
print(f"Error: {exc}")
|
| 351 |
+
return 1
|
| 352 |
+
|
| 353 |
+
# Create output directory
|
| 354 |
+
output_dir = args.output_dir.expanduser()
|
| 355 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 356 |
+
|
| 357 |
+
# Run evaluation
|
| 358 |
+
print("=" * 80)
|
| 359 |
+
print("VLAC Value Estimation Evaluation")
|
| 360 |
+
print("=" * 80)
|
| 361 |
+
|
| 362 |
+
evaluation_results = evaluate_demos(
|
| 363 |
+
manifest_data=manifest_data,
|
| 364 |
+
base_url=args.base_url,
|
| 365 |
+
timeout=args.timeout,
|
| 366 |
+
use_reference=args.use_reference,
|
| 367 |
+
)
|
| 368 |
+
|
| 369 |
+
# Compute statistics
|
| 370 |
+
statistics = compute_statistics(evaluation_results)
|
| 371 |
+
|
| 372 |
+
# Print summary
|
| 373 |
+
print("\n" + "=" * 80)
|
| 374 |
+
print("EVALUATION SUMMARY")
|
| 375 |
+
print("=" * 80)
|
| 376 |
+
print(f"Task: {evaluation_results['task_name']}")
|
| 377 |
+
print(f"Total demos: {evaluation_results['total_demos']}")
|
| 378 |
+
print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
|
| 379 |
+
print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
|
| 380 |
+
|
| 381 |
+
if statistics:
|
| 382 |
+
print("\n" + "-" * 80)
|
| 383 |
+
print("SUCCESS FRAME VALUE STATISTICS")
|
| 384 |
+
print("-" * 80)
|
| 385 |
+
print(f"Mean: {statistics['last_value_mean']:.2f}")
|
| 386 |
+
print(f"Std Dev: {statistics['last_value_std']:.2f}")
|
| 387 |
+
print(f"Median: {statistics['last_value_median']:.2f}")
|
| 388 |
+
print(f"Min: {statistics['last_value_min']:.2f}")
|
| 389 |
+
print(f"Max: {statistics['last_value_max']:.2f}")
|
| 390 |
+
print(f"Q25: {statistics['last_value_q25']:.2f}")
|
| 391 |
+
print(f"Q75: {statistics['last_value_q75']:.2f}")
|
| 392 |
+
|
| 393 |
+
print("\n" + "-" * 80)
|
| 394 |
+
print("THRESHOLD ANALYSIS")
|
| 395 |
+
print("-" * 80)
|
| 396 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 397 |
+
count = statistics[f"count_above_{threshold}"]
|
| 398 |
+
percent = statistics[f"percent_above_{threshold}"]
|
| 399 |
+
print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
|
| 400 |
+
|
| 401 |
+
print("\n" + "-" * 80)
|
| 402 |
+
print(f"Mean latency: {statistics['mean_latency']:.2f}s")
|
| 403 |
+
print("-" * 80)
|
| 404 |
+
|
| 405 |
+
# Save results
|
| 406 |
+
save_results(evaluation_results, statistics, output_dir)
|
| 407 |
+
|
| 408 |
+
# Create plots
|
| 409 |
+
if evaluation_results["results"]:
|
| 410 |
+
plot_value_distribution(evaluation_results, output_dir)
|
| 411 |
+
else:
|
| 412 |
+
print("\nNo successful evaluations to plot.")
|
| 413 |
+
|
| 414 |
+
print("\n" + "=" * 80)
|
| 415 |
+
print("EVALUATION COMPLETE")
|
| 416 |
+
print("=" * 80)
|
| 417 |
+
|
| 418 |
+
return 0
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
if __name__ == "__main__":
|
| 422 |
+
sys.exit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008151427.py
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 12 |
+
|
| 13 |
+
Example:
|
| 14 |
+
python evaluate_test_demo_values.py \
|
| 15 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 16 |
+
--output-dir evaluation_results \
|
| 17 |
+
--base-url http://localhost:8111
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import base64
|
| 24 |
+
import json
|
| 25 |
+
import os
|
| 26 |
+
import glob
|
| 27 |
+
import sys
|
| 28 |
+
import time
|
| 29 |
+
from io import BytesIO
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
from typing import Dict, List, Optional
|
| 32 |
+
|
| 33 |
+
import matplotlib.pyplot as plt
|
| 34 |
+
import numpy as np
|
| 35 |
+
import requests
|
| 36 |
+
from PIL import Image
|
| 37 |
+
from tqdm import tqdm
|
| 38 |
+
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
# Helpers
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
|
| 43 |
+
def sample_fixed_interval_frames(image_list, num_frames):
|
| 44 |
+
# sample num_frames frames from image_list
|
| 45 |
+
# sample with equal interval while also ensuring the first and the last frames are included
|
| 46 |
+
if len(image_list) == 0:
|
| 47 |
+
raise ValueError("image_list is empty")
|
| 48 |
+
elif len(image_list) == 1:
|
| 49 |
+
return [image_list[0]] * num_frames
|
| 50 |
+
elif num_frames == 2:
|
| 51 |
+
return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
|
| 52 |
+
elif num_frames == 3:
|
| 53 |
+
return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
|
| 54 |
+
else:
|
| 55 |
+
total_frames = len(image_list)
|
| 56 |
+
indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
|
| 57 |
+
sampled_frames = [image_list[i] for i in indices]
|
| 58 |
+
return sampled_frames
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
num_frames_for_reference = 8
|
| 62 |
+
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
|
| 63 |
+
libero_10_task_list = [
|
| 64 |
+
"KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
|
| 65 |
+
"KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
|
| 66 |
+
"KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
|
| 67 |
+
"KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
|
| 68 |
+
"LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
|
| 69 |
+
"LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
|
| 70 |
+
"LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
|
| 71 |
+
"LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
|
| 72 |
+
"LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
|
| 73 |
+
"STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
|
| 74 |
+
]
|
| 75 |
+
reference_frames_dict = {}
|
| 76 |
+
for task_name in libero_10_task_list:
|
| 77 |
+
ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
|
| 78 |
+
ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
|
| 79 |
+
ref_frm_file_list.sort()
|
| 80 |
+
reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
|
| 81 |
+
reference_frames_temp = [np.array(Image.open(frame)) for frame in reference_frames_temp]
|
| 82 |
+
reference_frames_dict[task_name] = reference_frames_temp
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def read_manifest(manifest_path: Path) -> Dict:
|
| 86 |
+
"""Read the test demo manifest JSON file."""
|
| 87 |
+
if not manifest_path.is_file():
|
| 88 |
+
raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
|
| 89 |
+
|
| 90 |
+
with manifest_path.open("r", encoding="utf-8") as f:
|
| 91 |
+
manifest_data = json.load(f)
|
| 92 |
+
|
| 93 |
+
# Convert relative paths to absolute paths
|
| 94 |
+
manifest_dir = manifest_path.parent
|
| 95 |
+
for demo in manifest_data.get("demos", []):
|
| 96 |
+
demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
|
| 97 |
+
|
| 98 |
+
return manifest_data
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def image_to_base64(path: Path) -> str:
|
| 102 |
+
"""Convert an image file to base64 encoded JPEG."""
|
| 103 |
+
with Image.open(path) as img:
|
| 104 |
+
img = img.convert("RGB")
|
| 105 |
+
buffer = BytesIO()
|
| 106 |
+
img.save(buffer, format="JPEG", quality=95)
|
| 107 |
+
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def encode_images(paths: List[str]) -> List[str]:
|
| 111 |
+
"""Encode a list of image paths to base64."""
|
| 112 |
+
return [image_to_base64(Path(p)) for p in paths]
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def call_trajectory_critic(
|
| 116 |
+
session: requests.Session,
|
| 117 |
+
base_url: str,
|
| 118 |
+
task: str,
|
| 119 |
+
frames_b64: List[str],
|
| 120 |
+
reference_b64: Optional[List[str]],
|
| 121 |
+
timeout: float,
|
| 122 |
+
) -> Dict:
|
| 123 |
+
"""Call the VLAC trajectory-critic endpoint."""
|
| 124 |
+
payload = {
|
| 125 |
+
"task": task,
|
| 126 |
+
"frames": frames_b64,
|
| 127 |
+
"reference": reference_b64,
|
| 128 |
+
"ref_num": len(reference_b64 or []),
|
| 129 |
+
"skip": 1,
|
| 130 |
+
"batch_size": min(len(frames_b64), 8),
|
| 131 |
+
"think": False,
|
| 132 |
+
"return_video": False,
|
| 133 |
+
}
|
| 134 |
+
start = time.time()
|
| 135 |
+
resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
|
| 136 |
+
resp.raise_for_status()
|
| 137 |
+
result = resp.json()
|
| 138 |
+
result["latency_sec"] = time.time() - start
|
| 139 |
+
return result
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
# ---------------------------------------------------------------------------
|
| 143 |
+
# Evaluation
|
| 144 |
+
# ---------------------------------------------------------------------------
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def evaluate_demos(
|
| 148 |
+
manifest_data: Dict,
|
| 149 |
+
base_url: str,
|
| 150 |
+
timeout: float,
|
| 151 |
+
use_reference: bool = False,
|
| 152 |
+
) -> Dict[str, any]:
|
| 153 |
+
"""Evaluate all demos and collect value statistics."""
|
| 154 |
+
session = requests.Session()
|
| 155 |
+
task_name = manifest_data.get("task_name", "")
|
| 156 |
+
demos = manifest_data.get("demos", [])
|
| 157 |
+
|
| 158 |
+
results = []
|
| 159 |
+
failed_demos = []
|
| 160 |
+
|
| 161 |
+
print(f"\nEvaluating {len(demos)} test demonstrations...")
|
| 162 |
+
print(f"Task: {task_name}")
|
| 163 |
+
print(f"Use reference: {use_reference}\n")
|
| 164 |
+
|
| 165 |
+
for demo in tqdm(demos, desc="Processing demos"):
|
| 166 |
+
demo_name = demo["demo_name"]
|
| 167 |
+
frame_paths = demo["frame_paths"]
|
| 168 |
+
|
| 169 |
+
try:
|
| 170 |
+
# Encode frames
|
| 171 |
+
frames_b64 = encode_images(frame_paths)
|
| 172 |
+
|
| 173 |
+
# For now, no reference trajectory (can be added later)
|
| 174 |
+
reference_b64 = reference_frames_dict[task_name]
|
| 175 |
+
|
| 176 |
+
# Call VLAC service
|
| 177 |
+
result = call_trajectory_critic(
|
| 178 |
+
session=session,
|
| 179 |
+
base_url=base_url,
|
| 180 |
+
task=task_name,
|
| 181 |
+
frames_b64=frames_b64,
|
| 182 |
+
reference_b64=reference_b64,
|
| 183 |
+
timeout=timeout,
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
# Extract values
|
| 187 |
+
value_list = result.get("value_list", [])
|
| 188 |
+
if not value_list:
|
| 189 |
+
print(f"\n[warn] No values returned for demo {demo_name}")
|
| 190 |
+
failed_demos.append(demo_name)
|
| 191 |
+
continue
|
| 192 |
+
|
| 193 |
+
# Record results
|
| 194 |
+
demo_result = {
|
| 195 |
+
"demo_name": demo_name,
|
| 196 |
+
"total_frames": demo["total_frames"],
|
| 197 |
+
"success_index": demo["success_index"],
|
| 198 |
+
"num_sampled_frames": len(frame_paths),
|
| 199 |
+
"value_list": value_list,
|
| 200 |
+
"last_value": value_list[-1], # The critical value for success frame
|
| 201 |
+
"mean_value": float(np.mean(value_list)),
|
| 202 |
+
"std_value": float(np.std(value_list)),
|
| 203 |
+
"latency_sec": result.get("latency_sec", 0.0),
|
| 204 |
+
}
|
| 205 |
+
results.append(demo_result)
|
| 206 |
+
|
| 207 |
+
except requests.RequestException as exc:
|
| 208 |
+
print(f"\n[error] Request failed for demo {demo_name}: {exc}")
|
| 209 |
+
failed_demos.append(demo_name)
|
| 210 |
+
except Exception as exc:
|
| 211 |
+
print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
|
| 212 |
+
failed_demos.append(demo_name)
|
| 213 |
+
|
| 214 |
+
return {
|
| 215 |
+
"task_name": task_name,
|
| 216 |
+
"total_demos": len(demos),
|
| 217 |
+
"successful_evals": len(results),
|
| 218 |
+
"failed_demos": failed_demos,
|
| 219 |
+
"results": results,
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
|
| 224 |
+
"""Compute summary statistics from evaluation results."""
|
| 225 |
+
results = evaluation_results["results"]
|
| 226 |
+
if not results:
|
| 227 |
+
return {}
|
| 228 |
+
|
| 229 |
+
last_values = [r["last_value"] for r in results]
|
| 230 |
+
mean_values = [r["mean_value"] for r in results]
|
| 231 |
+
latencies = [r["latency_sec"] for r in results]
|
| 232 |
+
|
| 233 |
+
stats = {
|
| 234 |
+
"last_value_mean": float(np.mean(last_values)),
|
| 235 |
+
"last_value_std": float(np.std(last_values)),
|
| 236 |
+
"last_value_min": float(np.min(last_values)),
|
| 237 |
+
"last_value_max": float(np.max(last_values)),
|
| 238 |
+
"last_value_median": float(np.median(last_values)),
|
| 239 |
+
"last_value_q25": float(np.percentile(last_values, 25)),
|
| 240 |
+
"last_value_q75": float(np.percentile(last_values, 75)),
|
| 241 |
+
"mean_latency": float(np.mean(latencies)),
|
| 242 |
+
"total_evaluated": len(results),
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
# Count how many demos have last_value >= various thresholds
|
| 246 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 247 |
+
count = sum(1 for v in last_values if v >= threshold)
|
| 248 |
+
stats[f"count_above_{threshold}"] = count
|
| 249 |
+
stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
|
| 250 |
+
|
| 251 |
+
return stats
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
|
| 255 |
+
"""Create visualization plots for value distribution."""
|
| 256 |
+
results = evaluation_results["results"]
|
| 257 |
+
if not results:
|
| 258 |
+
print("No results to plot")
|
| 259 |
+
return
|
| 260 |
+
|
| 261 |
+
task_name = evaluation_results["task_name"]
|
| 262 |
+
last_values = [r["last_value"] for r in results]
|
| 263 |
+
|
| 264 |
+
# Create figure with multiple subplots
|
| 265 |
+
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
| 266 |
+
fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
|
| 267 |
+
|
| 268 |
+
# 1. Histogram of last values
|
| 269 |
+
ax1 = axes[0, 0]
|
| 270 |
+
ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
|
| 271 |
+
ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 272 |
+
ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 273 |
+
ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
|
| 274 |
+
ax1.set_ylabel('Frequency', fontsize=12)
|
| 275 |
+
ax1.set_title('Distribution of Success Frame Values', fontsize=14)
|
| 276 |
+
ax1.legend()
|
| 277 |
+
ax1.grid(True, alpha=0.3)
|
| 278 |
+
|
| 279 |
+
# 2. Box plot of last values
|
| 280 |
+
ax2 = axes[0, 1]
|
| 281 |
+
box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
|
| 282 |
+
for patch in box_data['boxes']:
|
| 283 |
+
patch.set_facecolor('lightblue')
|
| 284 |
+
ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 285 |
+
ax2.set_ylabel('Value', fontsize=12)
|
| 286 |
+
ax2.set_title('Success Frame Value Distribution', fontsize=14)
|
| 287 |
+
ax2.legend()
|
| 288 |
+
ax2.grid(True, alpha=0.3, axis='y')
|
| 289 |
+
|
| 290 |
+
# 3. Value progression across demos
|
| 291 |
+
ax3 = axes[1, 0]
|
| 292 |
+
demo_indices = range(len(results))
|
| 293 |
+
ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
|
| 294 |
+
ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 295 |
+
ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 296 |
+
ax3.set_xlabel('Demo Index', fontsize=12)
|
| 297 |
+
ax3.set_ylabel('Last Frame Value', fontsize=12)
|
| 298 |
+
ax3.set_title('Success Frame Values Across Demos', fontsize=14)
|
| 299 |
+
ax3.legend()
|
| 300 |
+
ax3.grid(True, alpha=0.3)
|
| 301 |
+
|
| 302 |
+
# 4. Cumulative distribution
|
| 303 |
+
ax4 = axes[1, 1]
|
| 304 |
+
sorted_values = np.sort(last_values)
|
| 305 |
+
cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
|
| 306 |
+
ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
|
| 307 |
+
ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 308 |
+
ax4.set_xlabel('Success Frame Value', fontsize=12)
|
| 309 |
+
ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
|
| 310 |
+
ax4.set_title('Cumulative Distribution', fontsize=14)
|
| 311 |
+
ax4.legend()
|
| 312 |
+
ax4.grid(True, alpha=0.3)
|
| 313 |
+
|
| 314 |
+
plt.tight_layout()
|
| 315 |
+
|
| 316 |
+
# Save the plot
|
| 317 |
+
plot_path = output_dir / f"{task_name}_value_distribution.png"
|
| 318 |
+
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
|
| 319 |
+
print(f"\nPlot saved to: {plot_path}")
|
| 320 |
+
|
| 321 |
+
# Also save a PDF version
|
| 322 |
+
pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
|
| 323 |
+
plt.savefig(pdf_path, bbox_inches='tight')
|
| 324 |
+
print(f"PDF saved to: {pdf_path}")
|
| 325 |
+
|
| 326 |
+
plt.close()
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
|
| 330 |
+
"""Save evaluation results and statistics to JSON files."""
|
| 331 |
+
task_name = evaluation_results["task_name"]
|
| 332 |
+
|
| 333 |
+
# Save detailed results
|
| 334 |
+
results_path = output_dir / f"{task_name}_evaluation_results.json"
|
| 335 |
+
with results_path.open("w", encoding="utf-8") as f:
|
| 336 |
+
json.dump(evaluation_results, f, indent=2)
|
| 337 |
+
print(f"\nDetailed results saved to: {results_path}")
|
| 338 |
+
|
| 339 |
+
# Save summary statistics
|
| 340 |
+
stats_path = output_dir / f"{task_name}_statistics.json"
|
| 341 |
+
with stats_path.open("w", encoding="utf-8") as f:
|
| 342 |
+
json.dump(statistics, f, indent=2)
|
| 343 |
+
print(f"Statistics saved to: {stats_path}")
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
# ---------------------------------------------------------------------------
|
| 347 |
+
# CLI
|
| 348 |
+
# ---------------------------------------------------------------------------
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def parse_args() -> argparse.Namespace:
|
| 352 |
+
parser = argparse.ArgumentParser(
|
| 353 |
+
description="Evaluate value estimation for test demonstrations"
|
| 354 |
+
)
|
| 355 |
+
parser.add_argument(
|
| 356 |
+
"--manifest-path",
|
| 357 |
+
type=Path,
|
| 358 |
+
default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
|
| 359 |
+
help="Path to the test manifest JSON file",
|
| 360 |
+
)
|
| 361 |
+
parser.add_argument(
|
| 362 |
+
"--output-dir",
|
| 363 |
+
type=Path,
|
| 364 |
+
default="evaluation_results",
|
| 365 |
+
help="Directory to save evaluation results and plots",
|
| 366 |
+
)
|
| 367 |
+
parser.add_argument(
|
| 368 |
+
"--base-url",
|
| 369 |
+
default="http://localhost:8111",
|
| 370 |
+
help="VLAC service base URL (default: http://localhost:8111)",
|
| 371 |
+
)
|
| 372 |
+
parser.add_argument(
|
| 373 |
+
"--timeout",
|
| 374 |
+
type=float,
|
| 375 |
+
default=30.0,
|
| 376 |
+
help="HTTP request timeout in seconds (default: 30.0)",
|
| 377 |
+
)
|
| 378 |
+
parser.add_argument(
|
| 379 |
+
"--use-reference",
|
| 380 |
+
action="store_true",
|
| 381 |
+
help="Use reference trajectory (if available)",
|
| 382 |
+
)
|
| 383 |
+
return parser.parse_args()
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
def main() -> int:
|
| 387 |
+
args = parse_args()
|
| 388 |
+
|
| 389 |
+
# Read manifest
|
| 390 |
+
try:
|
| 391 |
+
manifest_data = read_manifest(args.manifest_path)
|
| 392 |
+
except FileNotFoundError as exc:
|
| 393 |
+
print(f"Error: {exc}")
|
| 394 |
+
return 1
|
| 395 |
+
|
| 396 |
+
# Create output directory
|
| 397 |
+
output_dir = args.output_dir.expanduser()
|
| 398 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 399 |
+
|
| 400 |
+
# Run evaluation
|
| 401 |
+
print("=" * 80)
|
| 402 |
+
print("VLAC Value Estimation Evaluation")
|
| 403 |
+
print("=" * 80)
|
| 404 |
+
|
| 405 |
+
evaluation_results = evaluate_demos(
|
| 406 |
+
manifest_data=manifest_data,
|
| 407 |
+
base_url=args.base_url,
|
| 408 |
+
timeout=args.timeout,
|
| 409 |
+
use_reference=args.use_reference,
|
| 410 |
+
)
|
| 411 |
+
|
| 412 |
+
# Compute statistics
|
| 413 |
+
statistics = compute_statistics(evaluation_results)
|
| 414 |
+
|
| 415 |
+
# Print summary
|
| 416 |
+
print("\n" + "=" * 80)
|
| 417 |
+
print("EVALUATION SUMMARY")
|
| 418 |
+
print("=" * 80)
|
| 419 |
+
print(f"Task: {evaluation_results['task_name']}")
|
| 420 |
+
print(f"Total demos: {evaluation_results['total_demos']}")
|
| 421 |
+
print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
|
| 422 |
+
print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
|
| 423 |
+
|
| 424 |
+
if statistics:
|
| 425 |
+
print("\n" + "-" * 80)
|
| 426 |
+
print("SUCCESS FRAME VALUE STATISTICS")
|
| 427 |
+
print("-" * 80)
|
| 428 |
+
print(f"Mean: {statistics['last_value_mean']:.2f}")
|
| 429 |
+
print(f"Std Dev: {statistics['last_value_std']:.2f}")
|
| 430 |
+
print(f"Median: {statistics['last_value_median']:.2f}")
|
| 431 |
+
print(f"Min: {statistics['last_value_min']:.2f}")
|
| 432 |
+
print(f"Max: {statistics['last_value_max']:.2f}")
|
| 433 |
+
print(f"Q25: {statistics['last_value_q25']:.2f}")
|
| 434 |
+
print(f"Q75: {statistics['last_value_q75']:.2f}")
|
| 435 |
+
|
| 436 |
+
print("\n" + "-" * 80)
|
| 437 |
+
print("THRESHOLD ANALYSIS")
|
| 438 |
+
print("-" * 80)
|
| 439 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 440 |
+
count = statistics[f"count_above_{threshold}"]
|
| 441 |
+
percent = statistics[f"percent_above_{threshold}"]
|
| 442 |
+
print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
|
| 443 |
+
|
| 444 |
+
print("\n" + "-" * 80)
|
| 445 |
+
print(f"Mean latency: {statistics['mean_latency']:.2f}s")
|
| 446 |
+
print("-" * 80)
|
| 447 |
+
|
| 448 |
+
# Save results
|
| 449 |
+
save_results(evaluation_results, statistics, output_dir)
|
| 450 |
+
|
| 451 |
+
# Create plots
|
| 452 |
+
if evaluation_results["results"]:
|
| 453 |
+
plot_value_distribution(evaluation_results, output_dir)
|
| 454 |
+
else:
|
| 455 |
+
print("\nNo successful evaluations to plot.")
|
| 456 |
+
|
| 457 |
+
print("\n" + "=" * 80)
|
| 458 |
+
print("EVALUATION COMPLETE")
|
| 459 |
+
print("=" * 80)
|
| 460 |
+
|
| 461 |
+
return 0
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
if __name__ == "__main__":
|
| 465 |
+
sys.exit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008151542.py
ADDED
|
@@ -0,0 +1,466 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 12 |
+
|
| 13 |
+
Example:
|
| 14 |
+
python evaluate_test_demo_values.py \
|
| 15 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 16 |
+
--output-dir evaluation_results \
|
| 17 |
+
--base-url http://localhost:8111
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import base64
|
| 24 |
+
import json
|
| 25 |
+
import os
|
| 26 |
+
import glob
|
| 27 |
+
import sys
|
| 28 |
+
import time
|
| 29 |
+
from io import BytesIO
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
from typing import Dict, List, Optional
|
| 32 |
+
|
| 33 |
+
import matplotlib.pyplot as plt
|
| 34 |
+
import numpy as np
|
| 35 |
+
import requests
|
| 36 |
+
from PIL import Image
|
| 37 |
+
from tqdm import tqdm
|
| 38 |
+
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
# Helpers
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
|
| 43 |
+
def sample_fixed_interval_frames(image_list, num_frames):
|
| 44 |
+
# sample num_frames frames from image_list
|
| 45 |
+
# sample with equal interval while also ensuring the first and the last frames are included
|
| 46 |
+
if len(image_list) == 0:
|
| 47 |
+
raise ValueError("image_list is empty")
|
| 48 |
+
elif len(image_list) == 1:
|
| 49 |
+
return [image_list[0]] * num_frames
|
| 50 |
+
elif num_frames == 2:
|
| 51 |
+
return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
|
| 52 |
+
elif num_frames == 3:
|
| 53 |
+
return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
|
| 54 |
+
else:
|
| 55 |
+
total_frames = len(image_list)
|
| 56 |
+
indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
|
| 57 |
+
sampled_frames = [image_list[i] for i in indices]
|
| 58 |
+
return sampled_frames
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
num_frames_for_reference = 8
|
| 62 |
+
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
|
| 63 |
+
libero_10_task_list = [
|
| 64 |
+
"KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
|
| 65 |
+
"KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
|
| 66 |
+
"KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
|
| 67 |
+
"KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
|
| 68 |
+
"LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
|
| 69 |
+
"LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
|
| 70 |
+
"LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
|
| 71 |
+
"LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
|
| 72 |
+
"LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
|
| 73 |
+
"STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
|
| 74 |
+
]
|
| 75 |
+
reference_frames_dict = {}
|
| 76 |
+
for task_name in libero_10_task_list:
|
| 77 |
+
ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
|
| 78 |
+
ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
|
| 79 |
+
ref_frm_file_list.sort()
|
| 80 |
+
reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
|
| 81 |
+
reference_frames_temp = [np.array(Image.open(frame)) for frame in reference_frames_temp]
|
| 82 |
+
reference_frames_dict[task_name] = reference_frames_temp
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def read_manifest(manifest_path: Path) -> Dict:
|
| 86 |
+
"""Read the test demo manifest JSON file."""
|
| 87 |
+
if not manifest_path.is_file():
|
| 88 |
+
raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
|
| 89 |
+
|
| 90 |
+
with manifest_path.open("r", encoding="utf-8") as f:
|
| 91 |
+
manifest_data = json.load(f)
|
| 92 |
+
|
| 93 |
+
# Convert relative paths to absolute paths
|
| 94 |
+
manifest_dir = manifest_path.parent
|
| 95 |
+
for demo in manifest_data.get("demos", []):
|
| 96 |
+
demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
|
| 97 |
+
|
| 98 |
+
return manifest_data
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def image_to_base64(path: Path) -> str:
|
| 102 |
+
"""Convert an image file to base64 encoded JPEG."""
|
| 103 |
+
with Image.open(path) as img:
|
| 104 |
+
img = img.convert("RGB")
|
| 105 |
+
buffer = BytesIO()
|
| 106 |
+
img.save(buffer, format="JPEG", quality=95)
|
| 107 |
+
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def encode_images(paths: List[str]) -> List[str]:
|
| 111 |
+
"""Encode a list of image paths to base64."""
|
| 112 |
+
return [image_to_base64(Path(p)) for p in paths]
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def call_trajectory_critic(
|
| 116 |
+
session: requests.Session,
|
| 117 |
+
base_url: str,
|
| 118 |
+
task: str,
|
| 119 |
+
frames_b64: List[str],
|
| 120 |
+
reference_b64: Optional[List[str]],
|
| 121 |
+
timeout: float,
|
| 122 |
+
) -> Dict:
|
| 123 |
+
"""Call the VLAC trajectory-critic endpoint."""
|
| 124 |
+
payload = {
|
| 125 |
+
"task": task,
|
| 126 |
+
"frames": frames_b64,
|
| 127 |
+
"reference": reference_b64,
|
| 128 |
+
"ref_num": len(reference_b64 or []),
|
| 129 |
+
"skip": 1,
|
| 130 |
+
"batch_size": min(len(frames_b64), 8),
|
| 131 |
+
"think": False,
|
| 132 |
+
"return_video": False,
|
| 133 |
+
}
|
| 134 |
+
start = time.time()
|
| 135 |
+
resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
|
| 136 |
+
resp.raise_for_status()
|
| 137 |
+
result = resp.json()
|
| 138 |
+
result["latency_sec"] = time.time() - start
|
| 139 |
+
return result
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
# ---------------------------------------------------------------------------
|
| 143 |
+
# Evaluation
|
| 144 |
+
# ---------------------------------------------------------------------------
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def evaluate_demos(
|
| 148 |
+
manifest_data: Dict,
|
| 149 |
+
base_url: str,
|
| 150 |
+
timeout: float,
|
| 151 |
+
use_reference: bool = False,
|
| 152 |
+
) -> Dict[str, any]:
|
| 153 |
+
"""Evaluate all demos and collect value statistics."""
|
| 154 |
+
session = requests.Session()
|
| 155 |
+
task_name = manifest_data.get("task_name", "")
|
| 156 |
+
demos = manifest_data.get("demos", [])
|
| 157 |
+
|
| 158 |
+
results = []
|
| 159 |
+
failed_demos = []
|
| 160 |
+
|
| 161 |
+
print(f"\nEvaluating {len(demos)} test demonstrations...")
|
| 162 |
+
print(f"Task: {task_name}")
|
| 163 |
+
print(f"Use reference: {use_reference}\n")
|
| 164 |
+
|
| 165 |
+
for demo in tqdm(demos, desc="Processing demos"):
|
| 166 |
+
demo_name = demo["demo_name"]
|
| 167 |
+
frame_paths = demo["frame_paths"]
|
| 168 |
+
|
| 169 |
+
try:
|
| 170 |
+
# Encode frames
|
| 171 |
+
frames_b64 = encode_images(frame_paths)
|
| 172 |
+
|
| 173 |
+
# For now, no reference trajectory (can be added later)
|
| 174 |
+
print(f"Using reference frames for task {task_name}")
|
| 175 |
+
reference_b64 = reference_frames_dict[task_name]
|
| 176 |
+
|
| 177 |
+
# Call VLAC service
|
| 178 |
+
result = call_trajectory_critic(
|
| 179 |
+
session=session,
|
| 180 |
+
base_url=base_url,
|
| 181 |
+
task=task_name,
|
| 182 |
+
frames_b64=frames_b64,
|
| 183 |
+
reference_b64=reference_b64,
|
| 184 |
+
timeout=timeout,
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
# Extract values
|
| 188 |
+
value_list = result.get("value_list", [])
|
| 189 |
+
if not value_list:
|
| 190 |
+
print(f"\n[warn] No values returned for demo {demo_name}")
|
| 191 |
+
failed_demos.append(demo_name)
|
| 192 |
+
continue
|
| 193 |
+
|
| 194 |
+
# Record results
|
| 195 |
+
demo_result = {
|
| 196 |
+
"demo_name": demo_name,
|
| 197 |
+
"total_frames": demo["total_frames"],
|
| 198 |
+
"success_index": demo["success_index"],
|
| 199 |
+
"num_sampled_frames": len(frame_paths),
|
| 200 |
+
"value_list": value_list,
|
| 201 |
+
"last_value": value_list[-1], # The critical value for success frame
|
| 202 |
+
"mean_value": float(np.mean(value_list)),
|
| 203 |
+
"std_value": float(np.std(value_list)),
|
| 204 |
+
"latency_sec": result.get("latency_sec", 0.0),
|
| 205 |
+
}
|
| 206 |
+
results.append(demo_result)
|
| 207 |
+
|
| 208 |
+
except requests.RequestException as exc:
|
| 209 |
+
print(f"\n[error] Request failed for demo {demo_name}: {exc}")
|
| 210 |
+
failed_demos.append(demo_name)
|
| 211 |
+
except Exception as exc:
|
| 212 |
+
print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
|
| 213 |
+
failed_demos.append(demo_name)
|
| 214 |
+
|
| 215 |
+
return {
|
| 216 |
+
"task_name": task_name,
|
| 217 |
+
"total_demos": len(demos),
|
| 218 |
+
"successful_evals": len(results),
|
| 219 |
+
"failed_demos": failed_demos,
|
| 220 |
+
"results": results,
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
|
| 225 |
+
"""Compute summary statistics from evaluation results."""
|
| 226 |
+
results = evaluation_results["results"]
|
| 227 |
+
if not results:
|
| 228 |
+
return {}
|
| 229 |
+
|
| 230 |
+
last_values = [r["last_value"] for r in results]
|
| 231 |
+
mean_values = [r["mean_value"] for r in results]
|
| 232 |
+
latencies = [r["latency_sec"] for r in results]
|
| 233 |
+
|
| 234 |
+
stats = {
|
| 235 |
+
"last_value_mean": float(np.mean(last_values)),
|
| 236 |
+
"last_value_std": float(np.std(last_values)),
|
| 237 |
+
"last_value_min": float(np.min(last_values)),
|
| 238 |
+
"last_value_max": float(np.max(last_values)),
|
| 239 |
+
"last_value_median": float(np.median(last_values)),
|
| 240 |
+
"last_value_q25": float(np.percentile(last_values, 25)),
|
| 241 |
+
"last_value_q75": float(np.percentile(last_values, 75)),
|
| 242 |
+
"mean_latency": float(np.mean(latencies)),
|
| 243 |
+
"total_evaluated": len(results),
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
# Count how many demos have last_value >= various thresholds
|
| 247 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 248 |
+
count = sum(1 for v in last_values if v >= threshold)
|
| 249 |
+
stats[f"count_above_{threshold}"] = count
|
| 250 |
+
stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
|
| 251 |
+
|
| 252 |
+
return stats
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
|
| 256 |
+
"""Create visualization plots for value distribution."""
|
| 257 |
+
results = evaluation_results["results"]
|
| 258 |
+
if not results:
|
| 259 |
+
print("No results to plot")
|
| 260 |
+
return
|
| 261 |
+
|
| 262 |
+
task_name = evaluation_results["task_name"]
|
| 263 |
+
last_values = [r["last_value"] for r in results]
|
| 264 |
+
|
| 265 |
+
# Create figure with multiple subplots
|
| 266 |
+
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
| 267 |
+
fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
|
| 268 |
+
|
| 269 |
+
# 1. Histogram of last values
|
| 270 |
+
ax1 = axes[0, 0]
|
| 271 |
+
ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
|
| 272 |
+
ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 273 |
+
ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 274 |
+
ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
|
| 275 |
+
ax1.set_ylabel('Frequency', fontsize=12)
|
| 276 |
+
ax1.set_title('Distribution of Success Frame Values', fontsize=14)
|
| 277 |
+
ax1.legend()
|
| 278 |
+
ax1.grid(True, alpha=0.3)
|
| 279 |
+
|
| 280 |
+
# 2. Box plot of last values
|
| 281 |
+
ax2 = axes[0, 1]
|
| 282 |
+
box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
|
| 283 |
+
for patch in box_data['boxes']:
|
| 284 |
+
patch.set_facecolor('lightblue')
|
| 285 |
+
ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 286 |
+
ax2.set_ylabel('Value', fontsize=12)
|
| 287 |
+
ax2.set_title('Success Frame Value Distribution', fontsize=14)
|
| 288 |
+
ax2.legend()
|
| 289 |
+
ax2.grid(True, alpha=0.3, axis='y')
|
| 290 |
+
|
| 291 |
+
# 3. Value progression across demos
|
| 292 |
+
ax3 = axes[1, 0]
|
| 293 |
+
demo_indices = range(len(results))
|
| 294 |
+
ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
|
| 295 |
+
ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 296 |
+
ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 297 |
+
ax3.set_xlabel('Demo Index', fontsize=12)
|
| 298 |
+
ax3.set_ylabel('Last Frame Value', fontsize=12)
|
| 299 |
+
ax3.set_title('Success Frame Values Across Demos', fontsize=14)
|
| 300 |
+
ax3.legend()
|
| 301 |
+
ax3.grid(True, alpha=0.3)
|
| 302 |
+
|
| 303 |
+
# 4. Cumulative distribution
|
| 304 |
+
ax4 = axes[1, 1]
|
| 305 |
+
sorted_values = np.sort(last_values)
|
| 306 |
+
cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
|
| 307 |
+
ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
|
| 308 |
+
ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 309 |
+
ax4.set_xlabel('Success Frame Value', fontsize=12)
|
| 310 |
+
ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
|
| 311 |
+
ax4.set_title('Cumulative Distribution', fontsize=14)
|
| 312 |
+
ax4.legend()
|
| 313 |
+
ax4.grid(True, alpha=0.3)
|
| 314 |
+
|
| 315 |
+
plt.tight_layout()
|
| 316 |
+
|
| 317 |
+
# Save the plot
|
| 318 |
+
plot_path = output_dir / f"{task_name}_value_distribution.png"
|
| 319 |
+
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
|
| 320 |
+
print(f"\nPlot saved to: {plot_path}")
|
| 321 |
+
|
| 322 |
+
# Also save a PDF version
|
| 323 |
+
pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
|
| 324 |
+
plt.savefig(pdf_path, bbox_inches='tight')
|
| 325 |
+
print(f"PDF saved to: {pdf_path}")
|
| 326 |
+
|
| 327 |
+
plt.close()
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
|
| 331 |
+
"""Save evaluation results and statistics to JSON files."""
|
| 332 |
+
task_name = evaluation_results["task_name"]
|
| 333 |
+
|
| 334 |
+
# Save detailed results
|
| 335 |
+
results_path = output_dir / f"{task_name}_evaluation_results.json"
|
| 336 |
+
with results_path.open("w", encoding="utf-8") as f:
|
| 337 |
+
json.dump(evaluation_results, f, indent=2)
|
| 338 |
+
print(f"\nDetailed results saved to: {results_path}")
|
| 339 |
+
|
| 340 |
+
# Save summary statistics
|
| 341 |
+
stats_path = output_dir / f"{task_name}_statistics.json"
|
| 342 |
+
with stats_path.open("w", encoding="utf-8") as f:
|
| 343 |
+
json.dump(statistics, f, indent=2)
|
| 344 |
+
print(f"Statistics saved to: {stats_path}")
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
# ---------------------------------------------------------------------------
|
| 348 |
+
# CLI
|
| 349 |
+
# ---------------------------------------------------------------------------
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
def parse_args() -> argparse.Namespace:
|
| 353 |
+
parser = argparse.ArgumentParser(
|
| 354 |
+
description="Evaluate value estimation for test demonstrations"
|
| 355 |
+
)
|
| 356 |
+
parser.add_argument(
|
| 357 |
+
"--manifest-path",
|
| 358 |
+
type=Path,
|
| 359 |
+
default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
|
| 360 |
+
help="Path to the test manifest JSON file",
|
| 361 |
+
)
|
| 362 |
+
parser.add_argument(
|
| 363 |
+
"--output-dir",
|
| 364 |
+
type=Path,
|
| 365 |
+
default="evaluation_results",
|
| 366 |
+
help="Directory to save evaluation results and plots",
|
| 367 |
+
)
|
| 368 |
+
parser.add_argument(
|
| 369 |
+
"--base-url",
|
| 370 |
+
default="http://localhost:8111",
|
| 371 |
+
help="VLAC service base URL (default: http://localhost:8111)",
|
| 372 |
+
)
|
| 373 |
+
parser.add_argument(
|
| 374 |
+
"--timeout",
|
| 375 |
+
type=float,
|
| 376 |
+
default=30.0,
|
| 377 |
+
help="HTTP request timeout in seconds (default: 30.0)",
|
| 378 |
+
)
|
| 379 |
+
parser.add_argument(
|
| 380 |
+
"--use-reference",
|
| 381 |
+
action="store_true",
|
| 382 |
+
help="Use reference trajectory (if available)",
|
| 383 |
+
)
|
| 384 |
+
return parser.parse_args()
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
def main() -> int:
|
| 388 |
+
args = parse_args()
|
| 389 |
+
|
| 390 |
+
# Read manifest
|
| 391 |
+
try:
|
| 392 |
+
manifest_data = read_manifest(args.manifest_path)
|
| 393 |
+
except FileNotFoundError as exc:
|
| 394 |
+
print(f"Error: {exc}")
|
| 395 |
+
return 1
|
| 396 |
+
|
| 397 |
+
# Create output directory
|
| 398 |
+
output_dir = args.output_dir.expanduser()
|
| 399 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 400 |
+
|
| 401 |
+
# Run evaluation
|
| 402 |
+
print("=" * 80)
|
| 403 |
+
print("VLAC Value Estimation Evaluation")
|
| 404 |
+
print("=" * 80)
|
| 405 |
+
|
| 406 |
+
evaluation_results = evaluate_demos(
|
| 407 |
+
manifest_data=manifest_data,
|
| 408 |
+
base_url=args.base_url,
|
| 409 |
+
timeout=args.timeout,
|
| 410 |
+
use_reference=args.use_reference,
|
| 411 |
+
)
|
| 412 |
+
|
| 413 |
+
# Compute statistics
|
| 414 |
+
statistics = compute_statistics(evaluation_results)
|
| 415 |
+
|
| 416 |
+
# Print summary
|
| 417 |
+
print("\n" + "=" * 80)
|
| 418 |
+
print("EVALUATION SUMMARY")
|
| 419 |
+
print("=" * 80)
|
| 420 |
+
print(f"Task: {evaluation_results['task_name']}")
|
| 421 |
+
print(f"Total demos: {evaluation_results['total_demos']}")
|
| 422 |
+
print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
|
| 423 |
+
print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
|
| 424 |
+
|
| 425 |
+
if statistics:
|
| 426 |
+
print("\n" + "-" * 80)
|
| 427 |
+
print("SUCCESS FRAME VALUE STATISTICS")
|
| 428 |
+
print("-" * 80)
|
| 429 |
+
print(f"Mean: {statistics['last_value_mean']:.2f}")
|
| 430 |
+
print(f"Std Dev: {statistics['last_value_std']:.2f}")
|
| 431 |
+
print(f"Median: {statistics['last_value_median']:.2f}")
|
| 432 |
+
print(f"Min: {statistics['last_value_min']:.2f}")
|
| 433 |
+
print(f"Max: {statistics['last_value_max']:.2f}")
|
| 434 |
+
print(f"Q25: {statistics['last_value_q25']:.2f}")
|
| 435 |
+
print(f"Q75: {statistics['last_value_q75']:.2f}")
|
| 436 |
+
|
| 437 |
+
print("\n" + "-" * 80)
|
| 438 |
+
print("THRESHOLD ANALYSIS")
|
| 439 |
+
print("-" * 80)
|
| 440 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 441 |
+
count = statistics[f"count_above_{threshold}"]
|
| 442 |
+
percent = statistics[f"percent_above_{threshold}"]
|
| 443 |
+
print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
|
| 444 |
+
|
| 445 |
+
print("\n" + "-" * 80)
|
| 446 |
+
print(f"Mean latency: {statistics['mean_latency']:.2f}s")
|
| 447 |
+
print("-" * 80)
|
| 448 |
+
|
| 449 |
+
# Save results
|
| 450 |
+
save_results(evaluation_results, statistics, output_dir)
|
| 451 |
+
|
| 452 |
+
# Create plots
|
| 453 |
+
if evaluation_results["results"]:
|
| 454 |
+
plot_value_distribution(evaluation_results, output_dir)
|
| 455 |
+
else:
|
| 456 |
+
print("\nNo successful evaluations to plot.")
|
| 457 |
+
|
| 458 |
+
print("\n" + "=" * 80)
|
| 459 |
+
print("EVALUATION COMPLETE")
|
| 460 |
+
print("=" * 80)
|
| 461 |
+
|
| 462 |
+
return 0
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
if __name__ == "__main__":
|
| 466 |
+
sys.exit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008151723.py
ADDED
|
@@ -0,0 +1,466 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 12 |
+
|
| 13 |
+
Example:
|
| 14 |
+
python evaluate_test_demo_values.py \
|
| 15 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 16 |
+
--output-dir evaluation_results \
|
| 17 |
+
--base-url http://localhost:8111
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import base64
|
| 24 |
+
import json
|
| 25 |
+
import os
|
| 26 |
+
import glob
|
| 27 |
+
import sys
|
| 28 |
+
import time
|
| 29 |
+
from io import BytesIO
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
from typing import Dict, List, Optional
|
| 32 |
+
|
| 33 |
+
import matplotlib.pyplot as plt
|
| 34 |
+
import numpy as np
|
| 35 |
+
import requests
|
| 36 |
+
from PIL import Image
|
| 37 |
+
from tqdm import tqdm
|
| 38 |
+
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
# Helpers
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
|
| 43 |
+
def sample_fixed_interval_frames(image_list, num_frames):
    """Sample ``num_frames`` frames from ``image_list`` at equal intervals.

    The first and last frames are always included when the list has at least
    two entries; a single-entry list is simply repeated ``num_frames`` times.

    Args:
        image_list: Sequence of frames (file paths or arrays) to sample from.
        num_frames: Number of frames to return.

    Returns:
        List of ``num_frames`` entries drawn from ``image_list``.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        return [image_list[0]] * num_frames
    # np.linspace with both endpoints guarantees the first and last frames are
    # included and the remaining indices are evenly spaced.  This replaces the
    # old hand-written num_frames == 2/3 special cases; the num_frames == 3
    # case was buggy (it returned image_list[1] instead of the middle frame).
    indices = np.linspace(start=0, stop=len(image_list) - 1, num=num_frames, dtype=int)
    return [image_list[i] for i in indices]
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Number of frames sampled from each expert demo to use as a reference clip.
num_frames_for_reference = 8
# Root directory holding one "<task>_demo" folder of PNG frames per task.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
libero_10_task_list = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
]
# Task name -> list of reference frames, decoded eagerly as numpy arrays.
reference_frames_dict = {}
for task_name in libero_10_task_list:
    task_dir = os.path.join(ref_frm_root_dir, task_name + "_demo")
    frame_files = sorted(glob.glob(os.path.join(task_dir, "*.png")))
    sampled_files = sample_fixed_interval_frames(frame_files, num_frames_for_reference)
    reference_frames_dict[task_name] = [np.array(Image.open(p)) for p in sampled_files]
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and absolutize its frame paths.

    Frame paths in the manifest are stored relative to the manifest file
    itself; they are rewritten in place to absolute path strings.

    Raises:
        FileNotFoundError: If ``manifest_path`` does not exist.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    manifest_data = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Rebase every demo's frame paths onto the manifest's directory.
    base_dir = manifest_path.parent
    for demo in manifest_data.get("demos", []):
        demo["frame_paths"] = [str(base_dir / rel) for rel in demo["frame_paths"]]

    return manifest_data
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def image_to_base64(path: Path) -> str:
    """Encode the image at *path* as a base64 string of JPEG bytes (quality 95)."""
    buffer = BytesIO()
    with Image.open(path) as img:
        img.convert("RGB").save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in *paths* (see image_to_base64)."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the service's JSON response augmented with a ``latency_sec``
    field measuring the round-trip time.  Raises ``requests.HTTPError``
    on a non-2xx response.
    """
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": 0 if reference_b64 is None else len(reference_b64),
        "skip": 1,
        # The service processes at most 8 frames per batch.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started_at = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started_at
    return payload
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
# ---------------------------------------------------------------------------
|
| 143 |
+
# Evaluation
|
| 144 |
+
# ---------------------------------------------------------------------------
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _array_to_base64(frame) -> str:
    """Encode one image frame (H, W, C uint8 numpy array) as a base64 JPEG string."""
    buffer = BytesIO()
    Image.fromarray(frame).convert("RGB").save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, Any]:
    """Evaluate all demos in the manifest and collect per-demo value statistics.

    Args:
        manifest_data: Parsed manifest (see ``read_manifest``) with
            ``task_name`` and ``demos`` entries.
        base_url: VLAC service base URL.
        timeout: Per-request timeout in seconds.
        use_reference: When True, send the pre-loaded expert reference frames
            for the task along with each demo.

    Returns:
        Dict with ``task_name``, ``total_demos``, ``successful_evals``,
        ``failed_demos`` and the per-demo ``results`` list.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            frames_b64 = encode_images(frame_paths)

            # BUG FIX: reference_frames_dict holds raw numpy arrays, which are
            # not JSON-serializable; encode them to base64 JPEG before putting
            # them in the request payload.  Also honor the use_reference flag,
            # which was previously ignored (references were always sent).
            reference_b64 = None
            if use_reference and task_name in reference_frames_dict:
                print(f"Using reference frames for task {task_name}")
                reference_b64 = [
                    _array_to_base64(frame) for frame in reference_frames_dict[task_name]
                ]

            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            demo_result = {
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # the critical success-frame value
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            }
            results.append(demo_result)

        # Restored from the commented-out handlers: one bad demo should be
        # recorded as failed, not abort the entire evaluation run.
        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Compute summary statistics over the per-demo evaluation results.

    Returns an empty dict when there are no successful results; otherwise a
    flat dict of last-value summary stats, mean request latency, and the
    count/percentage of demos whose success-frame value reached each
    threshold in 80..100.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    # NOTE: the old unused ``mean_values`` list was removed.
    last_values = [r["last_value"] for r in results]
    latencies = [r["latency_sec"] for r in results]

    stats = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Count how many demos have last_value >= various thresholds
    for threshold in [80, 85, 90, 95, 100]:
        count = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = count
        stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)

    return stats
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Render a 2x2 panel of success-frame value plots and save PNG + PDF."""
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    last_values = [r["last_value"] for r in results]
    mean_last = np.mean(last_values)

    fig, ((hist_ax, box_ax), (scatter_ax, cdf_ax)) = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # Panel 1: histogram of last (success-frame) values.
    hist_ax.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    hist_ax.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    hist_ax.axvline(mean_last, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_last:.1f})')
    hist_ax.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    hist_ax.set_ylabel('Frequency', fontsize=12)
    hist_ax.set_title('Distribution of Success Frame Values', fontsize=14)
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Panel 2: box plot of the same values.
    box_parts = box_ax.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for box_patch in box_parts['boxes']:
        box_patch.set_facecolor('lightblue')
    box_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    box_ax.set_ylabel('Value', fontsize=12)
    box_ax.set_title('Success Frame Value Distribution', fontsize=14)
    box_ax.legend()
    box_ax.grid(True, alpha=0.3, axis='y')

    # Panel 3: per-demo scatter of success-frame values.
    scatter_ax.scatter(range(len(results)), last_values, alpha=0.6, s=50, c='steelblue')
    scatter_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    scatter_ax.axhline(mean_last, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_last:.1f})')
    scatter_ax.set_xlabel('Demo Index', fontsize=12)
    scatter_ax.set_ylabel('Last Frame Value', fontsize=12)
    scatter_ax.set_title('Success Frame Values Across Demos', fontsize=14)
    scatter_ax.legend()
    scatter_ax.grid(True, alpha=0.3)

    # Panel 4: empirical cumulative distribution.
    ordered = np.sort(last_values)
    cumulative = np.arange(1, len(ordered) + 1) / len(ordered) * 100
    cdf_ax.plot(ordered, cumulative, linewidth=2, color='steelblue')
    cdf_ax.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    cdf_ax.set_xlabel('Success Frame Value', fontsize=12)
    cdf_ax.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    cdf_ax.set_title('Cumulative Distribution', fontsize=14)
    cdf_ax.legend()
    cdf_ax.grid(True, alpha=0.3)

    plt.tight_layout()

    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    plt.close()
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write the detailed results and the summary statistics as JSON files."""
    task_name = evaluation_results["task_name"]

    results_path = output_dir / f"{task_name}_evaluation_results.json"
    results_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {results_path}")

    stats_path = output_dir / f"{task_name}_statistics.json"
    stats_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {stats_path}")
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
# ---------------------------------------------------------------------------
|
| 348 |
+
# CLI
|
| 349 |
+
# ---------------------------------------------------------------------------
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the evaluation script.

    BUG FIX: argparse applies ``type=`` only to values parsed from the
    command line, never to ``default=``.  The old string defaults therefore
    reached the rest of the program as ``str`` and crashed later on
    ``manifest_path.is_file()`` / ``output_dir.expanduser()``.  The defaults
    are now real ``Path`` objects.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )
    parser.add_argument(
        "--manifest-path",
        type=Path,
        default=Path(
            "toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json"
        ),
        help="Path to the test manifest JSON file",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("evaluation_results"),
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )
    return parser.parse_args()
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
def main() -> int:
    """CLI entry point: read the manifest, query the critic, report and plot.

    Returns:
        0 on success, 1 when the manifest file cannot be found.
    """
    args = parse_args()

    # Read manifest
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    # Create output directory
    # NOTE(review): this assumes args.output_dir is a Path; argparse does not
    # apply type= to string defaults, so confirm the default in parse_args is
    # a Path object, not a plain string.
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Run evaluation
    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is {} when no demo evaluated successfully; skip the detail block then.
    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        # Thresholds mirror the keys produced by compute_statistics.
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
if __name__ == "__main__":
    sys.exit(main())  # propagate main()'s exit status to the shell
|
Dev/.history/testing/evaluate_test_demo_values_20251008151816.py
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 12 |
+
|
| 13 |
+
Example:
|
| 14 |
+
python evaluate_test_demo_values.py \
|
| 15 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 16 |
+
--output-dir evaluation_results \
|
| 17 |
+
--base-url http://localhost:8111
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import base64
|
| 24 |
+
import json
|
| 25 |
+
import os
|
| 26 |
+
import glob
|
| 27 |
+
import sys
|
| 28 |
+
import time
|
| 29 |
+
from io import BytesIO
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
from typing import Dict, List, Optional
|
| 32 |
+
|
| 33 |
+
import matplotlib.pyplot as plt
|
| 34 |
+
import numpy as np
|
| 35 |
+
import requests
|
| 36 |
+
from PIL import Image
|
| 37 |
+
from tqdm import tqdm
|
| 38 |
+
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
# Helpers
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
|
| 43 |
+
def sample_fixed_interval_frames(image_list, num_frames):
    """Sample ``num_frames`` frames from ``image_list`` at equal intervals.

    The first and last frames are always included when the list has at least
    two entries; a single-entry list is simply repeated ``num_frames`` times.

    Args:
        image_list: Sequence of frames (file paths or arrays) to sample from.
        num_frames: Number of frames to return.

    Returns:
        List of ``num_frames`` entries drawn from ``image_list``.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        return [image_list[0]] * num_frames
    # np.linspace with both endpoints guarantees the first and last frames are
    # included and the remaining indices are evenly spaced.  This replaces the
    # old hand-written num_frames == 2/3 special cases; the num_frames == 3
    # case was buggy (it returned image_list[1] instead of the middle frame).
    indices = np.linspace(start=0, stop=len(image_list) - 1, num=num_frames, dtype=int)
    return [image_list[i] for i in indices]
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Number of frames sampled from each expert demo to use as a reference clip.
num_frames_for_reference = 8
# Root directory holding one "<task>_demo" folder of PNG frames per task.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
libero_10_task_list = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
]
# Task name -> list of reference frame *file paths* (base64-encoded on demand).
reference_frames_dict = {}
for task_name in libero_10_task_list:
    task_dir = os.path.join(ref_frm_root_dir, task_name + "_demo")
    frame_files = sorted(glob.glob(os.path.join(task_dir, "*.png")))
    reference_frames_dict[task_name] = sample_fixed_interval_frames(
        frame_files, num_frames_for_reference
    )
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and absolutize its frame paths.

    Frame paths in the manifest are stored relative to the manifest file
    itself; they are rewritten in place to absolute path strings.

    Raises:
        FileNotFoundError: If ``manifest_path`` does not exist.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    manifest_data = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Rebase every demo's frame paths onto the manifest's directory.
    base_dir = manifest_path.parent
    for demo in manifest_data.get("demos", []):
        demo["frame_paths"] = [str(base_dir / rel) for rel in demo["frame_paths"]]

    return manifest_data
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def image_to_base64(path: Path) -> str:
    """Encode the image at *path* as a base64 string of JPEG bytes (quality 95)."""
    buffer = BytesIO()
    with Image.open(path) as img:
        img.convert("RGB").save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in *paths* (see image_to_base64)."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the service's JSON response augmented with a ``latency_sec``
    field measuring the round-trip time.  Raises ``requests.HTTPError``
    on a non-2xx response.
    """
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": 0 if reference_b64 is None else len(reference_b64),
        "skip": 1,
        # The service processes at most 8 frames per batch.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started_at = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started_at
    return payload
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# ---------------------------------------------------------------------------
|
| 142 |
+
# Evaluation
|
| 143 |
+
# ---------------------------------------------------------------------------
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, Any]:
    """Evaluate all demos in the manifest and collect per-demo value statistics.

    Args:
        manifest_data: Parsed manifest (see ``read_manifest``) with
            ``task_name`` and ``demos`` entries.
        base_url: VLAC service base URL.
        timeout: Per-request timeout in seconds.
        use_reference: When True, send the pre-loaded expert reference frames
            for the task along with each demo.

    Returns:
        Dict with ``task_name``, ``total_demos``, ``successful_evals``,
        ``failed_demos`` and the per-demo ``results`` list.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            frames_b64 = encode_images(frame_paths)

            # BUG FIX: honor the use_reference flag, which was previously
            # ignored (reference frames were always sent).
            reference_b64 = None
            if use_reference and task_name in reference_frames_dict:
                print(f"Using reference frames for task {task_name}")
                reference_b64 = encode_images(reference_frames_dict[task_name])

            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            demo_result = {
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # the critical success-frame value
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            }
            results.append(demo_result)

        # Restored from the commented-out handlers: one bad demo should be
        # recorded as failed, not abort the entire evaluation run.
        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
|
| 224 |
+
"""Compute summary statistics from evaluation results."""
|
| 225 |
+
results = evaluation_results["results"]
|
| 226 |
+
if not results:
|
| 227 |
+
return {}
|
| 228 |
+
|
| 229 |
+
last_values = [r["last_value"] for r in results]
|
| 230 |
+
mean_values = [r["mean_value"] for r in results]
|
| 231 |
+
latencies = [r["latency_sec"] for r in results]
|
| 232 |
+
|
| 233 |
+
stats = {
|
| 234 |
+
"last_value_mean": float(np.mean(last_values)),
|
| 235 |
+
"last_value_std": float(np.std(last_values)),
|
| 236 |
+
"last_value_min": float(np.min(last_values)),
|
| 237 |
+
"last_value_max": float(np.max(last_values)),
|
| 238 |
+
"last_value_median": float(np.median(last_values)),
|
| 239 |
+
"last_value_q25": float(np.percentile(last_values, 25)),
|
| 240 |
+
"last_value_q75": float(np.percentile(last_values, 75)),
|
| 241 |
+
"mean_latency": float(np.mean(latencies)),
|
| 242 |
+
"total_evaluated": len(results),
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
# Count how many demos have last_value >= various thresholds
|
| 246 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 247 |
+
count = sum(1 for v in last_values if v >= threshold)
|
| 248 |
+
stats[f"count_above_{threshold}"] = count
|
| 249 |
+
stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
|
| 250 |
+
|
| 251 |
+
return stats
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
|
| 255 |
+
"""Create visualization plots for value distribution."""
|
| 256 |
+
results = evaluation_results["results"]
|
| 257 |
+
if not results:
|
| 258 |
+
print("No results to plot")
|
| 259 |
+
return
|
| 260 |
+
|
| 261 |
+
task_name = evaluation_results["task_name"]
|
| 262 |
+
last_values = [r["last_value"] for r in results]
|
| 263 |
+
|
| 264 |
+
# Create figure with multiple subplots
|
| 265 |
+
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
| 266 |
+
fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
|
| 267 |
+
|
| 268 |
+
# 1. Histogram of last values
|
| 269 |
+
ax1 = axes[0, 0]
|
| 270 |
+
ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
|
| 271 |
+
ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 272 |
+
ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 273 |
+
ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
|
| 274 |
+
ax1.set_ylabel('Frequency', fontsize=12)
|
| 275 |
+
ax1.set_title('Distribution of Success Frame Values', fontsize=14)
|
| 276 |
+
ax1.legend()
|
| 277 |
+
ax1.grid(True, alpha=0.3)
|
| 278 |
+
|
| 279 |
+
# 2. Box plot of last values
|
| 280 |
+
ax2 = axes[0, 1]
|
| 281 |
+
box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
|
| 282 |
+
for patch in box_data['boxes']:
|
| 283 |
+
patch.set_facecolor('lightblue')
|
| 284 |
+
ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 285 |
+
ax2.set_ylabel('Value', fontsize=12)
|
| 286 |
+
ax2.set_title('Success Frame Value Distribution', fontsize=14)
|
| 287 |
+
ax2.legend()
|
| 288 |
+
ax2.grid(True, alpha=0.3, axis='y')
|
| 289 |
+
|
| 290 |
+
# 3. Value progression across demos
|
| 291 |
+
ax3 = axes[1, 0]
|
| 292 |
+
demo_indices = range(len(results))
|
| 293 |
+
ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
|
| 294 |
+
ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 295 |
+
ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 296 |
+
ax3.set_xlabel('Demo Index', fontsize=12)
|
| 297 |
+
ax3.set_ylabel('Last Frame Value', fontsize=12)
|
| 298 |
+
ax3.set_title('Success Frame Values Across Demos', fontsize=14)
|
| 299 |
+
ax3.legend()
|
| 300 |
+
ax3.grid(True, alpha=0.3)
|
| 301 |
+
|
| 302 |
+
# 4. Cumulative distribution
|
| 303 |
+
ax4 = axes[1, 1]
|
| 304 |
+
sorted_values = np.sort(last_values)
|
| 305 |
+
cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
|
| 306 |
+
ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
|
| 307 |
+
ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 308 |
+
ax4.set_xlabel('Success Frame Value', fontsize=12)
|
| 309 |
+
ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
|
| 310 |
+
ax4.set_title('Cumulative Distribution', fontsize=14)
|
| 311 |
+
ax4.legend()
|
| 312 |
+
ax4.grid(True, alpha=0.3)
|
| 313 |
+
|
| 314 |
+
plt.tight_layout()
|
| 315 |
+
|
| 316 |
+
# Save the plot
|
| 317 |
+
plot_path = output_dir / f"{task_name}_value_distribution.png"
|
| 318 |
+
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
|
| 319 |
+
print(f"\nPlot saved to: {plot_path}")
|
| 320 |
+
|
| 321 |
+
# Also save a PDF version
|
| 322 |
+
pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
|
| 323 |
+
plt.savefig(pdf_path, bbox_inches='tight')
|
| 324 |
+
print(f"PDF saved to: {pdf_path}")
|
| 325 |
+
|
| 326 |
+
plt.close()
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
|
| 330 |
+
"""Save evaluation results and statistics to JSON files."""
|
| 331 |
+
task_name = evaluation_results["task_name"]
|
| 332 |
+
|
| 333 |
+
# Save detailed results
|
| 334 |
+
results_path = output_dir / f"{task_name}_evaluation_results.json"
|
| 335 |
+
with results_path.open("w", encoding="utf-8") as f:
|
| 336 |
+
json.dump(evaluation_results, f, indent=2)
|
| 337 |
+
print(f"\nDetailed results saved to: {results_path}")
|
| 338 |
+
|
| 339 |
+
# Save summary statistics
|
| 340 |
+
stats_path = output_dir / f"{task_name}_statistics.json"
|
| 341 |
+
with stats_path.open("w", encoding="utf-8") as f:
|
| 342 |
+
json.dump(statistics, f, indent=2)
|
| 343 |
+
print(f"Statistics saved to: {stats_path}")
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
# ---------------------------------------------------------------------------
|
| 347 |
+
# CLI
|
| 348 |
+
# ---------------------------------------------------------------------------
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def parse_args() -> argparse.Namespace:
|
| 352 |
+
parser = argparse.ArgumentParser(
|
| 353 |
+
description="Evaluate value estimation for test demonstrations"
|
| 354 |
+
)
|
| 355 |
+
parser.add_argument(
|
| 356 |
+
"--manifest-path",
|
| 357 |
+
type=Path,
|
| 358 |
+
default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
|
| 359 |
+
help="Path to the test manifest JSON file",
|
| 360 |
+
)
|
| 361 |
+
parser.add_argument(
|
| 362 |
+
"--output-dir",
|
| 363 |
+
type=Path,
|
| 364 |
+
default="evaluation_results",
|
| 365 |
+
help="Directory to save evaluation results and plots",
|
| 366 |
+
)
|
| 367 |
+
parser.add_argument(
|
| 368 |
+
"--base-url",
|
| 369 |
+
default="http://localhost:8111",
|
| 370 |
+
help="VLAC service base URL (default: http://localhost:8111)",
|
| 371 |
+
)
|
| 372 |
+
parser.add_argument(
|
| 373 |
+
"--timeout",
|
| 374 |
+
type=float,
|
| 375 |
+
default=30.0,
|
| 376 |
+
help="HTTP request timeout in seconds (default: 30.0)",
|
| 377 |
+
)
|
| 378 |
+
parser.add_argument(
|
| 379 |
+
"--use-reference",
|
| 380 |
+
action="store_true",
|
| 381 |
+
help="Use reference trajectory (if available)",
|
| 382 |
+
)
|
| 383 |
+
return parser.parse_args()
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
def main() -> int:
|
| 387 |
+
args = parse_args()
|
| 388 |
+
|
| 389 |
+
# Read manifest
|
| 390 |
+
try:
|
| 391 |
+
manifest_data = read_manifest(args.manifest_path)
|
| 392 |
+
except FileNotFoundError as exc:
|
| 393 |
+
print(f"Error: {exc}")
|
| 394 |
+
return 1
|
| 395 |
+
|
| 396 |
+
# Create output directory
|
| 397 |
+
output_dir = args.output_dir.expanduser()
|
| 398 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 399 |
+
|
| 400 |
+
# Run evaluation
|
| 401 |
+
print("=" * 80)
|
| 402 |
+
print("VLAC Value Estimation Evaluation")
|
| 403 |
+
print("=" * 80)
|
| 404 |
+
|
| 405 |
+
evaluation_results = evaluate_demos(
|
| 406 |
+
manifest_data=manifest_data,
|
| 407 |
+
base_url=args.base_url,
|
| 408 |
+
timeout=args.timeout,
|
| 409 |
+
use_reference=args.use_reference,
|
| 410 |
+
)
|
| 411 |
+
|
| 412 |
+
# Compute statistics
|
| 413 |
+
statistics = compute_statistics(evaluation_results)
|
| 414 |
+
|
| 415 |
+
# Print summary
|
| 416 |
+
print("\n" + "=" * 80)
|
| 417 |
+
print("EVALUATION SUMMARY")
|
| 418 |
+
print("=" * 80)
|
| 419 |
+
print(f"Task: {evaluation_results['task_name']}")
|
| 420 |
+
print(f"Total demos: {evaluation_results['total_demos']}")
|
| 421 |
+
print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
|
| 422 |
+
print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
|
| 423 |
+
|
| 424 |
+
if statistics:
|
| 425 |
+
print("\n" + "-" * 80)
|
| 426 |
+
print("SUCCESS FRAME VALUE STATISTICS")
|
| 427 |
+
print("-" * 80)
|
| 428 |
+
print(f"Mean: {statistics['last_value_mean']:.2f}")
|
| 429 |
+
print(f"Std Dev: {statistics['last_value_std']:.2f}")
|
| 430 |
+
print(f"Median: {statistics['last_value_median']:.2f}")
|
| 431 |
+
print(f"Min: {statistics['last_value_min']:.2f}")
|
| 432 |
+
print(f"Max: {statistics['last_value_max']:.2f}")
|
| 433 |
+
print(f"Q25: {statistics['last_value_q25']:.2f}")
|
| 434 |
+
print(f"Q75: {statistics['last_value_q75']:.2f}")
|
| 435 |
+
|
| 436 |
+
print("\n" + "-" * 80)
|
| 437 |
+
print("THRESHOLD ANALYSIS")
|
| 438 |
+
print("-" * 80)
|
| 439 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 440 |
+
count = statistics[f"count_above_{threshold}"]
|
| 441 |
+
percent = statistics[f"percent_above_{threshold}"]
|
| 442 |
+
print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
|
| 443 |
+
|
| 444 |
+
print("\n" + "-" * 80)
|
| 445 |
+
print(f"Mean latency: {statistics['mean_latency']:.2f}s")
|
| 446 |
+
print("-" * 80)
|
| 447 |
+
|
| 448 |
+
# Save results
|
| 449 |
+
save_results(evaluation_results, statistics, output_dir)
|
| 450 |
+
|
| 451 |
+
# Create plots
|
| 452 |
+
if evaluation_results["results"]:
|
| 453 |
+
plot_value_distribution(evaluation_results, output_dir)
|
| 454 |
+
else:
|
| 455 |
+
print("\nNo successful evaluations to plot.")
|
| 456 |
+
|
| 457 |
+
print("\n" + "=" * 80)
|
| 458 |
+
print("EVALUATION COMPLETE")
|
| 459 |
+
print("=" * 80)
|
| 460 |
+
|
| 461 |
+
return 0
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
if __name__ == "__main__":
|
| 465 |
+
sys.exit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008152522.py
ADDED
|
@@ -0,0 +1,477 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
# Evaluate all LIBERO-10 tasks
|
| 12 |
+
python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
|
| 13 |
+
|
| 14 |
+
# Evaluate a single task
|
| 15 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 16 |
+
|
| 17 |
+
Examples:
|
| 18 |
+
# Evaluate all LIBERO-10 tasks
|
| 19 |
+
python evaluate_test_demo_values.py \
|
| 20 |
+
--process-all-tasks \
|
| 21 |
+
--manifests-root toy_test_demos_LIBERO_10 \
|
| 22 |
+
--output-dir evaluation_results_all_tasks \
|
| 23 |
+
--base-url http://localhost:8111
|
| 24 |
+
|
| 25 |
+
# Evaluate a single task
|
| 26 |
+
python evaluate_test_demo_values.py \
|
| 27 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 28 |
+
--output-dir evaluation_results \
|
| 29 |
+
--base-url http://localhost:8111
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import argparse
|
| 35 |
+
import base64
|
| 36 |
+
import json
|
| 37 |
+
import os
|
| 38 |
+
import glob
|
| 39 |
+
import sys
|
| 40 |
+
import time
|
| 41 |
+
from io import BytesIO
|
| 42 |
+
from pathlib import Path
|
| 43 |
+
from typing import Dict, List, Optional
|
| 44 |
+
|
| 45 |
+
import matplotlib.pyplot as plt
|
| 46 |
+
import numpy as np
|
| 47 |
+
import requests
|
| 48 |
+
from PIL import Image
|
| 49 |
+
from tqdm import tqdm
|
| 50 |
+
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
# Helpers
|
| 53 |
+
# ---------------------------------------------------------------------------
|
| 54 |
+
|
| 55 |
+
def sample_fixed_interval_frames(image_list, num_frames):
|
| 56 |
+
# sample num_frames frames from image_list
|
| 57 |
+
# sample with equal interval while also ensuring the first and the last frames are included
|
| 58 |
+
if len(image_list) == 0:
|
| 59 |
+
raise ValueError("image_list is empty")
|
| 60 |
+
elif len(image_list) == 1:
|
| 61 |
+
return [image_list[0]] * num_frames
|
| 62 |
+
elif num_frames == 2:
|
| 63 |
+
return [image_list[0]] * (num_frames//2) + [image_list[-1]] * (num_frames//2)
|
| 64 |
+
elif num_frames == 3:
|
| 65 |
+
return [image_list[0]] + [image_list[1]] * (num_frames-2) + [image_list[-1]]
|
| 66 |
+
else:
|
| 67 |
+
total_frames = len(image_list)
|
| 68 |
+
indices = np.linspace(start=0, stop=total_frames - 1, num=num_frames, dtype=int)
|
| 69 |
+
sampled_frames = [image_list[i] for i in indices]
|
| 70 |
+
return sampled_frames
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
num_frames_for_reference = 8
|
| 74 |
+
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
|
| 75 |
+
libero_10_task_list = [
|
| 76 |
+
"KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
|
| 77 |
+
"KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
|
| 78 |
+
"KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
|
| 79 |
+
"KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
|
| 80 |
+
"LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
|
| 81 |
+
"LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
|
| 82 |
+
"LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
|
| 83 |
+
"LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
|
| 84 |
+
"LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
|
| 85 |
+
"STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy"
|
| 86 |
+
]
|
| 87 |
+
reference_frames_dict = {}
|
| 88 |
+
for task_name in libero_10_task_list:
|
| 89 |
+
ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name+"_demo")
|
| 90 |
+
ref_frm_file_list = glob.glob(os.path.join(ref_frm_task_dir, "*.png"))
|
| 91 |
+
ref_frm_file_list.sort()
|
| 92 |
+
reference_frames_temp = sample_fixed_interval_frames(ref_frm_file_list, num_frames_for_reference)
|
| 93 |
+
reference_frames_dict[task_name] = reference_frames_temp
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def read_manifest(manifest_path: Path) -> Dict:
|
| 97 |
+
"""Read the test demo manifest JSON file."""
|
| 98 |
+
if not manifest_path.is_file():
|
| 99 |
+
raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
|
| 100 |
+
|
| 101 |
+
with manifest_path.open("r", encoding="utf-8") as f:
|
| 102 |
+
manifest_data = json.load(f)
|
| 103 |
+
|
| 104 |
+
# Convert relative paths to absolute paths
|
| 105 |
+
manifest_dir = manifest_path.parent
|
| 106 |
+
for demo in manifest_data.get("demos", []):
|
| 107 |
+
demo["frame_paths"] = [str(manifest_dir / path) for path in demo["frame_paths"]]
|
| 108 |
+
|
| 109 |
+
return manifest_data
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def image_to_base64(path: Path) -> str:
|
| 113 |
+
"""Convert an image file to base64 encoded JPEG."""
|
| 114 |
+
with Image.open(path) as img:
|
| 115 |
+
img = img.convert("RGB")
|
| 116 |
+
buffer = BytesIO()
|
| 117 |
+
img.save(buffer, format="JPEG", quality=95)
|
| 118 |
+
return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def encode_images(paths: List[str]) -> List[str]:
|
| 122 |
+
"""Encode a list of image paths to base64."""
|
| 123 |
+
return [image_to_base64(Path(p)) for p in paths]
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def call_trajectory_critic(
|
| 127 |
+
session: requests.Session,
|
| 128 |
+
base_url: str,
|
| 129 |
+
task: str,
|
| 130 |
+
frames_b64: List[str],
|
| 131 |
+
reference_b64: Optional[List[str]],
|
| 132 |
+
timeout: float,
|
| 133 |
+
) -> Dict:
|
| 134 |
+
"""Call the VLAC trajectory-critic endpoint."""
|
| 135 |
+
payload = {
|
| 136 |
+
"task": task,
|
| 137 |
+
"frames": frames_b64,
|
| 138 |
+
"reference": reference_b64,
|
| 139 |
+
"ref_num": len(reference_b64 or []),
|
| 140 |
+
"skip": 1,
|
| 141 |
+
"batch_size": min(len(frames_b64), 8),
|
| 142 |
+
"think": False,
|
| 143 |
+
"return_video": False,
|
| 144 |
+
}
|
| 145 |
+
start = time.time()
|
| 146 |
+
resp = session.post(f"{base_url.rstrip('/')}/trajectory-critic", json=payload, timeout=timeout)
|
| 147 |
+
resp.raise_for_status()
|
| 148 |
+
result = resp.json()
|
| 149 |
+
result["latency_sec"] = time.time() - start
|
| 150 |
+
return result
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# ---------------------------------------------------------------------------
|
| 154 |
+
# Evaluation
|
| 155 |
+
# ---------------------------------------------------------------------------
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def evaluate_demos(
|
| 159 |
+
manifest_data: Dict,
|
| 160 |
+
base_url: str,
|
| 161 |
+
timeout: float,
|
| 162 |
+
use_reference: bool = False,
|
| 163 |
+
) -> Dict[str, any]:
|
| 164 |
+
"""Evaluate all demos and collect value statistics."""
|
| 165 |
+
session = requests.Session()
|
| 166 |
+
task_name = manifest_data.get("task_name", "")
|
| 167 |
+
demos = manifest_data.get("demos", [])
|
| 168 |
+
|
| 169 |
+
results = []
|
| 170 |
+
failed_demos = []
|
| 171 |
+
|
| 172 |
+
print(f"\nEvaluating {len(demos)} test demonstrations...")
|
| 173 |
+
print(f"Task: {task_name}")
|
| 174 |
+
print(f"Use reference: {use_reference}\n")
|
| 175 |
+
|
| 176 |
+
for demo in tqdm(demos, desc="Processing demos"):
|
| 177 |
+
demo_name = demo["demo_name"]
|
| 178 |
+
frame_paths = demo["frame_paths"]
|
| 179 |
+
|
| 180 |
+
# try:
|
| 181 |
+
# Encode frames
|
| 182 |
+
frames_b64 = encode_images(frame_paths)
|
| 183 |
+
|
| 184 |
+
# For now, no reference trajectory (can be added later)
|
| 185 |
+
print(f"Using reference frames for task {task_name}")
|
| 186 |
+
reference_b64 = encode_images(reference_frames_dict[task_name])
|
| 187 |
+
|
| 188 |
+
# Call VLAC service
|
| 189 |
+
result = call_trajectory_critic(
|
| 190 |
+
session=session,
|
| 191 |
+
base_url=base_url,
|
| 192 |
+
task=task_name,
|
| 193 |
+
frames_b64=frames_b64,
|
| 194 |
+
reference_b64=reference_b64,
|
| 195 |
+
timeout=timeout,
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
# Extract values
|
| 199 |
+
value_list = result.get("value_list", [])
|
| 200 |
+
if not value_list:
|
| 201 |
+
print(f"\n[warn] No values returned for demo {demo_name}")
|
| 202 |
+
failed_demos.append(demo_name)
|
| 203 |
+
continue
|
| 204 |
+
|
| 205 |
+
# Record results
|
| 206 |
+
demo_result = {
|
| 207 |
+
"demo_name": demo_name,
|
| 208 |
+
"total_frames": demo["total_frames"],
|
| 209 |
+
"success_index": demo["success_index"],
|
| 210 |
+
"num_sampled_frames": len(frame_paths),
|
| 211 |
+
"value_list": value_list,
|
| 212 |
+
"last_value": value_list[-1], # The critical value for success frame
|
| 213 |
+
"mean_value": float(np.mean(value_list)),
|
| 214 |
+
"std_value": float(np.std(value_list)),
|
| 215 |
+
"latency_sec": result.get("latency_sec", 0.0),
|
| 216 |
+
}
|
| 217 |
+
results.append(demo_result)
|
| 218 |
+
|
| 219 |
+
# except requests.RequestException as exc:
|
| 220 |
+
# print(f"\n[error] Request failed for demo {demo_name}: {exc}")
|
| 221 |
+
# failed_demos.append(demo_name)
|
| 222 |
+
# except Exception as exc:
|
| 223 |
+
# print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
|
| 224 |
+
# failed_demos.append(demo_name)
|
| 225 |
+
|
| 226 |
+
return {
|
| 227 |
+
"task_name": task_name,
|
| 228 |
+
"total_demos": len(demos),
|
| 229 |
+
"successful_evals": len(results),
|
| 230 |
+
"failed_demos": failed_demos,
|
| 231 |
+
"results": results,
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
|
| 236 |
+
"""Compute summary statistics from evaluation results."""
|
| 237 |
+
results = evaluation_results["results"]
|
| 238 |
+
if not results:
|
| 239 |
+
return {}
|
| 240 |
+
|
| 241 |
+
last_values = [r["last_value"] for r in results]
|
| 242 |
+
mean_values = [r["mean_value"] for r in results]
|
| 243 |
+
latencies = [r["latency_sec"] for r in results]
|
| 244 |
+
|
| 245 |
+
stats = {
|
| 246 |
+
"last_value_mean": float(np.mean(last_values)),
|
| 247 |
+
"last_value_std": float(np.std(last_values)),
|
| 248 |
+
"last_value_min": float(np.min(last_values)),
|
| 249 |
+
"last_value_max": float(np.max(last_values)),
|
| 250 |
+
"last_value_median": float(np.median(last_values)),
|
| 251 |
+
"last_value_q25": float(np.percentile(last_values, 25)),
|
| 252 |
+
"last_value_q75": float(np.percentile(last_values, 75)),
|
| 253 |
+
"mean_latency": float(np.mean(latencies)),
|
| 254 |
+
"total_evaluated": len(results),
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
# Count how many demos have last_value >= various thresholds
|
| 258 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 259 |
+
count = sum(1 for v in last_values if v >= threshold)
|
| 260 |
+
stats[f"count_above_{threshold}"] = count
|
| 261 |
+
stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)
|
| 262 |
+
|
| 263 |
+
return stats
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
|
| 267 |
+
"""Create visualization plots for value distribution."""
|
| 268 |
+
results = evaluation_results["results"]
|
| 269 |
+
if not results:
|
| 270 |
+
print("No results to plot")
|
| 271 |
+
return
|
| 272 |
+
|
| 273 |
+
task_name = evaluation_results["task_name"]
|
| 274 |
+
last_values = [r["last_value"] for r in results]
|
| 275 |
+
|
| 276 |
+
# Create figure with multiple subplots
|
| 277 |
+
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
|
| 278 |
+
fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')
|
| 279 |
+
|
| 280 |
+
# 1. Histogram of last values
|
| 281 |
+
ax1 = axes[0, 0]
|
| 282 |
+
ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
|
| 283 |
+
ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 284 |
+
ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 285 |
+
ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
|
| 286 |
+
ax1.set_ylabel('Frequency', fontsize=12)
|
| 287 |
+
ax1.set_title('Distribution of Success Frame Values', fontsize=14)
|
| 288 |
+
ax1.legend()
|
| 289 |
+
ax1.grid(True, alpha=0.3)
|
| 290 |
+
|
| 291 |
+
# 2. Box plot of last values
|
| 292 |
+
ax2 = axes[0, 1]
|
| 293 |
+
box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
|
| 294 |
+
for patch in box_data['boxes']:
|
| 295 |
+
patch.set_facecolor('lightblue')
|
| 296 |
+
ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 297 |
+
ax2.set_ylabel('Value', fontsize=12)
|
| 298 |
+
ax2.set_title('Success Frame Value Distribution', fontsize=14)
|
| 299 |
+
ax2.legend()
|
| 300 |
+
ax2.grid(True, alpha=0.3, axis='y')
|
| 301 |
+
|
| 302 |
+
# 3. Value progression across demos
|
| 303 |
+
ax3 = axes[1, 0]
|
| 304 |
+
demo_indices = range(len(results))
|
| 305 |
+
ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
|
| 306 |
+
ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 307 |
+
ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
|
| 308 |
+
ax3.set_xlabel('Demo Index', fontsize=12)
|
| 309 |
+
ax3.set_ylabel('Last Frame Value', fontsize=12)
|
| 310 |
+
ax3.set_title('Success Frame Values Across Demos', fontsize=14)
|
| 311 |
+
ax3.legend()
|
| 312 |
+
ax3.grid(True, alpha=0.3)
|
| 313 |
+
|
| 314 |
+
# 4. Cumulative distribution
|
| 315 |
+
ax4 = axes[1, 1]
|
| 316 |
+
sorted_values = np.sort(last_values)
|
| 317 |
+
cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
|
| 318 |
+
ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
|
| 319 |
+
ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
|
| 320 |
+
ax4.set_xlabel('Success Frame Value', fontsize=12)
|
| 321 |
+
ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
|
| 322 |
+
ax4.set_title('Cumulative Distribution', fontsize=14)
|
| 323 |
+
ax4.legend()
|
| 324 |
+
ax4.grid(True, alpha=0.3)
|
| 325 |
+
|
| 326 |
+
plt.tight_layout()
|
| 327 |
+
|
| 328 |
+
# Save the plot
|
| 329 |
+
plot_path = output_dir / f"{task_name}_value_distribution.png"
|
| 330 |
+
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
|
| 331 |
+
print(f"\nPlot saved to: {plot_path}")
|
| 332 |
+
|
| 333 |
+
# Also save a PDF version
|
| 334 |
+
pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
|
| 335 |
+
plt.savefig(pdf_path, bbox_inches='tight')
|
| 336 |
+
print(f"PDF saved to: {pdf_path}")
|
| 337 |
+
|
| 338 |
+
plt.close()
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
|
| 342 |
+
"""Save evaluation results and statistics to JSON files."""
|
| 343 |
+
task_name = evaluation_results["task_name"]
|
| 344 |
+
|
| 345 |
+
# Save detailed results
|
| 346 |
+
results_path = output_dir / f"{task_name}_evaluation_results.json"
|
| 347 |
+
with results_path.open("w", encoding="utf-8") as f:
|
| 348 |
+
json.dump(evaluation_results, f, indent=2)
|
| 349 |
+
print(f"\nDetailed results saved to: {results_path}")
|
| 350 |
+
|
| 351 |
+
# Save summary statistics
|
| 352 |
+
stats_path = output_dir / f"{task_name}_statistics.json"
|
| 353 |
+
with stats_path.open("w", encoding="utf-8") as f:
|
| 354 |
+
json.dump(statistics, f, indent=2)
|
| 355 |
+
print(f"Statistics saved to: {stats_path}")
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
# ---------------------------------------------------------------------------
|
| 359 |
+
# CLI
|
| 360 |
+
# ---------------------------------------------------------------------------
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
def parse_args() -> argparse.Namespace:
|
| 364 |
+
parser = argparse.ArgumentParser(
|
| 365 |
+
description="Evaluate value estimation for test demonstrations"
|
| 366 |
+
)
|
| 367 |
+
parser.add_argument(
|
| 368 |
+
"--manifest-path",
|
| 369 |
+
type=Path,
|
| 370 |
+
default="toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json",
|
| 371 |
+
help="Path to the test manifest JSON file",
|
| 372 |
+
)
|
| 373 |
+
parser.add_argument(
|
| 374 |
+
"--output-dir",
|
| 375 |
+
type=Path,
|
| 376 |
+
default="evaluation_results",
|
| 377 |
+
help="Directory to save evaluation results and plots",
|
| 378 |
+
)
|
| 379 |
+
parser.add_argument(
|
| 380 |
+
"--base-url",
|
| 381 |
+
default="http://localhost:8111",
|
| 382 |
+
help="VLAC service base URL (default: http://localhost:8111)",
|
| 383 |
+
)
|
| 384 |
+
parser.add_argument(
|
| 385 |
+
"--timeout",
|
| 386 |
+
type=float,
|
| 387 |
+
default=30.0,
|
| 388 |
+
help="HTTP request timeout in seconds (default: 30.0)",
|
| 389 |
+
)
|
| 390 |
+
parser.add_argument(
|
| 391 |
+
"--use-reference",
|
| 392 |
+
action="store_true",
|
| 393 |
+
help="Use reference trajectory (if available)",
|
| 394 |
+
)
|
| 395 |
+
return parser.parse_args()
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
def main() -> int:
    """CLI entry point: evaluate one task's test demos against the VLAC service.

    Reads the manifest named by --manifest-path, scores every demo via the
    trajectory-critic endpoint, prints a summary, and writes JSON results
    plus distribution plots into --output-dir.

    Returns:
        0 on success, 1 if the manifest file cannot be found.
    """
    args = parse_args()

    # Read manifest
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    # Create output directory (eagerly, so later result writes cannot fail on it)
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Run evaluation
    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is empty ({}) when no demo evaluated successfully
    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        # Fraction of demos whose success-frame value clears each cutoff
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008152534.py
ADDED
|
@@ -0,0 +1,491 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
# Evaluate all LIBERO-10 tasks
|
| 12 |
+
python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
|
| 13 |
+
|
| 14 |
+
# Evaluate a single task
|
| 15 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 16 |
+
|
| 17 |
+
Examples:
|
| 18 |
+
# Evaluate all LIBERO-10 tasks
|
| 19 |
+
python evaluate_test_demo_values.py \
|
| 20 |
+
--process-all-tasks \
|
| 21 |
+
--manifests-root toy_test_demos_LIBERO_10 \
|
| 22 |
+
--output-dir evaluation_results_all_tasks \
|
| 23 |
+
--base-url http://localhost:8111
|
| 24 |
+
|
| 25 |
+
# Evaluate a single task
|
| 26 |
+
python evaluate_test_demo_values.py \
|
| 27 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 28 |
+
--output-dir evaluation_results \
|
| 29 |
+
--base-url http://localhost:8111
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import argparse
|
| 35 |
+
import base64
|
| 36 |
+
import json
|
| 37 |
+
import os
|
| 38 |
+
import glob
|
| 39 |
+
import sys
|
| 40 |
+
import time
|
| 41 |
+
from io import BytesIO
|
| 42 |
+
from pathlib import Path
|
| 43 |
+
from typing import Dict, List, Optional
|
| 44 |
+
|
| 45 |
+
import matplotlib.pyplot as plt
|
| 46 |
+
import numpy as np
|
| 47 |
+
import requests
|
| 48 |
+
from PIL import Image
|
| 49 |
+
from tqdm import tqdm
|
| 50 |
+
|
| 51 |
+
# LIBERO-10 task list
# Canonical task names of the LIBERO-10 benchmark; each name is also used as
# the sub-directory / manifest filename prefix by the data-preparation script.
LIBERO_10_TASKS = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
]
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# Helpers
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
|
| 69 |
+
def sample_fixed_interval_frames(image_list, num_frames):
    """Sample ``num_frames`` frames from ``image_list`` at (near-)equal intervals.

    Whenever the source has at least two frames and ``num_frames >= 2``, the
    first and last frames are always included.

    Args:
        image_list: Ordered sequence of frames (any element type).
        num_frames: Number of frames to return.

    Returns:
        List of ``num_frames`` elements drawn from ``image_list``.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        # Only one frame available: replicate it.
        return [image_list[0]] * num_frames
    # np.linspace covers every num_frames >= 1: it always includes index 0 and
    # len-1 (for num_frames >= 2) and picks evenly spaced interior indices.
    # This also fixes the original special cases for num_frames == 2/3, which
    # did not sample at equal intervals (num_frames == 3 hard-coded
    # image_list[1] as the middle frame regardless of list length).
    indices = np.linspace(start=0, stop=len(image_list) - 1, num=num_frames, dtype=int)
    return [image_list[i] for i in indices]
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# --- Reference trajectory frames (loaded eagerly at import time) ------------
# Number of frames sampled from each expert demo to form the reference clip.
num_frames_for_reference = 8
# NOTE(review): hard-coded absolute path to the expert-demo frame dumps;
# importing this module raises ValueError if a task directory is missing or
# empty — confirm this is the intended fail-fast behavior.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
# This used to be a verbatim duplicate of LIBERO_10_TASKS; alias it instead
# of maintaining two copies (name kept for backward compatibility).
libero_10_task_list = LIBERO_10_TASKS
# Maps task name -> list of num_frames_for_reference sorted frame paths.
reference_frames_dict = {}
for _ref_task in libero_10_task_list:
    _ref_task_dir = os.path.join(ref_frm_root_dir, _ref_task + "_demo")
    # Lexicographic sort of the PNG dumps recovers temporal order.
    _ref_frames = sorted(glob.glob(os.path.join(_ref_task_dir, "*.png")))
    reference_frames_dict[_ref_task] = sample_fixed_interval_frames(
        _ref_frames, num_frames_for_reference
    )
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and absolutize its frame paths.

    Frame paths in the manifest are stored relative to the manifest file;
    they are rewritten in place to paths rooted at the manifest's directory.

    Raises:
        FileNotFoundError: If ``manifest_path`` does not name a regular file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    with manifest_path.open("r", encoding="utf-8") as handle:
        manifest = json.load(handle)

    base_dir = manifest_path.parent
    for entry in manifest.get("demos", []):
        entry["frame_paths"] = [str(base_dir / rel) for rel in entry["frame_paths"]]

    return manifest
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def image_to_base64(path: Path) -> str:
    """Read an image file and return it as a base64-encoded JPEG string."""
    with Image.open(path) as source:
        rgb = source.convert("RGB")
        buffer = BytesIO()
        rgb.save(buffer, format="JPEG", quality=95)
        raw_jpeg = buffer.getvalue()
    return base64.b64encode(raw_jpeg).decode("utf-8")
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in *paths*, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the service's JSON response with an extra ``latency_sec`` field
    holding the measured round-trip time in seconds.

    Raises:
        requests.HTTPError: On a non-2xx response status.
    """
    reference = reference_b64 or []
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference),
        "skip": 1,
        # Cap the batch size at 8 frames per critic forward pass.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started
    return payload
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ---------------------------------------------------------------------------
|
| 168 |
+
# Evaluation
|
| 169 |
+
# ---------------------------------------------------------------------------
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, any]:
    """Run the VLAC trajectory critic on every demo in the manifest.

    Args:
        manifest_data: Parsed manifest (see ``read_manifest``).
        base_url: Root URL of the VLAC service.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: If True, attach the pre-sampled expert reference
            frames for this task (when available) to every request.
            Previously this flag was ignored and the reference was always
            attached; it is now honored.

    Returns:
        Dict with the task name, demo counts, the names of failed demos, and
        one result record per successfully evaluated demo.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    # Encode the reference clip once per task instead of once per demo.
    reference_b64 = None
    if use_reference:
        reference_paths = reference_frames_dict.get(task_name)
        if reference_paths:
            print(f"Using reference frames for task {task_name}")
            reference_b64 = encode_images(reference_paths)
        else:
            print(f"[warn] No reference frames found for task {task_name}")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            frames_b64 = encode_images(frame_paths)
            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )
        except requests.RequestException as exc:
            # One bad request must not abort the whole evaluation run
            # (restores the error handling that had been commented out).
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
            continue

        value_list = result.get("value_list", [])
        if not value_list:
            print(f"\n[warn] No values returned for demo {demo_name}")
            failed_demos.append(demo_name)
            continue

        results.append({
            "demo_name": demo_name,
            "total_frames": demo["total_frames"],
            "success_index": demo["success_index"],
            "num_sampled_frames": len(frame_paths),
            "value_list": value_list,
            "last_value": value_list[-1],  # value at the success frame (ideally 100)
            "mean_value": float(np.mean(value_list)),
            "std_value": float(np.std(value_list)),
            "latency_sec": result.get("latency_sec", 0.0),
        })

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize success-frame values, latencies, and threshold counts.

    Returns an empty dict when no demo was evaluated successfully.
    """
    records = evaluation_results["results"]
    if not records:
        return {}

    final_values = np.asarray([rec["last_value"] for rec in records], dtype=float)
    latencies = np.asarray([rec["latency_sec"] for rec in records], dtype=float)

    summary = {
        "last_value_mean": float(final_values.mean()),
        "last_value_std": float(final_values.std()),
        "last_value_min": float(final_values.min()),
        "last_value_max": float(final_values.max()),
        "last_value_median": float(np.median(final_values)),
        "last_value_q25": float(np.percentile(final_values, 25)),
        "last_value_q75": float(np.percentile(final_values, 75)),
        "mean_latency": float(latencies.mean()),
        "total_evaluated": len(records),
    }

    # Count how many demos clear each success-value cutoff.
    for cutoff in [80, 85, 90, 95, 100]:
        hits = int((final_values >= cutoff).sum())
        summary[f"count_above_{cutoff}"] = hits
        summary[f"percent_above_{cutoff}"] = float(hits / len(final_values) * 100)

    return summary
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create visualization plots for value distribution.

    Renders a 2x2 figure (histogram, box plot, per-demo scatter, cumulative
    distribution) of the success-frame values and saves it to
    ``<output_dir>/<task>_value_distribution.png`` and ``.pdf``.
    No-op (with a message) when there are no results.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values (red dashed line marks the ideal value 100)
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos (x axis is manifest order)
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version (vector output, useful for papers/slides)
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Release the figure's memory (matters when evaluating many tasks in one run)
    plt.close()
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write the per-demo results and the summary statistics as JSON files.

    Produces ``<task>_evaluation_results.json`` and ``<task>_statistics.json``
    inside ``output_dir`` (which must already exist).
    """
    task_name = evaluation_results["task_name"]

    outputs = [
        (output_dir / f"{task_name}_evaluation_results.json",
         evaluation_results, "\nDetailed results saved to"),
        (output_dir / f"{task_name}_statistics.json",
         statistics, "Statistics saved to"),
    ]
    for target, payload, label in outputs:
        with target.open("w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2)
        print(f"{label}: {target}")
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
# ---------------------------------------------------------------------------
|
| 373 |
+
# CLI
|
| 374 |
+
# ---------------------------------------------------------------------------
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line arguments for this script."""
    default_manifest = (
        "toy_test_demos_LIBERO_10"
        "/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it"
        "/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json"
    )
    cli = argparse.ArgumentParser(description="Evaluate value estimation for test demonstrations")
    cli.add_argument("--manifest-path", type=Path, default=default_manifest,
                     help="Path to the test manifest JSON file")
    cli.add_argument("--output-dir", type=Path, default="evaluation_results",
                     help="Directory to save evaluation results and plots")
    cli.add_argument("--base-url", default="http://localhost:8111",
                     help="VLAC service base URL (default: http://localhost:8111)")
    cli.add_argument("--timeout", type=float, default=30.0,
                     help="HTTP request timeout in seconds (default: 30.0)")
    cli.add_argument("--use-reference", action="store_true",
                     help="Use reference trajectory (if available)")
    return cli.parse_args()
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
def main() -> int:
    """CLI entry point: evaluate one task's test demos against the VLAC service.

    Reads the manifest named by --manifest-path, scores every demo via the
    trajectory-critic endpoint, prints a summary, and writes JSON results
    plus distribution plots into --output-dir.

    Returns:
        0 on success, 1 if the manifest file cannot be found.
    """
    args = parse_args()

    # Read manifest
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    # Create output directory (eagerly, so later result writes cannot fail on it)
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Run evaluation
    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is empty ({}) when no demo evaluated successfully
    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        # Fraction of demos whose success-frame value clears each cutoff
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008152548.py
ADDED
|
@@ -0,0 +1,519 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
# Evaluate all LIBERO-10 tasks
|
| 12 |
+
python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
|
| 13 |
+
|
| 14 |
+
# Evaluate a single task
|
| 15 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 16 |
+
|
| 17 |
+
Examples:
|
| 18 |
+
# Evaluate all LIBERO-10 tasks
|
| 19 |
+
python evaluate_test_demo_values.py \
|
| 20 |
+
--process-all-tasks \
|
| 21 |
+
--manifests-root toy_test_demos_LIBERO_10 \
|
| 22 |
+
--output-dir evaluation_results_all_tasks \
|
| 23 |
+
--base-url http://localhost:8111
|
| 24 |
+
|
| 25 |
+
# Evaluate a single task
|
| 26 |
+
python evaluate_test_demo_values.py \
|
| 27 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 28 |
+
--output-dir evaluation_results \
|
| 29 |
+
--base-url http://localhost:8111
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import argparse
|
| 35 |
+
import base64
|
| 36 |
+
import json
|
| 37 |
+
import os
|
| 38 |
+
import glob
|
| 39 |
+
import sys
|
| 40 |
+
import time
|
| 41 |
+
from io import BytesIO
|
| 42 |
+
from pathlib import Path
|
| 43 |
+
from typing import Dict, List, Optional
|
| 44 |
+
|
| 45 |
+
import matplotlib.pyplot as plt
|
| 46 |
+
import numpy as np
|
| 47 |
+
import requests
|
| 48 |
+
from PIL import Image
|
| 49 |
+
from tqdm import tqdm
|
| 50 |
+
|
| 51 |
+
# The ten LIBERO-10 benchmark tasks; names double as directory names for the
# per-task manifests and expert demo folders.
LIBERO_10_TASKS = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
]
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# Helpers
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
|
| 69 |
+
def sample_fixed_interval_frames(image_list, num_frames):
    """Sample ``num_frames`` frames from ``image_list`` at equal intervals.

    The first and last frames are always included (when the list has more
    than one element).  A single-element list is replicated to the requested
    length.

    Args:
        image_list: Ordered sequence of frames (paths or image objects).
        num_frames: Number of frames to sample.

    Returns:
        A list of ``num_frames`` sampled frames.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        return [image_list[0]] * num_frames
    # np.linspace with endpoints handles every num_frames uniformly,
    # including 2 ([first, last]) and 3 ([first, middle, last]).  The old
    # num_frames == 3 special case wrongly returned image_list[1] (the
    # second frame) instead of the true middle frame, violating the
    # "equal interval" contract.
    indices = np.linspace(start=0, stop=len(image_list) - 1, num=num_frames, dtype=int)
    return [image_list[i] for i in indices]
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# ---------------------------------------------------------------------------
# Reference-frame setup (module level: runs once at import time)
# ---------------------------------------------------------------------------

# Number of frames sampled from each expert demo to act as the critic's
# reference trajectory.
num_frames_for_reference = 8

# NOTE(review): hard-coded machine-specific path -- breaks on other hosts.
# Consider making this configurable (env var or CLI flag).
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"

# Previously this was a second, duplicated literal of the task list; reuse
# the canonical LIBERO_10_TASKS and keep the old name as a backward-compatible
# alias for any external importers.
libero_10_task_list = LIBERO_10_TASKS

# Maps task name -> list of num_frames_for_reference PNG paths sampled at
# equal intervals from that task's expert demo directory.
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name + "_demo")
    ref_frm_file_list = sorted(glob.glob(os.path.join(ref_frm_task_dir, "*.png")))
    # Raises ValueError at import time if a demo directory is missing or
    # empty (glob returns [] and the sampler rejects empty input) -- same
    # fail-fast behavior as the original code.
    reference_frames_dict[task_name] = sample_fixed_interval_frames(
        ref_frm_file_list, num_frames_for_reference
    )
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest JSON and absolutize its frame paths.

    Frame paths stored in the manifest are relative to the manifest's own
    directory; they are rewritten to absolute path strings in place.

    Raises:
        FileNotFoundError: If *manifest_path* does not point to a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    manifest_data = json.loads(manifest_path.read_text(encoding="utf-8"))

    base_dir = manifest_path.parent
    for demo in manifest_data.get("demos", []):
        demo["frame_paths"] = [str(base_dir / rel) for rel in demo["frame_paths"]]

    return manifest_data
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def image_to_base64(path: Path) -> str:
    """Read an image file and return it as a base64 string of RGB JPEG bytes.

    The image is re-encoded as JPEG at quality 95 regardless of its source
    format.
    """
    with Image.open(path) as img:
        rgb = img.convert("RGB")
        buf = BytesIO()
        rgb.save(buf, format="JPEG", quality=95)
    return base64.b64encode(buf.getvalue()).decode("utf-8")
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in *paths*, preserving order."""
    return [image_to_base64(Path(item)) for item in paths]
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Args:
        session: Reusable HTTP session (connection pooling).
        base_url: Service base URL; trailing slashes are tolerated.
        task: Natural-language task description.
        frames_b64: Base64 JPEG frames of the trajectory under evaluation.
        reference_b64: Optional base64 reference frames, or None.
        timeout: Per-request timeout in seconds.

    Returns:
        The service's JSON response dict with an added ``latency_sec`` field
        measuring wall-clock request time.

    Raises:
        requests.HTTPError: On a non-2xx response.
    """
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # Cap the server-side batch at 8 frames per forward pass.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    t0 = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - t0
    return payload
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ---------------------------------------------------------------------------
|
| 168 |
+
# Evaluation
|
| 169 |
+
# ---------------------------------------------------------------------------
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict:
    """Evaluate every demo in a manifest against the VLAC critic service.

    Args:
        manifest_data: Parsed manifest (see ``read_manifest``).
        base_url: VLAC service base URL.
        timeout: Per-request timeout in seconds.
        use_reference: When True, send the task's expert reference frames
            with each request.  (Previously this flag was accepted but
            ignored and references were always sent.)

    Returns:
        Dict with ``task_name``, ``total_demos``, ``successful_evals``,
        ``failed_demos`` (names), and per-demo ``results``.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    # Encode the reference trajectory once -- it is identical for every
    # demo, so per-demo re-encoding was pure overhead.
    reference_b64 = None
    if use_reference:
        ref_paths = reference_frames_dict.get(task_name)
        if ref_paths is None:
            print(f"\n[warn] No reference frames for task {task_name}; proceeding without")
        else:
            print(f"Using reference frames for task {task_name}")
            reference_b64 = encode_images(ref_paths)

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        # Error handling restored: the try/except had been commented out,
        # so a single bad demo (missing file, network hiccup) crashed the
        # entire run and failed_demos was never populated on exceptions.
        try:
            frames_b64 = encode_images(frame_paths)

            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            results.append({
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # value at the success frame; ideally 100
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            })
        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except (OSError, KeyError, ValueError) as exc:
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize success-frame values and request latencies across demos.

    Returns an empty dict when there are no successful evaluations.
    """
    records = evaluation_results["results"]
    if not records:
        return {}

    last_vals = np.asarray([rec["last_value"] for rec in records], dtype=float)
    latencies = [rec["latency_sec"] for rec in records]

    summary = {
        "last_value_mean": float(last_vals.mean()),
        "last_value_std": float(last_vals.std()),
        "last_value_min": float(last_vals.min()),
        "last_value_max": float(last_vals.max()),
        "last_value_median": float(np.median(last_vals)),
        "last_value_q25": float(np.percentile(last_vals, 25)),
        "last_value_q75": float(np.percentile(last_vals, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(records),
    }

    # How many demos clear each success-value threshold, in absolute count
    # and as a percentage of evaluated demos.
    for threshold in (80, 85, 90, 95, 100):
        hits = int((last_vals >= threshold).sum())
        summary[f"count_above_{threshold}"] = hits
        summary[f"percent_above_{threshold}"] = float(hits / len(last_vals) * 100)

    return summary
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Render a 2x2 summary figure of success-frame values; save PNG and PDF.

    Subplots: histogram, box plot, per-demo scatter, and empirical CDF of
    the last-frame (success) values.  No-op when there are no results.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    last_values = [r["last_value"] for r in results]
    mean_val = np.mean(last_values)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    hist_ax, box_ax = axes[0, 0], axes[0, 1]
    scatter_ax, cdf_ax = axes[1, 0], axes[1, 1]

    # Histogram of success-frame values against the ideal target of 100.
    hist_ax.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    hist_ax.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    hist_ax.axvline(mean_val, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_val:.1f})')
    hist_ax.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    hist_ax.set_ylabel('Frequency', fontsize=12)
    hist_ax.set_title('Distribution of Success Frame Values', fontsize=14)
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Box plot of the same values.
    box = box_ax.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box['boxes']:
        patch.set_facecolor('lightblue')
    box_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    box_ax.set_ylabel('Value', fontsize=12)
    box_ax.set_title('Success Frame Value Distribution', fontsize=14)
    box_ax.legend()
    box_ax.grid(True, alpha=0.3, axis='y')

    # Per-demo scatter to spot outliers by index.
    scatter_ax.scatter(range(len(results)), last_values, alpha=0.6, s=50, c='steelblue')
    scatter_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    scatter_ax.axhline(mean_val, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_val:.1f})')
    scatter_ax.set_xlabel('Demo Index', fontsize=12)
    scatter_ax.set_ylabel('Last Frame Value', fontsize=12)
    scatter_ax.set_title('Success Frame Values Across Demos', fontsize=14)
    scatter_ax.legend()
    scatter_ax.grid(True, alpha=0.3)

    # Empirical cumulative distribution.
    ordered = np.sort(last_values)
    cumulative = np.arange(1, len(ordered) + 1) / len(ordered) * 100
    cdf_ax.plot(ordered, cumulative, linewidth=2, color='steelblue')
    cdf_ax.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    cdf_ax.set_xlabel('Success Frame Value', fontsize=12)
    cdf_ax.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    cdf_ax.set_title('Cumulative Distribution', fontsize=14)
    cdf_ax.legend()
    cdf_ax.grid(True, alpha=0.3)

    plt.tight_layout()

    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    plt.close()
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write per-demo results and summary statistics as JSON into *output_dir*.

    Files are named ``<task>_evaluation_results.json`` and
    ``<task>_statistics.json``.
    """
    task_name = evaluation_results["task_name"]

    results_path = output_dir / f"{task_name}_evaluation_results.json"
    results_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {results_path}")

    stats_path = output_dir / f"{task_name}_statistics.json"
    stats_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {stats_path}")
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
# ---------------------------------------------------------------------------
|
| 373 |
+
# CLI
|
| 374 |
+
# ---------------------------------------------------------------------------
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
def parse_args() -> argparse.Namespace:
    """Parse and validate CLI arguments.

    Two modes: ``--process-all-tasks`` (requires ``--manifests-root``) or
    single-task mode (requires ``--manifest-path``).
    """
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )

    # Mode selection
    parser.add_argument(
        "--process-all-tasks",
        action="store_true",
        help="Process all LIBERO-10 tasks"
    )

    # Arguments for processing all tasks
    parser.add_argument(
        "--manifests-root",
        type=Path,
        help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)"
    )

    # Arguments for processing a single task
    parser.add_argument(
        "--manifest-path",
        type=Path,
        help="Path to the test manifest JSON file (for single task mode)",
    )

    # Common arguments
    parser.add_argument(
        "--output-dir",
        type=Path,
        # BUG FIX: the default must be a Path, not a str -- argparse applies
        # `type` only to values coming from the command line, and main()
        # calls .expanduser()/.mkdir() on this value.
        default=Path("evaluation_results"),
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )

    args = parser.parse_args()

    # Cross-argument validation: each mode has one required path argument.
    if args.process_all_tasks:
        if not args.manifests_root:
            parser.error("--manifests-root is required when using --process-all-tasks")
    else:
        if not args.manifest_path:
            parser.error("--manifest-path is required for single task mode")

    return args
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
def main() -> int:
    """Entry point: evaluate one manifest, or every LIBERO-10 task manifest.

    Returns:
        Process exit code: 0 when at least one manifest was processed,
        1 when none could be read.
    """
    args = parse_args()

    # Create output directory up front; shared by all tasks.
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.process_all_tasks:
        # BUG FIX: this mode was accepted by parse_args() but never handled
        # here, so main() crashed calling read_manifest(None).  Manifest
        # layout follows the docstring example:
        #   <root>/<task>/<task>_test_manifest.json
        manifest_paths = [
            args.manifests_root / task / f"{task}_test_manifest.json"
            for task in LIBERO_10_TASKS
        ]
    else:
        manifest_paths = [args.manifest_path]

    processed = 0
    for manifest_path in manifest_paths:
        if _evaluate_one_manifest(manifest_path, output_dir, args) == 0:
            processed += 1

    return 0 if processed else 1


def _evaluate_one_manifest(manifest_path: Path, output_dir: Path, args: argparse.Namespace) -> int:
    """Run evaluate/summarize/save/plot for a single manifest; 0 on success."""
    try:
        manifest_data = read_manifest(manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    statistics = compute_statistics(evaluation_results)

    _print_summary(evaluation_results, statistics)

    save_results(evaluation_results, statistics, output_dir)

    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0


def _print_summary(evaluation_results: Dict, statistics: Dict) -> None:
    """Print the human-readable evaluation summary to stdout."""
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    if not statistics:
        return

    print("\n" + "-" * 80)
    print("SUCCESS FRAME VALUE STATISTICS")
    print("-" * 80)
    print(f"Mean: {statistics['last_value_mean']:.2f}")
    print(f"Std Dev: {statistics['last_value_std']:.2f}")
    print(f"Median: {statistics['last_value_median']:.2f}")
    print(f"Min: {statistics['last_value_min']:.2f}")
    print(f"Max: {statistics['last_value_max']:.2f}")
    print(f"Q25: {statistics['last_value_q25']:.2f}")
    print(f"Q75: {statistics['last_value_q75']:.2f}")

    print("\n" + "-" * 80)
    print("THRESHOLD ANALYSIS")
    print("-" * 80)
    for threshold in [80, 85, 90, 95, 100]:
        count = statistics[f"count_above_{threshold}"]
        percent = statistics[f"percent_above_{threshold}"]
        print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

    print("\n" + "-" * 80)
    print(f"Mean latency: {statistics['mean_latency']:.2f}s")
    print("-" * 80)
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
if __name__ == "__main__":
    # Equivalent to sys.exit(main()): propagate the exit code to the shell.
    raise SystemExit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008152620.py
ADDED
|
@@ -0,0 +1,683 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
# Evaluate all LIBERO-10 tasks
|
| 12 |
+
python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
|
| 13 |
+
|
| 14 |
+
# Evaluate a single task
|
| 15 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 16 |
+
|
| 17 |
+
Examples:
|
| 18 |
+
# Evaluate all LIBERO-10 tasks
|
| 19 |
+
python evaluate_test_demo_values.py \
|
| 20 |
+
--process-all-tasks \
|
| 21 |
+
--manifests-root toy_test_demos_LIBERO_10 \
|
| 22 |
+
--output-dir evaluation_results_all_tasks \
|
| 23 |
+
--base-url http://localhost:8111
|
| 24 |
+
|
| 25 |
+
# Evaluate a single task
|
| 26 |
+
python evaluate_test_demo_values.py \
|
| 27 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 28 |
+
--output-dir evaluation_results \
|
| 29 |
+
--base-url http://localhost:8111
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import argparse
|
| 35 |
+
import base64
|
| 36 |
+
import json
|
| 37 |
+
import os
|
| 38 |
+
import glob
|
| 39 |
+
import sys
|
| 40 |
+
import time
|
| 41 |
+
from io import BytesIO
|
| 42 |
+
from pathlib import Path
|
| 43 |
+
from typing import Dict, List, Optional
|
| 44 |
+
|
| 45 |
+
import matplotlib.pyplot as plt
|
| 46 |
+
import numpy as np
|
| 47 |
+
import requests
|
| 48 |
+
from PIL import Image
|
| 49 |
+
from tqdm import tqdm
|
| 50 |
+
|
| 51 |
+
# The ten LIBERO-10 benchmark tasks; names double as directory names for the
# per-task manifests and expert demo folders.
LIBERO_10_TASKS = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
]
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# Helpers
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
|
| 69 |
+
def sample_fixed_interval_frames(image_list, num_frames):
    """Sample ``num_frames`` frames from ``image_list`` at equal intervals.

    The first and last frames are always included (when the list has more
    than one element).  A single-element list is replicated to the requested
    length.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        return [image_list[0]] * num_frames
    # np.linspace with endpoints handles every num_frames uniformly,
    # including 2 ([first, last]) and 3 ([first, middle, last]).  The old
    # num_frames == 3 special case wrongly returned image_list[1] (the
    # second frame) instead of the true middle frame.
    indices = np.linspace(start=0, stop=len(image_list) - 1, num=num_frames, dtype=int)
    return [image_list[i] for i in indices]
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# ---------------------------------------------------------------------------
# Reference-frame setup (module level: runs once at import time)
# ---------------------------------------------------------------------------

# Number of frames sampled from each expert demo to act as the critic's
# reference trajectory.
num_frames_for_reference = 8

# NOTE(review): hard-coded machine-specific path -- breaks on other hosts.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"

# Previously a second, duplicated literal of the task list; reuse the
# canonical LIBERO_10_TASKS and keep the old name as a compatibility alias.
libero_10_task_list = LIBERO_10_TASKS

# Maps task name -> list of num_frames_for_reference PNG paths sampled at
# equal intervals from that task's expert demo directory.
reference_frames_dict = {}
for task_name in libero_10_task_list:
    ref_frm_task_dir = os.path.join(ref_frm_root_dir, task_name + "_demo")
    ref_frm_file_list = sorted(glob.glob(os.path.join(ref_frm_task_dir, "*.png")))
    # Raises ValueError at import time if a demo directory is missing or
    # empty -- same fail-fast behavior as the original code.
    reference_frames_dict[task_name] = sample_fixed_interval_frames(
        ref_frm_file_list, num_frames_for_reference
    )
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest JSON and absolutize its frame paths.

    Frame paths stored in the manifest are relative to the manifest's own
    directory; they are rewritten to absolute path strings in place.

    Raises:
        FileNotFoundError: If *manifest_path* does not point to a file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    manifest_data = json.loads(manifest_path.read_text(encoding="utf-8"))

    base_dir = manifest_path.parent
    for demo in manifest_data.get("demos", []):
        demo["frame_paths"] = [str(base_dir / rel) for rel in demo["frame_paths"]]

    return manifest_data
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def image_to_base64(path: Path) -> str:
    """Load an image, re-encode it as RGB JPEG (quality 95), and base64-encode it."""
    buffer = BytesIO()
    with Image.open(path) as img:
        img.convert("RGB").save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image in *paths*, preserving order."""
    encoded = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    The service's JSON response is returned with an extra ``latency_sec``
    field holding the wall-clock round-trip time.

    Raises:
        requests.HTTPError: on a non-2xx response (via raise_for_status).
    """
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # The service batches frames; cap at 8 per request.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started
    return payload
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ---------------------------------------------------------------------------
|
| 168 |
+
# Evaluation
|
| 169 |
+
# ---------------------------------------------------------------------------
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, any]:
    """Evaluate all demos in a manifest and collect value statistics.

    Args:
        manifest_data: Parsed manifest with "task_name" and "demos" entries.
        base_url: VLAC service base URL.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: When True, attach the pre-loaded reference frames for
            this task (from the module-level ``reference_frames_dict``).

    Returns:
        Dict with task_name, total_demos, successful_evals, failed_demos,
        and the per-demo result records.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            # Encode frames
            frames_b64 = encode_images(frame_paths)

            # BUGFIX: the use_reference flag was previously ignored and the
            # reference frames were always attached.  Honor the flag here.
            reference_b64 = None
            if use_reference:
                print(f"Using reference frames for task {task_name}")
                reference_b64 = encode_images(reference_frames_dict[task_name])

            # Call VLAC service
            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            # Extract values
            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            # Record results
            demo_result = {
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            }
            results.append(demo_result)

        # BUGFIX: error handling was commented out, so a single failed
        # request aborted the entire evaluation run instead of being
        # recorded in failed_demos.
        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize per-demo evaluation results into aggregate statistics.

    Returns an empty dict when there are no successful results.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [entry["last_value"] for entry in results]
    latencies = [entry["latency_sec"] for entry in results]

    values_arr = np.asarray(last_values, dtype=float)
    stats = {
        "last_value_mean": float(values_arr.mean()),
        "last_value_std": float(values_arr.std()),
        "last_value_min": float(values_arr.min()),
        "last_value_max": float(values_arr.max()),
        "last_value_median": float(np.median(values_arr)),
        "last_value_q25": float(np.percentile(values_arr, 25)),
        "last_value_q75": float(np.percentile(values_arr, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # How many demos clear each success-value threshold.
    for threshold in (80, 85, 90, 95, 100):
        hits = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = hits
        stats[f"percent_above_{threshold}"] = float(hits / len(last_values) * 100)

    return stats
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Render a 2x2 figure summarizing success-frame values; save PNG and PDF."""
    demo_results = evaluation_results["results"]
    if not demo_results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    success_values = [entry["last_value"] for entry in demo_results]
    mean_success = np.mean(success_values)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # Top-left: histogram of success-frame values.
    hist_ax = axes[0, 0]
    hist_ax.hist(success_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    hist_ax.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    hist_ax.axvline(mean_success, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_success:.1f})')
    hist_ax.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    hist_ax.set_ylabel('Frequency', fontsize=12)
    hist_ax.set_title('Distribution of Success Frame Values', fontsize=14)
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Top-right: box plot of the same values.
    box_ax = axes[0, 1]
    box_artists = box_ax.boxplot([success_values], vert=True, patch_artist=True, labels=['Success Values'])
    for box_patch in box_artists['boxes']:
        box_patch.set_facecolor('lightblue')
    box_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    box_ax.set_ylabel('Value', fontsize=12)
    box_ax.set_title('Success Frame Value Distribution', fontsize=14)
    box_ax.legend()
    box_ax.grid(True, alpha=0.3, axis='y')

    # Bottom-left: per-demo scatter of success values.
    scatter_ax = axes[1, 0]
    scatter_ax.scatter(range(len(demo_results)), success_values, alpha=0.6, s=50, c='steelblue')
    scatter_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    scatter_ax.axhline(mean_success, color='green', linestyle='-', linewidth=2, label=f'Mean ({mean_success:.1f})')
    scatter_ax.set_xlabel('Demo Index', fontsize=12)
    scatter_ax.set_ylabel('Last Frame Value', fontsize=12)
    scatter_ax.set_title('Success Frame Values Across Demos', fontsize=14)
    scatter_ax.legend()
    scatter_ax.grid(True, alpha=0.3)

    # Bottom-right: empirical cumulative distribution.
    cdf_ax = axes[1, 1]
    ordered_values = np.sort(success_values)
    cumulative_pct = np.arange(1, len(ordered_values) + 1) / len(ordered_values) * 100
    cdf_ax.plot(ordered_values, cumulative_pct, linewidth=2, color='steelblue')
    cdf_ax.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    cdf_ax.set_xlabel('Success Frame Value', fontsize=12)
    cdf_ax.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    cdf_ax.set_title('Cumulative Distribution', fontsize=14)
    cdf_ax.legend()
    cdf_ax.grid(True, alpha=0.3)

    plt.tight_layout()

    png_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(png_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {png_path}")

    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    plt.close()
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write the detailed results and summary statistics as JSON files.

    Files are named ``<task>_evaluation_results.json`` and
    ``<task>_statistics.json`` inside *output_dir*.
    """
    task_name = evaluation_results["task_name"]

    detailed_path = output_dir / f"{task_name}_evaluation_results.json"
    with detailed_path.open("w", encoding="utf-8") as handle:
        json.dump(evaluation_results, handle, indent=2)
    print(f"\nDetailed results saved to: {detailed_path}")

    summary_path = output_dir / f"{task_name}_statistics.json"
    with summary_path.open("w", encoding="utf-8") as handle:
        json.dump(statistics, handle, indent=2)
    print(f"Statistics saved to: {summary_path}")
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def find_manifest_file(manifests_root: Path, task_name: str) -> Optional[Path]:
    """Locate a task's test manifest under *manifests_root*.

    Checks the common directory layouts in order and returns the first
    existing candidate, or None when no manifest is found.
    """
    candidates = (
        manifests_root / task_name / f"{task_name}_test_manifest.json",
        manifests_root / task_name / "test_manifest.json",
        manifests_root / f"{task_name}_test_manifest.json",
    )
    return next((path for path in candidates if path.exists()), None)
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
def evaluate_single_task(
    manifest_path: Path,
    output_dir: Path,
    base_url: str,
    timeout: float,
    use_reference: bool,
) -> Optional[Dict]:
    """Run the full evaluation pipeline for one task manifest.

    Returns:
        Dict with the task name, raw evaluation results, and summary
        statistics, or None when the manifest cannot be read.
    """
    try:
        manifest_data = read_manifest(manifest_path)
    except FileNotFoundError as exc:
        print(f"Error reading manifest: {exc}")
        return None

    task_name = manifest_data.get("task_name", "unknown")

    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Evaluating task: {task_name}")
    print(f"Manifest: {manifest_path}")
    print(banner)

    # Run evaluation, then summarize.
    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=base_url,
        timeout=timeout,
        use_reference=use_reference,
    )
    statistics = compute_statistics(evaluation_results)

    divider = "-" * 80
    print("\n" + divider)
    print("TASK EVALUATION SUMMARY")
    print(divider)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    if statistics:
        print(f"\nMean success value: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Values >= 90: {statistics.get('count_above_90', 0)} ({statistics.get('percent_above_90', 0):.1f}%)")

    # Persist per-task artifacts under a task-named subdirectory.
    task_output_dir = output_dir / task_name
    task_output_dir.mkdir(parents=True, exist_ok=True)
    save_results(evaluation_results, statistics, task_output_dir)

    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, task_output_dir)

    return {
        "task_name": task_name,
        "evaluation_results": evaluation_results,
        "statistics": statistics,
    }
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
def plot_aggregate_statistics(all_task_results: List[Dict], output_dir: Path) -> None:
    """Render a 2x2 cross-task summary figure and save it as PNG + PDF."""
    if not all_task_results:
        return

    task_names = [entry["task_name"] for entry in all_task_results]
    task_means = [entry["statistics"]["last_value_mean"] for entry in all_task_results]
    task_medians = [entry["statistics"]["last_value_median"] for entry in all_task_results]
    task_stds = [entry["statistics"]["last_value_std"] for entry in all_task_results]
    positions = range(len(task_names))
    tick_labels = range(1, len(task_names) + 1)

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle("VLAC Value Estimation - Aggregate Statistics Across All Tasks", fontsize=16, fontweight='bold')

    # Top-left: mean success value per task.
    mean_ax = axes[0, 0]
    mean_ax.bar(positions, task_means, color='steelblue', alpha=0.7)
    mean_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    mean_ax.axhline(np.mean(task_means), color='green', linestyle='-', linewidth=2, label=f'Overall Mean ({np.mean(task_means):.1f})')
    mean_ax.set_xlabel('Task', fontsize=12)
    mean_ax.set_ylabel('Mean Success Value', fontsize=12)
    mean_ax.set_title('Mean Success Frame Values by Task', fontsize=14)
    mean_ax.set_xticks(positions)
    mean_ax.set_xticklabels(tick_labels)
    mean_ax.legend()
    mean_ax.grid(True, alpha=0.3, axis='y')

    # Top-right: histogram of task-level mean values.
    hist_ax = axes[0, 1]
    hist_ax.hist(task_means, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
    hist_ax.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    hist_ax.axvline(np.mean(task_means), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(task_means):.1f})')
    hist_ax.set_xlabel('Mean Success Value', fontsize=12)
    hist_ax.set_ylabel('Frequency (Tasks)', fontsize=12)
    hist_ax.set_title('Distribution of Task-Level Mean Values', fontsize=14)
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Bottom-left: median success value per task.
    median_ax = axes[1, 0]
    median_ax.bar(positions, task_medians, color='coral', alpha=0.7)
    median_ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    median_ax.axhline(np.median(task_medians), color='green', linestyle='-', linewidth=2, label=f'Overall Median ({np.median(task_medians):.1f})')
    median_ax.set_xlabel('Task', fontsize=12)
    median_ax.set_ylabel('Median Success Value', fontsize=12)
    median_ax.set_title('Median Success Frame Values by Task', fontsize=14)
    median_ax.set_xticks(positions)
    median_ax.set_xticklabels(tick_labels)
    median_ax.legend()
    median_ax.grid(True, alpha=0.3, axis='y')

    # Bottom-right: per-task standard deviation.
    std_ax = axes[1, 1]
    std_ax.bar(positions, task_stds, color='orange', alpha=0.7)
    std_ax.axhline(np.mean(task_stds), color='green', linestyle='-', linewidth=2, label=f'Mean Std ({np.mean(task_stds):.1f})')
    std_ax.set_xlabel('Task', fontsize=12)
    std_ax.set_ylabel('Standard Deviation', fontsize=12)
    std_ax.set_title('Variability in Success Values by Task', fontsize=14)
    std_ax.set_xticks(positions)
    std_ax.set_xticklabels(tick_labels)
    std_ax.legend()
    std_ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    png_path = output_dir / "aggregate_statistics.png"
    plt.savefig(png_path, dpi=300, bbox_inches='tight')
    print(f"\nAggregate plot saved to: {png_path}")

    pdf_path = output_dir / "aggregate_statistics.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"Aggregate PDF saved to: {pdf_path}")

    plt.close()
|
| 534 |
+
|
| 535 |
+
|
| 536 |
+
# ---------------------------------------------------------------------------
|
| 537 |
+
# CLI
|
| 538 |
+
# ---------------------------------------------------------------------------
|
| 539 |
+
|
| 540 |
+
|
| 541 |
+
def parse_args() -> argparse.Namespace:
    """Parse and validate command-line arguments.

    Exactly one path argument is required, depending on the mode:
    --manifests-root with --process-all-tasks, --manifest-path otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )

    # Mode selection.
    parser.add_argument(
        "--process-all-tasks",
        action="store_true",
        help="Process all LIBERO-10 tasks",
    )

    # All-tasks mode.
    parser.add_argument(
        "--manifests-root",
        type=Path,
        help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)",
    )

    # Single-task mode.
    parser.add_argument(
        "--manifest-path",
        type=Path,
        help="Path to the test manifest JSON file (for single task mode)",
    )

    # Shared options.
    parser.add_argument(
        "--output-dir",
        type=Path,
        default="evaluation_results",
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )

    args = parser.parse_args()

    # Enforce the per-mode required path argument.
    if args.process_all_tasks and not args.manifests_root:
        parser.error("--manifests-root is required when using --process-all-tasks")
    if not args.process_all_tasks and not args.manifest_path:
        parser.error("--manifest-path is required for single task mode")

    return args
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
def main() -> int:
    """Entry point: run single-task or all-task evaluation based on CLI flags.

    Returns:
        Process exit code: 0 on success, 1 on a fatal setup error.
    """
    args = parse_args()

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 80)
    print("VLAC Value Estimation Evaluation")
    print("=" * 80)

    # BUGFIX: --process-all-tasks was validated by parse_args() but never
    # acted upon here; main() unconditionally ran single-task mode (and
    # crashed on a None manifest path), leaving evaluate_single_task,
    # find_manifest_file, and plot_aggregate_statistics as dead code.
    if args.process_all_tasks:
        return _run_all_tasks(args, output_dir)
    return _run_single_task(args, output_dir)


def _run_all_tasks(args: argparse.Namespace, output_dir: Path) -> int:
    """Evaluate every LIBERO-10 task found under --manifests-root."""
    all_task_results = []
    for task_name in libero_10_task_list:
        manifest_path = find_manifest_file(args.manifests_root, task_name)
        if manifest_path is None:
            print(f"[warn] No manifest found for task {task_name}; skipping")
            continue
        task_result = evaluate_single_task(
            manifest_path=manifest_path,
            output_dir=output_dir,
            base_url=args.base_url,
            timeout=args.timeout,
            use_reference=args.use_reference,
        )
        if task_result is not None and task_result["statistics"]:
            all_task_results.append(task_result)

    if not all_task_results:
        print("\nNo tasks were successfully evaluated.")
        return 1

    plot_aggregate_statistics(all_task_results, output_dir)

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)
    return 0


def _run_single_task(args: argparse.Namespace, output_dir: Path) -> int:
    """Evaluate the single task described by --manifest-path (original flow)."""
    try:
        manifest_data = read_manifest(args.manifest_path)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        return 1

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    if statistics:
        print("\n" + "-" * 80)
        print("SUCCESS FRAME VALUE STATISTICS")
        print("-" * 80)
        print(f"Mean: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Min: {statistics['last_value_min']:.2f}")
        print(f"Max: {statistics['last_value_max']:.2f}")
        print(f"Q25: {statistics['last_value_q25']:.2f}")
        print(f"Q75: {statistics['last_value_q75']:.2f}")

        print("\n" + "-" * 80)
        print("THRESHOLD ANALYSIS")
        print("-" * 80)
        for threshold in [80, 85, 90, 95, 100]:
            count = statistics[f"count_above_{threshold}"]
            percent = statistics[f"percent_above_{threshold}"]
            print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

        print("\n" + "-" * 80)
        print(f"Mean latency: {statistics['mean_latency']:.2f}s")
        print("-" * 80)

    # Save results
    save_results(evaluation_results, statistics, output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, output_dir)
    else:
        print("\nNo successful evaluations to plot.")

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0
|
| 680 |
+
|
| 681 |
+
|
| 682 |
+
if __name__ == "__main__":
|
| 683 |
+
sys.exit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008152700.py
ADDED
|
@@ -0,0 +1,784 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
# Evaluate all LIBERO-10 tasks
|
| 12 |
+
python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
|
| 13 |
+
|
| 14 |
+
# Evaluate a single task
|
| 15 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 16 |
+
|
| 17 |
+
Examples:
|
| 18 |
+
# Evaluate all LIBERO-10 tasks
|
| 19 |
+
python evaluate_test_demo_values.py \
|
| 20 |
+
--process-all-tasks \
|
| 21 |
+
--manifests-root toy_test_demos_LIBERO_10 \
|
| 22 |
+
--output-dir evaluation_results_all_tasks \
|
| 23 |
+
--base-url http://localhost:8111
|
| 24 |
+
|
| 25 |
+
# Evaluate a single task
|
| 26 |
+
python evaluate_test_demo_values.py \
|
| 27 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 28 |
+
--output-dir evaluation_results \
|
| 29 |
+
--base-url http://localhost:8111
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import argparse
|
| 35 |
+
import base64
|
| 36 |
+
import json
|
| 37 |
+
import os
|
| 38 |
+
import glob
|
| 39 |
+
import sys
|
| 40 |
+
import time
|
| 41 |
+
from io import BytesIO
|
| 42 |
+
from pathlib import Path
|
| 43 |
+
from typing import Dict, List, Optional
|
| 44 |
+
|
| 45 |
+
import matplotlib.pyplot as plt
|
| 46 |
+
import numpy as np
|
| 47 |
+
import requests
|
| 48 |
+
from PIL import Image
|
| 49 |
+
from tqdm import tqdm
|
| 50 |
+
|
| 51 |
+
# LIBERO-10 task list
|
| 52 |
+
LIBERO_10_TASKS = [
|
| 53 |
+
"KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
|
| 54 |
+
"KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
|
| 55 |
+
"KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
|
| 56 |
+
"KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
|
| 57 |
+
"LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
|
| 58 |
+
"LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
|
| 59 |
+
"LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
|
| 60 |
+
"LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
|
| 61 |
+
"LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
|
| 62 |
+
"STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
|
| 63 |
+
]
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# Helpers
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
|
| 69 |
+
def sample_fixed_interval_frames(image_list, num_frames):
    """Sample ``num_frames`` frames from ``image_list`` at equal intervals.

    For inputs with two or more frames, the first and the last frame are
    always included and the remaining samples are evenly spaced between
    them. A single-frame input is simply repeated ``num_frames`` times.

    Args:
        image_list: Ordered sequence of frames (e.g. image file paths).
        num_frames: Number of frames to sample; expected to be >= 1.

    Returns:
        A list of ``num_frames`` elements taken from ``image_list``.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        return [image_list[0]] * num_frames
    # np.linspace with the default endpoint=True always includes indices
    # 0 and len-1, so it covers every num_frames >= 2 uniformly. This also
    # fixes the former num_frames == 3 special case, which always used
    # image_list[1] as the middle sample instead of the true midpoint,
    # contradicting the equal-interval intent.
    indices = np.linspace(start=0, stop=len(image_list) - 1, num=num_frames, dtype=int)
    return [image_list[i] for i in indices]
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# Number of frames sampled from each expert demo to form the reference
# trajectory sent to the VLAC critic.
num_frames_for_reference = 8
# NOTE(review): machine-specific hardcoded path — consider promoting this to
# a CLI flag or environment variable.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"

# Backward-compatible alias: the task list was previously duplicated here
# verbatim; reuse the canonical constant so the two cannot drift apart.
libero_10_task_list = LIBERO_10_TASKS


def _load_reference_frames(root_dir, num_frames):
    """Collect evenly sampled reference frame paths for every LIBERO-10 task.

    Expects one ``<task_name>_demo`` subdirectory of PNG frames per task.
    Propagates ValueError from sample_fixed_interval_frames when a task
    directory yields no frames.
    """
    frames = {}
    for task_name in libero_10_task_list:
        task_dir = os.path.join(root_dir, task_name + "_demo")
        frame_files = sorted(glob.glob(os.path.join(task_dir, "*.png")))
        frames[task_name] = sample_fixed_interval_frames(frame_files, num_frames)
    return frames


# Built eagerly at import time (matches the original behavior); consumed by
# evaluate_demos when attaching the reference trajectory to requests.
reference_frames_dict = _load_reference_frames(ref_frm_root_dir, num_frames_for_reference)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and absolutize its frame paths.

    Args:
        manifest_path: Path to the manifest JSON file.

    Returns:
        The parsed manifest dict; every demo's ``frame_paths`` entries are
        rewritten relative to the manifest's own directory.

    Raises:
        FileNotFoundError: If the manifest file does not exist.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    data = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Frame paths in the manifest are stored relative to its directory.
    base_dir = manifest_path.parent
    for entry in data.get("demos", []):
        entry["frame_paths"] = [str(base_dir / rel) for rel in entry["frame_paths"]]

    return data
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def image_to_base64(path: Path) -> str:
    """Return the image at *path* re-encoded as a base64 JPEG (quality 95).

    The image is converted to RGB first, so alpha channels and palette
    images are handled uniformly.
    """
    buf = BytesIO()
    with Image.open(path) as img:
        img.convert("RGB").save(buf, format="JPEG", quality=95)
    return base64.b64encode(buf.getvalue()).decode("utf-8")
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in *paths* (see image_to_base64)."""
    return [image_to_base64(p) for p in map(Path, paths)]
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Args:
        session: Reusable HTTP session.
        base_url: Service base URL; a trailing slash is tolerated.
        task: Task description string.
        frames_b64: Base64-encoded trajectory frames.
        reference_b64: Optional base64-encoded reference frames.
        timeout: Per-request timeout in seconds.

    Returns:
        The parsed JSON response, augmented with a ``latency_sec`` field
        measuring wall-clock round-trip time.

    Raises:
        requests.HTTPError: On a non-2xx response.
    """
    payload = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # Cap the service-side batch at 8 frames per forward pass.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started_at = time.time()
    response = session.post(endpoint, json=payload, timeout=timeout)
    response.raise_for_status()
    parsed = response.json()
    parsed["latency_sec"] = time.time() - started_at
    return parsed
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ---------------------------------------------------------------------------
|
| 168 |
+
# Evaluation
|
| 169 |
+
# ---------------------------------------------------------------------------
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, any]:
    """Evaluate all demos in a manifest against the VLAC trajectory critic.

    Args:
        manifest_data: Parsed manifest (see read_manifest) whose demo
            ``frame_paths`` are absolute.
        base_url: VLAC service base URL.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: When True, attach the task's expert reference frames
            (from ``reference_frames_dict``) to every request. Previously
            this flag was accepted and printed but ignored — the reference
            was always sent.

    Returns:
        Dict with the task name, per-demo results, and failed demo names.
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    # Encode the reference trajectory once: it is identical for every demo,
    # so re-encoding per demo would be wasted work. Missing reference data
    # degrades gracefully to a no-reference request instead of a KeyError.
    reference_b64 = None
    if use_reference:
        ref_paths = reference_frames_dict.get(task_name)
        if ref_paths:
            print(f"Using reference frames for task {task_name}")
            reference_b64 = encode_images(ref_paths)
        else:
            print(f"[warn] No reference frames available for task {task_name}")

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            frames_b64 = encode_images(frame_paths)

            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            results.append({
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # The critical value for success frame
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            })
        # Restored from the commented-out handler: one unreachable service
        # or unreadable frame must not abort the whole evaluation run.
        except requests.RequestException as exc:
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except OSError as exc:
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarize per-demo evaluation results into aggregate statistics.

    Args:
        evaluation_results: Output of evaluate_demos.

    Returns:
        Dict of distribution statistics over each demo's success-frame
        ("last") value, mean request latency, and per-threshold counts.
        Empty dict when there are no successful evaluations.
    """
    demo_results = evaluation_results["results"]
    if not demo_results:
        return {}

    last_values = np.asarray([d["last_value"] for d in demo_results], dtype=float)
    latencies = [d["latency_sec"] for d in demo_results]

    q25, median, q75 = np.percentile(last_values, [25, 50, 75])
    stats = {
        "last_value_mean": float(last_values.mean()),
        "last_value_std": float(last_values.std()),
        "last_value_min": float(last_values.min()),
        "last_value_max": float(last_values.max()),
        "last_value_median": float(median),
        "last_value_q25": float(q25),
        "last_value_q75": float(q75),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(demo_results),
    }

    # For each threshold, record how many demos' success-frame value
    # reached it, both as a count and as a percentage of all demos.
    n = len(last_values)
    for threshold in (80, 85, 90, 95, 100):
        above = int((last_values >= threshold).sum())
        stats[f"count_above_{threshold}"] = above
        stats[f"percent_above_{threshold}"] = float(above / n * 100)

    return stats
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create visualization plots for value distribution.

    Renders a 2x2 figure over each demo's success-frame ("last") value:
    histogram, box plot, per-demo scatter, and cumulative distribution.
    Saves both PNG and PDF under *output_dir* and closes the figure.
    Prints and returns early when there are no results.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values
    # Red dashed line marks the target value 100; green solid marks the mean.
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    # NOTE(review): the ``labels=`` keyword of boxplot is deprecated in
    # newer matplotlib (renamed ``tick_labels``) — confirm pinned version.
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution
    # Empirical CDF: percentage of demos at or below each sorted value.
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    plt.close()
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Persist per-demo results and summary statistics as JSON.

    Writes ``<task>_evaluation_results.json`` and ``<task>_statistics.json``
    into *output_dir* and reports each written path on stdout.
    """
    task_name = evaluation_results["task_name"]

    # Save detailed results
    results_path = output_dir / f"{task_name}_evaluation_results.json"
    results_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {results_path}")

    # Save summary statistics
    stats_path = output_dir / f"{task_name}_statistics.json"
    stats_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {stats_path}")
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def find_manifest_file(manifests_root: Path, task_name: str) -> Optional[Path]:
    """Locate the test manifest for *task_name* under *manifests_root*.

    Probes the commonly used layouts in priority order and returns the
    first existing candidate, or None when no candidate exists.
    """
    manifest_filename = f"{task_name}_test_manifest.json"
    candidates = (
        manifests_root / task_name / manifest_filename,
        manifests_root / task_name / "test_manifest.json",
        manifests_root / manifest_filename,
    )
    return next((c for c in candidates if c.exists()), None)
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
def evaluate_single_task(
    manifest_path: Path,
    output_dir: Path,
    base_url: str,
    timeout: float,
    use_reference: bool,
) -> Optional[Dict]:
    """Evaluate a single task and return the statistics.

    Full pipeline for one task: read the manifest, run every demo through
    the critic (evaluate_demos), compute summary statistics, print a
    console summary, and save JSON results plus plots under
    ``output_dir/<task_name>``.

    Returns:
        Dictionary with evaluation results and statistics, or None if failed
    """
    try:
        manifest_data = read_manifest(manifest_path)
    except FileNotFoundError as exc:
        # A missing manifest is a per-task failure, not a fatal error:
        # the caller treats None as "skip this task".
        print(f"Error reading manifest: {exc}")
        return None

    task_name = manifest_data.get("task_name", "unknown")

    print(f"\n{'='*80}")
    print(f"Evaluating task: {task_name}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*80}")

    # Run evaluation
    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=base_url,
        timeout=timeout,
        use_reference=use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "-" * 80)
    print("TASK EVALUATION SUMMARY")
    print("-" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is empty when no demo evaluated successfully.
    if statistics:
        print(f"\nMean success value: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Values >= 90: {statistics.get('count_above_90', 0)} ({statistics.get('percent_above_90', 0):.1f}%)")

    # Save results into a per-task subdirectory.
    task_output_dir = output_dir / task_name
    task_output_dir.mkdir(parents=True, exist_ok=True)
    save_results(evaluation_results, statistics, task_output_dir)

    # Create plots (skipped when there are no successful results).
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, task_output_dir)

    return {
        "task_name": task_name,
        "evaluation_results": evaluation_results,
        "statistics": statistics,
    }
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
def plot_aggregate_statistics(all_task_results: List[Dict], output_dir: Path) -> None:
    """Create aggregate plots across all tasks.

    Renders a 2x2 figure over per-task summary statistics (as produced by
    evaluate_single_task): mean values by task, their distribution, median
    values by task, and standard deviations by task. Saves PNG and PDF
    under *output_dir*; no-op when the input list is empty.
    """
    if not all_task_results:
        return

    # Extract data
    task_names = [r["task_name"] for r in all_task_results]
    mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
    median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
    std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle("VLAC Value Estimation - Aggregate Statistics Across All Tasks", fontsize=16, fontweight='bold')

    # 1. Mean values per task
    # Bars are indexed 1..N on the x-axis rather than labeled with the
    # (very long) task names.
    ax1 = axes[0, 0]
    bars = ax1.bar(range(len(task_names)), mean_values, color='steelblue', alpha=0.7)
    ax1.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axhline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Overall Mean ({np.mean(mean_values):.1f})')
    ax1.set_xlabel('Task', fontsize=12)
    ax1.set_ylabel('Mean Success Value', fontsize=12)
    ax1.set_title('Mean Success Frame Values by Task', fontsize=14)
    ax1.set_xticks(range(len(task_names)))
    ax1.set_xticklabels(range(1, len(task_names) + 1))
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # 2. Distribution of mean values
    ax2 = axes[0, 1]
    ax2.hist(mean_values, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
    ax2.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.axvline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(mean_values):.1f})')
    ax2.set_xlabel('Mean Success Value', fontsize=12)
    ax2.set_ylabel('Frequency (Tasks)', fontsize=12)
    ax2.set_title('Distribution of Task-Level Mean Values', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Median values per task
    ax3 = axes[1, 0]
    bars = ax3.bar(range(len(task_names)), median_values, color='coral', alpha=0.7)
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.median(median_values), color='green', linestyle='-', linewidth=2, label=f'Overall Median ({np.median(median_values):.1f})')
    ax3.set_xlabel('Task', fontsize=12)
    ax3.set_ylabel('Median Success Value', fontsize=12)
    ax3.set_title('Median Success Frame Values by Task', fontsize=14)
    ax3.set_xticks(range(len(task_names)))
    ax3.set_xticklabels(range(1, len(task_names) + 1))
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')

    # 4. Std deviation per task
    ax4 = axes[1, 1]
    bars = ax4.bar(range(len(task_names)), std_values, color='orange', alpha=0.7)
    ax4.axhline(np.mean(std_values), color='green', linestyle='-', linewidth=2, label=f'Mean Std ({np.mean(std_values):.1f})')
    ax4.set_xlabel('Task', fontsize=12)
    ax4.set_ylabel('Standard Deviation', fontsize=12)
    ax4.set_title('Variability in Success Values by Task', fontsize=14)
    ax4.set_xticks(range(len(task_names)))
    ax4.set_xticklabels(range(1, len(task_names) + 1))
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    # Save plots
    plot_path = output_dir / "aggregate_statistics.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nAggregate plot saved to: {plot_path}")

    pdf_path = output_dir / "aggregate_statistics.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"Aggregate PDF saved to: {pdf_path}")

    plt.close()
|
| 534 |
+
|
| 535 |
+
|
| 536 |
+
# ---------------------------------------------------------------------------
|
| 537 |
+
# CLI
|
| 538 |
+
# ---------------------------------------------------------------------------
|
| 539 |
+
|
| 540 |
+
|
| 541 |
+
def parse_args() -> argparse.Namespace:
    """Parse and validate command-line arguments.

    Two mutually exclusive modes: ``--process-all-tasks`` (requires
    ``--manifests-root``) evaluates every LIBERO-10 task; otherwise a
    single task is evaluated from ``--manifest-path``.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )

    # Mode selection
    parser.add_argument(
        "--process-all-tasks",
        action="store_true",
        help="Process all LIBERO-10 tasks"
    )

    # Arguments for processing all tasks
    parser.add_argument(
        "--manifests-root",
        type=Path,
        help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)"
    )

    # Arguments for processing a single task
    parser.add_argument(
        "--manifest-path",
        type=Path,
        help="Path to the test manifest JSON file (for single task mode)",
    )

    # Common arguments
    parser.add_argument(
        "--output-dir",
        type=Path,
        # argparse applies type= to string defaults too, so this default
        # becomes a Path before main() calls .expanduser() on it.
        default="evaluation_results",
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )

    args = parser.parse_args()

    # Validate arguments: each mode has one required path argument.
    if args.process_all_tasks:
        if not args.manifests_root:
            parser.error("--manifests-root is required when using --process-all-tasks")
    else:
        if not args.manifest_path:
            parser.error("--manifest-path is required for single task mode")

    return args
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
def main() -> int:
    """CLI entry point.

    Dispatches to batch mode (all LIBERO-10 tasks) or single-task mode
    based on the parsed arguments.

    Returns:
        Process exit code: 0 on success, 1 on a fatal setup/evaluation
        failure.
    """
    args = parse_args()

    # Create output directory
    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.process_all_tasks:
        # Process all LIBERO-10 tasks
        manifests_root = args.manifests_root.expanduser()

        if not manifests_root.exists():
            print(f"Error: Manifests root directory not found: {manifests_root}")
            return 1

        print("=" * 80)
        print("EVALUATING ALL LIBERO-10 TASKS")
        print("=" * 80)
        print(f"Manifests root: {manifests_root}")
        print(f"Output directory: {output_dir}")
        print(f"Base URL: {args.base_url}")
        print(f"Total tasks to evaluate: {len(LIBERO_10_TASKS)}")
        print("=" * 80)

        successful_tasks = []
        failed_tasks = []
        all_task_results = []

        for idx, task_name in enumerate(LIBERO_10_TASKS, 1):
            print(f"\n[{idx}/{len(LIBERO_10_TASKS)}] Processing: {task_name}")

            # Find manifest file
            manifest_path = find_manifest_file(manifests_root, task_name)
            if manifest_path is None:
                # Missing manifest: record and continue with remaining tasks.
                print(f" [ERROR] Manifest file not found for task: {task_name}")
                failed_tasks.append(task_name)
                continue

            # Evaluate the task
            result = evaluate_single_task(
                manifest_path=manifest_path,
                output_dir=output_dir,
                base_url=args.base_url,
                timeout=args.timeout,
                use_reference=args.use_reference,
            )

            if result:
                successful_tasks.append(task_name)
                all_task_results.append(result)
            else:
                failed_tasks.append(task_name)

        # Print overall summary
        print("\n" + "=" * 80)
        print("EVALUATION COMPLETE - ALL TASKS")
        print("=" * 80)
        print(f"Successfully evaluated: {len(successful_tasks)}/{len(LIBERO_10_TASKS)} tasks")
        print(f"Failed: {len(failed_tasks)}/{len(LIBERO_10_TASKS)} tasks")

        if failed_tasks:
            print("\nFailed tasks:")
            for task in failed_tasks:
                print(f" - {task}")

        # Compute and display aggregate statistics
        if all_task_results:
            print("\n" + "=" * 80)
            print("AGGREGATE STATISTICS ACROSS ALL TASKS")
            print("=" * 80)

            all_mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
            all_median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
            all_std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

            print(f"\nOverall mean of task means: {np.mean(all_mean_values):.2f} ± {np.std(all_mean_values):.2f}")
            print(f"Overall median of task medians: {np.median(all_median_values):.2f}")
            print(f"Average std deviation: {np.mean(all_std_values):.2f}")

            print(f"\nBest performing task: {all_task_results[np.argmax(all_mean_values)]['task_name']} ({max(all_mean_values):.2f})")
            print(f"Worst performing task: {all_task_results[np.argmin(all_mean_values)]['task_name']} ({min(all_mean_values):.2f})")

            # Save aggregate statistics
            aggregate_stats = {
                "total_tasks": len(LIBERO_10_TASKS),
                "successful_tasks": len(successful_tasks),
                "failed_tasks": len(failed_tasks),
                "overall_mean_of_means": float(np.mean(all_mean_values)),
                "overall_std_of_means": float(np.std(all_mean_values)),
                "overall_median_of_medians": float(np.median(all_median_values)),
                "average_std_deviation": float(np.mean(all_std_values)),
                "best_task": all_task_results[np.argmax(all_mean_values)]['task_name'],
                "best_task_mean_value": float(max(all_mean_values)),
                "worst_task": all_task_results[np.argmin(all_mean_values)]['task_name'],
                "worst_task_mean_value": float(min(all_mean_values)),
                "task_results": [
                    {
                        "task_name": r["task_name"],
                        "mean_value": r["statistics"]["last_value_mean"],
                        "median_value": r["statistics"]["last_value_median"],
                        "std_value": r["statistics"]["last_value_std"],
                    }
                    for r in all_task_results
                ]
            }

            aggregate_path = output_dir / "aggregate_statistics.json"
            with aggregate_path.open("w", encoding="utf-8") as f:
                json.dump(aggregate_stats, f, indent=2)
            print(f"\nAggregate statistics saved to: {aggregate_path}")

            # Create aggregate plots
            plot_aggregate_statistics(all_task_results, output_dir)

            print("\n" + "=" * 80)
            print(f"All results saved to: {output_dir}")
            print("=" * 80)

    else:
        # Process a single task
        print("=" * 80)
        print("VLAC Value Estimation Evaluation - Single Task")
        print("=" * 80)

        result = evaluate_single_task(
            manifest_path=args.manifest_path.expanduser(),
            output_dir=output_dir,
            base_url=args.base_url,
            timeout=args.timeout,
            use_reference=args.use_reference,
        )

        if not result:
            print("\nEvaluation failed!")
            return 1

        # Print detailed statistics for single task
        statistics = result["statistics"]
        evaluation_results = result["evaluation_results"]

        print("\n" + "=" * 80)
        print("DETAILED EVALUATION SUMMARY")
        print("=" * 80)
        print(f"Task: {evaluation_results['task_name']}")
        print(f"Total demos: {evaluation_results['total_demos']}")
        print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
        print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

        # statistics is empty when no demo evaluated successfully.
        if statistics:
            print("\n" + "-" * 80)
            print("SUCCESS FRAME VALUE STATISTICS")
            print("-" * 80)
            print(f"Mean: {statistics['last_value_mean']:.2f}")
            print(f"Std Dev: {statistics['last_value_std']:.2f}")
            print(f"Median: {statistics['last_value_median']:.2f}")
            print(f"Min: {statistics['last_value_min']:.2f}")
            print(f"Max: {statistics['last_value_max']:.2f}")
            print(f"Q25: {statistics['last_value_q25']:.2f}")
            print(f"Q75: {statistics['last_value_q75']:.2f}")

            print("\n" + "-" * 80)
            print("THRESHOLD ANALYSIS")
            print("-" * 80)
            for threshold in [80, 85, 90, 95, 100]:
                count = statistics[f"count_above_{threshold}"]
                percent = statistics[f"percent_above_{threshold}"]
                print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

            print("\n" + "-" * 80)
            print(f"Mean latency: {statistics['mean_latency']:.2f}s")
            print("-" * 80)

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return 0
|
| 781 |
+
|
| 782 |
+
|
| 783 |
+
if __name__ == "__main__":
    # Equivalent to sys.exit(main()): sys.exit simply raises SystemExit.
    raise SystemExit(main())
|
Dev/.history/testing/evaluate_test_demo_values_20251008152727.py
ADDED
|
@@ -0,0 +1,784 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py.
|
| 3 |
+
|
| 4 |
+
This script:
|
| 5 |
+
1. Reads test demo manifests created by prepare_test_demo_single_task.py
|
| 6 |
+
2. Calls the VLAC trajectory-critic service for each demo
|
| 7 |
+
3. Records the last value (success frame value) - ideally should be 100
|
| 8 |
+
4. Plots statistics to visualize the value distribution
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
# Evaluate all LIBERO-10 tasks
|
| 12 |
+
python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir>
|
| 13 |
+
|
| 14 |
+
# Evaluate a single task
|
| 15 |
+
python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir>
|
| 16 |
+
|
| 17 |
+
Examples:
|
| 18 |
+
# Evaluate all LIBERO-10 tasks
|
| 19 |
+
python evaluate_test_demo_values.py \
|
| 20 |
+
--process-all-tasks \
|
| 21 |
+
--manifests-root toy_test_demos_LIBERO_10 \
|
| 22 |
+
--output-dir evaluation_results_all_tasks \
|
| 23 |
+
--base-url http://localhost:8111
|
| 24 |
+
|
| 25 |
+
# Evaluate a single task
|
| 26 |
+
python evaluate_test_demo_values.py \
|
| 27 |
+
--manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \
|
| 28 |
+
--output-dir evaluation_results \
|
| 29 |
+
--base-url http://localhost:8111
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import argparse
|
| 35 |
+
import base64
|
| 36 |
+
import json
|
| 37 |
+
import os
|
| 38 |
+
import glob
|
| 39 |
+
import sys
|
| 40 |
+
import time
|
| 41 |
+
from io import BytesIO
|
| 42 |
+
from pathlib import Path
|
| 43 |
+
from typing import Dict, List, Optional
|
| 44 |
+
|
| 45 |
+
import matplotlib.pyplot as plt
|
| 46 |
+
import numpy as np
|
| 47 |
+
import requests
|
| 48 |
+
from PIL import Image
|
| 49 |
+
from tqdm import tqdm
|
| 50 |
+
|
| 51 |
+
# LIBERO-10 task list
# Canonical names of the ten LIBERO-10 benchmark tasks. Used both to locate
# per-task manifest files and (in all-tasks mode) to drive the evaluation loop.
LIBERO_10_TASKS = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
]
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# Helpers
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
|
| 69 |
+
def sample_fixed_interval_frames(image_list, num_frames):
    """Sample ``num_frames`` frames from ``image_list`` at equal intervals.

    The first and last frames are always included (for ``num_frames >= 2``),
    and the remaining frames are spaced as evenly as integer indices allow.

    Args:
        image_list: Ordered sequence of frames (e.g. file paths).
        num_frames: Number of frames to return; must be >= 1.

    Returns:
        A list of ``num_frames`` entries taken from ``image_list``.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    if len(image_list) == 0:
        raise ValueError("image_list is empty")
    if len(image_list) == 1:
        # Only one frame available: repeat it to reach the requested count.
        return [image_list[0]] * num_frames
    # np.linspace with inclusive endpoints guarantees the first and last
    # frames are kept and everything in between is evenly spaced. This also
    # fixes the old num_frames == 3 special case, which always picked index 1
    # instead of the true middle frame, breaking the equal-interval contract.
    indices = np.linspace(start=0, stop=len(image_list) - 1, num=num_frames, dtype=int)
    return [image_list[i] for i in indices]
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# Number of evenly spaced frames taken from each expert demo as the
# reference trajectory for the critic.
num_frames_for_reference = 8
# Root of the single-expert-demo frame dumps; one "<task>_demo" subdirectory
# of PNG frames per task. Machine-specific absolute path — adjust per host.
ref_frm_root_dir = "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10"
# Kept for backward compatibility with existing references to this name;
# identical content to LIBERO_10_TASKS defined above.
libero_10_task_list = list(LIBERO_10_TASKS)


def _load_reference_frames(root_dir, task_names, num_frames):
    """Build ``{task_name: [frame paths]}`` of evenly sampled reference frames.

    Tasks whose frame directory is missing or contains no PNGs are skipped
    with a warning instead of crashing at import time.
    """
    frames = {}
    for task_name in task_names:
        task_dir = os.path.join(root_dir, task_name + "_demo")
        frame_files = sorted(glob.glob(os.path.join(task_dir, "*.png")))
        if not frame_files:
            print(f"[warn] No reference frames found for task {task_name} in {task_dir}")
            continue
        frames[task_name] = sample_fixed_interval_frames(frame_files, num_frames)
    return frames


# Map of task name -> sampled reference frame paths, built once at import.
reference_frames_dict = _load_reference_frames(
    ref_frm_root_dir, libero_10_task_list, num_frames_for_reference
)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest JSON file and absolutize its frame paths."""
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    with manifest_path.open("r", encoding="utf-8") as handle:
        manifest = json.load(handle)

    # Frame paths are stored relative to the manifest's own directory;
    # rewrite them in place as absolute paths rooted there.
    base_dir = manifest_path.parent
    for demo in manifest.get("demos", []):
        resolved = []
        for rel_path in demo["frame_paths"]:
            resolved.append(str(base_dir / rel_path))
        demo["frame_paths"] = resolved

    return manifest
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def image_to_base64(path: Path) -> str:
    """Encode the image at *path* as a base64 string of JPEG bytes."""
    buffer = BytesIO()
    with Image.open(path) as img:
        # Normalize to RGB so JPEG encoding always succeeds (e.g. RGBA PNGs).
        img.convert("RGB").save(buffer, format="JPEG", quality=95)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode every image file in *paths*, preserving order."""
    return [image_to_base64(Path(entry)) for entry in paths]
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Returns the parsed JSON response with an added ``latency_sec`` field
    measuring the round-trip wall-clock time. Raises ``requests.HTTPError``
    on non-2xx responses.
    """
    request_body = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        # The service processes frames in batches; cap at 8 per request.
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    started_at = time.time()
    response = session.post(endpoint, json=request_body, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    payload["latency_sec"] = time.time() - started_at
    return payload
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ---------------------------------------------------------------------------
|
| 168 |
+
# Evaluation
|
| 169 |
+
# ---------------------------------------------------------------------------
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict:
    """Evaluate every demo in *manifest_data* against the VLAC critic service.

    Args:
        manifest_data: Parsed manifest (see ``read_manifest``).
        base_url: VLAC service base URL.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: When True, attach the task's expert reference frames
            (previously this flag was ignored and the reference was always sent).

    Returns:
        Dict with the task name, demo counts, failed demo names, and a list of
        per-demo result dicts (values, last/mean/std value, latency).
    """
    session = requests.Session()
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    # Encode the reference trajectory once; it is identical for every demo,
    # so re-encoding it inside the loop would be wasted work.
    reference_b64 = None
    if use_reference and task_name in reference_frames_dict:
        print(f"Using reference frames for task {task_name}")
        reference_b64 = encode_images(reference_frames_dict[task_name])

    for demo in tqdm(demos, desc="Processing demos"):
        demo_name = demo["demo_name"]
        frame_paths = demo["frame_paths"]

        try:
            frames_b64 = encode_images(frame_paths)

            result = call_trajectory_critic(
                session=session,
                base_url=base_url,
                task=task_name,
                frames_b64=frames_b64,
                reference_b64=reference_b64,
                timeout=timeout,
            )

            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            results.append({
                "demo_name": demo_name,
                "total_frames": demo["total_frames"],
                "success_index": demo["success_index"],
                "num_sampled_frames": len(frame_paths),
                "value_list": value_list,
                "last_value": value_list[-1],  # success-frame value; ideally 100
                "mean_value": float(np.mean(value_list)),
                "std_value": float(np.std(value_list)),
                "latency_sec": result.get("latency_sec", 0.0),
            })
        except requests.RequestException as exc:
            # One unreachable/failed request must not abort the whole run.
            print(f"\n[error] Request failed for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)
        except Exception as exc:
            print(f"\n[error] Unexpected error for demo {demo_name}: {exc}")
            failed_demos.append(demo_name)

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Compute summary statistics over per-demo evaluation results.

    Args:
        evaluation_results: Output of ``evaluate_demos`` (reads ``"results"``).

    Returns:
        Dict of summary statistics (mean/std/min/max/median/quartiles of the
        success-frame values, mean latency, total count, and per-threshold
        counts/percentages). Empty dict when there are no successful results.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [r["last_value"] for r in results]
    latencies = [r["latency_sec"] for r in results]

    stats = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
    }

    # Count how many demos have last_value >= various thresholds.
    for threshold in [80, 85, 90, 95, 100]:
        count = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = count
        stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)

    return stats
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Create visualization plots for value distribution.

    Builds a 2x2 figure (histogram, box plot, per-demo scatter, cumulative
    distribution) of the success-frame values and saves it under *output_dir*
    as both PNG and PDF, named after the task.

    Args:
        evaluation_results: Output of ``evaluate_demos`` (reads ``"results"``
            and ``"task_name"``).
        output_dir: Existing directory to write the plot files into.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    # One value per demo: the critic's value at the final (success) frame.
    last_values = [r["last_value"] for r in results]

    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # 1. Histogram of last values
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    # Red dashed line marks the ideal value (100); green line the sample mean.
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Box plot of last values
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3. Value progression across demos
    ax3 = axes[1, 0]
    demo_indices = range(len(results))
    ax3.scatter(demo_indices, last_values, alpha=0.6, s=50, c='steelblue')
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.mean(last_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(last_values):.1f})')
    ax3.set_xlabel('Demo Index', fontsize=12)
    ax3.set_ylabel('Last Frame Value', fontsize=12)
    ax3.set_title('Success Frame Values Across Demos', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Cumulative distribution
    ax4 = axes[1, 1]
    sorted_values = np.sort(last_values)
    # Empirical CDF in percent: fraction of demos at or below each value.
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax4.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax4.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax4.set_xlabel('Success Frame Value', fontsize=12)
    ax4.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax4.set_title('Cumulative Distribution', fontsize=14)
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    plot_path = output_dir / f"{task_name}_value_distribution.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    # Also save a PDF version
    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    plt.close()
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Persist evaluation output as two JSON files inside *output_dir*.

    Writes ``<task>_evaluation_results.json`` (per-demo detail) and
    ``<task>_statistics.json`` (summary statistics).
    """
    task_name = evaluation_results["task_name"]

    # Per-demo detail file.
    detailed_path = output_dir / f"{task_name}_evaluation_results.json"
    detailed_path.write_text(json.dumps(evaluation_results, indent=2), encoding="utf-8")
    print(f"\nDetailed results saved to: {detailed_path}")

    # Summary statistics file.
    summary_path = output_dir / f"{task_name}_statistics.json"
    summary_path.write_text(json.dumps(statistics, indent=2), encoding="utf-8")
    print(f"Statistics saved to: {summary_path}")
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def find_manifest_file(manifests_root: Path, task_name: str) -> Optional[Path]:
    """Locate the manifest JSON for *task_name* under *manifests_root*.

    Checks, in order, the naming patterns commonly produced by the demo
    preparation script and returns the first existing path, or ``None``
    when no candidate exists.
    """
    candidates = (
        manifests_root / task_name / f"{task_name}_test_manifest.json",
        manifests_root / task_name / "test_manifest.json",
        manifests_root / f"{task_name}_test_manifest.json",
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
def evaluate_single_task(
    manifest_path: Path,
    output_dir: Path,
    base_url: str,
    timeout: float,
    use_reference: bool,
) -> Optional[Dict]:
    """Evaluate a single task and return the statistics.

    Orchestrates the full pipeline for one task: read the manifest, query the
    VLAC critic for every demo, compute summary statistics, save JSON results
    into ``output_dir/<task_name>/``, and render the distribution plots.

    Args:
        manifest_path: Path to the task's test manifest JSON file.
        output_dir: Root output directory; a per-task subdirectory is created.
        base_url: VLAC service base URL.
        timeout: Per-request HTTP timeout in seconds.
        use_reference: Forwarded to ``evaluate_demos``.

    Returns:
        Dictionary with evaluation results and statistics, or None if failed
    """
    try:
        manifest_data = read_manifest(manifest_path)
    except FileNotFoundError as exc:
        # Missing manifest is reported but does not raise, so the all-tasks
        # loop can continue with the remaining tasks.
        print(f"Error reading manifest: {exc}")
        return None

    task_name = manifest_data.get("task_name", "unknown")

    print(f"\n{'='*80}")
    print(f"Evaluating task: {task_name}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*80}")

    # Run evaluation
    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=base_url,
        timeout=timeout,
        use_reference=use_reference,
    )

    # Compute statistics
    statistics = compute_statistics(evaluation_results)

    # Print summary
    print("\n" + "-" * 80)
    print("TASK EVALUATION SUMMARY")
    print("-" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # statistics is empty when no demo evaluated successfully.
    if statistics:
        print(f"\nMean success value: {statistics['last_value_mean']:.2f}")
        print(f"Std Dev: {statistics['last_value_std']:.2f}")
        print(f"Median: {statistics['last_value_median']:.2f}")
        print(f"Values >= 90: {statistics.get('count_above_90', 0)} ({statistics.get('percent_above_90', 0):.1f}%)")

    # Save results
    task_output_dir = output_dir / task_name
    task_output_dir.mkdir(parents=True, exist_ok=True)
    save_results(evaluation_results, statistics, task_output_dir)

    # Create plots
    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, task_output_dir)

    return {
        "task_name": task_name,
        "evaluation_results": evaluation_results,
        "statistics": statistics,
    }
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
def plot_aggregate_statistics(all_task_results: List[Dict], output_dir: Path) -> None:
    """Create aggregate plots across all tasks.

    Builds a 2x2 figure (per-task mean bars, histogram of task means,
    per-task median bars, per-task standard-deviation bars) and saves it
    under *output_dir* as ``aggregate_statistics.png`` / ``.pdf``.

    Args:
        all_task_results: List of dicts as returned by ``evaluate_single_task``
            (each must carry ``"task_name"`` and a ``"statistics"`` dict).
        output_dir: Existing directory to write the plot files into.
    """
    if not all_task_results:
        return

    # Extract data
    task_names = [r["task_name"] for r in all_task_results]
    mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
    median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
    std_values = [r["statistics"]["last_value_std"] for r in all_task_results]

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle("VLAC Value Estimation - Aggregate Statistics Across All Tasks", fontsize=16, fontweight='bold')

    # 1. Mean values per task
    ax1 = axes[0, 0]
    bars = ax1.bar(range(len(task_names)), mean_values, color='steelblue', alpha=0.7)
    ax1.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axhline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Overall Mean ({np.mean(mean_values):.1f})')
    ax1.set_xlabel('Task', fontsize=12)
    ax1.set_ylabel('Mean Success Value', fontsize=12)
    ax1.set_title('Mean Success Frame Values by Task', fontsize=14)
    # X ticks are 1-based task indices rather than (very long) task names.
    ax1.set_xticks(range(len(task_names)))
    ax1.set_xticklabels(range(1, len(task_names) + 1))
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # 2. Distribution of mean values
    ax2 = axes[0, 1]
    ax2.hist(mean_values, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
    ax2.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.axvline(np.mean(mean_values), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(mean_values):.1f})')
    ax2.set_xlabel('Mean Success Value', fontsize=12)
    ax2.set_ylabel('Frequency (Tasks)', fontsize=12)
    ax2.set_title('Distribution of Task-Level Mean Values', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Median values per task
    ax3 = axes[1, 0]
    bars = ax3.bar(range(len(task_names)), median_values, color='coral', alpha=0.7)
    ax3.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax3.axhline(np.median(median_values), color='green', linestyle='-', linewidth=2, label=f'Overall Median ({np.median(median_values):.1f})')
    ax3.set_xlabel('Task', fontsize=12)
    ax3.set_ylabel('Median Success Value', fontsize=12)
    ax3.set_title('Median Success Frame Values by Task', fontsize=14)
    ax3.set_xticks(range(len(task_names)))
    ax3.set_xticklabels(range(1, len(task_names) + 1))
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')

    # 4. Std deviation per task
    ax4 = axes[1, 1]
    bars = ax4.bar(range(len(task_names)), std_values, color='orange', alpha=0.7)
    ax4.axhline(np.mean(std_values), color='green', linestyle='-', linewidth=2, label=f'Mean Std ({np.mean(std_values):.1f})')
    ax4.set_xlabel('Task', fontsize=12)
    ax4.set_ylabel('Standard Deviation', fontsize=12)
    ax4.set_title('Variability in Success Values by Task', fontsize=14)
    ax4.set_xticks(range(len(task_names)))
    ax4.set_xticklabels(range(1, len(task_names) + 1))
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    # Save plots
    plot_path = output_dir / "aggregate_statistics.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nAggregate plot saved to: {plot_path}")

    pdf_path = output_dir / "aggregate_statistics.pdf"
    plt.savefig(pdf_path, bbox_inches='tight')
    print(f"Aggregate PDF saved to: {pdf_path}")

    plt.close()
|
| 534 |
+
|
| 535 |
+
|
| 536 |
+
# ---------------------------------------------------------------------------
|
| 537 |
+
# CLI
|
| 538 |
+
# ---------------------------------------------------------------------------
|
| 539 |
+
|
| 540 |
+
|
| 541 |
+
def parse_args() -> argparse.Namespace:
    """Parse and validate command-line arguments.

    Two modes: ``--process-all-tasks`` (requires ``--manifests-root``) runs
    every LIBERO-10 task; otherwise a single task is evaluated and
    ``--manifest-path`` is required. Invalid combinations abort via
    ``parser.error`` (exits with status 2).
    """
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )

    # Mode selection
    parser.add_argument(
        "--process-all-tasks",
        action="store_true",
        help="Process all LIBERO-10 tasks"
    )

    # Arguments for processing all tasks
    parser.add_argument(
        "--manifests-root",
        type=Path,
        help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)"
    )

    # Arguments for processing a single task
    parser.add_argument(
        "--manifest-path",
        type=Path,
        help="Path to the test manifest JSON file (for single task mode)",
    )

    # Common arguments
    parser.add_argument(
        "--output-dir",
        type=Path,
        default="evaluation_results",
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )

    args = parser.parse_args()

    # Validate arguments
    if args.process_all_tasks:
        if not args.manifests_root:
            parser.error("--manifests-root is required when using --process-all-tasks")
    else:
        if not args.manifest_path:
            parser.error("--manifest-path is required for single task mode")

    return args
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
def main() -> int:
|
| 605 |
+
args = parse_args()
|
| 606 |
+
|
| 607 |
+
# Create output directory
|
| 608 |
+
output_dir = args.output_dir.expanduser()
|
| 609 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 610 |
+
|
| 611 |
+
if args.process_all_tasks:
|
| 612 |
+
# Process all LIBERO-10 tasks
|
| 613 |
+
manifests_root = args.manifests_root.expanduser()
|
| 614 |
+
|
| 615 |
+
if not manifests_root.exists():
|
| 616 |
+
print(f"Error: Manifests root directory not found: {manifests_root}")
|
| 617 |
+
return 1
|
| 618 |
+
|
| 619 |
+
print("=" * 80)
|
| 620 |
+
print("EVALUATING ALL LIBERO-10 TASKS")
|
| 621 |
+
print("=" * 80)
|
| 622 |
+
print(f"Manifests root: {manifests_root}")
|
| 623 |
+
print(f"Output directory: {output_dir}")
|
| 624 |
+
print(f"Base URL: {args.base_url}")
|
| 625 |
+
print(f"Total tasks to evaluate: {len(LIBERO_10_TASKS)}")
|
| 626 |
+
print("=" * 80)
|
| 627 |
+
|
| 628 |
+
successful_tasks = []
|
| 629 |
+
failed_tasks = []
|
| 630 |
+
all_task_results = []
|
| 631 |
+
|
| 632 |
+
for idx, task_name in enumerate(LIBERO_10_TASKS, 1):
|
| 633 |
+
print(f"\n[{idx}/{len(LIBERO_10_TASKS)}] Processing: {task_name}")
|
| 634 |
+
|
| 635 |
+
# Find manifest file
|
| 636 |
+
manifest_path = find_manifest_file(manifests_root, task_name)
|
| 637 |
+
if manifest_path is None:
|
| 638 |
+
print(f" [ERROR] Manifest file not found for task: {task_name}")
|
| 639 |
+
failed_tasks.append(task_name)
|
| 640 |
+
continue
|
| 641 |
+
|
| 642 |
+
# Evaluate the task
|
| 643 |
+
result = evaluate_single_task(
|
| 644 |
+
manifest_path=manifest_path,
|
| 645 |
+
output_dir=output_dir,
|
| 646 |
+
base_url=args.base_url,
|
| 647 |
+
timeout=args.timeout,
|
| 648 |
+
use_reference=args.use_reference,
|
| 649 |
+
)
|
| 650 |
+
|
| 651 |
+
if result:
|
| 652 |
+
successful_tasks.append(task_name)
|
| 653 |
+
all_task_results.append(result)
|
| 654 |
+
else:
|
| 655 |
+
failed_tasks.append(task_name)
|
| 656 |
+
|
| 657 |
+
# Print overall summary
|
| 658 |
+
print("\n" + "=" * 80)
|
| 659 |
+
print("EVALUATION COMPLETE - ALL TASKS")
|
| 660 |
+
print("=" * 80)
|
| 661 |
+
print(f"Successfully evaluated: {len(successful_tasks)}/{len(LIBERO_10_TASKS)} tasks")
|
| 662 |
+
print(f"Failed: {len(failed_tasks)}/{len(LIBERO_10_TASKS)} tasks")
|
| 663 |
+
|
| 664 |
+
if failed_tasks:
|
| 665 |
+
print("\nFailed tasks:")
|
| 666 |
+
for task in failed_tasks:
|
| 667 |
+
print(f" - {task}")
|
| 668 |
+
|
| 669 |
+
# Compute and display aggregate statistics
|
| 670 |
+
if all_task_results:
|
| 671 |
+
print("\n" + "=" * 80)
|
| 672 |
+
print("AGGREGATE STATISTICS ACROSS ALL TASKS")
|
| 673 |
+
print("=" * 80)
|
| 674 |
+
|
| 675 |
+
all_mean_values = [r["statistics"]["last_value_mean"] for r in all_task_results]
|
| 676 |
+
all_median_values = [r["statistics"]["last_value_median"] for r in all_task_results]
|
| 677 |
+
all_std_values = [r["statistics"]["last_value_std"] for r in all_task_results]
|
| 678 |
+
|
| 679 |
+
print(f"\nOverall mean of task means: {np.mean(all_mean_values):.2f} ± {np.std(all_mean_values):.2f}")
|
| 680 |
+
print(f"Overall median of task medians: {np.median(all_median_values):.2f}")
|
| 681 |
+
print(f"Average std deviation: {np.mean(all_std_values):.2f}")
|
| 682 |
+
|
| 683 |
+
print(f"\nBest performing task: {all_task_results[np.argmax(all_mean_values)]['task_name']} ({max(all_mean_values):.2f})")
|
| 684 |
+
print(f"Worst performing task: {all_task_results[np.argmin(all_mean_values)]['task_name']} ({min(all_mean_values):.2f})")
|
| 685 |
+
|
| 686 |
+
# Save aggregate statistics
|
| 687 |
+
aggregate_stats = {
|
| 688 |
+
"total_tasks": len(LIBERO_10_TASKS),
|
| 689 |
+
"successful_tasks": len(successful_tasks),
|
| 690 |
+
"failed_tasks": len(failed_tasks),
|
| 691 |
+
"overall_mean_of_means": float(np.mean(all_mean_values)),
|
| 692 |
+
"overall_std_of_means": float(np.std(all_mean_values)),
|
| 693 |
+
"overall_median_of_medians": float(np.median(all_median_values)),
|
| 694 |
+
"average_std_deviation": float(np.mean(all_std_values)),
|
| 695 |
+
"best_task": all_task_results[np.argmax(all_mean_values)]['task_name'],
|
| 696 |
+
"best_task_mean_value": float(max(all_mean_values)),
|
| 697 |
+
"worst_task": all_task_results[np.argmin(all_mean_values)]['task_name'],
|
| 698 |
+
"worst_task_mean_value": float(min(all_mean_values)),
|
| 699 |
+
"task_results": [
|
| 700 |
+
{
|
| 701 |
+
"task_name": r["task_name"],
|
| 702 |
+
"mean_value": r["statistics"]["last_value_mean"],
|
| 703 |
+
"median_value": r["statistics"]["last_value_median"],
|
| 704 |
+
"std_value": r["statistics"]["last_value_std"],
|
| 705 |
+
}
|
| 706 |
+
for r in all_task_results
|
| 707 |
+
]
|
| 708 |
+
}
|
| 709 |
+
|
| 710 |
+
aggregate_path = output_dir / "aggregate_statistics.json"
|
| 711 |
+
with aggregate_path.open("w", encoding="utf-8") as f:
|
| 712 |
+
json.dump(aggregate_stats, f, indent=2)
|
| 713 |
+
print(f"\nAggregate statistics saved to: {aggregate_path}")
|
| 714 |
+
|
| 715 |
+
# Create aggregate plots
|
| 716 |
+
plot_aggregate_statistics(all_task_results, output_dir)
|
| 717 |
+
|
| 718 |
+
print("\n" + "=" * 80)
|
| 719 |
+
print(f"All results saved to: {output_dir}")
|
| 720 |
+
print("=" * 80)
|
| 721 |
+
|
| 722 |
+
else:
|
| 723 |
+
# Process a single task
|
| 724 |
+
print("=" * 80)
|
| 725 |
+
print("VLAC Value Estimation Evaluation - Single Task")
|
| 726 |
+
print("=" * 80)
|
| 727 |
+
|
| 728 |
+
result = evaluate_single_task(
|
| 729 |
+
manifest_path=args.manifest_path.expanduser(),
|
| 730 |
+
output_dir=output_dir,
|
| 731 |
+
base_url=args.base_url,
|
| 732 |
+
timeout=args.timeout,
|
| 733 |
+
use_reference=args.use_reference,
|
| 734 |
+
)
|
| 735 |
+
|
| 736 |
+
if not result:
|
| 737 |
+
print("\nEvaluation failed!")
|
| 738 |
+
return 1
|
| 739 |
+
|
| 740 |
+
# Print detailed statistics for single task
|
| 741 |
+
statistics = result["statistics"]
|
| 742 |
+
evaluation_results = result["evaluation_results"]
|
| 743 |
+
|
| 744 |
+
print("\n" + "=" * 80)
|
| 745 |
+
print("DETAILED EVALUATION SUMMARY")
|
| 746 |
+
print("=" * 80)
|
| 747 |
+
print(f"Task: {evaluation_results['task_name']}")
|
| 748 |
+
print(f"Total demos: {evaluation_results['total_demos']}")
|
| 749 |
+
print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
|
| 750 |
+
print(f"Failed demos: {len(evaluation_results['failed_demos'])}")
|
| 751 |
+
|
| 752 |
+
if statistics:
|
| 753 |
+
print("\n" + "-" * 80)
|
| 754 |
+
print("SUCCESS FRAME VALUE STATISTICS")
|
| 755 |
+
print("-" * 80)
|
| 756 |
+
print(f"Mean: {statistics['last_value_mean']:.2f}")
|
| 757 |
+
print(f"Std Dev: {statistics['last_value_std']:.2f}")
|
| 758 |
+
print(f"Median: {statistics['last_value_median']:.2f}")
|
| 759 |
+
print(f"Min: {statistics['last_value_min']:.2f}")
|
| 760 |
+
print(f"Max: {statistics['last_value_max']:.2f}")
|
| 761 |
+
print(f"Q25: {statistics['last_value_q25']:.2f}")
|
| 762 |
+
print(f"Q75: {statistics['last_value_q75']:.2f}")
|
| 763 |
+
|
| 764 |
+
print("\n" + "-" * 80)
|
| 765 |
+
print("THRESHOLD ANALYSIS")
|
| 766 |
+
print("-" * 80)
|
| 767 |
+
for threshold in [80, 85, 90, 95, 100]:
|
| 768 |
+
count = statistics[f"count_above_{threshold}"]
|
| 769 |
+
percent = statistics[f"percent_above_{threshold}"]
|
| 770 |
+
print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")
|
| 771 |
+
|
| 772 |
+
print("\n" + "-" * 80)
|
| 773 |
+
print(f"Mean latency: {statistics['mean_latency']:.2f}s")
|
| 774 |
+
print("-" * 80)
|
| 775 |
+
|
| 776 |
+
print("\n" + "=" * 80)
|
| 777 |
+
print("EVALUATION COMPLETE")
|
| 778 |
+
print("=" * 80)
|
| 779 |
+
|
| 780 |
+
return 0
|
| 781 |
+
|
| 782 |
+
|
| 783 |
+
if __name__ == "__main__":
|
| 784 |
+
sys.exit(main())
|