| |
| """Evaluate value estimation for test demonstrations from prepare_test_demo_single_task.py. |
| |
| This script: |
| 1. Reads test demo manifests created by prepare_test_demo_single_task.py |
| 2. Calls the VLAC trajectory-critic service for each demo |
| 3. Records the last value (success frame value) - ideally should be 100 |
| 4. Plots statistics to visualize the value distribution |
| |
| Usage: |
| # Evaluate all LIBERO-10 tasks |
| python evaluate_test_demo_values.py --process-all-tasks --manifests-root <root_dir> --output-dir <output_dir> |
| |
| # Evaluate a single task |
| python evaluate_test_demo_values.py --manifest-path <path_to_manifest.json> --output-dir <output_dir> |
| |
| Examples: |
| # Evaluate all LIBERO-10 tasks |
| python evaluate_test_demo_values.py \ |
| --process-all-tasks \ |
| --manifests-root toy_test_demos_LIBERO_10 \ |
| --output-dir evaluation_results_all_tasks \ |
| --base-url http://localhost:8111 |
| |
| # Evaluate a single task |
| python evaluate_test_demo_values.py \ |
| --manifest-path toy_test_demos_LIBERO_10/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it/KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it_test_manifest.json \ |
| --output-dir evaluation_results \ |
| --base-url http://localhost:8111 |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import base64 |
| import json |
| import os |
| import glob |
| import sys |
| import time |
| from io import BytesIO |
| from pathlib import Path |
| from typing import Dict, List, Optional |
|
|
| import matplotlib.pyplot as plt |
| import numpy as np |
| import requests |
| from PIL import Image |
| from tqdm import tqdm |
|
|
| |
# Task identifiers iterated (in this order) when --process-all-tasks is used.
LIBERO_10_TASKS = [
    # Kitchen scenes
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    # Living-room scenes
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    # Study scene
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
]
|
|
| |
| |
| |
|
|
def sample_fixed_interval_frames(image_list, num_frames):
    """Pick ``num_frames`` items from ``image_list`` at roughly even spacing.

    Args:
        image_list: Non-empty sequence of frames (any item type).
        num_frames: Number of items to return.

    Returns:
        A list of sampled items. A single-item input is repeated
        ``num_frames`` times. Otherwise indices come from
        ``np.linspace(0, len - 1, num_frames)`` truncated to integers, except
        for the two special cases handled explicitly below.

    Raises:
        ValueError: If ``image_list`` is empty.
    """
    count = len(image_list)
    if count == 0:
        raise ValueError("image_list is empty")

    first = image_list[0]
    if count == 1:
        return [first] * num_frames

    if num_frames == 2:
        return [first, image_list[-1]]

    if num_frames == 3:
        # NOTE(review): the middle sample is the *second* frame, not the
        # midpoint of the sequence -- confirm this is intended.
        return [first, image_list[1], image_list[-1]]

    picks = np.linspace(start=0, stop=count - 1, num=num_frames, dtype=int)
    return [image_list[idx] for idx in picks]
|
|
|
|
# Number of reference frames sampled per task.
num_frames_for_reference = 8

# Directory containing one "<task_name>_demo" folder of PNG frames per task.
# The historical hard-coded location stays the default; set the
# VLAC_REF_FRAMES_ROOT environment variable to point somewhere else.
ref_frm_root_dir = os.environ.get(
    "VLAC_REF_FRAMES_ROOT",
    "/home/zechen/Data/Robo/LIBERO_Regen/libero_10_single_expert_demo/libero_10",
)

# Tasks for which reference frames are looked up (same entries as
# LIBERO_10_TASKS).
libero_10_task_list = [
    "KITCHEN_SCENE3_turn_on_the_stove_and_put_the_moka_pot_on_it",
    "KITCHEN_SCENE4_put_the_black_bowl_in_the_bottom_drawer_of_the_cabinet_and_close_it",
    "KITCHEN_SCENE6_put_the_yellow_and_white_mug_in_the_microwave_and_close_it",
    "KITCHEN_SCENE8_put_both_moka_pots_on_the_stove",
    "LIVING_ROOM_SCENE1_put_both_the_alphabet_soup_and_the_cream_cheese_box_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_alphabet_soup_and_the_tomato_sauce_in_the_basket",
    "LIVING_ROOM_SCENE2_put_both_the_cream_cheese_box_and_the_butter_in_the_basket",
    "LIVING_ROOM_SCENE5_put_the_white_mug_on_the_left_plate_and_put_the_yellow_and_white_mug_on_the_right_plate",
    "LIVING_ROOM_SCENE6_put_the_white_mug_on_the_plate_and_put_the_chocolate_pudding_to_the_right_of_the_plate",
    "STUDY_SCENE1_pick_up_the_book_and_place_it_in_the_back_compartment_of_the_caddy",
]


def load_reference_frames(root_dir, task_names, num_frames):
    """Collect evenly sampled reference-frame paths for each task.

    For every task, the PNG files in ``<root_dir>/<task>_demo`` are sorted by
    name and ``num_frames`` of them are chosen with
    ``sample_fixed_interval_frames``.

    Tasks whose folder is missing or holds no PNG files are left out of the
    result (and reported once on stderr) instead of raising, so that merely
    importing this module never fails on a machine without the data.

    Args:
        root_dir: Directory containing the per-task ``*_demo`` folders.
        task_names: Task identifiers to look up.
        num_frames: Number of frame paths to keep per task.

    Returns:
        Dict mapping task name -> list of sampled PNG paths.
    """
    frames_by_task = {}
    missing = []
    for name in task_names:
        task_dir = os.path.join(root_dir, name + "_demo")
        png_files = sorted(glob.glob(os.path.join(task_dir, "*.png")))
        if not png_files:
            missing.append(name)
            continue
        frames_by_task[name] = sample_fixed_interval_frames(png_files, num_frames)

    if missing:
        print(
            f"[warn] No reference frames (*.png) found under {root_dir} "
            f"for {len(missing)} task(s): " + ", ".join(missing),
            file=sys.stderr,
        )
    return frames_by_task


# Task name -> sampled reference-frame paths; tasks without frames are absent.
reference_frames_dict = load_reference_frames(
    ref_frm_root_dir, libero_10_task_list, num_frames_for_reference
)
|
|
|
|
def read_manifest(manifest_path: Path) -> Dict:
    """Load a test-demo manifest and anchor its frame paths at its directory.

    Args:
        manifest_path: Location of the manifest JSON file.

    Returns:
        The parsed manifest. Every entry of ``demos`` has its ``frame_paths``
        rewritten to strings of the form ``<manifest directory>/<path>``.

    Raises:
        FileNotFoundError: If ``manifest_path`` is not an existing file.
    """
    if not manifest_path.is_file():
        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")

    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Paths stored in the manifest are interpreted relative to the manifest.
    base_dir = manifest_path.parent
    for entry in manifest.get("demos", []):
        anchored = []
        for stored_path in entry["frame_paths"]:
            anchored.append(str(base_dir / stored_path))
        entry["frame_paths"] = anchored

    return manifest
|
|
|
|
def image_to_base64(path: Path) -> str:
    """Return the image at ``path`` as a base64 string of its JPEG encoding.

    The image is converted to RGB and re-encoded as JPEG (quality 95) in
    memory before being base64-encoded.
    """
    with Image.open(path) as source:
        rgb_image = source.convert("RGB")
        jpeg_buffer = BytesIO()
        rgb_image.save(jpeg_buffer, format="JPEG", quality=95)
        jpeg_bytes = jpeg_buffer.getvalue()
        return base64.b64encode(jpeg_bytes).decode("utf-8")
|
|
|
|
def encode_images(paths: List[str]) -> List[str]:
    """Base64-encode each image in ``paths``, preserving order."""
    encoded: List[str] = []
    for raw_path in paths:
        encoded.append(image_to_base64(Path(raw_path)))
    return encoded
|
|
|
|
def call_trajectory_critic(
    session: requests.Session,
    base_url: str,
    task: str,
    frames_b64: List[str],
    reference_b64: Optional[List[str]],
    timeout: float,
) -> Dict:
    """POST one trajectory to the VLAC ``/trajectory-critic`` endpoint.

    Args:
        session: HTTP session used to send the request.
        base_url: Service base URL; a trailing slash is tolerated.
        task: Task description sent as ``task``.
        frames_b64: Base64-encoded trajectory frames.
        reference_b64: Base64-encoded reference frames, or ``None`` to send
            no reference (``ref_num`` is then 0).
        timeout: Request timeout in seconds.

    Returns:
        The decoded JSON response with an added ``latency_sec`` entry holding
        the elapsed time of the request (including JSON decoding).

    Raises:
        Whatever ``session.post`` or ``Response.raise_for_status`` raise,
        e.g. on connection errors or an HTTP error status.
    """
    endpoint = f"{base_url.rstrip('/')}/trajectory-critic"
    payload = {
        "task": task,
        "frames": frames_b64,
        "reference": reference_b64,
        "ref_num": len(reference_b64 or []),
        "skip": 1,
        "batch_size": min(len(frames_b64), 8),
        "think": False,
        "return_video": False,
    }

    # perf_counter() is monotonic, so the measured latency cannot go negative
    # or jump when the system clock is adjusted (unlike time.time()).
    started = time.perf_counter()
    resp = session.post(endpoint, json=payload, timeout=timeout)
    resp.raise_for_status()
    result = resp.json()
    result["latency_sec"] = time.perf_counter() - started
    return result
|
|
|
|
| |
| |
| |
|
|
|
|
def evaluate_demos(
    manifest_data: Dict,
    base_url: str,
    timeout: float,
    use_reference: bool = False,
) -> Dict[str, object]:
    """Query the trajectory critic for every demo listed in a manifest.

    For each demo only its first and last frame are sent to the service; the
    last returned value is recorded as the demo's success-frame value.

    Args:
        manifest_data: Parsed manifest; reads ``task_name`` and ``demos``
            (each demo needs ``demo_name``, ``frame_paths``, ``total_frames``
            and ``success_index``).
        base_url: Base URL of the VLAC service.
        timeout: Per-request timeout in seconds.
        use_reference: When true, the task's reference frames from
            ``reference_frames_dict`` are sent with every request. If the
            task has none, a warning is printed and no reference is sent.

    Returns:
        Dict with ``task_name``, ``total_demos``, ``successful_evals``,
        ``failed_demos`` (names of demos that could not be evaluated) and
        ``results`` (one record per evaluated demo).
    """
    task_name = manifest_data.get("task_name", "")
    demos = manifest_data.get("demos", [])

    results = []
    failed_demos = []

    print(f"\nEvaluating {len(demos)} test demonstrations...")
    print(f"Task: {task_name}")
    print(f"Use reference: {use_reference}\n")

    # The reference frames are the same for every demo of a task, so they are
    # encoded once here rather than once per demo.
    reference_b64 = None
    if use_reference:
        reference_paths = reference_frames_dict.get(task_name)
        if reference_paths:
            print(f"Using reference frames for task {task_name}")
            reference_b64 = encode_images(reference_paths)
        else:
            print(f"[warn] No reference frames available for task {task_name}; evaluating without reference")

    with requests.Session() as session:
        for demo in tqdm(demos, desc="Processing demos"):
            demo_name = demo["demo_name"]
            frame_paths = demo["frame_paths"]

            if not frame_paths:
                print(f"\n[warn] Demo {demo_name} has no frames")
                failed_demos.append(demo_name)
                continue

            try:
                # Only the start frame and the success frame are evaluated.
                frames_b64 = encode_images([frame_paths[0], frame_paths[-1]])
                result = call_trajectory_critic(
                    session=session,
                    base_url=base_url,
                    task=task_name,
                    frames_b64=frames_b64,
                    reference_b64=reference_b64,
                    timeout=timeout,
                )
            except (OSError, ValueError, requests.RequestException) as exc:
                # An unreadable frame, network/HTTP error or undecodable
                # response fails this demo only, not the whole run.
                print(f"\n[warn] Evaluation failed for demo {demo_name}: {exc}")
                failed_demos.append(demo_name)
                continue

            value_list = result.get("value_list", [])
            if not value_list:
                print(f"\n[warn] No values returned for demo {demo_name}")
                failed_demos.append(demo_name)
                continue

            results.append(
                {
                    "demo_name": demo_name,
                    "total_frames": demo["total_frames"],
                    "success_index": demo["success_index"],
                    "num_sampled_frames": len(frame_paths),
                    "value_list": value_list,
                    "last_value": value_list[-1],
                    "mean_value": float(np.mean(value_list)),
                    "std_value": float(np.std(value_list)),
                    "latency_sec": result.get("latency_sec", 0.0),
                }
            )

    return {
        "task_name": task_name,
        "total_demos": len(demos),
        "successful_evals": len(results),
        "failed_demos": failed_demos,
        "results": results,
    }
|
|
|
|
def _pearson_or_nan(xs, ys) -> float:
    """Return the Pearson correlation of ``xs`` and ``ys``, or NaN if undefined.

    The correlation is undefined when either series is constant; that case is
    detected up front so ``np.corrcoef`` never divides by a zero variance.
    """
    x = np.asarray(xs, dtype=float)
    y = np.asarray(ys, dtype=float)
    if x.size < 2 or x.max() == x.min() or y.max() == y.min():
        return float("nan")
    return float(np.corrcoef(x, y)[0, 1])


def compute_statistics(evaluation_results: Dict) -> Dict[str, float]:
    """Summarise per-demo results into task-level statistics.

    Args:
        evaluation_results: Dict with a ``results`` list whose records carry
            ``last_value``, ``std_value``, ``latency_sec``, ``total_frames``
            and ``success_index``.

    Returns:
        An empty dict when there are no results. Otherwise a dict with:
        distribution statistics of the last values, mean latency, trajectory
        length / success index statistics, four Pearson correlations (NaN
        unless there are more than two results and both series vary), and
        counts / percentages of last values at or above 80, 85, 90, 95, 100.
    """
    results = evaluation_results["results"]
    if not results:
        return {}

    last_values = [r["last_value"] for r in results]
    std_values = [r["std_value"] for r in results]
    latencies = [r["latency_sec"] for r in results]
    total_frames_list = [r["total_frames"] for r in results]
    success_indices = [r["success_index"] for r in results]

    stats = {
        "last_value_mean": float(np.mean(last_values)),
        "last_value_std": float(np.std(last_values)),
        "last_value_min": float(np.min(last_values)),
        "last_value_max": float(np.max(last_values)),
        "last_value_median": float(np.median(last_values)),
        "last_value_q25": float(np.percentile(last_values, 25)),
        "last_value_q75": float(np.percentile(last_values, 75)),
        "mean_latency": float(np.mean(latencies)),
        "total_evaluated": len(results),
        "trajectory_length_mean": float(np.mean(total_frames_list)),
        "trajectory_length_std": float(np.std(total_frames_list)),
        "trajectory_length_min": float(np.min(total_frames_list)),
        "trajectory_length_max": float(np.max(total_frames_list)),
        "success_index_mean": float(np.mean(success_indices)),
        "success_index_std": float(np.std(success_indices)),
    }

    # Correlations are only reported with more than two evaluated demos.
    enough_points = len(results) > 2
    correlation_pairs = (
        ("corr_total_frames_vs_last_value", total_frames_list, last_values),
        ("corr_success_index_vs_last_value", success_indices, last_values),
        ("corr_total_frames_vs_std_value", total_frames_list, std_values),
        ("corr_success_index_vs_std_value", success_indices, std_values),
    )
    for key, xs, ys in correlation_pairs:
        stats[key] = _pearson_or_nan(xs, ys) if enough_points else float("nan")

    for threshold in [80, 85, 90, 95, 100]:
        count = sum(1 for v in last_values if v >= threshold)
        stats[f"count_above_{threshold}"] = count
        stats[f"percent_above_{threshold}"] = float(count / len(last_values) * 100)

    return stats
|
|
|
|
def _scatter_with_trend(ax, xs, ys, *, color, xlabel, ylabel, title, show_target):
    """Draw one scatter panel, with a linear trend when there is enough data.

    With more than two points a degree-1 ``np.polyfit`` line is overlaid and
    the ``np.corrcoef`` correlation is written in the top-left corner.
    ``show_target`` adds the horizontal reference line at 100.
    """
    ax.scatter(xs, ys, alpha=0.6, s=50, c=color)
    if len(xs) > 2:
        trend = np.poly1d(np.polyfit(xs, ys, 1))
        x_trend = np.linspace(min(xs), max(xs), 100)
        ax.plot(x_trend, trend(x_trend), "r--", linewidth=2, alpha=0.8, label='Trend')
        corr = np.corrcoef(xs, ys)[0, 1]
        ax.text(0.05, 0.95, f'Corr: {corr:.3f}', transform=ax.transAxes,
                fontsize=11, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
    if show_target:
        ax.axhline(100, color='red', linestyle='--', linewidth=1, alpha=0.5, label='Target (100)')
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14)
    # Skip the legend when nothing is labelled (no trend line and no target
    # line); calling legend() then only produces a Matplotlib warning.
    if ax.get_legend_handles_labels()[0]:
        ax.legend()
    ax.grid(True, alpha=0.3)


def plot_value_distribution(evaluation_results: Dict, output_dir: Path) -> None:
    """Plot the per-demo value analysis of one task and save it as PNG and PDF.

    The 3x2 figure contains: histogram and box plot of the success-frame
    values, scatter plots of trajectory length and success index against the
    success-frame value, trajectory length against the per-demo value standard
    deviation, and the cumulative distribution of the success-frame values.

    Args:
        evaluation_results: Dict with ``task_name`` and a ``results`` list.
        output_dir: Existing directory that receives
            ``<task_name>_value_distribution.png`` and ``.pdf``.

    Returns:
        None. Prints a message and returns without plotting when there are no
        results.
    """
    results = evaluation_results["results"]
    if not results:
        print("No results to plot")
        return

    task_name = evaluation_results["task_name"]
    last_values = [r["last_value"] for r in results]
    total_frames_list = [r["total_frames"] for r in results]
    success_indices = [r["success_index"] for r in results]
    std_values = [r["std_value"] for r in results]

    fig, axes = plt.subplots(3, 2, figsize=(14, 16))
    fig.suptitle(f"Value Estimation Analysis: {task_name}", fontsize=16, fontweight='bold')

    # Panel 1: histogram of success-frame values.
    ax1 = axes[0, 0]
    ax1.hist(last_values, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax1.axvline(np.mean(last_values), color='green', linestyle='-', linewidth=2,
                label=f'Mean ({np.mean(last_values):.1f})')
    ax1.set_xlabel('Last Frame Value (Success Frame)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Success Frame Values', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Panel 2: box plot of success-frame values.
    ax2 = axes[0, 1]
    box_data = ax2.boxplot([last_values], vert=True, patch_artist=True, labels=['Success Values'])
    for patch in box_data['boxes']:
        patch.set_facecolor('lightblue')
    ax2.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.set_ylabel('Value', fontsize=12)
    ax2.set_title('Success Frame Value Distribution', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # Panels 3-5: scatter plots with optional trend lines.
    _scatter_with_trend(
        axes[1, 0], total_frames_list, last_values,
        color='steelblue',
        xlabel='Total Frames (Trajectory Length)',
        ylabel='Success Frame Value',
        title='Trajectory Length vs. Success Value',
        show_target=True,
    )
    _scatter_with_trend(
        axes[1, 1], success_indices, last_values,
        color='coral',
        xlabel='Success Index (Frame)',
        ylabel='Success Frame Value',
        title='Success Frame Index vs. Success Value',
        show_target=True,
    )
    _scatter_with_trend(
        axes[2, 0], total_frames_list, std_values,
        color='orange',
        xlabel='Total Frames (Trajectory Length)',
        ylabel='Std Dev of Values',
        title='Trajectory Length vs. Value Variability',
        show_target=False,
    )

    # Panel 6: empirical cumulative distribution of success-frame values.
    ax6 = axes[2, 1]
    sorted_values = np.sort(last_values)
    cumulative = np.arange(1, len(sorted_values) + 1) / len(sorted_values) * 100
    ax6.plot(sorted_values, cumulative, linewidth=2, color='steelblue')
    ax6.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax6.set_xlabel('Success Frame Value', fontsize=12)
    ax6.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax6.set_title('Cumulative Distribution', fontsize=14)
    ax6.legend()
    ax6.grid(True, alpha=0.3)

    fig.tight_layout()

    plot_path = output_dir / f"{task_name}_value_distribution.png"
    fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nPlot saved to: {plot_path}")

    pdf_path = output_dir / f"{task_name}_value_distribution.pdf"
    fig.savefig(pdf_path, bbox_inches='tight')
    print(f"PDF saved to: {pdf_path}")

    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)
|
|
|
|
def save_results(evaluation_results: Dict, statistics: Dict, output_dir: Path) -> None:
    """Write the detailed results and the summary statistics as JSON files.

    Two files are created in ``output_dir``, both prefixed with the task name
    taken from ``evaluation_results``: ``<task>_evaluation_results.json`` and
    ``<task>_statistics.json``.
    """

    def _write_json(payload: Dict, destination: Path) -> None:
        with destination.open("w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2)

    task_name = evaluation_results["task_name"]

    detail_file = output_dir / f"{task_name}_evaluation_results.json"
    _write_json(evaluation_results, detail_file)
    print(f"\nDetailed results saved to: {detail_file}")

    summary_file = output_dir / f"{task_name}_statistics.json"
    _write_json(statistics, summary_file)
    print(f"Statistics saved to: {summary_file}")
|
|
|
|
def find_manifest_file(manifests_root: Path, task_name: str) -> Optional[Path]:
    """Locate the manifest of ``task_name`` below ``manifests_root``.

    The following locations are checked in order and the first existing one
    is returned:

    1. ``<root>/<task>/<task>_test_manifest.json``
    2. ``<root>/<task>/test_manifest.json``
    3. ``<root>/<task>_test_manifest.json``

    Returns:
        The first existing candidate path, or ``None`` if none exists.
    """
    task_dir = manifests_root / task_name
    prefixed_name = f"{task_name}_test_manifest.json"
    candidates = (
        task_dir / prefixed_name,
        task_dir / "test_manifest.json",
        manifests_root / prefixed_name,
    )
    return next((path for path in candidates if path.exists()), None)
|
|
|
|
def _print_task_summary(evaluation_results: Dict, statistics: Dict) -> None:
    """Print the short per-task summary shown after a task is evaluated."""
    print("\n" + "-" * 80)
    print("TASK EVALUATION SUMMARY")
    print("-" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    # Statistics are empty when no demo could be evaluated.
    if not statistics:
        return

    print(f"\nMean success value: {statistics['last_value_mean']:.2f}")
    print(f"Std Dev: {statistics['last_value_std']:.2f}")
    print(f"Median: {statistics['last_value_median']:.2f}")
    print(f"Values >= 90: {statistics.get('count_above_90', 0)} ({statistics.get('percent_above_90', 0):.1f}%)")

    print(f"\nTrajectory length: {statistics['trajectory_length_mean']:.1f} ± {statistics['trajectory_length_std']:.1f} frames")

    if not np.isnan(statistics.get('corr_total_frames_vs_last_value', float('nan'))):
        print("\nCorrelations:")
        print(f" Length vs. Value: {statistics['corr_total_frames_vs_last_value']:+.3f}")
        print(f" Success idx vs. Value: {statistics['corr_success_index_vs_last_value']:+.3f}")
        print(f" Length vs. Variability:{statistics['corr_total_frames_vs_std_value']:+.3f}")


def evaluate_single_task(
    manifest_path: Path,
    output_dir: Path,
    base_url: str,
    timeout: float,
    use_reference: bool,
) -> Optional[Dict]:
    """Evaluate one task end to end: read manifest, query, summarise, save, plot.

    Results and statistics are written to ``<output_dir>/<task_name>/``; the
    value-distribution plot is only produced when at least one demo was
    evaluated.

    Args:
        manifest_path: Path of the task's manifest JSON file.
        output_dir: Root output directory; a per-task subdirectory is created.
        base_url: Base URL of the VLAC service.
        timeout: Per-request timeout in seconds.
        use_reference: Forwarded to ``evaluate_demos``.

    Returns:
        Dict with ``task_name``, ``evaluation_results`` and ``statistics``
        (``statistics`` is empty when no demo could be evaluated), or ``None``
        if the manifest could not be read.
    """
    try:
        manifest_data = read_manifest(manifest_path)
    except (OSError, ValueError, KeyError) as exc:
        # OSError covers a missing/unreadable file, ValueError covers invalid
        # JSON or text encoding, KeyError a demo entry without "frame_paths".
        # Reporting and returning None lets a multi-task run continue.
        print(f"Error reading manifest: {exc}")
        return None

    task_name = manifest_data.get("task_name", "unknown")

    print(f"\n{'='*80}")
    print(f"Evaluating task: {task_name}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*80}")

    evaluation_results = evaluate_demos(
        manifest_data=manifest_data,
        base_url=base_url,
        timeout=timeout,
        use_reference=use_reference,
    )
    statistics = compute_statistics(evaluation_results)

    _print_task_summary(evaluation_results, statistics)

    task_output_dir = output_dir / task_name
    task_output_dir.mkdir(parents=True, exist_ok=True)
    save_results(evaluation_results, statistics, task_output_dir)

    if evaluation_results["results"]:
        plot_value_distribution(evaluation_results, task_output_dir)

    return {
        "task_name": task_name,
        "evaluation_results": evaluation_results,
        "statistics": statistics,
    }
|
|
|
|
def _bar_panel(ax, values, *, color, ylabel, title, summary, summary_label, show_target=True):
    """Draw one per-task bar chart with a summary line and optional target line.

    Bars are labelled 1..N on the x axis in the order of ``values``.
    """
    positions = range(len(values))
    ax.bar(positions, values, color=color, alpha=0.7)
    if show_target:
        ax.axhline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax.axhline(summary, color='green', linestyle='-', linewidth=2, label=summary_label)
    ax.set_xlabel('Task', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.set_xticks(positions)
    ax.set_xticklabels(range(1, len(values) + 1))
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')


def plot_aggregate_statistics(all_task_results: List[Dict], output_dir: Path) -> None:
    """Plot task-level summary statistics across tasks and save PNG and PDF.

    The 2x2 figure shows per-task mean, median and standard deviation of the
    success-frame value as bar charts, plus a histogram of the task means.

    Tasks whose ``statistics`` dict is empty (no demo was evaluated) are
    skipped; nothing is plotted when no task has statistics.

    Args:
        all_task_results: Per-task dicts with ``task_name`` and ``statistics``.
        output_dir: Existing directory that receives
            ``aggregate_statistics.png`` and ``aggregate_statistics.pdf``.
    """
    # Empty statistics carry none of the keys read below.
    usable = [r for r in all_task_results if r.get("statistics")]
    if not usable:
        return

    mean_values = [r["statistics"]["last_value_mean"] for r in usable]
    median_values = [r["statistics"]["last_value_median"] for r in usable]
    std_values = [r["statistics"]["last_value_std"] for r in usable]

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle("VLAC Value Estimation - Aggregate Statistics Across All Tasks", fontsize=16, fontweight='bold')

    _bar_panel(
        axes[0, 0], mean_values,
        color='steelblue',
        ylabel='Mean Success Value',
        title='Mean Success Frame Values by Task',
        summary=np.mean(mean_values),
        summary_label=f'Overall Mean ({np.mean(mean_values):.1f})',
    )

    # Histogram of the task-level means.
    ax2 = axes[0, 1]
    ax2.hist(mean_values, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
    ax2.axvline(100, color='red', linestyle='--', linewidth=2, label='Target (100)')
    ax2.axvline(np.mean(mean_values), color='green', linestyle='-', linewidth=2,
                label=f'Mean ({np.mean(mean_values):.1f})')
    ax2.set_xlabel('Mean Success Value', fontsize=12)
    ax2.set_ylabel('Frequency (Tasks)', fontsize=12)
    ax2.set_title('Distribution of Task-Level Mean Values', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    _bar_panel(
        axes[1, 0], median_values,
        color='coral',
        ylabel='Median Success Value',
        title='Median Success Frame Values by Task',
        summary=np.median(median_values),
        summary_label=f'Overall Median ({np.median(median_values):.1f})',
    )
    _bar_panel(
        axes[1, 1], std_values,
        color='orange',
        ylabel='Standard Deviation',
        title='Variability in Success Values by Task',
        summary=np.mean(std_values),
        summary_label=f'Mean Std ({np.mean(std_values):.1f})',
        show_target=False,
    )

    fig.tight_layout()

    plot_path = output_dir / "aggregate_statistics.png"
    fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nAggregate plot saved to: {plot_path}")

    pdf_path = output_dir / "aggregate_statistics.pdf"
    fig.savefig(pdf_path, bbox_inches='tight')
    print(f"Aggregate PDF saved to: {pdf_path}")

    plt.close(fig)
|
|
|
|
| |
| |
| |
|
|
|
|
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse and validate the command-line arguments.

    Args:
        argv: Argument list to parse. ``None`` (the default) parses
            ``sys.argv[1:]``; passing a list allows programmatic use.

    Returns:
        The parsed namespace. ``--manifests-root`` is guaranteed to be set in
        all-tasks mode and ``--manifest-path`` in single-task mode; otherwise
        the parser reports an error and exits.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate value estimation for test demonstrations"
    )

    # Mode selection.
    parser.add_argument(
        "--process-all-tasks",
        action="store_true",
        help="Process all LIBERO-10 tasks",
    )

    # All-tasks mode input.
    parser.add_argument(
        "--manifests-root",
        type=Path,
        help="Root directory containing all task manifest subdirectories (required with --process-all-tasks)",
    )

    # Single-task mode input.
    parser.add_argument(
        "--manifest-path",
        type=Path,
        help="Path to the test manifest JSON file (for single task mode)",
    )

    # Options shared by both modes.
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("evaluation_results"),
        help="Directory to save evaluation results and plots",
    )
    parser.add_argument(
        "--base-url",
        default="http://localhost:8111",
        help="VLAC service base URL (default: http://localhost:8111)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="HTTP request timeout in seconds (default: 30.0)",
    )
    parser.add_argument(
        "--use-reference",
        action="store_true",
        help="Use reference trajectory (if available)",
    )

    args = parser.parse_args(argv)

    # Each mode needs its own input location; parser.error() exits.
    if args.process_all_tasks:
        if not args.manifests_root:
            parser.error("--manifests-root is required when using --process-all-tasks")
    elif not args.manifest_path:
        parser.error("--manifest-path is required for single task mode")

    return args
|
|
|
|
def _mean_and_std(values):
    """Return ``(mean, population std)`` of ``values``; ``(nan, nan)`` if empty."""
    if not values:
        return float("nan"), float("nan")
    return float(np.mean(values)), float(np.std(values))


def _summarize_all_tasks(all_task_results, successful_tasks, failed_tasks, output_dir) -> None:
    """Print, save and plot the statistics aggregated over all evaluated tasks.

    Every entry of ``all_task_results`` must carry non-empty ``statistics``.
    Writes ``aggregate_statistics.json`` into ``output_dir`` and then calls
    ``plot_aggregate_statistics``.
    """
    print("\n" + "=" * 80)
    print("AGGREGATE STATISTICS ACROSS ALL TASKS")
    print("=" * 80)

    per_task_stats = [r["statistics"] for r in all_task_results]
    all_mean_values = [s["last_value_mean"] for s in per_task_stats]
    all_median_values = [s["last_value_median"] for s in per_task_stats]
    all_std_values = [s["last_value_std"] for s in per_task_stats]
    all_traj_lengths = [s["trajectory_length_mean"] for s in per_task_stats]

    # Only tasks with a defined (non-NaN) correlation contribute.
    all_corr_len_val = [
        s["corr_total_frames_vs_last_value"] for s in per_task_stats
        if not np.isnan(s.get("corr_total_frames_vs_last_value", float('nan')))
    ]
    all_corr_len_std = [
        s["corr_total_frames_vs_std_value"] for s in per_task_stats
        if not np.isnan(s.get("corr_total_frames_vs_std_value", float('nan')))
    ]
    corr_val_mean, corr_val_std = _mean_and_std(all_corr_len_val)
    corr_var_mean, corr_var_std = _mean_and_std(all_corr_len_std)
    negative_corr_count = sum(1 for c in all_corr_len_val if c < -0.3)
    positive_corr_count = sum(1 for c in all_corr_len_val if c > 0.3)

    print("\nValue Statistics:")
    print(f" Overall mean of task means: {np.mean(all_mean_values):.2f} ± {np.std(all_mean_values):.2f}")
    print(f" Overall median of task medians: {np.median(all_median_values):.2f}")
    print(f" Average std deviation: {np.mean(all_std_values):.2f}")

    print("\nTrajectory Length Statistics:")
    print(f" Average trajectory length: {np.mean(all_traj_lengths):.1f} ± {np.std(all_traj_lengths):.1f} frames")
    print(f" Range: {min(all_traj_lengths):.0f} - {max(all_traj_lengths):.0f} frames")

    if all_corr_len_val:
        print("\nCorrelation Analysis (averaged across tasks):")
        print(f" Avg. Length vs. Value correlation: {corr_val_mean:+.3f} ± {corr_val_std:.3f}")
        print(f" Avg. Length vs. Variability correlation:{corr_var_mean:+.3f} ± {corr_var_std:.3f}")

        print(f"\n Tasks with negative correlation (< -0.3): {negative_corr_count}/{len(all_corr_len_val)}")
        print(f" Tasks with positive correlation (> +0.3): {positive_corr_count}/{len(all_corr_len_val)}")

        if corr_val_mean < -0.3:
            print("\n → Overall trend: Longer trajectories tend to have LOWER success values")
        elif corr_val_mean > 0.3:
            print("\n → Overall trend: Longer trajectories tend to have HIGHER success values")
        else:
            print("\n → Overall trend: Weak relationship between trajectory length and success value")

    best = all_task_results[int(np.argmax(all_mean_values))]
    worst = all_task_results[int(np.argmin(all_mean_values))]
    print(f"\nBest performing task: {best['task_name']} ({max(all_mean_values):.2f})")
    print(f"Worst performing task: {worst['task_name']} ({min(all_mean_values):.2f})")

    aggregate_stats = {
        "total_tasks": len(LIBERO_10_TASKS),
        "successful_tasks": len(successful_tasks),
        "failed_tasks": len(failed_tasks),
        "overall_mean_of_means": float(np.mean(all_mean_values)),
        "overall_std_of_means": float(np.std(all_mean_values)),
        "overall_median_of_medians": float(np.median(all_median_values)),
        "average_std_deviation": float(np.mean(all_std_values)),
        "average_trajectory_length": float(np.mean(all_traj_lengths)),
        "trajectory_length_std": float(np.std(all_traj_lengths)),
        "best_task": best['task_name'],
        "best_task_mean_value": float(max(all_mean_values)),
        "worst_task": worst['task_name'],
        "worst_task_mean_value": float(min(all_mean_values)),
        "task_results": [
            {
                "task_name": r["task_name"],
                "mean_value": r["statistics"]["last_value_mean"],
                "median_value": r["statistics"]["last_value_median"],
                "std_value": r["statistics"]["last_value_std"],
                "trajectory_length": r["statistics"]["trajectory_length_mean"],
                "corr_length_vs_value": r["statistics"].get("corr_total_frames_vs_last_value", None),
                "corr_length_vs_variability": r["statistics"].get("corr_total_frames_vs_std_value", None),
            }
            for r in all_task_results
        ],
    }

    if all_corr_len_val:
        aggregate_stats["avg_corr_length_vs_value"] = corr_val_mean
        aggregate_stats["std_corr_length_vs_value"] = corr_val_std
        aggregate_stats["avg_corr_length_vs_variability"] = corr_var_mean
        aggregate_stats["std_corr_length_vs_variability"] = corr_var_std
        aggregate_stats["tasks_with_negative_correlation"] = int(negative_corr_count)
        aggregate_stats["tasks_with_positive_correlation"] = int(positive_corr_count)

    aggregate_path = output_dir / "aggregate_statistics.json"
    with aggregate_path.open("w", encoding="utf-8") as f:
        json.dump(aggregate_stats, f, indent=2)
    print(f"\nAggregate statistics saved to: {aggregate_path}")

    plot_aggregate_statistics(all_task_results, output_dir)


def _run_all_tasks(args: argparse.Namespace, output_dir: Path) -> int:
    """Evaluate every task in ``LIBERO_10_TASKS`` and report aggregate results.

    A task counts as failed when its manifest cannot be found or read, or when
    none of its demos could be evaluated (empty statistics); such tasks are
    excluded from the aggregate statistics.

    Returns:
        1 if the manifests root does not exist, otherwise 0.
    """
    manifests_root = args.manifests_root.expanduser()
    if not manifests_root.exists():
        print(f"Error: Manifests root directory not found: {manifests_root}")
        return 1

    total_tasks = len(LIBERO_10_TASKS)
    print("=" * 80)
    print("EVALUATING ALL LIBERO-10 TASKS")
    print("=" * 80)
    print(f"Manifests root: {manifests_root}")
    print(f"Output directory: {output_dir}")
    print(f"Base URL: {args.base_url}")
    print(f"Total tasks to evaluate: {total_tasks}")
    print("=" * 80)

    successful_tasks = []
    failed_tasks = []
    all_task_results = []

    for idx, task_name in enumerate(LIBERO_10_TASKS, 1):
        print(f"\n[{idx}/{total_tasks}] Processing: {task_name}")

        manifest_path = find_manifest_file(manifests_root, task_name)
        if manifest_path is None:
            print(f" [ERROR] Manifest file not found for task: {task_name}")
            failed_tasks.append(task_name)
            continue

        result = evaluate_single_task(
            manifest_path=manifest_path,
            output_dir=output_dir,
            base_url=args.base_url,
            timeout=args.timeout,
            use_reference=args.use_reference,
        )

        # A result with empty statistics means no demo was evaluated; it has
        # none of the keys the aggregation below reads.
        if result and result["statistics"]:
            successful_tasks.append(task_name)
            all_task_results.append(result)
        else:
            if result:
                print(f" [ERROR] No demos could be evaluated for task: {task_name}")
            failed_tasks.append(task_name)

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE - ALL TASKS")
    print("=" * 80)
    print(f"Successfully evaluated: {len(successful_tasks)}/{total_tasks} tasks")
    print(f"Failed: {len(failed_tasks)}/{total_tasks} tasks")

    if failed_tasks:
        print("\nFailed tasks:")
        for task in failed_tasks:
            print(f" - {task}")

    if all_task_results:
        _summarize_all_tasks(all_task_results, successful_tasks, failed_tasks, output_dir)

    print("\n" + "=" * 80)
    print(f"All results saved to: {output_dir}")
    print("=" * 80)
    return 0


def _print_detailed_statistics(statistics: Dict) -> None:
    """Print the detailed single-task statistics sections."""
    print("\n" + "-" * 80)
    print("SUCCESS FRAME VALUE STATISTICS")
    print("-" * 80)
    print(f"Mean: {statistics['last_value_mean']:.2f}")
    print(f"Std Dev: {statistics['last_value_std']:.2f}")
    print(f"Median: {statistics['last_value_median']:.2f}")
    print(f"Min: {statistics['last_value_min']:.2f}")
    print(f"Max: {statistics['last_value_max']:.2f}")
    print(f"Q25: {statistics['last_value_q25']:.2f}")
    print(f"Q75: {statistics['last_value_q75']:.2f}")

    print("\n" + "-" * 80)
    print("TRAJECTORY LENGTH STATISTICS")
    print("-" * 80)
    print(f"Mean total frames: {statistics['trajectory_length_mean']:.1f} ± {statistics['trajectory_length_std']:.1f}")
    print(f"Range: {statistics['trajectory_length_min']:.0f} - {statistics['trajectory_length_max']:.0f} frames")
    print(f"Mean success index: {statistics['success_index_mean']:.1f} ± {statistics['success_index_std']:.1f}")

    print("\n" + "-" * 80)
    print("CORRELATION ANALYSIS")
    print("-" * 80)
    if not np.isnan(statistics.get('corr_total_frames_vs_last_value', float('nan'))):
        print(f"Trajectory Length vs. Success Value: {statistics['corr_total_frames_vs_last_value']:+.3f}")
        print(f"Success Index vs. Success Value: {statistics['corr_success_index_vs_last_value']:+.3f}")
        print(f"Trajectory Length vs. Value Variability: {statistics['corr_total_frames_vs_std_value']:+.3f}")
        print(f"Success Index vs. Value Variability: {statistics['corr_success_index_vs_std_value']:+.3f}")

        # Plain-language reading of the length/value correlation.
        corr_len_val = statistics['corr_total_frames_vs_last_value']
        if abs(corr_len_val) > 0.5:
            direction = "NEGATIVE" if corr_len_val < 0 else "POSITIVE"
            print(f"\n → Strong {direction} correlation: ", end="")
            if corr_len_val < 0:
                print("Longer trajectories tend to have LOWER success values")
            else:
                print("Longer trajectories tend to have HIGHER success values")
        elif abs(corr_len_val) > 0.3:
            direction = "negative" if corr_len_val < 0 else "positive"
            print(f"\n → Moderate {direction} correlation detected")
        else:
            print("\n → Weak correlation: trajectory length has minimal impact on success value")
    else:
        print("Insufficient data for correlation analysis (need > 2 demos)")

    print("\n" + "-" * 80)
    print("THRESHOLD ANALYSIS")
    print("-" * 80)
    for threshold in [80, 85, 90, 95, 100]:
        count = statistics[f"count_above_{threshold}"]
        percent = statistics[f"percent_above_{threshold}"]
        print(f"Values >= {threshold:3d}: {count:3d} demos ({percent:5.1f}%)")

    print("\n" + "-" * 80)
    print(f"Mean latency: {statistics['mean_latency']:.2f}s")
    print("-" * 80)


def _run_single_task(args: argparse.Namespace, output_dir: Path) -> int:
    """Evaluate the task given by ``--manifest-path`` and print a detailed report.

    Returns:
        1 if the task could not be evaluated (manifest unreadable), else 0.
    """
    print("=" * 80)
    print("VLAC Value Estimation Evaluation - Single Task")
    print("=" * 80)

    result = evaluate_single_task(
        manifest_path=args.manifest_path.expanduser(),
        output_dir=output_dir,
        base_url=args.base_url,
        timeout=args.timeout,
        use_reference=args.use_reference,
    )

    if not result:
        print("\nEvaluation failed!")
        return 1

    statistics = result["statistics"]
    evaluation_results = result["evaluation_results"]

    print("\n" + "=" * 80)
    print("DETAILED EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Task: {evaluation_results['task_name']}")
    print(f"Total demos: {evaluation_results['total_demos']}")
    print(f"Successfully evaluated: {evaluation_results['successful_evals']}")
    print(f"Failed demos: {len(evaluation_results['failed_demos'])}")

    if statistics:
        _print_detailed_statistics(statistics)

    print("\n" + "=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)
    return 0


def main() -> int:
    """Script entry point: parse arguments and run the selected mode.

    Creates the output directory, then evaluates either all LIBERO-10 tasks
    (``--process-all-tasks``) or the single task given by ``--manifest-path``.

    Returns:
        Process exit code: 0 on success, 1 on failure.
    """
    args = parse_args()

    output_dir = args.output_dir.expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.process_all_tasks:
        return _run_all_tasks(args, output_dir)
    return _run_single_task(args, output_dir)
|
|
|
|
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    raise SystemExit(main())
|
|