| """Gradio Dashboard for Agent Cost Optimizer. |
| |
| Visualizes: |
| - Cost-quality frontier (scatter plot: success rate vs avg cost) |
| - Baseline comparison bar charts |
| - Per-scenario breakdown |
| - Module ablation impact |
| """ |
|
|
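# Example invocation (the script name is illustrative; the paths and port are
# the argparse defaults defined in main() below):
#
#   python dashboard.py \
#       --results ./eval_results_v2/baseline_results.json \
#       --report ./eval_results_v2/report.txt \
#       --port 7860
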
import json
from typing import Dict, List, Any

import gradio as gr


def load_results(path: str) -> Dict[str, Any]:
    """Load the baseline results JSON produced by the benchmark run."""
    with open(path) as f:
        return json.load(f)


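# Shape of the results JSON assumed by the functions below (keys inferred from
# the fields they read; the numbers are illustrative only):
#
#   {
#     "full_optimizer": {
#       "num_tasks": 100, "num_success": 80, "num_partial": 5,
#       "avg_cost_success": 0.0123, "total_cost": 1.23,
#       "cost_reduction_vs_frontier": 0.35, "false_done_rate": 0.02,
#       "unsafe_cheap_miss_rate": 0.01, "regression_rate": 0.03,
#       "per_scenario_stats": {"<scenario>": {"count": 20, "success": 17, "cost": 0.40}}
#     },
#     "no_<module>": { ... },   # ablation baselines are prefixed with "no_"
#     ...
#   }
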
def parse_report(report_path: str) -> str:
    """Read the plain-text benchmark report."""
    with open(report_path) as f:
        return f.read()


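# Pareto sweep used by create_frontier_plot below: after sorting points by
# success rate (descending) and cost (ascending), a point stays on the frontier
# only if it is at least as cheap as every higher-success point seen so far.
# Illustrative walk-through (numbers are made up):
#
#   90% success at $0.10  -> kept    (running minimum cost becomes $0.10)
#   80% success at $0.12  -> dropped (costs more and succeeds less)
#   70% success at $0.05  -> kept    (cheaper than every higher-success point)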
def create_frontier_plot(results: Dict[str, Any]):
    """Build scatter-plot data for the cost-quality frontier.

    Returns (all points, Pareto-optimal points); each point maps a baseline
    name to its success rate and average cost per successful task.
    """
    points = []
    for name, data in results.items():
        success = (data.get("num_success", 0) + data.get("num_partial", 0)) / max(data.get("num_tasks", 1), 1)
        cost = data.get("avg_cost_success", 0)
        points.append({"baseline": name, "success_rate": success, "cost_per_success": cost})

    # Sort by success rate (descending), breaking ties by cost (ascending).
    points.sort(key=lambda p: (-p["success_rate"], p["cost_per_success"]))

    # Keep a point only if it is at least as cheap as every higher-success point.
    frontier = []
    min_cost = float("inf")
    for p in points:
        if p["cost_per_success"] <= min_cost:
            frontier.append(p)
            min_cost = p["cost_per_success"]

    return points, frontier


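# Optional: the module docstring promises a scatter plot, but build_dashboard
# below renders tables only. A minimal sketch of a native plot, assuming pandas
# is installed and a Gradio version that provides gr.ScatterPlot; it is not
# wired into the layout by default.
def create_frontier_scatter(points: List[Dict[str, Any]]):
    """Render the point list from create_frontier_plot as a scatter plot."""
    import pandas as pd  # local import: the table-only dashboard does not need pandas

    df = pd.DataFrame(points)
    return gr.ScatterPlot(
        value=df,
        x="cost_per_success",
        y="success_rate",
        title="Cost-Quality Frontier",
    )

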
def build_dashboard(results_path: str, report_path: str):
    """Assemble the Gradio Blocks layout from the results and report files."""
    results = load_results(results_path)
    report_text = parse_report(report_path)
    points, frontier = create_frontier_plot(results)

    with gr.Blocks(title="Agent Cost Optimizer Dashboard") as demo:
        gr.Markdown("# Agent Cost Optimizer - Cost-Quality Dashboard")
        gr.Markdown("Visualize cost-quality tradeoffs across routing strategies and ablations.")

        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("## Cost-Quality Frontier")
                gr.Markdown("**X-axis**: Average cost per successful task | **Y-axis**: Success rate")

                scatter_data = [
                    [p["baseline"], f"{p['success_rate']:.1%}", f"${p['cost_per_success']:.4f}"]
                    for p in points
                ]
                gr.Dataframe(
                    headers=["Baseline", "Success Rate", "Cost per Success"],
                    value=scatter_data,
                    label="All Baselines",
                )

                frontier_data = [
                    [p["baseline"], f"{p['success_rate']:.1%}", f"${p['cost_per_success']:.4f}"]
                    for p in frontier
                ]
                gr.Dataframe(
                    headers=["Baseline", "Success Rate", "Cost per Success"],
                    value=frontier_data,
                    label="Pareto Frontier",
                )

            with gr.Column(scale=1):
                gr.Markdown("## Pareto Frontier Baselines")
                pareto_names = [p["baseline"] for p in frontier]
                for name in pareto_names:
                    gr.Markdown(f"- **{name}**")

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Baseline Comparison")
                comparison_data = []
                for name, data in results.items():
                    comparison_data.append([
                        name,
                        f"{(data.get('num_success', 0) + data.get('num_partial', 0)) / max(data.get('num_tasks', 1), 1):.1%}",
                        f"${data.get('avg_cost_success', 0):.4f}",
                        f"${data.get('total_cost', 0):.2f}",
                        f"{data.get('cost_reduction_vs_frontier', 0):.1%}",
                        f"{data.get('false_done_rate', 0):.1%}",
                        f"{data.get('unsafe_cheap_miss_rate', 0):.1%}",
                        f"{data.get('regression_rate', 0):.1%}",
                    ])
                gr.Dataframe(
                    headers=["Baseline", "Success", "Cost/Success", "Total Cost", "Cost Reduction", "False-DONE", "Cheap Miss", "Regression"],
                    value=comparison_data,
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Per-Scenario Breakdown (Full Optimizer)")
                full_data = results.get("full_optimizer", {})
                scenario_stats = full_data.get("per_scenario_stats", {})
                if scenario_stats:
                    scenario_data = []
                    for scenario, stats in scenario_stats.items():
                        count = stats.get("count", 0)
                        success = stats.get("success", 0)
                        cost = stats.get("cost", 0)
                        scenario_data.append([
                            scenario,
                            str(count),
                            f"{success / max(count, 1):.1%}",
                            f"${cost:.2f}",
                        ])
                    gr.Dataframe(
                        headers=["Scenario", "Count", "Success Rate", "Total Cost"],
                        value=scenario_data,
                    )

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Ablation Impact")
                gr.Markdown("Cost increase when removing each module (vs full_optimizer)")

                full_cost = results.get("full_optimizer", {}).get("total_cost", 0)
                ablation_data = []
                for name, data in results.items():
                    if name.startswith("no_"):
                        delta = data.get("total_cost", 0) - full_cost
                        pct = (delta / max(full_cost, 0.001)) * 100
                        ablation_data.append([name, f"${delta:.2f}", f"{pct:.1f}%"])

                if ablation_data:
                    # Sort by cost increase, largest first.
                    ablation_data.sort(key=lambda x: float(x[1].replace("$", "")), reverse=True)
                    gr.Dataframe(
                        headers=["Module Removed", "Cost Increase", "% Increase"],
                        value=ablation_data,
                    )

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Full Report")
                gr.Textbox(value=report_text, lines=40, label="Benchmark Report")

    return demo


def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--results", default="./eval_results_v2/baseline_results.json",
                        help="Path to baseline results JSON")
    parser.add_argument("--report", default="./eval_results_v2/report.txt",
                        help="Path to report text file")
    parser.add_argument("--port", type=int, default=7860)
    args = parser.parse_args()

    demo = build_dashboard(args.results, args.report)
    demo.launch(server_name="0.0.0.0", server_port=args.port)


if __name__ == "__main__":
    main()