"""Gradio Dashboard for Agent Cost Optimizer.

Visualizes (as tables):
- Cost-quality frontier (success rate vs average cost per success)
- Baseline comparison
- Per-scenario breakdown
- Module ablation impact
"""

import json
import sys
from pathlib import Path
from typing import Any, Dict, List, Tuple

try:
    import gradio as gr
except ImportError as _import_error:
    # Keep the pure-data helpers importable without the UI dependency;
    # build_dashboard() re-raises with the original error chained.
    gr = None
    _GRADIO_IMPORT_ERROR = _import_error
else:
    _GRADIO_IMPORT_ERROR = None


_FRONTIER_HEADERS = ["Baseline", "Success Rate", "Cost per Success"]


def load_results(path: str) -> Dict[str, Any]:
    """Load and return the parsed JSON document stored at *path*."""
    # Explicit encoding: the locale default is not UTF-8 on every platform.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def parse_report(report_path: str) -> str:
    """Return the full text of the report file at *report_path*."""
    # The report is only displayed, so undecodable bytes are replaced
    # instead of aborting the whole dashboard.
    return Path(report_path).read_text(encoding="utf-8", errors="replace")


def _success_rate(data: Dict[str, Any]) -> float:
    """Return (num_success + num_partial) / num_tasks for one baseline.

    Missing counters default to 0 and the task count is clamped to at
    least 1, so an empty entry yields 0.0 instead of dividing by zero.
    """
    solved = data.get("num_success", 0) + data.get("num_partial", 0)
    return solved / max(data.get("num_tasks", 1), 1)


def create_frontier_plot(
    results: Dict[str, Any],
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Compute the cost-quality points and their Pareto frontier.

    Args:
        results: Mapping of baseline name to its metrics dict. The keys
            ``num_success``, ``num_partial``, ``num_tasks`` and
            ``avg_cost_success`` are read; missing keys fall back to
            defaults.

    Returns:
        ``(points, frontier)``. Each element is a dict with the keys
        ``baseline``, ``success_rate`` and ``cost_per_success``. ``points``
        holds every baseline sorted by success rate (descending) and then
        cost (ascending); ``frontier`` is the subset of ``points`` that no
        other baseline dominates, in the same order.
    """
    points = [
        {
            "baseline": name,
            "success_rate": _success_rate(data),
            "cost_per_success": data.get("avg_cost_success", 0),
        }
        for name, data in results.items()
    ]

    # Best quality first; among equal quality, cheapest first.
    points.sort(key=lambda p: (-p["success_rate"], p["cost_per_success"]))

    frontier: List[Dict[str, Any]] = []
    for point in points:
        if not frontier:
            frontier.append(point)
            continue
        # Frontier costs never increase, so the last entry is the cheapest.
        best = frontier[-1]
        cheaper = point["cost_per_success"] < best["cost_per_success"]
        # An exact duplicate of the current frontier point is not dominated.
        # Equal cost with a lower success rate is dominated and is dropped.
        duplicate = (
            point["cost_per_success"] == best["cost_per_success"]
            and point["success_rate"] == best["success_rate"]
        )
        if cheaper or duplicate:
            frontier.append(point)

    return points, frontier


def _frontier_rows(points: List[Dict[str, Any]]) -> List[List[str]]:
    """Format frontier points as display rows matching _FRONTIER_HEADERS."""
    return [
        [
            p["baseline"],
            f"{p['success_rate']:.1%}",
            f"${p['cost_per_success']:.4f}",
        ]
        for p in points
    ]


def _comparison_rows(results: Dict[str, Any]) -> List[List[str]]:
    """Format one baseline-comparison row per entry in *results*."""
    return [
        [
            name,
            f"{_success_rate(data):.1%}",
            f"${data.get('avg_cost_success', 0):.4f}",
            f"${data.get('total_cost', 0):.2f}",
            f"{data.get('cost_reduction_vs_frontier', 0):.1%}",
            f"{data.get('false_done_rate', 0):.1%}",
            f"{data.get('unsafe_cheap_miss_rate', 0):.1%}",
            f"{data.get('regression_rate', 0):.1%}",
        ]
        for name, data in results.items()
    ]


def _scenario_rows(scenario_stats: Dict[str, Any]) -> List[List[str]]:
    """Format one per-scenario row (count, success rate, total cost)."""
    rows = []
    for scenario, stats in scenario_stats.items():
        count = stats.get("count", 0)
        success = stats.get("success", 0)
        cost = stats.get("cost", 0)
        rows.append([
            scenario,
            str(count),
            f"{success / max(count, 1):.1%}",
            f"${cost:.2f}",
        ])
    return rows


def _ablation_rows(results: Dict[str, Any]) -> List[List[str]]:
    """Format cost deltas of every ``no_*`` entry against ``full_optimizer``.

    Rows are ordered by cost increase, largest first.
    """
    full_cost = results.get("full_optimizer", {}).get("total_cost", 0)
    deltas = [
        (name, data.get("total_cost", 0) - full_cost)
        for name, data in results.items()
        if name.startswith("no_")
    ]
    # Sort on the numeric delta, not on its rounded display string.
    deltas.sort(key=lambda item: item[1], reverse=True)
    # Clamp the denominator so a missing or zero reference cost cannot
    # divide by zero.
    denominator = max(full_cost, 0.001)
    return [
        [name, f"${delta:.2f}", f"{delta / denominator * 100:.1f}%"]
        for name, delta in deltas
    ]


def build_dashboard(results_path: str, report_path: str):
    """Build the dashboard UI and return it without launching it.

    Args:
        results_path: Path to the baseline results JSON file.
        report_path: Path to the plain-text benchmark report.

    Returns:
        The ``gr.Blocks`` application.

    Raises:
        ImportError: If gradio could not be imported.
    """
    if gr is None:
        raise ImportError(
            "gradio is required to build the dashboard"
        ) from _GRADIO_IMPORT_ERROR

    results = load_results(results_path)
    report_text = parse_report(report_path)
    points, frontier = create_frontier_plot(results)

    with gr.Blocks(title="Agent Cost Optimizer Dashboard") as demo:
        gr.Markdown("# Agent Cost Optimizer - Cost-Quality Dashboard")
        gr.Markdown("Visualize cost-quality tradeoffs across routing strategies and ablations.")

        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("## Cost-Quality Frontier")
                gr.Markdown("**X-axis**: Average cost per successful task | **Y-axis**: Success rate")

                # The frontier data is rendered as tables, not as a chart.
                gr.Dataframe(
                    headers=_FRONTIER_HEADERS,
                    value=_frontier_rows(points),
                    label="All Baselines",
                )
                gr.Dataframe(
                    headers=_FRONTIER_HEADERS,
                    value=_frontier_rows(frontier),
                    label="Pareto Frontier",
                )

            with gr.Column(scale=1):
                gr.Markdown("## Pareto Frontier Baselines")
                for point in frontier:
                    gr.Markdown(f"- **{point['baseline']}**")

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Baseline Comparison")
                gr.Dataframe(
                    headers=[
                        "Baseline",
                        "Success",
                        "Cost/Success",
                        "Total Cost",
                        "Cost Reduction",
                        "False-DONE",
                        "Cheap Miss",
                        "Regression",
                    ],
                    value=_comparison_rows(results),
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Per-Scenario Breakdown (Full Optimizer)")
                scenario_stats = results.get("full_optimizer", {}).get("per_scenario_stats", {})
                # No table is shown when there are no scenario stats.
                if scenario_stats:
                    gr.Dataframe(
                        headers=["Scenario", "Count", "Success Rate", "Total Cost"],
                        value=_scenario_rows(scenario_stats),
                    )

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Ablation Impact")
                gr.Markdown("Cost increase when removing each module (vs full_optimizer)")
                ablation_data = _ablation_rows(results)
                # No table is shown when there are no "no_*" entries.
                if ablation_data:
                    gr.Dataframe(
                        headers=["Module Removed", "Cost Increase", "% Increase"],
                        value=ablation_data,
                    )

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Full Report")
                gr.Textbox(report_text, lines=40, label="Benchmark Report")

    return demo


def main():
    """Parse command-line options, build the dashboard and serve it."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--results", default="./eval_results_v2/baseline_results.json",
                        help="Path to baseline results JSON")
    parser.add_argument("--report", default="./eval_results_v2/report.txt",
                        help="Path to report text file")
    parser.add_argument("--port", type=int, default=7860)
    # NOTE: the default listens on every network interface (unchanged from
    # the previous hard-coded value); pass --host 127.0.0.1 for local-only.
    parser.add_argument("--host", default="0.0.0.0",
                        help="Interface to bind the server to")
    args = parser.parse_args()

    demo = build_dashboard(args.results, args.report)
    demo.launch(server_name=args.host, server_port=args.port)


if __name__ == "__main__":
    main()