narcolepticchicken committed
Commit 11d5cdf · verified · 1 Parent(s): 284d6c8

Upload dashboard.py

Files changed (1)
  1. dashboard.py +176 -0
dashboard.py ADDED
@@ -0,0 +1,176 @@
+"""Gradio Dashboard for Agent Cost Optimizer.
+
+Visualizes:
+- Cost-quality frontier (scatter plot: success rate vs avg cost)
+- Baseline comparison bar charts
+- Per-scenario breakdown
+- Module ablation impact
+"""
+
+import json
+import sys
+from pathlib import Path
+from typing import Dict, List, Any
+
+import gradio as gr
+
+
+def load_results(path: str) -> Dict[str, Any]:
+    with open(path) as f:
+        return json.load(f)
+
+
+def parse_report(report_path: str) -> str:
+    with open(report_path) as f:
+        return f.read()
+
+
+def create_frontier_plot(results: Dict[str, Any]):
+    """Create scatter plot data for cost-quality frontier."""
+    points = []
+    for name, data in results.items():
+        success = (data.get("num_success", 0) + data.get("num_partial", 0)) / max(data.get("num_tasks", 1), 1)
+        cost = data.get("avg_cost_success", 0)
+        points.append({"baseline": name, "success_rate": success, "cost_per_success": cost})
+
+    # Sort by success rate desc, cost asc for frontier
+    points.sort(key=lambda p: (-p["success_rate"], p["cost_per_success"]))
+
+    # Build Pareto frontier
+    frontier = []
+    min_cost = float("inf")
+    for p in points:
+        if p["cost_per_success"] <= min_cost:
+            frontier.append(p)
+            min_cost = p["cost_per_success"]
+
+    return points, frontier
+
+
+def build_dashboard(results_path: str, report_path: str):
+    results = load_results(results_path)
+    report_text = parse_report(report_path)
+    points, frontier = create_frontier_plot(results)
+
+    with gr.Blocks(title="Agent Cost Optimizer Dashboard") as demo:
+        gr.Markdown("# Agent Cost Optimizer - Cost-Quality Dashboard")
+        gr.Markdown("Visualize cost-quality tradeoffs across routing strategies and ablations.")
+
+        with gr.Row():
+            with gr.Column(scale=2):
+                gr.Markdown("## Cost-Quality Frontier")
+                gr.Markdown("**X-axis**: Average cost per successful task | **Y-axis**: Success rate")
+
+                # Scatter plot using native Gradio components
+                scatter_data = [
+                    [p["baseline"], f"{p['success_rate']:.1%}", f"${p['cost_per_success']:.4f}"]
+                    for p in points
+                ]
+                gr.Dataframe(
+                    headers=["Baseline", "Success Rate", "Cost per Success"],
+                    value=scatter_data,
+                    label="All Baselines",
+                )
+
+                frontier_data = [
+                    [p["baseline"], f"{p['success_rate']:.1%}", f"${p['cost_per_success']:.4f}"]
+                    for p in frontier
+                ]
+                gr.Dataframe(
+                    headers=["Baseline", "Success Rate", "Cost per Success"],
+                    value=frontier_data,
+                    label="Pareto Frontier",
+                )
+
+            with gr.Column(scale=1):
+                gr.Markdown("## Pareto Frontier Baselines")
+                pareto_names = [p["baseline"] for p in frontier]
+                for name in pareto_names:
+                    gr.Markdown(f"- **{name}**")
+
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("## Baseline Comparison")
+                comparison_data = []
+                for name, data in results.items():
+                    comparison_data.append([
+                        name,
+                        f"{(data.get('num_success',0)+data.get('num_partial',0))/max(data.get('num_tasks',1),1):.1%}",
+                        f"${data.get('avg_cost_success',0):.4f}",
+                        f"${data.get('total_cost',0):.2f}",
+                        f"{data.get('cost_reduction_vs_frontier',0):.1%}",
+                        f"{data.get('false_done_rate',0):.1%}",
+                        f"{data.get('unsafe_cheap_miss_rate',0):.1%}",
+                        f"{data.get('regression_rate',0):.1%}",
+                    ])
+                gr.Dataframe(
+                    headers=["Baseline", "Success", "Cost/Success", "Total Cost", "Cost Reduction", "False-DONE", "Cheap Miss", "Regression"],
+                    value=comparison_data,
+                )
+
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("## Per-Scenario Breakdown (Full Optimizer)")
+                full_data = results.get("full_optimizer", {})
+                scenario_stats = full_data.get("per_scenario_stats", {})
+                if scenario_stats:
+                    scenario_data = []
+                    for scenario, stats in scenario_stats.items():
+                        count = stats.get("count", 0)
+                        success = stats.get("success", 0)
+                        cost = stats.get("cost", 0)
+                        scenario_data.append([
+                            scenario,
+                            str(count),
+                            f"{success/max(count,1):.1%}",
+                            f"${cost:.2f}",
+                        ])
+                    gr.Dataframe(
+                        headers=["Scenario", "Count", "Success Rate", "Total Cost"],
+                        value=scenario_data,
+                    )
+
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("## Ablation Impact")
+                gr.Markdown("Cost increase when removing each module (vs full_optimizer)")
+
+                full_cost = results.get("full_optimizer", {}).get("total_cost", 0)
+                ablation_data = []
+                for name, data in results.items():
+                    if name.startswith("no_"):
+                        delta = data.get("total_cost", 0) - full_cost
+                        pct = (delta / max(full_cost, 0.001)) * 100
+                        ablation_data.append([name, f"${delta:.2f}", f"{pct:.1f}%"])
+
+                if ablation_data:
+                    ablation_data.sort(key=lambda x: float(x[1].replace("$", "")), reverse=True)
+                    gr.Dataframe(
+                        headers=["Module Removed", "Cost Increase", "% Increase"],
+                        value=ablation_data,
+                    )
+
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("## Full Report")
+                gr.Textbox(report_text, lines=40, label="Benchmark Report")
+
+    return demo
+
+
+def main():
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--results", default="./eval_results_v2/baseline_results.json",
+                        help="Path to baseline results JSON")
+    parser.add_argument("--report", default="./eval_results_v2/report.txt",
+                        help="Path to report text file")
+    parser.add_argument("--port", type=int, default=7860)
+    args = parser.parse_args()
+
+    demo = build_dashboard(args.results, args.report)
+    demo.launch(server_name="0.0.0.0", server_port=args.port)
+
+
+if __name__ == "__main__":
+    main()
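
Note: the schema of baseline_results.json is not documented in this commit; the sketch below is inferred from the keys dashboard.py reads (num_tasks, num_success, num_partial, avg_cost_success, total_cost, the rate fields, and per_scenario_stats). All baseline names and numbers are hypothetical placeholders, shown only to illustrate the expected input shape and how to launch the dashboard against it.

# make_example_input.py - writes a placeholder results file for trying dashboard.py locally.
# Schema is an assumption based on the keys the dashboard accesses; values are made up.
import json
from pathlib import Path

example_results = {
    "full_optimizer": {
        "num_tasks": 50,
        "num_success": 40,
        "num_partial": 5,
        "avg_cost_success": 0.0123,
        "total_cost": 0.70,
        "cost_reduction_vs_frontier": 0.35,
        "false_done_rate": 0.02,
        "unsafe_cheap_miss_rate": 0.00,
        "regression_rate": 0.02,
        # Per-scenario stats feed the "Per-Scenario Breakdown" table.
        "per_scenario_stats": {
            "code_fix": {"count": 25, "success": 22, "cost": 0.40},
            "web_lookup": {"count": 25, "success": 23, "cost": 0.30},
        },
    },
    # Names starting with "no_" are treated as ablations in the "Ablation Impact" table.
    "no_router": {
        "num_tasks": 50,
        "num_success": 41,
        "num_partial": 4,
        "avg_cost_success": 0.0251,
        "total_cost": 1.32,
    },
}

Path("baseline_results.json").write_text(json.dumps(example_results, indent=2))
Path("report.txt").write_text("placeholder report\n")

# Then launch the dashboard against the placeholder files:
#   python dashboard.py --results baseline_results.json --report report.txt --port 7860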