narcolepticchicken committed · verified
Commit d1120af · 1 Parent(s): 1fdedf8

Upload app.py

Files changed (1)
  1. app.py +160 -0
app.py ADDED
@@ -0,0 +1,160 @@
"""Gradio Space for Agent Cost Optimizer Dashboard.

This app visualizes cost-quality frontiers from ACO benchmark runs.
"""

import json
from pathlib import Path
from typing import Dict, List, Any

import gradio as gr

def load_results(path: str) -> Dict[str, Any]:
    with open(path) as f:
        return json.load(f)


def parse_report(report_path: str) -> str:
    with open(report_path) as f:
        return f.read()

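# Note: despite its name, create_frontier_plot does not draw a figure; it returns
# (points, frontier) lists that the dashboard renders as Dataframe tables.
# Frontier selection: sort baselines by descending success rate (ties broken by
# lower cost), then keep each point whose cost per success is no higher than the
# cheapest point kept so far.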
def create_frontier_plot(results: Dict[str, Any]):
    points = []
    for name, data in results.items():
        success = (data.get("num_success", 0) + data.get("num_partial", 0)) / max(data.get("num_tasks", 1), 1)
        cost = data.get("avg_cost_success", 0)
        points.append({"baseline": name, "success_rate": success, "cost_per_success": cost})
    points.sort(key=lambda p: (-p["success_rate"], p["cost_per_success"]))
    frontier = []
    min_cost = float("inf")
    for p in points:
        if p["cost_per_success"] <= min_cost:
            frontier.append(p)
            min_cost = p["cost_per_success"]
    return points, frontier

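# Assembles the Blocks UI from eval_results_v2/: frontier tables, baseline
# comparison, per-scenario breakdown, ablation impact, and the raw report text.
# Falls back to a minimal Interface when the benchmark artifacts are missing.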
def build_dashboard():
    results_path = Path("eval_results_v2/baseline_results.json")
    report_path = Path("eval_results_v2/report.txt")

    if not results_path.exists() or not report_path.exists():
        return gr.Interface(
            fn=lambda: "Run benchmark first: python standalone_eval_v2.py",
            inputs=[], outputs="text", title="ACO Dashboard (No Data)"
        )

    results = load_results(str(results_path))
    report_text = parse_report(str(report_path))
    points, frontier = create_frontier_plot(results)

    with gr.Blocks(title="Agent Cost Optimizer Dashboard") as demo:
        gr.Markdown("# Agent Cost Optimizer - Cost-Quality Dashboard")
        gr.Markdown("Visualize cost-quality tradeoffs across routing strategies and ablations.")

        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("## Cost-Quality Frontier")
                gr.Markdown("**X-axis**: Average cost per successful task | **Y-axis**: Success rate")

                scatter_data = [
                    [p["baseline"], f"{p['success_rate']:.1%}", f"${p['cost_per_success']:.4f}"]
                    for p in points
                ]
                gr.Dataframe(
                    headers=["Baseline", "Success Rate", "Cost per Success"],
                    value=scatter_data,
                    label="All Baselines",
                )

                frontier_data = [
                    [p["baseline"], f"{p['success_rate']:.1%}", f"${p['cost_per_success']:.4f}"]
                    for p in frontier
                ]
                gr.Dataframe(
                    headers=["Baseline", "Success Rate", "Cost per Success"],
                    value=frontier_data,
                    label="Pareto Frontier",
                )

            with gr.Column(scale=1):
                gr.Markdown("## Pareto Frontier Baselines")
                pareto_names = [p["baseline"] for p in frontier]
                for name in pareto_names:
                    gr.Markdown(f"- **{name}**")

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Baseline Comparison")
                comparison_data = []
                for name, data in results.items():
                    comparison_data.append([
                        name,
                        f"{(data.get('num_success',0)+data.get('num_partial',0))/max(data.get('num_tasks',1),1):.1%}",
                        f"${data.get('avg_cost_success',0):.4f}",
                        f"${data.get('total_cost',0):.2f}",
                        f"{data.get('cost_reduction_vs_frontier',0):.1%}",
                        f"{data.get('false_done_rate',0):.1%}",
                        f"{data.get('unsafe_cheap_miss_rate',0):.1%}",
                        f"{data.get('regression_rate',0):.1%}",
                    ])
                gr.Dataframe(
                    headers=["Baseline", "Success", "Cost/Success", "Total Cost", "Cost Reduction", "False-DONE", "Cheap Miss", "Regression"],
                    value=comparison_data,
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Per-Scenario Breakdown (Full Optimizer)")
                full_data = results.get("full_optimizer", {})
                scenario_stats = full_data.get("per_scenario_stats", {})
                if scenario_stats:
                    scenario_data = []
                    for scenario, stats in scenario_stats.items():
                        count = stats.get("count", 0)
                        success = stats.get("success", 0)
                        cost = stats.get("cost", 0)
                        scenario_data.append([
                            scenario,
                            str(count),
                            f"{success/max(count,1):.1%}",
                            f"${cost:.2f}",
                        ])
                    gr.Dataframe(
                        headers=["Scenario", "Count", "Success Rate", "Total Cost"],
                        value=scenario_data,
                    )

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Ablation Impact")
                gr.Markdown("Cost increase when removing each module (vs full_optimizer)")

                full_cost = results.get("full_optimizer", {}).get("total_cost", 0)
                ablation_data = []
                for name, data in results.items():
                    if name.startswith("no_"):
                        delta = data.get("total_cost", 0) - full_cost
                        pct = (delta / max(full_cost, 0.001)) * 100
                        ablation_data.append([name, f"${delta:.2f}", f"{pct:.1f}%"])

                if ablation_data:
                    ablation_data.sort(key=lambda x: float(x[1].replace("$", "")), reverse=True)
                    gr.Dataframe(
                        headers=["Module Removed", "Cost Increase", "% Increase"],
                        value=ablation_data,
                    )

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Full Report")
                gr.Textbox(report_text, lines=40, label="Benchmark Report", interactive=False)

    return demo

if __name__ == "__main__":
    demo = build_dashboard()
    demo.launch(server_name="0.0.0.0", server_port=7860)
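For reference, the dashboard only reads a handful of fields per baseline. A minimal eval_results_v2/baseline_results.json entry consistent with the data.get(...) lookups above would look roughly like the sketch below (key names are taken from the code; the numbers and the scenario name are purely illustrative, and the real standalone_eval_v2.py output may carry additional fields):

    {
      "full_optimizer": {
        "num_tasks": 100,
        "num_success": 68,
        "num_partial": 12,
        "avg_cost_success": 0.0421,
        "total_cost": 5.12,
        "cost_reduction_vs_frontier": 0.35,
        "false_done_rate": 0.02,
        "unsafe_cheap_miss_rate": 0.01,
        "regression_rate": 0.03,
        "per_scenario_stats": {
          "<scenario_name>": {"count": 25, "success": 19, "cost": 1.40}
        }
      },
      "no_<module>": {"...": "same fields for each ablation and baseline"}
    }

With both files present, running app.py launches the dashboard on port 7860 via the __main__ block above.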