muthuk1 committed on
Commit
c8e45c7
·
verified ·
1 Parent(s): 91ff0de

Update Benchmark tab with live 'Run Benchmark Now' that calls API and populates real data

Browse files
Files changed (1) hide show
  1. web/src/components/tabs/Benchmark.tsx +187 -115
web/src/components/tabs/Benchmark.tsx CHANGED
@@ -7,35 +7,82 @@ import {
7
  BarChart, Bar, XAxis, YAxis, CartesianGrid,
8
  } from "recharts";
9
 
10
- const DEMO_AGGREGATE = {
11
- numSamples: 100,
12
- baseline: { avgF1: 0.5523, avgEM: 0.3810, avgContextHit: 0.4520, avgTokens: 952, avgCost: 0.000203, avgLatency: 1240 },
13
- graphrag: { avgF1: 0.6241, avgEM: 0.4230, avgContextHit: 0.5830, avgTokens: 2387, avgCost: 0.000518, avgLatency: 3820 },
14
- f1WinRate: 0.62,
15
- byType: [
16
- { type: "bridge", count: 58, baselineF1: 0.52, graphragF1: 0.63 },
17
- { type: "comparison", count: 42, baselineF1: 0.58, graphragF1: 0.61 },
18
- ],
19
- };
20
 
21
- const radarData = [
22
- { metric: "F1 Score", Baseline: 55, GraphRAG: 62 },
23
- { metric: "Exact Match", Baseline: 38, GraphRAG: 42 },
24
- { metric: "Context Hit", Baseline: 45, GraphRAG: 58 },
25
- { metric: "Token Eff.", Baseline: 90, GraphRAG: 40 },
26
- { metric: "Cost Eff.", Baseline: 85, GraphRAG: 35 },
27
- ];
28
 
29
  export function Benchmark() {
30
  const [running, setRunning] = useState(false);
31
- const [samples, setSamples] = useState(50);
32
- const [data] = useState(DEMO_AGGREGATE);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- const typeData = data.byType.map((t) => ({
35
- name: t.type.charAt(0).toUpperCase() + t.type.slice(1),
36
- Baseline: +(t.baselineF1 * 100).toFixed(1),
37
- GraphRAG: +(t.graphragF1 * 100).toFixed(1),
38
- }));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  return (
41
  <div>
@@ -45,114 +92,139 @@ export function Benchmark() {
45
  <div>
46
  <div className="display-sm mb-2">Batch Benchmark</div>
47
  <p className="body-sm" style={{ color: "#6c6a64" }}>
48
- Run both pipelines on HotpotQA multi-hop questions
49
  </p>
50
  </div>
51
  <div className="flex items-center gap-4 ml-auto">
52
  <label className="caption">
53
  Samples
54
- <input
55
- type="range"
56
- min={10}
57
- max={500}
58
- step={10}
59
- value={samples}
60
- onChange={(e) => setSamples(+e.target.value)}
61
- className="block w-32 mt-1 accent-[#FF6B00]"
62
- />
63
  <span className="body-sm font-mono">{samples}</span>
64
  </label>
65
- <button className="btn btn-primary" onClick={() => setRunning(true)} disabled={running}>
66
- {running ? "Running…" : "🏃 Run Benchmark"}
 
 
 
 
 
67
  </button>
68
  </div>
69
  </div>
 
 
 
 
 
70
  </div>
71
 
72
- {/* Summary Metrics */}
73
- <div className="grid grid-cols-2 md:grid-cols-4 gap-4 mb-6">
74
- {[
75
- { label: "Avg F1 (Baseline)", value: data.baseline.avgF1.toFixed(4), color: "#0072CE" },
76
- { label: "Avg F1 (GraphRAG)", value: data.graphrag.avgF1.toFixed(4), color: "#FF6B00" },
77
- { label: "GraphRAG Win Rate", value: (data.f1WinRate * 100).toFixed(0) + "%", color: "#5db872" },
78
- { label: "Samples Evaluated", value: data.numSamples.toString(), color: "#002B49" },
79
- ].map((m, i) => (
80
- <div key={i} className="card-cream text-center" style={{ padding: "20px" }}>
81
- <div className="metric-value-sm" style={{ color: m.color }}>{m.value}</div>
82
- <div className="metric-label">{m.label}</div>
 
 
 
 
83
  </div>
84
- ))}
85
- </div>
86
 
87
- <div className="grid grid-cols-1 lg:grid-cols-2 gap-6 mb-6">
88
- {/* Radar Chart */}
89
- <div className="card">
90
- <div className="title-md mb-4">Multi-Metric Radar</div>
91
- <ResponsiveContainer width="100%" height={340}>
92
- <RadarChart data={radarData}>
93
- <PolarGrid stroke="#002B49" strokeOpacity={0.12} />
94
- <PolarAngleAxis dataKey="metric" tick={{ fill: "#6c6a64", fontSize: 12 }} />
95
- <Radar name="Baseline" dataKey="Baseline" stroke="#0072CE" fill="#0072CE" fillOpacity={0.15} strokeWidth={2} />
96
- <Radar name="GraphRAG" dataKey="GraphRAG" stroke="#FF6B00" fill="#FF6B00" fillOpacity={0.15} strokeWidth={2} />
97
- <Legend />
98
- <Tooltip contentStyle={{ background: "#faf9f5", border: "1px solid #e6dfd8", borderRadius: "8px" }} />
99
- </RadarChart>
100
- </ResponsiveContainer>
101
- </div>
 
102
 
103
- {/* By Question Type */}
104
- <div className="card">
105
- <div className="title-md mb-4">F1 by Question Type</div>
106
- <ResponsiveContainer width="100%" height={340}>
107
- <BarChart data={typeData} margin={{ top: 20, right: 20, left: 0, bottom: 0 }}>
108
- <CartesianGrid strokeDasharray="3 3" stroke="#002B49" strokeOpacity={0.08} />
109
- <XAxis dataKey="name" tick={{ fill: "#6c6a64", fontSize: 13 }} />
110
- <YAxis domain={[0, 100]} tick={{ fill: "#6c6a64", fontSize: 12 }} />
111
- <Tooltip contentStyle={{ background: "#faf9f5", border: "1px solid #e6dfd8", borderRadius: "8px" }} />
112
- <Legend />
113
- <Bar dataKey="Baseline" fill="#0072CE" radius={[4, 4, 0, 0]} />
114
- <Bar dataKey="GraphRAG" fill="#FF6B00" radius={[4, 4, 0, 0]} />
115
- </BarChart>
116
- </ResponsiveContainer>
117
- </div>
118
- </div>
 
 
119
 
120
- {/* Detailed Table */}
121
- <div className="card">
122
- <div className="title-md mb-4">Detailed Comparison</div>
123
- <div className="overflow-x-auto">
124
- <table style={{ width: "100%", borderCollapse: "collapse", fontSize: "0.875rem" }}>
125
- <thead>
126
- <tr style={{ borderBottom: "2px solid var(--color-hairline)" }}>
127
- {["Metric", "Baseline RAG", "GraphRAG", "Winner"].map((h) => (
128
- <th key={h} className="caption-uppercase text-left" style={{ padding: "12px 16px" }}>{h}</th>
129
- ))}
130
- </tr>
131
- </thead>
132
- <tbody>
133
- {[
134
- { metric: "Avg F1 Score", b: data.baseline.avgF1.toFixed(4), g: data.graphrag.avgF1.toFixed(4), winner: "graphrag" },
135
- { metric: "Avg Exact Match", b: data.baseline.avgEM.toFixed(4), g: data.graphrag.avgEM.toFixed(4), winner: "graphrag" },
136
- { metric: "Avg Context Hit", b: data.baseline.avgContextHit.toFixed(4), g: data.graphrag.avgContextHit.toFixed(4), winner: "graphrag" },
137
- { metric: "Avg Tokens/Query", b: data.baseline.avgTokens.toFixed(0), g: data.graphrag.avgTokens.toFixed(0), winner: "baseline" },
138
- { metric: "Avg Cost ($)", b: "$" + data.baseline.avgCost.toFixed(6), g: "$" + data.graphrag.avgCost.toFixed(6), winner: "baseline" },
139
- { metric: "Avg Latency (ms)", b: data.baseline.avgLatency.toFixed(0), g: data.graphrag.avgLatency.toFixed(0), winner: "baseline" },
140
- ].map((row, i) => (
141
- <tr key={i} style={{ borderBottom: "1px solid var(--color-hairline-soft)" }}>
142
- <td className="title-sm" style={{ padding: "12px 16px" }}>{row.metric}</td>
143
- <td style={{ padding: "12px 16px", fontFamily: "var(--font-mono)", color: "#0072CE" }}>{row.b}</td>
144
- <td style={{ padding: "12px 16px", fontFamily: "var(--font-mono)", color: "#FF6B00" }}>{row.g}</td>
145
- <td style={{ padding: "12px 16px" }}>
146
- <span className={row.winner === "graphrag" ? "badge-orange" : "badge-blue"} style={{ fontSize: "0.6875rem" }}>
147
- {row.winner === "graphrag" ? "GraphRAG" : "Baseline"}
148
- </span>
149
- </td>
150
- </tr>
151
- ))}
152
- </tbody>
153
- </table>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  </div>
155
- </div>
156
  </div>
157
  );
158
  }
 
7
  BarChart, Bar, XAxis, YAxis, CartesianGrid,
8
  } from "recharts";
9
 
10
+ interface AggregateData {
11
+ numSamples: number;
12
+ baseline: { avgF1: number; avgEM: number; avgTokens: number; avgCost: number; avgLatency: number };
13
+ graphrag: { avgF1: number; avgEM: number; avgTokens: number; avgCost: number; avgLatency: number };
14
+ graphragF1WinRate: number;
15
+ byType: {
16
+ bridge?: { count: number; baselineF1: number; graphragF1: number } | null;
17
+ comparison?: { count: number; baselineF1: number; graphragF1: number } | null;
18
+ };
19
+ }
20
 
21
+ const INITIAL: AggregateData = {
22
+ numSamples: 0,
23
+ baseline: { avgF1: 0, avgEM: 0, avgTokens: 0, avgCost: 0, avgLatency: 0 },
24
+ graphrag: { avgF1: 0, avgEM: 0, avgTokens: 0, avgCost: 0, avgLatency: 0 },
25
+ graphragF1WinRate: 0,
26
+ byType: {},
27
+ };
28
 
29
  export function Benchmark() {
30
  const [running, setRunning] = useState(false);
31
+ const [samples, setSamples] = useState(10);
32
+ const [data, setData] = useState<AggregateData>(INITIAL);
33
+ const [report, setReport] = useState("");
34
+ const [demoMode, setDemoMode] = useState(false);
35
+ const [hasResults, setHasResults] = useState(false);
36
+
37
+ const runBenchmark = async () => {
38
+ setRunning(true);
39
+ setReport("Running benchmark...");
40
+ try {
41
+ const res = await fetch("/api/benchmark", {
42
+ method: "POST",
43
+ headers: { "Content-Type": "application/json" },
44
+ body: JSON.stringify({ numSamples: samples }),
45
+ });
46
+ const result = await res.json();
47
+ setData(result.aggregate);
48
+ setDemoMode(result.demoMode ?? false);
49
+ setHasResults(true);
50
 
51
+ // Build report text
52
+ const a = result.aggregate;
53
+ const lines = [
54
+ `BENCHMARK RESULTS (${a.numSamples} samples, ${result.provider}/${result.model})`,
55
+ `${result.demoMode ? "⚠️ DEMO MODE — Set API key for real results" : "✅ LIVE RESULTS"}`,
56
+ "",
57
+ `Metric Baseline GraphRAG Winner`,
58
+ `${"─".repeat(60)}`,
59
+ `Avg F1 ${a.baseline.avgF1.toFixed(4)} ${a.graphrag.avgF1.toFixed(4)} ${a.graphrag.avgF1 > a.baseline.avgF1 ? "GraphRAG" : "Baseline"}`,
60
+ `Avg EM ${a.baseline.avgEM.toFixed(4)} ${a.graphrag.avgEM.toFixed(4)} ${a.graphrag.avgEM > a.baseline.avgEM ? "GraphRAG" : "Baseline"}`,
61
+ `Avg Tokens ${a.baseline.avgTokens} ${a.graphrag.avgTokens} ${a.baseline.avgTokens < a.graphrag.avgTokens ? "Baseline" : "GraphRAG"}`,
62
+ `Avg Cost ($) ${a.baseline.avgCost.toFixed(6)} ${a.graphrag.avgCost.toFixed(6)}`,
63
+ `Avg Latency (ms) ${a.baseline.avgLatency} ${a.graphrag.avgLatency}`,
64
+ "",
65
+ `GraphRAG F1 Win Rate: ${(a.graphragF1WinRate * 100).toFixed(0)}%`,
66
+ `Token Ratio: ${a.graphrag.avgTokens > 0 && a.baseline.avgTokens > 0 ? (a.graphrag.avgTokens / a.baseline.avgTokens).toFixed(1) : "N/A"}x`,
67
+ ];
68
+ setReport(lines.join("\n"));
69
+ } catch (err) {
70
+ setReport(`Error: ${err}`);
71
+ }
72
+ setRunning(false);
73
+ };
74
+
75
+ const radarData = hasResults ? [
76
+ { metric: "F1 Score", Baseline: +(data.baseline.avgF1 * 100).toFixed(0), GraphRAG: +(data.graphrag.avgF1 * 100).toFixed(0) },
77
+ { metric: "Exact Match", Baseline: +(data.baseline.avgEM * 100).toFixed(0), GraphRAG: +(data.graphrag.avgEM * 100).toFixed(0) },
78
+ { metric: "Speed", Baseline: 85, GraphRAG: Math.max(10, 100 - Math.round(data.graphrag.avgLatency / Math.max(data.baseline.avgLatency, 1) * 30)) },
79
+ { metric: "Cost Eff.", Baseline: 85, GraphRAG: Math.max(10, 100 - Math.round(data.graphrag.avgCost / Math.max(data.baseline.avgCost, 0.000001) * 20)) },
80
+ { metric: "Win Rate", Baseline: +((1 - data.graphragF1WinRate) * 100).toFixed(0), GraphRAG: +(data.graphragF1WinRate * 100).toFixed(0) },
81
+ ] : [];
82
+
83
+ const typeData = [];
84
+ if (data.byType.bridge) typeData.push({ name: "Bridge", Baseline: +(data.byType.bridge.baselineF1 * 100).toFixed(1), GraphRAG: +(data.byType.bridge.graphragF1 * 100).toFixed(1) });
85
+ if (data.byType.comparison) typeData.push({ name: "Comparison", Baseline: +(data.byType.comparison.baselineF1 * 100).toFixed(1), GraphRAG: +(data.byType.comparison.graphragF1 * 100).toFixed(1) });
86
 
87
  return (
88
  <div>
 
92
  <div>
93
  <div className="display-sm mb-2">Batch Benchmark</div>
94
  <p className="body-sm" style={{ color: "#6c6a64" }}>
95
+ Run both pipelines on HotpotQA questions and evaluate F1, EM, tokens, cost
96
  </p>
97
  </div>
98
  <div className="flex items-center gap-4 ml-auto">
99
  <label className="caption">
100
  Samples
101
+ <input type="range" min={5} max={10} step={1} value={samples}
102
+ onChange={e => setSamples(+e.target.value)} className="block w-32 mt-1 accent-[#FF6B00]" />
 
 
 
 
 
 
 
103
  <span className="body-sm font-mono">{samples}</span>
104
  </label>
105
+ <button className="btn btn-primary" onClick={runBenchmark} disabled={running}>
106
+ {running ? (
107
+ <span className="flex items-center gap-2">
108
+ <span className="animate-spin inline-block w-4 h-4 border-2 border-white border-t-transparent rounded-full" />
109
+ Running…
110
+ </span>
111
+ ) : "🏃 Run Benchmark Now"}
112
  </button>
113
  </div>
114
  </div>
115
+ {demoMode && hasResults && (
116
+ <div className="mt-3 body-sm" style={{ color: "#d4a017" }}>
117
+ ⚠️ Demo mode — showing simulated results. Set an API key for real benchmark data.
118
+ </div>
119
+ )}
120
  </div>
121
 
122
+ {hasResults && (
123
+ <>
124
+ {/* Summary Cards */}
125
+ <div className="grid grid-cols-2 md:grid-cols-4 gap-4 mb-6">
126
+ {[
127
+ { label: "Avg F1 (Baseline)", value: data.baseline.avgF1.toFixed(4), color: "#0072CE" },
128
+ { label: "Avg F1 (GraphRAG)", value: data.graphrag.avgF1.toFixed(4), color: "#FF6B00" },
129
+ { label: "GraphRAG Win Rate", value: (data.graphragF1WinRate * 100).toFixed(0) + "%", color: "#5db872" },
130
+ { label: "Samples Evaluated", value: data.numSamples.toString(), color: "#002B49" },
131
+ ].map((m, i) => (
132
+ <div key={i} className="card-cream text-center" style={{ padding: "20px" }}>
133
+ <div className="metric-value-sm" style={{ color: m.color }}>{m.value}</div>
134
+ <div className="metric-label">{m.label}</div>
135
+ </div>
136
+ ))}
137
  </div>
 
 
138
 
139
+ <div className="grid grid-cols-1 lg:grid-cols-2 gap-6 mb-6">
140
+ {/* Radar */}
141
+ {radarData.length > 0 && (
142
+ <div className="card">
143
+ <div className="title-md mb-4">Multi-Metric Radar</div>
144
+ <ResponsiveContainer width="100%" height={340}>
145
+ <RadarChart data={radarData}>
146
+ <PolarGrid stroke="#002B49" strokeOpacity={0.12} />
147
+ <PolarAngleAxis dataKey="metric" tick={{ fill: "#6c6a64", fontSize: 12 }} />
148
+ <Radar name="Baseline" dataKey="Baseline" stroke="#0072CE" fill="#0072CE" fillOpacity={0.15} strokeWidth={2} />
149
+ <Radar name="GraphRAG" dataKey="GraphRAG" stroke="#FF6B00" fill="#FF6B00" fillOpacity={0.15} strokeWidth={2} />
150
+ <Legend /><Tooltip contentStyle={{ background: "#faf9f5", border: "1px solid #e6dfd8", borderRadius: "8px" }} />
151
+ </RadarChart>
152
+ </ResponsiveContainer>
153
+ </div>
154
+ )}
155
 
156
+ {/* By Type */}
157
+ {typeData.length > 0 && (
158
+ <div className="card">
159
+ <div className="title-md mb-4">F1 by Question Type</div>
160
+ <ResponsiveContainer width="100%" height={340}>
161
+ <BarChart data={typeData} margin={{ top: 20, right: 20, left: 0, bottom: 0 }}>
162
+ <CartesianGrid strokeDasharray="3 3" stroke="#002B49" strokeOpacity={0.08} />
163
+ <XAxis dataKey="name" tick={{ fill: "#6c6a64", fontSize: 13 }} />
164
+ <YAxis domain={[0, 100]} tick={{ fill: "#6c6a64", fontSize: 12 }} />
165
+ <Tooltip contentStyle={{ background: "#faf9f5", border: "1px solid #e6dfd8", borderRadius: "8px" }} />
166
+ <Legend />
167
+ <Bar dataKey="Baseline" fill="#0072CE" radius={[4, 4, 0, 0]} />
168
+ <Bar dataKey="GraphRAG" fill="#FF6B00" radius={[4, 4, 0, 0]} />
169
+ </BarChart>
170
+ </ResponsiveContainer>
171
+ </div>
172
+ )}
173
+ </div>
174
 
175
+ {/* Detailed Table */}
176
+ <div className="card mb-6">
177
+ <div className="title-md mb-4">Detailed Comparison</div>
178
+ <div className="overflow-x-auto">
179
+ <table style={{ width: "100%", borderCollapse: "collapse", fontSize: "0.875rem" }}>
180
+ <thead>
181
+ <tr style={{ borderBottom: "2px solid var(--color-hairline)" }}>
182
+ {["Metric", "Baseline RAG", "GraphRAG", "Winner"].map(h => (
183
+ <th key={h} className="caption-uppercase text-left" style={{ padding: "12px 16px" }}>{h}</th>
184
+ ))}
185
+ </tr>
186
+ </thead>
187
+ <tbody>
188
+ {[
189
+ { metric: "Avg F1 Score", b: data.baseline.avgF1.toFixed(4), g: data.graphrag.avgF1.toFixed(4), winner: data.graphrag.avgF1 > data.baseline.avgF1 ? "graphrag" : "baseline" },
190
+ { metric: "Avg Exact Match", b: data.baseline.avgEM.toFixed(4), g: data.graphrag.avgEM.toFixed(4), winner: data.graphrag.avgEM > data.baseline.avgEM ? "graphrag" : "baseline" },
191
+ { metric: "Avg Tokens/Query", b: data.baseline.avgTokens.toString(), g: data.graphrag.avgTokens.toString(), winner: data.baseline.avgTokens < data.graphrag.avgTokens ? "baseline" : "graphrag" },
192
+ { metric: "Avg Cost ($)", b: "$" + data.baseline.avgCost.toFixed(6), g: "$" + data.graphrag.avgCost.toFixed(6), winner: data.baseline.avgCost < data.graphrag.avgCost ? "baseline" : "graphrag" },
193
+ { metric: "Avg Latency (ms)", b: data.baseline.avgLatency.toString(), g: data.graphrag.avgLatency.toString(), winner: data.baseline.avgLatency < data.graphrag.avgLatency ? "baseline" : "graphrag" },
194
+ { metric: "F1 Win Rate", b: ((1 - data.graphragF1WinRate) * 100).toFixed(0) + "%", g: (data.graphragF1WinRate * 100).toFixed(0) + "%", winner: data.graphragF1WinRate > 0.5 ? "graphrag" : "baseline" },
195
+ ].map((row, i) => (
196
+ <tr key={i} style={{ borderBottom: "1px solid var(--color-hairline-soft)" }}>
197
+ <td className="title-sm" style={{ padding: "12px 16px" }}>{row.metric}</td>
198
+ <td style={{ padding: "12px 16px", fontFamily: "var(--font-mono)", color: "#0072CE" }}>{row.b}</td>
199
+ <td style={{ padding: "12px 16px", fontFamily: "var(--font-mono)", color: "#FF6B00" }}>{row.g}</td>
200
+ <td style={{ padding: "12px 16px" }}>
201
+ <span className={row.winner === "graphrag" ? "badge-orange" : "badge-blue"} style={{ fontSize: "0.6875rem" }}>
202
+ {row.winner === "graphrag" ? "GraphRAG" : "Baseline"}
203
+ </span>
204
+ </td>
205
+ </tr>
206
+ ))}
207
+ </tbody>
208
+ </table>
209
+ </div>
210
+ </div>
211
+ </>
212
+ )}
213
+
214
+ {/* Report */}
215
+ {report && (
216
+ <div className="card-dark">
217
+ <div className="code-window-header">
218
+ <div className="code-window-dot code-window-dot-red" />
219
+ <div className="code-window-dot code-window-dot-yellow" />
220
+ <div className="code-window-dot code-window-dot-green" />
221
+ <span className="body-sm" style={{ color: "#a09d96", marginLeft: "8px" }}>benchmark_report.txt</span>
222
+ </div>
223
+ <pre className="code-window-body" style={{ whiteSpace: "pre-wrap", fontSize: "0.8125rem" }}>
224
+ {report}
225
+ </pre>
226
  </div>
227
+ )}
228
  </div>
229
  );
230
  }