Update Benchmark tab with a live 'Run Benchmark Now' button that calls the API and populates real data
Browse files- web/src/components/tabs/Benchmark.tsx +187 -115
web/src/components/tabs/Benchmark.tsx
CHANGED
|
@@ -7,35 +7,82 @@ import {
|
|
| 7 |
BarChart, Bar, XAxis, YAxis, CartesianGrid,
|
| 8 |
} from "recharts";
|
| 9 |
|
| 10 |
-
|
| 11 |
-
numSamples:
|
| 12 |
-
baseline: { avgF1:
|
| 13 |
-
graphrag: { avgF1:
|
| 14 |
-
|
| 15 |
-
byType:
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
}
|
| 20 |
|
| 21 |
-
const
|
| 22 |
-
|
| 23 |
-
{
|
| 24 |
-
{
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
|
| 29 |
export function Benchmark() {
|
| 30 |
const [running, setRunning] = useState(false);
|
| 31 |
-
const [samples, setSamples] = useState(
|
| 32 |
-
const [data] = useState(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
return (
|
| 41 |
<div>
|
|
@@ -45,114 +92,139 @@ export function Benchmark() {
|
|
| 45 |
<div>
|
| 46 |
<div className="display-sm mb-2">Batch Benchmark</div>
|
| 47 |
<p className="body-sm" style={{ color: "#6c6a64" }}>
|
| 48 |
-
Run both pipelines on HotpotQA
|
| 49 |
</p>
|
| 50 |
</div>
|
| 51 |
<div className="flex items-center gap-4 ml-auto">
|
| 52 |
<label className="caption">
|
| 53 |
Samples
|
| 54 |
-
<input
|
| 55 |
-
|
| 56 |
-
min={10}
|
| 57 |
-
max={500}
|
| 58 |
-
step={10}
|
| 59 |
-
value={samples}
|
| 60 |
-
onChange={(e) => setSamples(+e.target.value)}
|
| 61 |
-
className="block w-32 mt-1 accent-[#FF6B00]"
|
| 62 |
-
/>
|
| 63 |
<span className="body-sm font-mono">{samples}</span>
|
| 64 |
</label>
|
| 65 |
-
<button className="btn btn-primary" onClick={
|
| 66 |
-
{running ?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
</button>
|
| 68 |
</div>
|
| 69 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
</div>
|
| 71 |
|
| 72 |
-
{
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
</div>
|
| 84 |
-
))}
|
| 85 |
-
</div>
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
| 102 |
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
</div>
|
| 155 |
-
|
| 156 |
</div>
|
| 157 |
);
|
| 158 |
}
|
|
|
|
| 7 |
BarChart, Bar, XAxis, YAxis, CartesianGrid,
|
| 8 |
} from "recharts";
|
| 9 |
|
| 10 |
+
/**
 * Averaged metrics for one pipeline over a benchmark run.
 * The UI labels `avgCost` with "$" and `avgLatency` with "ms".
 */
interface PipelineMetrics {
  avgF1: number;
  avgEM: number;
  avgTokens: number;
  avgCost: number;
  avgLatency: number;
}

/** F1 scores of both pipelines for a single question type. */
interface QuestionTypeStats {
  count: number;
  baselineF1: number;
  graphragF1: number;
}

/**
 * Aggregate benchmark result consumed by the Benchmark tab.
 * `graphragF1WinRate` is multiplied by 100 for display, so it is treated as a
 * fraction; a question type that is absent or `null` in `byType` is simply not charted.
 */
interface AggregateData {
  numSamples: number;
  baseline: PipelineMetrics;
  graphrag: PipelineMetrics;
  graphragF1WinRate: number;
  byType: {
    bridge?: QuestionTypeStats | null;
    comparison?: QuestionTypeStats | null;
  };
}
|
| 20 |
|
| 21 |
+
// Builds a fresh all-zero metrics record so the two pipelines never share one object.
const zeroMetrics = () => ({ avgF1: 0, avgEM: 0, avgTokens: 0, avgCost: 0, avgLatency: 0 });

// Placeholder aggregate used as the initial `data` state before any benchmark has run.
const INITIAL: AggregateData = {
  numSamples: 0,
  baseline: zeroMetrics(),
  graphrag: zeroMetrics(),
  graphragF1WinRate: 0,
  byType: {},
};
|
| 28 |
|
| 29 |
export function Benchmark() {
|
| 30 |
const [running, setRunning] = useState(false);
|
| 31 |
+
const [samples, setSamples] = useState(10);
|
| 32 |
+
const [data, setData] = useState<AggregateData>(INITIAL);
|
| 33 |
+
const [report, setReport] = useState("");
|
| 34 |
+
const [demoMode, setDemoMode] = useState(false);
|
| 35 |
+
const [hasResults, setHasResults] = useState(false);
|
| 36 |
+
|
| 37 |
+
/**
 * Runs one benchmark batch: POSTs the selected sample count to `/api/benchmark`,
 * then publishes the returned aggregate, the demo-mode flag and a plain-text
 * report to component state.
 *
 * Any failure (network error, non-2xx status, malformed payload) is shown in the
 * report panel instead of being thrown, and leaves previously stored results
 * untouched. The running flag is always cleared when the call settles.
 */
const runBenchmark = async () => {
  setRunning(true);
  setReport("Running benchmark...");
  try {
    const res = await fetch("/api/benchmark", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ numSamples: samples }),
    });
    // fetch() only rejects on network failure; HTTP error statuses must be checked explicitly.
    if (!res.ok) {
      throw new Error(`Benchmark request failed (HTTP ${res.status})`);
    }

    const result = await res.json();
    const a = result?.aggregate;
    // Validate before touching state: the render path dereferences these objects,
    // so storing an incomplete aggregate would crash the component.
    if (!a || typeof a !== "object" || !a.baseline || !a.graphrag || !a.byType) {
      throw new Error("Benchmark response is missing aggregate results");
    }

    // Only append the "x" suffix when a ratio can actually be computed.
    const tokenRatio =
      a.graphrag.avgTokens > 0 && a.baseline.avgTokens > 0
        ? `${(a.graphrag.avgTokens / a.baseline.avgTokens).toFixed(1)}x`
        : "N/A";

    // Build the report before committing any state so a formatting failure
    // (e.g. a missing numeric field) cannot leave the UI half-updated.
    const lines = [
      `BENCHMARK RESULTS (${a.numSamples} samples, ${result.provider}/${result.model})`,
      `${result.demoMode ? "⚠️ DEMO MODE — Set API key for real results" : "✅ LIVE RESULTS"}`,
      "",
      `Metric Baseline GraphRAG Winner`,
      `${"─".repeat(60)}`,
      `Avg F1 ${a.baseline.avgF1.toFixed(4)} ${a.graphrag.avgF1.toFixed(4)} ${a.graphrag.avgF1 > a.baseline.avgF1 ? "GraphRAG" : "Baseline"}`,
      `Avg EM ${a.baseline.avgEM.toFixed(4)} ${a.graphrag.avgEM.toFixed(4)} ${a.graphrag.avgEM > a.baseline.avgEM ? "GraphRAG" : "Baseline"}`,
      `Avg Tokens ${a.baseline.avgTokens} ${a.graphrag.avgTokens} ${a.baseline.avgTokens < a.graphrag.avgTokens ? "Baseline" : "GraphRAG"}`,
      `Avg Cost ($) ${a.baseline.avgCost.toFixed(6)} ${a.graphrag.avgCost.toFixed(6)}`,
      `Avg Latency (ms) ${a.baseline.avgLatency} ${a.graphrag.avgLatency}`,
      "",
      `GraphRAG F1 Win Rate: ${(a.graphragF1WinRate * 100).toFixed(0)}%`,
      `Token Ratio: ${tokenRatio}`,
    ];
    const reportText = lines.join("\n");

    setData(a);
    setDemoMode(result.demoMode ?? false);
    setHasResults(true);
    setReport(reportText);
  } catch (err) {
    // Use the message alone for Error objects to avoid a doubled "Error: Error: …" prefix.
    setReport(`Error: ${err instanceof Error ? err.message : String(err)}`);
  } finally {
    setRunning(false);
  }
};
|
| 74 |
+
|
| 75 |
+
// Radar-chart rows comparing both pipelines; stays empty until a run has produced results.
// F1, Exact Match and Win Rate are whole-number percentages. For Speed and Cost Eff. the
// baseline is pinned at 85 and GraphRAG is scored from its ratio to the baseline, floored at 10.
const radarData = (() => {
  if (!hasResults) return [];

  const { baseline, graphrag, graphragF1WinRate: winRate } = data;
  const asPercent = (fraction: number) => Number((fraction * 100).toFixed(0));

  // Denominators are clamped so a zero baseline cannot divide by zero.
  const latencyRatio = graphrag.avgLatency / Math.max(baseline.avgLatency, 1);
  const costRatio = graphrag.avgCost / Math.max(baseline.avgCost, 0.000001);

  return [
    { metric: "F1 Score", Baseline: asPercent(baseline.avgF1), GraphRAG: asPercent(graphrag.avgF1) },
    { metric: "Exact Match", Baseline: asPercent(baseline.avgEM), GraphRAG: asPercent(graphrag.avgEM) },
    { metric: "Speed", Baseline: 85, GraphRAG: Math.max(10, 100 - Math.round(latencyRatio * 30)) },
    { metric: "Cost Eff.", Baseline: 85, GraphRAG: Math.max(10, 100 - Math.round(costRatio * 20)) },
    { metric: "Win Rate", Baseline: asPercent(1 - winRate), GraphRAG: asPercent(winRate) },
  ];
})();
|
| 82 |
+
|
| 83 |
+
// Bar-chart rows of F1 (as a percentage with one decimal) per question type.
// A type is included only when the aggregate carries stats for it.
const typeData: { name: string; Baseline: number; GraphRAG: number }[] = [];
const statsByLabel = [
  ["Bridge", data.byType.bridge],
  ["Comparison", data.byType.comparison],
] as const;
for (const [label, stats] of statsByLabel) {
  if (!stats) continue;
  typeData.push({
    name: label,
    Baseline: Number((stats.baselineF1 * 100).toFixed(1)),
    GraphRAG: Number((stats.graphragF1 * 100).toFixed(1)),
  });
}
|
| 86 |
|
| 87 |
return (
|
| 88 |
<div>
|
|
|
|
| 92 |
<div>
|
| 93 |
<div className="display-sm mb-2">Batch Benchmark</div>
|
| 94 |
<p className="body-sm" style={{ color: "#6c6a64" }}>
|
| 95 |
+
Run both pipelines on HotpotQA questions and evaluate F1, EM, tokens, cost
|
| 96 |
</p>
|
| 97 |
</div>
|
| 98 |
<div className="flex items-center gap-4 ml-auto">
|
| 99 |
<label className="caption">
|
| 100 |
Samples
|
| 101 |
+
<input type="range" min={5} max={10} step={1} value={samples}
|
| 102 |
+
onChange={e => setSamples(+e.target.value)} className="block w-32 mt-1 accent-[#FF6B00]" />
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
<span className="body-sm font-mono">{samples}</span>
|
| 104 |
</label>
|
| 105 |
+
<button className="btn btn-primary" onClick={runBenchmark} disabled={running}>
|
| 106 |
+
{running ? (
|
| 107 |
+
<span className="flex items-center gap-2">
|
| 108 |
+
<span className="animate-spin inline-block w-4 h-4 border-2 border-white border-t-transparent rounded-full" />
|
| 109 |
+
Running…
|
| 110 |
+
</span>
|
| 111 |
+
) : "🏃 Run Benchmark Now"}
|
| 112 |
</button>
|
| 113 |
</div>
|
| 114 |
</div>
|
| 115 |
+
{demoMode && hasResults && (
|
| 116 |
+
<div className="mt-3 body-sm" style={{ color: "#d4a017" }}>
|
| 117 |
+
⚠️ Demo mode — showing simulated results. Set an API key for real benchmark data.
|
| 118 |
+
</div>
|
| 119 |
+
)}
|
| 120 |
</div>
|
| 121 |
|
| 122 |
+
{hasResults && (
|
| 123 |
+
<>
|
| 124 |
+
{/* Summary Cards */}
|
| 125 |
+
<div className="grid grid-cols-2 md:grid-cols-4 gap-4 mb-6">
|
| 126 |
+
{[
|
| 127 |
+
{ label: "Avg F1 (Baseline)", value: data.baseline.avgF1.toFixed(4), color: "#0072CE" },
|
| 128 |
+
{ label: "Avg F1 (GraphRAG)", value: data.graphrag.avgF1.toFixed(4), color: "#FF6B00" },
|
| 129 |
+
{ label: "GraphRAG Win Rate", value: (data.graphragF1WinRate * 100).toFixed(0) + "%", color: "#5db872" },
|
| 130 |
+
{ label: "Samples Evaluated", value: data.numSamples.toString(), color: "#002B49" },
|
| 131 |
+
].map((m, i) => (
|
| 132 |
+
<div key={i} className="card-cream text-center" style={{ padding: "20px" }}>
|
| 133 |
+
<div className="metric-value-sm" style={{ color: m.color }}>{m.value}</div>
|
| 134 |
+
<div className="metric-label">{m.label}</div>
|
| 135 |
+
</div>
|
| 136 |
+
))}
|
| 137 |
</div>
|
|
|
|
|
|
|
| 138 |
|
| 139 |
+
<div className="grid grid-cols-1 lg:grid-cols-2 gap-6 mb-6">
|
| 140 |
+
{/* Radar */}
|
| 141 |
+
{radarData.length > 0 && (
|
| 142 |
+
<div className="card">
|
| 143 |
+
<div className="title-md mb-4">Multi-Metric Radar</div>
|
| 144 |
+
<ResponsiveContainer width="100%" height={340}>
|
| 145 |
+
<RadarChart data={radarData}>
|
| 146 |
+
<PolarGrid stroke="#002B49" strokeOpacity={0.12} />
|
| 147 |
+
<PolarAngleAxis dataKey="metric" tick={{ fill: "#6c6a64", fontSize: 12 }} />
|
| 148 |
+
<Radar name="Baseline" dataKey="Baseline" stroke="#0072CE" fill="#0072CE" fillOpacity={0.15} strokeWidth={2} />
|
| 149 |
+
<Radar name="GraphRAG" dataKey="GraphRAG" stroke="#FF6B00" fill="#FF6B00" fillOpacity={0.15} strokeWidth={2} />
|
| 150 |
+
<Legend /><Tooltip contentStyle={{ background: "#faf9f5", border: "1px solid #e6dfd8", borderRadius: "8px" }} />
|
| 151 |
+
</RadarChart>
|
| 152 |
+
</ResponsiveContainer>
|
| 153 |
+
</div>
|
| 154 |
+
)}
|
| 155 |
|
| 156 |
+
{/* By Type */}
|
| 157 |
+
{typeData.length > 0 && (
|
| 158 |
+
<div className="card">
|
| 159 |
+
<div className="title-md mb-4">F1 by Question Type</div>
|
| 160 |
+
<ResponsiveContainer width="100%" height={340}>
|
| 161 |
+
<BarChart data={typeData} margin={{ top: 20, right: 20, left: 0, bottom: 0 }}>
|
| 162 |
+
<CartesianGrid strokeDasharray="3 3" stroke="#002B49" strokeOpacity={0.08} />
|
| 163 |
+
<XAxis dataKey="name" tick={{ fill: "#6c6a64", fontSize: 13 }} />
|
| 164 |
+
<YAxis domain={[0, 100]} tick={{ fill: "#6c6a64", fontSize: 12 }} />
|
| 165 |
+
<Tooltip contentStyle={{ background: "#faf9f5", border: "1px solid #e6dfd8", borderRadius: "8px" }} />
|
| 166 |
+
<Legend />
|
| 167 |
+
<Bar dataKey="Baseline" fill="#0072CE" radius={[4, 4, 0, 0]} />
|
| 168 |
+
<Bar dataKey="GraphRAG" fill="#FF6B00" radius={[4, 4, 0, 0]} />
|
| 169 |
+
</BarChart>
|
| 170 |
+
</ResponsiveContainer>
|
| 171 |
+
</div>
|
| 172 |
+
)}
|
| 173 |
+
</div>
|
| 174 |
|
| 175 |
+
{/* Detailed Table */}
|
| 176 |
+
<div className="card mb-6">
|
| 177 |
+
<div className="title-md mb-4">Detailed Comparison</div>
|
| 178 |
+
<div className="overflow-x-auto">
|
| 179 |
+
<table style={{ width: "100%", borderCollapse: "collapse", fontSize: "0.875rem" }}>
|
| 180 |
+
<thead>
|
| 181 |
+
<tr style={{ borderBottom: "2px solid var(--color-hairline)" }}>
|
| 182 |
+
{["Metric", "Baseline RAG", "GraphRAG", "Winner"].map(h => (
|
| 183 |
+
<th key={h} className="caption-uppercase text-left" style={{ padding: "12px 16px" }}>{h}</th>
|
| 184 |
+
))}
|
| 185 |
+
</tr>
|
| 186 |
+
</thead>
|
| 187 |
+
<tbody>
|
| 188 |
+
{[
|
| 189 |
+
{ metric: "Avg F1 Score", b: data.baseline.avgF1.toFixed(4), g: data.graphrag.avgF1.toFixed(4), winner: data.graphrag.avgF1 > data.baseline.avgF1 ? "graphrag" : "baseline" },
|
| 190 |
+
{ metric: "Avg Exact Match", b: data.baseline.avgEM.toFixed(4), g: data.graphrag.avgEM.toFixed(4), winner: data.graphrag.avgEM > data.baseline.avgEM ? "graphrag" : "baseline" },
|
| 191 |
+
{ metric: "Avg Tokens/Query", b: data.baseline.avgTokens.toString(), g: data.graphrag.avgTokens.toString(), winner: data.baseline.avgTokens < data.graphrag.avgTokens ? "baseline" : "graphrag" },
|
| 192 |
+
{ metric: "Avg Cost ($)", b: "$" + data.baseline.avgCost.toFixed(6), g: "$" + data.graphrag.avgCost.toFixed(6), winner: data.baseline.avgCost < data.graphrag.avgCost ? "baseline" : "graphrag" },
|
| 193 |
+
{ metric: "Avg Latency (ms)", b: data.baseline.avgLatency.toString(), g: data.graphrag.avgLatency.toString(), winner: data.baseline.avgLatency < data.graphrag.avgLatency ? "baseline" : "graphrag" },
|
| 194 |
+
{ metric: "F1 Win Rate", b: ((1 - data.graphragF1WinRate) * 100).toFixed(0) + "%", g: (data.graphragF1WinRate * 100).toFixed(0) + "%", winner: data.graphragF1WinRate > 0.5 ? "graphrag" : "baseline" },
|
| 195 |
+
].map((row, i) => (
|
| 196 |
+
<tr key={i} style={{ borderBottom: "1px solid var(--color-hairline-soft)" }}>
|
| 197 |
+
<td className="title-sm" style={{ padding: "12px 16px" }}>{row.metric}</td>
|
| 198 |
+
<td style={{ padding: "12px 16px", fontFamily: "var(--font-mono)", color: "#0072CE" }}>{row.b}</td>
|
| 199 |
+
<td style={{ padding: "12px 16px", fontFamily: "var(--font-mono)", color: "#FF6B00" }}>{row.g}</td>
|
| 200 |
+
<td style={{ padding: "12px 16px" }}>
|
| 201 |
+
<span className={row.winner === "graphrag" ? "badge-orange" : "badge-blue"} style={{ fontSize: "0.6875rem" }}>
|
| 202 |
+
{row.winner === "graphrag" ? "GraphRAG ✓" : "Baseline ✓"}
|
| 203 |
+
</span>
|
| 204 |
+
</td>
|
| 205 |
+
</tr>
|
| 206 |
+
))}
|
| 207 |
+
</tbody>
|
| 208 |
+
</table>
|
| 209 |
+
</div>
|
| 210 |
+
</div>
|
| 211 |
+
</>
|
| 212 |
+
)}
|
| 213 |
+
|
| 214 |
+
{/* Report */}
|
| 215 |
+
{report && (
|
| 216 |
+
<div className="card-dark">
|
| 217 |
+
<div className="code-window-header">
|
| 218 |
+
<div className="code-window-dot code-window-dot-red" />
|
| 219 |
+
<div className="code-window-dot code-window-dot-yellow" />
|
| 220 |
+
<div className="code-window-dot code-window-dot-green" />
|
| 221 |
+
<span className="body-sm" style={{ color: "#a09d96", marginLeft: "8px" }}>benchmark_report.txt</span>
|
| 222 |
+
</div>
|
| 223 |
+
<pre className="code-window-body" style={{ whiteSpace: "pre-wrap", fontSize: "0.8125rem" }}>
|
| 224 |
+
{report}
|
| 225 |
+
</pre>
|
| 226 |
</div>
|
| 227 |
+
)}
|
| 228 |
</div>
|
| 229 |
);
|
| 230 |
}
|