"use client"; import { useState } from "react"; import { RadarChart, Radar, PolarGrid, PolarAngleAxis, ResponsiveContainer, Tooltip, Legend, BarChart, Bar, XAxis, YAxis, CartesianGrid, } from "recharts"; interface AggregateData { numSamples: number; baseline: { avgF1: number; avgEM: number; avgTokens: number; avgCost: number; avgLatency: number }; graphrag: { avgF1: number; avgEM: number; avgTokens: number; avgCost: number; avgLatency: number }; graphragF1WinRate: number; byType: { bridge?: { count: number; baselineF1: number; graphragF1: number } | null; comparison?: { count: number; baselineF1: number; graphragF1: number } | null; }; } const INITIAL: AggregateData = { numSamples: 0, baseline: { avgF1: 0, avgEM: 0, avgTokens: 0, avgCost: 0, avgLatency: 0 }, graphrag: { avgF1: 0, avgEM: 0, avgTokens: 0, avgCost: 0, avgLatency: 0 }, graphragF1WinRate: 0, byType: {}, }; export function Benchmark() { const [running, setRunning] = useState(false); const [samples, setSamples] = useState(10); const [data, setData] = useState(INITIAL); const [report, setReport] = useState(""); const [demoMode, setDemoMode] = useState(false); const [hasResults, setHasResults] = useState(false); const runBenchmark = async () => { setRunning(true); setReport("Running benchmark..."); try { const res = await fetch("/api/benchmark", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ numSamples: samples }), }); const result = await res.json(); setData(result.aggregate); setDemoMode(result.demoMode ?? false); setHasResults(true); // Build report text const a = result.aggregate; const lines = [ `BENCHMARK RESULTS (${a.numSamples} samples, ${result.provider}/${result.model})`, `${result.demoMode ? "⚠️ DEMO MODE — Set API key for real results" : "✅ LIVE RESULTS"}`, "", `Metric Baseline GraphRAG Winner`, `${"─".repeat(60)}`, `Avg F1 ${a.baseline.avgF1.toFixed(4)} ${a.graphrag.avgF1.toFixed(4)} ${a.graphrag.avgF1 > a.baseline.avgF1 ? "GraphRAG" : "Baseline"}`, `Avg EM ${a.baseline.avgEM.toFixed(4)} ${a.graphrag.avgEM.toFixed(4)} ${a.graphrag.avgEM > a.baseline.avgEM ? "GraphRAG" : "Baseline"}`, `Avg Tokens ${a.baseline.avgTokens} ${a.graphrag.avgTokens} ${a.baseline.avgTokens < a.graphrag.avgTokens ? "Baseline" : "GraphRAG"}`, `Avg Cost ($) ${a.baseline.avgCost.toFixed(6)} ${a.graphrag.avgCost.toFixed(6)}`, `Avg Latency (ms) ${a.baseline.avgLatency} ${a.graphrag.avgLatency}`, "", `GraphRAG F1 Win Rate: ${(a.graphragF1WinRate * 100).toFixed(0)}%`, `Token Ratio: ${a.graphrag.avgTokens > 0 && a.baseline.avgTokens > 0 ? (a.graphrag.avgTokens / a.baseline.avgTokens).toFixed(1) : "N/A"}x`, ]; setReport(lines.join("\n")); } catch (err) { setReport(`Error: ${err}`); } setRunning(false); }; const radarData = hasResults ? [ { metric: "F1 Score", Baseline: +(data.baseline.avgF1 * 100).toFixed(0), GraphRAG: +(data.graphrag.avgF1 * 100).toFixed(0) }, { metric: "Exact Match", Baseline: +(data.baseline.avgEM * 100).toFixed(0), GraphRAG: +(data.graphrag.avgEM * 100).toFixed(0) }, { metric: "Speed", Baseline: 85, GraphRAG: Math.max(10, 100 - Math.round(data.graphrag.avgLatency / Math.max(data.baseline.avgLatency, 1) * 30)) }, { metric: "Cost Eff.", Baseline: 85, GraphRAG: Math.max(10, 100 - Math.round(data.graphrag.avgCost / Math.max(data.baseline.avgCost, 0.000001) * 20)) }, { metric: "Win Rate", Baseline: +((1 - data.graphragF1WinRate) * 100).toFixed(0), GraphRAG: +(data.graphragF1WinRate * 100).toFixed(0) }, ] : []; const typeData = []; if (data.byType.bridge) typeData.push({ name: "Bridge", Baseline: +(data.byType.bridge.baselineF1 * 100).toFixed(1), GraphRAG: +(data.byType.bridge.graphragF1 * 100).toFixed(1) }); if (data.byType.comparison) typeData.push({ name: "Comparison", Baseline: +(data.byType.comparison.baselineF1 * 100).toFixed(1), GraphRAG: +(data.byType.comparison.graphragF1 * 100).toFixed(1) }); return (
{/* Controls */}
Batch Benchmark

Run both pipelines on HotpotQA questions and evaluate F1, EM, tokens, cost

{demoMode && hasResults && (
⚠️ Demo mode — showing simulated results. Set an API key for real benchmark data.
)}
{hasResults && ( <> {/* Summary Cards */}
{[ { label: "Avg F1 (Baseline)", value: data.baseline.avgF1.toFixed(4), color: "#0072CE" }, { label: "Avg F1 (GraphRAG)", value: data.graphrag.avgF1.toFixed(4), color: "#FF6B00" }, { label: "GraphRAG Win Rate", value: (data.graphragF1WinRate * 100).toFixed(0) + "%", color: "#5db872" }, { label: "Samples Evaluated", value: data.numSamples.toString(), color: "#002B49" }, ].map((m, i) => (
{m.value}
{m.label}
))}
{/* Radar */} {radarData.length > 0 && (
Multi-Metric Radar
)} {/* By Type */} {typeData.length > 0 && (
F1 by Question Type
)}
{/* Detailed Table */}
Detailed Comparison
{["Metric", "Baseline RAG", "GraphRAG", "Winner"].map(h => ( ))} {[ { metric: "Avg F1 Score", b: data.baseline.avgF1.toFixed(4), g: data.graphrag.avgF1.toFixed(4), winner: data.graphrag.avgF1 > data.baseline.avgF1 ? "graphrag" : "baseline" }, { metric: "Avg Exact Match", b: data.baseline.avgEM.toFixed(4), g: data.graphrag.avgEM.toFixed(4), winner: data.graphrag.avgEM > data.baseline.avgEM ? "graphrag" : "baseline" }, { metric: "Avg Tokens/Query", b: data.baseline.avgTokens.toString(), g: data.graphrag.avgTokens.toString(), winner: data.baseline.avgTokens < data.graphrag.avgTokens ? "baseline" : "graphrag" }, { metric: "Avg Cost ($)", b: "$" + data.baseline.avgCost.toFixed(6), g: "$" + data.graphrag.avgCost.toFixed(6), winner: data.baseline.avgCost < data.graphrag.avgCost ? "baseline" : "graphrag" }, { metric: "Avg Latency (ms)", b: data.baseline.avgLatency.toString(), g: data.graphrag.avgLatency.toString(), winner: data.baseline.avgLatency < data.graphrag.avgLatency ? "baseline" : "graphrag" }, { metric: "F1 Win Rate", b: ((1 - data.graphragF1WinRate) * 100).toFixed(0) + "%", g: (data.graphragF1WinRate * 100).toFixed(0) + "%", winner: data.graphragF1WinRate > 0.5 ? "graphrag" : "baseline" }, ].map((row, i) => ( ))}
{h}
{row.metric} {row.b} {row.g} {row.winner === "graphrag" ? "GraphRAG ✓" : "Baseline ✓"}
)} {/* Report */} {report && (
benchmark_report.txt
            {report}
          
)}
); }