"""
GraphRAG Comparison Dashboard — 4-Tab Gradio UI
================================================
Tab 1: Live Query Comparison (side-by-side)
Tab 2: Batch Benchmark Results (HotpotQA)
Tab 3: Cost Analysis (projections + distributions)
Tab 4: Graph Explorer (interactive knowledge graph + reasoning paths)
Novelties: Adaptive routing, graph reasoning explanations, real-time cost tracking
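Run locally (assuming this file is saved as app.py):
    OPENAI_API_KEY=...   required for the LLM and embedding calls
    TG_HOST=...          optional; enables the TigerGraph connection
    LLM_MODEL=...        optional model override (defaults to gpt-4o-mini)
    python app.py        serves the dashboard at http://localhost:7860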
"""
import logging
import os

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from graphrag.layers.graph_layer import GraphLayer
from graphrag.layers.llm_layer import LLMLayer
from graphrag.layers.orchestration_layer import InferenceOrchestrator, EmbeddingManager
from graphrag.layers.evaluation_layer import EvaluationLayer
from graphrag.benchmark import BenchmarkRunner
logger = logging.getLogger(__name__)
# ── Global State ─────────────────────────────────────────
orchestrator = None
evaluator = None
benchmark_runner = None
_initialized = False
_benchmark_results = []
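# Heavy components are initialized lazily (on first button click or first query)
# rather than at import time, so the UI can load before credentials are configured.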
def initialize_system():
"""Initialize all components."""
global orchestrator, evaluator, benchmark_runner, _initialized
if _initialized:
return "✅ System already initialized."
llm = LLMLayer(api_key=os.getenv("OPENAI_API_KEY", ""),
model=os.getenv("LLM_MODEL", "gpt-4o-mini"))
llm.initialize()
embedder = EmbeddingManager(provider="openai", model="text-embedding-3-small",
api_key=os.getenv("OPENAI_API_KEY", ""))
embedder.initialize()
graph = GraphLayer()
# Connect to TigerGraph only when a host is configured; otherwise the app
# runs in demo mode with canned passages.
tg_host = os.getenv("TG_HOST", "")
if tg_host:
graph.connect()
orchestrator = InferenceOrchestrator(graph_layer=graph, llm_layer=llm, embedder=embedder)
orchestrator.initialize()
evaluator = EvaluationLayer(eval_llm_model=os.getenv("LLM_MODEL", "gpt-4o-mini"),
api_key=os.getenv("OPENAI_API_KEY", ""))
evaluator.initialize()
benchmark_runner = BenchmarkRunner(orchestrator, evaluator)
_initialized = True
return f"✅ System initialized successfully! (LLM: {llm.model})"
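# Example (hypothetical session; assumes OPENAI_API_KEY is set):
#   >>> initialize_system()
#   '✅ System initialized successfully! (LLM: gpt-4o-mini)'
#   >>> initialize_system()
#   '✅ System already initialized.'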
# ── Tab 1: Live Query Comparison ─────────────────────────
def run_live_comparison(query, enable_adaptive, top_k, hops):
if not query.strip():
return ("Please enter a query.", "", "", "", 0, 0, 0, 0, 0, 0, None, "", "", "")
if not _initialized:
initialize_system()
try:
passages = _get_demo_passages(query)
if enable_adaptive:
comparison = orchestrator.run_adaptive(query, passages)
else:
comparison = orchestrator.run_comparison(query, passages, int(top_k), int(hops))
b, g = comparison.baseline, comparison.graphrag
fig = _build_comparison_chart(b, g)
baseline_ctx = "\n\n---\n\n".join([
f"**[{i+1}]:** {c[:300]}{'...' if len(c) > 300 else ''}"
for i, c in enumerate(b.contexts[:5])
]) or "No contexts."
graphrag_ctx = "\n\n---\n\n".join([
f"**[{i+1}]:** {c[:300]}{'...' if len(c) > 300 else ''}"
for i, c in enumerate(g.contexts[:5])
]) or "No contexts."
entities_display = ""
if g.entities_found:
entities_display = "**Entities Found:**\n" + "\n".join(
[f"- 🔵 **{e.get('name','N/A')}** ({e.get('entity_type','N/A')})"
for e in g.entities_found[:8]])
if g.relations_traversed:
entities_display += "\n\n**Relationships:**\n" + "\n".join(
[f"- 🔗 {r}" for r in g.relations_traversed[:8]])
routing_info = ""
if enable_adaptive:
routing_info = (
f"**🧠 Adaptive Routing:**\n"
f"- Complexity: {g.complexity_score:.2f} | Type: {g.query_type}\n"
f"- Recommended: **{comparison.recommended_pipeline.upper()}**\n"
f"- {comparison.routing_reason}")
return ("✅ Done!", b.answer, g.answer, routing_info,
b.total_tokens, g.total_tokens,
round(b.latency_ms, 1), round(g.latency_ms, 1),
round(b.cost_usd, 6), round(g.cost_usd, 6),
fig, baseline_ctx, graphrag_ctx, entities_display)
except Exception as e:
return (f"❌ Error: {e}", "", "", "", 0, 0, 0, 0, 0, 0, None, "", "", "")
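# Note: the 14-tuple returned by run_live_comparison (both success and error paths)
# must stay in sync with the `outputs=` list wired to run_btn.click() below.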
def _get_demo_passages(query):
try:
from datasets import load_dataset
ds = load_dataset("hotpotqa/hotpot_qa", "distractor", split="validation", streaming=True)
for row in ds:
# Each passage is a flat "Title: sentence1 sentence2 ..." string, matching
# HotpotQA's distractor-setting context format.
return [f"{t}: {' '.join(s)}"
for t, s in zip(row["context"]["title"], row["context"]["sentences"])]
except Exception:
pass
return ["Demo passage. Connect TigerGraph for full functionality.",
"GraphRAG extracts entities and relationships for better retrieval.",
"The system supports both baseline RAG and GraphRAG pipelines."]
def _build_comparison_chart(baseline, graphrag):
fig = make_subplots(rows=1, cols=3, subplot_titles=("Tokens", "Latency (ms)", "Cost ($)"),
horizontal_spacing=0.12)
colors = ["#3498db", "#e74c3c"]
methods = ["Baseline", "GraphRAG"]
fig.add_trace(go.Bar(x=methods, y=[baseline.total_tokens, graphrag.total_tokens],
marker_color=colors, text=[baseline.total_tokens, graphrag.total_tokens],
textposition='auto', showlegend=False), row=1, col=1)
fig.add_trace(go.Bar(x=methods, y=[baseline.latency_ms, graphrag.latency_ms],
marker_color=colors, text=[f"{baseline.latency_ms:.0f}", f"{graphrag.latency_ms:.0f}"],
textposition='auto', showlegend=False), row=1, col=2)
fig.add_trace(go.Bar(x=methods, y=[baseline.cost_usd, graphrag.cost_usd],
marker_color=colors, text=[f"${baseline.cost_usd:.6f}", f"${graphrag.cost_usd:.6f}"],
textposition='auto', showlegend=False), row=1, col=3)
fig.update_layout(height=350, margin=dict(t=40, b=20, l=20, r=20),
paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)')
return fig
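# Reading the comparison chart: all three panels are lower-is-better
# (fewer tokens, lower latency, lower cost).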
# ── Tab 2: Batch Benchmark ───────────────────────────────
def run_batch_benchmark(num_samples, top_k, hops, progress=gr.Progress()):
global _benchmark_results
if not _initialized:
initialize_system()
def progress_cb(cur, tot, _):
progress(cur / tot, desc=f"Processing {cur}/{tot}...")
try:
results = benchmark_runner.run_hotpotqa_benchmark(
num_samples=int(num_samples), top_k=int(top_k), hops=int(hops),
progress_callback=progress_cb)
_benchmark_results = results.get("results", [])
agg = results.get("aggregate", {})
report = results.get("report", "")
if not _benchmark_results:
return "No results.", None, None, None, report
summary = pd.DataFrame({
"Metric": ["Avg F1", "Avg EM", "Avg Tokens", "Avg Cost ($)", "Avg Latency (ms)", "F1 Win Rate"],
"Baseline RAG": [
f"{agg['baseline']['avg_f1']:.4f}", f"{agg['baseline']['avg_em']:.4f}",
f"{agg['baseline']['avg_tokens']:.0f}", f"${agg['baseline']['avg_cost']:.6f}",
f"{agg['baseline']['avg_latency_ms']:.0f}",
f"{1 - agg.get('graphrag_f1_win_rate', 0.5):.1%}"],
"GraphRAG": [
f"{agg['graphrag']['avg_f1']:.4f}", f"{agg['graphrag']['avg_em']:.4f}",
f"{agg['graphrag']['avg_tokens']:.0f}", f"${agg['graphrag']['avg_cost']:.6f}",
f"{agg['graphrag']['avg_latency_ms']:.0f}",
f"{agg.get('graphrag_f1_win_rate', 0.5):.1%}"]
})
bar_fig = _build_benchmark_bar(agg)
radar_fig = _build_radar(agg)
return (f"✅ Done! {len(_benchmark_results)} samples.", summary, bar_fig, radar_fig, report)
except Exception as e:
return f"❌ Error: {e}", None, None, None, ""
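# Benchmark rows are cached in _benchmark_results so the Cost Analysis tab can
# project from measured per-query tokens/costs instead of fallback estimates.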
def _build_benchmark_bar(agg):
metrics = ["F1", "EM", "Context Hit"]
bvals = [agg["baseline"]["avg_f1"], agg["baseline"]["avg_em"], agg["baseline"]["avg_context_hit"]]
gvals = [agg["graphrag"]["avg_f1"], agg["graphrag"]["avg_em"], agg["graphrag"]["avg_context_hit"]]
fig = go.Figure(data=[
go.Bar(name="Baseline", x=metrics, y=bvals, marker_color="#3498db",
text=[f"{v:.3f}" for v in bvals], textposition='auto'),
go.Bar(name="GraphRAG", x=metrics, y=gvals, marker_color="#e74c3c",
text=[f"{v:.3f}" for v in gvals], textposition='auto')])
fig.update_layout(barmode='group', title="Answer Quality", yaxis_title="Score", height=400,
paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)')
return fig
def _build_radar(agg):
b, g = agg["baseline"], agg["graphrag"]
cats = ["F1", "EM", "Context Hit", "Token Eff.", "Cost Eff."]
# Efficiency axes: baseline/GraphRAG resource ratios (>1 means GraphRAG is cheaper),
# capped at 2.0; the baseline is normalized to 1.0 on both. Note the radial axis
# below tops out at 1.2, so larger ratios are clipped visually.
te = min(b["avg_tokens"] / max(g["avg_tokens"], 1), 2.0)
ce = min(b["avg_cost"] / max(g["avg_cost"], 0.000001), 2.0)
bv = [b["avg_f1"], b["avg_em"], b["avg_context_hit"], 1.0, 1.0]
gv = [g["avg_f1"], g["avg_em"], g["avg_context_hit"], te, ce]
fig = go.Figure()
fig.add_trace(go.Scatterpolar(r=bv+[bv[0]], theta=cats+[cats[0]], fill='toself',
name='Baseline', line_color='#3498db', opacity=0.6))
fig.add_trace(go.Scatterpolar(r=gv+[gv[0]], theta=cats+[cats[0]], fill='toself',
name='GraphRAG', line_color='#e74c3c', opacity=0.6))
fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1.2])),
title="Multi-Metric Radar", height=450, paper_bgcolor='rgba(0,0,0,0)')
return fig
# ── Tab 3: Cost Analysis ─────────────────────────────────
def compute_cost_analysis(num_queries, model):
pricing = {
"gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
"gpt-4o": {"input": 0.0025, "output": 0.01},
"gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
"claude-3-5-sonnet": {"input": 0.003, "output": 0.015},
"claude-3-haiku": {"input": 0.00025, "output": 0.00125},
}
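# Prices are USD per 1K tokens (input/output). These are approximate list prices
# hard-coded at the time of writing; re-check the providers' pricing pages before
# relying on the projections.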
p = pricing.get(model, pricing["gpt-4o-mini"])
n = int(num_queries)
if _benchmark_results:
ab = sum(r["baseline_tokens"] for r in _benchmark_results) / len(_benchmark_results)
ag = sum(r["graphrag_tokens"] for r in _benchmark_results) / len(_benchmark_results)
acb = sum(r["baseline_cost"] for r in _benchmark_results) / len(_benchmark_results)
acg = sum(r["graphrag_cost"] for r in _benchmark_results) / len(_benchmark_results)
else:
ab, ag = 950, 2400
acb = (800/1000*p["input"] + 150/1000*p["output"])
acg = (2200/1000*p["input"] + 200/1000*p["output"])
summary = pd.DataFrame({
"Metric": ["Avg Tokens", "Cost/Query", f"Total ({n:,}q)", "Monthly (1K qpd)", "Annual"],
"Baseline": [f"{ab:.0f}", f"${acb:.6f}", f"${acb*n:.4f}", f"${acb*1000*30:.2f}", f"${acb*1000*365:.2f}"],
"GraphRAG": [f"{ag:.0f}", f"${acg:.6f}", f"${acg*n:.4f}", f"${acg*1000*30:.2f}", f"${acg*1000*365:.2f}"],
# Token ratio for the first row; cost ratio for the four cost-based rows.
"Ratio": [f"{ag/max(ab,1):.2f}x"] + [f"{acg/max(acb,1e-9):.2f}x"]*4
})
qr = list(range(0, n+1, max(n//50, 1)))
fig_cum = go.Figure()
fig_cum.add_trace(go.Scatter(x=qr, y=[acb*q for q in qr], mode='lines', name='Baseline',
line=dict(color='#3498db', width=3)))
fig_cum.add_trace(go.Scatter(x=qr, y=[acg*q for q in qr], mode='lines', name='GraphRAG',
line=dict(color='#e74c3c', width=3)))
fig_cum.update_layout(title=f"Cumulative Cost ({model})", xaxis_title="Queries", yaxis_title="Cost ($)",
height=400, paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)')
fig_tok = go.Figure()
if _benchmark_results:
fig_tok.add_trace(go.Histogram(x=[r["baseline_tokens"] for r in _benchmark_results],
name="Baseline", opacity=0.7, marker_color="#3498db"))
fig_tok.add_trace(go.Histogram(x=[r["graphrag_tokens"] for r in _benchmark_results],
name="GraphRAG", opacity=0.7, marker_color="#e74c3c"))
fig_tok.update_layout(barmode='overlay', title="Token Distribution", height=400,
paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)')
else:
fig_tok.add_annotation(text="Run benchmark first for distribution", showarrow=False)
return summary, fig_cum, fig_tok
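# Worked example (hypothetical, using the fallback estimates above with
# gpt-4o-mini pricing):
#   baseline = 800/1000*0.00015 + 150/1000*0.0006 = $0.00021/query (~$0.21 per 1K queries)
#   graphrag = 2200/1000*0.00015 + 200/1000*0.0006 = $0.00045/query (~$0.45 per 1K queries)
#   i.e. roughly a 2.1x cost ratio until real benchmark data replaces the estimates.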
# ── Tab 4: Graph Explorer ────────────────────────────────
def explore_graph(query, depth):
if not _initialized:
initialize_system()
try:
import networkx as nx
passages = _get_demo_passages(query)
gr_result = orchestrator.run_graphrag(query, passages, hops=int(depth))
G = nx.Graph()
for e in gr_result.entities_found[:20]:
G.add_node(e.get("name", "?"), entity_type=e.get("entity_type", "CONCEPT"),
description=e.get("description", ""))
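# Relations are expected as strings shaped like "SRC -[REL_TYPE]-> TGT: detail";
# anything that does not match this shape is skipped by the parsing below.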
for r in gr_result.relations_traversed[:30]:
parts = r.split(" -[")
if len(parts) == 2:
src = parts[0].strip()
rest = parts[1].split("]-> ")
if len(rest) == 2:
rtype = rest[0].strip()
tgt = rest[1].split(": ")[0].strip()
G.add_edge(src, tgt, relation=rtype)
if not G.nodes():
G.add_node("Query", entity_type="QUERY")
for e in gr_result.entities_found[:5]:
G.add_node(e.get("name", "Entity"), entity_type=e.get("entity_type", "CONCEPT"))
G.add_edge("Query", e.get("name", "Entity"), relation="FOUND")
pos = nx.spring_layout(G, k=2, iterations=50, seed=42)
colors_map = {"PERSON": "#FF6B6B", "ORGANIZATION": "#4ECDC4", "LOCATION": "#45B7D1",
"EVENT": "#FFA07A", "DATE": "#98D8C8", "CONCEPT": "#AED6F1",
"WORK": "#F9E79F", "PRODUCT": "#D7BDE2", "TECHNOLOGY": "#82E0AA", "QUERY": "#F39C12"}
edge_x, edge_y = [], []
for u, v in G.edges():
x0, y0 = pos[u]; x1, y1 = pos[v]
edge_x.extend([x0, x1, None]); edge_y.extend([y0, y1, None])
fig = go.Figure()
fig.add_trace(go.Scatter(x=edge_x, y=edge_y, mode='lines',
line=dict(width=1.5, color='#888'), hoverinfo='none', showlegend=False))
fig.add_trace(go.Scatter(
x=[pos[n][0] for n in G.nodes()], y=[pos[n][1] for n in G.nodes()],
mode='markers+text', text=list(G.nodes()), textposition="top center", textfont=dict(size=10),
marker=dict(size=[20 + G.degree(n)*5 for n in G.nodes()],
color=[colors_map.get(G.nodes[n].get("entity_type", "CONCEPT"), "#AED6F1") for n in G.nodes()],
line=dict(width=2, color='white')),
hovertext=[f"{n} ({G.nodes[n].get('entity_type','')})" for n in G.nodes()],
hoverinfo='text', showlegend=False))
fig.update_layout(title=f"Knowledge Graph: {query[:50]}{'...' if len(query) > 50 else ''}", showlegend=False, hovermode='closest',
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
height=500, margin=dict(b=20,l=20,r=20,t=40),
paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)')
info = {"nodes": len(G.nodes()), "edges": len(G.edges()),
"entities": len(gr_result.entities_found), "relations": len(gr_result.relations_traversed)}
stats = pd.DataFrame({"Metric": ["Nodes", "Edges", "Avg Degree", "Density", "Entities", "Relations"],
"Value": [len(G.nodes()), len(G.edges()),
f"{sum(d for _,d in G.degree())/max(len(G.nodes()),1):.1f}",
f"{nx.density(G):.3f}",
len(gr_result.entities_found), len(gr_result.relations_traversed)]})
explanation = orchestrator.explain_graphrag_reasoning(query, gr_result)
return fig, info, stats, explanation, gr_result.answer
except Exception as e:
empty = go.Figure()
empty.add_annotation(text=str(e), showarrow=False)
return empty, {}, pd.DataFrame(), str(e), ""
# ── Build Dashboard ───────────────────────────────────────
def build_dashboard():
with gr.Blocks(title="GraphRAG Inference Dashboard") as demo:
gr.Markdown("""
# 🔍 GraphRAG Inference Hackathon — Comparison Dashboard
### Proving that graphs make LLM inference faster, cheaper, and smarter
**Architecture:** TigerGraph (Graph) → Orchestration → LLM → Evaluation
| **Novelties:** 🧠 Adaptive Routing | 📋 Schema-Bounded Extraction | 🔗 Reasoning Paths | 🔑 Dual-Level Keywords
""")
with gr.Row():
init_btn = gr.Button("🚀 Initialize System", variant="primary", scale=2)
init_status = gr.Textbox(label="Status", interactive=False, scale=3)
init_btn.click(fn=initialize_system, outputs=init_status)
with gr.Tabs():
# ── Tab 1: Live Comparison ──────────────────
with gr.Tab("🔴 Live Query Comparison"):
gr.Markdown("## Side-by-Side Pipeline Comparison")
with gr.Row():
query_input = gr.Textbox(label="Question", placeholder="e.g., Were Scott Derrickson and Ed Wood of the same nationality?", lines=2, scale=3)
with gr.Column(scale=1):
adaptive = gr.Checkbox(label="🧠 Adaptive Routing", value=True)
topk = gr.Slider(1, 10, value=5, step=1, label="Top-K")
hops_s = gr.Slider(1, 4, value=2, step=1, label="Hops")
run_btn = gr.Button("▶ Run Comparison", variant="primary", size="lg")
status = gr.Textbox(label="Status", interactive=False)
routing = gr.Markdown(visible=True)
with gr.Row():
with gr.Column():
gr.Markdown("### 🔵 Baseline RAG")
b_ans = gr.Textbox(label="Answer", lines=5, interactive=False)
with gr.Row():
b_tok = gr.Number(label="Tokens", precision=0)
b_lat = gr.Number(label="Latency (ms)", precision=1)
b_cost = gr.Number(label="Cost ($)", precision=6)
with gr.Column():
gr.Markdown("### 🔴 GraphRAG")
g_ans = gr.Textbox(label="Answer", lines=5, interactive=False)
with gr.Row():
g_tok = gr.Number(label="Tokens", precision=0)
g_lat = gr.Number(label="Latency (ms)", precision=1)
g_cost = gr.Number(label="Cost ($)", precision=6)
chart = gr.Plot(label="Comparison")
with gr.Accordion("📄 Retrieved Contexts", open=False):
with gr.Row():
b_ctx = gr.Markdown()
g_ctx = gr.Markdown()
with gr.Accordion("🕸️ Entities & Relations", open=False):
ent_disp = gr.Markdown()
run_btn.click(fn=run_live_comparison, inputs=[query_input, adaptive, topk, hops_s],
outputs=[status, b_ans, g_ans, routing, b_tok, g_tok, b_lat, g_lat,
b_cost, g_cost, chart, b_ctx, g_ctx, ent_disp])
gr.Examples(examples=[
["Were Scott Derrickson and Ed Wood of the same nationality?"],
["What government position was held by the woman who portrayed Nora Batty?"],
["Which magazine was started first, Arthur's Magazine or First for Women?"],
["Who was born first, Arthur Conan Doyle or Agatha Christie?"],
["What is the capital of the country where the Eiffel Tower is located?"]],
inputs=query_input, label="📝 Example Questions")
# ── Tab 2: Batch Benchmark ──────────────────
with gr.Tab("📊 Batch Benchmark"):
gr.Markdown("## Benchmark on HotpotQA")
with gr.Row():
n_samples = gr.Slider(10, 500, value=50, step=10, label="Samples")
bk = gr.Slider(1, 10, value=5, step=1, label="Top-K")
bh = gr.Slider(1, 4, value=2, step=1, label="Hops")
bench_btn = gr.Button("🏃 Run Benchmark", variant="primary")
bench_status = gr.Textbox(label="Status", interactive=False)
summary_df = gr.Dataframe(label="Summary")
with gr.Row():
bar_chart = gr.Plot(label="Quality")
radar_chart = gr.Plot(label="Radar")
with gr.Accordion("📝 Full Report", open=False):
report = gr.Textbox(lines=30, interactive=False)
bench_btn.click(fn=run_batch_benchmark, inputs=[n_samples, bk, bh],
outputs=[bench_status, summary_df, bar_chart, radar_chart, report])
# ── Tab 3: Cost Analysis ────────────────────
with gr.Tab("💰 Cost Analysis"):
gr.Markdown("## Cost & Token Analysis")
with gr.Row():
cq = gr.Slider(100, 100000, value=10000, step=100, label="Queries to Project")
cm = gr.Dropdown(["gpt-4o-mini", "gpt-4o", "gpt-3.5-turbo", "claude-3-5-sonnet", "claude-3-haiku"],
value="gpt-4o-mini", label="Model")
cost_btn = gr.Button("💵 Calculate", variant="primary")
cost_df = gr.Dataframe(label="Breakdown")
with gr.Row():
cum_chart = gr.Plot(label="Cumulative Cost")
tok_chart = gr.Plot(label="Token Distribution")
cost_btn.click(fn=compute_cost_analysis, inputs=[cq, cm],
outputs=[cost_df, cum_chart, tok_chart])
# ── Tab 4: Graph Explorer ───────────────────
with gr.Tab("🕸️ Graph Explorer"):
gr.Markdown("## Interactive Knowledge Graph Explorer\n*Visualize how GraphRAG traverses the graph*")
with gr.Row():
gq = gr.Textbox(label="Query", placeholder="Enter a question...", scale=3)
gd = gr.Slider(1, 4, value=2, step=1, label="Depth", scale=1)
exp_btn = gr.Button("🔍 Explore", variant="primary", scale=1)
graph_plot = gr.Plot(label="Knowledge Graph")
with gr.Row():
graph_stats = gr.Dataframe(label="Stats")
node_info = gr.JSON(label="Details")
with gr.Accordion("🧠 Reasoning Path", open=True):
reasoning = gr.Markdown()
graph_ans = gr.Textbox(label="GraphRAG Answer", interactive=False)
exp_btn.click(fn=explore_graph, inputs=[gq, gd],
outputs=[graph_plot, node_info, graph_stats, reasoning, graph_ans])
gr.Examples(examples=[
["Who directed the movie starring Tom Hanks released in 1994?"],
["What is the relationship between Einstein and relativity?"],
["Which country hosted the 2024 Olympics and what is its capital?"]],
inputs=gq, label="📝 Examples")
gr.Markdown("""
---
**GraphRAG Inference Hackathon** by TigerGraph | Stack: TigerGraph + GPT-4o-mini + Gradio + RAGAS
**Novelties:** Adaptive Query Routing 🧠 | Schema-Bounded Extraction 📋 | Graph Reasoning Paths 🔗 | Dual-Level Keywords 🔑
""")
return demo
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
demo = build_dashboard()
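# For long-running jobs (e.g. the batch benchmark), Gradio's demo.queue() can be
# enabled before launch() to queue concurrent requests; left off here by default.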
demo.launch(server_port=7860, share=False, show_error=True)