Spaces:

TheLinconX
/

contextforge-demo

Sleeping

File size: 26,653 Bytes

bd7899d

"""
ContextForge V5.0 — BenchmarkDashboard

Launch:
    streamlit run demo/dashboard.py

Tabs:
    1. Live Metrics        — VRAM gauge, cache hit rates, QueueingController λ/μ/ρ
    2. Pipeline View       — 5-agent ASCII diagram with per-agent stats
    3. V4 vs Baseline       — side-by-side VRAM comparison, scenario selector
    4. Research             — paper table, module→paper mapping, AMD DevCloud specs

Mock mode (?mock=true URL query parameter):
    Synthetic metrics from Gaussian distributions centered on expected values.
    INV-14: "SIMULATION MODE" banner prominently displayed when using mock data.
    Synthetic data is NEVER presented as real hardware results.
"""

from __future__ import annotations

import random
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, Any

# ---------------------------------------------------------------------------
# Config / Args
# ---------------------------------------------------------------------------
import streamlit as st


def is_mock_mode() -> bool:
    """Report whether the page URL carries ``?mock=true``.

    Returns:
        True only when the ``mock`` query parameter equals the string
        ``"true"``. A missing parameter, or any error raised while reading
        the query parameters, yields False.
    """
    try:
        flag = st.query_params.get("mock", "false")
    except Exception:
        return False
    return flag == "true"


# ---------------------------------------------------------------------------
# QueueingController — imported from TASK-001 (contextforge/scheduling/)
# ---------------------------------------------------------------------------
# In mock mode the dashboard generates synthetic data.
# In real mode (vLLM / PyRSMI available) we import and wire the real class.

from pathlib import Path

# Locate <repo>/contextforge/scheduling/queueing_controller.py relative to this
# file, which is expected at <repo>/demo/dashboard.py.
#
# The path is derived from the parent directories rather than by string
# substitution on __file__: a substitution that does not match (renamed script,
# different directory, Windows path separators) would leave the path pointing
# at this very file, which would then be "found" and executed recursively.
_queueing_controller_path = str(
    Path(__file__).resolve().parent.parent
    / "contextforge"
    / "scheduling"
    / "queueing_controller.py"
)

# Probe by opening the file so that a missing, unreadable, or directory path
# all count as "not available" and the dashboard falls back to mock data.
try:
    with open(_queueing_controller_path, encoding="utf-8"):
        _queueing_controller_exists = True
except OSError:
    _queueing_controller_exists = False

# Stay None unless the optional module is present and defines them.
QueueingController: Any = None
QueueingConfig: Any = None
StabilityState: Any = None

if _queueing_controller_exists:
    import importlib.util

    _spec = importlib.util.spec_from_file_location(
        "queueing_controller", _queueing_controller_path
    )
    if _spec is not None and _spec.loader is not None:
        _qc_module = importlib.util.module_from_spec(_spec)
        _spec.loader.exec_module(_qc_module)
        QueueingController = getattr(_qc_module, "QueueingController", None)
        QueueingConfig = getattr(_qc_module, "QueueingConfig", None)
        StabilityState = getattr(_qc_module, "StabilityState", None)


# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------

@dataclass
class AgentSnapshot:
    """Per-agent snapshot for pipeline view.

    In this file instances are created by ``generate_mock_metrics`` (one per
    entry in ``PIPELINE_AGENTS``) and rendered as table rows by
    ``render_tab_pipeline_view``.
    """
    name: str  # shown in the "Agent" column
    role: str  # "fast" or "CoT" for the agents defined in PIPELINE_AGENTS
    ttft_ms: float  # shown in the "TTFT (ms)" column
    cache_hit: bool  # rendered as a check or cross mark
    thinking_mode: bool  # the mock generator sets this to (role == "CoT")
    anchor_hints: int  # the mock generator uses 1-5 on a cache hit, otherwise 0
    rotate_kv_bits: int  # the mock generator copies the snapshot-wide bit width here


@dataclass
class ScenarioBenchmark:
    """Single scenario result.

    Pairs a baseline figure with a ContextForge figure for each of three
    quantities. Rows are defined in ``SCENARIOS`` and displayed by
    ``render_tab_v4_vs_baseline``.
    """
    id: int  # value matched against the sidebar scenario selector
    name: str  # label shown next to the id
    vram_baseline_gb: float  # displayed under "VRAM (GB)" / "Baseline VRAM (GB)"
    vram_contextforge_gb: float  # displayed under "CF VRAM (GB)"
    ttft_baseline_ms: float  # displayed in the "TTFT (ms)" row
    ttft_contextforge_ms: float
    throughput_baseline_tps: float  # displayed in the "Throughput (tok/s)" row
    throughput_contextforge_tps: float


@dataclass
class LiveMetrics:
    """Live system metrics snapshot.

    One instance feeds both the Live Metrics tab and the Pipeline View tab.
    In this file instances are only constructed by ``generate_mock_metrics``.
    """
    vram_pressure_pct: float  # percent; drives the LOW/MEDIUM/HIGH label in vram_gauge
    kv_cache_hit_rate: float  # fraction; multiplied by 100 for display
    anchor_pool_reuse_rate: float  # fraction; multiplied by 100 for display
    utilization_rho: float  # displayed as "ρ (utilization)"
    is_stable: bool  # the mock generator sets this to (rho < 0.95)
    lambda_req_per_sec: float  # displayed as "λ (arrival rate)" in req/s
    mu_req_per_sec: float  # displayed as "μ (service rate)" in req/s
    lambda_critical: float  # displayed as "λ_critical" in req/s
    stability_margin_pct: float  # the mock generator uses (1 - rho) * 100
    minimum_stable_blocks: int  # displayed as "minimum_stable_blocks (INV-11)"
    agents: list  # AgentSnapshot items; the mock generator adds one per pipeline agent
    rotate_kv_bits: int  # 16, 8, 4 or 2 when produced by the mock generator
    cla_vram_reduction_pct: float  # percent, displayed as "CLA VRAM Reduction"
    anchorpool_active_offsets: int  # displayed as "AnchorPool Active Offsets"


# ---------------------------------------------------------------------------
# V4 scenario definitions  (arXiv / paper grounded)
# ---------------------------------------------------------------------------

# Hard-coded benchmark rows rendered by render_tab_v4_vs_baseline (Tab 3).
# Every row uses the same 165.0 GB baseline VRAM figure.
# NOTE(review): these values are literals in this file, not read from a
# benchmark run, and Tab 3 shows them identically with or without ?mock=true.
# Confirm their provenance before presenting them as measured results.
SCENARIOS: list[ScenarioBenchmark] = [
    ScenarioBenchmark(id=1, name="anchor_pool_resolution",
        vram_baseline_gb=165.0, vram_contextforge_gb=98.0,
        ttft_baseline_ms=380.0, ttft_contextforge_ms=285.0,
        throughput_baseline_tps=280.0, throughput_contextforge_tps=395.0),
    ScenarioBenchmark(id=2, name="cla_metadata_layer",
        vram_baseline_gb=165.0, vram_contextforge_gb=112.0,
        ttft_baseline_ms=360.0, ttft_contextforge_ms=270.0,
        throughput_baseline_tps=295.0, throughput_contextforge_tps=410.0),
    ScenarioBenchmark(id=3, name="rotate_kv_quantization",
        vram_baseline_gb=165.0, vram_contextforge_gb=75.0,
        ttft_baseline_ms=400.0, ttft_contextforge_ms=300.0,
        throughput_baseline_tps=260.0, throughput_contextforge_tps=430.0),
    ScenarioBenchmark(id=4, name="step_graph_execution",
        vram_baseline_gb=165.0, vram_contextforge_gb=118.0,
        ttft_baseline_ms=355.0, ttft_contextforge_ms=265.0,
        throughput_baseline_tps=305.0, throughput_contextforge_tps=405.0),
    ScenarioBenchmark(id=5, name="kv_aware_routing",
        vram_baseline_gb=165.0, vram_contextforge_gb=105.0,
        ttft_baseline_ms=370.0, ttft_contextforge_ms=278.0,
        throughput_baseline_tps=285.0, throughput_contextforge_tps=415.0),
    ScenarioBenchmark(id=6, name="lmcache_bridge_save_load",
        vram_baseline_gb=165.0, vram_contextforge_gb=120.0,
        ttft_baseline_ms=365.0, ttft_contextforge_ms=272.0,
        throughput_baseline_tps=290.0, throughput_contextforge_tps=400.0),
    ScenarioBenchmark(id=7, name="atom_plugin_hooks",
        vram_baseline_gb=165.0, vram_contextforge_gb=108.0,
        ttft_baseline_ms=375.0, ttft_contextforge_ms=280.0,
        throughput_baseline_tps=280.0, throughput_contextforge_tps=408.0),
    ScenarioBenchmark(id=8, name="pbkv_prediction",
        vram_baseline_gb=165.0, vram_contextforge_gb=115.0,
        ttft_baseline_ms=358.0, ttft_contextforge_ms=268.0,
        throughput_baseline_tps=298.0, throughput_contextforge_tps=402.0),
    ScenarioBenchmark(id=9, name="workflow_aware_eviction",
        vram_baseline_gb=165.0, vram_contextforge_gb=102.0,
        ttft_baseline_ms=368.0, ttft_contextforge_ms=275.0,
        throughput_baseline_tps=288.0, throughput_contextforge_tps=412.0),
    ScenarioBenchmark(id=10, name="embedding_engine_encoding",
        vram_baseline_gb=165.0, vram_contextforge_gb=95.0,
        ttft_baseline_ms=385.0, ttft_contextforge_ms=290.0,
        throughput_baseline_tps=270.0, throughput_contextforge_tps=398.0),
]

# ---------------------------------------------------------------------------
# Research papers table  (8 papers + AMD DevCloud)
# ---------------------------------------------------------------------------

# One dict per paper, rendered as an expander by render_tab_research.
# Keys: "title", "venue", "arxiv", "what_we_implemented".
# An "arxiv" value of "—" means no identifier is recorded; render_tab_research
# checks for that exact string when building the link.
# NOTE(review): venues and arXiv identifiers are hard-coded here and are not
# validated anywhere in this file. Confirm them against the cited sources.
PAPERS = [
    {"title": "KVCOMM — Cross-Context KV Communication",
     "venue": "NeurIPS 2025", "arxiv": "2510.12872",
     "what_we_implemented": "AnchorPool: offset variance prediction via SimHash, approximate_offset() API"},
    {"title": "KVFlow — Prefix Caching for Workflows",
     "venue": "NeurIPS 2025", "arxiv": "2507.07400",
     "what_we_implemented": "AgentStepGraph: compute_steps_to_execution(), workflow-aware eviction"},
    {"title": "PBKV — Prediction-Based KV Management",
     "venue": "arXiv May 2026", "arxiv": "2605.06472",
     "what_we_implemented": "PBKVPredictor (stub V4, production V5): Markov model log + predict"},
    {"title": "SemShareKV — Semantic LSH KV Sharing",
     "venue": "ACL Findings 2025", "arxiv": "—",
     "what_we_implemented": "LSHEngine: SimHash on token IDs, FAISS ANN deduplication, block_size=16"},
    {"title": "RotateKV — Pre-RoPE INT4 Quantization",
     "venue": "IJCAI 2025", "arxiv": "2501.16383",
     "what_we_implemented": "RotateKVQuantizer: pre-RoPE only (INV-10), INT4, attention-sink protection"},
    {"title": "CLA — Cross-Layer Attention",
     "venue": "NeurIPS 2024", "arxiv": "—",
     "what_we_implemented": "CLAMetadataLayer: compute_layer_groups(), upper-layer sharing strategy"},
    {"title": "LCKV — Layer-Condensed KV",
     "venue": "ACL 2024", "arxiv": "—",
     "what_we_implemented": "CLA upper-layer sharing (top layers only, NON_THOUGHT_ROLES frozenset)"},
    {"title": "Queueing Theory for KV Cache Stability",
     "venue": "arXiv:2605.04595 (ICML 2026)", "arxiv": "2605.04595",
     "what_we_implemented": "QueueingController: λ/μ/ρ estimation, INVARIANT-11, minimum_stable_blocks"},
]

# (module name, source paper label, implementation summary) triples, rendered
# as the "Module → Paper Mapping" table by render_tab_research.
# NOTE(review): the CLAMetadataLayer row cites "NAACL 2025", but no entry in
# PAPERS lists that venue. Confirm which paper is meant.
MODULE_MAPPING = [
    ("QueueingController", "arXiv:2605.04595", "Stability-aware eviction via M/G/1 queueing model"),
    ("AnchorPool",         "KVCOMM (2510.12872)", "Cross-context KV offset prediction via SimHash"),
    ("RotateKVQuantizer",  "RotateKV (2501.16383)", "Pre-RoPE INT4 quantization with attention-sink protection"),
    ("CLAMetadataLayer",   "CLA + NAACL 2025",     "Upper-layer sharing + NON_THOUGHT_ROLES bypass"),
    ("AgentStepGraph",     "KVFlow (2507.07400)",   "Workflow DAG + compute_steps_to_execution"),
    ("LSHEngine",          "SemShareKV (ACL Findings 2025)", "SimHash + FAISS ANN semantic dedup"),
    ("VRAMAwareCache",     "KVFlow + PBKV",        "Staged eviction with workflow awareness"),
    ("KVAwareRouter",      "KVCOMM + CLA",          "Anchor locality routing + CLA affinity"),
]

# Markdown table passed verbatim to st.markdown by render_tab_research.
# The identifier is spelled DEVLOUD (sic); it is referenced under that name
# elsewhere in this file, so renaming requires updating the caller too.
# NOTE(review): the hardware figures, URL and prices below are hard-coded text
# that nothing in this file verifies. Check them against AMD's published
# MI300X specifications before relying on them.
DEVLOUD_SPECS = """
## AMD DevCloud — MI300X Node Specs

| Component | Specification |
|-----------|---------------|
| Accelerator | AMD Instinct MI300X (gfx942) |
| GPU Memory  | 192 GB HBM3 per GPU |
| Compute     | 304 AI TOPS (FP8), 608 TFLOPS (FP16) |
| CPU         | AMD EPYC 9654 (Zen 4, 96 cores) |
| System RAM  | 1024 GB DDR5 |
| Interconnect | AMD Infinity Fabric (C2C) |
| ROCm Version | ROCm 7.x |
| Software   | PyRSMI, ROCm Profiler, HIP, Triton-ROCm |
| Access      | https://developer.amd.com/devcloud/ (free credits) |
| Cost Estimate | ~$1.99/hr (single MI300X), $9.95/hr (8-GPU) |
| Benchmark Tool | demo/benchmark_v4.py --device rocm:0 --scenarios all |
"""

# ---------------------------------------------------------------------------
# 5-agent pipeline definition
# ---------------------------------------------------------------------------

# Agents in pipeline order. Each entry carries the display name, the role tag
# ("fast" or "CoT") and the mean TTFT used when sampling mock latencies.
PIPELINE_AGENTS = [
    {"name": agent_name, "role": agent_role, "expected_ttft_ms": mean_ttft_ms}
    for agent_name, agent_role, mean_ttft_ms in (
        ("Retriever", "fast", 40.0),
        ("Reranker", "fast", 52.0),
        ("Summarizer", "fast", 38.0),
        ("Critic", "CoT", 65.0),
        ("Responder", "CoT", 35.0),
    )
]


# ---------------------------------------------------------------------------
# Metric generation helpers
# ---------------------------------------------------------------------------

def _gaussian(mean: float, std: float, lo: float = 0.0, hi: float = 1e9) -> float:
    """Draw one normally distributed sample and clamp it to ``[lo, hi]``.

    The upper bound is applied first and the lower bound second, so ``lo``
    wins if the two bounds conflict.
    """
    sample = random.gauss(mean, std)
    upper_bounded = min(hi, sample)
    return max(lo, upper_bounded)


def generate_mock_metrics() -> LiveMetrics:
    """Build one synthetic ``LiveMetrics`` snapshot.

    Every figure is a clamped Gaussian (or uniform integer) draw around a
    hard-coded expected value; nothing is read from hardware. The draws are
    made in a fixed order, so seeding ``random`` reproduces a snapshot.

    Returns:
        A populated ``LiveMetrics`` with one ``AgentSnapshot`` per entry in
        ``PIPELINE_AGENTS``.
    """
    # Queueing figures: utilization, arrival rate, then a service rate that is
    # sampled near arrival / utilization and forced to stay above the arrival rate.
    utilization = _gaussian(0.72, 0.06, lo=0.3, hi=0.98)
    arrival = _gaussian(8.5, 1.2, lo=1.0, hi=20.0)
    service = _gaussian(arrival / utilization + 0.1, 1.0, lo=arrival + 0.01, hi=50.0)
    stable = utilization < 0.95
    margin_pct = (1.0 - utilization) * 100.0
    stable_blocks = int(arrival * (1.0 / max(service, 0.01)) * 16 * 1.15)

    # Bit-width ladder keyed on utilization: the first ceiling that the
    # utilization stays under decides the width; 2 bits is the fallback.
    quant_bits = 2
    for ceiling, bits in ((0.70, 16), (0.85, 8), (0.95, 4)):
        if utilization < ceiling:
            quant_bits = bits
            break

    vram_pct = _gaussian(68.0, 8.0, lo=20.0, hi=98.0)
    hit_rate = _gaussian(0.74, 0.07, lo=0.4, hi=0.99)
    reuse_rate = _gaussian(0.81, 0.05, lo=0.5, hi=0.99)
    cla_reduction_pct = _gaussian(34.0, 4.0, lo=15.0, hi=50.0)
    offsets = random.randint(3, 12)

    snapshots: list[AgentSnapshot] = []
    for spec in PIPELINE_AGENTS:
        latency_ms = _gaussian(spec["expected_ttft_ms"], 8.0, lo=15.0, hi=150.0)
        hit = random.random() < hit_rate
        hints = random.randint(1, 5) if hit else 0
        snapshots.append(
            AgentSnapshot(
                name=spec["name"],
                role=spec["role"],
                ttft_ms=round(latency_ms, 1),
                cache_hit=hit,
                thinking_mode=spec["role"] == "CoT",
                anchor_hints=hints,
                rotate_kv_bits=quant_bits,
            )
        )

    # Drawn last, after the per-agent samples, to keep the draw order stable.
    critical_arrival = _gaussian(12.0, 2.0, lo=5.0, hi=30.0)

    return LiveMetrics(
        vram_pressure_pct=round(vram_pct, 1),
        kv_cache_hit_rate=round(hit_rate, 3),
        anchor_pool_reuse_rate=round(reuse_rate, 3),
        utilization_rho=round(utilization, 4),
        is_stable=stable,
        lambda_req_per_sec=round(arrival, 3),
        mu_req_per_sec=round(service, 3),
        lambda_critical=round(critical_arrival, 3),
        stability_margin_pct=round(margin_pct, 2),
        minimum_stable_blocks=stable_blocks,
        agents=snapshots,
        rotate_kv_bits=quant_bits,
        cla_vram_reduction_pct=round(cla_reduction_pct, 1),
        anchorpool_active_offsets=offsets,
    )


def get_real_metrics() -> LiveMetrics:
    """Return the metrics used when the dashboard is not in mock mode.

    This is currently a placeholder: it delegates to
    ``generate_mock_metrics``, so the values it returns are synthetic.

    The intended production sources are:
      - PyRSMI for VRAM pressure
      - vLLM / vllm_client.py for cache hit rates
      - QueueingController.compute_stability_state() for λ, μ, ρ
      - AnchorPool.get_stats() for active offsets
    """
    placeholder_snapshot = generate_mock_metrics()
    return placeholder_snapshot


# ---------------------------------------------------------------------------
# UI helpers
# ---------------------------------------------------------------------------

def vram_gauge(value: float) -> None:
    """Render VRAM pressure as a metric card with a progress bar.

    Args:
        value: VRAM pressure in percent. Below 60 is labelled LOW, below 80
            MEDIUM, anything else HIGH.
    """
    if value < 60:
        marker, label = "🟢", "LOW"
    elif value < 80:
        marker, label = "🟡", "MEDIUM"
    else:
        marker, label = "🔴", "HIGH"

    st.metric(label=f"VRAM Pressure [{label}]", value=f"{value:.1f}%")

    # st.progress accepts no ``color`` keyword, so the severity is conveyed
    # through the bar's caption instead. The bar only accepts fractions in
    # [0.0, 1.0], hence the clamp at both ends.
    fraction = max(0.0, min(value / 100.0, 1.0))
    st.progress(fraction, text=f"{marker} {label}")


# ---------------------------------------------------------------------------
# Tab 1 — Live Metrics
# ---------------------------------------------------------------------------

def render_tab_live_metrics(metrics: LiveMetrics) -> None:
    """Draw Tab 1: VRAM/cache cards, queueing figures and quantization status.

    Args:
        metrics: Snapshot whose fields are formatted into metric cards.
    """
    # --- VRAM and cache row -------------------------------------------------
    st.subheader("VRAM & Cache")
    gauge_col, hit_col, reuse_col = st.columns(3)
    with gauge_col:
        vram_gauge(metrics.vram_pressure_pct)
    hit_col.metric("KV Cache Hit Rate", f"{metrics.kv_cache_hit_rate * 100:.1f}%")
    reuse_col.metric("AnchorPool Reuse Rate", f"{metrics.anchor_pool_reuse_rate * 100:.1f}%")

    # --- Queueing row: three rate cards plus the stability flag --------------
    st.divider()
    st.subheader("QueueingController — TASK-001 (arXiv:2605.04595 ICML 2026)")

    rate_cards = [
        ("λ (arrival rate)", f"{metrics.lambda_req_per_sec:.3f} req/s"),
        ("μ (service rate)", f"{metrics.mu_req_per_sec:.3f} req/s"),
        ("ρ (utilization)", f"{metrics.utilization_rho:.4f}"),
    ]
    *rate_cols, stable_col = st.columns(4)
    for column, (title, text) in zip(rate_cols, rate_cards):
        column.metric(title, text)
    stable_col.metric(
        "is_stable",
        str(metrics.is_stable),
        delta_color="normal" if metrics.is_stable else "off",
    )

    margin_cards = [
        ("λ_critical", f"{metrics.lambda_critical:.3f} req/s"),
        ("stability_margin_pct", f"{metrics.stability_margin_pct:.2f}%"),
        ("minimum_stable_blocks (INV-11)", f"{metrics.minimum_stable_blocks} blocks"),
    ]
    for column, (title, text) in zip(st.columns(3), margin_cards):
        column.metric(title, text)

    if metrics.is_stable:
        stability_badge = "🟢 STABLE"
    else:
        stability_badge = "🔴 UNSTABLE"
    st.info(f"**System Status:** {stability_badge}  |  ρ={metrics.utilization_rho:.4f}  |  margin={metrics.stability_margin_pct:.1f}%")

    # --- Quantization row ---------------------------------------------------
    st.divider()
    st.subheader("KV Quantization — RotateKV")
    bits_label = {2: "INT2 (aggressive)", 4: "INT4", 8: "INT8", 16: "FP16 (full)"}
    quant_cards = [
        ("Active Quantization", bits_label.get(metrics.rotate_kv_bits, f"{metrics.rotate_kv_bits}bit")),
        ("CLA VRAM Reduction", f"{metrics.cla_vram_reduction_pct:.1f}%"),
        ("AnchorPool Active Offsets", f"{metrics.anchorpool_active_offsets}"),
    ]
    for column, (title, text) in zip(st.columns(3), quant_cards):
        column.metric(title, text)


# ---------------------------------------------------------------------------
# Tab 2 — Pipeline View
# ---------------------------------------------------------------------------

def _pipeline_diagram(metrics: LiveMetrics) -> str:
    """Build the boxed ASCII diagram of the 5-agent pipeline as plain text."""
    inner_width = 73  # characters between the outer left and right borders
    cell_width = 11   # characters between an agent box's side borders

    def cell(text: str) -> str:
        return "│" + text.center(cell_width) + "│"

    box_top = "┌" + "─" * cell_width + "┐"
    box_blank = cell("")
    box_bottom = "└" + "─" * cell_width + "┘"

    first_row = (
        ("Retriever", "fast"),
        ("Reranker", "fast"),
        ("Summarizer", "fast"),
        ("Critic", "CoT"),
    )
    indent = " " * 3
    gap = " " * 4
    responder_indent = " " * 26

    body = [
        "",
        indent + gap.join([box_top] * len(first_row)),
        indent + gap.join([box_blank] * len(first_row)),
        indent + "───▶".join(cell(name) for name, _ in first_row) + "──▶",
        indent + gap.join(cell(f"({role})") for _, role in first_row),
        indent + gap.join([box_blank] * len(first_row)),
        indent + gap.join([box_bottom] * len(first_row)),
        "",
        responder_indent + box_top,
        responder_indent + box_blank,
        responder_indent + cell("Responder"),
        responder_indent + cell("(CoT)"),
        responder_indent + box_blank,
        responder_indent + box_bottom,
        "",
    ]
    # Footer rows embed values of varying width, so each is padded with rule
    # characters up to the border instead of relying on a hand-counted run.
    footer = [
        f"  ── RotateKV: {metrics.rotate_kv_bits}bits ",
        f"  ── CLA VRAM reduction: {metrics.cla_vram_reduction_pct:.1f}%  ",
        f"  ── AnchorPool active offsets: {metrics.anchorpool_active_offsets}  ",
    ]

    rule = "─" * inner_width
    title = "ContextForge V5.0 — 5-Agent Pipeline"
    lines = [
        "┌" + rule + "┐",
        "│" + title.center(inner_width) + "│",
        "├" + rule + "┤",
    ]
    lines.extend("│" + row.ljust(inner_width) + "│" for row in body)
    lines.extend("│" + row.ljust(inner_width, "─") + "│" for row in footer)
    lines.append("└" + rule + "┘")
    return "\n".join(lines)


def render_tab_pipeline_view(metrics: LiveMetrics) -> None:
    """Draw Tab 2: pipeline diagram, per-agent table and aggregate cards.

    Args:
        metrics: Snapshot supplying the per-agent rows (``metrics.agents``)
            and the quantization / CLA / AnchorPool figures in the diagram.
    """
    # st.code already renders its argument as a code block, so the text is
    # passed without Markdown fences (they would be displayed literally).
    st.code(_pipeline_diagram(metrics), language=None)

    st.divider()
    st.subheader("Per-Agent Statistics")

    table_data = {
        "Agent": [a.name for a in metrics.agents],
        "Role": [a.role for a in metrics.agents],
        "TTFT (ms)": [f"{a.ttft_ms}" for a in metrics.agents],
        "Cache Hit": ["✅" if a.cache_hit else "❌" for a in metrics.agents],
        "Thinking": ["🔁 ON" if a.thinking_mode else "—" for a in metrics.agents],
        "Anchor Hints": [str(a.anchor_hints) for a in metrics.agents],
        "KV bits": [str(a.rotate_kv_bits) for a in metrics.agents],
    }
    st.table(table_data)

    # An empty agent list would otherwise divide by zero in the averages.
    agent_count = len(metrics.agents)
    if agent_count:
        avg_ttft = sum(a.ttft_ms for a in metrics.agents) / agent_count
        hit_rate = sum(1 for a in metrics.agents if a.cache_hit) / agent_count
        avg_text = f"{avg_ttft:.1f} ms"
        hit_text = f"{hit_rate * 100:.0f}%"
    else:
        avg_text = hit_text = "n/a"

    agg1, agg2, agg3 = st.columns(3)
    with agg1:
        st.metric("Average TTFT (ms)", avg_text)
    with agg2:
        st.metric("Cache Hit Rate", hit_text)
    with agg3:
        st.metric("RotateKV Active Bits", f"{metrics.rotate_kv_bits}")

    st.divider()
    st.subheader("RotateKV Quantization Levels (QueueingController-driven)")
    levels = [16, 8, 4, 2]
    for col, bits in zip(st.columns(len(levels)), levels):
        active = "●" if bits == metrics.rotate_kv_bits else "○"
        col.write(f"{active} **{bits}bit** — {'FP16' if bits == 16 else 'INT' + str(bits)}")


# ---------------------------------------------------------------------------
# Tab 3 — V4 vs Baseline
# ---------------------------------------------------------------------------

def render_tab_v4_vs_baseline(selected_scenario: Optional[int]) -> None:
    """Draw Tab 3: one scenario in detail, then a table of all scenarios.

    Args:
        selected_scenario: ``ScenarioBenchmark.id`` to show in detail. ``None``
            or an id that matches no scenario falls back to the first entry
            of ``SCENARIOS``.
    """
    scenario = SCENARIOS[0]
    if selected_scenario is not None:
        for candidate in SCENARIOS:
            if candidate.id == selected_scenario:
                scenario = candidate
                break

    st.subheader(f"Scenario: #{scenario.id} — {scenario.name}")

    # --- VRAM bar chart -----------------------------------------------------
    vram_saved = scenario.vram_baseline_gb - scenario.vram_contextforge_gb
    vram_data = {
        "Metric": ["Baseline (no sharing)", "ContextForge V4", "VRAM Saved"],
        "VRAM (GB)": [scenario.vram_baseline_gb, scenario.vram_contextforge_gb, vram_saved],
    }
    st.bar_chart(vram_data, x="Metric", y="VRAM (GB)", horizontal=True)

    # --- Headline deltas ----------------------------------------------------
    ttft_delta = (scenario.ttft_baseline_ms - scenario.ttft_contextforge_ms) / scenario.ttft_baseline_ms * 100
    tput_gain = (scenario.throughput_contextforge_tps / scenario.throughput_baseline_tps - 1) * 100
    headline_cards = [
        ("VRAM Saved", f"{vram_saved:.1f} GB ({vram_saved/scenario.vram_baseline_gb*100:.0f}%)"),
        ("TTFT Improvement", f"{ttft_delta:.1f}%"),
        ("Throughput Gain", f"{tput_gain:.1f}%"),
    ]
    for column, (title, text) in zip(st.columns(3), headline_cards):
        column.metric(title, text)

    # --- Side-by-side figures for the chosen scenario -----------------------
    st.divider()
    st.subheader("Detailed Comparison")
    st.table({
        "Metric": ["VRAM Peak (GB)", "TTFT (ms)", "Throughput (tok/s)"],
        "Baseline": [scenario.vram_baseline_gb, scenario.ttft_baseline_ms, scenario.throughput_baseline_tps],
        "ContextForge V4": [scenario.vram_contextforge_gb, scenario.ttft_contextforge_ms, scenario.throughput_contextforge_tps],
    })

    # --- Summary of every scenario ------------------------------------------
    st.divider()
    st.subheader("All Scenarios")
    summary_columns = (
        ("ID", lambda s: s.id),
        ("Scenario", lambda s: s.name),
        ("Baseline VRAM (GB)", lambda s: s.vram_baseline_gb),
        ("CF VRAM (GB)", lambda s: s.vram_contextforge_gb),
        ("VRAM ↓%", lambda s: round((s.vram_baseline_gb - s.vram_contextforge_gb) / s.vram_baseline_gb * 100, 1)),
        ("TTFT Δms", lambda s: round(s.ttft_baseline_ms - s.ttft_contextforge_ms, 1)),
        ("TTFT ↓%", lambda s: round((s.ttft_baseline_ms - s.ttft_contextforge_ms) / s.ttft_baseline_ms * 100, 1)),
    )
    all_data = {title: [extract(s) for s in SCENARIOS] for title, extract in summary_columns}
    st.table(all_data)


# ---------------------------------------------------------------------------
# Tab 4 — Research
# ---------------------------------------------------------------------------

def render_tab_research() -> None:
    """Draw Tab 4: paper list, module-to-paper table and the DevCloud specs."""
    st.subheader("Research Papers")
    for p in PAPERS:
        arxiv_id = p["arxiv"]
        with st.expander(f"[{p['venue']}] {p['title']}", expanded=False):
            if arxiv_id == "—":
                # No identifier recorded: show the placeholder as plain text
                # rather than a link that leads nowhere.
                st.markdown("**arXiv:** —")
            else:
                st.markdown(f"**arXiv:** [{arxiv_id}](https://arxiv.org/abs/{arxiv_id})")
            st.markdown(f"**What we implemented:** {p['what_we_implemented']}")

    st.divider()
    st.subheader("Module → Paper Mapping")
    mapping_data = {
        "Module": [m[0] for m in MODULE_MAPPING],
        "Source Paper": [m[1] for m in MODULE_MAPPING],
        "Implementation": [m[2] for m in MODULE_MAPPING],
    }
    st.table(mapping_data)

    st.divider()
    st.markdown(DEVLOUD_SPECS)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> None:
    """Streamlit entry point: build the sidebar, show the mode banner, render a tab.

    The active tab is chosen in the sidebar. Only the Live Metrics tab
    refreshes itself, at the interval picked with the sidebar slider.
    """
    st.set_page_config(
        page_title="ContextForge V5.0 — BenchmarkDashboard",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    # Sidebar configuration
    st.sidebar.title("ContextForge V5.0")
    st.sidebar.markdown("**Benchmark Dashboard** — Streamlit")
    st.sidebar.divider()

    use_mock = is_mock_mode()
    refresh_rate = st.sidebar.slider("Refresh rate (seconds)", 1, 30, 5)
    scenario_names = {s.id: s.name for s in SCENARIOS}
    scenario_selector = st.sidebar.selectbox(
        "Benchmark Scenario (Tab 3)",
        options=[None] + [s.id for s in SCENARIOS],
        format_func=lambda x: "All Scenarios" if x is None else f"#{x} {scenario_names[x]}",
    )
    tab_labels = [
        "1️⃣ Live Metrics",
        "2️⃣ Pipeline View",
        "3️⃣ V4 vs Baseline",
        "4️⃣ Research",
    ]
    selected_tab = st.sidebar.selectbox("Active Tab", tab_labels)
    # Use the position in the option list rather than parsing the first
    # character of an emoji label.
    tab_idx = tab_labels.index(selected_tab)

    st.sidebar.divider()
    st.sidebar.caption(f"Last refresh: {datetime.now().strftime('%H:%M:%S')}")

    # ── SIMULATION MODE banner (INV-14) ─────────────────────────────────────
    if use_mock:
        st.error(
            "⚠️ **SIMULATION MODE** — Data shown below is synthetically generated. "
            "Do NOT present as real hardware results. "
            "Run against AMD MI300X for validated numbers.",
            icon="🚨",
        )
    else:
        # get_real_metrics() currently falls back to the synthetic generator,
        # so claiming a live connection here would present synthetic data as
        # real. Say so explicitly until real collection is wired in.
        st.warning(
            "⚠️ **LIVE MODE (placeholder)** — real vLLM / PyRSMI collection is not "
            "wired in yet; metrics below fall back to synthetic data. "
            "Do NOT present as real hardware results."
        )

    st.title("ContextForge V5.0 — BenchmarkDashboard")

    if tab_idx == 0:
        def _render_live() -> None:
            metrics = generate_mock_metrics() if use_mock else get_real_metrics()
            render_tab_live_metrics(metrics)
            st.caption(f"Metrics updated: {datetime.now().strftime('%H:%M:%S')}")

        # st.rerun() only works on the script thread; called from a background
        # thread it has no session context and never refreshes the page. Let
        # Streamlit schedule the refresh itself where fragments are available.
        fragment = getattr(st, "fragment", None)
        if fragment is not None:
            fragment(run_every=refresh_rate)(_render_live)()
        else:
            # Older Streamlit: wait on the script thread, then rerun the app.
            _render_live()
            time.sleep(refresh_rate)
            st.rerun()

    elif tab_idx == 1:
        metrics = generate_mock_metrics() if use_mock else get_real_metrics()
        render_tab_pipeline_view(metrics)

    elif tab_idx == 2:
        render_tab_v4_vs_baseline(scenario_selector)

    elif tab_idx == 3:
        render_tab_research()


if __name__ == "__main__":
    main()