Spaces:
Sleeping
Sleeping
Pablo
ContextForge V5.0 PREVIEW: QueueingController, VisualKVCache, SpeculativeCoordinator, PBKVPredictor Markov, Dashboard, DevCloud runner
bd7899d | """ | |
| ContextForge V5.0 — BenchmarkDashboard | |
| Launch: | |
| streamlit run demo/dashboard.py | |
| Tabs: | |
| 1. Live Metrics — VRAM gauge, cache hit rates, QueueingController λ/μ/ρ | |
| 2. Pipeline View — 5-agent ASCII diagram with per-agent stats | |
| 3. V4 vs Baseline — side-by-side VRAM comparison, scenario selector | |
| 4. Research — paper table, module→paper mapping, AMD DevCloud specs | |
| Mock mode (enabled via the ?mock=true query parameter): | |
| Synthetic metrics from Gaussian distributions centered on expected values. | |
| INV-14: "SIMULATION MODE" banner prominently displayed when using mock data. | |
| Synthetic data is NEVER presented as real hardware results. | |
| """ | |
| from __future__ import annotations | |
| import random | |
| import time | |
| from dataclasses import dataclass, field | |
| from datetime import datetime | |
| from typing import Optional, Any | |
| # --------------------------------------------------------------------------- | |
| # Config / Args | |
| # --------------------------------------------------------------------------- | |
| import streamlit as st | |
def is_mock_mode() -> bool:
    """Return True when the ?mock=true query param is set."""
    try:
        return st.query_params.get("mock", "false") == "true"
    except Exception:
        # st.query_params may be unavailable (older Streamlit, or no
        # active request context) — default to live mode in that case.
        return False
# ---------------------------------------------------------------------------
# QueueingController — imported from TASK-001 (contextforge/scheduling/)
# ---------------------------------------------------------------------------
# In mock mode the dashboard generates synthetic data.
# In real mode (vLLM / PyRSMI available) we import and wire the real class.
import os

# Derive the sibling module path from this file's own location.
_queueing_controller_path = __file__.replace(
    "/demo/dashboard.py", "/contextforge/scheduling/queueing_controller.py"
)
# Probe existence directly instead of opening the file just to see whether
# open() succeeds (the previous approach acquired a file handle it never
# used).  Note: unlike open(), isfile() does not catch permission errors at
# probe time; those would surface later in exec_module, same as a parse error.
_queueing_controller_exists = os.path.isfile(_queueing_controller_path)

# Fallbacks for pure-mock deployments where the real module is absent.
QueueingController: Any = None
QueueingConfig: Any = None
StabilityState: Any = None
if _queueing_controller_exists:
    import importlib.util
    _spec = importlib.util.spec_from_file_location(
        "queueing_controller", _queueing_controller_path
    )
    if _spec and _spec.loader:
        _qc_module = importlib.util.module_from_spec(_spec)
        _spec.loader.exec_module(_qc_module)
        # getattr with None default: tolerate partial implementations.
        QueueingController = getattr(_qc_module, "QueueingController", None)
        QueueingConfig = getattr(_qc_module, "QueueingConfig", None)
        StabilityState = getattr(_qc_module, "StabilityState", None)
| # --------------------------------------------------------------------------- | |
| # Data structures | |
| # --------------------------------------------------------------------------- | |
@dataclass
class AgentSnapshot:
    """Per-agent snapshot for pipeline view.

    BUGFIX: the @dataclass decorator was missing.  The class body is only
    annotations, and generate_mock_metrics() constructs it with keyword
    arguments — without the generated __init__ that raises TypeError.
    """
    name: str            # agent display name, e.g. "Retriever"
    role: str            # "fast" or "CoT" (see PIPELINE_AGENTS)
    ttft_ms: float       # time-to-first-token in milliseconds (rounded to 1dp)
    cache_hit: bool      # whether the KV cache was hit for this step
    thinking_mode: bool  # True for CoT-role agents
    anchor_hints: int    # anchor hints used; 0 on a cache miss
    rotate_kv_bits: int  # active RotateKV quantization width (2/4/8/16)
@dataclass
class ScenarioBenchmark:
    """Single scenario result.

    BUGFIX: the @dataclass decorator was missing — SCENARIOS below builds
    instances with keyword arguments, which requires the generated __init__.
    """
    id: int                           # scenario number (1-10)
    name: str                         # snake_case scenario identifier
    vram_baseline_gb: float           # peak VRAM, unoptimized baseline
    vram_contextforge_gb: float       # peak VRAM, ContextForge V4
    ttft_baseline_ms: float           # time-to-first-token, baseline
    ttft_contextforge_ms: float       # time-to-first-token, ContextForge V4
    throughput_baseline_tps: float    # tokens/sec, baseline
    throughput_contextforge_tps: float  # tokens/sec, ContextForge V4
@dataclass
class LiveMetrics:
    """Live system metrics snapshot.

    BUGFIX: the @dataclass decorator was missing — generate_mock_metrics()
    constructs LiveMetrics with keyword arguments, which requires the
    generated __init__.
    """
    vram_pressure_pct: float        # 0-100 VRAM pressure percentage
    kv_cache_hit_rate: float        # 0-1 fraction
    anchor_pool_reuse_rate: float   # 0-1 fraction
    utilization_rho: float          # queueing utilization rho = lambda/mu
    is_stable: bool                 # rho < 0.95 (see generate_mock_metrics)
    lambda_req_per_sec: float       # arrival rate
    mu_req_per_sec: float           # service rate
    lambda_critical: float          # critical arrival rate
    stability_margin_pct: float     # (1 - rho) * 100
    minimum_stable_blocks: int      # INV-11 minimum stable KV blocks
    agents: list                    # list of AgentSnapshot (one per pipeline agent)
    rotate_kv_bits: int             # active RotateKV quantization width
    cla_vram_reduction_pct: float   # CLA VRAM reduction percentage
    anchorpool_active_offsets: int  # active AnchorPool offsets
| # --------------------------------------------------------------------------- | |
| # V4 scenario definitions (arXiv / paper grounded) | |
| # --------------------------------------------------------------------------- | |
# Ten benchmark scenarios compared in Tab 3.  Each entry pairs an
# unoptimized baseline against ContextForge V4; the constant 165.0 GB
# baseline VRAM models the same workload with no KV sharing.
# NOTE(review): these are expected/target figures (module header says
# "paper grounded") rather than live measurements — confirm before citing.
SCENARIOS: list[ScenarioBenchmark] = [
    ScenarioBenchmark(id=1, name="anchor_pool_resolution",
                      vram_baseline_gb=165.0, vram_contextforge_gb=98.0,
                      ttft_baseline_ms=380.0, ttft_contextforge_ms=285.0,
                      throughput_baseline_tps=280.0, throughput_contextforge_tps=395.0),
    ScenarioBenchmark(id=2, name="cla_metadata_layer",
                      vram_baseline_gb=165.0, vram_contextforge_gb=112.0,
                      ttft_baseline_ms=360.0, ttft_contextforge_ms=270.0,
                      throughput_baseline_tps=295.0, throughput_contextforge_tps=410.0),
    ScenarioBenchmark(id=3, name="rotate_kv_quantization",
                      vram_baseline_gb=165.0, vram_contextforge_gb=75.0,
                      ttft_baseline_ms=400.0, ttft_contextforge_ms=300.0,
                      throughput_baseline_tps=260.0, throughput_contextforge_tps=430.0),
    ScenarioBenchmark(id=4, name="step_graph_execution",
                      vram_baseline_gb=165.0, vram_contextforge_gb=118.0,
                      ttft_baseline_ms=355.0, ttft_contextforge_ms=265.0,
                      throughput_baseline_tps=305.0, throughput_contextforge_tps=405.0),
    ScenarioBenchmark(id=5, name="kv_aware_routing",
                      vram_baseline_gb=165.0, vram_contextforge_gb=105.0,
                      ttft_baseline_ms=370.0, ttft_contextforge_ms=278.0,
                      throughput_baseline_tps=285.0, throughput_contextforge_tps=415.0),
    ScenarioBenchmark(id=6, name="lmcache_bridge_save_load",
                      vram_baseline_gb=165.0, vram_contextforge_gb=120.0,
                      ttft_baseline_ms=365.0, ttft_contextforge_ms=272.0,
                      throughput_baseline_tps=290.0, throughput_contextforge_tps=400.0),
    ScenarioBenchmark(id=7, name="atom_plugin_hooks",
                      vram_baseline_gb=165.0, vram_contextforge_gb=108.0,
                      ttft_baseline_ms=375.0, ttft_contextforge_ms=280.0,
                      throughput_baseline_tps=280.0, throughput_contextforge_tps=408.0),
    ScenarioBenchmark(id=8, name="pbkv_prediction",
                      vram_baseline_gb=165.0, vram_contextforge_gb=115.0,
                      ttft_baseline_ms=358.0, ttft_contextforge_ms=268.0,
                      throughput_baseline_tps=298.0, throughput_contextforge_tps=402.0),
    ScenarioBenchmark(id=9, name="workflow_aware_eviction",
                      vram_baseline_gb=165.0, vram_contextforge_gb=102.0,
                      ttft_baseline_ms=368.0, ttft_contextforge_ms=275.0,
                      throughput_baseline_tps=288.0, throughput_contextforge_tps=412.0),
    ScenarioBenchmark(id=10, name="embedding_engine_encoding",
                      vram_baseline_gb=165.0, vram_contextforge_gb=95.0,
                      ttft_baseline_ms=385.0, ttft_contextforge_ms=290.0,
                      throughput_baseline_tps=270.0, throughput_contextforge_tps=398.0),
]
| # --------------------------------------------------------------------------- | |
| # Research papers table (8 papers + AMD DevCloud) | |
| # --------------------------------------------------------------------------- | |
# Research-paper table rendered in Tab 4.  Keys per entry:
#   title / venue / arxiv / what_we_implemented.
# An arxiv value of "—" means no arXiv id; render_tab_research() then links
# to "#" instead of arxiv.org.
PAPERS = [
    {"title": "KVCOMM — Cross-Context KV Communication",
     "venue": "NeurIPS 2025", "arxiv": "2510.12872",
     "what_we_implemented": "AnchorPool: offset variance prediction via SimHash, approximate_offset() API"},
    {"title": "KVFlow — Prefix Caching for Workflows",
     "venue": "NeurIPS 2025", "arxiv": "2507.07400",
     "what_we_implemented": "AgentStepGraph: compute_steps_to_execution(), workflow-aware eviction"},
    {"title": "PBKV — Prediction-Based KV Management",
     "venue": "arXiv May 2026", "arxiv": "2605.06472",
     "what_we_implemented": "PBKVPredictor (stub V4, production V5): Markov model log + predict"},
    {"title": "SemShareKV — Semantic LSH KV Sharing",
     "venue": "ACL Findings 2025", "arxiv": "—",
     "what_we_implemented": "LSHEngine: SimHash on token IDs, FAISS ANN deduplication, block_size=16"},
    {"title": "RotateKV — Pre-RoPE INT4 Quantization",
     "venue": "IJCAI 2025", "arxiv": "2501.16383",
     "what_we_implemented": "RotateKVQuantizer: pre-RoPE only (INV-10), INT4, attention-sink protection"},
    {"title": "CLA — Cross-Layer Attention",
     "venue": "NeurIPS 2024", "arxiv": "—",
     "what_we_implemented": "CLAMetadataLayer: compute_layer_groups(), upper-layer sharing strategy"},
    {"title": "LCKV — Layer-Condensed KV",
     "venue": "ACL 2024", "arxiv": "—",
     "what_we_implemented": "CLA upper-layer sharing (top layers only, NON_THOUGHT_ROLES frozenset)"},
    {"title": "Queueing Theory for KV Cache Stability",
     "venue": "arXiv:2605.04595 (ICML 2026)", "arxiv": "2605.04595",
     "what_we_implemented": "QueueingController: λ/μ/ρ estimation, INVARIANT-11, minimum_stable_blocks"},
]
# (module name, source paper, one-line implementation summary) triples shown
# in the Tab 4 "Module → Paper Mapping" table.
MODULE_MAPPING = [
    ("QueueingController", "arXiv:2605.04595", "Stability-aware eviction via M/G/1 queueing model"),
    ("AnchorPool", "KVCOMM (2510.12872)", "Cross-context KV offset prediction via SimHash"),
    ("RotateKVQuantizer", "RotateKV (2501.16383)", "Pre-RoPE INT4 quantization with attention-sink protection"),
    ("CLAMetadataLayer", "CLA + NAACL 2025", "Upper-layer sharing + NON_THOUGHT_ROLES bypass"),
    ("AgentStepGraph", "KVFlow (2507.07400)", "Workflow DAG + compute_steps_to_execution"),
    ("LSHEngine", "SemShareKV (ACL Findings 2025)", "SimHash + FAISS ANN semantic dedup"),
    ("VRAMAwareCache", "KVFlow + PBKV", "Staged eviction with workflow awareness"),
    ("KVAwareRouter", "KVCOMM + CLA", "Anchor locality routing + CLA affinity"),
]
# Markdown block rendered verbatim at the bottom of the Research tab.
# NOTE(review): the identifier is missing a 'C' (DEVLOUD vs DEVCLOUD); kept
# as-is because render_tab_research() references this exact name.
DEVLOUD_SPECS = """
## AMD DevCloud — MI300X Node Specs
| Component | Specification |
|-----------|---------------|
| Accelerator | AMD Instinct MI300X (gfx942) |
| GPU Memory | 192 GB HBM3 per GPU |
| Compute | 304 AI TOPS (FP8), 608 TFLOPS (FP16) |
| CPU | AMD EPYC 9654 (Zen 4, 96 cores) |
| System RAM | 1024 GB DDR5 |
| Interconnect | AMD Infinity Fabric (C2C) |
| ROCm Version | ROCm 7.x |
| Software | PyRSMI, ROCm Profiler, HIP, Triton-ROCm |
| Access | https://developer.amd.com/devcloud/ (free credits) |
| Cost Estimate | ~$1.99/hr (single MI300X), $9.95/hr (8-GPU) |
| Benchmark Tool | demo/benchmark_v4.py --device rocm:0 --scenarios all |
"""
| # --------------------------------------------------------------------------- | |
| # 5-agent pipeline definition | |
| # --------------------------------------------------------------------------- | |
# Fixed 5-agent pipeline rendered in Tab 2.  "CoT" agents get
# thinking_mode=True in generate_mock_metrics(); expected_ttft_ms is the
# center of the synthetic TTFT Gaussian for each agent.
PIPELINE_AGENTS = [
    {"name": "Retriever", "role": "fast", "expected_ttft_ms": 40.0},
    {"name": "Reranker", "role": "fast", "expected_ttft_ms": 52.0},
    {"name": "Summarizer", "role": "fast", "expected_ttft_ms": 38.0},
    {"name": "Critic", "role": "CoT", "expected_ttft_ms": 65.0},
    {"name": "Responder", "role": "CoT", "expected_ttft_ms": 35.0},
]
| # --------------------------------------------------------------------------- | |
| # Metric generation helpers | |
| # --------------------------------------------------------------------------- | |
| def _gaussian(mean: float, std: float, lo: float = 0.0, hi: float = 1e9) -> float: | |
| return max(lo, min(hi, random.gauss(mean, std))) | |
def generate_mock_metrics() -> LiveMetrics:
    """Generate synthetic metrics from Gaussian distributions around expected values.

    The order of random draws is preserved exactly so that seeded runs
    reproduce the same snapshot as before.
    """
    # Queueing figures: utilization first, then arrival rate, then a service
    # rate consistent with rho = lambda / mu (floored just above lambda).
    rho = _gaussian(0.72, 0.06, lo=0.3, hi=0.98)
    lam = _gaussian(8.5, 1.2, lo=1.0, hi=20.0)
    mu = _gaussian(lam / rho + 0.1, 1.0, lo=lam + 0.01, hi=50.0)

    is_stable = rho < 0.95
    stability_margin = (1.0 - rho) * 100.0
    min_stable_blocks = int(lam * (1.0 / max(mu, 0.01)) * 16 * 1.15)

    # RotateKV bits driven by utilization (arXiv:2605.04595 Table 2):
    # first threshold that rho falls under wins; 2-bit is the floor.
    rotate_bits = next(
        (bits for limit, bits in ((0.70, 16), (0.85, 8), (0.95, 4)) if rho < limit),
        2,
    )

    vram_pressure = _gaussian(68.0, 8.0, lo=20.0, hi=98.0)
    kv_hit = _gaussian(0.74, 0.07, lo=0.4, hi=0.99)
    anchor_reuse = _gaussian(0.81, 0.05, lo=0.5, hi=0.99)
    cla_vram_reduction = _gaussian(34.0, 4.0, lo=15.0, hi=50.0)
    active_offsets = random.randint(3, 12)

    # Per-agent snapshots; draw order per agent: TTFT, hit roll, hint count.
    snapshots: list[AgentSnapshot] = []
    for spec in PIPELINE_AGENTS:
        latency = _gaussian(spec["expected_ttft_ms"], 8.0, lo=15.0, hi=150.0)
        hit = random.random() < kv_hit
        snapshots.append(AgentSnapshot(
            name=spec["name"],
            role=spec["role"],
            ttft_ms=round(latency, 1),
            cache_hit=hit,
            thinking_mode=spec["role"] == "CoT",
            anchor_hints=random.randint(1, 5) if hit else 0,
            rotate_kv_bits=rotate_bits,
        ))

    return LiveMetrics(
        vram_pressure_pct=round(vram_pressure, 1),
        kv_cache_hit_rate=round(kv_hit, 3),
        anchor_pool_reuse_rate=round(anchor_reuse, 3),
        utilization_rho=round(rho, 4),
        is_stable=is_stable,
        lambda_req_per_sec=round(lam, 3),
        mu_req_per_sec=round(mu, 3),
        lambda_critical=round(_gaussian(12.0, 2.0, lo=5.0, hi=30.0), 3),
        stability_margin_pct=round(stability_margin, 2),
        minimum_stable_blocks=min_stable_blocks,
        agents=snapshots,
        rotate_kv_bits=rotate_bits,
        cla_vram_reduction_pct=round(cla_vram_reduction, 1),
        anchorpool_active_offsets=active_offsets,
    )
def get_real_metrics() -> LiveMetrics:
    """Gather real metrics when vLLM / PyRSMI are available.

    In V5 production this would call:
      - PyRSMI for VRAM pressure
      - vLLM / vllm_client.py for cache hit rates
      - QueueingController.compute_stability_state() for λ, μ, ρ
      - AnchorPool.get_stats() for active offsets
    Here we mirror the real API shape with fallback mock.
    """
    # Real collectors are not wired up yet — fall back to the synthetic
    # generator so the dashboard keeps rendering in "live" mode.
    return generate_mock_metrics()
| # --------------------------------------------------------------------------- | |
| # UI helpers | |
| # --------------------------------------------------------------------------- | |
def vram_gauge(value: float) -> None:
    """Render VRAM pressure as a metric card plus a progress bar.

    Args:
        value: VRAM pressure percentage; values above 100 are clamped
            for the progress bar.
    """
    # Pressure zone thresholds: <60 LOW, <80 MEDIUM, else HIGH.
    if value < 60:
        label = "LOW"
    elif value < 80:
        label = "MEDIUM"
    else:
        label = "HIGH"
    st.metric(label=f"VRAM Pressure [{label}]", value=f"{value:.1f}%")
    # BUGFIX: st.progress() has no `color` keyword (its signature is
    # progress(value, text=None)); passing one raises TypeError.  Surface
    # the zone through the supported `text` parameter instead.
    st.progress(min(value / 100.0, 1.0), text=label)
| # --------------------------------------------------------------------------- | |
| # Tab 1 — Live Metrics | |
| # --------------------------------------------------------------------------- | |
def render_tab_live_metrics(metrics: LiveMetrics) -> None:
    """Render Tab 1: VRAM/cache cards, queueing stats, quantization panel."""
    st.subheader("VRAM & Cache")
    vram_col, hit_col, reuse_col = st.columns(3)
    with vram_col:
        vram_gauge(metrics.vram_pressure_pct)
    with hit_col:
        st.metric("KV Cache Hit Rate", f"{metrics.kv_cache_hit_rate * 100:.1f}%")
    with reuse_col:
        st.metric("AnchorPool Reuse Rate", f"{metrics.anchor_pool_reuse_rate * 100:.1f}%")

    st.divider()
    st.subheader("QueueingController — TASK-001 (arXiv:2605.04595 ICML 2026)")
    lam_col, mu_col, rho_col, stable_col = st.columns(4)
    with lam_col:
        st.metric("λ (arrival rate)", f"{metrics.lambda_req_per_sec:.3f} req/s")
    with mu_col:
        st.metric("μ (service rate)", f"{metrics.mu_req_per_sec:.3f} req/s")
    with rho_col:
        st.metric("ρ (utilization)", f"{metrics.utilization_rho:.4f}")
    with stable_col:
        st.metric(
            "is_stable",
            str(metrics.is_stable),
            delta_color="normal" if metrics.is_stable else "off",
        )

    crit_col, margin_col, blocks_col = st.columns(3)
    with crit_col:
        st.metric("λ_critical", f"{metrics.lambda_critical:.3f} req/s")
    with margin_col:
        st.metric("stability_margin_pct", f"{metrics.stability_margin_pct:.2f}%")
    with blocks_col:
        st.metric("minimum_stable_blocks (INV-11)", f"{metrics.minimum_stable_blocks} blocks")

    badge = "🟢 STABLE" if metrics.is_stable else "🔴 UNSTABLE"
    st.info(f"**System Status:** {badge} | ρ={metrics.utilization_rho:.4f} | margin={metrics.stability_margin_pct:.1f}%")

    st.divider()
    st.subheader("KV Quantization — RotateKV")
    quant_col, cla_col, offsets_col = st.columns(3)
    # Human-readable names per RotateKV bit width; unknown widths fall back
    # to the raw "<n>bit" form.
    bit_names = {2: "INT2 (aggressive)", 4: "INT4", 8: "INT8", 16: "FP16 (full)"}
    with quant_col:
        st.metric("Active Quantization", bit_names.get(metrics.rotate_kv_bits, f"{metrics.rotate_kv_bits}bit"))
    with cla_col:
        st.metric("CLA VRAM Reduction", f"{metrics.cla_vram_reduction_pct:.1f}%")
    with offsets_col:
        st.metric("AnchorPool Active Offsets", f"{metrics.anchorpool_active_offsets}")
| # --------------------------------------------------------------------------- | |
| # Tab 2 — Pipeline View | |
| # --------------------------------------------------------------------------- | |
def render_tab_pipeline_view(metrics: LiveMetrics) -> None:
    """Render Tab 2: ASCII pipeline diagram, per-agent table, aggregates.

    Args:
        metrics: Current snapshot; reads the per-agent list plus the
            RotateKV/CLA/AnchorPool summary fields.
    """
    # Static 5-agent topology with live RotateKV / CLA / AnchorPool figures
    # interpolated into the footer rows.
    diagram = f"""
```
┌─────────────────────────────────────────────────────────────────────────┐
│                  ContextForge V5.0 — 5-Agent Pipeline                   │
├─────────────────────────────────────────────────────────────────────────┤
│                                                                         │
│  ┌───────────┐    ┌───────────┐    ┌───────────┐    ┌───────────┐      │
│  │ Retriever │───▶│ Reranker  │───▶│Summarizer │───▶│  Critic   │──▶   │
│  │  (fast)   │    │  (fast)   │    │  (fast)   │    │   (CoT)   │      │
│  └───────────┘    └───────────┘    └───────────┘    └───────────┘      │
│                                                                         │
│                        ┌───────────┐                                    │
│                        │ Responder │                                    │
│                        │   (CoT)   │                                    │
│                        └───────────┘                                    │
│                                                                         │
│  RotateKV: {metrics.rotate_kv_bits}bits
│  CLA VRAM reduction: {metrics.cla_vram_reduction_pct:.1f}%
│  AnchorPool active offsets: {metrics.anchorpool_active_offsets}
└─────────────────────────────────────────────────────────────────────────┘
```"""
    st.code(diagram.strip(), language=None)
    st.divider()

    st.subheader("Per-Agent Statistics")
    # Column order for the stats table.  (The previous unused duplicate
    # `header` list has been removed — `col_keys` is the single source.)
    col_keys = ["Agent", "Role", "TTFT (ms)", "Cache Hit", "Thinking", "Anchor Hints", "KV bits"]
    rows = [
        [
            a.name,
            a.role,
            f"{a.ttft_ms}",
            "✅" if a.cache_hit else "❌",
            "🔁 ON" if a.thinking_mode else "—",
            str(a.anchor_hints),
            str(a.rotate_kv_bits),
        ]
        for a in metrics.agents
    ]
    st.table({key: [row[i] for row in rows] for i, key in enumerate(col_keys)})

    # Aggregates across agents.  Assumes metrics.agents is non-empty, which
    # generate_mock_metrics() guarantees (5 fixed agents).
    avg_ttft = sum(a.ttft_ms for a in metrics.agents) / len(metrics.agents)
    hit_rate = sum(1 for a in metrics.agents if a.cache_hit) / len(metrics.agents)
    agg1, agg2, agg3 = st.columns(3)
    with agg1:
        st.metric("Average TTFT (ms)", f"{avg_ttft:.1f} ms")
    with agg2:
        st.metric("Cache Hit Rate", f"{hit_rate * 100:.0f}%")
    with agg3:
        st.metric("RotateKV Active Bits", f"{metrics.rotate_kv_bits}")

    st.divider()
    st.subheader("RotateKV Quantization Levels (QueueingController-driven)")
    # One column per supported bit width; ● marks the active level.
    for col, bits in zip(st.columns(4), [16, 8, 4, 2]):
        marker = "●" if bits == metrics.rotate_kv_bits else "○"
        col.write(f"{marker} **{bits}bit** — {'FP16' if bits == 16 else 'INT' + str(bits)}")
| # --------------------------------------------------------------------------- | |
| # Tab 3 — V4 vs Baseline | |
| # --------------------------------------------------------------------------- | |
def render_tab_v4_vs_baseline(selected_scenario: Optional[int]) -> None:
    """Render Tab 3: baseline-vs-V4 comparison for one scenario plus the
    all-scenario summary table.

    Falls back to the first scenario when the selector is None or names an
    unknown id.
    """
    scenario = SCENARIOS[0]
    if selected_scenario is not None:
        for candidate in SCENARIOS:
            if candidate.id == selected_scenario:
                scenario = candidate
                break
    st.subheader(f"Scenario: #{scenario.id} — {scenario.name}")

    saved_gb = scenario.vram_baseline_gb - scenario.vram_contextforge_gb
    st.bar_chart(
        {
            "Metric": ["Baseline (no sharing)", "ContextForge V4", "VRAM Saved"],
            "VRAM (GB)": [scenario.vram_baseline_gb, scenario.vram_contextforge_gb, saved_gb],
        },
        x="Metric",
        y="VRAM (GB)",
        horizontal=True,
    )

    vram_col, ttft_col, tput_col = st.columns(3)
    with vram_col:
        st.metric("VRAM Saved", f"{saved_gb:.1f} GB ({saved_gb/scenario.vram_baseline_gb*100:.0f}%)")
    with ttft_col:
        ttft_pct = (scenario.ttft_baseline_ms - scenario.ttft_contextforge_ms) / scenario.ttft_baseline_ms * 100
        st.metric("TTFT Improvement", f"{ttft_pct:.1f}%")
    with tput_col:
        tput_pct = (scenario.throughput_contextforge_tps / scenario.throughput_baseline_tps - 1) * 100
        st.metric("Throughput Gain", f"{tput_pct:.1f}%")

    st.divider()
    st.subheader("Detailed Comparison")
    st.table({
        "Metric": ["VRAM Peak (GB)", "TTFT (ms)", "Throughput (tok/s)"],
        "Baseline": [scenario.vram_baseline_gb, scenario.ttft_baseline_ms, scenario.throughput_baseline_tps],
        "ContextForge V4": [scenario.vram_contextforge_gb, scenario.ttft_contextforge_ms, scenario.throughput_contextforge_tps],
    })

    st.divider()
    st.subheader("All Scenarios")
    st.table({
        "ID": [s.id for s in SCENARIOS],
        "Scenario": [s.name for s in SCENARIOS],
        "Baseline VRAM (GB)": [s.vram_baseline_gb for s in SCENARIOS],
        "CF VRAM (GB)": [s.vram_contextforge_gb for s in SCENARIOS],
        "VRAM ↓%": [round((s.vram_baseline_gb - s.vram_contextforge_gb) / s.vram_baseline_gb * 100, 1) for s in SCENARIOS],
        "TTFT Δms": [round(s.ttft_baseline_ms - s.ttft_contextforge_ms, 1) for s in SCENARIOS],
        "TTFT ↓%": [round((s.ttft_baseline_ms - s.ttft_contextforge_ms) / s.ttft_baseline_ms * 100, 1) for s in SCENARIOS],
    })
| # --------------------------------------------------------------------------- | |
| # Tab 4 — Research | |
| # --------------------------------------------------------------------------- | |
def render_tab_research() -> None:
    """Render Tab 4: paper expanders, module→paper table, DevCloud specs."""
    st.subheader("Research Papers")
    for paper in PAPERS:
        arxiv_id = paper["arxiv"]
        # "—" marks entries without an arXiv id; link to "#" in that case.
        link = "#" if arxiv_id == "—" else f"https://arxiv.org/abs/{arxiv_id}"
        with st.expander(f"[{paper['venue']}] {paper['title']}", expanded=False):
            st.markdown(f"**arXiv:** [{arxiv_id}]({link})")
            st.markdown(f"**What we implemented:** {paper['what_we_implemented']}")

    st.divider()
    st.subheader("Module → Paper Mapping")
    modules, sources, impls = zip(*MODULE_MAPPING)
    st.table({
        "Module": list(modules),
        "Source Paper": list(sources),
        "Implementation": list(impls),
    })

    st.divider()
    st.markdown(DEVLOUD_SPECS)
| # --------------------------------------------------------------------------- | |
| # Main | |
| # --------------------------------------------------------------------------- | |
def main() -> None:
    """Dashboard entry point.

    Builds the sidebar (refresh rate, scenario selector, tab selector),
    shows the INV-14 simulation banner when mock mode is active, then
    dispatches to the selected tab renderer.
    """
    st.set_page_config(
        page_title="ContextForge V5.0 — BenchmarkDashboard",
        layout="wide",
        initial_sidebar_state="expanded",
    )
    # Sidebar configuration
    st.sidebar.title("ContextForge V5.0")
    st.sidebar.markdown("**Benchmark Dashboard** — Streamlit")
    st.sidebar.divider()
    use_mock = is_mock_mode()
    refresh_rate = st.sidebar.slider("Refresh rate (seconds)", 1, 30, 5)
    scenario_selector = st.sidebar.selectbox(
        "Benchmark Scenario (Tab 3)",
        options=[None] + [s.id for s in SCENARIOS],
        format_func=lambda x: "All Scenarios" if x is None else f"#{x} {next(s.name for s in SCENARIOS if s.id == x)}",
    )
    selected_tab = st.sidebar.selectbox("Active Tab", [
        "1️⃣ Live Metrics",
        "2️⃣ Pipeline View",
        "3️⃣ V4 vs Baseline",
        "4️⃣ Research",
    ])
    # Each label starts with its tab digit, so the first character maps
    # directly to a zero-based tab index.
    tab_idx = int(selected_tab[0]) - 1
    st.sidebar.divider()
    st.sidebar.caption(f"Last refresh: {datetime.now().strftime('%H:%M:%S')}")

    # ── SIMULATION MODE banner (INV-14) ───────────────────────────────────
    if use_mock:
        st.error(
            "⚠️ **SIMULATION MODE** — Data shown below is synthetically generated. "
            "Do NOT present as real hardware results. "
            "Run against AMD MI300X for validated numbers.",
            icon="🚨",
        )
    else:
        st.success("🟢 **LIVE MODE** — Connected to real vLLM / PyRSMI endpoints.")

    st.title("ContextForge V5.0 — BenchmarkDashboard")

    if tab_idx == 0:
        metrics = generate_mock_metrics() if use_mock else get_real_metrics()
        render_tab_live_metrics(metrics)
        # BUGFIX: the previous version spawned a daemon thread that slept and
        # then called st.rerun().  st.rerun() must run on the script thread —
        # a worker thread has no ScriptRunContext, so the rerun never fired
        # and Tab 1 never auto-refreshed.  Sleeping on the script thread
        # after rendering and then rerunning is the supported pattern.
        if refresh_rate > 0:  # slider minimum is 1, so this always triggers
            time.sleep(refresh_rate)
            st.rerun()
    elif tab_idx == 1:
        metrics = generate_mock_metrics() if use_mock else get_real_metrics()
        render_tab_pipeline_view(metrics)
    elif tab_idx == 2:
        render_tab_v4_vs_baseline(scenario_selector)
    elif tab_idx == 3:
        render_tab_research()


if __name__ == "__main__":
    main()