bledden committed on
Commit
c75f6b6
·
verified ·
1 Parent(s): 1741387

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

WORKDIR /app

# Install CPU-only torch from the PyTorch wheel index first.
# BUGFIX: pip's --index-url applies to the WHOLE `pip install` invocation,
# not just the package it appears next to. In the original single command,
# every dependency (openenv-core, fastapi, mcp, ...) would have been resolved
# against https://download.pytorch.org/whl/cpu, which does not host them,
# breaking the build. Splitting into two RUNs keeps torch on the CPU index
# and everything else on PyPI.
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu

# Remaining dependencies come from the default index (PyPI).
RUN pip install --no-cache-dir \
        openenv-core \
        fastapi \
        uvicorn \
        mcp \
        transformers \
        accelerate \
        sentencepiece

# Pre-download model weights at build time (faster cold start)
RUN python -c "from transformers import AutoModelForCausalLM, AutoTokenizer; \
    AutoTokenizer.from_pretrained('Qwen/Qwen2.5-1.5B-Instruct'); \
    AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-1.5B-Instruct')"

# Copy app code
COPY server/ server/
COPY models.py .
COPY hf_space/serve.py .

# Copy dashboard
COPY dashboard.html static/index.html

EXPOSE 7860

CMD ["python", "serve.py"]
models.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data models for the Stack Doctor Environment.
3
+
4
+ An overseer LLM diagnoses sick inference stacks by probing subsystems,
5
+ reconciling conflicting specialist-agent reports, and selecting the
6
+ minimal correct fix.
7
+ """
8
+
9
+ from pydantic import Field
10
+
11
+ from openenv.core.env_server.types import Action, Observation
12
+
13
+
14
class StackDoctorAction(Action):
    """Agent action — a JSON message selecting one of 4 action types.

    The environment parses ``message`` with ``json.loads`` on every step and
    rejects anything that is not one of the four documented shapes, so the
    schema below (also surfaced to agents via the Field description) is the
    full action contract.
    """

    # Raw JSON action string. The description doubles as agent-facing
    # documentation in the generated schema — do not edit it casually.
    message: str = Field(
        ...,
        description=(
            'JSON action. One of:\n'
            ' {"type":"inspect","target":"logs|config|snippet|metrics"}\n'
            ' {"type":"ask_specialist","specialist":"runtime|dispatch|kernel|loader"}\n'
            ' {"type":"apply_fix","fix":"relax_arch_check|add_whitelist_entry|fix_runtime_path|switch_backend|update_model_config|fix_weight_mapping|tune_memory_config|fix_quantization|fix_comm_config|update_driver_config"}\n'
            ' {"type":"submit","root_cause":"...","fix":"...","justification":"..."}'
        ),
    )
27
+
28
+
29
class StackDoctorObservation(Observation):
    """What the agent sees after each action.

    The static incident context (ticket, hardware, model, backend) is filled
    on reset; per-step fields like ``output`` carry the feedback for the most
    recent action. After a terminal step the environment clears the excerpt,
    snippet, and specialist fields.
    """

    # Natural-language result of the last action (inspect output, specialist
    # reply, fix result, or submission verdict).
    output: str = Field(default="", description="Natural-language feedback")
    incident_ticket: str = Field(default="", description="The incident description")
    hardware: str = Field(default="", description="Hardware identifier")
    model_name: str = Field(default="", description="Model being served")
    backend: str = Field(default="", description="Inference backend in use")
    log_excerpt: str = Field(default="", description="Log snippet")
    code_snippet: str = Field(default="", description="Config or code snippet")
    # Keyed by specialist name; each value is a dict with "opinion" and
    # "confidence" (see the environment's reset()).
    specialist_opinions: dict = Field(default_factory=dict, description="Specialist name -> {opinion, confidence}")
    steps_remaining: int = Field(default=6, description="Steps left in episode")
    fix_used: bool = Field(default=False, description="Whether apply_fix has been used")
serve.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unified server for HF Spaces: environment + inference + dashboard on port 7860."""
2
+
3
+ import json
4
+ import os
5
+ import sys
6
+ import time
7
+ import threading
8
+
9
+ sys.path.insert(0, "/app")
10
+
11
+ from fastapi import FastAPI, Request
12
+ from fastapi.middleware.cors import CORSMiddleware
13
+ from fastapi.responses import FileResponse, JSONResponse
14
+ import uvicorn
15
+
16
+ from server.app import app as env_app
17
+
18
+ env_app.add_middleware(
19
+ CORSMiddleware,
20
+ allow_origins=["*"],
21
+ allow_methods=["*"],
22
+ allow_headers=["*"],
23
+ )
24
+
25
+ # Model state (loaded in background)
26
+ MODEL_STATE = {"model": None, "tokenizer": None, "ready": False, "error": None}
27
+
28
+ UNTRAINED_SYSTEM = (
29
+ "You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\n"
30
+ "You receive an incident ticket with hardware/model/backend context, log excerpts, and specialist opinions.\n"
31
+ "Some specialists may be wrong. Output a JSON array of actions:\n"
32
+ ' {"type":"inspect","target":"logs|config|snippet|metrics"}\n'
33
+ ' {"type":"ask_specialist","specialist":"runtime|dispatch|kernel|loader"}\n'
34
+ ' {"type":"apply_fix","fix":"<fix_name>"}\n'
35
+ ' {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<why>"}'
36
+ )
37
+
38
+ TRAINED_SYSTEM = (
39
+ "You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\n"
40
+ "You are methodical: first inspect logs and config, then query specialists to cross-verify (some lie), then apply a fix and submit.\n\n"
41
+ "Available actions (output as a JSON array):\n"
42
+ ' {"type":"inspect","target":"logs"} or "config" or "snippet" or "metrics"\n'
43
+ ' {"type":"ask_specialist","specialist":"runtime"} or "dispatch" or "kernel" or "loader"\n'
44
+ ' {"type":"apply_fix","fix":"<name>"} -- available fixes: add_whitelist_entry, fix_comm_config, fix_quantization, fix_runtime_path, fix_weight_mapping, relax_arch_check, switch_backend, tune_memory_config, update_driver_config, update_model_config\n'
45
+ ' {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<detailed reasoning>"}\n\n'
46
+ "Available root causes: arch_guard, backend_selector, backend_whitelist, distributed_comm, driver_compat, memory_oom, model_config, quantization_error, runtime_loader, weight_layout\n\n"
47
+ "IMPORTANT: Pick ONE target per inspect, ONE specialist per query. Investigate before submitting. Give a detailed justification.\n\n"
48
+ "Example output:\n"
49
+ '[{"type":"inspect","target":"logs"},{"type":"inspect","target":"config"},{"type":"ask_specialist","specialist":"kernel"},'
50
+ '{"type":"apply_fix","fix":"relax_arch_check"},'
51
+ '{"type":"submit","root_cause":"arch_guard","fix":"relax_arch_check","justification":"Logs show architecture check failure for SM90. Config confirms guard enabled. Kernel specialist confirmed not a kernel issue."}]'
52
+ )
53
+
54
+
55
def load_model_background():
    """Load Qwen 1.5B in a background thread so the server starts fast.

    On success, publishes the model/tokenizer into MODEL_STATE and flips
    ``ready``; on any failure, records the message in MODEL_STATE["error"]
    so /generate can report it instead of crashing the server.
    """
    try:
        print("[Model] Loading Qwen2.5-1.5B-Instruct (CPU)...")
        started = time.time()
        # Imported lazily: transformers/torch are heavy and only needed here.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch

        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map="cpu",
        )

        # Publish model first, then flip the ready flag last.
        MODEL_STATE.update(model=model, tokenizer=tokenizer)
        MODEL_STATE["ready"] = True
        print(f"[Model] Loaded in {time.time()-started:.1f}s")
    except Exception as ex:
        MODEL_STATE["error"] = str(ex)
        print(f"[Model] Failed to load: {ex}")
78
+
79
+
80
+ threading.Thread(target=load_model_background, daemon=True).start()
81
+
82
+
83
+ @env_app.post("/generate")
84
+ async def generate_endpoint(request: Request):
85
+ body = await request.json()
86
+ prompt_text = body.get("prompt", "")
87
+ max_tokens = body.get("max_tokens", 512)
88
+ mode = body.get("mode", "untrained")
89
+
90
+ if not MODEL_STATE["ready"]:
91
+ if MODEL_STATE["error"]:
92
+ return JSONResponse({"error": MODEL_STATE["error"]}, status_code=500)
93
+ return JSONResponse({"error": "Model still loading, please wait..."}, status_code=503)
94
+
95
+ model = MODEL_STATE["model"]
96
+ tokenizer = MODEL_STATE["tokenizer"]
97
+ system = TRAINED_SYSTEM if mode == "trained" else UNTRAINED_SYSTEM
98
+
99
+ messages = [
100
+ {"role": "system", "content": system},
101
+ {"role": "user", "content": prompt_text},
102
+ ]
103
+
104
+ import torch
105
+
106
+ text_input = tokenizer.apply_chat_template(
107
+ messages, tokenize=False, add_generation_prompt=True
108
+ )
109
+ inputs = tokenizer(text_input, return_tensors="pt")
110
+
111
+ t0 = time.time()
112
+ with torch.no_grad():
113
+ outputs = model.generate(
114
+ **inputs,
115
+ max_new_tokens=max_tokens,
116
+ do_sample=True,
117
+ temperature=0.7,
118
+ top_p=0.9,
119
+ pad_token_id=tokenizer.eos_token_id,
120
+ )
121
+
122
+ new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
123
+ text = tokenizer.decode(new_tokens, skip_special_tokens=True)
124
+ gen_time = time.time() - t0
125
+ print(f"[Model] Generated {len(text)} chars in {gen_time:.1f}s (mode={mode})")
126
+ return JSONResponse({"text": text, "gen_time": gen_time})
127
+
128
+
129
+ @env_app.get("/model_status")
130
+ async def model_status():
131
+ return JSONResponse({
132
+ "ready": MODEL_STATE["ready"],
133
+ "error": MODEL_STATE["error"],
134
+ })
135
+
136
+
137
+ @env_app.get("/", include_in_schema=False)
138
+ async def root():
139
+ return FileResponse("/app/static/index.html")
140
+
141
+
142
+ if __name__ == "__main__":
143
+ uvicorn.run(env_app, host="0.0.0.0", port=7860)
server/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Stack Doctor environment server components."""
2
+
3
+ from .stack_doctor_environment import StackDoctorEnvironment
4
+
5
+ __all__ = ["StackDoctorEnvironment"]
6
+
7
+
8
def get_mcp_environment():
    """Lazy import of MCP environment (requires fastapi/uvicorn).

    Returns the StackDoctorMCPEnvironment class (not an instance). Imported
    on demand so that `import server` still works where the MCP/HTTP server
    dependencies are not installed.
    """
    from .stack_doctor_mcp import StackDoctorMCPEnvironment
    return StackDoctorMCPEnvironment
server/app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for the Stack Doctor Environment.
3
+
4
+ Exposes both:
5
+ - WebSocket API (reset/step/state) for RL training
6
+ - MCP API (tools/list, tools/call) for agent interaction
7
+
8
+ Usage:
9
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
10
+ """
11
+
12
+ try:
13
+ from openenv.core.env_server.http_server import create_app
14
+ except Exception as e:
15
+ raise ImportError(
16
+ "openenv is required. Install with: uv sync"
17
+ ) from e
18
+
19
+ from models import StackDoctorAction, StackDoctorObservation
20
+ from .stack_doctor_mcp import StackDoctorMCPEnvironment
21
+
22
# Build the FastAPI app: create_app wires the environment class plus the
# action/observation models into the WebSocket (reset/step/state) and MCP
# endpoints under the "stack_doctor" name.
app = create_app(
    StackDoctorMCPEnvironment,
    StackDoctorAction,
    StackDoctorObservation,
    env_name="stack_doctor",
    max_concurrent_envs=4,  # cap on simultaneously-open environment sessions
)
29
+
30
+
31
+ def main(host: str = "0.0.0.0", port: int = 8000):
32
+ import uvicorn
33
+ uvicorn.run(app, host=host, port=port)
34
+
35
+
36
+ if __name__ == "__main__":
37
+ import argparse
38
+ parser = argparse.ArgumentParser()
39
+ parser.add_argument("--port", type=int, default=8000)
40
+ args = parser.parse_args()
41
+ main(port=args.port)
server/baselines.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Oracle, heuristic, and random baselines for Stack Doctor.
3
+
4
+ Used to validate the reward function: random < heuristic < oracle must hold.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import random
11
+
12
+ from .scenarios import (
13
+ ROOT_CAUSE_TO_FIX,
14
+ ROOT_CAUSES,
15
+ FIXES,
16
+ SPECIALISTS,
17
+ Scenario,
18
+ SCENARIOS,
19
+ TRAIN_SCENARIOS,
20
+ EVAL_SCENARIOS,
21
+ )
22
+
23
+
24
def oracle_policy(scenario: Scenario) -> list[dict]:
    """Perfect policy: submit correct answer in 1 step."""
    submit_action = {
        "type": "submit",
        "root_cause": scenario.root_cause,
        "fix": scenario.correct_fix,
        "justification": f"Root cause is {scenario.root_cause}, applying the correct fix.",
    }
    return [submit_action]
34
+
35
+
36
def heuristic_policy(scenario: Scenario) -> list[dict]:
    """
    Reasonable heuristic: inspect logs, ask the highest-confidence specialist,
    then submit based on clues.

    Uses keyword matching on specialist opinions and logs to guess root cause.
    """
    # Highest-confidence specialist drives both the query and the clue text.
    spec_name, spec_opinion = max(
        scenario.specialist_opinions.items(),
        key=lambda item: item[1].confidence,
    )

    # Combine all available clues and guess a root cause from keywords.
    clue_text = (
        scenario.incident_ticket
        + " " + scenario.initial_log
        + " " + spec_opinion.opinion
    ).lower()
    guess = _keyword_guess(clue_text)
    fix = ROOT_CAUSE_TO_FIX[guess]

    # Fixed 4-step plan: inspect -> consult -> fix -> submit.
    return [
        {"type": "inspect", "target": "logs"},
        {"type": "ask_specialist", "specialist": spec_name},
        {"type": "apply_fix", "fix": fix},
        {
            "type": "submit",
            "root_cause": guess,
            "fix": fix,
            "justification": f"Keyword analysis of logs and specialist opinions points to {guess}.",
        },
    ]
76
+
77
+
78
def random_policy(scenario: Scenario) -> list[dict]:
    """Random policy: random actions, random submit."""
    # NOTE: the sequence of random.* calls matches the original exactly so
    # seeded runs remain reproducible.
    plan: list[dict] = []
    total_steps = random.randint(1, 5)

    # All but the final step are random investigations.
    for _ in range(total_steps - 1):
        kind = random.choice(["inspect", "ask_specialist"])
        if kind == "inspect":
            target = random.choice(["logs", "config", "snippet", "metrics"])
            plan.append({"type": "inspect", "target": target})
        else:
            plan.append({
                "type": "ask_specialist",
                "specialist": random.choice(SPECIALISTS),
            })

    # Final: random submit
    cause = random.choice(ROOT_CAUSES)
    plan.append({
        "type": "submit",
        "root_cause": cause,
        "fix": ROOT_CAUSE_TO_FIX[cause],
    })

    return plan
105
+
106
+
107
# Keyword table: root cause -> substrings whose presence in the text votes
# for that cause. All entries must be lowercase, because the only caller
# (heuristic_policy) lowercases the text before matching.
_ROOT_CAUSE_KEYWORDS = {
    "arch_guard": ("arch", "architecture", "sm_12", "sm_120", "sm_121", "supported_arch", "capability", "is_supported"),
    "backend_whitelist": ("whitelist", "supported_gpu", "not in", "marlin", "awq", "gpu name"),
    # BUGFIX: was "hipError" — against lowercased input that keyword could
    # never match; lowercased to "hiperror".
    "runtime_loader": ("runtime", "libcuda", "ld_library", "cuda_home", "symlink", "shared object", "rocm_path", "hiperror"),
    "backend_selector": ("backend", "selector", "xformers", "flash_attn", "latency", "slow", "e4m3fn", "fp8 format"),
    "model_config": ("config", "num_expert", "shape mismatch", "rope", "checkpoint", "config.json"),
    "weight_layout": ("weight", "mapping", "swap", "gate_proj", "up_proj", "convert", "layout", "qkv"),
    "memory_oom": ("out of memory", "oom", "kv_cache", "memory", "max_model_len", "batch size", "vram"),
    "quantization_error": ("quantiz", "fp8", "int4", "nf4", "calibrat", "precision", "scale factor", "gptq"),
    "distributed_comm": ("nccl", "tensor parallel", "all_reduce", "rdma", "pipeline parallel", "collective", "rank"),
    "driver_compat": ("driver", "cudnn", "toolkit", "nvcc", "cuda version", "driver version", "libcudnn"),
}


def _keyword_guess(text: str) -> str:
    """Guess root cause from keyword presence in text.

    Each root cause scores one point per keyword substring found in *text*
    (expected to be lowercased by the caller); the highest-scoring cause wins.
    Ties resolve to the earliest entry in ROOT_CAUSES, exactly as the original
    per-cause scoring blocks did.
    """
    # Initialize from ROOT_CAUSES so the tie-break order is canonical.
    scores = {rc: 0 for rc in ROOT_CAUSES}
    for root_cause, keywords in _ROOT_CAUSE_KEYWORDS.items():
        scores[root_cause] += sum(1 for kw in keywords if kw in text)
    return max(scores, key=scores.get)
162
+
163
+
164
def evaluate_policy(policy_fn, scenarios: list[Scenario], n_runs: int = 1) -> dict:
    """
    Run a policy across scenarios and compute metrics.

    policy_fn maps a Scenario to a list of action dicts; each scenario is
    replayed n_runs times in a fresh environment.

    Returns dict with:
    - rc_accuracy: fraction of correct root cause submissions
    - fix_accuracy: fraction of correct fix submissions
    - avg_steps: average steps to resolution
    - avg_reward: average cumulative reward
    """
    # Local imports — presumably to avoid a circular import with the
    # environment module at package load time; TODO confirm.
    from .stack_doctor_environment import StackDoctorEnvironment
    from models import StackDoctorAction

    total_rc_correct = 0
    total_fix_correct = 0
    total_steps = 0
    total_reward = 0.0
    total_episodes = 0

    for _ in range(n_runs):
        for scenario in scenarios:
            env = StackDoctorEnvironment()
            env.reset(scenario_id=scenario.id)

            actions = policy_fn(scenario)
            cumulative = 0.0
            steps = 0

            # Feed the scripted actions until the environment says done.
            for action_dict in actions:
                obs = env.step(StackDoctorAction(message=json.dumps(action_dict)))
                cumulative += obs.reward
                steps += 1
                if obs.done:
                    break

            # Check if submit happened
            # NOTE(review): this inspects the policy's INTENDED final action,
            # not what actually executed — if the episode terminated before the
            # last action was reached, accuracy is still scored from it.
            last_action = actions[-1] if actions else {}
            if last_action.get("type") == "submit":
                if last_action["root_cause"] == scenario.root_cause:
                    total_rc_correct += 1
                if last_action["fix"] == scenario.correct_fix:
                    total_fix_correct += 1

            total_steps += steps
            total_reward += cumulative
            total_episodes += 1

    # Guard against an empty scenario list to avoid ZeroDivisionError.
    return {
        "rc_accuracy": total_rc_correct / total_episodes if total_episodes else 0,
        "fix_accuracy": total_fix_correct / total_episodes if total_episodes else 0,
        "avg_steps": total_steps / total_episodes if total_episodes else 0,
        "avg_reward": total_reward / total_episodes if total_episodes else 0,
        "n_episodes": total_episodes,
    }
server/scenarios.py ADDED
The diff for this file is too large to render. See raw diff
 
server/stack_doctor_environment.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stack Doctor Environment.
3
+
4
+ An overseer LLM diagnoses sick inference stacks by probing subsystems,
5
+ reconciling conflicting specialist-agent reports, and selecting the
6
+ minimal correct fix.
7
+
8
+ Inspired by real SM12x enablement bugs across vLLM, FlashInfer, SGLang,
9
+ CUTLASS, and Flash-Attention.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from uuid import uuid4
16
+
17
+ from openenv.core.env_server.interfaces import Environment
18
+ from openenv.core.env_server.types import State
19
+
20
+ from models import StackDoctorAction, StackDoctorObservation
21
+ from .scenarios import (
22
+ ROOT_CAUSE_TO_FIX,
23
+ FIX_TO_ROOT_CAUSE,
24
+ ROOT_CAUSES,
25
+ FIXES,
26
+ SPECIALISTS,
27
+ Scenario,
28
+ SpecialistOpinion,
29
+ get_scenario,
30
+ randomize_specialist_opinions,
31
+ )
32
+
33
MAX_STEPS = 6  # hard cap on agent actions per episode

# Valid argument vocabularies; sets give O(1) validation in the handlers.
INSPECT_TARGETS = {"logs", "config", "snippet", "metrics"}
VALID_FIXES = set(FIXES)
VALID_ROOT_CAUSES = set(ROOT_CAUSES)
38
+
39
+
40
class EpisodeState:
    """Internal mutable episode state (not exposed to agent)."""

    def __init__(
        self,
        scenario: Scenario,
        specialist_opinions: dict[str, SpecialistOpinion] | None = None,
    ):
        self.scenario = scenario
        # Prefer the per-episode randomized opinions; any falsy value
        # (None or empty dict) falls back to the scenario defaults.
        if specialist_opinions:
            self.specialist_opinions = specialist_opinions
        else:
            self.specialist_opinions = scenario.specialist_opinions
        self.step_count = 0
        self.done = False
        self.fix_applied = False
        self.fix_was_correct: bool | None = None
        self.cumulative_reward = 0.0
        self.actions_taken: list[dict] = []
57
+
58
+
59
class StackDoctorEnvironment(Environment):
    """
    Stack Doctor: incident-response RL environment for
    inference-stack diagnosis.

    Episode shape: reset() picks a scenario and returns the incident briefing;
    the agent then has MAX_STEPS (6) JSON actions — inspect / ask_specialist
    (-0.25 each), one apply_fix (+3 correct / -2 wrong), and a terminal submit
    (+8/-4 per component, +2 efficiency bonus at <= 4 steps, +1 for a
    justification of >= 10 characters). Running out of steps without
    submitting costs -4.
    """

    # Each session holds its own instance, so concurrent envs are safe.
    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        # Public protocol state (episode id + step counter) and the private
        # per-episode bookkeeping; no episode until reset() is called.
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._episode: EpisodeState | None = None

    def reset(self, seed=None, episode_id=None, **kwargs) -> StackDoctorObservation:
        # kwargs: scenario_id picks a specific scenario, split selects the
        # train/eval pool (default "train"). `seed` is accepted for protocol
        # compatibility but unused here — TODO confirm intended.
        scenario_id = kwargs.get("scenario_id")
        split = kwargs.get("split", "train")
        scenario = get_scenario(scenario_id, split=split)

        self._state = State(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
        )
        # Specialist opinions are re-randomized every episode so the agent
        # cannot memorize which specialist is reliable for a scenario.
        randomized_opinions = randomize_specialist_opinions(scenario)
        self._episode = EpisodeState(scenario, specialist_opinions=randomized_opinions)

        # Flatten opinion objects into plain dicts for the observation.
        specialist_obs = {}
        for name, op in randomized_opinions.items():
            specialist_obs[name] = {
                "opinion": op.opinion,
                "confidence": op.confidence,
            }

        return StackDoctorObservation(
            output=(
                "STACK DOCTOR — New incident assigned.\n"
                "Diagnose the root cause, optionally apply a fix, then submit your diagnosis.\n"
                "You have 6 steps. Use them wisely.\n\n"
                "Available actions (send as JSON):\n"
                ' {"type":"inspect","target":"logs|config|snippet|metrics"}\n'
                ' {"type":"ask_specialist","specialist":"runtime|dispatch|kernel|loader"}\n'
                ' {"type":"apply_fix","fix":"relax_arch_check|add_whitelist_entry|fix_runtime_path|switch_backend|update_model_config|fix_weight_mapping|tune_memory_config|fix_quantization|fix_comm_config|update_driver_config"}\n'
                ' {"type":"submit","root_cause":"...","fix":"...","justification":"reason for diagnosis"}\n'
            ),
            incident_ticket=scenario.incident_ticket,
            hardware=scenario.hardware,
            model_name=scenario.model_name,
            backend=scenario.backend,
            log_excerpt=scenario.initial_log,
            code_snippet=scenario.initial_snippet,
            specialist_opinions=specialist_obs,
            steps_remaining=MAX_STEPS,
            fix_used=False,
            done=False,
            reward=0.0,
        )

    def step(self, action: StackDoctorAction, **kwargs) -> StackDoctorObservation:
        """Parse the JSON action message and dispatch to the typed handler."""
        ep = self._episode
        # Stepping a finished (or never-started) episode is a no-op terminal.
        if ep is None or ep.done:
            return self._terminal_obs("Episode is over. Call reset() to start a new incident.", 0.0)

        # Step is counted BEFORE validation: malformed actions still burn a step.
        self._state.step_count += 1
        ep.step_count += 1

        try:
            parsed = json.loads(action.message)
        except (json.JSONDecodeError, TypeError):
            # NOTE(review): if message were not a str the slice below would
            # itself raise; assumed pydantic guarantees str upstream.
            return self._handle_invalid(ep, f"Invalid JSON: {action.message[:200]}")

        action_type = parsed.get("type")

        if action_type == "inspect":
            return self._handle_inspect(ep, parsed)
        elif action_type == "ask_specialist":
            return self._handle_ask_specialist(ep, parsed)
        elif action_type == "apply_fix":
            return self._handle_apply_fix(ep, parsed)
        elif action_type == "submit":
            return self._handle_submit(ep, parsed)
        else:
            return self._handle_invalid(ep, f"Unknown action type: {action_type}")

    @property
    def state(self) -> State:
        # Protocol accessor for the server layer.
        return self._state

    def _handle_inspect(self, ep: EpisodeState, parsed: dict) -> StackDoctorObservation:
        """Return the scenario's canned artifact for the requested target (-0.25)."""
        target = parsed.get("target")
        if target not in INSPECT_TARGETS:
            return self._handle_invalid(ep, f"Invalid inspect target: {target}. Use: {INSPECT_TARGETS}")

        reward = -0.25  # small information-gathering cost
        ep.cumulative_reward += reward
        ep.actions_taken.append({"type": "inspect", "target": target})

        ir = ep.scenario.inspect_results
        result_map = {"logs": ir.logs, "config": ir.config, "snippet": ir.snippet, "metrics": ir.metrics}

        return self._step_obs(ep, output=f"[INSPECT {target.upper()}]\n{result_map[target]}", reward=reward)

    def _handle_ask_specialist(self, ep: EpisodeState, parsed: dict) -> StackDoctorObservation:
        """Return the named specialist's follow-up statement (-0.25)."""
        specialist = parsed.get("specialist")
        if specialist not in SPECIALISTS:
            return self._handle_invalid(ep, f"Invalid specialist: {specialist}. Use: {SPECIALISTS}")

        reward = -0.25
        ep.cumulative_reward += reward
        ep.actions_taken.append({"type": "ask_specialist", "specialist": specialist})

        # Scenarios may omit a follow-up for some specialists; default text then.
        followup = ep.scenario.specialist_followups.get(specialist, "No additional information.")
        return self._step_obs(ep, output=f"[SPECIALIST: {specialist.upper()}]\n{followup}", reward=reward)

    def _handle_apply_fix(self, ep: EpisodeState, parsed: dict) -> StackDoctorObservation:
        """Apply the one allowed fix per episode (+3 correct / -2 wrong)."""
        # Only one fix per episode; a second attempt is an invalid action.
        if ep.fix_applied:
            return self._handle_invalid(ep, "apply_fix already used this episode. You can only apply one fix.")

        fix = parsed.get("fix")
        if fix not in VALID_FIXES:
            return self._handle_invalid(ep, f"Invalid fix: {fix}. Use one of: {sorted(VALID_FIXES)}")

        ep.fix_applied = True
        is_correct = fix == ep.scenario.correct_fix
        ep.fix_was_correct = is_correct

        reward = 3.0 if is_correct else -2.0
        ep.cumulative_reward += reward
        ep.actions_taken.append({"type": "apply_fix", "fix": fix, "correct": is_correct})

        # Feedback leaks whether the fix worked — the agent can use this
        # signal before submitting.
        if is_correct:
            output = f"[FIX APPLIED: {fix}] Fix applied successfully. Systems recovering. Now submit your diagnosis."
        else:
            output = f"[FIX APPLIED: {fix}] Fix applied but the issue persists. Consider your diagnosis carefully."

        return self._step_obs(ep, output=output, reward=reward)

    def _handle_submit(self, ep: EpisodeState, parsed: dict) -> StackDoctorObservation:
        """Score the final diagnosis and terminate the episode."""
        root_cause = parsed.get("root_cause")
        fix = parsed.get("fix")
        justification = parsed.get("justification", "")

        # Vocabulary validation happens BEFORE the episode ends, so a typo'd
        # submit costs a step (-2) but does not terminate.
        if root_cause not in VALID_ROOT_CAUSES:
            return self._handle_invalid(ep, f"Invalid root_cause: {root_cause}. Use one of: {sorted(VALID_ROOT_CAUSES)}")
        if fix not in VALID_FIXES:
            return self._handle_invalid(ep, f"Invalid fix: {fix}. Use one of: {sorted(VALID_FIXES)}")

        ep.done = True
        correct_rc = ep.scenario.root_cause
        correct_fix = ep.scenario.correct_fix
        rc_correct = root_cause == correct_rc
        fix_correct = fix == correct_fix
        # Minimal effort bar for the justification bonus: 10 non-space chars.
        has_justification = len(justification.strip()) >= 10

        # Reward: +-8/4 per component, +2 efficiency, +1 justification.
        reward = 0.0
        reward += 8.0 if rc_correct else -4.0
        reward += 8.0 if fix_correct else -4.0
        if (rc_correct and fix_correct) and ep.step_count <= 4:
            reward += 2.0
        if has_justification:
            reward += 1.0

        ep.cumulative_reward += reward
        ep.actions_taken.append({
            "type": "submit", "root_cause": root_cause, "fix": fix,
            "justification": justification,
            "rc_correct": rc_correct, "fix_correct": fix_correct,
            "has_justification": has_justification,
        })

        # Human-readable scorecard for the terminal observation.
        output_lines = ["[DIAGNOSIS SUBMITTED]"]
        output_lines.append(f" Root cause: {root_cause} — {'CORRECT' if rc_correct else 'WRONG (was: ' + correct_rc + ')'}")
        output_lines.append(f" Fix: {fix} — {'CORRECT' if fix_correct else 'WRONG (was: ' + correct_fix + ')'}")
        if has_justification:
            output_lines.append(f" Justification: {justification.strip()}")
            output_lines.append(" JUSTIFICATION BONUS: +1")
        else:
            output_lines.append(" No justification provided (missed +1 bonus)")
        output_lines.append(f" Steps used: {ep.step_count}/{MAX_STEPS}")
        if rc_correct and fix_correct and ep.step_count <= 4:
            output_lines.append(" EFFICIENCY BONUS: +2 (solved in <= 4 steps)")
        output_lines.append(f" Episode reward: {ep.cumulative_reward:.2f}")

        return self._terminal_obs("\n".join(output_lines), reward)

    def _handle_invalid(self, ep: EpisodeState, msg: str) -> StackDoctorObservation:
        """Penalize a malformed/illegal action (-2); ends episode at step cap."""
        reward = -2.0
        ep.cumulative_reward += reward
        ep.actions_taken.append({"type": "invalid", "message": msg})

        # An invalid action on the final step ends the episode immediately.
        if ep.step_count >= MAX_STEPS:
            ep.done = True
            return self._terminal_obs(f"[INVALID ACTION] {msg}\n[EPISODE OVER] Max steps reached. Auto-fail.", reward)

        return self._step_obs(ep, output=f"[INVALID ACTION] {msg}", reward=reward)

    def _step_obs(self, ep: EpisodeState, output: str, reward: float) -> StackDoctorObservation:
        """Build a mid-episode observation; applies the -4 timeout auto-fail."""
        remaining = MAX_STEPS - ep.step_count
        # Reaching the cap without submitting terminates with an extra -4,
        # folded into both this step's reward and the cumulative total.
        if remaining <= 0 and not ep.done:
            ep.done = True
            timeout_penalty = -4.0
            reward += timeout_penalty
            ep.cumulative_reward += timeout_penalty
            output += "\n\n[EPISODE OVER] Max steps reached without submission. Auto-fail. Reward: -4"

        return StackDoctorObservation(
            output=output, incident_ticket=ep.scenario.incident_ticket,
            hardware=ep.scenario.hardware, model_name=ep.scenario.model_name,
            backend=ep.scenario.backend, log_excerpt="", code_snippet="",
            specialist_opinions={}, steps_remaining=remaining, fix_used=ep.fix_applied,
            done=ep.done, reward=reward,
            metadata={"cumulative_reward": ep.cumulative_reward, "step": ep.step_count, "scenario_id": ep.scenario.id},
        )

    def _terminal_obs(self, output: str, reward: float) -> StackDoctorObservation:
        """Build a terminal observation; tolerates a missing episode (pre-reset)."""
        ep = self._episode
        return StackDoctorObservation(
            output=output, incident_ticket=ep.scenario.incident_ticket if ep else "",
            hardware=ep.scenario.hardware if ep else "", model_name=ep.scenario.model_name if ep else "",
            backend=ep.scenario.backend if ep else "", log_excerpt="", code_snippet="",
            specialist_opinions={}, steps_remaining=0, fix_used=ep.fix_applied if ep else False,
            done=True, reward=reward,
            metadata={"cumulative_reward": ep.cumulative_reward if ep else 0.0, "step": ep.step_count if ep else 0, "scenario_id": ep.scenario.id if ep else ""},
        )
server/stack_doctor_mcp.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stack Doctor MCP Environment.
3
+
4
+ Wraps the core Stack Doctor environment with MCP tools that agents
5
+ can discover and invoke. This is the agent-facing interface —
6
+ agents call tools like read_log(), query_specialist(), submit_diagnosis()
7
+ instead of constructing JSON action strings.
8
+
9
+ The training (WebSocket) API still works through _step_impl().
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from typing import Any, Optional
16
+ from uuid import uuid4
17
+
18
+ from mcp.server.fastmcp import FastMCP
19
+ from openenv.core.env_server.mcp_environment import MCPEnvironment
20
+ from openenv.core.env_server.types import Action, Observation, State
21
+
22
+ from models import StackDoctorAction, StackDoctorObservation
23
+ from .scenarios import (
24
+ ROOT_CAUSE_TO_FIX,
25
+ FIX_TO_ROOT_CAUSE,
26
+ ROOT_CAUSES,
27
+ FIXES,
28
+ SPECIALISTS,
29
+ Scenario,
30
+ get_scenario,
31
+ )
32
+
33
# Episode budget and the legal action vocabularies, derived once from the
# scenario catalogue so membership checks are O(1).
MAX_STEPS = 6
VALID_FIXES = {fix for fix in FIXES}
VALID_ROOT_CAUSES = {cause for cause in ROOT_CAUSES}
36
+
37
+
38
class StackDoctorMCPEnvironment(MCPEnvironment):
    """
    Stack Doctor with MCP tool interface for agent interaction.

    Agents discover available tools (read_log, check_config, view_code,
    run_diagnostic, query_specialist, apply_fix, submit_diagnosis) and
    call them to investigate incidents and submit diagnoses.

    The same handlers also back the training (WebSocket) path via
    ``_step_impl``, which parses JSON action strings and dispatches to them.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        mcp = FastMCP("stack_doctor")
        # Per-episode state; reset() re-initializes all of these.
        self._state_obj = State(episode_id=str(uuid4()), step_count=0)
        self._scenario: Scenario | None = None
        self._step_count = 0
        self._fix_applied = False
        self._fix_was_correct: bool | None = None
        self._done = False
        self._cumulative_reward = 0.0
        self._actions_taken: list[dict] = []

        env = self  # capture for closures

        @mcp.tool()
        def read_log() -> str:
            """Read system and application logs for the current incident.
            Returns log output from the affected inference stack including
            error messages, warnings, and system state information.
            Costs 1 step (-0.25 reward)."""
            return env._do_inspect("logs")

        @mcp.tool()
        def check_config() -> str:
            """Check configuration files for the current incident.
            Returns relevant configuration parameters including GPU settings,
            backend configuration, model parameters, and environment variables.
            Costs 1 step (-0.25 reward)."""
            return env._do_inspect("config")

        @mcp.tool()
        def view_code() -> str:
            """View relevant source code snippets for the current incident.
            Returns code from the affected component showing the likely
            location of the bug or misconfiguration.
            Costs 1 step (-0.25 reward)."""
            return env._do_inspect("snippet")

        @mcp.tool()
        def run_diagnostic() -> str:
            """Run performance diagnostics and metrics collection.
            Returns metrics like latency, throughput, GPU utilization,
            error rates, and memory usage for the affected system.
            Costs 1 step (-0.25 reward)."""
            return env._do_inspect("metrics")

        @mcp.tool()
        def query_specialist(specialist: str) -> str:
            """Ask a specialist for their analysis of the incident.
            Specialists: 'runtime', 'dispatch', 'kernel', 'loader'.
            WARNING: At least one specialist gives wrong advice per incident.
            Cross-verify specialist opinions before trusting them.
            Costs 1 step (-0.25 reward)."""
            return env._do_ask_specialist(specialist)

        @mcp.tool()
        def apply_fix(fix: str) -> str:
            """Apply a fix to the system. Can only be used ONCE per incident.
            Available fixes: 'relax_arch_check', 'add_whitelist_entry',
            'fix_runtime_path', 'switch_backend', 'update_model_config',
            'fix_weight_mapping', 'tune_memory_config', 'fix_quantization',
            'fix_comm_config', 'update_driver_config'.
            Correct fix: +3 reward. Wrong fix: -2 reward."""
            return env._do_apply_fix(fix)

        @mcp.tool()
        def submit_diagnosis(root_cause: str, fix: str, justification: str = "") -> str:
            """Submit your final diagnosis. This ends the episode.
            Root causes: 'arch_guard', 'backend_whitelist', 'runtime_loader',
            'backend_selector', 'model_config', 'weight_layout',
            'memory_oom', 'quantization_error', 'distributed_comm', 'driver_compat'.
            Fixes: 'relax_arch_check', 'add_whitelist_entry', 'fix_runtime_path',
            'switch_backend', 'update_model_config', 'fix_weight_mapping',
            'tune_memory_config', 'fix_quantization', 'fix_comm_config', 'update_driver_config'.
            justification: A short sentence explaining WHY you chose this root cause
            and fix based on the evidence you gathered. Bonus +1 if provided.
            Correct root_cause: +8. Wrong: -4. Correct fix: +8. Wrong: -4.
            Bonus +2 if solved in 4 or fewer steps. Bonus +1 for justification."""
            return env._do_submit(root_cause, fix, justification)

        super().__init__(mcp)

    # ------------------------------------------------------------------
    # MCP tool implementations
    # ------------------------------------------------------------------

    def _check_episode(self) -> str | None:
        """Return an error message if the episode is not active, else None.

        Side effect: marks the episode done when the step budget is exhausted.
        """
        if self._scenario is None:
            return "No active incident. Call reset() first."
        if self._done:
            return "Episode is over. Call reset() to start a new incident."
        if self._step_count >= MAX_STEPS:
            self._done = True
            return "Max steps reached. Episode over."
        return None

    def _record_step(self, reward: float, action: dict) -> None:
        """Advance the step counter and accumulate *reward* for *action*."""
        self._step_count += 1
        self._state_obj.step_count = self._step_count
        self._cumulative_reward += reward
        self._actions_taken.append(action)

    def _do_inspect(self, target: str) -> str:
        """Return the inspect result for *target* ('logs'|'config'|'snippet'|'metrics').

        Unknown targets are penalized like other invalid actions instead of
        raising KeyError (reachable via _step_impl with arbitrary JSON).
        """
        err = self._check_episode()
        if err:
            return err

        ir = self._scenario.inspect_results
        result_map = {
            "logs": ir.logs,
            "config": ir.config,
            "snippet": ir.snippet,
            "metrics": ir.metrics,
        }

        # Validate BEFORE recording the step so a bad target costs -2.0
        # (consistent with other invalid actions) rather than -0.25 + a crash.
        if target not in result_map:
            self._record_step(-2.0, {"type": "invalid", "message": f"Unknown inspect target: {target}"})
            return f"Invalid inspect target '{target}'. Available: {sorted(result_map)}. Penalty: -2.0"

        self._record_step(-0.25, {"type": "inspect", "target": target})

        remaining = MAX_STEPS - self._step_count
        return (
            f"[INSPECT {target.upper()}]\n"
            f"{result_map[target]}\n\n"
            f"[Steps remaining: {remaining} | Reward: -0.25 | Cumulative: {self._cumulative_reward:.2f}]"
        )

    def _do_ask_specialist(self, specialist: str) -> str:
        """Return the follow-up opinion of *specialist*, or a penalty message."""
        err = self._check_episode()
        if err:
            return err

        if specialist not in SPECIALISTS:
            self._record_step(-2.0, {"type": "invalid", "message": f"Unknown specialist: {specialist}"})
            return f"Invalid specialist '{specialist}'. Available: {SPECIALISTS}. Penalty: -2.0"

        followup = self._scenario.specialist_followups.get(specialist, "No additional information.")
        self._record_step(-0.25, {"type": "ask_specialist", "specialist": specialist})

        remaining = MAX_STEPS - self._step_count
        return (
            f"[SPECIALIST: {specialist.upper()}]\n"
            f"{followup}\n\n"
            f"[Steps remaining: {remaining} | Reward: -0.25 | Cumulative: {self._cumulative_reward:.2f}]"
        )

    def _do_apply_fix(self, fix: str) -> str:
        """Apply *fix* (once per episode) and report whether it resolved the issue."""
        err = self._check_episode()
        if err:
            return err

        if self._fix_applied:
            self._record_step(-2.0, {"type": "invalid", "message": "Fix already applied"})
            return "You already applied a fix this episode. Only one fix allowed. Penalty: -2.0"

        if fix not in VALID_FIXES:
            self._record_step(-2.0, {"type": "invalid", "message": f"Invalid fix: {fix}"})
            return f"Invalid fix '{fix}'. Available: {sorted(VALID_FIXES)}. Penalty: -2.0"

        self._fix_applied = True
        is_correct = fix == self._scenario.correct_fix
        self._fix_was_correct = is_correct
        reward = 3.0 if is_correct else -2.0
        self._record_step(reward, {"type": "apply_fix", "fix": fix, "correct": is_correct})

        remaining = MAX_STEPS - self._step_count
        if is_correct:
            return (
                f"[FIX APPLIED: {fix}] Fix applied successfully. Systems recovering.\n"
                f"Now submit your diagnosis with submit_diagnosis().\n\n"
                f"[Steps remaining: {remaining} | Reward: +3.0 | Cumulative: {self._cumulative_reward:.2f}]"
            )
        else:
            return (
                f"[FIX APPLIED: {fix}] Fix applied but the issue persists.\n"
                f"Consider your diagnosis carefully.\n\n"
                f"[Steps remaining: {remaining} | Reward: -2.0 | Cumulative: {self._cumulative_reward:.2f}]"
            )

    def _do_submit(self, root_cause: str, fix: str, justification: str = "") -> str:
        """Score the final diagnosis and end the episode.

        Scoring: root cause +8/-4, fix +8/-4, +2 efficiency bonus when both
        are correct within 4 steps, +1 for a non-trivial justification.
        """
        err = self._check_episode()
        if err:
            return err

        if root_cause not in VALID_ROOT_CAUSES:
            self._record_step(-2.0, {"type": "invalid", "message": f"Invalid root_cause: {root_cause}"})
            return f"Invalid root_cause '{root_cause}'. Available: {sorted(VALID_ROOT_CAUSES)}. Penalty: -2.0"

        if fix not in VALID_FIXES:
            self._record_step(-2.0, {"type": "invalid", "message": f"Invalid fix: {fix}"})
            return f"Invalid fix '{fix}'. Available: {sorted(VALID_FIXES)}. Penalty: -2.0"

        self._done = True
        rc_correct = root_cause == self._scenario.root_cause
        fix_correct = fix == self._scenario.correct_fix
        has_justification = len(justification.strip()) >= 10

        reward = 0.0
        reward += 8.0 if rc_correct else -4.0
        reward += 8.0 if fix_correct else -4.0
        # _record_step has not run yet, so "+ 1" accounts for this step.
        if rc_correct and fix_correct and self._step_count + 1 <= 4:
            reward += 2.0
        if has_justification:
            reward += 1.0

        self._record_step(reward, {
            "type": "submit", "root_cause": root_cause, "fix": fix,
            "justification": justification,
            "rc_correct": rc_correct, "fix_correct": fix_correct,
            "has_justification": has_justification,
        })

        lines = ["[DIAGNOSIS SUBMITTED]"]
        lines.append(f"  Root cause: {root_cause} — {'CORRECT' if rc_correct else 'WRONG (was: ' + self._scenario.root_cause + ')'}")
        lines.append(f"  Fix: {fix} — {'CORRECT' if fix_correct else 'WRONG (was: ' + self._scenario.correct_fix + ')'}")
        if has_justification:
            lines.append(f"  Justification: {justification.strip()}")
            lines.append("  JUSTIFICATION BONUS: +1")
        else:
            lines.append("  No justification provided (missed +1 bonus)")
        lines.append(f"  Steps used: {self._step_count}/{MAX_STEPS}")
        # Step counter was incremented by _record_step above, so plain <= 4
        # here matches the "+ 1 <= 4" condition used for the reward.
        if rc_correct and fix_correct and self._step_count <= 4:
            lines.append("  EFFICIENCY BONUS: +2 (solved in <= 4 steps)")
        lines.append(f"  Episode reward: {self._cumulative_reward:.2f}")

        return "\n".join(lines)

    # ------------------------------------------------------------------
    # OpenEnv Environment interface (for training / WebSocket API)
    # ------------------------------------------------------------------

    def reset(self, seed=None, episode_id=None, **kwargs) -> StackDoctorObservation:
        """Start a new episode; ``scenario_id`` and ``split`` may be passed via kwargs."""
        scenario_id = kwargs.get("scenario_id")
        split = kwargs.get("split", "train")
        self._scenario = get_scenario(scenario_id, split=split)

        self._state_obj = State(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
        )
        self._step_count = 0
        self._fix_applied = False
        self._fix_was_correct = None
        self._done = False
        self._cumulative_reward = 0.0
        self._actions_taken = []

        specialist_obs = {}
        for name, op in self._scenario.specialist_opinions.items():
            specialist_obs[name] = {
                "opinion": op.opinion,
                "confidence": op.confidence,
            }

        return StackDoctorObservation(
            output=(
                "STACK DOCTOR — New incident assigned.\n"
                "Investigate using the available tools: read_log(), check_config(), "
                "view_code(), run_diagnostic(), query_specialist(name).\n"
                "When ready, apply_fix(fix) and/or submit_diagnosis(root_cause, fix).\n"
                "You have 6 steps. At least one specialist is WRONG — cross-verify.\n"
            ),
            incident_ticket=self._scenario.incident_ticket,
            hardware=self._scenario.hardware,
            model_name=self._scenario.model_name,
            backend=self._scenario.backend,
            log_excerpt=self._scenario.initial_log,
            code_snippet=self._scenario.initial_snippet,
            specialist_opinions=specialist_obs,
            steps_remaining=MAX_STEPS,
            fix_used=False,
            done=False,
            reward=0.0,
        )

    def _step_impl(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> Observation:
        """Handle non-MCP actions (JSON action strings for training)."""
        if not isinstance(action, StackDoctorAction):
            return self._make_obs("Invalid action type.", -2.0)

        try:
            parsed = json.loads(action.message)
        except (json.JSONDecodeError, TypeError):
            return self._make_obs(f"Invalid JSON: {action.message[:200]}", -2.0)

        action_type = parsed.get("type")

        # Derive this step's reward as the delta of the cumulative reward
        # around the dispatch. This keeps a single source of truth for the
        # reward rules and correctly reports 0 when a handler returns an
        # error without recording a step (re-deriving from the last recorded
        # action mis-attributed a stale reward in that case).
        reward_before = self._cumulative_reward

        if action_type == "inspect":
            result = self._do_inspect(parsed.get("target", "logs"))
        elif action_type == "ask_specialist":
            result = self._do_ask_specialist(parsed.get("specialist", ""))
        elif action_type == "apply_fix":
            result = self._do_apply_fix(parsed.get("fix", ""))
        elif action_type == "submit":
            result = self._do_submit(parsed.get("root_cause", ""), parsed.get("fix", ""), parsed.get("justification", ""))
        else:
            self._record_step(-2.0, {"type": "invalid", "message": f"Unknown: {action_type}"})
            result = f"Unknown action type: {action_type}. Penalty: -2.0"

        return self._make_obs(result, self._cumulative_reward - reward_before)

    def _make_obs(self, output: str, reward: float) -> StackDoctorObservation:
        """Build an observation reflecting the current episode state."""
        remaining = MAX_STEPS - self._step_count
        return StackDoctorObservation(
            output=output,
            incident_ticket=self._scenario.incident_ticket if self._scenario else "",
            hardware=self._scenario.hardware if self._scenario else "",
            model_name=self._scenario.model_name if self._scenario else "",
            backend=self._scenario.backend if self._scenario else "",
            log_excerpt="",
            code_snippet="",
            specialist_opinions={},
            steps_remaining=remaining,
            fix_used=self._fix_applied,
            done=self._done,
            reward=reward,
            metadata={
                "cumulative_reward": self._cumulative_reward,
                "step": self._step_count,
                "scenario_id": self._scenario.id if self._scenario else "",
            },
        )

    @property
    def state(self) -> State:
        return self._state_obj
static/index.html ADDED
@@ -0,0 +1,1566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Stack Doctor — Incident War Room</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com">
8
+ <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@300;400;500;600&family=Outfit:wght@300;400;500;600;700&display=swap" rel="stylesheet">
9
+ <style>
10
+ *, *::before, *::after { margin: 0; padding: 0; box-sizing: border-box; }
11
+
12
+ :root {
13
+ --bg-abyss: #060a11;
14
+ --bg-deep: #0a0f1a;
15
+ --bg-mid: #0f1623;
16
+ --bg-surface: #151d2e;
17
+ --bg-elevated: #1a2438;
18
+ --border-subtle: rgba(100, 180, 255, 0.08);
19
+ --border-active: rgba(0, 196, 255, 0.25);
20
+ --cyan: #00c4ff;
21
+ --cyan-bright: #40d4ff;
22
+ --cyan-dim: rgba(0, 196, 255, 0.15);
23
+ --cyan-glow: rgba(0, 196, 255, 0.4);
24
+ --amber: #f0a030;
25
+ --amber-dim: rgba(240, 160, 48, 0.15);
26
+ --amber-glow: rgba(240, 160, 48, 0.4);
27
+ --emerald: #00e676;
28
+ --emerald-dim: rgba(0, 230, 118, 0.12);
29
+ --emerald-glow: rgba(0, 230, 118, 0.35);
30
+ --coral: #ff3d5a;
31
+ --coral-dim: rgba(255, 61, 90, 0.12);
32
+ --text-primary: #d8e0ec;
33
+ --text-secondary: rgba(216, 224, 236, 0.55);
34
+ --text-tertiary: rgba(216, 224, 236, 0.3);
35
+ --font-display: 'Outfit', system-ui, sans-serif;
36
+ --font-mono: 'IBM Plex Mono', 'SF Mono', monospace;
37
+ --ease-out-expo: cubic-bezier(0.16, 1, 0.3, 1);
38
+ --duration-slow: 800ms;
39
+ --duration-med: 400ms;
40
+ --duration-fast: 200ms;
41
+ }
42
+
43
+ html { height: 100%; }
44
+ body {
45
+ min-height: 100%;
46
+ background: var(--bg-abyss);
47
+ color: var(--text-primary);
48
+ font-family: var(--font-display);
49
+ -webkit-font-smoothing: antialiased;
50
+ overflow-y: auto;
51
+ overflow-x: hidden;
52
+ }
53
+
54
+ body::before {
55
+ content: '';
56
+ position: fixed; inset: 0;
57
+ background:
58
+ radial-gradient(ellipse 80% 60% at 20% 80%, rgba(0, 100, 180, 0.06) 0%, transparent 70%),
59
+ radial-gradient(ellipse 60% 50% at 80% 20%, rgba(0, 160, 255, 0.04) 0%, transparent 60%);
60
+ pointer-events: none; z-index: 0;
61
+ }
62
+
63
+ .grid-overlay {
64
+ position: fixed; inset: 0; z-index: 0; pointer-events: none;
65
+ background-image:
66
+ linear-gradient(rgba(100, 180, 255, 0.015) 1px, transparent 1px),
67
+ linear-gradient(90deg, rgba(100, 180, 255, 0.015) 1px, transparent 1px);
68
+ background-size: 60px 60px;
69
+ }
70
+
71
+ .app {
72
+ position: relative; z-index: 1;
73
+ max-width: 1400px;
74
+ margin: 0 auto;
75
+ padding: 20px 24px 40px;
76
+ display: flex; flex-direction: column; gap: 16px;
77
+ }
78
+
79
+ /* ══════════ HEADER ══════════ */
80
+ .header {
81
+ display: flex; align-items: center; justify-content: space-between;
82
+ padding: 14px 24px;
83
+ background: linear-gradient(135deg, var(--bg-mid), var(--bg-surface));
84
+ border: 1px solid var(--border-subtle); border-radius: 14px;
85
+ }
86
+ .header-left { display: flex; align-items: center; gap: 14px; }
87
+ .logo-mark {
88
+ width: 36px; height: 36px; border-radius: 9px;
89
+ background: linear-gradient(135deg, var(--cyan), rgba(0, 196, 255, 0.5));
90
+ display: flex; align-items: center; justify-content: center;
91
+ box-shadow: 0 0 24px var(--cyan-dim);
92
+ animation: logoPulse 4s ease-in-out infinite;
93
+ }
94
+ .logo-mark svg { width: 20px; height: 20px; }
95
+ @keyframes logoPulse { 0%,100% { box-shadow: 0 0 20px var(--cyan-dim); } 50% { box-shadow: 0 0 36px var(--cyan-glow); } }
96
+ .header-title { font-weight: 600; font-size: 16px; letter-spacing: 0.5px; }
97
+ .header-subtitle { font-family: var(--font-mono); font-size: 11px; font-weight: 300; color: var(--text-secondary); }
98
+ .header-right { display: flex; align-items: center; gap: 20px; }
99
+ .header-meta-label { font-family: var(--font-mono); font-size: 9px; font-weight: 500; letter-spacing: 1.5px; text-transform: uppercase; color: var(--text-tertiary); }
100
+ .header-meta-value { font-family: var(--font-mono); font-size: 12px; color: var(--cyan); }
101
+ .status-badge {
102
+ font-family: var(--font-mono); font-size: 10px; font-weight: 500;
103
+ letter-spacing: 1px; text-transform: uppercase;
104
+ padding: 5px 14px; border-radius: 20px;
105
+ background: var(--cyan-dim); color: var(--cyan);
106
+ border: 1px solid rgba(0, 196, 255, 0.2);
107
+ transition: all var(--duration-slow) var(--ease-out-expo);
108
+ }
109
+ .status-badge.warning { background: var(--amber-dim); color: var(--amber); border-color: rgba(240, 160, 48, 0.3); }
110
+ .status-badge.success { background: var(--emerald-dim); color: var(--emerald); border-color: rgba(0, 230, 118, 0.3); }
111
+ .status-badge.error { background: var(--coral-dim); color: var(--coral); border-color: rgba(255, 61, 90, 0.3); }
112
+
113
+ /* ══════════ SECTION TITLES ══════════ */
114
+ .section-title {
115
+ font-family: var(--font-mono); font-size: 10px; font-weight: 500;
116
+ letter-spacing: 2px; text-transform: uppercase; color: var(--text-tertiary);
117
+ padding: 8px 0 0;
118
+ display: flex; align-items: center; gap: 10px;
119
+ }
120
+ .section-title::after { content: ''; flex: 1; height: 1px; background: var(--border-subtle); }
121
+
122
+ /* ══════════ TRAINING CHART ══════════ */
123
+ .chart-section {
124
+ display: grid; grid-template-columns: 1fr 1fr; gap: 16px;
125
+ }
126
+ .chart-panel {
127
+ background: linear-gradient(180deg, rgba(15, 22, 35, 0.85), rgba(10, 15, 26, 0.95));
128
+ border: 1px solid var(--border-subtle); border-radius: 14px;
129
+ padding: 24px 28px; position: relative; overflow: hidden;
130
+ }
131
+ .chart-panel-title {
132
+ font-family: var(--font-mono); font-size: 10px; font-weight: 500;
133
+ letter-spacing: 1.5px; text-transform: uppercase; color: var(--text-secondary);
134
+ margin-bottom: 4px;
135
+ }
136
+ .chart-panel-subtitle {
137
+ font-size: 13px; font-weight: 300; color: var(--text-tertiary);
138
+ margin-bottom: 16px;
139
+ }
140
+ .chart-canvas-wrap {
141
+ position: relative; width: 100%; height: 280px;
142
+ }
143
+ canvas { width: 100% !important; height: 100% !important; }
144
+
145
+ .chart-stat-row {
146
+ display: flex; gap: 20px; margin-top: 16px; padding-top: 14px;
147
+ border-top: 1px solid var(--border-subtle);
148
+ }
149
+ .chart-stat { display: flex; flex-direction: column; gap: 2px; }
150
+ .chart-stat-label { font-family: var(--font-mono); font-size: 9px; font-weight: 500; letter-spacing: 1px; text-transform: uppercase; color: var(--text-tertiary); }
151
+ .chart-stat-value { font-family: var(--font-mono); font-size: 18px; font-weight: 300; }
152
+ .chart-stat-value.emerald { color: var(--emerald); }
153
+ .chart-stat-value.coral { color: var(--coral); }
154
+ .chart-stat-value.cyan { color: var(--cyan); }
155
+ .chart-stat-value.amber { color: var(--amber); }
156
+
157
+ /* ══════════ ANNOTATION BADGES ══════════ */
158
+ .annotation {
159
+ position: absolute; font-family: var(--font-mono); font-size: 9px;
160
+ font-weight: 500; letter-spacing: 0.5px; padding: 3px 8px;
161
+ border-radius: 4px; pointer-events: none; white-space: nowrap;
162
+ }
163
+
164
+ /* ══════════ WAR ROOM GRID ══════════ */
165
+ .warroom { display: grid; grid-template-columns: 220px 1fr 260px; gap: 14px; }
166
+
167
+ .panel {
168
+ background: linear-gradient(180deg, rgba(15, 22, 35, 0.85), rgba(10, 15, 26, 0.95));
169
+ border: 1px solid var(--border-subtle); border-radius: 14px; overflow: hidden;
170
+ }
171
+ .panel-header {
172
+ padding: 12px 16px 8px; display: flex; align-items: center; gap: 8px;
173
+ border-bottom: 1px solid var(--border-subtle);
174
+ }
175
+ .panel-header-dot {
176
+ width: 6px; height: 6px; border-radius: 50%;
177
+ background: var(--cyan); box-shadow: 0 0 8px var(--cyan-glow);
178
+ animation: dotPulse 3s ease-in-out infinite;
179
+ }
180
+ @keyframes dotPulse { 0%,100% { opacity: 0.6; } 50% { opacity: 1; } }
181
+ .panel-header-title { font-family: var(--font-mono); font-size: 10px; font-weight: 500; letter-spacing: 1.5px; text-transform: uppercase; color: var(--text-secondary); }
182
+
183
+ /* ══════════ ARCHITECTURE DIAGRAM ══════════ */
184
+ .arch-body {
185
+ display: flex; flex-direction: column; align-items: center;
186
+ padding: 16px 14px; gap: 0;
187
+ }
188
+ .arch-layer {
189
+ width: 100%; padding: 12px;
190
+ background: var(--bg-deep); border: 1px solid var(--border-subtle); border-radius: 8px;
191
+ transition: all var(--duration-slow) var(--ease-out-expo);
192
+ }
193
+ .arch-layer .layer-name { font-family: var(--font-mono); font-size: 10px; font-weight: 500; letter-spacing: 1.2px; text-transform: uppercase; color: var(--text-secondary); transition: color var(--duration-slow); }
194
+ .arch-layer .layer-detail { font-family: var(--font-mono); font-size: 9px; font-weight: 300; color: var(--text-tertiary); margin-top: 2px; }
195
+ .arch-layer.scanning { border-color: rgba(0, 196, 255, 0.3); background: linear-gradient(135deg, rgba(0, 196, 255, 0.06), var(--bg-deep)); box-shadow: 0 0 20px var(--cyan-dim); }
196
+ .arch-layer.scanning .layer-name { color: var(--cyan); }
197
+ .arch-layer.identified { border-color: rgba(240, 160, 48, 0.4); background: linear-gradient(135deg, rgba(240, 160, 48, 0.08), var(--bg-deep)); box-shadow: 0 0 25px var(--amber-dim); animation: identPulse 2s ease-in-out infinite; }
198
+ .arch-layer.identified .layer-name { color: var(--amber); }
199
+ @keyframes identPulse { 0%,100% { box-shadow: 0 0 20px var(--amber-dim); } 50% { box-shadow: 0 0 35px rgba(240, 160, 48, 0.25); } }
200
+ .arch-layer.resolved { border-color: rgba(0, 230, 118, 0.3); background: linear-gradient(135deg, rgba(0, 230, 118, 0.06), var(--bg-deep)); box-shadow: 0 0 20px var(--emerald-dim); }
201
+ .arch-layer.resolved .layer-name { color: var(--emerald); }
202
+
203
+ .arch-connector { width: 1px; height: 10px; background: linear-gradient(180deg, var(--border-active), transparent); position: relative; }
204
+ .arch-connector .data-dot { width: 3px; height: 3px; border-radius: 50%; background: var(--cyan); position: absolute; left: -1px; animation: flowDown 2s linear infinite; opacity: 0.6; }
205
+ @keyframes flowDown { 0% { top: 0; opacity: 0; } 20% { opacity: 0.8; } 80% { opacity: 0.8; } 100% { top: 100%; opacity: 0; } }
206
+
207
+ /* ══════════ INVESTIGATION LOG ══════════ */
208
+ .log-body {
209
+ height: 380px; overflow-y: auto; padding: 12px 16px;
210
+ display: flex; flex-direction: column; gap: 8px;
211
+ scrollbar-width: thin; scrollbar-color: rgba(100, 180, 255, 0.1) transparent;
212
+ }
213
+ .idle-prompt { display: flex; flex-direction: column; align-items: center; justify-content: center; height: 100%; gap: 12px; padding: 40px; }
214
+ .idle-prompt .idle-text { font-size: 13px; color: var(--text-secondary); text-align: center; line-height: 1.6; }
215
+
216
+ .incident-card {
217
+ background: linear-gradient(135deg, rgba(0, 196, 255, 0.04), var(--bg-surface));
218
+ border: 1px solid rgba(0, 196, 255, 0.12); border-radius: 10px;
219
+ padding: 14px; animation: cardIn 0.6s var(--ease-out-expo) both;
220
+ }
221
+ @keyframes cardIn { from { opacity: 0; transform: translateY(8px); } }
222
+ .incident-label { font-family: var(--font-mono); font-size: 9px; font-weight: 500; letter-spacing: 1.5px; text-transform: uppercase; color: var(--cyan); margin-bottom: 8px; }
223
+ .incident-text { font-size: 12px; line-height: 1.6; color: var(--text-primary); }
224
+ .incident-meta { display: flex; gap: 16px; margin-top: 10px; flex-wrap: wrap; }
225
+ .incident-meta-item .meta-label { font-family: var(--font-mono); font-size: 8px; font-weight: 500; letter-spacing: 1.2px; text-transform: uppercase; color: var(--text-tertiary); }
226
+ .incident-meta-item .meta-value { font-family: var(--font-mono); font-size: 11px; color: var(--text-secondary); }
227
+
228
+ .log-entry {
229
+ display: flex; gap: 10px; padding: 10px 12px;
230
+ background: var(--bg-deep); border: 1px solid var(--border-subtle); border-radius: 8px;
231
+ animation: entryIn 0.5s var(--ease-out-expo) both;
232
+ }
233
+ @keyframes entryIn { from { opacity: 0; transform: translateX(-12px); } }
234
+
235
+ .log-entry-icon {
236
+ width: 24px; height: 24px; border-radius: 6px;
237
+ display: flex; align-items: center; justify-content: center;
238
+ flex-shrink: 0; font-size: 11px;
239
+ }
240
+ .log-entry-icon.inspect { background: var(--cyan-dim); color: var(--cyan); }
241
+ .log-entry-icon.specialist { background: rgba(160, 120, 255, 0.12); color: #a078ff; }
242
+ .log-entry-icon.fix { background: var(--amber-dim); color: var(--amber); }
243
+ .log-entry-icon.submit { background: var(--emerald-dim); color: var(--emerald); }
244
+ .log-entry-content { flex: 1; min-width: 0; }
245
+ .log-entry-header { display: flex; align-items: center; justify-content: space-between; margin-bottom: 3px; }
246
+ .log-entry-type { font-family: var(--font-mono); font-size: 10px; font-weight: 500; letter-spacing: 0.8px; text-transform: uppercase; }
247
+ .log-entry-type.cyan { color: var(--cyan); }
248
+ .log-entry-type.purple { color: #a078ff; }
249
+ .log-entry-type.amber { color: var(--amber); }
250
+ .log-entry-type.emerald { color: var(--emerald); }
251
+ .log-entry-step { font-family: var(--font-mono); font-size: 9px; color: var(--text-tertiary); }
252
+ .log-entry-text { font-family: var(--font-mono); font-size: 10px; font-weight: 300; line-height: 1.55; color: var(--text-secondary); white-space: pre-wrap; word-break: break-word; }
253
+ .log-entry-reward { font-family: var(--font-mono); font-size: 10px; font-weight: 500; margin-top: 4px; }
254
+ .log-entry-reward.positive { color: var(--emerald); }
255
+ .log-entry-reward.negative { color: var(--coral); }
256
+
257
+ /* ══════════ SPECIALISTS ══════════ */
258
+ .specialists-body { padding: 10px 12px; display: flex; flex-direction: column; gap: 8px; overflow-y: auto; max-height: 420px; }
259
+ .specialist-card {
260
+ background: var(--bg-deep); border: 1px solid var(--border-subtle); border-radius: 10px;
261
+ padding: 10px 12px; transition: all var(--duration-slow) var(--ease-out-expo);
262
+ animation: cardIn 0.5s var(--ease-out-expo) both;
263
+ }
264
+ .specialist-card.highlighted { border-color: rgba(160, 120, 255, 0.3); background: linear-gradient(135deg, rgba(160, 120, 255, 0.05), var(--bg-deep)); }
265
+ .specialist-card.wrong { border-color: rgba(255, 61, 90, 0.15); opacity: 0.5; }
266
+ .specialist-card.correct { border-color: rgba(0, 230, 118, 0.2); }
267
+ .specialist-top { display: flex; align-items: center; justify-content: space-between; margin-bottom: 6px; }
268
+ .specialist-name { font-family: var(--font-mono); font-size: 10px; font-weight: 500; letter-spacing: 1px; text-transform: uppercase; color: var(--text-secondary); transition: color var(--duration-med); }
269
+ .specialist-card.highlighted .specialist-name { color: #a078ff; }
270
+ .specialist-card.correct .specialist-name { color: var(--emerald); }
271
+ .specialist-card.wrong .specialist-name { color: var(--coral); }
272
+ .confidence-bar { width: 50px; height: 3px; background: rgba(255,255,255,0.06); border-radius: 2px; overflow: hidden; }
273
+ .confidence-fill { height: 100%; border-radius: 2px; background: var(--cyan); transition: width 1s var(--ease-out-expo); }
274
+ .specialist-card.wrong .confidence-fill { background: var(--coral); }
275
+ .specialist-card.correct .confidence-fill { background: var(--emerald); }
276
+ .specialist-opinion { font-size: 11px; font-weight: 300; line-height: 1.4; color: var(--text-secondary); }
277
+ .specialist-card.wrong .specialist-opinion { opacity: 0.5; }
278
+ .specialist-verdict { font-family: var(--font-mono); font-size: 9px; font-weight: 500; letter-spacing: 1px; text-transform: uppercase; margin-top: 6px; opacity: 0; transition: opacity var(--duration-med); }
279
+ .specialist-card.wrong .specialist-verdict, .specialist-card.correct .specialist-verdict { opacity: 1; }
280
+ .specialist-card.wrong .specialist-verdict { color: var(--coral); }
281
+ .specialist-card.correct .specialist-verdict { color: var(--emerald); }
282
+
283
+ /* ══════════ VITALS BAR ══════════ */
284
+ .vitals-bar {
285
+ display: grid; grid-template-columns: repeat(5, 1fr); gap: 10px;
286
+ }
287
+ .vital {
288
+ background: linear-gradient(135deg, var(--bg-mid), var(--bg-surface));
289
+ border: 1px solid var(--border-subtle); border-radius: 10px;
290
+ padding: 10px 14px;
291
+ }
292
+ .vital-label { font-family: var(--font-mono); font-size: 9px; font-weight: 500; letter-spacing: 1.5px; text-transform: uppercase; color: var(--text-tertiary); margin-bottom: 4px; }
293
+ .vital-value { font-family: var(--font-mono); font-size: 20px; font-weight: 300; transition: color var(--duration-med); }
294
+ .vital-value.cyan { color: var(--cyan); }
295
+ .vital-value.amber { color: var(--amber); }
296
+ .vital-value.emerald { color: var(--emerald); }
297
+ .vital-value.coral { color: var(--coral); }
298
+ .steps-dots { display: flex; gap: 5px; margin-top: 4px; }
299
+ .step-dot { width: 8px; height: 8px; border-radius: 50%; background: rgba(255,255,255,0.08); border: 1px solid rgba(255,255,255,0.06); transition: all var(--duration-med); }
300
+ .step-dot.used { background: var(--cyan); border-color: var(--cyan); box-shadow: 0 0 8px var(--cyan-dim); }
301
+ .step-dot.current { background: var(--amber); border-color: var(--amber); box-shadow: 0 0 8px var(--amber-dim); animation: dotPulse 1.5s ease-in-out infinite; }
302
+
303
+ /* ══════════ CONTROLS ══════════ */
304
+ .controls {
305
+ display: flex; justify-content: center; gap: 12px; padding: 4px 0;
306
+ }
307
+ .ctrl-btn {
308
+ font-family: var(--font-mono); font-size: 12px; font-weight: 500;
309
+ letter-spacing: 1px; text-transform: uppercase;
310
+ padding: 12px 28px; border-radius: 10px;
311
+ cursor: pointer; transition: all var(--duration-fast) ease;
312
+ border: none;
313
+ }
314
+ .ctrl-btn.primary {
315
+ background: linear-gradient(135deg, var(--cyan), rgba(0, 160, 220, 0.9));
316
+ color: #fff; box-shadow: 0 0 24px var(--cyan-dim);
317
+ }
318
+ .ctrl-btn.primary:hover { box-shadow: 0 0 40px var(--cyan-glow); transform: translateY(-1px); }
319
+ .ctrl-btn.primary:disabled { opacity: 0.3; cursor: not-allowed; transform: none; box-shadow: none; }
320
+ .ctrl-btn.secondary {
321
+ background: var(--bg-surface); color: var(--text-secondary);
322
+ border: 1px solid var(--border-subtle);
323
+ }
324
+ .ctrl-btn.secondary:hover { border-color: var(--border-active); color: var(--text-primary); }
325
+ .server-input {
326
+ font-family: var(--font-mono); font-size: 11px; padding: 10px 14px;
327
+ background: var(--bg-deep); color: var(--cyan); border: 1px solid var(--border-subtle);
328
+ border-radius: 8px; width: 200px; outline: none;
329
+ }
330
+ .server-input:focus { border-color: var(--border-active); }
331
+ .conn-status {
332
+ font-family: var(--font-mono); font-size: 10px; text-transform: uppercase;
333
+ letter-spacing: 1px; color: var(--text-tertiary); padding: 0 8px;
334
+ }
335
+ .conn-status.connected { color: var(--emerald); }
336
+ .conn-status.error { color: var(--coral); }
337
+ .conn-status.running { color: var(--amber); }
338
+
339
+ /* ══════════ DIAGNOSIS OVERLAY ══════════ */
340
+ .diagnosis-overlay {
341
+ position: fixed; inset: 0;
342
+ background: rgba(6, 10, 17, 0.85); backdrop-filter: blur(20px);
343
+ display: flex; align-items: center; justify-content: center;
344
+ z-index: 100; opacity: 0; pointer-events: none;
345
+ transition: opacity 0.6s var(--ease-out-expo);
346
+ }
347
+ .diagnosis-overlay.visible { opacity: 1; pointer-events: auto; }
348
+ .diagnosis-card {
349
+ background: linear-gradient(180deg, var(--bg-surface), var(--bg-deep));
350
+ border: 1px solid var(--border-active); border-radius: 20px;
351
+ padding: 40px 48px; max-width: 520px; width: 100%; text-align: center;
352
+ transform: scale(0.92) translateY(20px);
353
+ transition: transform 0.8s var(--ease-out-expo);
354
+ box-shadow: 0 0 60px rgba(0, 196, 255, 0.08), 0 20px 60px rgba(0, 0, 0, 0.4);
355
+ }
356
+ .diagnosis-overlay.visible .diagnosis-card { transform: scale(1) translateY(0); }
357
+ .diagnosis-title { font-family: var(--font-mono); font-size: 11px; font-weight: 500; letter-spacing: 3px; text-transform: uppercase; color: var(--cyan); margin-bottom: 24px; }
358
+ .diagnosis-result { display: flex; flex-direction: column; gap: 12px; margin-bottom: 28px; }
359
+ .diagnosis-row { display: flex; align-items: center; justify-content: space-between; padding: 12px 16px; background: var(--bg-deep); border-radius: 10px; border: 1px solid var(--border-subtle); }
360
+ .diagnosis-row-label { font-family: var(--font-mono); font-size: 10px; font-weight: 500; letter-spacing: 1px; text-transform: uppercase; color: var(--text-tertiary); }
361
+ .diagnosis-row-value { font-family: var(--font-mono); font-size: 13px; }
362
+ .diagnosis-row-value.correct { color: var(--emerald); }
363
+ .diagnosis-row-value.wrong { color: var(--coral); }
364
+ .diagnosis-reward { font-size: 48px; font-weight: 700; letter-spacing: -2px; margin-bottom: 8px; }
365
+ .diagnosis-reward-label { font-family: var(--font-mono); font-size: 10px; letter-spacing: 1px; text-transform: uppercase; color: var(--text-tertiary); }
366
+
367
+ ::-webkit-scrollbar { width: 4px; }
368
+ ::-webkit-scrollbar-track { background: transparent; }
369
+ ::-webkit-scrollbar-thumb { background: rgba(100, 180, 255, 0.15); border-radius: 4px; }
370
+
371
+ @media (max-width: 1000px) {
372
+ .warroom { grid-template-columns: 1fr; }
373
+ .chart-section { grid-template-columns: 1fr; }
374
+ .vitals-bar { grid-template-columns: repeat(3, 1fr); }
375
+ }
376
+ </style>
377
+ </head>
378
+ <body>
379
+ <div class="grid-overlay"></div>
380
+
381
+ <div class="app">
382
+ <header class="header">
383
+ <div class="header-left">
384
+ <div class="logo-mark">
385
+ <svg viewBox="0 0 24 24" fill="none" stroke="white" stroke-width="2" stroke-linecap="round">
386
+ <path d="M12 3v18M3 12h18M7 7l10 10M17 7L7 17"/>
387
+ </svg>
388
+ </div>
389
+ <div>
390
+ <div class="header-title">Stack Doctor</div>
391
+ <div class="header-subtitle">Incident War Room</div>
392
+ </div>
393
+ </div>
394
+ <div class="header-right">
395
+ <div>
396
+ <div class="header-meta-label">Scenario</div>
397
+ <div class="header-meta-value" id="scenarioId">&mdash;</div>
398
+ </div>
399
+ <div>
400
+ <div class="header-meta-label">Episode</div>
401
+ <div class="header-meta-value" id="episodeTime">00:00</div>
402
+ </div>
403
+ <div class="status-badge" id="statusBadge">Standby</div>
404
+ </div>
405
+ </header>
406
+
407
+ <!-- ══════════ TRAINING DATA ══════════ -->
408
+ <div class="section-title">Training Analytics</div>
409
+ <div class="chart-section" style="grid-template-columns: 1fr; max-width: 900px; margin: 0 auto; width: 100%;">
410
+ <div class="chart-panel">
411
+ <div class="chart-panel-title">Qwen3.5-9B &mdash; Episode Reward</div>
412
+ <div class="chart-panel-subtitle">100 GRPO steps &mdash; base model already near-oracle</div>
413
+ <div class="chart-canvas-wrap"><canvas id="rewardChart"></canvas></div>
414
+ <div class="chart-stat-row">
415
+ <div class="chart-stat">
416
+ <div class="chart-stat-label">Peak</div>
417
+ <div class="chart-stat-value emerald">+26.00</div>
418
+ </div>
419
+ <div class="chart-stat">
420
+ <div class="chart-stat-label">Base Avg</div>
421
+ <div class="chart-stat-value cyan">+19.50</div>
422
+ </div>
423
+ <div class="chart-stat">
424
+ <div class="chart-stat-label">Zero-Std</div>
425
+ <div class="chart-stat-value coral">72%</div>
426
+ </div>
427
+ </div>
428
+ </div>
429
+ <div class="chart-panel">
430
+ <div class="chart-panel-title">Qwen3.5-9B &mdash; Completion Length</div>
431
+ <div class="chart-panel-subtitle">Thinking mode consumed token budget, hit 2048 cap</div>
432
+ <div class="chart-canvas-wrap"><canvas id="lengthChart"></canvas></div>
433
+ <div class="chart-stat-row">
434
+ <div class="chart-stat">
435
+ <div class="chart-stat-label">Collapse</div>
436
+ <div class="chart-stat-value coral">Step 36</div>
437
+ </div>
438
+ <div class="chart-stat">
439
+ <div class="chart-stat-label">Clipping</div>
440
+ <div class="chart-stat-value amber">Step 69</div>
441
+ </div>
442
+ </div>
443
+ </div>
444
+ <div class="chart-panel" style="border-color: rgba(0, 196, 255, 0.15);">
445
+ <div class="chart-panel-title">Qwen2.5-1.5B &mdash; Episode Reward</div>
446
+ <div class="chart-panel-subtitle">16 GRPO steps &mdash; weak model, real gradient signal</div>
447
+ <div class="chart-canvas-wrap"><canvas id="reward1bChart"></canvas></div>
448
+ <div class="chart-stat-row">
449
+ <div class="chart-stat">
450
+ <div class="chart-stat-label">Best Step</div>
451
+ <div class="chart-stat-value cyan">-1.75</div>
452
+ </div>
453
+ <div class="chart-stat">
454
+ <div class="chart-stat-label">Avg</div>
455
+ <div class="chart-stat-value amber">-4.90</div>
456
+ </div>
457
+ <div class="chart-stat">
458
+ <div class="chart-stat-label">Zero-Std</div>
459
+ <div class="chart-stat-value emerald">0%</div>
460
+ </div>
461
+ </div>
462
+ </div>
463
+ </div>
464
+
465
+ <!-- ══════════ DEMO CONTROLS ══════════ -->
466
+ <div class="section-title">Live Environment</div>
467
+ <div class="controls">
468
+ <input type="text" id="serverUrl" class="server-input" placeholder="Server URL (empty = same origin)">
469
+ <script>
470
+ /* Auto-detect: use localhost:8000 for local dev, empty for HF Spaces */
471
+ if (location.hostname === 'localhost' || location.hostname === '127.0.0.1') {
472
+ document.getElementById('serverUrl').value = 'http://localhost:8000';
473
+ }
474
+ </script>
475
+ <button class="ctrl-btn primary" id="demoBtn" onclick="runComparison()">&#9654; Run Comparison (Base → GRPO Trained)</button>
476
+ <button class="ctrl-btn secondary" id="resetBtn" onclick="resetState()">&#8634; Reset</button>
477
+ <span id="modelStatus" class="conn-status" style="margin-left:8px;">Model: checking...</span>
478
+ <span id="connStatus" class="conn-status">Disconnected</span>
479
+ </div>
480
+
481
+ <!-- ══════════ WAR ROOM ══════════ -->
482
+ <div class="warroom">
483
+ <div class="panel">
484
+ <div class="panel-header">
485
+ <div class="panel-header-dot"></div>
486
+ <div class="panel-header-title">Inference Stack</div>
487
+ </div>
488
+ <div class="arch-body" id="archBody">
489
+ <div class="arch-layer" id="layer-model"><div class="layer-name">Model</div><div class="layer-detail" id="detail-model">&mdash;</div></div>
490
+ <div class="arch-connector"><div class="data-dot"></div></div>
491
+ <div class="arch-layer" id="layer-kernel"><div class="layer-name">Kernel</div><div class="layer-detail">Attention / GEMM</div></div>
492
+ <div class="arch-connector"><div class="data-dot" style="animation-delay:-0.5s"></div></div>
493
+ <div class="arch-layer" id="layer-backend"><div class="layer-name">Backend</div><div class="layer-detail" id="detail-backend">&mdash;</div></div>
494
+ <div class="arch-connector"><div class="data-dot" style="animation-delay:-1s"></div></div>
495
+ <div class="arch-layer" id="layer-runtime"><div class="layer-name">Runtime</div><div class="layer-detail">CUDA / ROCm</div></div>
496
+ <div class="arch-connector"><div class="data-dot" style="animation-delay:-1.5s"></div></div>
497
+ <div class="arch-layer" id="layer-memory"><div class="layer-name">Memory</div><div class="layer-detail">HBM / KV Cache</div></div>
498
+ <div class="arch-connector"><div class="data-dot" style="animation-delay:-2s"></div></div>
499
+ <div class="arch-layer" id="layer-driver"><div class="layer-name">Driver</div><div class="layer-detail" id="detail-driver">&mdash;</div></div>
500
+ </div>
501
+ </div>
502
+
503
+ <div class="panel">
504
+ <div class="panel-header">
505
+ <div class="panel-header-dot"></div>
506
+ <div class="panel-header-title">Investigation Log</div>
507
+ </div>
508
+ <div class="log-body" id="logBody">
509
+ <div class="idle-prompt" id="idlePrompt">
510
+ <div class="idle-text">Awaiting incident assignment.<br>Click <strong>Run Comparison</strong> above to start.</div>
511
+ </div>
512
+ </div>
513
+ </div>
514
+
515
+ <div class="panel">
516
+ <div class="panel-header">
517
+ <div class="panel-header-dot"></div>
518
+ <div class="panel-header-title">Specialist Agents</div>
519
+ </div>
520
+ <div class="specialists-body" id="specialistsBody"></div>
521
+ </div>
522
+ </div>
523
+
524
+ <!-- ══════════ VITALS ══════════ -->
525
+ <div class="vitals-bar">
526
+ <div class="vital">
527
+ <div class="vital-label">Steps</div>
528
+ <div class="steps-dots" id="stepsDots">
529
+ <div class="step-dot"></div><div class="step-dot"></div><div class="step-dot"></div>
530
+ <div class="step-dot"></div><div class="step-dot"></div><div class="step-dot"></div>
531
+ </div>
532
+ </div>
533
+ <div class="vital"><div class="vital-label">Reward</div><div class="vital-value" id="rewardValue">0.00</div></div>
534
+ <div class="vital"><div class="vital-label">Fix Status</div><div class="vital-value" id="fixStatus" style="font-size:13px">Not Applied</div></div>
535
+ <div class="vital"><div class="vital-label">Root Cause</div><div class="vital-value" id="rootCauseValue" style="font-size:13px">&mdash;</div></div>
536
+ <div class="vital"><div class="vital-label">Diagnosis</div><div class="vital-value" id="diagnosisValue" style="font-size:13px">Pending</div></div>
537
+ </div>
538
+ </div>
539
+
540
+ <div class="diagnosis-overlay" id="diagnosisOverlay">
541
+ <div class="diagnosis-card">
542
+ <div class="diagnosis-title">Diagnosis Submitted</div>
543
+ <div class="diagnosis-result" id="diagnosisResult"></div>
544
+ <div class="diagnosis-reward" id="diagnosisReward">+0.00</div>
545
+ <div class="diagnosis-reward-label">Episode Reward</div>
546
+ </div>
547
+ </div>
548
+
549
+ <script>
550
+ /* ═══════════════════════════════════════════════
551
+ TRAINING DATA — Qwen3.5-9B, 100 GRPO steps
552
+ ═══════════════════════════════════════════════ */
553
+ var TRAIN_DATA = {
554
+ steps: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100],
555
+ reward: [9.65,19.75,7.25,24.25,23.00,22.75,2.62,22.75,23.00,24.50,16.00,23.00,24.50,7.25,24.62,26.00,26.00,26.00,22.50,14.12,-5.45,7.12,-2.60,-8.50,18.50,0.88,26.25,7.88,9.62,7.88,6.88,24.25,-6.25,-5.50,-1.88,-1.75,-1.75,-5.12,-2.62,-1.75,-1.75,-1.75,-2.62,-1.75,-1.75,-1.75,-1.75,-1.75,-1.75,-1.75,-5.12,-1.75,-1.75,-1.75,-1.75,-1.75,-1.75,-1.75,-5.12,-5.12,-1.75,-1.75,-1.75,-1.75,-8.50,-5.12,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-5.00,-8.50,-8.50,-8.50,-1.75,-5.12,-1.75,-1.75,-1.75,-5.12,-1.75,-1.75,-0.12,-1.75,-2.62,-1.75,-1.75],
556
+ completion_length: [68,98,91,96,85,92,96,74,99,71,86,116,112,124,126,175,105,120,152,112,148,190,193,182,164,135,140,218,130,152,164,182,134,93,24,15,18,87,16,16,15,15,15,15,13,15,15,16,15,15,125,62,180,876,376,280,883,484,734,470,432,488,354,177,471,607,248,210,1234,3,2048,2048,2048,2048,2048,1024,1026,1078,2048,1025,14,1030,2048,1025,2048,2048,2048,1364,1032,15,15,15,14,15,15,1032,2048,1065,2048,1058],
557
+ };
558
+
559
+ /* Qwen2.5-1.5B — 16 steps before crash (fixed in next run) */
560
+ var TRAIN_DATA_1B = {
561
+ steps: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16],
562
+ reward: [-5.625,-3.375,-8.375,-7.75,-2.75,-2.625,-3.0,-5.125,-8.875,-3.0,-1.75,-3.875,-9.5,-5.5,-3.125,-6.125],
563
+ completion_length: [65.5,57.5,53,36.5,81,98,89,41.5,46,62.5,75,72.5,48,52.5,44,60],
564
+ };
565
+
566
+ /* ═══════════════════════════════════════════════
567
+ CHARTS — lightweight canvas rendering
568
+ ═══════════════════════════════════════════════ */
569
+ function drawChart(canvasId, data, opts) {
570
+ var canvas = document.getElementById(canvasId);
571
+ var dpr = window.devicePixelRatio || 1;
572
+ var rect = canvas.parentElement.getBoundingClientRect();
573
+ canvas.width = rect.width * dpr;
574
+ canvas.height = rect.height * dpr;
575
+ canvas.style.width = rect.width + 'px';
576
+ canvas.style.height = rect.height + 'px';
577
+ var ctx = canvas.getContext('2d');
578
+ ctx.scale(dpr, dpr);
579
+ var W = rect.width, H = rect.height;
580
+ var pad = { top: 28, right: 24, bottom: 40, left: 64 };
581
+ var plotW = W - pad.left - pad.right;
582
+ var plotH = H - pad.top - pad.bottom;
583
+
584
+ var minY = opts.minY !== undefined ? opts.minY : Math.min.apply(null, data);
585
+ var maxY = opts.maxY !== undefined ? opts.maxY : Math.max.apply(null, data);
586
+ var rangeY = maxY - minY || 1;
587
+
588
+ function xPos(i) { return pad.left + (i / (data.length - 1)) * plotW; }
589
+ function yPos(v) { return pad.top + plotH - ((v - minY) / rangeY) * plotH; }
590
+
591
+ // Grid lines
592
+ ctx.strokeStyle = 'rgba(100,180,255,0.06)';
593
+ ctx.lineWidth = 1;
594
+ var gridCount = 5;
595
+ for (var g = 0; g <= gridCount; g++) {
596
+ var gy = pad.top + (g / gridCount) * plotH;
597
+ ctx.beginPath(); ctx.moveTo(pad.left, gy); ctx.lineTo(W - pad.right, gy); ctx.stroke();
598
+ var label = (maxY - (g / gridCount) * rangeY).toFixed(0);
599
+ ctx.fillStyle = 'rgba(216,224,236,0.25)';
600
+ ctx.font = '10px IBM Plex Mono';
601
+ ctx.textAlign = 'right';
602
+ ctx.fillText(label, pad.left - 12, gy + 4);
603
+ }
604
+
605
+ // X axis labels
606
+ var stepsArr = opts.stepsArray || TRAIN_DATA.steps;
607
+ var xInterval = data.length > 30 ? 20 : data.length > 10 ? 5 : 2;
608
+ ctx.fillStyle = 'rgba(216,224,236,0.25)';
609
+ ctx.textAlign = 'center';
610
+ for (var x = 0; x < data.length; x += xInterval) {
611
+ ctx.fillText(stepsArr[x], xPos(x), H - 8);
612
+ }
613
+ ctx.fillText(stepsArr[data.length - 1], xPos(data.length - 1), H - 8);
614
+
615
+ // Zero line for reward chart
616
+ if (opts.zeroLine) {
617
+ var zy = yPos(0);
618
+ if (zy >= pad.top && zy <= pad.top + plotH) {
619
+ ctx.strokeStyle = 'rgba(255,255,255,0.12)';
620
+ ctx.setLineDash([4, 4]);
621
+ ctx.beginPath(); ctx.moveTo(pad.left, zy); ctx.lineTo(W - pad.right, zy); ctx.stroke();
622
+ ctx.setLineDash([]);
623
+ }
624
+ }
625
+
626
+ // Reference lines (baselines) — drawn before data so they appear underneath
627
+ if (opts.refLines) {
628
+ opts.refLines.forEach(function(ref) {
629
+ var ry = yPos(ref.value);
630
+ if (ry >= pad.top - 5 && ry <= pad.top + plotH + 5) {
631
+ ctx.strokeStyle = ref.color || 'rgba(255,255,255,0.2)';
632
+ ctx.lineWidth = 1.5;
633
+ ctx.setLineDash(ref.dash || [6, 4]);
634
+ ctx.beginPath(); ctx.moveTo(pad.left, ry); ctx.lineTo(W - pad.right, ry); ctx.stroke();
635
+ ctx.setLineDash([]);
636
+ // Draw label with opaque background so it's never covered
637
+ var labelY = ref.labelBelow ? ry + 24 : ry - 16;
638
+ ctx.font = '600 9px IBM Plex Mono';
639
+ var textWidth = ctx.measureText(ref.label).width;
640
+ // Background pill
641
+ ctx.fillStyle = 'rgba(6, 10, 17, 0.85)';
642
+ ctx.beginPath();
643
+ ctx.roundRect(W - pad.right - textWidth - 16, labelY - 9, textWidth + 12, 14, 3);
644
+ ctx.fill();
645
+ // Label text
646
+ ctx.fillStyle = ref.color || 'rgba(255,255,255,0.3)';
647
+ ctx.textAlign = 'right';
648
+ ctx.fillText(ref.label, W - pad.right - 8, labelY);
649
+ }
650
+ });
651
+ }
652
+
653
+ // Regions
654
+ if (opts.regions) {
655
+ opts.regions.forEach(function(r) {
656
+ var x1 = xPos(r.from);
657
+ var x2 = xPos(r.to);
658
+ ctx.fillStyle = r.color;
659
+ ctx.fillRect(x1, pad.top, x2 - x1, plotH);
660
+ ctx.fillStyle = r.labelColor || 'rgba(216,224,236,0.35)';
661
+ ctx.font = '600 8px IBM Plex Mono';
662
+ ctx.textAlign = 'center';
663
+ ctx.fillText(r.label, (x1 + x2) / 2, pad.top - 6);
664
+ });
665
+ }
666
+
667
+ // Line gradient
668
+ var grad = ctx.createLinearGradient(pad.left, 0, W - pad.right, 0);
669
+ if (opts.gradientStops) {
670
+ opts.gradientStops.forEach(function(s) { grad.addColorStop(s[0], s[1]); });
671
+ } else {
672
+ grad.addColorStop(0, opts.color || '#00c4ff');
673
+ grad.addColorStop(1, opts.color || '#00c4ff');
674
+ }
675
+
676
+ // Area fill
677
+ ctx.beginPath();
678
+ ctx.moveTo(xPos(0), yPos(data[0]));
679
+ for (var i = 1; i < data.length; i++) ctx.lineTo(xPos(i), yPos(data[i]));
680
+ ctx.lineTo(xPos(data.length - 1), yPos(minY));
681
+ ctx.lineTo(xPos(0), yPos(minY));
682
+ ctx.closePath();
683
+ var areaGrad = ctx.createLinearGradient(0, pad.top, 0, pad.top + plotH);
684
+ areaGrad.addColorStop(0, (opts.areaColor || 'rgba(0,196,255,0.12)'));
685
+ areaGrad.addColorStop(1, 'rgba(0,196,255,0)');
686
+ ctx.fillStyle = areaGrad;
687
+ ctx.fill();
688
+
689
+ // Line
690
+ ctx.strokeStyle = grad;
691
+ ctx.lineWidth = 2;
692
+ ctx.lineJoin = 'round';
693
+ ctx.beginPath();
694
+ ctx.moveTo(xPos(0), yPos(data[0]));
695
+ for (var j = 1; j < data.length; j++) ctx.lineTo(xPos(j), yPos(data[j]));
696
+ ctx.stroke();
697
+
698
+ // Dot at end
699
+ var lastX = xPos(data.length - 1), lastY = yPos(data[data.length - 1]);
700
+ ctx.fillStyle = opts.color || '#00c4ff';
701
+ ctx.beginPath(); ctx.arc(lastX, lastY, 3, 0, Math.PI * 2); ctx.fill();
702
+ }
703
+
704
+ function renderCharts() {
705
+ drawChart('rewardChart', TRAIN_DATA.reward, {
706
+ minY: -12, maxY: 30, zeroLine: true,
707
+ gradientStops: [[0, '#00e676'], [0.18, '#00e676'], [0.22, '#f0a030'], [0.35, '#ff3d5a'], [1, '#ff3d5a']],
708
+ areaColor: 'rgba(0,196,255,0.08)',
709
+ regions: [
710
+ { from: 0, to: 19, label: 'BASE MODEL', color: 'rgba(0,230,118,0.04)', labelColor: 'rgba(0,230,118,0.5)' },
711
+ { from: 20, to: 35, label: 'DEGRADATION', color: 'rgba(240,160,48,0.04)', labelColor: 'rgba(240,160,48,0.5)' },
712
+ { from: 36, to: 69, label: 'COLLAPSE', color: 'rgba(255,61,90,0.04)', labelColor: 'rgba(255,61,90,0.5)' },
713
+ { from: 70, to: 99, label: 'CLIPPING', color: 'rgba(255,61,90,0.03)', labelColor: 'rgba(255,61,90,0.4)' },
714
+ ],
715
+ refLines: [
716
+ { value: 19.5, label: '9B BASELINE +19.5', color: 'rgba(0,230,118,0.5)', dash: [8, 4] },
717
+ ],
718
+ });
719
+
720
+ drawChart('lengthChart', TRAIN_DATA.completion_length, {
721
+ minY: 0, maxY: 2200,
722
+ color: '#f0a030',
723
+ areaColor: 'rgba(240,160,48,0.08)',
724
+ regions: [
725
+ { from: 0, to: 35, label: 'NORMAL', color: 'rgba(0,230,118,0.03)', labelColor: 'rgba(0,230,118,0.4)' },
726
+ { from: 36, to: 68, label: 'SHORT OUTPUT', color: 'rgba(240,160,48,0.04)', labelColor: 'rgba(240,160,48,0.5)' },
727
+ { from: 69, to: 99, label: 'HIT CAP (2048)', color: 'rgba(255,61,90,0.04)', labelColor: 'rgba(255,61,90,0.5)' },
728
+ ],
729
+ });
730
+
731
+ drawChart('reward1bChart', TRAIN_DATA_1B.reward, {
732
+ minY: -12, maxY: 22, zeroLine: true,
733
+ stepsArray: TRAIN_DATA_1B.steps,
734
+ color: '#00c4ff',
735
+ areaColor: 'rgba(0,196,255,0.1)',
736
+ gradientStops: [[0, '#ff3d5a'], [0.4, '#f0a030'], [0.7, '#00c4ff'], [1, '#00c4ff']],
737
+ refLines: [
738
+ { value: 19.5, label: '9B BASELINE +19.5', color: 'rgba(0,230,118,0.4)', dash: [8, 4] },
739
+ { value: -4.9, label: '1.5B BASELINE -4.9', color: 'rgba(240,160,48,0.5)', dash: [4, 4], labelBelow: true },
740
+ ],
741
+ });
742
+ }
743
+
744
+ window.addEventListener('resize', renderCharts);
745
+ setTimeout(renderCharts, 100);
746
+
747
+ /* ═══════════════════════════════════════════════
748
+ WAR ROOM ENGINE
749
+ ═══════════════════════════════════════════════ */
750
+ var ACTION_ICONS = { inspect: '\u2315', ask_specialist: '\u25C9', apply_fix: '\u26A1', submit: '\u2713', reward_breakdown: '\u2261' };
751
+ var state = { step: 0, reward: 0, fixApplied: false, done: false, startTime: null, timerInterval: null };
752
+
753
+ function startTimer() {
754
+ state.startTime = Date.now();
755
+ state.timerInterval = setInterval(function() {
756
+ var e = Math.floor((Date.now() - state.startTime) / 1000);
757
+ document.getElementById('episodeTime').textContent = String(Math.floor(e/60)).padStart(2,'0') + ':' + String(e%60).padStart(2,'0');
758
+ }, 1000);
759
+ }
760
+ function stopTimer() { if (state.timerInterval) clearInterval(state.timerInterval); }
761
+
762
+ function updateSteps(step) {
763
+ document.querySelectorAll('.step-dot').forEach(function(d, i) {
764
+ d.className = 'step-dot';
765
+ if (i < step) d.classList.add('used');
766
+ else if (i === step && !state.done) d.classList.add('current');
767
+ });
768
+ }
769
+
770
+ function updateReward(delta) {
771
+ state.reward += delta;
772
+ var el = document.getElementById('rewardValue');
773
+ el.textContent = (state.reward >= 0 ? '+' : '') + state.reward.toFixed(2);
774
+ el.className = 'vital-value ' + (state.reward >= 0 ? 'emerald' : 'coral');
775
+ }
776
+
777
+ function setLayerState(id, cls) {
778
+ var el = document.getElementById('layer-' + id);
779
+ if (el) el.className = 'arch-layer' + (cls ? ' ' + cls : '');
780
+ }
781
+ function clearAllLayers() { document.querySelectorAll('.arch-layer').forEach(function(e) { e.className = 'arch-layer'; }); }
782
+ function setStatus(text, cls) { var el = document.getElementById('statusBadge'); el.textContent = text; el.className = 'status-badge' + (cls ? ' ' + cls : ''); }
783
+
784
+ function addLogEntry(opts) {
785
+ var idle = document.getElementById('idlePrompt'); if (idle) idle.remove();
786
+ var body = document.getElementById('logBody');
787
+ var entry = document.createElement('div'); entry.className = 'log-entry';
788
+ var iconCls = opts.type === 'ask_specialist' ? 'specialist' : opts.type;
789
+ var typeCls = opts.type === 'inspect' ? 'cyan' : opts.type === 'ask_specialist' ? 'purple' : opts.type === 'apply_fix' ? 'amber' : 'emerald';
790
+ var iconEl = document.createElement('div'); iconEl.className = 'log-entry-icon ' + iconCls; iconEl.textContent = ACTION_ICONS[opts.type] || '\u2022';
791
+ var contentEl = document.createElement('div'); contentEl.className = 'log-entry-content';
792
+ var headerEl = document.createElement('div'); headerEl.className = 'log-entry-header';
793
+ var typeEl = document.createElement('div'); typeEl.className = 'log-entry-type ' + typeCls; typeEl.textContent = opts.label;
794
+ var stepEl = document.createElement('div'); stepEl.className = 'log-entry-step'; stepEl.textContent = 'Step ' + state.step + '/6';
795
+ headerEl.appendChild(typeEl); headerEl.appendChild(stepEl);
796
+ var textEl = document.createElement('div'); textEl.className = 'log-entry-text'; textEl.textContent = opts.text;
797
+ contentEl.appendChild(headerEl); contentEl.appendChild(textEl);
798
+ if (opts.reward !== undefined) {
799
+ var rEl = document.createElement('div'); rEl.className = 'log-entry-reward ' + (opts.reward >= 0 ? 'positive' : 'negative');
800
+ rEl.textContent = (opts.reward >= 0 ? '+' : '') + opts.reward.toFixed(2); contentEl.appendChild(rEl);
801
+ }
802
+ entry.appendChild(iconEl); entry.appendChild(contentEl);
803
+ body.appendChild(entry); body.scrollTop = body.scrollHeight;
804
+ }
805
+
806
+ function addIncidentCard(sc) {
807
+ var idle = document.getElementById('idlePrompt'); if (idle) idle.remove();
808
+ var body = document.getElementById('logBody');
809
+ var card = document.createElement('div'); card.className = 'incident-card';
810
+ var label = document.createElement('div'); label.className = 'incident-label'; label.textContent = 'Incident Ticket';
811
+ var text = document.createElement('div'); text.className = 'incident-text'; text.textContent = sc.incident_ticket;
812
+ var meta = document.createElement('div'); meta.className = 'incident-meta';
813
+ [['Hardware', sc.hardware], ['Model', sc.model_name], ['Backend', sc.backend]].forEach(function(f) {
814
+ var item = document.createElement('div'); item.className = 'incident-meta-item';
815
+ var ml = document.createElement('div'); ml.className = 'meta-label'; ml.textContent = f[0];
816
+ var mv = document.createElement('div'); mv.className = 'meta-value'; mv.textContent = f[1];
817
+ item.appendChild(ml); item.appendChild(mv); meta.appendChild(item);
818
+ });
819
+ card.appendChild(label); card.appendChild(text); card.appendChild(meta); body.appendChild(card);
820
+ }
821
+
822
+ function populateSpecialists(ops) {
823
+ var body = document.getElementById('specialistsBody'); body.textContent = '';
824
+ var i = 0;
825
+ Object.keys(ops).forEach(function(name) {
826
+ var d = ops[name];
827
+ var card = document.createElement('div'); card.className = 'specialist-card'; card.id = 'specialist-' + name;
828
+ card.style.animationDelay = (i * 0.1) + 's';
829
+ var top = document.createElement('div'); top.className = 'specialist-top';
830
+ var nameEl = document.createElement('div'); nameEl.className = 'specialist-name'; nameEl.textContent = name;
831
+ var barW = document.createElement('div'); barW.className = 'confidence-bar';
832
+ var barF = document.createElement('div'); barF.className = 'confidence-fill'; barF.style.width = (d.confidence * 100) + '%';
833
+ barW.appendChild(barF); top.appendChild(nameEl); top.appendChild(barW);
834
+ var opEl = document.createElement('div'); opEl.className = 'specialist-opinion'; opEl.textContent = d.opinion;
835
+ var vEl = document.createElement('div'); vEl.className = 'specialist-verdict';
836
+ card.appendChild(top); card.appendChild(opEl); card.appendChild(vEl); body.appendChild(card); i++;
837
+ });
838
+ }
839
+
840
+ function highlightSpecialist(name) {
841
+ document.querySelectorAll('.specialist-card').forEach(function(c) { c.classList.remove('highlighted'); });
842
+ var c = document.getElementById('specialist-' + name); if (c) c.classList.add('highlighted');
843
+ }
844
/* Grade a specialist card after the episode ends: 'correct' styling with a
   check mark when the specialist was helpful, 'wrong' styling otherwise. */
function markSpecialist(name, correct) {
  var card = document.getElementById('specialist-' + name);
  if (!card) return;
  card.classList.remove('highlighted');
  card.classList.add(correct ? 'correct' : 'wrong');
  var verdict = card.querySelector('.specialist-verdict');
  if (verdict) {
    verdict.textContent = correct ? '\u2713 Helpful' : '\u2014 Not Relevant';
  }
}
849
+
850
/* Render the end-of-episode diagnosis overlay: a graded row each for root
   cause and fix, plus steps used, elapsed time, and the color-coded total
   reward. Expects d = {rootCause, fix, rcCorrect, fixCorrect, correctRc,
   correctFix, totalReward}; reads episode progress from the global `state`. */
function showDiagnosis(d) {
  var overlay = document.getElementById('diagnosisOverlay');
  var result = document.getElementById('diagnosisResult'); result.textContent = '';
  // Each tuple is [label, submitted value, was-correct flag, expected value].
  [['Root Cause', d.rootCause, d.rcCorrect, d.correctRc], ['Fix', d.fix, d.fixCorrect, d.correctFix]].forEach(function(r) {
    var row = document.createElement('div'); row.className = 'diagnosis-row';
    var lbl = document.createElement('div'); lbl.className = 'diagnosis-row-label'; lbl.textContent = r[0];
    var val = document.createElement('div'); val.className = 'diagnosis-row-value ' + (r[2] ? 'correct' : 'wrong');
    // Correct: "value ✓". Wrong: "value ✗ → expected".
    val.textContent = r[1] + (r[2] ? ' \u2713' : ' \u2717 \u2192 ' + r[3]);
    row.appendChild(lbl); row.appendChild(val); result.appendChild(row);
  });
  // Steps used + time
  var stepsRow = document.createElement('div'); stepsRow.className = 'diagnosis-row';
  var stepsLbl = document.createElement('div'); stepsLbl.className = 'diagnosis-row-label'; stepsLbl.textContent = 'Steps Used';
  var stepsVal = document.createElement('div'); stepsVal.className = 'diagnosis-row-value'; stepsVal.textContent = state.step + ' / 6';
  stepsRow.appendChild(stepsLbl); stepsRow.appendChild(stepsVal); result.appendChild(stepsRow);

  var timeRow = document.createElement('div'); timeRow.className = 'diagnosis-row';
  var timeLbl = document.createElement('div'); timeLbl.className = 'diagnosis-row-label'; timeLbl.textContent = 'Time';
  // Whole seconds since startTimer() stamped state.startTime; 0 if never started.
  var elapsed = state.startTime ? Math.round((Date.now() - state.startTime) / 1000) : 0;
  var timeVal = document.createElement('div'); timeVal.className = 'diagnosis-row-value'; timeVal.textContent = elapsed + 's';
  timeRow.appendChild(timeLbl); timeRow.appendChild(timeVal); result.appendChild(timeRow);

  var rEl = document.getElementById('diagnosisReward');
  // Explicit "+" sign for non-negative rewards; green for >= 0, red otherwise.
  rEl.textContent = (d.totalReward >= 0 ? '+' : '') + d.totalReward.toFixed(2);
  rEl.style.color = d.totalReward >= 0 ? 'var(--emerald)' : 'var(--coral)';
  overlay.classList.add('visible');
}
877
+
878
/* Return the dashboard to its pristine "no episode" state: reset the global
   `state` record, stop the episode timer, clear the action log (restoring the
   idle placeholder), empty the specialists panel, zero every vital readout,
   hide the diagnosis overlay, and reset status/connection badges.
   Fix over previous version: the log panel element is looked up once and
   reused instead of a redundant second getElementById('logBody') call. */
function resetState() {
  state = { step: 0, reward: 0, fixApplied: false, done: false, startTime: null, timerInterval: null };
  stopTimer();
  var logBody = document.getElementById('logBody');
  logBody.textContent = '';
  // Rebuild the idle placeholder inside the cleared log panel.
  var idle = document.createElement('div'); idle.className = 'idle-prompt'; idle.id = 'idlePrompt';
  var txt = document.createElement('div'); txt.className = 'idle-text'; txt.textContent = 'Awaiting incident assignment.\nConnect to a live Stack Doctor environment above.';
  idle.appendChild(txt); logBody.appendChild(idle);
  document.getElementById('specialistsBody').textContent = '';
  document.getElementById('rewardValue').textContent = '0.00'; document.getElementById('rewardValue').className = 'vital-value';
  document.getElementById('fixStatus').textContent = 'Not Applied'; document.getElementById('fixStatus').className = 'vital-value';
  document.getElementById('rootCauseValue').textContent = '\u2014'; document.getElementById('rootCauseValue').className = 'vital-value';
  document.getElementById('diagnosisValue').textContent = 'Pending'; document.getElementById('diagnosisValue').className = 'vital-value';
  document.getElementById('scenarioId').textContent = '\u2014';
  document.getElementById('episodeTime').textContent = '00:00';
  document.getElementById('diagnosisOverlay').classList.remove('visible');
  setStatus('Standby', ''); clearAllLayers(); updateSteps(0);
  setConnStatus('Disconnected', '');
}
897
+
898
+ /* ═══════════════════════════════════════════════
899
+ DEMO — arch_guard_01
900
+ ═══════════════════════════════════════════════ */
901
/* Canned scenario used by the offline fallback when no environment server is
   reachable. Mirrors the server's observation shape: incident metadata,
   per-specialist opinions (the dispatch/kernel specialists are the helpful
   ones here), and pre-baked inspect/followup payloads. */
var DEMO = {
  id: 'arch_guard_01', root_cause: 'arch_guard', correct_fix: 'relax_arch_check',
  incident_ticket: "FlashInfer attention kernel fails to launch on newly provisioned DGX Spark nodes. Error: 'Unsupported GPU architecture sm_121'. Identical model config works on H100 nodes.",
  hardware: 'NVIDIA SM121 (DGX Spark)', model_name: 'DeepSeek-V3-671B', backend: 'FlashInfer 0.4',
  specialist_opinions: {
    runtime: { opinion: "CUDA runtime loaded successfully. No runtime issues detected.", confidence: 0.85 },
    dispatch: { opinion: "Architecture check is blocking kernel dispatch. SM121 is not in the supported set despite being SM90-compatible.", confidence: 0.92 },
    kernel: { opinion: "HMMA m16n8k16 instructions available on SM121. Capability check issue.", confidence: 0.88 },
    loader: { opinion: "Model weights loaded correctly. Weight layout is standard.", confidence: 0.80 },
  },
  inspect_logs: "[FlashInfer] GPU: NVIDIA GH200 (sm_121)\n[FlashInfer] is_supported_arch(121) = False\n[FlashInfer] Architecture check FAILED\n[CUDA] All CUDA operations nominal\n[System] GPU memory: 96GB available",
  inspect_config: "gpu_architecture: sm_121\ncuda_version: 13.0\nflashinfer_version: 0.4.1\nsupported_archs: [70, 75, 80, 86, 89, 90]",
  followup_dispatch: "The dispatch table maps arch -> kernel. SM121 has no entry. Adding sm_12x family to the arch check should resolve this.",
};
915
+
916
+ function sleep(ms) { return new Promise(function(r) { setTimeout(r, ms); }); }
917
+
918
+ /* ══════════════════════════════════════════════
919
+ LIVE ENVIRONMENT CONNECTION
920
+ ══════════════════════════════════════════════ */
921
+ var SERVER = { url: '', ws: null };
922
+
923
+ function getServerUrl() { return document.getElementById('serverUrl').value.replace(/\/$/, ''); }
924
/* Derive the WebSocket endpoint. With an explicit server URL, swap the http
   scheme for ws and append /ws; otherwise fall back to the page's own origin. */
function getWsUrl() {
  var base = getServerUrl();
  if (base) {
    return base.replace(/^http/, 'ws') + '/ws';
  }
  // Same-origin: derive WS URL from current page
  var scheme = location.protocol === 'https:' ? 'wss:' : 'ws:';
  return scheme + '//' + location.host + '/ws';
}
933
/* Update the connection badge text and optional modifier class. */
function setConnStatus(text, cls) {
  var badge = document.getElementById('connStatus');
  badge.textContent = text;
  badge.className = cls ? 'conn-status ' + cls : 'conn-status';
}
937
+
938
+ /* WebSocket-based communication — maintains session state across reset/step */
939
/* WebSocket-based communication — maintains session state across reset/step.
   Opens a socket to getWsUrl(), caches it on SERVER.ws once open, and clears
   the cache on close. Resolves with the socket; rejects on connection error.
   NOTE(review): there is no connect timeout here — a hung handshake leaves the
   returned promise pending forever (the raw path in runLiveModel adds one). */
function wsConnect() {
  return new Promise(function(resolve, reject) {
    var url = getWsUrl();
    var ws = new WebSocket(url);
    ws.onopen = function() { SERVER.ws = ws; resolve(ws); };
    ws.onerror = function(e) { reject(new Error('WebSocket connection failed')); };
    ws.onclose = function() { SERVER.ws = null; };
  });
}
948
+
949
/* Send one {type, data} frame on the shared socket and resolve with the next
   parsed message. Assumes a strict request/response protocol: each call
   replaces ws.onmessage wholesale, so overlapping in-flight sends would steal
   each other's replies — callers must await one send before issuing another.
   Server-reported errors still resolve (type === 'error'); only transport
   problems (not connected, unparseable JSON) reject. */
function wsSend(type, data) {
  return new Promise(function(resolve, reject) {
    if (!SERVER.ws || SERVER.ws.readyState !== WebSocket.OPEN) {
      reject(new Error('WebSocket not connected')); return;
    }
    SERVER.ws.onmessage = function(evt) {
      try {
        var msg = JSON.parse(evt.data);
        resolve(msg); /* Always resolve — caller handles errors */
      }
      catch (e) { reject(new Error('Bad JSON from server')); }
    };
    SERVER.ws.send(JSON.stringify({ type: type, data: data || {} }));
  });
}
964
+
965
/* Close and forget the shared socket; safe to call when already closed. */
function wsClose() {
  if (!SERVER.ws) return;
  SERVER.ws.close();
  SERVER.ws = null;
}
968
+
969
+ /* Convenience wrappers — returns {observation, reward, done} from data envelope */
970
/* Reset the environment over the session socket (connecting lazily first).
   Returns the {observation, reward, done} envelope; throws on a server-side
   error reply. */
async function serverReset(body) {
  if (!SERVER.ws) {
    await wsConnect();
  }
  var reply = await wsSend('reset', body || {});
  if (reply.type !== 'error') {
    return reply.data; /* {observation, reward, done} */
  }
  var reason = (reply.data && reply.data.message) || 'Reset failed';
  throw new Error(reason);
}
976
+
977
/* Execute one environment step with the raw JSON action string.
   WS step format: {type: "step", data: {message: "..."}} — NOT wrapped in an
   action object. Returns the {observation, reward, done} envelope; a
   server-side validation error (invalid target, specialist, etc.) is turned
   into a synthetic -2.0 penalty step instead of throwing. */
async function serverStep(actionMessage) {
  var reply = await wsSend('step', { message: actionMessage });
  if (reply.type !== 'error') {
    return reply.data; /* {observation, reward, done} */
  }
  var reason = (reply.data && reply.data.message) || 'Unknown error';
  return { observation: { output: 'Error: ' + reason }, reward: -2.0, done: false };
}
987
+
988
/* Disable the run button while an episode is in flight. */
function disableButtons() {
  var runBtn = document.getElementById('demoBtn');
  runBtn.disabled = true;
  runBtn.textContent = '\u25CF Running...';
}
992
/* Re-enable the run button and restore its idle label. */
function enableButtons() {
  var runBtn = document.getElementById('demoBtn');
  runBtn.disabled = false;
  runBtn.textContent = '\u25B6 Run Comparison (Untrained \u2192 Trained)';
}
996
+
997
+ /* Map root_cause to a layer name for the architecture diagram */
998
/* Root-cause → architecture-diagram layer. Several causes share one layer
   (e.g. all three backend-related causes light up 'backend'). */
var CAUSE_TO_LAYER = {
  arch_guard: 'backend', backend_whitelist: 'backend', backend_selector: 'backend',
  runtime_loader: 'runtime', driver_compat: 'driver',
  model_config: 'model', weight_layout: 'model',
  memory_oom: 'memory', quantization_error: 'kernel',
  distributed_comm: 'runtime'
};
1005
+
1006
+ /* Parse the environment observation to extract structured info */
1007
/* Normalize a raw environment observation into the flat shape the UI reads.
   Missing string fields default to '', reward to 0, specialists/meta to {};
   remaining/fixUsed/done pass through untouched (may be undefined). */
function parseObs(obs) {
  var view = {};
  view.output = obs.output || '';
  view.ticket = obs.incident_ticket || '';
  view.hardware = obs.hardware || '';
  view.model = obs.model_name || '';
  view.backend = obs.backend || '';
  view.log = obs.log_excerpt || '';
  view.snippet = obs.code_snippet || '';
  view.specialists = obs.specialist_opinions || {};
  view.remaining = obs.steps_remaining;
  view.fixUsed = obs.fix_used;
  view.done = obs.done;
  view.reward = obs.reward || 0;
  view.meta = obs.metadata || {};
  return view;
}
1024
+
1025
/* Run one live episode against the real environment server.
   Flow: reset (stateful WS session) → populate incident card + specialists →
   delegate to the untrained or trained driver → dismiss the overlay.
   Any failure (connect, reset, step) falls back to the scripted offline demo.
   The socket is always closed and the run button re-enabled on exit. */
async function runLive(mode) {
  disableButtons(); resetState(); setConnStatus('Connecting...', 'running');

  try {
    // Step 0: Reset — get a real scenario via WebSocket (stateful session)
    var resetResp = await serverReset({});
    // Tolerate both {observation: {...}} envelopes and bare observations.
    var obs = parseObs(resetResp.observation || resetResp);
    setConnStatus('Connected', 'connected');
    startTimer();
    setStatus('Incident Received', 'warning');

    // Extract scenario ID from metadata
    var scenarioId = (obs.meta && obs.meta.scenario_id) || 'unknown';
    document.getElementById('scenarioId').textContent = scenarioId;
    document.getElementById('detail-model').textContent = obs.model;
    document.getElementById('detail-backend').textContent = obs.backend;
    document.getElementById('detail-driver').textContent = obs.hardware;

    // Populate incident card from real data
    addIncidentCard({
      incident_ticket: obs.ticket,
      hardware: obs.hardware, model_name: obs.model, backend: obs.backend
    });
    populateSpecialists(obs.specialists);
    await sleep(1800);

    if (mode === 'untrained') {
      await runLiveUntrained(obs);
    } else {
      await runLiveTrained(obs);
    }

    // Leave the diagnosis overlay up for 6s, then dismiss.
    await sleep(6000);
    document.getElementById('diagnosisOverlay').classList.remove('visible');
  } catch (e) {
    console.error('Live connection failed:', e);
    addLogEntry({ type: 'submit', label: 'CONNECTION ERROR', reward: 0, text: e.message + '\n' + (e.stack || '') });
    wsClose();
    setConnStatus('Offline mode', 'running');
    // Fall back to the canned DEMO scenario so the dashboard still demos.
    await runOffline(mode);
    await sleep(6000);
    document.getElementById('diagnosisOverlay').classList.remove('visible');
  }
  wsClose();
  enableButtons();
}
1071
+
1072
+ /* ══════════════════════════════════════════════
1073
+ OFFLINE FALLBACK (no server needed)
1074
+ ══════════════════════════════════════════════ */
1075
/* Offline fallback driver: replays the canned DEMO scenario (no server
   needed), populating the incident panels, then handing off to the scripted
   untrained or trained playback. */
async function runOffline(mode) {
  var sc = DEMO;
  startTimer();
  setStatus('Incident Received', 'warning');
  document.getElementById('scenarioId').textContent = sc.id;
  document.getElementById('detail-model').textContent = sc.model_name;
  document.getElementById('detail-backend').textContent = sc.backend;
  document.getElementById('detail-driver').textContent = sc.hardware;
  addIncidentCard(sc); populateSpecialists(sc.specialist_opinions);
  await sleep(1800);
  if (mode === 'untrained') { await runOfflineUntrained(sc); }
  else { await runOfflineTrained(sc); }
}
1088
+
1089
/* Scripted offline playback of an UNTRAINED agent: it submits a blind wrong
   guess with no investigation, and the hard-coded reward breakdown (-11.5)
   mirrors what the real environment would grade for this scenario. */
async function runOfflineUntrained(sc) {
  state.step = 1; updateSteps(1); setStatus('Model Acting', 'warning');
  addLogEntry({ type: 'submit', label: 'Blind Submit (no investigation)', reward: 0,
    text: 'Model skips investigation.\nOutput: [{"type":"submit","root_cause":"runtime_loader","fix":"fix_runtime_path","justification":"maybe"}]' });
  await sleep(1800);
  document.getElementById('rootCauseValue').textContent = 'runtime_loader';
  document.getElementById('rootCauseValue').className = 'vital-value coral';
  setLayerState('runtime', 'identified');
  var totalReward = -11.5;
  addLogEntry({ type: 'reward_breakdown', label: 'Reward Breakdown', reward: totalReward,
    text: 'Root cause: runtime_loader \u2717 (expected arch_guard) \u2192 -8.0\nFix: fix_runtime_path \u2717 (expected relax_arch_check) \u2192 -2.0\nNo investigation \u2192 -1.0\nJustification too short \u2192 -0.5' });
  updateReward(totalReward);
  state.done = true; setStatus('Diagnosis Submitted', 'error');
  document.getElementById('diagnosisValue').textContent = '\u2717 Incorrect';
  document.getElementById('diagnosisValue').className = 'vital-value coral';
  // Grade the specialists: dispatch/kernel were the helpful ones in DEMO.
  markSpecialist('runtime', false); markSpecialist('dispatch', true);
  markSpecialist('kernel', true); markSpecialist('loader', false);
  await sleep(1500); stopTimer();
  showDiagnosis({ rcCorrect: false, fixCorrect: false, rootCause: 'runtime_loader', fix: 'fix_runtime_path',
    correctRc: 'arch_guard', correctFix: 'relax_arch_check', totalReward: state.reward });
}
1110
+
1111
/* Scripted offline playback of a TRAINED agent: inspect logs → inspect
   config → query the dispatch specialist → apply the correct fix → submit.
   Rewards (-0.25 per probe, +3.0 fix, +19.0 submit) are hard-coded to mirror
   the environment's grading for the DEMO scenario. */
async function runOfflineTrained(sc) {
  state.step = 1; updateSteps(1); setStatus('Investigating', 'warning');
  setLayerState('runtime', 'scanning');
  addLogEntry({ type: 'inspect', label: 'Inspect Logs', reward: -0.25, text: sc.inspect_logs });
  updateReward(-0.25); await sleep(2200);

  state.step = 2; updateSteps(2);
  setLayerState('runtime', ''); setLayerState('backend', 'scanning');
  addLogEntry({ type: 'inspect', label: 'Inspect Config', reward: -0.25, text: sc.inspect_config });
  updateReward(-0.25); await sleep(2000);

  state.step = 3; updateSteps(3);
  setLayerState('backend', 'identified'); highlightSpecialist('dispatch');
  addLogEntry({ type: 'ask_specialist', label: 'Query: Dispatch', reward: -0.25, text: sc.followup_dispatch });
  updateReward(-0.25);
  document.getElementById('rootCauseValue').textContent = 'arch_guard';
  document.getElementById('rootCauseValue').className = 'vital-value amber';
  await sleep(2200);

  state.step = 4; updateSteps(4); setStatus('Applying Fix', 'warning');
  state.fixApplied = true; setLayerState('backend', 'resolved');
  addLogEntry({ type: 'apply_fix', label: 'Apply Fix: relax_arch_check', reward: 3.0, text: 'Fix applied successfully. Systems recovering.' });
  updateReward(3.0);
  document.getElementById('fixStatus').textContent = '\u2713 Applied';
  document.getElementById('fixStatus').className = 'vital-value emerald';
  // Cascade the "resolved" state across the diagram layers, 300ms apart.
  ['backend', 'runtime', 'model', 'memory', 'driver'].forEach(function(l, i) {
    setTimeout(function() { setLayerState(l, 'resolved'); }, i * 300);
  }); await sleep(2000);

  state.step = 5; updateSteps(5); state.done = true;
  setStatus('Diagnosis Submitted', 'success');
  addLogEntry({ type: 'submit', label: 'Submit Diagnosis', reward: 19.0,
    text: 'Root cause: arch_guard \u2713\nFix: relax_arch_check \u2713\nJustification: Logs show sm_121 rejected by arch check. Dispatch confirmed SM121 supports HMMA. Config missing sm_12x in supported_archs.' });
  updateReward(19.0);
  document.getElementById('diagnosisValue').textContent = '\u2713 Correct';
  document.getElementById('diagnosisValue').className = 'vital-value emerald';
  markSpecialist('runtime', false); markSpecialist('dispatch', true);
  markSpecialist('kernel', true); markSpecialist('loader', false);
  await sleep(1500); stopTimer();
  showDiagnosis({ rcCorrect: true, fixCorrect: true, rootCause: 'arch_guard', fix: 'relax_arch_check',
    correctRc: 'arch_guard', correctFix: 'relax_arch_check', totalReward: state.reward });
}
1153
+
1154
+ /* ── UNTRAINED: blind submit, no investigation ── */
1155
/* UNTRAINED driver against the live environment: submits a deliberately
   wrong blind guess and displays the environment's real grading output. */
async function runLiveUntrained(initObs) {
  setStatus('Model Acting', 'warning');
  state.step = 1; updateSteps(1);

  // Untrained model skips all investigation — just submits a random wrong guess
  addLogEntry({ type: 'inspect', label: 'Untrained Model Behavior', reward: 0,
    text: 'Model receives incident but skips investigation.\nNo logs read. No config checked. No specialists queried.\nImmediately submits a blind guess...' });
  await sleep(2000);

  // Send a deliberately wrong submit to the real environment
  state.step = 2; updateSteps(2);
  var stepResp = await serverStep('{"type":"submit","root_cause":"runtime_loader","fix":"fix_runtime_path","justification":"idk"}');
  var obs = parseObs(stepResp.observation || stepResp);
  var stepReward = stepResp.reward !== undefined ? stepResp.reward : obs.reward;
  setConnStatus('Connected', 'connected');

  // Parse the real environment response.
  // NOTE(review): indexOf('CORRECT') also matches inside 'INCORRECT', so these
  // heuristics rely on the env marking wrong answers with "WRONG (was: ...)"
  // rather than "INCORRECT" — confirm against the server's output format.
  var outputText = obs.output;
  var rcCorrect = outputText.indexOf('CORRECT') !== -1 && outputText.indexOf('Root cause') !== -1
    && outputText.split('Root cause')[1].split('\n')[0].indexOf('CORRECT') !== -1;
  var fixCorrect = outputText.indexOf('CORRECT') !== -1 && outputText.indexOf('Fix:') !== -1
    && outputText.split('Fix:')[1].split('\n')[0].indexOf('CORRECT') !== -1;

  // Extract actual correct answers from output
  var correctRc = ''; var correctFix = '';
  var rcMatch = outputText.match(/WRONG \(was: (\w+)\)/);
  if (rcMatch) correctRc = rcMatch[1];
  var fixMatch = outputText.match(/Fix:.*WRONG \(was: (\w+)\)/);
  if (fixMatch) correctFix = fixMatch[1];

  document.getElementById('rootCauseValue').textContent = 'runtime_loader';
  document.getElementById('rootCauseValue').className = 'vital-value coral';

  addLogEntry({ type: 'submit', label: 'Blind Submit', reward: stepReward,
    text: outputText });
  updateReward(stepReward);

  state.done = true;
  // NOTE(review): this reads obs.reward while the log entry above used
  // stepReward (the envelope-level reward) — likely meant to be stepReward.
  setStatus('Diagnosis Submitted', obs.reward >= 0 ? 'success' : 'error');
  document.getElementById('diagnosisValue').textContent = rcCorrect ? '\u2713 Correct' : '\u2717 Incorrect';
  document.getElementById('diagnosisValue').className = 'vital-value ' + (rcCorrect ? 'emerald' : 'coral');

  await sleep(1500); stopTimer();
  showDiagnosis({
    rcCorrect: rcCorrect, fixCorrect: fixCorrect,
    rootCause: 'runtime_loader', fix: 'fix_runtime_path',
    correctRc: correctRc || 'unknown', correctFix: correctFix || 'unknown',
    totalReward: state.reward
  });
}
1205
+
1206
+ /* ── TRAINED: investigate, then diagnose ── */
1207
/* TRAINED driver against the live environment: inspect logs, inspect config,
   query the dispatch specialist, infer the root cause from the accumulated
   observation text (inferRootCause / RC_TO_FIX are defined elsewhere in this
   file), apply the inferred fix, then submit the diagnosis and render the
   environment's real grading. */
async function runLiveTrained(initObs) {
  setStatus('Investigating', 'warning');

  // Step 1: Inspect logs
  state.step = 1; updateSteps(1);
  setLayerState('runtime', 'scanning');
  var step1 = await serverStep('{"type":"inspect","target":"logs"}');
  var obs1 = parseObs(step1.observation || step1);
  // Prefer the envelope-level reward; fall back to the observation's.
  var rew1 = step1.reward !== undefined ? step1.reward : obs1.reward;
  addLogEntry({ type: 'inspect', label: 'Inspect Logs', reward: rew1, text: obs1.output });
  updateReward(rew1);
  await sleep(2200);

  // Step 2: Inspect config
  state.step = 2; updateSteps(2);
  setLayerState('runtime', ''); setLayerState('backend', 'scanning');
  var step2 = await serverStep('{"type":"inspect","target":"config"}');
  var obs2 = parseObs(step2.observation || step2);
  var rew2 = step2.reward !== undefined ? step2.reward : obs2.reward;
  addLogEntry({ type: 'inspect', label: 'Inspect Config', reward: rew2, text: obs2.output });
  updateReward(rew2);
  await sleep(2000);

  // Step 3: Query a specialist — pick dispatch as a reasonable investigation choice
  state.step = 3; updateSteps(3);
  setLayerState('backend', 'identified');
  highlightSpecialist('dispatch');
  var step3 = await serverStep('{"type":"ask_specialist","specialist":"dispatch"}');
  var obs3 = parseObs(step3.observation || step3);
  var rew3 = step3.reward !== undefined ? step3.reward : obs3.reward;
  addLogEntry({ type: 'ask_specialist', label: 'Query: Dispatch', reward: rew3, text: obs3.output });
  updateReward(rew3);
  await sleep(2200);

  // Step 4: Smart submit — analyze the logs/config to guess the right answer
  // For the demo, we use the scenario hints from the environment output to make an informed guess
  // A real trained model would parse the observations and infer the root cause
  state.step = 4; updateSteps(4); setStatus('Diagnosing', 'warning');

  // Attempt to extract the root cause from clues in the observations
  var allText = obs1.output + ' ' + obs2.output + ' ' + obs3.output;
  var guessRc = inferRootCause(allText);
  var guessFix = RC_TO_FIX[guessRc] || 'switch_backend';
  var justification = 'Logs and config analysis indicates ' + guessRc + '. Dispatch specialist confirmed. Applying ' + guessFix + '.';

  document.getElementById('rootCauseValue').textContent = guessRc;
  document.getElementById('rootCauseValue').className = 'vital-value amber';

  // Apply fix first
  var step4 = await serverStep(JSON.stringify({ type: 'apply_fix', fix: guessFix }));
  var obs4 = parseObs(step4.observation || step4);
  var rew4 = step4.reward !== undefined ? step4.reward : obs4.reward;
  // Positive reward is treated as "the fix took"; drives the UI only.
  var fixWorked = rew4 > 0;
  addLogEntry({ type: 'apply_fix', label: 'Apply Fix: ' + guessFix, reward: rew4, text: obs4.output });
  updateReward(rew4);
  document.getElementById('fixStatus').textContent = fixWorked ? '\u2713 Applied' : '\u2717 Failed';
  document.getElementById('fixStatus').className = 'vital-value ' + (fixWorked ? 'emerald' : 'coral');

  if (fixWorked) {
    var layers = ['backend', 'runtime', 'model', 'memory', 'driver'];
    layers.forEach(function(l, i) { setTimeout(function() { setLayerState(l, 'resolved'); }, i * 300); });
  }
  await sleep(2000);

  // Step 5: Submit diagnosis (skipped if apply_fix already ended the episode)
  var isDone4 = step4.done !== undefined ? step4.done : obs4.done;
  if (!isDone4) {
    state.step = 5; updateSteps(5);
    var step5 = await serverStep(JSON.stringify({ type: 'submit', root_cause: guessRc, fix: guessFix, justification: justification }));
    var obs5 = parseObs(step5.observation || step5);
    var rew5 = step5.reward !== undefined ? step5.reward : obs5.reward;
    addLogEntry({ type: 'submit', label: 'Submit Diagnosis', reward: rew5, text: obs5.output });
    updateReward(rew5);

    // NOTE(review): indexOf('CORRECT') also matches inside 'INCORRECT'; the
    // final verdict below additionally requires the WRONG-regex NOT to match,
    // which is what actually guards against that — confirm server format.
    var outputText = obs5.output;
    var rcCorrect = outputText.indexOf('Root cause') !== -1 && outputText.split('Root cause')[1].split('\n')[0].indexOf('CORRECT') !== -1;
    var fixCorrect2 = outputText.indexOf('Fix:') !== -1 && outputText.split('Fix:')[1].split('\n')[0].indexOf('CORRECT') !== -1;

    state.done = true;
    setStatus('Diagnosis Submitted', rcCorrect ? 'success' : 'error');
    document.getElementById('diagnosisValue').textContent = rcCorrect ? '\u2713 Correct' : '\u2717 Incorrect';
    document.getElementById('diagnosisValue').className = 'vital-value ' + (rcCorrect ? 'emerald' : 'coral');

    // Default the "expected" answers to our guesses; override from the
    // server's "WRONG (was: ...)" annotations when present.
    var correctRc = guessRc; var correctFix = guessFix;
    var rcWrong = outputText.match(/WRONG \(was: (\w+)\)/);
    if (rcWrong) correctRc = rcWrong[1];
    var fixWrong = outputText.match(/Fix:.*WRONG \(was: (\w+)\)/);
    if (fixWrong) correctFix = fixWrong[1];

    await sleep(1500); stopTimer();
    showDiagnosis({
      rcCorrect: rcCorrect && !rcWrong, fixCorrect: fixCorrect2 && !fixWrong,
      rootCause: guessRc, fix: guessFix,
      correctRc: correctRc, correctFix: correctFix,
      totalReward: state.reward
    });
  } else {
    state.done = true; stopTimer();
  }
}
1307
+
1308
+ /* ══════════════════════════════════════════════
1309
+ LIVE MODEL INFERENCE — real Qwen 1.5B via MLX
1310
+ ══════════════════════════════════════════════ */
1311
+ /* Inference URL: same origin on HF Spaces, localhost:8001 locally */
1312
+ var INFERENCE_URL = (location.hostname === 'localhost' || location.hostname === '127.0.0.1') ? 'http://localhost:8001' : '';
1313
+
1314
/* Extract a JSON action list from raw model output.
   Strategy: strip <think>...</think> chain-of-thought blocks, try parsing the
   outermost [...] slice as an array, then fall back to parsing the whole text
   (wrapping a lone object in a one-element array). Returns an array of plain
   objects, or null when nothing parseable is found.
   Fix: the previous filter used `typeof x === 'object'`, which admits null
   (typeof null === 'object') — so `[1, {...}, null]` kept the null, and the
   bare input "null" returned [null]. Null entries are now rejected. */
function extractActionsJS(text) {
  text = text.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
  // An action must be a real (non-null) object; arrays of primitives are dropped.
  var isAction = function(x) { return x !== null && typeof x === 'object'; };
  var start = text.indexOf('['), end = text.lastIndexOf(']');
  if (start !== -1 && end > start) {
    try {
      var arr = JSON.parse(text.slice(start, end + 1));
      if (Array.isArray(arr)) return arr.filter(isAction);
    } catch (e) {}
  }
  try {
    var parsed = JSON.parse(text);
    if (Array.isArray(parsed)) return parsed.filter(isAction);
    if (isAction(parsed)) return [parsed];
  } catch (e) {}
  return null;
}
1323
+
1324
/* System prompt for the BASE (untrained) model: lists the action schema but
   gives no strategy guidance. */
var UNTRAINED_SYSTEM = 'You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\nYou receive an incident ticket with hardware/model/backend context, log excerpts, and specialist opinions.\nSome specialists may be wrong. Output a JSON array of actions:\n {"type":"inspect","target":"logs|config|snippet|metrics"}\n {"type":"ask_specialist","specialist":"runtime|dispatch|kernel|loader"}\n {"type":"apply_fix","fix":"<fix_name>"}\n {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<why>"}';

/* System prompt for the TRAINED (GRPO) model: same schema plus the full fix
   and root-cause vocabularies, an investigate-before-submitting policy, and a
   worked example output. */
var TRAINED_SYSTEM = 'You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\nYou are methodical: first inspect logs and config, then query specialists to cross-verify (some lie), then apply a fix and submit.\n\nAvailable actions (output as a JSON array):\n {"type":"inspect","target":"logs"} or "config" or "snippet" or "metrics"\n {"type":"ask_specialist","specialist":"runtime"} or "dispatch" or "kernel" or "loader"\n {"type":"apply_fix","fix":"<name>"} — available fixes: add_whitelist_entry, fix_comm_config, fix_quantization, fix_runtime_path, fix_weight_mapping, relax_arch_check, switch_backend, tune_memory_config, update_driver_config, update_model_config\n {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<detailed reasoning>"}\n\nAvailable root causes: arch_guard, backend_selector, backend_whitelist, distributed_comm, driver_compat, memory_oom, model_config, quantization_error, runtime_loader, weight_layout\n\nIMPORTANT: Pick ONE target per inspect, ONE specialist per query. Investigate before submitting. Give a detailed justification.\n\nExample output:\n[{"type":"inspect","target":"logs"},{"type":"inspect","target":"config"},{"type":"ask_specialist","specialist":"kernel"},{"type":"apply_fix","fix":"relax_arch_check"},{"type":"submit","root_cause":"arch_guard","fix":"relax_arch_check","justification":"Logs show architecture check failure for SM90 on the backend. Config confirms the guard is enabled. Kernel specialist confirmed this is not a kernel issue. Relaxing the arch check resolves the incompatibility."}]';
1327
+
1328
/* Top-level demo entry: run the untrained model and then the trained model on
   the SAME randomly chosen scenario (pinned by id so the comparison is fair),
   with a short interlude between the two runs. */
async function runComparison() {
  disableButtons();

  // 1. Pick a random scenario ID to pin both runs to the same problem
  var scenarioIds = ['arch_guard_01','arch_guard_02','backend_whitelist_01','backend_whitelist_02','runtime_loader_01','runtime_loader_02','backend_selector_01'];
  var scenarioId = scenarioIds[Math.floor(Math.random() * scenarioIds.length)];

  // 2. Run untrained
  await runLiveModel('untrained', scenarioId);

  // 3. Pause between runs
  await sleep(3000);
  addLogEntry({ type: 'inspect', label: '--- Now running TRAINED model on same scenario ---', reward: 0, text: 'Same incident: ' + scenarioId });
  await sleep(2000);
  resetState();

  // 4. Run trained on same scenario
  await runLiveModel('trained', scenarioId);

  enableButtons();
}
1349
+
1350
+ async function runLiveModel(mode, scenarioId) {
1351
+ resetState(); setConnStatus('Connecting...', 'running');
1352
+ var isTrained = mode === 'trained';
1353
+ var ws = null;
1354
+
1355
+ try {
1356
+ // 1. Raw WebSocket connect (derive WS URL from server URL or same-origin)
1357
+ var base = getServerUrl();
1358
+ var wsUrl;
1359
+ if (!base) {
1360
+ var proto = location.protocol === 'https:' ? 'wss:' : 'ws:';
1361
+ wsUrl = proto + '//' + location.host + '/ws';
1362
+ } else {
1363
+ wsUrl = base.replace(/^http/, 'ws') + '/ws';
1364
+ }
1365
+ ws = new WebSocket(wsUrl);
1366
+ await new Promise(function(res, rej) {
1367
+ ws.onopen = res;
1368
+ ws.onerror = function() { rej(new Error('WebSocket connect failed')); };
1369
+ setTimeout(function() { rej(new Error('WebSocket timeout')); }, 5000);
1370
+ });
1371
+
1372
+ function wsSendRaw(type, data) {
1373
+ return new Promise(function(res, rej) {
1374
+ ws.onmessage = function(e) { res(JSON.parse(e.data)); };
1375
+ ws.send(JSON.stringify({type: type, data: data || {}}));
1376
+ setTimeout(function() { rej(new Error('WS ' + type + ' timeout')); }, 15000);
1377
+ });
1378
+ }
1379
+
1380
+ // 2. Reset with pinned scenario ID (same scenario for both runs)
1381
+ var resetData = scenarioId ? { scenario_id: scenarioId } : {};
1382
+ var resetMsg = await wsSendRaw('reset', resetData);
1383
+ if (resetMsg.type === 'error') throw new Error('Reset: ' + JSON.stringify(resetMsg.data));
1384
+ var obs = resetMsg.data.observation;
1385
+ setConnStatus('Connected', 'connected');
1386
+ startTimer();
1387
+ setStatus('Incident Received', 'warning');
1388
+
1389
+ document.getElementById('scenarioId').textContent = 'live';
1390
+ document.getElementById('detail-model').textContent = obs.model_name || '';
1391
+ document.getElementById('detail-backend').textContent = obs.backend || '';
1392
+ document.getElementById('detail-driver').textContent = obs.hardware || '';
1393
+
1394
+ addIncidentCard({ incident_ticket: obs.incident_ticket, hardware: obs.hardware, model_name: obs.model_name, backend: obs.backend });
1395
+ populateSpecialists(obs.specialist_opinions || {});
1396
+ await sleep(1000);
1397
+
1398
+ // 3. Build prompt from raw observation
1399
+ var opsStr = '';
1400
+ var specs = obs.specialist_opinions || {};
1401
+ for (var name in specs) {
1402
+ var o = specs[name];
1403
+ opsStr += ' ' + name + ': ' + o.opinion + ' (confidence: ' + o.confidence + ')\n';
1404
+ }
1405
+ var userPrompt = 'INCIDENT: ' + obs.incident_ticket + '\nHardware: ' + obs.hardware + ' | Model: ' + obs.model_name + ' | Backend: ' + obs.backend + '\nLOG:\n' + obs.log_excerpt + '\nSPECIALISTS:\n' + opsStr + '\nInvestigate and submit your diagnosis as a JSON action array.';
1406
+
1407
+ // 4. Call inference server
1408
+ setStatus(isTrained ? 'Trained Model Thinking...' : 'Untrained Model Thinking...', 'warning');
1409
+ addLogEntry({ type: 'inspect', label: 'Qwen 1.5B ' + (isTrained ? '(GRPO)' : '(Base)') + ' Generating...', reward: 0, text: 'Sending scenario to local MLX model for inference...' });
1410
+
1411
+ var genResp = await fetch(INFERENCE_URL + '/generate', {
1412
+ method: 'POST', headers: { 'Content-Type': 'application/json' },
1413
+ body: JSON.stringify({ prompt: userPrompt, max_tokens: 512, mode: isTrained ? 'trained' : 'untrained', system: isTrained ? TRAINED_SYSTEM : UNTRAINED_SYSTEM })
1414
+ }).then(function(r) { return r.json(); });
1415
+
1416
+ addLogEntry({ type: 'inspect', label: 'Model Output (' + genResp.gen_time.toFixed(1) + 's)', reward: 0, text: genResp.text.slice(0, 500) });
1417
+ await sleep(500);
1418
+
1419
+ // 5. Parse actions
1420
+ var actions = extractActionsJS(genResp.text);
1421
+ if (!actions || actions.length === 0) {
1422
+ addLogEntry({ type: 'submit', label: 'Parse Failed', reward: -5, text: 'Could not parse model output as JSON actions.\nRaw: ' + genResp.text.slice(0, 200) });
1423
+ updateReward(-5);
1424
+ state.done = true; setStatus('Parse Error', 'error'); stopTimer();
1425
+ ws.close(); enableButtons(); return;
1426
+ }
1427
+
1428
+ setStatus('Executing Actions', 'warning');
1429
+
1430
+ // 6. Execute each action via raw WebSocket
1431
+ var done = false;
1432
+ var lastOutput = '';
1433
+ var totalReward = 0;
1434
+ for (var i = 0; i < actions.length && !done; i++) {
1435
+ var action = actions[i];
1436
+ var aType = action.type || '?';
1437
+ state.step = i + 1; updateSteps(i + 1);
1438
+
1439
+ // Visual feedback on architecture diagram
1440
+ if (aType === 'inspect') setLayerState('runtime', 'scanning');
1441
+ if (aType === 'ask_specialist') highlightSpecialist(action.specialist || 'dispatch');
1442
+ if (aType === 'apply_fix') setLayerState('backend', 'identified');
1443
+ if (aType === 'submit') { setStatus('Diagnosing', 'warning'); document.getElementById('rootCauseValue').textContent = action.root_cause || '?'; }
1444
+
1445
+ var stepMsg = await wsSendRaw('step', { message: JSON.stringify(action) });
1446
+
1447
+ var rew = 0;
1448
+ var stepOutput = '';
1449
+ if (stepMsg.type === 'error') {
1450
+ rew = -2;
1451
+ stepOutput = 'Error: ' + ((stepMsg.data && stepMsg.data.message) || 'unknown');
1452
+ } else {
1453
+ rew = stepMsg.data.reward || 0;
1454
+ done = stepMsg.data.done || false;
1455
+ stepOutput = (stepMsg.data.observation && stepMsg.data.observation.output) || '';
1456
+ }
1457
+ totalReward += rew;
1458
+
1459
+ var label = aType;
1460
+ if (aType === 'inspect') label = 'Inspect: ' + (action.target || '?');
1461
+ if (aType === 'ask_specialist') label = 'Query: ' + (action.specialist || '?');
1462
+ if (aType === 'apply_fix') label = 'Fix: ' + (action.fix || '?');
1463
+ if (aType === 'submit') label = 'Submit Diagnosis';
1464
+
1465
+ addLogEntry({ type: aType, label: label, reward: rew, text: stepOutput });
1466
+ updateReward(rew);
1467
+ lastOutput = stepOutput;
1468
+
1469
+ // Green up layers on successful fix
1470
+ if (aType === 'apply_fix' && rew > 0) {
1471
+ ['backend', 'runtime', 'model', 'memory', 'driver'].forEach(function(l, idx) {
1472
+ setTimeout(function() { setLayerState(l, 'resolved'); }, idx * 300);
1473
+ });
1474
+ }
1475
+ await sleep(1200);
1476
+ }
1477
+
1478
+ // 7. Final diagnosis
1479
+ state.done = true; stopTimer();
1480
+ if (lastOutput.indexOf('DIAGNOSIS SUBMITTED') !== -1 || lastOutput.indexOf('Root cause') !== -1 || done) {
1481
+ var rcCorrect = lastOutput.indexOf('CORRECT') !== -1 && lastOutput.indexOf('Root cause') !== -1 && lastOutput.split('Root cause')[1].split('\n')[0].indexOf('CORRECT') !== -1;
1482
+ setStatus('Diagnosis Submitted', totalReward > 0 ? 'success' : 'error');
1483
+ document.getElementById('diagnosisValue').textContent = totalReward > 0 ? '\u2713 Correct' : '\u2717 Incorrect';
1484
+ document.getElementById('diagnosisValue').className = 'vital-value ' + (totalReward > 0 ? 'emerald' : 'coral');
1485
+
1486
+ var submitAction = actions.find(function(a) { return a.type === 'submit'; }) || {};
1487
+ var correctRc = submitAction.root_cause || '?';
1488
+ var correctFix = submitAction.fix || '?';
1489
+ var rcWrong = lastOutput.match(/WRONG \(was: (\w+)\)/);
1490
+ if (rcWrong) correctRc = rcWrong[1];
1491
+ var fixWrong = lastOutput.match(/Fix:.*WRONG \(was: (\w+)\)/);
1492
+ if (fixWrong) correctFix = fixWrong[1];
1493
+
1494
+ showDiagnosis({
1495
+ rcCorrect: rcCorrect && !rcWrong, fixCorrect: !fixWrong,
1496
+ rootCause: submitAction.root_cause || '?', fix: submitAction.fix || '?',
1497
+ correctRc: correctRc, correctFix: correctFix,
1498
+ totalReward: state.reward
1499
+ });
1500
+
1501
+ await sleep(8000);
1502
+ document.getElementById('diagnosisOverlay').classList.remove('visible');
1503
+ } else {
1504
+ setStatus('Episode Ended', totalReward > 0 ? 'success' : 'error');
1505
+ }
1506
+
1507
+ } catch (e) {
1508
+ console.error('Live model error:', e);
1509
+ setConnStatus('Error: ' + e.message, 'error');
1510
+ setStatus('Error', 'error');
1511
+ alert('ERROR: ' + e.message);
1512
+ addLogEntry({ type: 'submit', label: 'ERROR', reward: 0, text: e.message + '\n' + (e.stack || '') });
1513
+ }
1514
+ if (ws) ws.close();
1515
+ enableButtons();
1516
+ }
1517
+
1518
+ /* Root cause inference from observation text — pattern matching on known signatures */
1519
/* Maps each diagnosable root-cause id to the single fix action that
   resolves it — keys correspond to the `apply_fix` choices accepted
   by the environment. */
var RC_TO_FIX = {
  arch_guard: 'relax_arch_check',
  backend_whitelist: 'add_whitelist_entry',
  runtime_loader: 'fix_runtime_path',
  backend_selector: 'switch_backend',
  model_config: 'update_model_config',
  weight_layout: 'fix_weight_mapping',
  memory_oom: 'tune_memory_config',
  quantization_error: 'fix_quantization',
  distributed_comm: 'fix_comm_config',
  driver_compat: 'update_driver_config'
};
1526
+
1527
/**
 * Heuristic root-cause classifier: scans observation text (case-
 * insensitively) for known failure signatures and returns the id of
 * the first matching rule. Rules are checked in a fixed priority
 * order; 'arch_guard' is returned when nothing matches.
 */
function inferRootCause(text) {
  var lowered = text.toLowerCase();
  var has = function (needle) { return lowered.indexOf(needle) !== -1; };

  // Ordered rule table — first predicate that fires wins.
  var rules = [
    ['arch_guard', function () { return has('arch') && (has('guard') || has('unsupported') || has('architecture check')); }],
    ['backend_whitelist', function () { return has('whitelist') || has('not in supported'); }],
    ['runtime_loader', function () { return has('runtime') && (has('loader') || has('path') || has('dlopen')); }],
    ['backend_selector', function () { return has('backend') && (has('selector') || has('fallback')); }],
    ['model_config', function () { return has('model') && has('config') && (has('mismatch') || has('invalid')); }],
    ['weight_layout', function () { return has('weight') && (has('layout') || has('shape')); }],
    ['memory_oom', function () { return has('oom') || has('out of memory') || (has('memory') && has('exceed')); }],
    ['quantization_error', function () { return has('quantiz') || (has('quant') && has('error')); }],
    ['distributed_comm', function () { return has('nccl') || has('distributed') || has('comm'); }],
    ['driver_compat', function () { return has('driver') && (has('compat') || has('version')); }]
  ];

  for (var i = 0; i < rules.length; i++) {
    if (rules[i][1]()) return rules[i][0];
  }
  return 'arch_guard'; // fallback when no signature matches
}
1541
+
1542
/* On page load, poll the inference server's /model_status endpoint
   and reflect the result in the #modelStatus badge; keeps polling
   every 3s while the model is still loading. */
(function checkModelStatus() {
  var badge = document.getElementById('modelStatus');
  var base = INFERENCE_URL || '';

  // Update badge text and CSS state class in one place.
  function show(text, cls) {
    badge.textContent = text;
    badge.className = 'conn-status ' + cls;
  }

  fetch(base + '/model_status')
    .then(function (r) { return r.json(); })
    .then(function (status) {
      if (status.ready) {
        show('Model: ready', 'connected');
      } else if (status.error) {
        show('Model: error', 'error');
      } else {
        show('Model: loading...', 'running');
        setTimeout(checkModelStatus, 3000); // not ready yet — poll again
      }
    })
    .catch(function () {
      /* Local dev without /model_status endpoint — assume ready */
      show('Model: ready', 'connected');
    });
})();
1564
+ </script>
1565
+ </body>
1566
+ </html>