bledden committed on
Commit
c75f6b6
·
verified ·
1 Parent(s): 1741387

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

WORKDIR /app

# Install CPU-only torch from the PyTorch wheel index first.
# BUGFIX: pip's --index-url applies to the WHOLE `pip install` invocation,
# not just the package it appears next to. In the original single command,
# every dependency (openenv-core, fastapi, mcp, ...) would have been resolved
# against https://download.pytorch.org/whl/cpu, which does not host them,
# breaking the build. Splitting into two RUNs keeps torch on the CPU index
# and everything else on PyPI.
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu

# Remaining dependencies come from the default index (PyPI).
RUN pip install --no-cache-dir \
        openenv-core \
        fastapi \
        uvicorn \
        mcp \
        transformers \
        accelerate \
        sentencepiece

# Pre-download model weights at build time (faster cold start)
RUN python -c "from transformers import AutoModelForCausalLM, AutoTokenizer; \
    AutoTokenizer.from_pretrained('Qwen/Qwen2.5-1.5B-Instruct'); \
    AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-1.5B-Instruct')"

# Copy app code
COPY server/ server/
COPY models.py .
COPY hf_space/serve.py .

# Copy dashboard
COPY dashboard.html static/index.html

EXPOSE 7860

CMD ["python", "serve.py"]
models.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data models for the Stack Doctor Environment.
3
+
4
+ An overseer LLM diagnoses sick inference stacks by probing subsystems,
5
+ reconciling conflicting specialist-agent reports, and selecting the
6
+ minimal correct fix.
7
+ """
8
+
9
+ from pydantic import Field
10
+
11
+ from openenv.core.env_server.types import Action, Observation
12
+
13
+
14
class StackDoctorAction(Action):
    """Agent action — a JSON message selecting one of 4 action types.

    The environment parses ``message`` with ``json.loads`` on every step and
    rejects anything that is not one of the four documented shapes, so the
    schema below (also surfaced to agents via the Field description) is the
    full action contract.
    """

    # Raw JSON action string. The description doubles as agent-facing
    # documentation in the generated schema — do not edit it casually.
    message: str = Field(
        ...,
        description=(
            'JSON action. One of:\n'
            ' {"type":"inspect","target":"logs|config|snippet|metrics"}\n'
            ' {"type":"ask_specialist","specialist":"runtime|dispatch|kernel|loader"}\n'
            ' {"type":"apply_fix","fix":"relax_arch_check|add_whitelist_entry|fix_runtime_path|switch_backend|update_model_config|fix_weight_mapping|tune_memory_config|fix_quantization|fix_comm_config|update_driver_config"}\n'
            ' {"type":"submit","root_cause":"...","fix":"...","justification":"..."}'
        ),
    )
27
+
28
+
29
class StackDoctorObservation(Observation):
    """What the agent sees after each action.

    The static incident context (ticket, hardware, model, backend) is filled
    on reset; per-step fields like ``output`` carry the feedback for the most
    recent action. After a terminal step the environment clears the excerpt,
    snippet, and specialist fields.
    """

    # Natural-language result of the last action (inspect output, specialist
    # reply, fix result, or submission verdict).
    output: str = Field(default="", description="Natural-language feedback")
    incident_ticket: str = Field(default="", description="The incident description")
    hardware: str = Field(default="", description="Hardware identifier")
    model_name: str = Field(default="", description="Model being served")
    backend: str = Field(default="", description="Inference backend in use")
    log_excerpt: str = Field(default="", description="Log snippet")
    code_snippet: str = Field(default="", description="Config or code snippet")
    # Keyed by specialist name; each value is a dict with "opinion" and
    # "confidence" (see the environment's reset()).
    specialist_opinions: dict = Field(default_factory=dict, description="Specialist name -> {opinion, confidence}")
    steps_remaining: int = Field(default=6, description="Steps left in episode")
    fix_used: bool = Field(default=False, description="Whether apply_fix has been used")
serve.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unified server for HF Spaces: environment + inference + dashboard on port 7860."""
2
+
3
+ import json
4
+ import os
5
+ import sys
6
+ import time
7
+ import threading
8
+
9
+ sys.path.insert(0, "/app")
10
+
11
+ from fastapi import FastAPI, Request
12
+ from fastapi.middleware.cors import CORSMiddleware
13
+ from fastapi.responses import FileResponse, JSONResponse
14
+ import uvicorn
15
+
16
+ from server.app import app as env_app
17
+
18
+ env_app.add_middleware(
19
+ CORSMiddleware,
20
+ allow_origins=["*"],
21
+ allow_methods=["*"],
22
+ allow_headers=["*"],
23
+ )
24
+
25
+ # Model state (loaded in background)
26
+ MODEL_STATE = {"model": None, "tokenizer": None, "ready": False, "error": None}
27
+
28
+ UNTRAINED_SYSTEM = (
29
+ "You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\n"
30
+ "You receive an incident ticket with hardware/model/backend context, log excerpts, and specialist opinions.\n"
31
+ "Some specialists may be wrong. Output a JSON array of actions:\n"
32
+ ' {"type":"inspect","target":"logs|config|snippet|metrics"}\n'
33
+ ' {"type":"ask_specialist","specialist":"runtime|dispatch|kernel|loader"}\n'
34
+ ' {"type":"apply_fix","fix":"<fix_name>"}\n'
35
+ ' {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<why>"}'
36
+ )
37
+
38
+ TRAINED_SYSTEM = (
39
+ "You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\n"
40
+ "You are methodical: first inspect logs and config, then query specialists to cross-verify (some lie), then apply a fix and submit.\n\n"
41
+ "Available actions (output as a JSON array):\n"
42
+ ' {"type":"inspect","target":"logs"} or "config" or "snippet" or "metrics"\n'
43
+ ' {"type":"ask_specialist","specialist":"runtime"} or "dispatch" or "kernel" or "loader"\n'
44
+ ' {"type":"apply_fix","fix":"<name>"} -- available fixes: add_whitelist_entry, fix_comm_config, fix_quantization, fix_runtime_path, fix_weight_mapping, relax_arch_check, switch_backend, tune_memory_config, update_driver_config, update_model_config\n'
45
+ ' {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<detailed reasoning>"}\n\n'
46
+ "Available root causes: arch_guard, backend_selector, backend_whitelist, distributed_comm, driver_compat, memory_oom, model_config, quantization_error, runtime_loader, weight_layout\n\n"
47
+ "IMPORTANT: Pick ONE target per inspect, ONE specialist per query. Investigate before submitting. Give a detailed justification.\n\n"
48
+ "Example output:\n"
49
+ '[{"type":"inspect","target":"logs"},{"type":"inspect","target":"config"},{"type":"ask_specialist","specialist":"kernel"},'
50
+ '{"type":"apply_fix","fix":"relax_arch_check"},'
51
+ '{"type":"submit","root_cause":"arch_guard","fix":"relax_arch_check","justification":"Logs show architecture check failure for SM90. Config confirms guard enabled. Kernel specialist confirmed not a kernel issue."}]'
52
+ )
53
+
54
+
55
def load_model_background():
    """Load Qwen 1.5B in a background thread so the server starts fast.

    On success, publishes the model/tokenizer into MODEL_STATE and flips
    ``ready``; on any failure, records the message in MODEL_STATE["error"]
    so /generate can report it instead of crashing the server.
    """
    try:
        print("[Model] Loading Qwen2.5-1.5B-Instruct (CPU)...")
        started = time.time()
        # Imported lazily: transformers/torch are heavy and only needed here.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch

        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map="cpu",
        )

        # Publish model first, then flip the ready flag last.
        MODEL_STATE.update(model=model, tokenizer=tokenizer)
        MODEL_STATE["ready"] = True
        print(f"[Model] Loaded in {time.time()-started:.1f}s")
    except Exception as ex:
        MODEL_STATE["error"] = str(ex)
        print(f"[Model] Failed to load: {ex}")
78
+
79
+
80
+ threading.Thread(target=load_model_background, daemon=True).start()
81
+
82
+
83
+ @env_app.post("/generate")
84
+ async def generate_endpoint(request: Request):
85
+ body = await request.json()
86
+ prompt_text = body.get("prompt", "")
87
+ max_tokens = body.get("max_tokens", 512)
88
+ mode = body.get("mode", "untrained")
89
+
90
+ if not MODEL_STATE["ready"]:
91
+ if MODEL_STATE["error"]:
92
+ return JSONResponse({"error": MODEL_STATE["error"]}, status_code=500)
93
+ return JSONResponse({"error": "Model still loading, please wait..."}, status_code=503)
94
+
95
+ model = MODEL_STATE["model"]
96
+ tokenizer = MODEL_STATE["tokenizer"]
97
+ system = TRAINED_SYSTEM if mode == "trained" else UNTRAINED_SYSTEM
98
+
99
+ messages = [
100
+ {"role": "system", "content": system},
101
+ {"role": "user", "content": prompt_text},
102
+ ]
103
+
104
+ import torch
105
+
106
+ text_input = tokenizer.apply_chat_template(
107
+ messages, tokenize=False, add_generation_prompt=True
108
+ )
109
+ inputs = tokenizer(text_input, return_tensors="pt")
110
+
111
+ t0 = time.time()
112
+ with torch.no_grad():
113
+ outputs = model.generate(
114
+ **inputs,
115
+ max_new_tokens=max_tokens,
116
+ do_sample=True,
117
+ temperature=0.7,
118
+ top_p=0.9,
119
+ pad_token_id=tokenizer.eos_token_id,
120
+ )
121
+
122
+ new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
123
+ text = tokenizer.decode(new_tokens, skip_special_tokens=True)
124
+ gen_time = time.time() - t0
125
+ print(f"[Model] Generated {len(text)} chars in {gen_time:.1f}s (mode={mode})")
126
+ return JSONResponse({"text": text, "gen_time": gen_time})
127
+
128
+
129
+ @env_app.get("/model_status")
130
+ async def model_status():
131
+ return JSONResponse({
132
+ "ready": MODEL_STATE["ready"],
133
+ "error": MODEL_STATE["error"],
134
+ })
135
+
136
+
137
+ @env_app.get("/", include_in_schema=False)
138
+ async def root():
139
+ return FileResponse("/app/static/index.html")
140
+
141
+
142
+ if __name__ == "__main__":
143
+ uvicorn.run(env_app, host="0.0.0.0", port=7860)
server/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Stack Doctor environment server components."""
2
+
3
+ from .stack_doctor_environment import StackDoctorEnvironment
4
+
5
+ __all__ = ["StackDoctorEnvironment"]
6
+
7
+
8
def get_mcp_environment():
    """Lazy import of MCP environment (requires fastapi/uvicorn).

    Returns the StackDoctorMCPEnvironment class (not an instance). Imported
    on demand so that `import server` still works where the MCP/HTTP server
    dependencies are not installed.
    """
    from .stack_doctor_mcp import StackDoctorMCPEnvironment
    return StackDoctorMCPEnvironment
server/app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for the Stack Doctor Environment.
3
+
4
+ Exposes both:
5
+ - WebSocket API (reset/step/state) for RL training
6
+ - MCP API (tools/list, tools/call) for agent interaction
7
+
8
+ Usage:
9
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
10
+ """
11
+
12
+ try:
13
+ from openenv.core.env_server.http_server import create_app
14
+ except Exception as e:
15
+ raise ImportError(
16
+ "openenv is required. Install with: uv sync"
17
+ ) from e
18
+
19
+ from models import StackDoctorAction, StackDoctorObservation
20
+ from .stack_doctor_mcp import StackDoctorMCPEnvironment
21
+
22
# Build the FastAPI app: create_app wires the environment class plus the
# action/observation models into the WebSocket (reset/step/state) and MCP
# endpoints under the "stack_doctor" name.
app = create_app(
    StackDoctorMCPEnvironment,
    StackDoctorAction,
    StackDoctorObservation,
    env_name="stack_doctor",
    max_concurrent_envs=4,  # cap on simultaneously-open environment sessions
)
29
+
30
+
31
+ def main(host: str = "0.0.0.0", port: int = 8000):
32
+ import uvicorn
33
+ uvicorn.run(app, host=host, port=port)
34
+
35
+
36
+ if __name__ == "__main__":
37
+ import argparse
38
+ parser = argparse.ArgumentParser()
39
+ parser.add_argument("--port", type=int, default=8000)
40
+ args = parser.parse_args()
41
+ main(port=args.port)
server/baselines.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Oracle, heuristic, and random baselines for Stack Doctor.
3
+
4
+ Used to validate the reward function: random < heuristic < oracle must hold.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import random
11
+
12
+ from .scenarios import (
13
+ ROOT_CAUSE_TO_FIX,
14
+ ROOT_CAUSES,
15
+ FIXES,
16
+ SPECIALISTS,
17
+ Scenario,
18
+ SCENARIOS,
19
+ TRAIN_SCENARIOS,
20
+ EVAL_SCENARIOS,
21
+ )
22
+
23
+
24
def oracle_policy(scenario: Scenario) -> list[dict]:
    """Perfect policy: submit correct answer in 1 step."""
    submit_action = {
        "type": "submit",
        "root_cause": scenario.root_cause,
        "fix": scenario.correct_fix,
        "justification": f"Root cause is {scenario.root_cause}, applying the correct fix.",
    }
    return [submit_action]
34
+
35
+
36
def heuristic_policy(scenario: Scenario) -> list[dict]:
    """
    Reasonable heuristic: inspect logs, ask the highest-confidence specialist,
    then submit based on clues.

    Uses keyword matching on specialist opinions and logs to guess root cause.
    """
    # Highest-confidence specialist drives both the query and the clue text.
    spec_name, spec_opinion = max(
        scenario.specialist_opinions.items(),
        key=lambda item: item[1].confidence,
    )

    # Combine all available clues and guess a root cause from keywords.
    clue_text = (
        scenario.incident_ticket
        + " " + scenario.initial_log
        + " " + spec_opinion.opinion
    ).lower()
    guess = _keyword_guess(clue_text)
    fix = ROOT_CAUSE_TO_FIX[guess]

    # Fixed 4-step plan: inspect -> consult -> fix -> submit.
    return [
        {"type": "inspect", "target": "logs"},
        {"type": "ask_specialist", "specialist": spec_name},
        {"type": "apply_fix", "fix": fix},
        {
            "type": "submit",
            "root_cause": guess,
            "fix": fix,
            "justification": f"Keyword analysis of logs and specialist opinions points to {guess}.",
        },
    ]
76
+
77
+
78
def random_policy(scenario: Scenario) -> list[dict]:
    """Random policy: random actions, random submit."""
    # NOTE: the sequence of random.* calls matches the original exactly so
    # seeded runs remain reproducible.
    plan: list[dict] = []
    total_steps = random.randint(1, 5)

    # All but the final step are random investigations.
    for _ in range(total_steps - 1):
        kind = random.choice(["inspect", "ask_specialist"])
        if kind == "inspect":
            target = random.choice(["logs", "config", "snippet", "metrics"])
            plan.append({"type": "inspect", "target": target})
        else:
            plan.append({
                "type": "ask_specialist",
                "specialist": random.choice(SPECIALISTS),
            })

    # Final: random submit
    cause = random.choice(ROOT_CAUSES)
    plan.append({
        "type": "submit",
        "root_cause": cause,
        "fix": ROOT_CAUSE_TO_FIX[cause],
    })

    return plan
105
+
106
+
107
# Keyword table: root cause -> substrings whose presence in the text votes
# for that cause. All entries must be lowercase, because the only caller
# (heuristic_policy) lowercases the text before matching.
_ROOT_CAUSE_KEYWORDS = {
    "arch_guard": ("arch", "architecture", "sm_12", "sm_120", "sm_121", "supported_arch", "capability", "is_supported"),
    "backend_whitelist": ("whitelist", "supported_gpu", "not in", "marlin", "awq", "gpu name"),
    # BUGFIX: was "hipError" — against lowercased input that keyword could
    # never match; lowercased to "hiperror".
    "runtime_loader": ("runtime", "libcuda", "ld_library", "cuda_home", "symlink", "shared object", "rocm_path", "hiperror"),
    "backend_selector": ("backend", "selector", "xformers", "flash_attn", "latency", "slow", "e4m3fn", "fp8 format"),
    "model_config": ("config", "num_expert", "shape mismatch", "rope", "checkpoint", "config.json"),
    "weight_layout": ("weight", "mapping", "swap", "gate_proj", "up_proj", "convert", "layout", "qkv"),
    "memory_oom": ("out of memory", "oom", "kv_cache", "memory", "max_model_len", "batch size", "vram"),
    "quantization_error": ("quantiz", "fp8", "int4", "nf4", "calibrat", "precision", "scale factor", "gptq"),
    "distributed_comm": ("nccl", "tensor parallel", "all_reduce", "rdma", "pipeline parallel", "collective", "rank"),
    "driver_compat": ("driver", "cudnn", "toolkit", "nvcc", "cuda version", "driver version", "libcudnn"),
}


def _keyword_guess(text: str) -> str:
    """Guess root cause from keyword presence in text.

    Each root cause scores one point per keyword substring found in *text*
    (expected to be lowercased by the caller); the highest-scoring cause wins.
    Ties resolve to the earliest entry in ROOT_CAUSES, exactly as the original
    per-cause scoring blocks did.
    """
    # Initialize from ROOT_CAUSES so the tie-break order is canonical.
    scores = {rc: 0 for rc in ROOT_CAUSES}
    for root_cause, keywords in _ROOT_CAUSE_KEYWORDS.items():
        scores[root_cause] += sum(1 for kw in keywords if kw in text)
    return max(scores, key=scores.get)
162
+
163
+
164
def evaluate_policy(policy_fn, scenarios: list[Scenario], n_runs: int = 1) -> dict:
    """
    Run a policy across scenarios and compute metrics.

    policy_fn maps a Scenario to a list of action dicts; each scenario is
    replayed n_runs times in a fresh environment.

    Returns dict with:
    - rc_accuracy: fraction of correct root cause submissions
    - fix_accuracy: fraction of correct fix submissions
    - avg_steps: average steps to resolution
    - avg_reward: average cumulative reward
    """
    # Local imports — presumably to avoid a circular import with the
    # environment module at package load time; TODO confirm.
    from .stack_doctor_environment import StackDoctorEnvironment
    from models import StackDoctorAction

    total_rc_correct = 0
    total_fix_correct = 0
    total_steps = 0
    total_reward = 0.0
    total_episodes = 0

    for _ in range(n_runs):
        for scenario in scenarios:
            env = StackDoctorEnvironment()
            env.reset(scenario_id=scenario.id)

            actions = policy_fn(scenario)
            cumulative = 0.0
            steps = 0

            # Feed the scripted actions until the environment says done.
            for action_dict in actions:
                obs = env.step(StackDoctorAction(message=json.dumps(action_dict)))
                cumulative += obs.reward
                steps += 1
                if obs.done:
                    break

            # Check if submit happened
            # NOTE(review): this inspects the policy's INTENDED final action,
            # not what actually executed — if the episode terminated before the
            # last action was reached, accuracy is still scored from it.
            last_action = actions[-1] if actions else {}
            if last_action.get("type") == "submit":
                if last_action["root_cause"] == scenario.root_cause:
                    total_rc_correct += 1
                if last_action["fix"] == scenario.correct_fix:
                    total_fix_correct += 1

            total_steps += steps
            total_reward += cumulative
            total_episodes += 1

    # Guard against an empty scenario list to avoid ZeroDivisionError.
    return {
        "rc_accuracy": total_rc_correct / total_episodes if total_episodes else 0,
        "fix_accuracy": total_fix_correct / total_episodes if total_episodes else 0,
        "avg_steps": total_steps / total_episodes if total_episodes else 0,
        "avg_reward": total_reward / total_episodes if total_episodes else 0,
        "n_episodes": total_episodes,
    }
server/scenarios.py ADDED
The diff for this file is too large to render. See raw diff
 
server/stack_doctor_environment.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stack Doctor Environment.
3
+
4
+ An overseer LLM diagnoses sick inference stacks by probing subsystems,
5
+ reconciling conflicting specialist-agent reports, and selecting the
6
+ minimal correct fix.
7
+
8
+ Inspired by real SM12x enablement bugs across vLLM, FlashInfer, SGLang,
9
+ CUTLASS, and Flash-Attention.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from uuid import uuid4
16
+
17
+ from openenv.core.env_server.interfaces import Environment
18
+ from openenv.core.env_server.types import State
19
+
20
+ from models import StackDoctorAction, StackDoctorObservation
21
+ from .scenarios import (
22
+ ROOT_CAUSE_TO_FIX,
23
+ FIX_TO_ROOT_CAUSE,
24
+ ROOT_CAUSES,
25
+ FIXES,
26
+ SPECIALISTS,
27
+ Scenario,
28
+ SpecialistOpinion,
29
+ get_scenario,
30
+ randomize_specialist_opinions,
31
+ )
32
+
33
MAX_STEPS = 6  # hard cap on agent actions per episode

# Valid argument vocabularies; sets give O(1) validation in the handlers.
INSPECT_TARGETS = {"logs", "config", "snippet", "metrics"}
VALID_FIXES = set(FIXES)
VALID_ROOT_CAUSES = set(ROOT_CAUSES)
38
+
39
+
40
class EpisodeState:
    """Internal mutable episode state (not exposed to agent)."""

    def __init__(
        self,
        scenario: Scenario,
        specialist_opinions: dict[str, SpecialistOpinion] | None = None,
    ):
        self.scenario = scenario
        # Prefer the per-episode randomized opinions; any falsy value
        # (None or empty dict) falls back to the scenario defaults.
        if specialist_opinions:
            self.specialist_opinions = specialist_opinions
        else:
            self.specialist_opinions = scenario.specialist_opinions
        self.step_count = 0
        self.done = False
        self.fix_applied = False
        self.fix_was_correct: bool | None = None
        self.cumulative_reward = 0.0
        self.actions_taken: list[dict] = []
57
+
58
+
59
class StackDoctorEnvironment(Environment):
    """
    Stack Doctor: incident-response RL environment for
    inference-stack diagnosis.

    Episode shape: reset() picks a scenario and returns the incident briefing;
    the agent then has MAX_STEPS (6) JSON actions — inspect / ask_specialist
    (-0.25 each), one apply_fix (+3 correct / -2 wrong), and a terminal submit
    (+8/-4 per component, +2 efficiency bonus at <= 4 steps, +1 for a
    justification of >= 10 characters). Running out of steps without
    submitting costs -4.
    """

    # Each session holds its own instance, so concurrent envs are safe.
    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        # Public protocol state (episode id + step counter) and the private
        # per-episode bookkeeping; no episode until reset() is called.
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._episode: EpisodeState | None = None

    def reset(self, seed=None, episode_id=None, **kwargs) -> StackDoctorObservation:
        # kwargs: scenario_id picks a specific scenario, split selects the
        # train/eval pool (default "train"). `seed` is accepted for protocol
        # compatibility but unused here — TODO confirm intended.
        scenario_id = kwargs.get("scenario_id")
        split = kwargs.get("split", "train")
        scenario = get_scenario(scenario_id, split=split)

        self._state = State(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
        )
        # Specialist opinions are re-randomized every episode so the agent
        # cannot memorize which specialist is reliable for a scenario.
        randomized_opinions = randomize_specialist_opinions(scenario)
        self._episode = EpisodeState(scenario, specialist_opinions=randomized_opinions)

        # Flatten opinion objects into plain dicts for the observation.
        specialist_obs = {}
        for name, op in randomized_opinions.items():
            specialist_obs[name] = {
                "opinion": op.opinion,
                "confidence": op.confidence,
            }

        return StackDoctorObservation(
            output=(
                "STACK DOCTOR — New incident assigned.\n"
                "Diagnose the root cause, optionally apply a fix, then submit your diagnosis.\n"
                "You have 6 steps. Use them wisely.\n\n"
                "Available actions (send as JSON):\n"
                ' {"type":"inspect","target":"logs|config|snippet|metrics"}\n'
                ' {"type":"ask_specialist","specialist":"runtime|dispatch|kernel|loader"}\n'
                ' {"type":"apply_fix","fix":"relax_arch_check|add_whitelist_entry|fix_runtime_path|switch_backend|update_model_config|fix_weight_mapping|tune_memory_config|fix_quantization|fix_comm_config|update_driver_config"}\n'
                ' {"type":"submit","root_cause":"...","fix":"...","justification":"reason for diagnosis"}\n'
            ),
            incident_ticket=scenario.incident_ticket,
            hardware=scenario.hardware,
            model_name=scenario.model_name,
            backend=scenario.backend,
            log_excerpt=scenario.initial_log,
            code_snippet=scenario.initial_snippet,
            specialist_opinions=specialist_obs,
            steps_remaining=MAX_STEPS,
            fix_used=False,
            done=False,
            reward=0.0,
        )

    def step(self, action: StackDoctorAction, **kwargs) -> StackDoctorObservation:
        """Parse the JSON action message and dispatch to the typed handler."""
        ep = self._episode
        # Stepping a finished (or never-started) episode is a no-op terminal.
        if ep is None or ep.done:
            return self._terminal_obs("Episode is over. Call reset() to start a new incident.", 0.0)

        # Step is counted BEFORE validation: malformed actions still burn a step.
        self._state.step_count += 1
        ep.step_count += 1

        try:
            parsed = json.loads(action.message)
        except (json.JSONDecodeError, TypeError):
            # NOTE(review): if message were not a str the slice below would
            # itself raise; assumed pydantic guarantees str upstream.
            return self._handle_invalid(ep, f"Invalid JSON: {action.message[:200]}")

        action_type = parsed.get("type")

        if action_type == "inspect":
            return self._handle_inspect(ep, parsed)
        elif action_type == "ask_specialist":
            return self._handle_ask_specialist(ep, parsed)
        elif action_type == "apply_fix":
            return self._handle_apply_fix(ep, parsed)
        elif action_type == "submit":
            return self._handle_submit(ep, parsed)
        else:
            return self._handle_invalid(ep, f"Unknown action type: {action_type}")

    @property
    def state(self) -> State:
        # Protocol accessor for the server layer.
        return self._state

    def _handle_inspect(self, ep: EpisodeState, parsed: dict) -> StackDoctorObservation:
        """Return the scenario's canned artifact for the requested target (-0.25)."""
        target = parsed.get("target")
        if target not in INSPECT_TARGETS:
            return self._handle_invalid(ep, f"Invalid inspect target: {target}. Use: {INSPECT_TARGETS}")

        reward = -0.25  # small information-gathering cost
        ep.cumulative_reward += reward
        ep.actions_taken.append({"type": "inspect", "target": target})

        ir = ep.scenario.inspect_results
        result_map = {"logs": ir.logs, "config": ir.config, "snippet": ir.snippet, "metrics": ir.metrics}

        return self._step_obs(ep, output=f"[INSPECT {target.upper()}]\n{result_map[target]}", reward=reward)

    def _handle_ask_specialist(self, ep: EpisodeState, parsed: dict) -> StackDoctorObservation:
        """Return the named specialist's follow-up statement (-0.25)."""
        specialist = parsed.get("specialist")
        if specialist not in SPECIALISTS:
            return self._handle_invalid(ep, f"Invalid specialist: {specialist}. Use: {SPECIALISTS}")

        reward = -0.25
        ep.cumulative_reward += reward
        ep.actions_taken.append({"type": "ask_specialist", "specialist": specialist})

        # Scenarios may omit a follow-up for some specialists; default text then.
        followup = ep.scenario.specialist_followups.get(specialist, "No additional information.")
        return self._step_obs(ep, output=f"[SPECIALIST: {specialist.upper()}]\n{followup}", reward=reward)

    def _handle_apply_fix(self, ep: EpisodeState, parsed: dict) -> StackDoctorObservation:
        """Apply the one allowed fix per episode (+3 correct / -2 wrong)."""
        # Only one fix per episode; a second attempt is an invalid action.
        if ep.fix_applied:
            return self._handle_invalid(ep, "apply_fix already used this episode. You can only apply one fix.")

        fix = parsed.get("fix")
        if fix not in VALID_FIXES:
            return self._handle_invalid(ep, f"Invalid fix: {fix}. Use one of: {sorted(VALID_FIXES)}")

        ep.fix_applied = True
        is_correct = fix == ep.scenario.correct_fix
        ep.fix_was_correct = is_correct

        reward = 3.0 if is_correct else -2.0
        ep.cumulative_reward += reward
        ep.actions_taken.append({"type": "apply_fix", "fix": fix, "correct": is_correct})

        # Feedback leaks whether the fix worked — the agent can use this
        # signal before submitting.
        if is_correct:
            output = f"[FIX APPLIED: {fix}] Fix applied successfully. Systems recovering. Now submit your diagnosis."
        else:
            output = f"[FIX APPLIED: {fix}] Fix applied but the issue persists. Consider your diagnosis carefully."

        return self._step_obs(ep, output=output, reward=reward)

    def _handle_submit(self, ep: EpisodeState, parsed: dict) -> StackDoctorObservation:
        """Score the final diagnosis and terminate the episode."""
        root_cause = parsed.get("root_cause")
        fix = parsed.get("fix")
        justification = parsed.get("justification", "")

        # Vocabulary validation happens BEFORE the episode ends, so a typo'd
        # submit costs a step (-2) but does not terminate.
        if root_cause not in VALID_ROOT_CAUSES:
            return self._handle_invalid(ep, f"Invalid root_cause: {root_cause}. Use one of: {sorted(VALID_ROOT_CAUSES)}")
        if fix not in VALID_FIXES:
            return self._handle_invalid(ep, f"Invalid fix: {fix}. Use one of: {sorted(VALID_FIXES)}")

        ep.done = True
        correct_rc = ep.scenario.root_cause
        correct_fix = ep.scenario.correct_fix
        rc_correct = root_cause == correct_rc
        fix_correct = fix == correct_fix
        # Minimal effort bar for the justification bonus: 10 non-space chars.
        has_justification = len(justification.strip()) >= 10

        # Reward: +-8/4 per component, +2 efficiency, +1 justification.
        reward = 0.0
        reward += 8.0 if rc_correct else -4.0
        reward += 8.0 if fix_correct else -4.0
        if (rc_correct and fix_correct) and ep.step_count <= 4:
            reward += 2.0
        if has_justification:
            reward += 1.0

        ep.cumulative_reward += reward
        ep.actions_taken.append({
            "type": "submit", "root_cause": root_cause, "fix": fix,
            "justification": justification,
            "rc_correct": rc_correct, "fix_correct": fix_correct,
            "has_justification": has_justification,
        })

        # Human-readable scorecard for the terminal observation.
        output_lines = ["[DIAGNOSIS SUBMITTED]"]
        output_lines.append(f" Root cause: {root_cause} — {'CORRECT' if rc_correct else 'WRONG (was: ' + correct_rc + ')'}")
        output_lines.append(f" Fix: {fix} — {'CORRECT' if fix_correct else 'WRONG (was: ' + correct_fix + ')'}")
        if has_justification:
            output_lines.append(f" Justification: {justification.strip()}")
            output_lines.append(" JUSTIFICATION BONUS: +1")
        else:
            output_lines.append(" No justification provided (missed +1 bonus)")
        output_lines.append(f" Steps used: {ep.step_count}/{MAX_STEPS}")
        if rc_correct and fix_correct and ep.step_count <= 4:
            output_lines.append(" EFFICIENCY BONUS: +2 (solved in <= 4 steps)")
        output_lines.append(f" Episode reward: {ep.cumulative_reward:.2f}")

        return self._terminal_obs("\n".join(output_lines), reward)

    def _handle_invalid(self, ep: EpisodeState, msg: str) -> StackDoctorObservation:
        """Penalize a malformed/illegal action (-2); ends episode at step cap."""
        reward = -2.0
        ep.cumulative_reward += reward
        ep.actions_taken.append({"type": "invalid", "message": msg})

        # An invalid action on the final step ends the episode immediately.
        if ep.step_count >= MAX_STEPS:
            ep.done = True
            return self._terminal_obs(f"[INVALID ACTION] {msg}\n[EPISODE OVER] Max steps reached. Auto-fail.", reward)

        return self._step_obs(ep, output=f"[INVALID ACTION] {msg}", reward=reward)

    def _step_obs(self, ep: EpisodeState, output: str, reward: float) -> StackDoctorObservation:
        """Build a mid-episode observation; applies the -4 timeout auto-fail."""
        remaining = MAX_STEPS - ep.step_count
        # Reaching the cap without submitting terminates with an extra -4,
        # folded into both this step's reward and the cumulative total.
        if remaining <= 0 and not ep.done:
            ep.done = True
            timeout_penalty = -4.0
            reward += timeout_penalty
            ep.cumulative_reward += timeout_penalty
            output += "\n\n[EPISODE OVER] Max steps reached without submission. Auto-fail. Reward: -4"

        return StackDoctorObservation(
            output=output, incident_ticket=ep.scenario.incident_ticket,
            hardware=ep.scenario.hardware, model_name=ep.scenario.model_name,
            backend=ep.scenario.backend, log_excerpt="", code_snippet="",
            specialist_opinions={}, steps_remaining=remaining, fix_used=ep.fix_applied,
            done=ep.done, reward=reward,
            metadata={"cumulative_reward": ep.cumulative_reward, "step": ep.step_count, "scenario_id": ep.scenario.id},
        )

    def _terminal_obs(self, output: str, reward: float) -> StackDoctorObservation:
        """Build a terminal observation; tolerates a missing episode (pre-reset)."""
        ep = self._episode
        return StackDoctorObservation(
            output=output, incident_ticket=ep.scenario.incident_ticket if ep else "",
            hardware=ep.scenario.hardware if ep else "", model_name=ep.scenario.model_name if ep else "",
            backend=ep.scenario.backend if ep else "", log_excerpt="", code_snippet="",
            specialist_opinions={}, steps_remaining=0, fix_used=ep.fix_applied if ep else False,
            done=True, reward=reward,
            metadata={"cumulative_reward": ep.cumulative_reward if ep else 0.0, "step": ep.step_count if ep else 0, "scenario_id": ep.scenario.id if ep else ""},
        )
server/stack_doctor_mcp.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stack Doctor MCP Environment.
3
+
4
+ Wraps the core Stack Doctor environment with MCP tools that agents
5
+ can discover and invoke. This is the agent-facing interface —
6
+ agents call tools like read_log(), query_specialist(), submit_diagnosis()
7
+ instead of constructing JSON action strings.
8
+
9
+ The training (WebSocket) API still works through _step_impl().
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from typing import Any, Optional
16
+ from uuid import uuid4
17
+
18
+ from mcp.server.fastmcp import FastMCP
19
+ from openenv.core.env_server.mcp_environment import MCPEnvironment
20
+ from openenv.core.env_server.types import Action, Observation, State
21
+
22
+ from models import StackDoctorAction, StackDoctorObservation
23
+ from .scenarios import (
24
+ ROOT_CAUSE_TO_FIX,
25
+ FIX_TO_ROOT_CAUSE,
26
+ ROOT_CAUSES,
27
+ FIXES,
28
+ SPECIALISTS,
29
+ Scenario,
30
+ get_scenario,
31
+ )
32
+
33
# Episode budget and the legal action vocabularies, derived once from the
# scenario catalogue so membership checks are O(1).
MAX_STEPS = 6
VALID_FIXES = {fix for fix in FIXES}
VALID_ROOT_CAUSES = {cause for cause in ROOT_CAUSES}
36
+
37
+
38
class StackDoctorMCPEnvironment(MCPEnvironment):
    """
    Stack Doctor with MCP tool interface for agent interaction.

    Agents discover available tools (read_log, check_config, view_code,
    run_diagnostic, query_specialist, apply_fix, submit_diagnosis) and
    call them to investigate incidents and submit diagnoses.

    The same handlers also back the training (WebSocket) path via
    ``_step_impl``, which parses JSON action strings and dispatches to them.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        mcp = FastMCP("stack_doctor")
        # Per-episode state; reset() re-initializes all of these.
        self._state_obj = State(episode_id=str(uuid4()), step_count=0)
        self._scenario: Scenario | None = None
        self._step_count = 0
        self._fix_applied = False
        self._fix_was_correct: bool | None = None
        self._done = False
        self._cumulative_reward = 0.0
        self._actions_taken: list[dict] = []

        env = self  # capture for closures

        @mcp.tool()
        def read_log() -> str:
            """Read system and application logs for the current incident.
            Returns log output from the affected inference stack including
            error messages, warnings, and system state information.
            Costs 1 step (-0.25 reward)."""
            return env._do_inspect("logs")

        @mcp.tool()
        def check_config() -> str:
            """Check configuration files for the current incident.
            Returns relevant configuration parameters including GPU settings,
            backend configuration, model parameters, and environment variables.
            Costs 1 step (-0.25 reward)."""
            return env._do_inspect("config")

        @mcp.tool()
        def view_code() -> str:
            """View relevant source code snippets for the current incident.
            Returns code from the affected component showing the likely
            location of the bug or misconfiguration.
            Costs 1 step (-0.25 reward)."""
            return env._do_inspect("snippet")

        @mcp.tool()
        def run_diagnostic() -> str:
            """Run performance diagnostics and metrics collection.
            Returns metrics like latency, throughput, GPU utilization,
            error rates, and memory usage for the affected system.
            Costs 1 step (-0.25 reward)."""
            return env._do_inspect("metrics")

        @mcp.tool()
        def query_specialist(specialist: str) -> str:
            """Ask a specialist for their analysis of the incident.
            Specialists: 'runtime', 'dispatch', 'kernel', 'loader'.
            WARNING: At least one specialist gives wrong advice per incident.
            Cross-verify specialist opinions before trusting them.
            Costs 1 step (-0.25 reward)."""
            return env._do_ask_specialist(specialist)

        @mcp.tool()
        def apply_fix(fix: str) -> str:
            """Apply a fix to the system. Can only be used ONCE per incident.
            Available fixes: 'relax_arch_check', 'add_whitelist_entry',
            'fix_runtime_path', 'switch_backend', 'update_model_config',
            'fix_weight_mapping', 'tune_memory_config', 'fix_quantization',
            'fix_comm_config', 'update_driver_config'.
            Correct fix: +3 reward. Wrong fix: -2 reward."""
            return env._do_apply_fix(fix)

        @mcp.tool()
        def submit_diagnosis(root_cause: str, fix: str, justification: str = "") -> str:
            """Submit your final diagnosis. This ends the episode.
            Root causes: 'arch_guard', 'backend_whitelist', 'runtime_loader',
            'backend_selector', 'model_config', 'weight_layout',
            'memory_oom', 'quantization_error', 'distributed_comm', 'driver_compat'.
            Fixes: 'relax_arch_check', 'add_whitelist_entry', 'fix_runtime_path',
            'switch_backend', 'update_model_config', 'fix_weight_mapping',
            'tune_memory_config', 'fix_quantization', 'fix_comm_config', 'update_driver_config'.
            justification: A short sentence explaining WHY you chose this root cause
            and fix based on the evidence you gathered. Bonus +1 if provided.
            Correct root_cause: +8. Wrong: -4. Correct fix: +8. Wrong: -4.
            Bonus +2 if solved in 4 or fewer steps. Bonus +1 for justification."""
            return env._do_submit(root_cause, fix, justification)

        super().__init__(mcp)

    # ------------------------------------------------------------------
    # MCP tool implementations
    # ------------------------------------------------------------------

    def _check_episode(self) -> str | None:
        """Return an error message if the episode is not active, else None.

        Side effect: marks the episode done when the step budget is exhausted.
        """
        if self._scenario is None:
            return "No active incident. Call reset() first."
        if self._done:
            return "Episode is over. Call reset() to start a new incident."
        if self._step_count >= MAX_STEPS:
            self._done = True
            return "Max steps reached. Episode over."
        return None

    def _record_step(self, reward: float, action: dict) -> None:
        """Advance the step counter and accumulate *reward* for *action*."""
        self._step_count += 1
        self._state_obj.step_count = self._step_count
        self._cumulative_reward += reward
        self._actions_taken.append(action)

    def _do_inspect(self, target: str) -> str:
        """Return the inspect result for *target* ('logs'|'config'|'snippet'|'metrics').

        Unknown targets are penalized like other invalid actions instead of
        raising KeyError (reachable via _step_impl with arbitrary JSON).
        """
        err = self._check_episode()
        if err:
            return err

        ir = self._scenario.inspect_results
        result_map = {
            "logs": ir.logs,
            "config": ir.config,
            "snippet": ir.snippet,
            "metrics": ir.metrics,
        }

        # Validate BEFORE recording the step so a bad target costs -2.0
        # (consistent with other invalid actions) rather than -0.25 + a crash.
        if target not in result_map:
            self._record_step(-2.0, {"type": "invalid", "message": f"Unknown inspect target: {target}"})
            return f"Invalid inspect target '{target}'. Available: {sorted(result_map)}. Penalty: -2.0"

        self._record_step(-0.25, {"type": "inspect", "target": target})

        remaining = MAX_STEPS - self._step_count
        return (
            f"[INSPECT {target.upper()}]\n"
            f"{result_map[target]}\n\n"
            f"[Steps remaining: {remaining} | Reward: -0.25 | Cumulative: {self._cumulative_reward:.2f}]"
        )

    def _do_ask_specialist(self, specialist: str) -> str:
        """Return the follow-up opinion of *specialist*, or a penalty message."""
        err = self._check_episode()
        if err:
            return err

        if specialist not in SPECIALISTS:
            self._record_step(-2.0, {"type": "invalid", "message": f"Unknown specialist: {specialist}"})
            return f"Invalid specialist '{specialist}'. Available: {SPECIALISTS}. Penalty: -2.0"

        followup = self._scenario.specialist_followups.get(specialist, "No additional information.")
        self._record_step(-0.25, {"type": "ask_specialist", "specialist": specialist})

        remaining = MAX_STEPS - self._step_count
        return (
            f"[SPECIALIST: {specialist.upper()}]\n"
            f"{followup}\n\n"
            f"[Steps remaining: {remaining} | Reward: -0.25 | Cumulative: {self._cumulative_reward:.2f}]"
        )

    def _do_apply_fix(self, fix: str) -> str:
        """Apply *fix* (once per episode) and report whether it resolved the issue."""
        err = self._check_episode()
        if err:
            return err

        if self._fix_applied:
            self._record_step(-2.0, {"type": "invalid", "message": "Fix already applied"})
            return "You already applied a fix this episode. Only one fix allowed. Penalty: -2.0"

        if fix not in VALID_FIXES:
            self._record_step(-2.0, {"type": "invalid", "message": f"Invalid fix: {fix}"})
            return f"Invalid fix '{fix}'. Available: {sorted(VALID_FIXES)}. Penalty: -2.0"

        self._fix_applied = True
        is_correct = fix == self._scenario.correct_fix
        self._fix_was_correct = is_correct
        reward = 3.0 if is_correct else -2.0
        self._record_step(reward, {"type": "apply_fix", "fix": fix, "correct": is_correct})

        remaining = MAX_STEPS - self._step_count
        if is_correct:
            return (
                f"[FIX APPLIED: {fix}] Fix applied successfully. Systems recovering.\n"
                f"Now submit your diagnosis with submit_diagnosis().\n\n"
                f"[Steps remaining: {remaining} | Reward: +3.0 | Cumulative: {self._cumulative_reward:.2f}]"
            )
        else:
            return (
                f"[FIX APPLIED: {fix}] Fix applied but the issue persists.\n"
                f"Consider your diagnosis carefully.\n\n"
                f"[Steps remaining: {remaining} | Reward: -2.0 | Cumulative: {self._cumulative_reward:.2f}]"
            )

    def _do_submit(self, root_cause: str, fix: str, justification: str = "") -> str:
        """Score the final diagnosis and end the episode.

        Scoring: root cause +8/-4, fix +8/-4, +2 efficiency bonus when both
        are correct within 4 steps, +1 for a non-trivial justification.
        """
        err = self._check_episode()
        if err:
            return err

        if root_cause not in VALID_ROOT_CAUSES:
            self._record_step(-2.0, {"type": "invalid", "message": f"Invalid root_cause: {root_cause}"})
            return f"Invalid root_cause '{root_cause}'. Available: {sorted(VALID_ROOT_CAUSES)}. Penalty: -2.0"

        if fix not in VALID_FIXES:
            self._record_step(-2.0, {"type": "invalid", "message": f"Invalid fix: {fix}"})
            return f"Invalid fix '{fix}'. Available: {sorted(VALID_FIXES)}. Penalty: -2.0"

        self._done = True
        rc_correct = root_cause == self._scenario.root_cause
        fix_correct = fix == self._scenario.correct_fix
        has_justification = len(justification.strip()) >= 10

        reward = 0.0
        reward += 8.0 if rc_correct else -4.0
        reward += 8.0 if fix_correct else -4.0
        # _record_step has not run yet, so "+ 1" accounts for this step.
        if rc_correct and fix_correct and self._step_count + 1 <= 4:
            reward += 2.0
        if has_justification:
            reward += 1.0

        self._record_step(reward, {
            "type": "submit", "root_cause": root_cause, "fix": fix,
            "justification": justification,
            "rc_correct": rc_correct, "fix_correct": fix_correct,
            "has_justification": has_justification,
        })

        lines = ["[DIAGNOSIS SUBMITTED]"]
        lines.append(f"  Root cause: {root_cause} — {'CORRECT' if rc_correct else 'WRONG (was: ' + self._scenario.root_cause + ')'}")
        lines.append(f"  Fix: {fix} — {'CORRECT' if fix_correct else 'WRONG (was: ' + self._scenario.correct_fix + ')'}")
        if has_justification:
            lines.append(f"  Justification: {justification.strip()}")
            lines.append("  JUSTIFICATION BONUS: +1")
        else:
            lines.append("  No justification provided (missed +1 bonus)")
        lines.append(f"  Steps used: {self._step_count}/{MAX_STEPS}")
        # Step counter was incremented by _record_step above, so plain <= 4
        # here matches the "+ 1 <= 4" condition used for the reward.
        if rc_correct and fix_correct and self._step_count <= 4:
            lines.append("  EFFICIENCY BONUS: +2 (solved in <= 4 steps)")
        lines.append(f"  Episode reward: {self._cumulative_reward:.2f}")

        return "\n".join(lines)

    # ------------------------------------------------------------------
    # OpenEnv Environment interface (for training / WebSocket API)
    # ------------------------------------------------------------------

    def reset(self, seed=None, episode_id=None, **kwargs) -> StackDoctorObservation:
        """Start a new episode; ``scenario_id`` and ``split`` may be passed via kwargs."""
        scenario_id = kwargs.get("scenario_id")
        split = kwargs.get("split", "train")
        self._scenario = get_scenario(scenario_id, split=split)

        self._state_obj = State(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
        )
        self._step_count = 0
        self._fix_applied = False
        self._fix_was_correct = None
        self._done = False
        self._cumulative_reward = 0.0
        self._actions_taken = []

        specialist_obs = {}
        for name, op in self._scenario.specialist_opinions.items():
            specialist_obs[name] = {
                "opinion": op.opinion,
                "confidence": op.confidence,
            }

        return StackDoctorObservation(
            output=(
                "STACK DOCTOR — New incident assigned.\n"
                "Investigate using the available tools: read_log(), check_config(), "
                "view_code(), run_diagnostic(), query_specialist(name).\n"
                "When ready, apply_fix(fix) and/or submit_diagnosis(root_cause, fix).\n"
                "You have 6 steps. At least one specialist is WRONG — cross-verify.\n"
            ),
            incident_ticket=self._scenario.incident_ticket,
            hardware=self._scenario.hardware,
            model_name=self._scenario.model_name,
            backend=self._scenario.backend,
            log_excerpt=self._scenario.initial_log,
            code_snippet=self._scenario.initial_snippet,
            specialist_opinions=specialist_obs,
            steps_remaining=MAX_STEPS,
            fix_used=False,
            done=False,
            reward=0.0,
        )

    def _step_impl(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> Observation:
        """Handle non-MCP actions (JSON action strings for training)."""
        if not isinstance(action, StackDoctorAction):
            return self._make_obs("Invalid action type.", -2.0)

        try:
            parsed = json.loads(action.message)
        except (json.JSONDecodeError, TypeError):
            return self._make_obs(f"Invalid JSON: {action.message[:200]}", -2.0)

        action_type = parsed.get("type")

        # Derive this step's reward as the delta of the cumulative reward
        # around the dispatch. This keeps a single source of truth for the
        # reward rules and correctly reports 0 when a handler returns an
        # error without recording a step (re-deriving from the last recorded
        # action mis-attributed a stale reward in that case).
        reward_before = self._cumulative_reward

        if action_type == "inspect":
            result = self._do_inspect(parsed.get("target", "logs"))
        elif action_type == "ask_specialist":
            result = self._do_ask_specialist(parsed.get("specialist", ""))
        elif action_type == "apply_fix":
            result = self._do_apply_fix(parsed.get("fix", ""))
        elif action_type == "submit":
            result = self._do_submit(parsed.get("root_cause", ""), parsed.get("fix", ""), parsed.get("justification", ""))
        else:
            self._record_step(-2.0, {"type": "invalid", "message": f"Unknown: {action_type}"})
            result = f"Unknown action type: {action_type}. Penalty: -2.0"

        return self._make_obs(result, self._cumulative_reward - reward_before)

    def _make_obs(self, output: str, reward: float) -> StackDoctorObservation:
        """Build an observation reflecting the current episode state."""
        remaining = MAX_STEPS - self._step_count
        return StackDoctorObservation(
            output=output,
            incident_ticket=self._scenario.incident_ticket if self._scenario else "",
            hardware=self._scenario.hardware if self._scenario else "",
            model_name=self._scenario.model_name if self._scenario else "",
            backend=self._scenario.backend if self._scenario else "",
            log_excerpt="",
            code_snippet="",
            specialist_opinions={},
            steps_remaining=remaining,
            fix_used=self._fix_applied,
            done=self._done,
            reward=reward,
            metadata={
                "cumulative_reward": self._cumulative_reward,
                "step": self._step_count,
                "scenario_id": self._scenario.id if self._scenario else "",
            },
        )

    @property
    def state(self) -> State:
        return self._state_obj
static/index.html ADDED
@@ -0,0 +1,1566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Stack Doctor — Incident War Room</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com">
8
+ <link href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@300;400;500;600&family=Outfit:wght@300;400;500;600;700&display=swap" rel="stylesheet">
9
+ <style>
10
+ *, *::before, *::after { margin: 0; padding: 0; box-sizing: border-box; }
11
+
12
+ :root {
13
+ --bg-abyss: #060a11;
14
+ --bg-deep: #0a0f1a;
15
+ --bg-mid: #0f1623;
16
+ --bg-surface: #151d2e;
17
+ --bg-elevated: #1a2438;
18
+ --border-subtle: rgba(100, 180, 255, 0.08);
19
+ --border-active: rgba(0, 196, 255, 0.25);
20
+ --cyan: #00c4ff;
21
+ --cyan-bright: #40d4ff;
22
+ --cyan-dim: rgba(0, 196, 255, 0.15);
23
+ --cyan-glow: rgba(0, 196, 255, 0.4);
24
+ --amber: #f0a030;
25
+ --amber-dim: rgba(240, 160, 48, 0.15);
26
+ --amber-glow: rgba(240, 160, 48, 0.4);
27
+ --emerald: #00e676;
28
+ --emerald-dim: rgba(0, 230, 118, 0.12);
29
+ --emerald-glow: rgba(0, 230, 118, 0.35);
30
+ --coral: #ff3d5a;
31
+ --coral-dim: rgba(255, 61, 90, 0.12);
32
+ --text-primary: #d8e0ec;
33
+ --text-secondary: rgba(216, 224, 236, 0.55);
34
+ --text-tertiary: rgba(216, 224, 236, 0.3);
35
+ --font-display: 'Outfit', system-ui, sans-serif;
36
+ --font-mono: 'IBM Plex Mono', 'SF Mono', monospace;
37
+ --ease-out-expo: cubic-bezier(0.16, 1, 0.3, 1);
38
+ --duration-slow: 800ms;
39
+ --duration-med: 400ms;
40
+ --duration-fast: 200ms;
41
+ }
42
+
43
+ html { height: 100%; }
44
+ body {
45
+ min-height: 100%;
46
+ background: var(--bg-abyss);
47
+ color: var(--text-primary);
48
+ font-family: var(--font-display);
49
+ -webkit-font-smoothing: antialiased;
50
+ overflow-y: auto;
51
+ overflow-x: hidden;
52
+ }
53
+
54
+ body::before {
55
+ content: '';
56
+ position: fixed; inset: 0;
57
+ background:
58
+ radial-gradient(ellipse 80% 60% at 20% 80%, rgba(0, 100, 180, 0.06) 0%, transparent 70%),
59
+ radial-gradient(ellipse 60% 50% at 80% 20%, rgba(0, 160, 255, 0.04) 0%, transparent 60%);
60
+ pointer-events: none; z-index: 0;
61
+ }
62
+
63
+ .grid-overlay {
64
+ position: fixed; inset: 0; z-index: 0; pointer-events: none;
65
+ background-image:
66
+ linear-gradient(rgba(100, 180, 255, 0.015) 1px, transparent 1px),
67
+ linear-gradient(90deg, rgba(100, 180, 255, 0.015) 1px, transparent 1px);
68
+ background-size: 60px 60px;
69
+ }
70
+
71
+ .app {
72
+ position: relative; z-index: 1;
73
+ max-width: 1400px;
74
+ margin: 0 auto;
75
+ padding: 20px 24px 40px;
76
+ display: flex; flex-direction: column; gap: 16px;
77
+ }
78
+
79
+ /* ══════════ HEADER ══════════ */
80
+ .header {
81
+ display: flex; align-items: center; justify-content: space-between;
82
+ padding: 14px 24px;
83
+ background: linear-gradient(135deg, var(--bg-mid), var(--bg-surface));
84
+ border: 1px solid var(--border-subtle); border-radius: 14px;
85
+ }
86
+ .header-left { display: flex; align-items: center; gap: 14px; }
87
+ .logo-mark {
88
+ width: 36px; height: 36px; border-radius: 9px;
89
+ background: linear-gradient(135deg, var(--cyan), rgba(0, 196, 255, 0.5));
90
+ display: flex; align-items: center; justify-content: center;
91
+ box-shadow: 0 0 24px var(--cyan-dim);
92
+ animation: logoPulse 4s ease-in-out infinite;
93
+ }
94
+ .logo-mark svg { width: 20px; height: 20px; }
95
+ @keyframes logoPulse { 0%,100% { box-shadow: 0 0 20px var(--cyan-dim); } 50% { box-shadow: 0 0 36px var(--cyan-glow); } }
96
+ .header-title { font-weight: 600; font-size: 16px; letter-spacing: 0.5px; }
97
+ .header-subtitle { font-family: var(--font-mono); font-size: 11px; font-weight: 300; color: var(--text-secondary); }
98
+ .header-right { display: flex; align-items: center; gap: 20px; }
99
+ .header-meta-label { font-family: var(--font-mono); font-size: 9px; font-weight: 500; letter-spacing: 1.5px; text-transform: uppercase; color: var(--text-tertiary); }
100
+ .header-meta-value { font-family: var(--font-mono); font-size: 12px; color: var(--cyan); }
101
+ .status-badge {
102
+ font-family: var(--font-mono); font-size: 10px; font-weight: 500;
103
+ letter-spacing: 1px; text-transform: uppercase;
104
+ padding: 5px 14px; border-radius: 20px;
105
+ background: var(--cyan-dim); color: var(--cyan);
106
+ border: 1px solid rgba(0, 196, 255, 0.2);
107
+ transition: all var(--duration-slow) var(--ease-out-expo);
108
+ }
109
+ .status-badge.warning { background: var(--amber-dim); color: var(--amber); border-color: rgba(240, 160, 48, 0.3); }
110
+ .status-badge.success { background: var(--emerald-dim); color: var(--emerald); border-color: rgba(0, 230, 118, 0.3); }
111
+ .status-badge.error { background: var(--coral-dim); color: var(--coral); border-color: rgba(255, 61, 90, 0.3); }
112
+
113
+ /* ══════════ SECTION TITLES ══════════ */
114
+ .section-title {
115
+ font-family: var(--font-mono); font-size: 10px; font-weight: 500;
116
+ letter-spacing: 2px; text-transform: uppercase; color: var(--text-tertiary);
117
+ padding: 8px 0 0;
118
+ display: flex; align-items: center; gap: 10px;
119
+ }
120
+ .section-title::after { content: ''; flex: 1; height: 1px; background: var(--border-subtle); }
121
+
122
+ /* ══════════ TRAINING CHART ══════════ */
123
+ .chart-section {
124
+ display: grid; grid-template-columns: 1fr 1fr; gap: 16px;
125
+ }
126
+ .chart-panel {
127
+ background: linear-gradient(180deg, rgba(15, 22, 35, 0.85), rgba(10, 15, 26, 0.95));
128
+ border: 1px solid var(--border-subtle); border-radius: 14px;
129
+ padding: 24px 28px; position: relative; overflow: hidden;
130
+ }
131
+ .chart-panel-title {
132
+ font-family: var(--font-mono); font-size: 10px; font-weight: 500;
133
+ letter-spacing: 1.5px; text-transform: uppercase; color: var(--text-secondary);
134
+ margin-bottom: 4px;
135
+ }
136
+ .chart-panel-subtitle {
137
+ font-size: 13px; font-weight: 300; color: var(--text-tertiary);
138
+ margin-bottom: 16px;
139
+ }
140
+ .chart-canvas-wrap {
141
+ position: relative; width: 100%; height: 280px;
142
+ }
143
+ canvas { width: 100% !important; height: 100% !important; }
144
+
145
+ .chart-stat-row {
146
+ display: flex; gap: 20px; margin-top: 16px; padding-top: 14px;
147
+ border-top: 1px solid var(--border-subtle);
148
+ }
149
+ .chart-stat { display: flex; flex-direction: column; gap: 2px; }
150
+ .chart-stat-label { font-family: var(--font-mono); font-size: 9px; font-weight: 500; letter-spacing: 1px; text-transform: uppercase; color: var(--text-tertiary); }
151
+ .chart-stat-value { font-family: var(--font-mono); font-size: 18px; font-weight: 300; }
152
+ .chart-stat-value.emerald { color: var(--emerald); }
153
+ .chart-stat-value.coral { color: var(--coral); }
154
+ .chart-stat-value.cyan { color: var(--cyan); }
155
+ .chart-stat-value.amber { color: var(--amber); }
156
+
157
+ /* ══════════ ANNOTATION BADGES ══════════ */
158
+ .annotation {
159
+ position: absolute; font-family: var(--font-mono); font-size: 9px;
160
+ font-weight: 500; letter-spacing: 0.5px; padding: 3px 8px;
161
+ border-radius: 4px; pointer-events: none; white-space: nowrap;
162
+ }
163
+
164
+ /* ══════════ WAR ROOM GRID ══════════ */
165
+ .warroom { display: grid; grid-template-columns: 220px 1fr 260px; gap: 14px; }
166
+
167
+ .panel {
168
+ background: linear-gradient(180deg, rgba(15, 22, 35, 0.85), rgba(10, 15, 26, 0.95));
169
+ border: 1px solid var(--border-subtle); border-radius: 14px; overflow: hidden;
170
+ }
171
+ .panel-header {
172
+ padding: 12px 16px 8px; display: flex; align-items: center; gap: 8px;
173
+ border-bottom: 1px solid var(--border-subtle);
174
+ }
175
+ .panel-header-dot {
176
+ width: 6px; height: 6px; border-radius: 50%;
177
+ background: var(--cyan); box-shadow: 0 0 8px var(--cyan-glow);
178
+ animation: dotPulse 3s ease-in-out infinite;
179
+ }
180
+ @keyframes dotPulse { 0%,100% { opacity: 0.6; } 50% { opacity: 1; } }
181
+ .panel-header-title { font-family: var(--font-mono); font-size: 10px; font-weight: 500; letter-spacing: 1.5px; text-transform: uppercase; color: var(--text-secondary); }
182
+
183
+ /* ══════════ ARCHITECTURE DIAGRAM ══════════ */
184
+ .arch-body {
185
+ display: flex; flex-direction: column; align-items: center;
186
+ padding: 16px 14px; gap: 0;
187
+ }
188
+ .arch-layer {
189
+ width: 100%; padding: 12px;
190
+ background: var(--bg-deep); border: 1px solid var(--border-subtle); border-radius: 8px;
191
+ transition: all var(--duration-slow) var(--ease-out-expo);
192
+ }
193
+ .arch-layer .layer-name { font-family: var(--font-mono); font-size: 10px; font-weight: 500; letter-spacing: 1.2px; text-transform: uppercase; color: var(--text-secondary); transition: color var(--duration-slow); }
194
+ .arch-layer .layer-detail { font-family: var(--font-mono); font-size: 9px; font-weight: 300; color: var(--text-tertiary); margin-top: 2px; }
195
+ .arch-layer.scanning { border-color: rgba(0, 196, 255, 0.3); background: linear-gradient(135deg, rgba(0, 196, 255, 0.06), var(--bg-deep)); box-shadow: 0 0 20px var(--cyan-dim); }
196
+ .arch-layer.scanning .layer-name { color: var(--cyan); }
197
+ .arch-layer.identified { border-color: rgba(240, 160, 48, 0.4); background: linear-gradient(135deg, rgba(240, 160, 48, 0.08), var(--bg-deep)); box-shadow: 0 0 25px var(--amber-dim); animation: identPulse 2s ease-in-out infinite; }
198
+ .arch-layer.identified .layer-name { color: var(--amber); }
199
+ @keyframes identPulse { 0%,100% { box-shadow: 0 0 20px var(--amber-dim); } 50% { box-shadow: 0 0 35px rgba(240, 160, 48, 0.25); } }
200
+ .arch-layer.resolved { border-color: rgba(0, 230, 118, 0.3); background: linear-gradient(135deg, rgba(0, 230, 118, 0.06), var(--bg-deep)); box-shadow: 0 0 20px var(--emerald-dim); }
201
+ .arch-layer.resolved .layer-name { color: var(--emerald); }
202
+
203
+ .arch-connector { width: 1px; height: 10px; background: linear-gradient(180deg, var(--border-active), transparent); position: relative; }
204
+ .arch-connector .data-dot { width: 3px; height: 3px; border-radius: 50%; background: var(--cyan); position: absolute; left: -1px; animation: flowDown 2s linear infinite; opacity: 0.6; }
205
+ @keyframes flowDown { 0% { top: 0; opacity: 0; } 20% { opacity: 0.8; } 80% { opacity: 0.8; } 100% { top: 100%; opacity: 0; } }
206
+
207
+ /* ══════════ INVESTIGATION LOG ══════════ */
208
+ .log-body {
209
+ height: 380px; overflow-y: auto; padding: 12px 16px;
210
+ display: flex; flex-direction: column; gap: 8px;
211
+ scrollbar-width: thin; scrollbar-color: rgba(100, 180, 255, 0.1) transparent;
212
+ }
213
+ .idle-prompt { display: flex; flex-direction: column; align-items: center; justify-content: center; height: 100%; gap: 12px; padding: 40px; }
214
+ .idle-prompt .idle-text { font-size: 13px; color: var(--text-secondary); text-align: center; line-height: 1.6; }
215
+
216
+ .incident-card {
217
+ background: linear-gradient(135deg, rgba(0, 196, 255, 0.04), var(--bg-surface));
218
+ border: 1px solid rgba(0, 196, 255, 0.12); border-radius: 10px;
219
+ padding: 14px; animation: cardIn 0.6s var(--ease-out-expo) both;
220
+ }
221
+ @keyframes cardIn { from { opacity: 0; transform: translateY(8px); } }
222
+ .incident-label { font-family: var(--font-mono); font-size: 9px; font-weight: 500; letter-spacing: 1.5px; text-transform: uppercase; color: var(--cyan); margin-bottom: 8px; }
223
+ .incident-text { font-size: 12px; line-height: 1.6; color: var(--text-primary); }
224
+ .incident-meta { display: flex; gap: 16px; margin-top: 10px; flex-wrap: wrap; }
225
+ .incident-meta-item .meta-label { font-family: var(--font-mono); font-size: 8px; font-weight: 500; letter-spacing: 1.2px; text-transform: uppercase; color: var(--text-tertiary); }
226
+ .incident-meta-item .meta-value { font-family: var(--font-mono); font-size: 11px; color: var(--text-secondary); }
227
+
228
+ .log-entry {
229
+ display: flex; gap: 10px; padding: 10px 12px;
230
+ background: var(--bg-deep); border: 1px solid var(--border-subtle); border-radius: 8px;
231
+ animation: entryIn 0.5s var(--ease-out-expo) both;
232
+ }
233
+ @keyframes entryIn { from { opacity: 0; transform: translateX(-12px); } }
234
+
235
+ .log-entry-icon {
236
+ width: 24px; height: 24px; border-radius: 6px;
237
+ display: flex; align-items: center; justify-content: center;
238
+ flex-shrink: 0; font-size: 11px;
239
+ }
240
+ .log-entry-icon.inspect { background: var(--cyan-dim); color: var(--cyan); }
241
+ .log-entry-icon.specialist { background: rgba(160, 120, 255, 0.12); color: #a078ff; }
242
+ .log-entry-icon.fix { background: var(--amber-dim); color: var(--amber); }
243
+ .log-entry-icon.submit { background: var(--emerald-dim); color: var(--emerald); }
244
+ .log-entry-content { flex: 1; min-width: 0; }
245
+ .log-entry-header { display: flex; align-items: center; justify-content: space-between; margin-bottom: 3px; }
246
+ .log-entry-type { font-family: var(--font-mono); font-size: 10px; font-weight: 500; letter-spacing: 0.8px; text-transform: uppercase; }
247
+ .log-entry-type.cyan { color: var(--cyan); }
248
+ .log-entry-type.purple { color: #a078ff; }
249
+ .log-entry-type.amber { color: var(--amber); }
250
+ .log-entry-type.emerald { color: var(--emerald); }
251
+ .log-entry-step { font-family: var(--font-mono); font-size: 9px; color: var(--text-tertiary); }
252
+ .log-entry-text { font-family: var(--font-mono); font-size: 10px; font-weight: 300; line-height: 1.55; color: var(--text-secondary); white-space: pre-wrap; word-break: break-word; }
253
+ .log-entry-reward { font-family: var(--font-mono); font-size: 10px; font-weight: 500; margin-top: 4px; }
254
+ .log-entry-reward.positive { color: var(--emerald); }
255
+ .log-entry-reward.negative { color: var(--coral); }
256
+
257
+ /* ══════════ SPECIALISTS ══════════ */
258
+ .specialists-body { padding: 10px 12px; display: flex; flex-direction: column; gap: 8px; overflow-y: auto; max-height: 420px; }
259
+ .specialist-card {
260
+ background: var(--bg-deep); border: 1px solid var(--border-subtle); border-radius: 10px;
261
+ padding: 10px 12px; transition: all var(--duration-slow) var(--ease-out-expo);
262
+ animation: cardIn 0.5s var(--ease-out-expo) both;
263
+ }
264
+ .specialist-card.highlighted { border-color: rgba(160, 120, 255, 0.3); background: linear-gradient(135deg, rgba(160, 120, 255, 0.05), var(--bg-deep)); }
265
+ .specialist-card.wrong { border-color: rgba(255, 61, 90, 0.15); opacity: 0.5; }
266
+ .specialist-card.correct { border-color: rgba(0, 230, 118, 0.2); }
267
+ .specialist-top { display: flex; align-items: center; justify-content: space-between; margin-bottom: 6px; }
268
+ .specialist-name { font-family: var(--font-mono); font-size: 10px; font-weight: 500; letter-spacing: 1px; text-transform: uppercase; color: var(--text-secondary); transition: color var(--duration-med); }
269
+ .specialist-card.highlighted .specialist-name { color: #a078ff; }
270
+ .specialist-card.correct .specialist-name { color: var(--emerald); }
271
+ .specialist-card.wrong .specialist-name { color: var(--coral); }
272
+ .confidence-bar { width: 50px; height: 3px; background: rgba(255,255,255,0.06); border-radius: 2px; overflow: hidden; }
273
+ .confidence-fill { height: 100%; border-radius: 2px; background: var(--cyan); transition: width 1s var(--ease-out-expo); }
274
+ .specialist-card.wrong .confidence-fill { background: var(--coral); }
275
+ .specialist-card.correct .confidence-fill { background: var(--emerald); }
276
+ .specialist-opinion { font-size: 11px; font-weight: 300; line-height: 1.4; color: var(--text-secondary); }
277
+ .specialist-card.wrong .specialist-opinion { opacity: 0.5; }
278
+ .specialist-verdict { font-family: var(--font-mono); font-size: 9px; font-weight: 500; letter-spacing: 1px; text-transform: uppercase; margin-top: 6px; opacity: 0; transition: opacity var(--duration-med); }
279
+ .specialist-card.wrong .specialist-verdict, .specialist-card.correct .specialist-verdict { opacity: 1; }
280
+ .specialist-card.wrong .specialist-verdict { color: var(--coral); }
281
+ .specialist-card.correct .specialist-verdict { color: var(--emerald); }
282
+
283
+ /* ══════════ VITALS BAR ══════════ */
284
+ .vitals-bar {
285
+ display: grid; grid-template-columns: repeat(5, 1fr); gap: 10px;
286
+ }
287
+ .vital {
288
+ background: linear-gradient(135deg, var(--bg-mid), var(--bg-surface));
289
+ border: 1px solid var(--border-subtle); border-radius: 10px;
290
+ padding: 10px 14px;
291
+ }
292
+ .vital-label { font-family: var(--font-mono); font-size: 9px; font-weight: 500; letter-spacing: 1.5px; text-transform: uppercase; color: var(--text-tertiary); margin-bottom: 4px; }
293
+ .vital-value { font-family: var(--font-mono); font-size: 20px; font-weight: 300; transition: color var(--duration-med); }
294
+ .vital-value.cyan { color: var(--cyan); }
295
+ .vital-value.amber { color: var(--amber); }
296
+ .vital-value.emerald { color: var(--emerald); }
297
+ .vital-value.coral { color: var(--coral); }
298
+ .steps-dots { display: flex; gap: 5px; margin-top: 4px; }
299
+ .step-dot { width: 8px; height: 8px; border-radius: 50%; background: rgba(255,255,255,0.08); border: 1px solid rgba(255,255,255,0.06); transition: all var(--duration-med); }
300
+ .step-dot.used { background: var(--cyan); border-color: var(--cyan); box-shadow: 0 0 8px var(--cyan-dim); }
301
+ .step-dot.current { background: var(--amber); border-color: var(--amber); box-shadow: 0 0 8px var(--amber-dim); animation: dotPulse 1.5s ease-in-out infinite; }
302
+
303
+ /* ══════════ CONTROLS ══════════ */
304
+ .controls {
305
+ display: flex; justify-content: center; gap: 12px; padding: 4px 0;
306
+ }
307
+ .ctrl-btn {
308
+ font-family: var(--font-mono); font-size: 12px; font-weight: 500;
309
+ letter-spacing: 1px; text-transform: uppercase;
310
+ padding: 12px 28px; border-radius: 10px;
311
+ cursor: pointer; transition: all var(--duration-fast) ease;
312
+ border: none;
313
+ }
314
+ .ctrl-btn.primary {
315
+ background: linear-gradient(135deg, var(--cyan), rgba(0, 160, 220, 0.9));
316
+ color: #fff; box-shadow: 0 0 24px var(--cyan-dim);
317
+ }
318
+ .ctrl-btn.primary:hover { box-shadow: 0 0 40px var(--cyan-glow); transform: translateY(-1px); }
319
+ .ctrl-btn.primary:disabled { opacity: 0.3; cursor: not-allowed; transform: none; box-shadow: none; }
320
+ .ctrl-btn.secondary {
321
+ background: var(--bg-surface); color: var(--text-secondary);
322
+ border: 1px solid var(--border-subtle);
323
+ }
324
+ .ctrl-btn.secondary:hover { border-color: var(--border-active); color: var(--text-primary); }
325
+ .server-input {
326
+ font-family: var(--font-mono); font-size: 11px; padding: 10px 14px;
327
+ background: var(--bg-deep); color: var(--cyan); border: 1px solid var(--border-subtle);
328
+ border-radius: 8px; width: 200px; outline: none;
329
+ }
330
+ .server-input:focus { border-color: var(--border-active); }
331
+ .conn-status {
332
+ font-family: var(--font-mono); font-size: 10px; text-transform: uppercase;
333
+ letter-spacing: 1px; color: var(--text-tertiary); padding: 0 8px;
334
+ }
335
+ .conn-status.connected { color: var(--emerald); }
336
+ .conn-status.error { color: var(--coral); }
337
+ .conn-status.running { color: var(--amber); }
338
+
339
+ /* ══════════ DIAGNOSIS OVERLAY ══════════ */
340
+ .diagnosis-overlay {
341
+ position: fixed; inset: 0;
342
+ background: rgba(6, 10, 17, 0.85); backdrop-filter: blur(20px);
343
+ display: flex; align-items: center; justify-content: center;
344
+ z-index: 100; opacity: 0; pointer-events: none;
345
+ transition: opacity 0.6s var(--ease-out-expo);
346
+ }
347
+ .diagnosis-overlay.visible { opacity: 1; pointer-events: auto; }
348
+ .diagnosis-card {
349
+ background: linear-gradient(180deg, var(--bg-surface), var(--bg-deep));
350
+ border: 1px solid var(--border-active); border-radius: 20px;
351
+ padding: 40px 48px; max-width: 520px; width: 100%; text-align: center;
352
+ transform: scale(0.92) translateY(20px);
353
+ transition: transform 0.8s var(--ease-out-expo);
354
+ box-shadow: 0 0 60px rgba(0, 196, 255, 0.08), 0 20px 60px rgba(0, 0, 0, 0.4);
355
+ }
356
+ .diagnosis-overlay.visible .diagnosis-card { transform: scale(1) translateY(0); }
357
+ .diagnosis-title { font-family: var(--font-mono); font-size: 11px; font-weight: 500; letter-spacing: 3px; text-transform: uppercase; color: var(--cyan); margin-bottom: 24px; }
358
+ .diagnosis-result { display: flex; flex-direction: column; gap: 12px; margin-bottom: 28px; }
359
+ .diagnosis-row { display: flex; align-items: center; justify-content: space-between; padding: 12px 16px; background: var(--bg-deep); border-radius: 10px; border: 1px solid var(--border-subtle); }
360
+ .diagnosis-row-label { font-family: var(--font-mono); font-size: 10px; font-weight: 500; letter-spacing: 1px; text-transform: uppercase; color: var(--text-tertiary); }
361
+ .diagnosis-row-value { font-family: var(--font-mono); font-size: 13px; }
362
+ .diagnosis-row-value.correct { color: var(--emerald); }
363
+ .diagnosis-row-value.wrong { color: var(--coral); }
364
+ .diagnosis-reward { font-size: 48px; font-weight: 700; letter-spacing: -2px; margin-bottom: 8px; }
365
+ .diagnosis-reward-label { font-family: var(--font-mono); font-size: 10px; letter-spacing: 1px; text-transform: uppercase; color: var(--text-tertiary); }
366
+
367
+ ::-webkit-scrollbar { width: 4px; }
368
+ ::-webkit-scrollbar-track { background: transparent; }
369
+ ::-webkit-scrollbar-thumb { background: rgba(100, 180, 255, 0.15); border-radius: 4px; }
370
+
371
+ @media (max-width: 1000px) {
372
+ .warroom { grid-template-columns: 1fr; }
373
+ .chart-section { grid-template-columns: 1fr; }
374
+ .vitals-bar { grid-template-columns: repeat(3, 1fr); }
375
+ }
376
+ </style>
377
+ </head>
378
+ <body>
379
+ <div class="grid-overlay"></div>
380
+
381
+ <div class="app">
382
+ <header class="header">
383
+ <div class="header-left">
384
+ <div class="logo-mark">
385
+ <svg viewBox="0 0 24 24" fill="none" stroke="white" stroke-width="2" stroke-linecap="round">
386
+ <path d="M12 3v18M3 12h18M7 7l10 10M17 7L7 17"/>
387
+ </svg>
388
+ </div>
389
+ <div>
390
+ <div class="header-title">Stack Doctor</div>
391
+ <div class="header-subtitle">Incident War Room</div>
392
+ </div>
393
+ </div>
394
+ <div class="header-right">
395
+ <div>
396
+ <div class="header-meta-label">Scenario</div>
397
+ <div class="header-meta-value" id="scenarioId">&mdash;</div>
398
+ </div>
399
+ <div>
400
+ <div class="header-meta-label">Episode</div>
401
+ <div class="header-meta-value" id="episodeTime">00:00</div>
402
+ </div>
403
+ <div class="status-badge" id="statusBadge">Standby</div>
404
+ </div>
405
+ </header>
406
+
407
+ <!-- ══════════ TRAINING DATA ══════════ -->
408
+ <div class="section-title">Training Analytics</div>
409
+ <div class="chart-section" style="grid-template-columns: 1fr; max-width: 900px; margin: 0 auto; width: 100%;">
410
+ <div class="chart-panel">
411
+ <div class="chart-panel-title">Qwen3.5-9B &mdash; Episode Reward</div>
412
+ <div class="chart-panel-subtitle">100 GRPO steps &mdash; base model already near-oracle</div>
413
+ <div class="chart-canvas-wrap"><canvas id="rewardChart"></canvas></div>
414
+ <div class="chart-stat-row">
415
+ <div class="chart-stat">
416
+ <div class="chart-stat-label">Peak</div>
417
+ <div class="chart-stat-value emerald">+26.00</div>
418
+ </div>
419
+ <div class="chart-stat">
420
+ <div class="chart-stat-label">Base Avg</div>
421
+ <div class="chart-stat-value cyan">+19.50</div>
422
+ </div>
423
+ <div class="chart-stat">
424
+ <div class="chart-stat-label">Zero-Std</div>
425
+ <div class="chart-stat-value coral">72%</div>
426
+ </div>
427
+ </div>
428
+ </div>
429
+ <div class="chart-panel">
430
+ <div class="chart-panel-title">Qwen3.5-9B &mdash; Completion Length</div>
431
+ <div class="chart-panel-subtitle">Thinking mode consumed token budget, hit 2048 cap</div>
432
+ <div class="chart-canvas-wrap"><canvas id="lengthChart"></canvas></div>
433
+ <div class="chart-stat-row">
434
+ <div class="chart-stat">
435
+ <div class="chart-stat-label">Collapse</div>
436
+ <div class="chart-stat-value coral">Step 36</div>
437
+ </div>
438
+ <div class="chart-stat">
439
+ <div class="chart-stat-label">Clipping</div>
440
+ <div class="chart-stat-value amber">Step 69</div>
441
+ </div>
442
+ </div>
443
+ </div>
444
+ <div class="chart-panel" style="border-color: rgba(0, 196, 255, 0.15);">
445
+ <div class="chart-panel-title">Qwen2.5-1.5B &mdash; Episode Reward</div>
446
+ <div class="chart-panel-subtitle">16 GRPO steps &mdash; weak model, real gradient signal</div>
447
+ <div class="chart-canvas-wrap"><canvas id="reward1bChart"></canvas></div>
448
+ <div class="chart-stat-row">
449
+ <div class="chart-stat">
450
+ <div class="chart-stat-label">Best Step</div>
451
+ <div class="chart-stat-value cyan">-1.75</div>
452
+ </div>
453
+ <div class="chart-stat">
454
+ <div class="chart-stat-label">Avg</div>
455
+ <div class="chart-stat-value amber">-4.90</div>
456
+ </div>
457
+ <div class="chart-stat">
458
+ <div class="chart-stat-label">Zero-Std</div>
459
+ <div class="chart-stat-value emerald">0%</div>
460
+ </div>
461
+ </div>
462
+ </div>
463
+ </div>
464
+
465
+ <!-- ══════════ DEMO CONTROLS ══════════ -->
466
+ <div class="section-title">Live Environment</div>
467
+ <div class="controls">
468
+ <input type="text" id="serverUrl" class="server-input" placeholder="Server URL (empty = same origin)">
469
+ <script>
470
+ /* Auto-detect: use localhost:8000 for local dev, empty for HF Spaces */
471
+ if (location.hostname === 'localhost' || location.hostname === '127.0.0.1') {
472
+ document.getElementById('serverUrl').value = 'http://localhost:8000';
473
+ }
474
+ </script>
475
+ <button class="ctrl-btn primary" id="demoBtn" onclick="runComparison()">&#9654; Run Comparison (Base → GRPO Trained)</button>
476
+ <button class="ctrl-btn secondary" id="resetBtn" onclick="resetState()">&#8634; Reset</button>
477
+ <span id="modelStatus" class="conn-status" style="margin-left:8px;">Model: checking...</span>
478
+ <span id="connStatus" class="conn-status">Disconnected</span>
479
+ </div>
480
+
481
+ <!-- ══════════ WAR ROOM ══════════ -->
482
+ <div class="warroom">
483
+ <div class="panel">
484
+ <div class="panel-header">
485
+ <div class="panel-header-dot"></div>
486
+ <div class="panel-header-title">Inference Stack</div>
487
+ </div>
488
+ <div class="arch-body" id="archBody">
489
+ <div class="arch-layer" id="layer-model"><div class="layer-name">Model</div><div class="layer-detail" id="detail-model">&mdash;</div></div>
490
+ <div class="arch-connector"><div class="data-dot"></div></div>
491
+ <div class="arch-layer" id="layer-kernel"><div class="layer-name">Kernel</div><div class="layer-detail">Attention / GEMM</div></div>
492
+ <div class="arch-connector"><div class="data-dot" style="animation-delay:-0.5s"></div></div>
493
+ <div class="arch-layer" id="layer-backend"><div class="layer-name">Backend</div><div class="layer-detail" id="detail-backend">&mdash;</div></div>
494
+ <div class="arch-connector"><div class="data-dot" style="animation-delay:-1s"></div></div>
495
+ <div class="arch-layer" id="layer-runtime"><div class="layer-name">Runtime</div><div class="layer-detail">CUDA / ROCm</div></div>
496
+ <div class="arch-connector"><div class="data-dot" style="animation-delay:-1.5s"></div></div>
497
+ <div class="arch-layer" id="layer-memory"><div class="layer-name">Memory</div><div class="layer-detail">HBM / KV Cache</div></div>
498
+ <div class="arch-connector"><div class="data-dot" style="animation-delay:-2s"></div></div>
499
+ <div class="arch-layer" id="layer-driver"><div class="layer-name">Driver</div><div class="layer-detail" id="detail-driver">&mdash;</div></div>
500
+ </div>
501
+ </div>
502
+
503
+ <div class="panel">
504
+ <div class="panel-header">
505
+ <div class="panel-header-dot"></div>
506
+ <div class="panel-header-title">Investigation Log</div>
507
+ </div>
508
+ <div class="log-body" id="logBody">
509
+ <div class="idle-prompt" id="idlePrompt">
510
+ <div class="idle-text">Awaiting incident assignment.<br>Click <strong>Run Comparison</strong> above to start.</div>
511
+ </div>
512
+ </div>
513
+ </div>
514
+
515
+ <div class="panel">
516
+ <div class="panel-header">
517
+ <div class="panel-header-dot"></div>
518
+ <div class="panel-header-title">Specialist Agents</div>
519
+ </div>
520
+ <div class="specialists-body" id="specialistsBody"></div>
521
+ </div>
522
+ </div>
523
+
524
+ <!-- ══════════ VITALS ══════════ -->
525
+ <div class="vitals-bar">
526
+ <div class="vital">
527
+ <div class="vital-label">Steps</div>
528
+ <div class="steps-dots" id="stepsDots">
529
+ <div class="step-dot"></div><div class="step-dot"></div><div class="step-dot"></div>
530
+ <div class="step-dot"></div><div class="step-dot"></div><div class="step-dot"></div>
531
+ </div>
532
+ </div>
533
+ <div class="vital"><div class="vital-label">Reward</div><div class="vital-value" id="rewardValue">0.00</div></div>
534
+ <div class="vital"><div class="vital-label">Fix Status</div><div class="vital-value" id="fixStatus" style="font-size:13px">Not Applied</div></div>
535
+ <div class="vital"><div class="vital-label">Root Cause</div><div class="vital-value" id="rootCauseValue" style="font-size:13px">&mdash;</div></div>
536
+ <div class="vital"><div class="vital-label">Diagnosis</div><div class="vital-value" id="diagnosisValue" style="font-size:13px">Pending</div></div>
537
+ </div>
538
+ </div>
539
+
540
+ <div class="diagnosis-overlay" id="diagnosisOverlay">
541
+ <div class="diagnosis-card">
542
+ <div class="diagnosis-title">Diagnosis Submitted</div>
543
+ <div class="diagnosis-result" id="diagnosisResult"></div>
544
+ <div class="diagnosis-reward" id="diagnosisReward">+0.00</div>
545
+ <div class="diagnosis-reward-label">Episode Reward</div>
546
+ </div>
547
+ </div>
548
+
549
+ <script>
550
+ /* ═══════════════════════════════════════════════
551
+ TRAINING DATA — Qwen3.5-9B, 100 GRPO steps
552
+ ═══════════════════════════════════════════════ */
553
+ var TRAIN_DATA = {
554
+ steps: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100],
555
+ reward: [9.65,19.75,7.25,24.25,23.00,22.75,2.62,22.75,23.00,24.50,16.00,23.00,24.50,7.25,24.62,26.00,26.00,26.00,22.50,14.12,-5.45,7.12,-2.60,-8.50,18.50,0.88,26.25,7.88,9.62,7.88,6.88,24.25,-6.25,-5.50,-1.88,-1.75,-1.75,-5.12,-2.62,-1.75,-1.75,-1.75,-2.62,-1.75,-1.75,-1.75,-1.75,-1.75,-1.75,-1.75,-5.12,-1.75,-1.75,-1.75,-1.75,-1.75,-1.75,-1.75,-5.12,-5.12,-1.75,-1.75,-1.75,-1.75,-8.50,-5.12,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-8.50,-5.00,-8.50,-8.50,-8.50,-1.75,-5.12,-1.75,-1.75,-1.75,-5.12,-1.75,-1.75,-0.12,-1.75,-2.62,-1.75,-1.75],
556
+ completion_length: [68,98,91,96,85,92,96,74,99,71,86,116,112,124,126,175,105,120,152,112,148,190,193,182,164,135,140,218,130,152,164,182,134,93,24,15,18,87,16,16,15,15,15,15,13,15,15,16,15,15,125,62,180,876,376,280,883,484,734,470,432,488,354,177,471,607,248,210,1234,3,2048,2048,2048,2048,2048,1024,1026,1078,2048,1025,14,1030,2048,1025,2048,2048,2048,1364,1032,15,15,15,14,15,15,1032,2048,1065,2048,1058],
557
+ };
558
+
559
+ /* Qwen2.5-1.5B — 16 steps before crash (fixed in next run) */
560
+ var TRAIN_DATA_1B = {
561
+ steps: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16],
562
+ reward: [-5.625,-3.375,-8.375,-7.75,-2.75,-2.625,-3.0,-5.125,-8.875,-3.0,-1.75,-3.875,-9.5,-5.5,-3.125,-6.125],
563
+ completion_length: [65.5,57.5,53,36.5,81,98,89,41.5,46,62.5,75,72.5,48,52.5,44,60],
564
+ };
565
+
566
+ /* ═══════════════════════════════════════════════
567
+ CHARTS — lightweight canvas rendering
568
+ ═══════════════════════════════════════════════ */
569
+ function drawChart(canvasId, data, opts) {
570
+ var canvas = document.getElementById(canvasId);
571
+ var dpr = window.devicePixelRatio || 1;
572
+ var rect = canvas.parentElement.getBoundingClientRect();
573
+ canvas.width = rect.width * dpr;
574
+ canvas.height = rect.height * dpr;
575
+ canvas.style.width = rect.width + 'px';
576
+ canvas.style.height = rect.height + 'px';
577
+ var ctx = canvas.getContext('2d');
578
+ ctx.scale(dpr, dpr);
579
+ var W = rect.width, H = rect.height;
580
+ var pad = { top: 28, right: 24, bottom: 40, left: 64 };
581
+ var plotW = W - pad.left - pad.right;
582
+ var plotH = H - pad.top - pad.bottom;
583
+
584
+ var minY = opts.minY !== undefined ? opts.minY : Math.min.apply(null, data);
585
+ var maxY = opts.maxY !== undefined ? opts.maxY : Math.max.apply(null, data);
586
+ var rangeY = maxY - minY || 1;
587
+
588
+ function xPos(i) { return pad.left + (i / (data.length - 1)) * plotW; }
589
+ function yPos(v) { return pad.top + plotH - ((v - minY) / rangeY) * plotH; }
590
+
591
+ // Grid lines
592
+ ctx.strokeStyle = 'rgba(100,180,255,0.06)';
593
+ ctx.lineWidth = 1;
594
+ var gridCount = 5;
595
+ for (var g = 0; g <= gridCount; g++) {
596
+ var gy = pad.top + (g / gridCount) * plotH;
597
+ ctx.beginPath(); ctx.moveTo(pad.left, gy); ctx.lineTo(W - pad.right, gy); ctx.stroke();
598
+ var label = (maxY - (g / gridCount) * rangeY).toFixed(0);
599
+ ctx.fillStyle = 'rgba(216,224,236,0.25)';
600
+ ctx.font = '10px IBM Plex Mono';
601
+ ctx.textAlign = 'right';
602
+ ctx.fillText(label, pad.left - 12, gy + 4);
603
+ }
604
+
605
+ // X axis labels
606
+ var stepsArr = opts.stepsArray || TRAIN_DATA.steps;
607
+ var xInterval = data.length > 30 ? 20 : data.length > 10 ? 5 : 2;
608
+ ctx.fillStyle = 'rgba(216,224,236,0.25)';
609
+ ctx.textAlign = 'center';
610
+ for (var x = 0; x < data.length; x += xInterval) {
611
+ ctx.fillText(stepsArr[x], xPos(x), H - 8);
612
+ }
613
+ ctx.fillText(stepsArr[data.length - 1], xPos(data.length - 1), H - 8);
614
+
615
+ // Zero line for reward chart
616
+ if (opts.zeroLine) {
617
+ var zy = yPos(0);
618
+ if (zy >= pad.top && zy <= pad.top + plotH) {
619
+ ctx.strokeStyle = 'rgba(255,255,255,0.12)';
620
+ ctx.setLineDash([4, 4]);
621
+ ctx.beginPath(); ctx.moveTo(pad.left, zy); ctx.lineTo(W - pad.right, zy); ctx.stroke();
622
+ ctx.setLineDash([]);
623
+ }
624
+ }
625
+
626
+ // Reference lines (baselines) — drawn before data so they appear underneath
627
+ if (opts.refLines) {
628
+ opts.refLines.forEach(function(ref) {
629
+ var ry = yPos(ref.value);
630
+ if (ry >= pad.top - 5 && ry <= pad.top + plotH + 5) {
631
+ ctx.strokeStyle = ref.color || 'rgba(255,255,255,0.2)';
632
+ ctx.lineWidth = 1.5;
633
+ ctx.setLineDash(ref.dash || [6, 4]);
634
+ ctx.beginPath(); ctx.moveTo(pad.left, ry); ctx.lineTo(W - pad.right, ry); ctx.stroke();
635
+ ctx.setLineDash([]);
636
+ // Draw label with opaque background so it's never covered
637
+ var labelY = ref.labelBelow ? ry + 24 : ry - 16;
638
+ ctx.font = '600 9px IBM Plex Mono';
639
+ var textWidth = ctx.measureText(ref.label).width;
640
+ // Background pill
641
+ ctx.fillStyle = 'rgba(6, 10, 17, 0.85)';
642
+ ctx.beginPath();
643
+ ctx.roundRect(W - pad.right - textWidth - 16, labelY - 9, textWidth + 12, 14, 3);
644
+ ctx.fill();
645
+ // Label text
646
+ ctx.fillStyle = ref.color || 'rgba(255,255,255,0.3)';
647
+ ctx.textAlign = 'right';
648
+ ctx.fillText(ref.label, W - pad.right - 8, labelY);
649
+ }
650
+ });
651
+ }
652
+
653
+ // Regions
654
+ if (opts.regions) {
655
+ opts.regions.forEach(function(r) {
656
+ var x1 = xPos(r.from);
657
+ var x2 = xPos(r.to);
658
+ ctx.fillStyle = r.color;
659
+ ctx.fillRect(x1, pad.top, x2 - x1, plotH);
660
+ ctx.fillStyle = r.labelColor || 'rgba(216,224,236,0.35)';
661
+ ctx.font = '600 8px IBM Plex Mono';
662
+ ctx.textAlign = 'center';
663
+ ctx.fillText(r.label, (x1 + x2) / 2, pad.top - 6);
664
+ });
665
+ }
666
+
667
+ // Line gradient
668
+ var grad = ctx.createLinearGradient(pad.left, 0, W - pad.right, 0);
669
+ if (opts.gradientStops) {
670
+ opts.gradientStops.forEach(function(s) { grad.addColorStop(s[0], s[1]); });
671
+ } else {
672
+ grad.addColorStop(0, opts.color || '#00c4ff');
673
+ grad.addColorStop(1, opts.color || '#00c4ff');
674
+ }
675
+
676
+ // Area fill
677
+ ctx.beginPath();
678
+ ctx.moveTo(xPos(0), yPos(data[0]));
679
+ for (var i = 1; i < data.length; i++) ctx.lineTo(xPos(i), yPos(data[i]));
680
+ ctx.lineTo(xPos(data.length - 1), yPos(minY));
681
+ ctx.lineTo(xPos(0), yPos(minY));
682
+ ctx.closePath();
683
+ var areaGrad = ctx.createLinearGradient(0, pad.top, 0, pad.top + plotH);
684
+ areaGrad.addColorStop(0, (opts.areaColor || 'rgba(0,196,255,0.12)'));
685
+ areaGrad.addColorStop(1, 'rgba(0,196,255,0)');
686
+ ctx.fillStyle = areaGrad;
687
+ ctx.fill();
688
+
689
+ // Line
690
+ ctx.strokeStyle = grad;
691
+ ctx.lineWidth = 2;
692
+ ctx.lineJoin = 'round';
693
+ ctx.beginPath();
694
+ ctx.moveTo(xPos(0), yPos(data[0]));
695
+ for (var j = 1; j < data.length; j++) ctx.lineTo(xPos(j), yPos(data[j]));
696
+ ctx.stroke();
697
+
698
+ // Dot at end
699
+ var lastX = xPos(data.length - 1), lastY = yPos(data[data.length - 1]);
700
+ ctx.fillStyle = opts.color || '#00c4ff';
701
+ ctx.beginPath(); ctx.arc(lastX, lastY, 3, 0, Math.PI * 2); ctx.fill();
702
+ }
703
+
704
+ function renderCharts() {
705
+ drawChart('rewardChart', TRAIN_DATA.reward, {
706
+ minY: -12, maxY: 30, zeroLine: true,
707
+ gradientStops: [[0, '#00e676'], [0.18, '#00e676'], [0.22, '#f0a030'], [0.35, '#ff3d5a'], [1, '#ff3d5a']],
708
+ areaColor: 'rgba(0,196,255,0.08)',
709
+ regions: [
710
+ { from: 0, to: 19, label: 'BASE MODEL', color: 'rgba(0,230,118,0.04)', labelColor: 'rgba(0,230,118,0.5)' },
711
+ { from: 20, to: 35, label: 'DEGRADATION', color: 'rgba(240,160,48,0.04)', labelColor: 'rgba(240,160,48,0.5)' },
712
+ { from: 36, to: 69, label: 'COLLAPSE', color: 'rgba(255,61,90,0.04)', labelColor: 'rgba(255,61,90,0.5)' },
713
+ { from: 70, to: 99, label: 'CLIPPING', color: 'rgba(255,61,90,0.03)', labelColor: 'rgba(255,61,90,0.4)' },
714
+ ],
715
+ refLines: [
716
+ { value: 19.5, label: '9B BASELINE +19.5', color: 'rgba(0,230,118,0.5)', dash: [8, 4] },
717
+ ],
718
+ });
719
+
720
+ drawChart('lengthChart', TRAIN_DATA.completion_length, {
721
+ minY: 0, maxY: 2200,
722
+ color: '#f0a030',
723
+ areaColor: 'rgba(240,160,48,0.08)',
724
+ regions: [
725
+ { from: 0, to: 35, label: 'NORMAL', color: 'rgba(0,230,118,0.03)', labelColor: 'rgba(0,230,118,0.4)' },
726
+ { from: 36, to: 68, label: 'SHORT OUTPUT', color: 'rgba(240,160,48,0.04)', labelColor: 'rgba(240,160,48,0.5)' },
727
+ { from: 69, to: 99, label: 'HIT CAP (2048)', color: 'rgba(255,61,90,0.04)', labelColor: 'rgba(255,61,90,0.5)' },
728
+ ],
729
+ });
730
+
731
+ drawChart('reward1bChart', TRAIN_DATA_1B.reward, {
732
+ minY: -12, maxY: 22, zeroLine: true,
733
+ stepsArray: TRAIN_DATA_1B.steps,
734
+ color: '#00c4ff',
735
+ areaColor: 'rgba(0,196,255,0.1)',
736
+ gradientStops: [[0, '#ff3d5a'], [0.4, '#f0a030'], [0.7, '#00c4ff'], [1, '#00c4ff']],
737
+ refLines: [
738
+ { value: 19.5, label: '9B BASELINE +19.5', color: 'rgba(0,230,118,0.4)', dash: [8, 4] },
739
+ { value: -4.9, label: '1.5B BASELINE -4.9', color: 'rgba(240,160,48,0.5)', dash: [4, 4], labelBelow: true },
740
+ ],
741
+ });
742
+ }
743
+
744
+ window.addEventListener('resize', renderCharts);
745
+ setTimeout(renderCharts, 100);
746
+
747
+ /* ═══════════════════════════════════════════════
748
+ WAR ROOM ENGINE
749
+ ═══════════════════════════════════════════════ */
750
+ var ACTION_ICONS = { inspect: '\u2315', ask_specialist: '\u25C9', apply_fix: '\u26A1', submit: '\u2713', reward_breakdown: '\u2261' };
751
+ var state = { step: 0, reward: 0, fixApplied: false, done: false, startTime: null, timerInterval: null };
752
+
753
+ function startTimer() {
754
+ state.startTime = Date.now();
755
+ state.timerInterval = setInterval(function() {
756
+ var e = Math.floor((Date.now() - state.startTime) / 1000);
757
+ document.getElementById('episodeTime').textContent = String(Math.floor(e/60)).padStart(2,'0') + ':' + String(e%60).padStart(2,'0');
758
+ }, 1000);
759
+ }
760
+ function stopTimer() { if (state.timerInterval) clearInterval(state.timerInterval); }
761
+
762
+ function updateSteps(step) {
763
+ document.querySelectorAll('.step-dot').forEach(function(d, i) {
764
+ d.className = 'step-dot';
765
+ if (i < step) d.classList.add('used');
766
+ else if (i === step && !state.done) d.classList.add('current');
767
+ });
768
+ }
769
+
770
+ function updateReward(delta) {
771
+ state.reward += delta;
772
+ var el = document.getElementById('rewardValue');
773
+ el.textContent = (state.reward >= 0 ? '+' : '') + state.reward.toFixed(2);
774
+ el.className = 'vital-value ' + (state.reward >= 0 ? 'emerald' : 'coral');
775
+ }
776
+
777
+ function setLayerState(id, cls) {
778
+ var el = document.getElementById('layer-' + id);
779
+ if (el) el.className = 'arch-layer' + (cls ? ' ' + cls : '');
780
+ }
781
+ function clearAllLayers() { document.querySelectorAll('.arch-layer').forEach(function(e) { e.className = 'arch-layer'; }); }
782
+ function setStatus(text, cls) { var el = document.getElementById('statusBadge'); el.textContent = text; el.className = 'status-badge' + (cls ? ' ' + cls : ''); }
783
+
784
+ function addLogEntry(opts) {
785
+ var idle = document.getElementById('idlePrompt'); if (idle) idle.remove();
786
+ var body = document.getElementById('logBody');
787
+ var entry = document.createElement('div'); entry.className = 'log-entry';
788
+ var iconCls = opts.type === 'ask_specialist' ? 'specialist' : opts.type;
789
+ var typeCls = opts.type === 'inspect' ? 'cyan' : opts.type === 'ask_specialist' ? 'purple' : opts.type === 'apply_fix' ? 'amber' : 'emerald';
790
+ var iconEl = document.createElement('div'); iconEl.className = 'log-entry-icon ' + iconCls; iconEl.textContent = ACTION_ICONS[opts.type] || '\u2022';
791
+ var contentEl = document.createElement('div'); contentEl.className = 'log-entry-content';
792
+ var headerEl = document.createElement('div'); headerEl.className = 'log-entry-header';
793
+ var typeEl = document.createElement('div'); typeEl.className = 'log-entry-type ' + typeCls; typeEl.textContent = opts.label;
794
+ var stepEl = document.createElement('div'); stepEl.className = 'log-entry-step'; stepEl.textContent = 'Step ' + state.step + '/6';
795
+ headerEl.appendChild(typeEl); headerEl.appendChild(stepEl);
796
+ var textEl = document.createElement('div'); textEl.className = 'log-entry-text'; textEl.textContent = opts.text;
797
+ contentEl.appendChild(headerEl); contentEl.appendChild(textEl);
798
+ if (opts.reward !== undefined) {
799
+ var rEl = document.createElement('div'); rEl.className = 'log-entry-reward ' + (opts.reward >= 0 ? 'positive' : 'negative');
800
+ rEl.textContent = (opts.reward >= 0 ? '+' : '') + opts.reward.toFixed(2); contentEl.appendChild(rEl);
801
+ }
802
+ entry.appendChild(iconEl); entry.appendChild(contentEl);
803
+ body.appendChild(entry); body.scrollTop = body.scrollHeight;
804
+ }
805
+
806
+ function addIncidentCard(sc) {
807
+ var idle = document.getElementById('idlePrompt'); if (idle) idle.remove();
808
+ var body = document.getElementById('logBody');
809
+ var card = document.createElement('div'); card.className = 'incident-card';
810
+ var label = document.createElement('div'); label.className = 'incident-label'; label.textContent = 'Incident Ticket';
811
+ var text = document.createElement('div'); text.className = 'incident-text'; text.textContent = sc.incident_ticket;
812
+ var meta = document.createElement('div'); meta.className = 'incident-meta';
813
+ [['Hardware', sc.hardware], ['Model', sc.model_name], ['Backend', sc.backend]].forEach(function(f) {
814
+ var item = document.createElement('div'); item.className = 'incident-meta-item';
815
+ var ml = document.createElement('div'); ml.className = 'meta-label'; ml.textContent = f[0];
816
+ var mv = document.createElement('div'); mv.className = 'meta-value'; mv.textContent = f[1];
817
+ item.appendChild(ml); item.appendChild(mv); meta.appendChild(item);
818
+ });
819
+ card.appendChild(label); card.appendChild(text); card.appendChild(meta); body.appendChild(card);
820
+ }
821
+
822
+ function populateSpecialists(ops) {
823
+ var body = document.getElementById('specialistsBody'); body.textContent = '';
824
+ var i = 0;
825
+ Object.keys(ops).forEach(function(name) {
826
+ var d = ops[name];
827
+ var card = document.createElement('div'); card.className = 'specialist-card'; card.id = 'specialist-' + name;
828
+ card.style.animationDelay = (i * 0.1) + 's';
829
+ var top = document.createElement('div'); top.className = 'specialist-top';
830
+ var nameEl = document.createElement('div'); nameEl.className = 'specialist-name'; nameEl.textContent = name;
831
+ var barW = document.createElement('div'); barW.className = 'confidence-bar';
832
+ var barF = document.createElement('div'); barF.className = 'confidence-fill'; barF.style.width = (d.confidence * 100) + '%';
833
+ barW.appendChild(barF); top.appendChild(nameEl); top.appendChild(barW);
834
+ var opEl = document.createElement('div'); opEl.className = 'specialist-opinion'; opEl.textContent = d.opinion;
835
+ var vEl = document.createElement('div'); vEl.className = 'specialist-verdict';
836
+ card.appendChild(top); card.appendChild(opEl); card.appendChild(vEl); body.appendChild(card); i++;
837
+ });
838
+ }
839
+
840
+ function highlightSpecialist(name) {
841
+ document.querySelectorAll('.specialist-card').forEach(function(c) { c.classList.remove('highlighted'); });
842
+ var c = document.getElementById('specialist-' + name); if (c) c.classList.add('highlighted');
843
+ }
844
/* Grade a specialist card after the episode ends: 'correct' styling with a
   check mark when the specialist was helpful, 'wrong' styling otherwise. */
function markSpecialist(name, correct) {
  var card = document.getElementById('specialist-' + name);
  if (!card) return;
  card.classList.remove('highlighted');
  card.classList.add(correct ? 'correct' : 'wrong');
  var verdict = card.querySelector('.specialist-verdict');
  if (verdict) {
    verdict.textContent = correct ? '\u2713 Helpful' : '\u2014 Not Relevant';
  }
}
849
+
850
/* Render the end-of-episode diagnosis overlay: a graded row each for root
   cause and fix, plus steps used, elapsed time, and the color-coded total
   reward. Expects d = {rootCause, fix, rcCorrect, fixCorrect, correctRc,
   correctFix, totalReward}; reads episode progress from the global `state`. */
function showDiagnosis(d) {
  var overlay = document.getElementById('diagnosisOverlay');
  var result = document.getElementById('diagnosisResult'); result.textContent = '';
  // Each tuple is [label, submitted value, was-correct flag, expected value].
  [['Root Cause', d.rootCause, d.rcCorrect, d.correctRc], ['Fix', d.fix, d.fixCorrect, d.correctFix]].forEach(function(r) {
    var row = document.createElement('div'); row.className = 'diagnosis-row';
    var lbl = document.createElement('div'); lbl.className = 'diagnosis-row-label'; lbl.textContent = r[0];
    var val = document.createElement('div'); val.className = 'diagnosis-row-value ' + (r[2] ? 'correct' : 'wrong');
    // Correct: "value ✓". Wrong: "value ✗ → expected".
    val.textContent = r[1] + (r[2] ? ' \u2713' : ' \u2717 \u2192 ' + r[3]);
    row.appendChild(lbl); row.appendChild(val); result.appendChild(row);
  });
  // Steps used + time
  var stepsRow = document.createElement('div'); stepsRow.className = 'diagnosis-row';
  var stepsLbl = document.createElement('div'); stepsLbl.className = 'diagnosis-row-label'; stepsLbl.textContent = 'Steps Used';
  var stepsVal = document.createElement('div'); stepsVal.className = 'diagnosis-row-value'; stepsVal.textContent = state.step + ' / 6';
  stepsRow.appendChild(stepsLbl); stepsRow.appendChild(stepsVal); result.appendChild(stepsRow);

  var timeRow = document.createElement('div'); timeRow.className = 'diagnosis-row';
  var timeLbl = document.createElement('div'); timeLbl.className = 'diagnosis-row-label'; timeLbl.textContent = 'Time';
  // Whole seconds since startTimer() stamped state.startTime; 0 if never started.
  var elapsed = state.startTime ? Math.round((Date.now() - state.startTime) / 1000) : 0;
  var timeVal = document.createElement('div'); timeVal.className = 'diagnosis-row-value'; timeVal.textContent = elapsed + 's';
  timeRow.appendChild(timeLbl); timeRow.appendChild(timeVal); result.appendChild(timeRow);

  var rEl = document.getElementById('diagnosisReward');
  // Explicit "+" sign for non-negative rewards; green for >= 0, red otherwise.
  rEl.textContent = (d.totalReward >= 0 ? '+' : '') + d.totalReward.toFixed(2);
  rEl.style.color = d.totalReward >= 0 ? 'var(--emerald)' : 'var(--coral)';
  overlay.classList.add('visible');
}
877
+
878
/* Return the dashboard to its pristine "no episode" state: reset the global
   `state` record, stop the episode timer, clear the action log (restoring the
   idle placeholder), empty the specialists panel, zero every vital readout,
   hide the diagnosis overlay, and reset status/connection badges.
   Fix over previous version: the log panel element is looked up once and
   reused instead of a redundant second getElementById('logBody') call. */
function resetState() {
  state = { step: 0, reward: 0, fixApplied: false, done: false, startTime: null, timerInterval: null };
  stopTimer();
  var logBody = document.getElementById('logBody');
  logBody.textContent = '';
  // Rebuild the idle placeholder inside the cleared log panel.
  var idle = document.createElement('div'); idle.className = 'idle-prompt'; idle.id = 'idlePrompt';
  var txt = document.createElement('div'); txt.className = 'idle-text'; txt.textContent = 'Awaiting incident assignment.\nConnect to a live Stack Doctor environment above.';
  idle.appendChild(txt); logBody.appendChild(idle);
  document.getElementById('specialistsBody').textContent = '';
  document.getElementById('rewardValue').textContent = '0.00'; document.getElementById('rewardValue').className = 'vital-value';
  document.getElementById('fixStatus').textContent = 'Not Applied'; document.getElementById('fixStatus').className = 'vital-value';
  document.getElementById('rootCauseValue').textContent = '\u2014'; document.getElementById('rootCauseValue').className = 'vital-value';
  document.getElementById('diagnosisValue').textContent = 'Pending'; document.getElementById('diagnosisValue').className = 'vital-value';
  document.getElementById('scenarioId').textContent = '\u2014';
  document.getElementById('episodeTime').textContent = '00:00';
  document.getElementById('diagnosisOverlay').classList.remove('visible');
  setStatus('Standby', ''); clearAllLayers(); updateSteps(0);
  setConnStatus('Disconnected', '');
}
897
+
898
+ /* ═══════════════════════════════════════════════
899
+ DEMO — arch_guard_01
900
+ ═══════════════════════════════════════════════ */
901
/* Canned scenario used by the offline fallback when no environment server is
   reachable. Mirrors the server's observation shape: incident metadata,
   per-specialist opinions (the dispatch/kernel specialists are the helpful
   ones here), and pre-baked inspect/followup payloads. */
var DEMO = {
  id: 'arch_guard_01', root_cause: 'arch_guard', correct_fix: 'relax_arch_check',
  incident_ticket: "FlashInfer attention kernel fails to launch on newly provisioned DGX Spark nodes. Error: 'Unsupported GPU architecture sm_121'. Identical model config works on H100 nodes.",
  hardware: 'NVIDIA SM121 (DGX Spark)', model_name: 'DeepSeek-V3-671B', backend: 'FlashInfer 0.4',
  specialist_opinions: {
    runtime: { opinion: "CUDA runtime loaded successfully. No runtime issues detected.", confidence: 0.85 },
    dispatch: { opinion: "Architecture check is blocking kernel dispatch. SM121 is not in the supported set despite being SM90-compatible.", confidence: 0.92 },
    kernel: { opinion: "HMMA m16n8k16 instructions available on SM121. Capability check issue.", confidence: 0.88 },
    loader: { opinion: "Model weights loaded correctly. Weight layout is standard.", confidence: 0.80 },
  },
  inspect_logs: "[FlashInfer] GPU: NVIDIA GH200 (sm_121)\n[FlashInfer] is_supported_arch(121) = False\n[FlashInfer] Architecture check FAILED\n[CUDA] All CUDA operations nominal\n[System] GPU memory: 96GB available",
  inspect_config: "gpu_architecture: sm_121\ncuda_version: 13.0\nflashinfer_version: 0.4.1\nsupported_archs: [70, 75, 80, 86, 89, 90]",
  followup_dispatch: "The dispatch table maps arch -> kernel. SM121 has no entry. Adding sm_12x family to the arch check should resolve this.",
};
915
+
916
+ function sleep(ms) { return new Promise(function(r) { setTimeout(r, ms); }); }
917
+
918
+ /* ══════════════════════════════════════════════
919
+ LIVE ENVIRONMENT CONNECTION
920
+ ══════════════════════════════════════════════ */
921
+ var SERVER = { url: '', ws: null };
922
+
923
+ function getServerUrl() { return document.getElementById('serverUrl').value.replace(/\/$/, ''); }
924
/* Derive the WebSocket endpoint. With an explicit server URL, swap the http
   scheme for ws and append /ws; otherwise fall back to the page's own origin. */
function getWsUrl() {
  var base = getServerUrl();
  if (base) {
    return base.replace(/^http/, 'ws') + '/ws';
  }
  // Same-origin: derive WS URL from current page
  var scheme = location.protocol === 'https:' ? 'wss:' : 'ws:';
  return scheme + '//' + location.host + '/ws';
}
933
/* Update the connection badge text and optional modifier class. */
function setConnStatus(text, cls) {
  var badge = document.getElementById('connStatus');
  badge.textContent = text;
  badge.className = cls ? 'conn-status ' + cls : 'conn-status';
}
937
+
938
+ /* WebSocket-based communication — maintains session state across reset/step */
939
/* WebSocket-based communication — maintains session state across reset/step.
   Opens a socket to getWsUrl(), caches it on SERVER.ws once open, and clears
   the cache on close. Resolves with the socket; rejects on connection error.
   NOTE(review): there is no connect timeout here — a hung handshake leaves the
   returned promise pending forever (the raw path in runLiveModel adds one). */
function wsConnect() {
  return new Promise(function(resolve, reject) {
    var url = getWsUrl();
    var ws = new WebSocket(url);
    ws.onopen = function() { SERVER.ws = ws; resolve(ws); };
    ws.onerror = function(e) { reject(new Error('WebSocket connection failed')); };
    ws.onclose = function() { SERVER.ws = null; };
  });
}
948
+
949
/* Send one {type, data} frame on the shared socket and resolve with the next
   parsed message. Assumes a strict request/response protocol: each call
   replaces ws.onmessage wholesale, so overlapping in-flight sends would steal
   each other's replies — callers must await one send before issuing another.
   Server-reported errors still resolve (type === 'error'); only transport
   problems (not connected, unparseable JSON) reject. */
function wsSend(type, data) {
  return new Promise(function(resolve, reject) {
    if (!SERVER.ws || SERVER.ws.readyState !== WebSocket.OPEN) {
      reject(new Error('WebSocket not connected')); return;
    }
    SERVER.ws.onmessage = function(evt) {
      try {
        var msg = JSON.parse(evt.data);
        resolve(msg); /* Always resolve — caller handles errors */
      }
      catch (e) { reject(new Error('Bad JSON from server')); }
    };
    SERVER.ws.send(JSON.stringify({ type: type, data: data || {} }));
  });
}
964
+
965
/* Close and forget the shared socket; safe to call when already closed. */
function wsClose() {
  if (!SERVER.ws) return;
  SERVER.ws.close();
  SERVER.ws = null;
}
968
+
969
+ /* Convenience wrappers — returns {observation, reward, done} from data envelope */
970
/* Reset the environment over the session socket (connecting lazily first).
   Returns the {observation, reward, done} envelope; throws on a server-side
   error reply. */
async function serverReset(body) {
  if (!SERVER.ws) {
    await wsConnect();
  }
  var reply = await wsSend('reset', body || {});
  if (reply.type !== 'error') {
    return reply.data; /* {observation, reward, done} */
  }
  var reason = (reply.data && reply.data.message) || 'Reset failed';
  throw new Error(reason);
}
976
+
977
/* Execute one environment step with the raw JSON action string.
   WS step format: {type: "step", data: {message: "..."}} — NOT wrapped in an
   action object. Returns the {observation, reward, done} envelope; a
   server-side validation error (invalid target, specialist, etc.) is turned
   into a synthetic -2.0 penalty step instead of throwing. */
async function serverStep(actionMessage) {
  var reply = await wsSend('step', { message: actionMessage });
  if (reply.type !== 'error') {
    return reply.data; /* {observation, reward, done} */
  }
  var reason = (reply.data && reply.data.message) || 'Unknown error';
  return { observation: { output: 'Error: ' + reason }, reward: -2.0, done: false };
}
987
+
988
/* Disable the run button while an episode is in flight. */
function disableButtons() {
  var runBtn = document.getElementById('demoBtn');
  runBtn.disabled = true;
  runBtn.textContent = '\u25CF Running...';
}
992
/* Re-enable the run button and restore its idle label. */
function enableButtons() {
  var runBtn = document.getElementById('demoBtn');
  runBtn.disabled = false;
  runBtn.textContent = '\u25B6 Run Comparison (Untrained \u2192 Trained)';
}
996
+
997
+ /* Map root_cause to a layer name for the architecture diagram */
998
/* Root-cause → architecture-diagram layer. Several causes share one layer
   (e.g. all three backend-related causes light up 'backend'). */
var CAUSE_TO_LAYER = {
  arch_guard: 'backend', backend_whitelist: 'backend', backend_selector: 'backend',
  runtime_loader: 'runtime', driver_compat: 'driver',
  model_config: 'model', weight_layout: 'model',
  memory_oom: 'memory', quantization_error: 'kernel',
  distributed_comm: 'runtime'
};
1005
+
1006
+ /* Parse the environment observation to extract structured info */
1007
/* Normalize a raw environment observation into the flat shape the UI reads.
   Missing string fields default to '', reward to 0, specialists/meta to {};
   remaining/fixUsed/done pass through untouched (may be undefined). */
function parseObs(obs) {
  var view = {};
  view.output = obs.output || '';
  view.ticket = obs.incident_ticket || '';
  view.hardware = obs.hardware || '';
  view.model = obs.model_name || '';
  view.backend = obs.backend || '';
  view.log = obs.log_excerpt || '';
  view.snippet = obs.code_snippet || '';
  view.specialists = obs.specialist_opinions || {};
  view.remaining = obs.steps_remaining;
  view.fixUsed = obs.fix_used;
  view.done = obs.done;
  view.reward = obs.reward || 0;
  view.meta = obs.metadata || {};
  return view;
}
1024
+
1025
/* Run one live episode against the real environment server.
   Flow: reset (stateful WS session) → populate incident card + specialists →
   delegate to the untrained or trained driver → dismiss the overlay.
   Any failure (connect, reset, step) falls back to the scripted offline demo.
   The socket is always closed and the run button re-enabled on exit. */
async function runLive(mode) {
  disableButtons(); resetState(); setConnStatus('Connecting...', 'running');

  try {
    // Step 0: Reset — get a real scenario via WebSocket (stateful session)
    var resetResp = await serverReset({});
    // Tolerate both {observation: {...}} envelopes and bare observations.
    var obs = parseObs(resetResp.observation || resetResp);
    setConnStatus('Connected', 'connected');
    startTimer();
    setStatus('Incident Received', 'warning');

    // Extract scenario ID from metadata
    var scenarioId = (obs.meta && obs.meta.scenario_id) || 'unknown';
    document.getElementById('scenarioId').textContent = scenarioId;
    document.getElementById('detail-model').textContent = obs.model;
    document.getElementById('detail-backend').textContent = obs.backend;
    document.getElementById('detail-driver').textContent = obs.hardware;

    // Populate incident card from real data
    addIncidentCard({
      incident_ticket: obs.ticket,
      hardware: obs.hardware, model_name: obs.model, backend: obs.backend
    });
    populateSpecialists(obs.specialists);
    await sleep(1800);

    if (mode === 'untrained') {
      await runLiveUntrained(obs);
    } else {
      await runLiveTrained(obs);
    }

    // Leave the diagnosis overlay up for 6s, then dismiss.
    await sleep(6000);
    document.getElementById('diagnosisOverlay').classList.remove('visible');
  } catch (e) {
    console.error('Live connection failed:', e);
    addLogEntry({ type: 'submit', label: 'CONNECTION ERROR', reward: 0, text: e.message + '\n' + (e.stack || '') });
    wsClose();
    setConnStatus('Offline mode', 'running');
    // Fall back to the canned DEMO scenario so the dashboard still demos.
    await runOffline(mode);
    await sleep(6000);
    document.getElementById('diagnosisOverlay').classList.remove('visible');
  }
  wsClose();
  enableButtons();
}
1071
+
1072
+ /* ══════════════════════════════════════════════
1073
+ OFFLINE FALLBACK (no server needed)
1074
+ ══════════════════════════════════════════════ */
1075
/* Offline fallback driver: replays the canned DEMO scenario (no server
   needed), populating the incident panels, then handing off to the scripted
   untrained or trained playback. */
async function runOffline(mode) {
  var sc = DEMO;
  startTimer();
  setStatus('Incident Received', 'warning');
  document.getElementById('scenarioId').textContent = sc.id;
  document.getElementById('detail-model').textContent = sc.model_name;
  document.getElementById('detail-backend').textContent = sc.backend;
  document.getElementById('detail-driver').textContent = sc.hardware;
  addIncidentCard(sc); populateSpecialists(sc.specialist_opinions);
  await sleep(1800);
  if (mode === 'untrained') { await runOfflineUntrained(sc); }
  else { await runOfflineTrained(sc); }
}
1088
+
1089
/* Scripted offline playback of an UNTRAINED agent: it submits a blind wrong
   guess with no investigation, and the hard-coded reward breakdown (-11.5)
   mirrors what the real environment would grade for this scenario. */
async function runOfflineUntrained(sc) {
  state.step = 1; updateSteps(1); setStatus('Model Acting', 'warning');
  addLogEntry({ type: 'submit', label: 'Blind Submit (no investigation)', reward: 0,
    text: 'Model skips investigation.\nOutput: [{"type":"submit","root_cause":"runtime_loader","fix":"fix_runtime_path","justification":"maybe"}]' });
  await sleep(1800);
  document.getElementById('rootCauseValue').textContent = 'runtime_loader';
  document.getElementById('rootCauseValue').className = 'vital-value coral';
  setLayerState('runtime', 'identified');
  var totalReward = -11.5;
  addLogEntry({ type: 'reward_breakdown', label: 'Reward Breakdown', reward: totalReward,
    text: 'Root cause: runtime_loader \u2717 (expected arch_guard) \u2192 -8.0\nFix: fix_runtime_path \u2717 (expected relax_arch_check) \u2192 -2.0\nNo investigation \u2192 -1.0\nJustification too short \u2192 -0.5' });
  updateReward(totalReward);
  state.done = true; setStatus('Diagnosis Submitted', 'error');
  document.getElementById('diagnosisValue').textContent = '\u2717 Incorrect';
  document.getElementById('diagnosisValue').className = 'vital-value coral';
  // Grade the specialists: dispatch/kernel were the helpful ones in DEMO.
  markSpecialist('runtime', false); markSpecialist('dispatch', true);
  markSpecialist('kernel', true); markSpecialist('loader', false);
  await sleep(1500); stopTimer();
  showDiagnosis({ rcCorrect: false, fixCorrect: false, rootCause: 'runtime_loader', fix: 'fix_runtime_path',
    correctRc: 'arch_guard', correctFix: 'relax_arch_check', totalReward: state.reward });
}
1110
+
1111
/* Scripted offline playback of a TRAINED agent: inspect logs → inspect
   config → query the dispatch specialist → apply the correct fix → submit.
   Rewards (-0.25 per probe, +3.0 fix, +19.0 submit) are hard-coded to mirror
   the environment's grading for the DEMO scenario. */
async function runOfflineTrained(sc) {
  state.step = 1; updateSteps(1); setStatus('Investigating', 'warning');
  setLayerState('runtime', 'scanning');
  addLogEntry({ type: 'inspect', label: 'Inspect Logs', reward: -0.25, text: sc.inspect_logs });
  updateReward(-0.25); await sleep(2200);

  state.step = 2; updateSteps(2);
  setLayerState('runtime', ''); setLayerState('backend', 'scanning');
  addLogEntry({ type: 'inspect', label: 'Inspect Config', reward: -0.25, text: sc.inspect_config });
  updateReward(-0.25); await sleep(2000);

  state.step = 3; updateSteps(3);
  setLayerState('backend', 'identified'); highlightSpecialist('dispatch');
  addLogEntry({ type: 'ask_specialist', label: 'Query: Dispatch', reward: -0.25, text: sc.followup_dispatch });
  updateReward(-0.25);
  document.getElementById('rootCauseValue').textContent = 'arch_guard';
  document.getElementById('rootCauseValue').className = 'vital-value amber';
  await sleep(2200);

  state.step = 4; updateSteps(4); setStatus('Applying Fix', 'warning');
  state.fixApplied = true; setLayerState('backend', 'resolved');
  addLogEntry({ type: 'apply_fix', label: 'Apply Fix: relax_arch_check', reward: 3.0, text: 'Fix applied successfully. Systems recovering.' });
  updateReward(3.0);
  document.getElementById('fixStatus').textContent = '\u2713 Applied';
  document.getElementById('fixStatus').className = 'vital-value emerald';
  // Cascade the "resolved" state across the diagram layers, 300ms apart.
  ['backend', 'runtime', 'model', 'memory', 'driver'].forEach(function(l, i) {
    setTimeout(function() { setLayerState(l, 'resolved'); }, i * 300);
  }); await sleep(2000);

  state.step = 5; updateSteps(5); state.done = true;
  setStatus('Diagnosis Submitted', 'success');
  addLogEntry({ type: 'submit', label: 'Submit Diagnosis', reward: 19.0,
    text: 'Root cause: arch_guard \u2713\nFix: relax_arch_check \u2713\nJustification: Logs show sm_121 rejected by arch check. Dispatch confirmed SM121 supports HMMA. Config missing sm_12x in supported_archs.' });
  updateReward(19.0);
  document.getElementById('diagnosisValue').textContent = '\u2713 Correct';
  document.getElementById('diagnosisValue').className = 'vital-value emerald';
  markSpecialist('runtime', false); markSpecialist('dispatch', true);
  markSpecialist('kernel', true); markSpecialist('loader', false);
  await sleep(1500); stopTimer();
  showDiagnosis({ rcCorrect: true, fixCorrect: true, rootCause: 'arch_guard', fix: 'relax_arch_check',
    correctRc: 'arch_guard', correctFix: 'relax_arch_check', totalReward: state.reward });
}
1153
+
1154
+ /* ── UNTRAINED: blind submit, no investigation ── */
1155
/* UNTRAINED driver against the live environment: submits a deliberately
   wrong blind guess and displays the environment's real grading output. */
async function runLiveUntrained(initObs) {
  setStatus('Model Acting', 'warning');
  state.step = 1; updateSteps(1);

  // Untrained model skips all investigation — just submits a random wrong guess
  addLogEntry({ type: 'inspect', label: 'Untrained Model Behavior', reward: 0,
    text: 'Model receives incident but skips investigation.\nNo logs read. No config checked. No specialists queried.\nImmediately submits a blind guess...' });
  await sleep(2000);

  // Send a deliberately wrong submit to the real environment
  state.step = 2; updateSteps(2);
  var stepResp = await serverStep('{"type":"submit","root_cause":"runtime_loader","fix":"fix_runtime_path","justification":"idk"}');
  var obs = parseObs(stepResp.observation || stepResp);
  var stepReward = stepResp.reward !== undefined ? stepResp.reward : obs.reward;
  setConnStatus('Connected', 'connected');

  // Parse the real environment response.
  // NOTE(review): indexOf('CORRECT') also matches inside 'INCORRECT', so these
  // heuristics rely on the env marking wrong answers with "WRONG (was: ...)"
  // rather than "INCORRECT" — confirm against the server's output format.
  var outputText = obs.output;
  var rcCorrect = outputText.indexOf('CORRECT') !== -1 && outputText.indexOf('Root cause') !== -1
    && outputText.split('Root cause')[1].split('\n')[0].indexOf('CORRECT') !== -1;
  var fixCorrect = outputText.indexOf('CORRECT') !== -1 && outputText.indexOf('Fix:') !== -1
    && outputText.split('Fix:')[1].split('\n')[0].indexOf('CORRECT') !== -1;

  // Extract actual correct answers from output
  var correctRc = ''; var correctFix = '';
  var rcMatch = outputText.match(/WRONG \(was: (\w+)\)/);
  if (rcMatch) correctRc = rcMatch[1];
  var fixMatch = outputText.match(/Fix:.*WRONG \(was: (\w+)\)/);
  if (fixMatch) correctFix = fixMatch[1];

  document.getElementById('rootCauseValue').textContent = 'runtime_loader';
  document.getElementById('rootCauseValue').className = 'vital-value coral';

  addLogEntry({ type: 'submit', label: 'Blind Submit', reward: stepReward,
    text: outputText });
  updateReward(stepReward);

  state.done = true;
  // NOTE(review): this reads obs.reward while the log entry above used
  // stepReward (the envelope-level reward) — likely meant to be stepReward.
  setStatus('Diagnosis Submitted', obs.reward >= 0 ? 'success' : 'error');
  document.getElementById('diagnosisValue').textContent = rcCorrect ? '\u2713 Correct' : '\u2717 Incorrect';
  document.getElementById('diagnosisValue').className = 'vital-value ' + (rcCorrect ? 'emerald' : 'coral');

  await sleep(1500); stopTimer();
  showDiagnosis({
    rcCorrect: rcCorrect, fixCorrect: fixCorrect,
    rootCause: 'runtime_loader', fix: 'fix_runtime_path',
    correctRc: correctRc || 'unknown', correctFix: correctFix || 'unknown',
    totalReward: state.reward
  });
}
1205
+
1206
+ /* ── TRAINED: investigate, then diagnose ── */
1207
/* TRAINED driver against the live environment: inspect logs, inspect config,
   query the dispatch specialist, infer the root cause from the accumulated
   observation text (inferRootCause / RC_TO_FIX are defined elsewhere in this
   file), apply the inferred fix, then submit the diagnosis and render the
   environment's real grading. */
async function runLiveTrained(initObs) {
  setStatus('Investigating', 'warning');

  // Step 1: Inspect logs
  state.step = 1; updateSteps(1);
  setLayerState('runtime', 'scanning');
  var step1 = await serverStep('{"type":"inspect","target":"logs"}');
  var obs1 = parseObs(step1.observation || step1);
  // Prefer the envelope-level reward; fall back to the observation's.
  var rew1 = step1.reward !== undefined ? step1.reward : obs1.reward;
  addLogEntry({ type: 'inspect', label: 'Inspect Logs', reward: rew1, text: obs1.output });
  updateReward(rew1);
  await sleep(2200);

  // Step 2: Inspect config
  state.step = 2; updateSteps(2);
  setLayerState('runtime', ''); setLayerState('backend', 'scanning');
  var step2 = await serverStep('{"type":"inspect","target":"config"}');
  var obs2 = parseObs(step2.observation || step2);
  var rew2 = step2.reward !== undefined ? step2.reward : obs2.reward;
  addLogEntry({ type: 'inspect', label: 'Inspect Config', reward: rew2, text: obs2.output });
  updateReward(rew2);
  await sleep(2000);

  // Step 3: Query a specialist — pick dispatch as a reasonable investigation choice
  state.step = 3; updateSteps(3);
  setLayerState('backend', 'identified');
  highlightSpecialist('dispatch');
  var step3 = await serverStep('{"type":"ask_specialist","specialist":"dispatch"}');
  var obs3 = parseObs(step3.observation || step3);
  var rew3 = step3.reward !== undefined ? step3.reward : obs3.reward;
  addLogEntry({ type: 'ask_specialist', label: 'Query: Dispatch', reward: rew3, text: obs3.output });
  updateReward(rew3);
  await sleep(2200);

  // Step 4: Smart submit — analyze the logs/config to guess the right answer
  // For the demo, we use the scenario hints from the environment output to make an informed guess
  // A real trained model would parse the observations and infer the root cause
  state.step = 4; updateSteps(4); setStatus('Diagnosing', 'warning');

  // Attempt to extract the root cause from clues in the observations
  var allText = obs1.output + ' ' + obs2.output + ' ' + obs3.output;
  var guessRc = inferRootCause(allText);
  var guessFix = RC_TO_FIX[guessRc] || 'switch_backend';
  var justification = 'Logs and config analysis indicates ' + guessRc + '. Dispatch specialist confirmed. Applying ' + guessFix + '.';

  document.getElementById('rootCauseValue').textContent = guessRc;
  document.getElementById('rootCauseValue').className = 'vital-value amber';

  // Apply fix first
  var step4 = await serverStep(JSON.stringify({ type: 'apply_fix', fix: guessFix }));
  var obs4 = parseObs(step4.observation || step4);
  var rew4 = step4.reward !== undefined ? step4.reward : obs4.reward;
  // Positive reward is treated as "the fix took"; drives the UI only.
  var fixWorked = rew4 > 0;
  addLogEntry({ type: 'apply_fix', label: 'Apply Fix: ' + guessFix, reward: rew4, text: obs4.output });
  updateReward(rew4);
  document.getElementById('fixStatus').textContent = fixWorked ? '\u2713 Applied' : '\u2717 Failed';
  document.getElementById('fixStatus').className = 'vital-value ' + (fixWorked ? 'emerald' : 'coral');

  if (fixWorked) {
    var layers = ['backend', 'runtime', 'model', 'memory', 'driver'];
    layers.forEach(function(l, i) { setTimeout(function() { setLayerState(l, 'resolved'); }, i * 300); });
  }
  await sleep(2000);

  // Step 5: Submit diagnosis (skipped if apply_fix already ended the episode)
  var isDone4 = step4.done !== undefined ? step4.done : obs4.done;
  if (!isDone4) {
    state.step = 5; updateSteps(5);
    var step5 = await serverStep(JSON.stringify({ type: 'submit', root_cause: guessRc, fix: guessFix, justification: justification }));
    var obs5 = parseObs(step5.observation || step5);
    var rew5 = step5.reward !== undefined ? step5.reward : obs5.reward;
    addLogEntry({ type: 'submit', label: 'Submit Diagnosis', reward: rew5, text: obs5.output });
    updateReward(rew5);

    // NOTE(review): indexOf('CORRECT') also matches inside 'INCORRECT'; the
    // final verdict below additionally requires the WRONG-regex NOT to match,
    // which is what actually guards against that — confirm server format.
    var outputText = obs5.output;
    var rcCorrect = outputText.indexOf('Root cause') !== -1 && outputText.split('Root cause')[1].split('\n')[0].indexOf('CORRECT') !== -1;
    var fixCorrect2 = outputText.indexOf('Fix:') !== -1 && outputText.split('Fix:')[1].split('\n')[0].indexOf('CORRECT') !== -1;

    state.done = true;
    setStatus('Diagnosis Submitted', rcCorrect ? 'success' : 'error');
    document.getElementById('diagnosisValue').textContent = rcCorrect ? '\u2713 Correct' : '\u2717 Incorrect';
    document.getElementById('diagnosisValue').className = 'vital-value ' + (rcCorrect ? 'emerald' : 'coral');

    // Default the "expected" answers to our guesses; override from the
    // server's "WRONG (was: ...)" annotations when present.
    var correctRc = guessRc; var correctFix = guessFix;
    var rcWrong = outputText.match(/WRONG \(was: (\w+)\)/);
    if (rcWrong) correctRc = rcWrong[1];
    var fixWrong = outputText.match(/Fix:.*WRONG \(was: (\w+)\)/);
    if (fixWrong) correctFix = fixWrong[1];

    await sleep(1500); stopTimer();
    showDiagnosis({
      rcCorrect: rcCorrect && !rcWrong, fixCorrect: fixCorrect2 && !fixWrong,
      rootCause: guessRc, fix: guessFix,
      correctRc: correctRc, correctFix: correctFix,
      totalReward: state.reward
    });
  } else {
    state.done = true; stopTimer();
  }
}
1307
+
1308
+ /* ══════════════════════════════════════════════
1309
+ LIVE MODEL INFERENCE — real Qwen 1.5B via MLX
1310
+ ══════════════════════════════════════════════ */
1311
+ /* Inference URL: same origin on HF Spaces, localhost:8001 locally */
1312
+ var INFERENCE_URL = (location.hostname === 'localhost' || location.hostname === '127.0.0.1') ? 'http://localhost:8001' : '';
1313
+
1314
/* Extract a JSON action list from raw model output.
   Strategy: strip <think>...</think> chain-of-thought blocks, try parsing the
   outermost [...] slice as an array, then fall back to parsing the whole text
   (wrapping a lone object in a one-element array). Returns an array of plain
   objects, or null when nothing parseable is found.
   Fix: the previous filter used `typeof x === 'object'`, which admits null
   (typeof null === 'object') — so `[1, {...}, null]` kept the null, and the
   bare input "null" returned [null]. Null entries are now rejected. */
function extractActionsJS(text) {
  text = text.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
  // An action must be a real (non-null) object; arrays of primitives are dropped.
  var isAction = function(x) { return x !== null && typeof x === 'object'; };
  var start = text.indexOf('['), end = text.lastIndexOf(']');
  if (start !== -1 && end > start) {
    try {
      var arr = JSON.parse(text.slice(start, end + 1));
      if (Array.isArray(arr)) return arr.filter(isAction);
    } catch (e) {}
  }
  try {
    var parsed = JSON.parse(text);
    if (Array.isArray(parsed)) return parsed.filter(isAction);
    if (isAction(parsed)) return [parsed];
  } catch (e) {}
  return null;
}
1323
+
1324
/* System prompt for the BASE (untrained) model: lists the action schema but
   gives no strategy guidance. */
var UNTRAINED_SYSTEM = 'You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\nYou receive an incident ticket with hardware/model/backend context, log excerpts, and specialist opinions.\nSome specialists may be wrong. Output a JSON array of actions:\n {"type":"inspect","target":"logs|config|snippet|metrics"}\n {"type":"ask_specialist","specialist":"runtime|dispatch|kernel|loader"}\n {"type":"apply_fix","fix":"<fix_name>"}\n {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<why>"}';

/* System prompt for the TRAINED (GRPO) model: same schema plus the full fix
   and root-cause vocabularies, an investigate-before-submitting policy, and a
   worked example output. */
var TRAINED_SYSTEM = 'You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\nYou are methodical: first inspect logs and config, then query specialists to cross-verify (some lie), then apply a fix and submit.\n\nAvailable actions (output as a JSON array):\n {"type":"inspect","target":"logs"} or "config" or "snippet" or "metrics"\n {"type":"ask_specialist","specialist":"runtime"} or "dispatch" or "kernel" or "loader"\n {"type":"apply_fix","fix":"<name>"} — available fixes: add_whitelist_entry, fix_comm_config, fix_quantization, fix_runtime_path, fix_weight_mapping, relax_arch_check, switch_backend, tune_memory_config, update_driver_config, update_model_config\n {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<detailed reasoning>"}\n\nAvailable root causes: arch_guard, backend_selector, backend_whitelist, distributed_comm, driver_compat, memory_oom, model_config, quantization_error, runtime_loader, weight_layout\n\nIMPORTANT: Pick ONE target per inspect, ONE specialist per query. Investigate before submitting. Give a detailed justification.\n\nExample output:\n[{"type":"inspect","target":"logs"},{"type":"inspect","target":"config"},{"type":"ask_specialist","specialist":"kernel"},{"type":"apply_fix","fix":"relax_arch_check"},{"type":"submit","root_cause":"arch_guard","fix":"relax_arch_check","justification":"Logs show architecture check failure for SM90 on the backend. Config confirms the guard is enabled. Kernel specialist confirmed this is not a kernel issue. Relaxing the arch check resolves the incompatibility."}]';
1327
+
1328
/* Top-level demo entry: run the untrained model and then the trained model on
   the SAME randomly chosen scenario (pinned by id so the comparison is fair),
   with a short interlude between the two runs. */
async function runComparison() {
  disableButtons();

  // 1. Pick a random scenario ID to pin both runs to the same problem
  var scenarioIds = ['arch_guard_01','arch_guard_02','backend_whitelist_01','backend_whitelist_02','runtime_loader_01','runtime_loader_02','backend_selector_01'];
  var scenarioId = scenarioIds[Math.floor(Math.random() * scenarioIds.length)];

  // 2. Run untrained
  await runLiveModel('untrained', scenarioId);

  // 3. Pause between runs
  await sleep(3000);
  addLogEntry({ type: 'inspect', label: '--- Now running TRAINED model on same scenario ---', reward: 0, text: 'Same incident: ' + scenarioId });
  await sleep(2000);
  resetState();

  // 4. Run trained on same scenario
  await runLiveModel('trained', scenarioId);

  enableButtons();
}
1349
+
1350
+ async function runLiveModel(mode, scenarioId) {
1351
+ resetState(); setConnStatus('Connecting...', 'running');
1352
+ var isTrained = mode === 'trained';
1353
+ var ws = null;
1354
+
1355
+ try {
1356
+ // 1. Raw WebSocket connect (derive WS URL from server URL or same-origin)
1357
+ var base = getServerUrl();
1358
+ var wsUrl;
1359
+ if (!base) {
1360
+ var proto = location.protocol === 'https:' ? 'wss:' : 'ws:';
1361
+ wsUrl = proto + '//' + location.host + '/ws';
1362
+ } else {
1363
+ wsUrl = base.replace(/^http/, 'ws') + '/ws';
1364
+ }
1365
+ ws = new WebSocket(wsUrl);
1366
+ await new Promise(function(res, rej) {
1367
+ ws.onopen = res;
1368
+ ws.onerror = function() { rej(new Error('WebSocket connect failed')); };
1369
+ setTimeout(function() { rej(new Error('WebSocket timeout')); }, 5000);
1370
+ });
1371
+
1372
+ function wsSendRaw(type, data) {
1373
+ return new Promise(function(res, rej) {
1374
+ ws.onmessage = function(e) { res(JSON.parse(e.data)); };
1375
+ ws.send(JSON.stringify({type: type, data: data || {}}));
1376
+ setTimeout(function() { rej(new Error('WS ' + type + ' timeout')); }, 15000);
1377
+ });
1378
+ }
1379
+
1380
+ // 2. Reset with pinned scenario ID (same scenario for both runs)
1381
+ var resetData = scenarioId ? { scenario_id: scenarioId } : {};
1382
+ var resetMsg = await wsSendRaw('reset', resetData);
1383
+ if (resetMsg.type === 'error') throw new Error('Reset: ' + JSON.stringify(resetMsg.data));
1384
+ var obs = resetMsg.data.observation;
1385
+ setConnStatus('Connected', 'connected');
1386
+ startTimer();
1387
+ setStatus('Incident Received', 'warning');
1388
+
1389
+ document.getElementById('scenarioId').textContent = 'live';
1390
+ document.getElementById('detail-model').textContent = obs.model_name || '';
1391
+ document.getElementById('detail-backend').textContent = obs.backend || '';
1392
+ document.getElementById('detail-driver').textContent = obs.hardware || '';
1393
+
1394
+ addIncidentCard({ incident_ticket: obs.incident_ticket, hardware: obs.hardware, model_name: obs.model_name, backend: obs.backend });
1395
+ populateSpecialists(obs.specialist_opinions || {});
1396
+ await sleep(1000);
1397
+
1398
+ // 3. Build prompt from raw observation
1399
+ var opsStr = '';
1400
+ var specs = obs.specialist_opinions || {};
1401
+ for (var name in specs) {
1402
+ var o = specs[name];
1403
+ opsStr += ' ' + name + ': ' + o.opinion + ' (confidence: ' + o.confidence + ')\n';
1404
+ }
1405
+ var userPrompt = 'INCIDENT: ' + obs.incident_ticket + '\nHardware: ' + obs.hardware + ' | Model: ' + obs.model_name + ' | Backend: ' + obs.backend + '\nLOG:\n' + obs.log_excerpt + '\nSPECIALISTS:\n' + opsStr + '\nInvestigate and submit your diagnosis as a JSON action array.';
1406
+
1407
+ // 4. Call inference server
1408
+ setStatus(isTrained ? 'Trained Model Thinking...' : 'Untrained Model Thinking...', 'warning');
1409
+ addLogEntry({ type: 'inspect', label: 'Qwen 1.5B ' + (isTrained ? '(GRPO)' : '(Base)') + ' Generating...', reward: 0, text: 'Sending scenario to local MLX model for inference...' });
1410
+
1411
+ var genResp = await fetch(INFERENCE_URL + '/generate', {
1412
+ method: 'POST', headers: { 'Content-Type': 'application/json' },
1413
+ body: JSON.stringify({ prompt: userPrompt, max_tokens: 512, mode: isTrained ? 'trained' : 'untrained', system: isTrained ? TRAINED_SYSTEM : UNTRAINED_SYSTEM })
1414
+ }).then(function(r) { return r.json(); });
1415
+
1416
+ addLogEntry({ type: 'inspect', label: 'Model Output (' + genResp.gen_time.toFixed(1) + 's)', reward: 0, text: genResp.text.slice(0, 500) });
1417
+ await sleep(500);
1418
+
1419
+ // 5. Parse actions
1420
+ var actions = extractActionsJS(genResp.text);
1421
+ if (!actions || actions.length === 0) {
1422
+ addLogEntry({ type: 'submit', label: 'Parse Failed', reward: -5, text: 'Could not parse model output as JSON actions.\nRaw: ' + genResp.text.slice(0, 200) });
1423
+ updateReward(-5);
1424
+ state.done = true; setStatus('Parse Error', 'error'); stopTimer();
1425
+ ws.close(); enableButtons(); return;
1426
+ }
1427
+
1428
+ setStatus('Executing Actions', 'warning');
1429
+
1430
+ // 6. Execute each action via raw WebSocket
1431
+ var done = false;
1432
+ var lastOutput = '';
1433
+ var totalReward = 0;
1434
+ for (var i = 0; i < actions.length && !done; i++) {
1435
+ var action = actions[i];
1436
+ var aType = action.type || '?';
1437
+ state.step = i + 1; updateSteps(i + 1);
1438
+
1439
+ // Visual feedback on architecture diagram
1440
+ if (aType === 'inspect') setLayerState('runtime', 'scanning');
1441
+ if (aType === 'ask_specialist') highlightSpecialist(action.specialist || 'dispatch');
1442
+ if (aType === 'apply_fix') setLayerState('backend', 'identified');
1443
+ if (aType === 'submit') { setStatus('Diagnosing', 'warning'); document.getElementById('rootCauseValue').textContent = action.root_cause || '?'; }
1444
+
1445
+ var stepMsg = await wsSendRaw('step', { message: JSON.stringify(action) });
1446
+
1447
+ var rew = 0;
1448
+ var stepOutput = '';
1449
+ if (stepMsg.type === 'error') {
1450
+ rew = -2;
1451
+ stepOutput = 'Error: ' + ((stepMsg.data && stepMsg.data.message) || 'unknown');
1452
+ } else {
1453
+ rew = stepMsg.data.reward || 0;
1454
+ done = stepMsg.data.done || false;
1455
+ stepOutput = (stepMsg.data.observation && stepMsg.data.observation.output) || '';
1456
+ }
1457
+ totalReward += rew;
1458
+
1459
+ var label = aType;
1460
+ if (aType === 'inspect') label = 'Inspect: ' + (action.target || '?');
1461
+ if (aType === 'ask_specialist') label = 'Query: ' + (action.specialist || '?');
1462
+ if (aType === 'apply_fix') label = 'Fix: ' + (action.fix || '?');
1463
+ if (aType === 'submit') label = 'Submit Diagnosis';
1464
+
1465
+ addLogEntry({ type: aType, label: label, reward: rew, text: stepOutput });
1466
+ updateReward(rew);
1467
+ lastOutput = stepOutput;
1468
+
1469
+ // Green up layers on successful fix
1470
+ if (aType === 'apply_fix' && rew > 0) {
1471
+ ['backend', 'runtime', 'model', 'memory', 'driver'].forEach(function(l, idx) {
1472
+ setTimeout(function() { setLayerState(l, 'resolved'); }, idx * 300);
1473
+ });
1474
+ }
1475
+ await sleep(1200);
1476
+ }
1477
+
1478
+ // 7. Final diagnosis
1479
+ state.done = true; stopTimer();
1480
+ if (lastOutput.indexOf('DIAGNOSIS SUBMITTED') !== -1 || lastOutput.indexOf('Root cause') !== -1 || done) {
1481
+ var rcCorrect = lastOutput.indexOf('CORRECT') !== -1 && lastOutput.indexOf('Root cause') !== -1 && lastOutput.split('Root cause')[1].split('\n')[0].indexOf('CORRECT') !== -1;
1482
+ setStatus('Diagnosis Submitted', totalReward > 0 ? 'success' : 'error');
1483
+ document.getElementById('diagnosisValue').textContent = totalReward > 0 ? '\u2713 Correct' : '\u2717 Incorrect';
1484
+ document.getElementById('diagnosisValue').className = 'vital-value ' + (totalReward > 0 ? 'emerald' : 'coral');
1485
+
1486
+ var submitAction = actions.find(function(a) { return a.type === 'submit'; }) || {};
1487
+ var correctRc = submitAction.root_cause || '?';
1488
+ var correctFix = submitAction.fix || '?';
1489
+ var rcWrong = lastOutput.match(/WRONG \(was: (\w+)\)/);
1490
+ if (rcWrong) correctRc = rcWrong[1];
1491
+ var fixWrong = lastOutput.match(/Fix:.*WRONG \(was: (\w+)\)/);
1492
+ if (fixWrong) correctFix = fixWrong[1];
1493
+
1494
+ showDiagnosis({
1495
+ rcCorrect: rcCorrect && !rcWrong, fixCorrect: !fixWrong,
1496
+ rootCause: submitAction.root_cause || '?', fix: submitAction.fix || '?',
1497
+ correctRc: correctRc, correctFix: correctFix,
1498
+ totalReward: state.reward
1499
+ });
1500
+
1501
+ await sleep(8000);
1502
+ document.getElementById('diagnosisOverlay').classList.remove('visible');
1503
+ } else {
1504
+ setStatus('Episode Ended', totalReward > 0 ? 'success' : 'error');
1505
+ }
1506
+
1507
+ } catch (e) {
1508
+ console.error('Live model error:', e);
1509
+ setConnStatus('Error: ' + e.message, 'error');
1510
+ setStatus('Error', 'error');
1511
+ alert('ERROR: ' + e.message);
1512
+ addLogEntry({ type: 'submit', label: 'ERROR', reward: 0, text: e.message + '\n' + (e.stack || '') });
1513
+ }
1514
+ if (ws) ws.close();
1515
+ enableButtons();
1516
+ }
1517
+
1518
+ /* Root cause inference from observation text — pattern matching on known signatures */
1519
/* Maps each diagnosable root-cause id to the single fix action that
   resolves it — keys correspond to the `apply_fix` choices accepted
   by the environment. */
var RC_TO_FIX = {
  arch_guard: 'relax_arch_check',
  backend_whitelist: 'add_whitelist_entry',
  runtime_loader: 'fix_runtime_path',
  backend_selector: 'switch_backend',
  model_config: 'update_model_config',
  weight_layout: 'fix_weight_mapping',
  memory_oom: 'tune_memory_config',
  quantization_error: 'fix_quantization',
  distributed_comm: 'fix_comm_config',
  driver_compat: 'update_driver_config'
};
1526
+
1527
/**
 * Heuristic root-cause classifier: scans observation text (case-
 * insensitively) for known failure signatures and returns the id of
 * the first matching rule. Rules are checked in a fixed priority
 * order; 'arch_guard' is returned when nothing matches.
 */
function inferRootCause(text) {
  var lowered = text.toLowerCase();
  var has = function (needle) { return lowered.indexOf(needle) !== -1; };

  // Ordered rule table — first predicate that fires wins.
  var rules = [
    ['arch_guard', function () { return has('arch') && (has('guard') || has('unsupported') || has('architecture check')); }],
    ['backend_whitelist', function () { return has('whitelist') || has('not in supported'); }],
    ['runtime_loader', function () { return has('runtime') && (has('loader') || has('path') || has('dlopen')); }],
    ['backend_selector', function () { return has('backend') && (has('selector') || has('fallback')); }],
    ['model_config', function () { return has('model') && has('config') && (has('mismatch') || has('invalid')); }],
    ['weight_layout', function () { return has('weight') && (has('layout') || has('shape')); }],
    ['memory_oom', function () { return has('oom') || has('out of memory') || (has('memory') && has('exceed')); }],
    ['quantization_error', function () { return has('quantiz') || (has('quant') && has('error')); }],
    ['distributed_comm', function () { return has('nccl') || has('distributed') || has('comm'); }],
    ['driver_compat', function () { return has('driver') && (has('compat') || has('version')); }]
  ];

  for (var i = 0; i < rules.length; i++) {
    if (rules[i][1]()) return rules[i][0];
  }
  return 'arch_guard'; // fallback when no signature matches
}
1541
+
1542
/* On page load, poll the inference server's /model_status endpoint
   and reflect the result in the #modelStatus badge; keeps polling
   every 3s while the model is still loading. */
(function checkModelStatus() {
  var badge = document.getElementById('modelStatus');
  var base = INFERENCE_URL || '';

  // Update badge text and CSS state class in one place.
  function show(text, cls) {
    badge.textContent = text;
    badge.className = 'conn-status ' + cls;
  }

  fetch(base + '/model_status')
    .then(function (r) { return r.json(); })
    .then(function (status) {
      if (status.ready) {
        show('Model: ready', 'connected');
      } else if (status.error) {
        show('Model: error', 'error');
      } else {
        show('Model: loading...', 'running');
        setTimeout(checkModelStatus, 3000); // not ready yet — poll again
      }
    })
    .catch(function () {
      /* Local dev without /model_status endpoint — assume ready */
      show('Model: ready', 'connected');
    });
})();
1564
+ </script>
1565
+ </body>
1566
+ </html>