OGrohit committed on
Commit
fbb0927
·
1 Parent(s): cb16948

Day 4: grader system complete

Browse files
scripts/run_grader.py CHANGED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Manual grader testing CLI.
3
+ Run a simulated episode and score it with the official grader.
4
+
5
+ Usage:
6
+ python scripts/run_grader.py --task single_crash --agent correct
7
+ python scripts/run_grader.py --task cascading_failure --agent wrong
8
+ python scripts/run_grader.py --task silent_degradation --agent correct
9
+ python scripts/run_grader.py --all
10
+ """
11
+ import argparse
12
+ import sys
13
+ import os
14
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
+
16
+ from server.environment import LogTriageEnvironment
17
+ from server.models import TriageAction
18
+ from server.graders import score_episode
19
+
20
+ # ─── CORRECT AGENT SCRIPTS ────────────────────────────────────────────────────
21
+
22
# Scripted "ideal" episodes. Each task maps to an ordered list of
# (action_type, value, confidence) steps that is expanded into TriageAction
# objects, in the same order, when the module is imported.
CORRECT_ACTIONS = {
    task_name: [
        TriageAction(action_type=kind, value=target, confidence=certainty)
        for kind, target, certainty in script
    ]
    for task_name, script in {
        "single_crash": [
            ("classify_severity", "P1", 0.95),
            ("identify_root_cause", "payment-service", 0.90),
            ("remediate", "restart:payment-service", 0.85),
            ("resolve", "resolved", 1.00),
        ],
        "cascading_failure": [
            ("classify_severity", "P1", 0.90),
            ("identify_root_cause", "user-db", 0.85),
            ("remediate", "kill-query:user-db", 0.90),
            ("resolve", "resolved", 1.00),
        ],
        "silent_degradation": [
            ("request_more_logs", "payment-db", 0.70),
            ("classify_severity", "P2", 0.80),
            ("identify_root_cause", "payment-db", 0.85),
            ("remediate", "flush-cache:payment-db", 0.80),
            ("resolve", "resolved", 1.00),
        ],
    }.items()
}
43
+
44
+ # ─── WRONG AGENT SCRIPTS ──────────────────────────────────────────────────────
45
+
46
# Scripted "bad" episodes used as the low-score baseline. Same layout as the
# correct scripts: task -> ordered (action_type, value, confidence) steps,
# expanded into TriageAction objects at import time.
WRONG_ACTIONS = {
    task_name: [
        TriageAction(action_type=kind, value=target, confidence=certainty)
        for kind, target, certainty in script
    ]
    for task_name, script in {
        "single_crash": [
            ("classify_severity", "P3", 0.50),
            ("identify_root_cause", "api-gateway", 0.50),
            ("remediate", "restart:api-gateway", 0.50),
            ("resolve", "resolved", 1.00),
        ],
        "cascading_failure": [
            ("classify_severity", "P2", 0.60),
            ("identify_root_cause", "api-gateway", 0.60),
            ("remediate", "restart:api-gateway", 0.60),
            ("resolve", "resolved", 1.00),
        ],
        "silent_degradation": [
            ("classify_severity", "P1", 0.90),
            ("identify_root_cause", "api-gateway", 0.70),
            ("remediate", "restart:api-gateway", 0.70),
            ("resolve", "resolved", 1.00),
        ],
    }.items()
}
66
+
67
+
68
def run_test(task_id: str, agent_type: str, seed: int = 42) -> dict:
    """Play one scripted episode and return the official grader result.

    Args:
        task_id: Task to run; used to reset the environment and as the key
            into the scripted action tables.
        agent_type: ``"correct"`` replays CORRECT_ACTIONS, ``"wrong"``
            replays WRONG_ACTIONS.
        seed: Seed forwarded to ``env.reset``.

    Returns:
        The dict produced by ``score_episode`` for the final episode state.

    Raises:
        ValueError: If ``agent_type`` is neither ``"correct"`` nor ``"wrong"``.
            Previously any other value was silently scored as the wrong agent.
    """
    scripts_by_agent = {"correct": CORRECT_ACTIONS, "wrong": WRONG_ACTIONS}
    if agent_type not in scripts_by_agent:
        raise ValueError(
            f"Unknown agent type '{agent_type}'. "
            f"Valid agent types: {list(scripts_by_agent)}"
        )

    env = LogTriageEnvironment()
    env.reset(task_id=task_id, seed=seed)

    for action in scripts_by_agent[agent_type][task_id]:
        # Stop replaying as soon as the environment reports the episode is over.
        if env.step(action).done:
            break

    return score_episode(task_id, env.state)
82
+
83
+
84
def print_result(task_id: str, agent_type: str, result: dict) -> None:
    """Print a human-readable report for one grader result.

    Args:
        task_id: Task name shown in the report header.
        agent_type: Agent label shown in the report header.
        result: Grader result dict. Must contain ``score``, ``steps_taken``,
            ``max_steps`` and ``resolved``; ``breakdown`` is optional and is
            treated as empty when missing.
    """
    score = result["score"]
    rule = "=" * 60  # built once, used for the top and bottom border

    print(f"\n{rule}")
    print(f"Task: {task_id}")
    print(f"Agent: {agent_type}")
    print(f"Score: {score:.4f}")
    print(f"Steps: {result['steps_taken']}/{result['max_steps']}")
    print(f"Resolved: {result['resolved']}")
    print("\nBreakdown:")
    for key, val in result.get("breakdown", {}).items():
        print(f" {key:<20} {val}")
    print(rule)
96
+
97
+
98
def main():
    """CLI entry point.

    With ``--all``, runs every task with both scripted agents and prints a
    comparison table. With ``--task``, runs that one task with the agent
    chosen by ``--agent`` and prints the detailed report. With neither flag,
    prints the usage help.
    """
    task_names = ["single_crash", "cascading_failure", "silent_degradation"]

    cli = argparse.ArgumentParser(description="Test LogTriageEnv graders")
    cli.add_argument("--task", choices=task_names, help="Task to test")
    cli.add_argument(
        "--agent",
        choices=["correct", "wrong"],
        default="correct",
        help="Agent type to simulate",
    )
    cli.add_argument(
        "--all",
        action="store_true",
        help="Run all tasks with both correct and wrong agents",
    )
    args = cli.parse_args()

    if not args.all:
        if args.task:
            outcome = run_test(args.task, args.agent)
            print_result(args.task, args.agent, outcome)
        else:
            cli.print_help()
        return

    print("\n[TEST] Running all tasks with correct and wrong agents...\n")
    print(f"{'Task':<25} {'Agent':<10} {'Score':<8} {'Variance'}")
    print("-" * 60)
    for name in task_names:
        good_run = run_test(name, "correct")
        bad_run = run_test(name, "wrong")
        good_score = good_run["score"]
        bad_score = bad_run["score"]
        gap = good_score - bad_score
        # A grader that cannot separate the two scripts by more than 0.10 is flagged.
        verdict = "[OK]" if gap > 0.10 else "[LOW]"
        print(f"{name:<25} correct {good_score:.4f}")
        print(f"{name:<25} wrong {bad_score:.4f} delta={gap:.4f} {verdict}")
    print()


if __name__ == "__main__":
    main()
server/app.py CHANGED
@@ -101,13 +101,15 @@ def get_tasks():
101
 
102
  @app.post("/grader")
103
  def grader():
104
- score = env.get_grader_score()
105
- return {
106
- "score": score,
107
- "episode_id": env.state.episode_id if env._state else None,
108
- "task_id": env._task_id,
109
- "steps_taken": env.state.step_count if env._state else 0,
110
- }
 
 
111
 
112
 
113
  @app.post("/baseline")
 
101
 
102
@app.post("/grader")
def grader():
    """Score the current episode with the grader registered for its task.

    Returns the result dict from ``score_episode`` on success. A RuntimeError
    or ValueError raised while reading the state or scoring is reported as an
    HTTP 400 response whose body is ``{"error": <message>}``.
    """
    try:
        from server.graders import score_episode

        current = env.state
        return score_episode(current.task_id, current)
    except (RuntimeError, ValueError) as exc:
        # Both error types are handled identically, so they share one clause.
        return JSONResponse(status_code=400, content={"error": str(exc)})
113
 
114
 
115
  @app.post("/baseline")
server/environment.py CHANGED
@@ -15,6 +15,8 @@ from server.models import (
15
  ServiceStatus,
16
  )
17
  from server.scenarios import single_crash
 
 
18
  from server.log_generator import generate_healthy_system_state, _make_timestamp
19
 
20
  # ─── TASK REGISTRY ─────────────────────────────────────────────────────────────
@@ -77,9 +79,10 @@ class LogTriageEnvironment:
77
  # Load ground truth for this task
78
  if task_id == "single_crash":
79
  self._ground_truth = single_crash.GROUND_TRUTH
80
- else:
81
- # Tasks 2 & 3 will be wired in Day 3
82
- self._ground_truth = {}
 
83
 
84
  # Initialize episode state
85
  self._state = EpisodeState(
@@ -141,6 +144,7 @@ class LogTriageEnvironment:
141
  self._state.cumulative_score + reward, 4
142
  )
143
  self._state.actions_taken.append(action.action_type)
 
144
  self._state.step_count += 1
145
 
146
  # Check if episode should end
@@ -293,13 +297,20 @@ class LogTriageEnvironment:
293
  """Get logs and system state for the current step."""
294
  if self._task_id == "single_crash":
295
  return single_crash.get_step_data(step, self._base_time, self._rng)
296
- # Tasks 2 & 3 wired in Day 3
 
 
 
297
  return [], generate_healthy_system_state(self._base_time)
298
 
299
  def _get_alerts(self, step: int) -> list[str]:
300
  """Get active alerts for the current step."""
301
  if self._task_id == "single_crash":
302
  return single_crash.get_active_alerts(step)
 
 
 
 
303
  return []
304
 
305
  def _make_obs(
 
15
  ServiceStatus,
16
  )
17
  from server.scenarios import single_crash
18
+ from server.scenarios import cascading
19
+ from server.scenarios import silent_degrade
20
  from server.log_generator import generate_healthy_system_state, _make_timestamp
21
 
22
  # ─── TASK REGISTRY ─────────────────────────────────────────────────────────────
 
79
  # Load ground truth for this task
80
  if task_id == "single_crash":
81
  self._ground_truth = single_crash.GROUND_TRUTH
82
+ elif task_id == "cascading_failure":
83
+ self._ground_truth = cascading.GROUND_TRUTH
84
+ elif task_id == "silent_degradation":
85
+ self._ground_truth = silent_degrade.GROUND_TRUTH
86
 
87
  # Initialize episode state
88
  self._state = EpisodeState(
 
144
  self._state.cumulative_score + reward, 4
145
  )
146
  self._state.actions_taken.append(action.action_type)
147
+ self._state.action_history.append(action.model_dump())
148
  self._state.step_count += 1
149
 
150
  # Check if episode should end
 
297
  """Get logs and system state for the current step."""
298
  if self._task_id == "single_crash":
299
  return single_crash.get_step_data(step, self._base_time, self._rng)
300
+ elif self._task_id == "cascading_failure":
301
+ return cascading.get_step_data(step, self._base_time, self._rng)
302
+ elif self._task_id == "silent_degradation":
303
+ return silent_degrade.get_step_data(step, self._base_time, self._rng)
304
  return [], generate_healthy_system_state(self._base_time)
305
 
306
  def _get_alerts(self, step: int) -> list[str]:
307
  """Get active alerts for the current step."""
308
  if self._task_id == "single_crash":
309
  return single_crash.get_active_alerts(step)
310
+ elif self._task_id == "cascading_failure":
311
+ return cascading.get_active_alerts(step)
312
+ elif self._task_id == "silent_degradation":
313
+ return silent_degrade.get_active_alerts(step)
314
  return []
315
 
316
  def _make_obs(
server/graders/__init__.py CHANGED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Grader registry for LogTriageEnv.
3
+ Maps task_id strings to grader class instances.
4
+ """
5
+ from server.graders.crash_grader import CrashGrader
6
+ from server.graders.cascade_grader import CascadeGrader
7
+ from server.graders.noise_grader import NoiseGrader
8
+
9
# Registry: task_id -> grader instance. One shared instance per task is
# created at import time.
GRADER_REGISTRY = dict(
    single_crash=CrashGrader(),
    cascading_failure=CascadeGrader(),
    silent_degradation=NoiseGrader(),
)
15
+
16
+
17
def get_grader(task_id: str):
    """Look up the grader instance registered for ``task_id``.

    Raises:
        ValueError: If no grader is registered under ``task_id``; the message
            lists the valid task ids.
    """
    try:
        return GRADER_REGISTRY[task_id]
    except KeyError:
        known_tasks = list(GRADER_REGISTRY.keys())
        raise ValueError(
            f"No grader registered for task '{task_id}'. "
            f"Valid tasks: {known_tasks}"
        ) from None
28
+
29
+
30
def score_episode(task_id: str, state) -> dict:
    """Grade an episode and package the outcome as a result dict.

    Looks up the grader for ``task_id``, scores ``state`` with it, and returns
    the score together with the episode id, step counters, the grader's
    breakdown (empty if the grader has no ``get_breakdown``), and whether a
    ``resolve`` action appears in ``state.action_history``.

    Raises:
        ValueError: Propagated from ``get_grader`` for an unknown ``task_id``.
    """
    grader = get_grader(task_id)
    final_score = grader.score(state)

    if hasattr(grader, "get_breakdown"):
        details = grader.get_breakdown()
    else:
        details = {}

    saw_resolve = False
    for entry in state.action_history:
        if entry.get("action_type") == "resolve":
            saw_resolve = True
            break

    return {
        "score": final_score,
        "task_id": task_id,
        "episode_id": state.episode_id,
        "steps_taken": state.step_count,
        "max_steps": state.max_steps,
        "breakdown": details,
        "resolved": saw_resolve,
    }
server/graders/base_grader.py CHANGED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Abstract base grader interface.
3
+ All task graders must inherit from this and implement score().
4
+ """
5
+ from __future__ import annotations
6
+ from abc import ABC, abstractmethod
7
+ from server.models import EpisodeState
8
+
9
+
10
class BaseGrader(ABC):
    """
    Abstract grader base class.

    A grader evaluates the complete episode history and produces
    a final score in [0.0, 1.0].

    Unlike the reward function (which fires after every step),
    the grader fires once at episode end and produces the
    official score used by judges.
    """

    @abstractmethod
    def score(self, state: EpisodeState) -> float:
        """
        Score the completed episode.

        Args:
            state: Final EpisodeState including full action_history

        Returns:
            float in [0.0, 1.0] — the official episode score
        """
        raise NotImplementedError

    def _clamp(self, value: float) -> float:
        """Clamp a raw score to [0.0, 1.0] and round it to 4 decimals.

        NaN is mapped to 0.0. Without this guard ``min(1.0, nan)`` evaluates
        to 1.0, so a NaN total would be reported as a perfect score.
        """
        if value != value:  # only NaN is unequal to itself
            return 0.0
        return round(max(0.0, min(1.0, value)), 4)

    def _get_actions_of_type(
        self, state: EpisodeState, action_type: str
    ) -> list[dict]:
        """Return all actions of a given type from episode history, in order."""
        return [
            a for a in state.action_history
            if a.get("action_type") == action_type
        ]

    def _was_action_taken(self, state: EpisodeState, action_type: str) -> bool:
        """Check if an action type was taken at any point in the episode."""
        return any(
            a.get("action_type") == action_type
            for a in state.action_history
        )

    def _get_first_value(
        self, state: EpisodeState, action_type: str
    ) -> str | None:
        """Get the value of the first action of a given type, or None if absent."""
        # Stop at the first match instead of collecting every matching action.
        first = next(
            (a for a in state.action_history if a.get("action_type") == action_type),
            None,
        )
        return first.get("value") if first is not None else None

    def _episode_resolved(self, state: EpisodeState) -> bool:
        """Check if agent explicitly resolved the episode."""
        return self._was_action_taken(state, "resolve")

    def _steps_used(self, state: EpisodeState) -> int:
        """Return number of steps taken."""
        return state.step_count
server/graders/cascade_grader.py CHANGED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Grader for Task 2 — Cascading Failure (Medium)
3
+
4
+ Scoring breakdown:
5
+ Correct severity (P1) → +0.20
6
+ Correct root cause (user-db) → +0.35
7
+ Correct remediation (kill-query/restart) → +0.25
8
+ Ordering bonus (no symptom fix first) → +0.10
9
+ Speed bonus (resolved ≤ 8 steps) → +0.10
10
+ ─────────────────────────────────────────────────
11
+ Maximum possible score → 1.00
12
+
13
+ Penalties:
14
+ Identified symptom as root cause → 0.00 (no credit)
15
+ Remediated symptom service first → -0.10 (ordering penalty)
16
+ Never resolved → -0.10
17
+ """
18
+ from __future__ import annotations
19
+ from server.models import EpisodeState
20
+ from server.graders.base_grader import BaseGrader
21
+
22
+
23
class CascadeGrader(BaseGrader):
    """Official grader for Task 2 — Cascading Failure."""

    CORRECT_SEVERITY = "P1"
    CORRECT_ROOT_CAUSE = "user-db"
    CORRECT_REMEDIATION_PREFIXES = {"kill-query", "restart"}
    CORRECT_REMEDIATION_SERVICE = "user-db"
    SYMPTOM_SERVICES = {"api-gateway", "auth-service"}  # wrong answers
    MAX_STEPS = 12
    SPEED_THRESHOLD = 8

    def score(self, state: EpisodeState) -> float:
        """
        Score the completed Task 2 episode.
        Penalizes agents that treat symptoms instead of root cause.

        Args:
            state: Final episode state; ``action_history`` and ``step_count``
                are read.

        Returns:
            Score clamped to [0.0, 1.0]. The per-component explanation is
            stored on the instance and exposed through ``get_breakdown()``.
        """
        total = 0.0
        breakdown = {}

        # ── 1. Severity classification ────────────────────────────────────
        severity_value = self._get_first_value(state, "classify_severity")
        if severity_value == self.CORRECT_SEVERITY:
            total += 0.20
            breakdown["severity"] = "+0.20 (correct: P1)"
        elif severity_value == "P2":
            total += 0.08
            breakdown["severity"] = "+0.08 (partial: P2 given, P1 expected)"
        elif severity_value is None:
            breakdown["severity"] = "+0.00 (never classified)"
        else:
            breakdown["severity"] = f"+0.00 (wrong: {severity_value})"

        # ── 2. Root cause identification ──────────────────────────────────
        root_cause_value = self._get_first_value(state, "identify_root_cause")
        if root_cause_value == self.CORRECT_ROOT_CAUSE:
            total += 0.35
            breakdown["root_cause"] = "+0.35 (correct: user-db)"
        elif root_cause_value in self.SYMPTOM_SERVICES:
            # Identified a symptom, not root cause — no credit
            breakdown["root_cause"] = f"+0.00 (wrong: {root_cause_value} is a symptom, not root cause)"
        elif root_cause_value and "db" in root_cause_value:
            total += 0.10  # right tier (database), wrong specific service
            breakdown["root_cause"] = f"+0.10 (partial: {root_cause_value}, right tier)"
        elif root_cause_value is None:
            breakdown["root_cause"] = "+0.00 (never identified)"
        else:
            breakdown["root_cause"] = f"+0.00 (wrong: {root_cause_value})"

        # ── 3. Remediation + Ordering ─────────────────────────────────────
        remediation_scored = False
        symptom_remediated_first = False

        for action in self._get_actions_of_type(state, "remediate"):
            # `or ""` also covers a "value" key that is present but None,
            # which .get("value", "") would pass through to .split().
            value = action.get("value") or ""
            parts = value.split(":")
            if len(parts) != 2:
                continue  # not in "<action>:<service>" form; cannot be scored
            prefix, service = parts

            # Check if agent remediated a symptom service before root cause
            if service in self.SYMPTOM_SERVICES and not remediation_scored:
                symptom_remediated_first = True

            # Check for correct remediation (credited at most once)
            if (
                prefix in self.CORRECT_REMEDIATION_PREFIXES
                and service == self.CORRECT_REMEDIATION_SERVICE
                and not remediation_scored
            ):
                total += 0.25
                breakdown["remediation"] = f"+0.25 (correct: {value})"
                remediation_scored = True

        if not remediation_scored:
            breakdown["remediation"] = "+0.00 (no correct remediation)"

        # ── 4. Ordering bonus ─────────────────────────────────────────────
        if not symptom_remediated_first and remediation_scored:
            total += 0.10
            breakdown["ordering"] = "+0.10 (correctly targeted root cause, not symptoms)"
        elif symptom_remediated_first:
            total -= 0.10
            breakdown["ordering"] = "-0.10 (remediated symptom service before root cause)"

        # ── 5. Speed bonus ────────────────────────────────────────────────
        if self._episode_resolved(state):
            if self._steps_used(state) <= self.SPEED_THRESHOLD:
                total += 0.10
                breakdown["speed"] = f"+0.10 (resolved in {self._steps_used(state)} steps)"
            else:
                breakdown["speed"] = f"+0.00 (resolved but used {self._steps_used(state)} steps)"
        else:
            total -= 0.10
            breakdown["resolution"] = "-0.10 (never resolved)"

        self._breakdown = breakdown
        return self._clamp(total)

    def get_breakdown(self) -> dict:
        """Return the breakdown from the last score() call ({} if never scored)."""
        return getattr(self, "_breakdown", {})
server/graders/crash_grader.py CHANGED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Grader for Task 1 — Single Service Crash (Easy)
3
+
4
+ Scoring breakdown:
5
+ Correct severity (P1) → +0.30
6
+ Correct root cause (payment-service) → +0.35
7
+ Correct remediation (restart:payment-*) → +0.25
8
+ Speed bonus (resolved ≤ 5 steps) → +0.10
9
+ ─────────────────────────────────────────────────
10
+ Maximum possible score → 1.00
11
+
12
+ Penalties:
13
+ Ignored P1 incident → -0.30 (from base)
14
+ Wrong root cause identified → 0.00 (no credit)
15
+ Never resolved → -0.10
16
+ """
17
+ from __future__ import annotations
18
+ from server.models import EpisodeState
19
+ from server.graders.base_grader import BaseGrader
20
+
21
+
22
class CrashGrader(BaseGrader):
    """Official grader for Task 1 — Single Service Crash."""

    # Ground truth constants
    CORRECT_SEVERITY = "P1"
    CORRECT_ROOT_CAUSE = "payment-service"
    CORRECT_REMEDIATION_PREFIX = "restart"
    CORRECT_REMEDIATION_SERVICE = "payment-service"
    MAX_STEPS = 8
    SPEED_THRESHOLD = 5  # must resolve within this many steps for speed bonus

    def score(self, state: EpisodeState) -> float:
        """
        Score the completed Task 1 episode.
        Deterministic — same action history always produces same score.

        Args:
            state: Final episode state; ``action_history`` and ``step_count``
                are read.

        Returns:
            Score clamped to [0.0, 1.0]. The per-component explanation is
            stored on the instance and exposed through ``get_breakdown()``.
        """
        total = 0.0
        breakdown = {}

        # ── 1. Severity classification ────────────────────────────────────
        severity_value = self._get_first_value(state, "classify_severity")
        if severity_value == self.CORRECT_SEVERITY:
            total += 0.30
            breakdown["severity"] = "+0.30 (correct: P1)"
        elif severity_value == "P2":
            total += 0.10  # partial credit — close but not right
            breakdown["severity"] = "+0.10 (partial: P2 given, P1 expected)"
        elif severity_value is None:
            breakdown["severity"] = "+0.00 (never classified)"
        else:
            breakdown["severity"] = f"+0.00 (wrong: {severity_value})"

        # ── 2. Root cause identification ──────────────────────────────────
        root_cause_value = self._get_first_value(state, "identify_root_cause")
        if root_cause_value == self.CORRECT_ROOT_CAUSE:
            total += 0.35
            breakdown["root_cause"] = "+0.35 (correct: payment-service)"
        elif root_cause_value and root_cause_value.startswith("payment"):
            total += 0.10  # partial — right service family
            breakdown["root_cause"] = f"+0.10 (partial: {root_cause_value}, right family)"
        elif root_cause_value is None:
            breakdown["root_cause"] = "+0.00 (never identified)"
        else:
            breakdown["root_cause"] = f"+0.00 (wrong: {root_cause_value})"

        # ── 3. Remediation ────────────────────────────────────────────────
        # The first remediation that targets the right service decides the
        # credit (full or partial); later remediations are not examined.
        remediation_scored = False
        for action in self._get_actions_of_type(state, "remediate"):
            # `or ""` also covers a "value" key that is present but None,
            # which .get("value", "") would pass through to .split().
            value = action.get("value") or ""
            parts = value.split(":")
            if len(parts) == 2:
                prefix, service = parts
                if prefix == self.CORRECT_REMEDIATION_PREFIX and service == self.CORRECT_REMEDIATION_SERVICE:
                    total += 0.25
                    breakdown["remediation"] = f"+0.25 (correct: {value})"
                    remediation_scored = True
                    break
                elif service == self.CORRECT_REMEDIATION_SERVICE:
                    total += 0.08  # right service, wrong action type
                    breakdown["remediation"] = "+0.08 (partial: right service, wrong action)"
                    remediation_scored = True
                    break

        if not remediation_scored:
            breakdown["remediation"] = "+0.00 (no correct remediation)"

        # ── 4. Speed bonus ────────────────────────────────────────────────
        if self._episode_resolved(state):
            if self._steps_used(state) <= self.SPEED_THRESHOLD:
                total += 0.10
                breakdown["speed"] = f"+0.10 (resolved in {self._steps_used(state)} steps)"
            else:
                breakdown["speed"] = f"+0.00 (resolved but slow: {self._steps_used(state)} steps)"
        else:
            total -= 0.10  # penalty for not resolving
            breakdown["resolution"] = "-0.10 (never resolved)"

        # ── 5. Ignore penalty ─────────────────────────────────────────────
        if self._was_action_taken(state, "ignore"):
            total -= 0.30
            breakdown["ignore_penalty"] = "-0.30 (ignored P1 incident)"

        self._breakdown = breakdown
        return self._clamp(total)

    def get_breakdown(self) -> dict:
        """Return scoring breakdown from last score() call."""
        return getattr(self, "_breakdown", {})
server/graders/noise_grader.py CHANGED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Grader for Task 3 — Silent Degradation with Noise (Hard)
3
+
4
+ Scoring breakdown:
5
+ Correct severity (P2) → +0.30
6
+ Correct root cause (payment-db) → +0.30
7
+ Correct remediation (flush-cache/kill-q) → +0.20
8
+ No spurious actions on noise services → +0.10
9
+ Speed bonus (resolved ≤ 10 steps) → +0.10
10
+ ──────────────────────────────────────────────────
11
+ Maximum possible score → 1.00
12
+
13
+ Penalties:
14
+ Over-escalation (classified P1) → -0.15
15
+ Under-escalation (classified P3) → -0.15
16
+ Remediated wrong service → 0.00 (no credit)
17
+ Never resolved/escalated → -0.10
18
+ """
19
+ from __future__ import annotations
20
+ from server.models import EpisodeState
21
+ from server.graders.base_grader import BaseGrader
22
+
23
+
24
class NoiseGrader(BaseGrader):
    """Official grader for Task 3 — Silent Degradation with Noise."""

    CORRECT_SEVERITY = "P2"
    CORRECT_ROOT_CAUSE = "payment-db"
    CORRECT_REMEDIATION_PREFIXES = {"flush-cache", "kill-query"}
    CORRECT_REMEDIATION_SERVICE = "payment-db"
    CORRECT_ESCALATION_TEAMS = {"dba-team"}
    NOISE_SERVICES = {
        "api-gateway", "auth-service", "user-db",
        "notification-service", "email-queue",
    }  # irrelevant to this scenario
    MAX_STEPS = 15
    SPEED_THRESHOLD = 10

    def score(self, state: EpisodeState) -> float:
        """
        Score the completed Task 3 episode.
        Penalizes both over-escalation (P1) and under-escalation (P3).
        Rewards noise filtering — penalizes actions on irrelevant services.

        Args:
            state: Final episode state; ``action_history`` and ``step_count``
                are read.

        Returns:
            Score clamped to [0.0, 1.0]. The per-component explanation is
            stored on the instance and exposed through ``get_breakdown()``.
        """
        total = 0.0
        breakdown = {}

        # ── 1. Severity classification ────────────────────────────────────
        severity_value = self._get_first_value(state, "classify_severity")
        if severity_value == self.CORRECT_SEVERITY:
            total += 0.30
            breakdown["severity"] = "+0.30 (correct: P2)"
        elif severity_value == "P1":
            total -= 0.15
            breakdown["severity"] = "-0.15 (over-escalation: P1 given, P2 expected — no outage yet)"
        elif severity_value == "P3":
            total -= 0.15
            breakdown["severity"] = "-0.15 (under-escalation: P3 given, P2 expected — trend is serious)"
        elif severity_value is None:
            breakdown["severity"] = "+0.00 (never classified)"
        else:
            breakdown["severity"] = f"+0.00 (wrong: {severity_value})"

        # ── 2. Root cause identification ──────────────────────────────────
        root_cause_value = self._get_first_value(state, "identify_root_cause")
        if root_cause_value == self.CORRECT_ROOT_CAUSE:
            total += 0.30
            breakdown["root_cause"] = "+0.30 (correct: payment-db)"
        elif root_cause_value == "payment-service":
            total += 0.10  # close — right payment tier, wrong component
            breakdown["root_cause"] = "+0.10 (partial: payment-service, but root is payment-db)"
        elif root_cause_value in self.NOISE_SERVICES:
            breakdown["root_cause"] = f"+0.00 (wrong: {root_cause_value} is a noise service)"
        elif root_cause_value is None:
            breakdown["root_cause"] = "+0.00 (never identified)"
        else:
            breakdown["root_cause"] = f"+0.00 (wrong: {root_cause_value})"

        # ── 3. Remediation ────────────────────────────────────────────────
        # The first remediation that targets the right service decides the
        # credit (full or partial); later remediations are not examined.
        remediation_scored = False
        for action in self._get_actions_of_type(state, "remediate"):
            # `or ""` also covers a "value" key that is present but None,
            # which .get("value", "") would pass through to .split().
            value = action.get("value") or ""
            parts = value.split(":")
            if len(parts) == 2:
                prefix, service = parts
                if (
                    prefix in self.CORRECT_REMEDIATION_PREFIXES
                    and service == self.CORRECT_REMEDIATION_SERVICE
                ):
                    total += 0.20
                    breakdown["remediation"] = f"+0.20 (correct: {value})"
                    remediation_scored = True
                    break
                elif service == self.CORRECT_REMEDIATION_SERVICE:
                    total += 0.05  # right service, suboptimal action
                    breakdown["remediation"] = "+0.05 (partial: right service, suboptimal action)"
                    remediation_scored = True
                    break

        # Also accept correct escalation to dba-team as valid resolution
        if not remediation_scored:
            for action in self._get_actions_of_type(state, "escalate"):
                if action.get("value") in self.CORRECT_ESCALATION_TEAMS:
                    total += 0.15  # escalation is slightly less credit than direct fix
                    breakdown["remediation"] = "+0.15 (escalated to dba-team — acceptable)"
                    remediation_scored = True
                    break

        if not remediation_scored:
            breakdown["remediation"] = "+0.00 (no correct remediation or escalation)"

        # ── 4. Noise filtering bonus ──────────────────────────────────────
        # Count unnecessary actions: root-cause guesses or remediations aimed
        # at noise services, and escalations to any team other than the
        # accepted ones ("sre-team" is tolerated as well).
        spurious_actions = 0
        for action in state.action_history:
            action_type = action.get("action_type")
            value = action.get("value") or ""  # None-safe, see remediation loop
            if action_type == "identify_root_cause" and value in self.NOISE_SERVICES:
                spurious_actions += 1
            elif action_type == "remediate":
                service = value.split(":")[-1] if ":" in value else ""
                if service in self.NOISE_SERVICES:
                    spurious_actions += 1
            elif action_type == "escalate" and value not in self.CORRECT_ESCALATION_TEAMS and value != "sre-team":
                spurious_actions += 1

        if spurious_actions == 0:
            total += 0.10
            breakdown["noise_filtering"] = "+0.10 (no spurious actions on noise services)"
        elif spurious_actions == 1:
            breakdown["noise_filtering"] = f"+0.00 ({spurious_actions} spurious action)"
        else:
            total -= 0.05
            breakdown["noise_filtering"] = f"-0.05 ({spurious_actions} spurious actions — poor noise filtering)"

        # ── 5. Speed bonus ────────────────────────────────────────────────
        # A scored remediation or escalation counts as having acted, even
        # without an explicit "resolve" action.
        if self._episode_resolved(state) or remediation_scored:
            if self._steps_used(state) <= self.SPEED_THRESHOLD:
                total += 0.10
                breakdown["speed"] = f"+0.10 (acted within {self._steps_used(state)} steps)"
            else:
                breakdown["speed"] = f"+0.00 (acted but used {self._steps_used(state)} steps)"
        else:
            total -= 0.10
            breakdown["resolution"] = "-0.10 (never acted on the degradation)"

        self._breakdown = breakdown
        return self._clamp(total)

    def get_breakdown(self) -> dict:
        """Return the breakdown from the last score() call ({} if never scored)."""
        return getattr(self, "_breakdown", {})
server/models.py CHANGED
@@ -206,6 +206,10 @@ class EpisodeState(BaseModel):
206
  default_factory=list,
207
  description="List of action_type values taken so far this episode"
208
  )
 
 
 
 
209
  correct_severity: Optional[str] = Field(
210
  None,
211
  description="Whether agent has correctly classified severity yet"
 
206
  default_factory=list,
207
  description="List of action_type values taken so far this episode"
208
  )
209
+ action_history: list[dict] = Field(
210
+ default_factory=list,
211
+ description="Full action objects taken this episode (for grader evaluation)"
212
+ )
213
  correct_severity: Optional[str] = Field(
214
  None,
215
  description="Whether agent has correctly classified severity yet"
server/scenarios/cascading.py CHANGED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Task 2 — Cascading Failure (Medium)
3
+
4
+ Scenario: user-db develops a slow query that exhausts the auth-service connection pool,
5
+ which then causes the api-gateway to return timeouts to all users.
6
+
7
+ Surface logs show gateway errors most loudly (symptom), but root cause is hidden (user-db).
8
+ Agent must trace backward through the cascade chain — NOT treat symptoms as root cause.
9
+
10
+ Ground truth:
11
+ - severity: P1
12
+ - root_cause: user-db
13
+ - remediation: kill-query:user-db OR restart:user-db
14
+ - correct_teams: dba-team, sre-team
15
+ - noise_ratio: 30%
16
+ """
17
+ from __future__ import annotations
18
+ import random
19
+ from datetime import datetime
20
+ from server.models import LogLine, ServiceStatus
21
+ from server.log_generator import (
22
+ generate_log_batch,
23
+ generate_healthy_system_state,
24
+ _make_timestamp,
25
+ )
26
+
27
+ # ─── GROUND TRUTH ─────────────────────────────────────────────────────────────
28
+
29
# Expected answers and episode parameters for the cascading-failure task.
GROUND_TRUTH = {
    # Correct severity classification.
    "severity": "P1",
    # Service at the origin of the cascade.
    "root_cause": "user-db",
    # Accepted remediation verbs, combined with "remediation_service"
    # (e.g. "kill-query:user-db" or "restart:user-db").
    "remediation_prefixes": {"kill-query", "restart"},
    "remediation_service": "user-db",
    # Teams that are valid escalation targets.
    "correct_teams": {"dba-team", "sre-team"},
    # Episode length budget.
    "max_steps": 12,
    # Noise ratio handed to the log generator (30%).
    "noise_ratio": 0.30,
}
38
+
39
+ # ─── STEP-BY-STEP SIGNAL PLAN ─────────────────────────────────────────────────
40
+ # Cascade chain: user-db slow query → auth-service pool exhausted → api-gateway timeouts
41
+ # Steps 0-1: Gateway errors surface (symptoms only — most visible)
42
+ # Steps 2-3: Auth-service DB pressure becomes visible
43
+ # Steps 4-5: user-db slow queries exposed; circuit breaker opens
44
+ # Steps 6-7: Full cascade — all 3 services degraded/down
45
+ # Steps 8-11: Escalating alerts; root cause becomes unmistakable
46
+
47
# Scripted incident signals, one inner list per step (index == step number).
# Each entry is a (service, level, message) tuple. The message text uses a
# real em dash; the previously mis-decoded byte sequences have been repaired.
STEP_SIGNALS = [
    # Step 0: Gateway errors first to appear (surface symptom)
    [
        ("api-gateway", "ERROR", "upstream timeout from auth-service: 5002ms"),
        ("api-gateway", "WARN", "error rate: 8.3% on /auth/* routes"),
    ],
    # Step 1: More gateway errors; first hints of auth-service pressure
    [
        ("api-gateway", "ERROR", "upstream timeout from auth-service: 30007ms"),
        ("api-gateway", "WARN", "error rate: 15.7% — auth-service latency climbing"),
    ],
    # Step 2: Auth-service connection pool pressure visible
    [
        ("auth-service", "WARN", "db connection pool at 42/50 — pressure building"),
        ("api-gateway", "ERROR", "upstream timeout from auth-service: 30005ms"),
        ("auth-service", "ERROR", "db query timeout: SELECT session WHERE user_id=? [5001ms]"),
    ],
    # Step 3: Auth-service pool nearly exhausted
    [
        ("auth-service", "ERROR", "db connection pool EXHAUSTED (50/50) — blocking new requests"),
        ("api-gateway", "ERROR", "auth-service unavailable: connection pool full"),
        ("auth-service", "WARN", "request queue depth: 127 — approaching overflow"),
    ],
    # Step 4: user-db slow query finally exposed
    [
        ("user-db", "WARN", "slow query detected: SELECT * FROM sessions WHERE user_id=? [2847ms]"),
        ("auth-service", "ERROR", "db connection timeout after 5000ms — query hanging"),
        ("user-db", "ERROR", "lock wait timeout: session table — blocking reads"),
    ],
    # Step 5: user-db circuit breaker opens; auth-service starts failing fast
    [
        ("user-db", "WARN", "slow query: 4500ms — circuit breaker approaching threshold"),
        ("auth-service", "ERROR", "circuit breaker OPEN for user-db: latency exceeded 5000ms"),
        ("api-gateway", "ERROR", "all /auth/* requests failing — upstream unavailable"),
    ],
    # Step 6: Full cascade — all 3 services degraded
    [
        ("api-gateway", "ERROR", "error rate: 67.4% — multiple upstreams timing out"),
        ("auth-service", "ERROR", "health check FAILED: cannot reach user-db"),
        ("user-db", "ERROR", "connection pool saturated: 95/100 connections in use"),
    ],
    # Step 7: api-gateway now fully symptomatic
    [
        ("api-gateway", "FATAL", "SLA breach: /auth endpoint availability < 95%"),
        ("auth-service", "ERROR", "auth-service DOWN: 3/3 health checks failed"),
        ("user-db", "WARN", "slow query count: 847 in last 60s — severe degradation"),
    ],
    # Step 8: Database fully exposed as root cause
    [
        ("user-db", "ERROR", "CRITICAL: user-db query latency 8000ms+ — active sessions timing out"),
        ("auth-service", "ERROR", "rejected: user-db connection pool exhausted"),
        ("api-gateway", "ERROR", "user-auth endpoint returning 503 — cascade failure"),
    ],
    # Step 9: Escalating
    [
        ("user-db", "FATAL", "user-db DOWN: connection pool 100/100 — no connections available"),
        ("api-gateway", "ERROR", "error rate: 89.2% — auth-service and user-db both unreachable"),
    ],
    # Step 10: Critical
    [
        ("api-gateway", "FATAL", "CRITICAL: auth-service DOWN for 90s — 100% of login attempts failing"),
        ("user-db", "ERROR", "lock contention: session table fully locked — queries timing out"),
    ],
    # Step 11: Maximum severity
    [
        ("user-db", "FATAL", "user-db unresponsive for 180s — database crisis"),
        ("api-gateway", "FATAL", "SLA_BREACH: auth availability 0% — complete user-auth outage"),
    ],
]
116
+
117
+
118
def get_system_state(step: int, base_time: datetime) -> dict[str, ServiceStatus]:
    """Build the per-service status map for ``step`` of the cascade.

    Cascade direction: user-db → auth-service → api-gateway.

    Starts from ``generate_healthy_system_state(base_time)`` and overwrites
    the entries of the services affected in the current phase. Every
    overwritten entry is stamped with ``_make_timestamp(base_time, step * 30)``.

    Args:
        step: Episode step; selects the degradation phase.
        base_time: Reference time passed to the state/timestamp helpers.

    Returns:
        Mapping of service name to its ``ServiceStatus``.
    """
    # (last step of the phase, rows of (service, status, error_rate, p99 ms)).
    phases = (
        # Gateway just starting to see issues
        (1, (
            ("api-gateway", "degraded", 0.083, 2500),
        )),
        # Auth-service pool pressure
        (3, (
            ("api-gateway", "degraded", 0.157, 5000),
            ("auth-service", "degraded", 0.15, 5000),
        )),
        # user-db slow queries exposed
        (5, (
            ("api-gateway", "degraded", 0.45, 8000),
            ("auth-service", "down", 0.85, 10000),
            ("user-db", "degraded", 0.30, 4500),
        )),
        # Full cascade
        (7, (
            ("api-gateway", "down", 0.89, 10000),
            ("auth-service", "down", 0.95, 10000),
            ("user-db", "down", 0.50, 8000),
        )),
    )
    # Maximum severity: applies to every step past the last listed phase.
    final_phase = (
        ("api-gateway", "down", 0.99, 10000),
        ("auth-service", "down", 1.0, 10000),
        ("user-db", "down", 0.75, 10000),
    )

    now = _make_timestamp(base_time, step * 30)
    state = generate_healthy_system_state(base_time)

    # First phase whose upper bound covers this step, else the final one.
    overrides = next(
        (rows for last_step, rows in phases if step <= last_step),
        final_phase,
    )
    for service, status, error_rate, latency_ms in overrides:
        state[service] = ServiceStatus(
            name=service,
            status=status,
            error_rate=error_rate,
            latency_p99_ms=latency_ms,
            last_updated=now,
        )
    return state
172
+
173
+
174
def get_step_data(step: int, base_time: datetime, rng: random.Random) -> tuple[list[LogLine], dict[str, ServiceStatus]]:
    """Return ``(logs, system_state)`` for the given step.

    The scripted signals for the step come from ``STEP_SIGNALS`` and are
    passed to ``generate_log_batch`` together with this scenario's noise
    ratio and a batch size of 10. The system state comes from
    ``get_system_state``.

    Args:
        step: Episode step. Steps beyond the scripted plan reuse the last
            entry; negative steps reuse the first entry.
        base_time: Reference time forwarded to the generators.
        rng: Random source forwarded to ``generate_log_batch``.

    Returns:
        Tuple of the generated log lines and the service status mapping.
    """
    # Clamp at both ends. Without the lower bound, a negative step would be
    # treated as a negative index and serve late-episode signals.
    signal_idx = max(0, min(step, len(STEP_SIGNALS) - 1))

    logs = generate_log_batch(
        scenario_signals=STEP_SIGNALS[signal_idx],
        step=step,
        base_time=base_time,
        noise_ratio=GROUND_TRUTH["noise_ratio"],
        batch_size=10,
        rng=rng,
    )
    return logs, get_system_state(step, base_time)
192
+
193
+
194
def get_active_alerts(step: int) -> list[str]:
    """Return the alerts that are active at ``step``, in firing order.

    Alerts are cumulative: once the step at which an alert first fires has
    been reached, the alert stays in the list for every later step. A
    negative ``step`` yields an empty list.

    The alert text uses a real em dash; the previously mis-decoded
    characters in two of the messages have been repaired.
    """
    # (first step at which the alert fires, alert text)
    schedule = (
        (0, "api-gateway: elevated error rate on /auth/* routes"),
        (2, "auth-service: db connection pool pressure"),
        (4, "user-db: slow queries detected — latency 2000ms+"),
        (5, "auth-service: circuit breaker OPEN for user-db"),
        (6, "SLA_BREACH: /auth availability < 90%"),
        (8, "CRITICAL: user-db connection pool saturated"),
        (10, "CRITICAL: full auth cascade failure — P1 incident"),
    )
    return [text for first_step, text in schedule if step >= first_step]
server/scenarios/silent_degrade.py CHANGED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Task 3 — Silent Degradation with Noise (Hard)
3
+
4
+ Scenario: payment-db query latency slowly increases over time.
5
+ No service crashes. Error rate stays below P1 threshold (5%).
6
+ 60% of logs are irrelevant noise from unrelated services.
7
+ Agent must filter noise, identify subtle signal, classify as P2 (NOT P1, NOT P3).
8
+
9
+ Ground truth:
10
+ - severity: P2 (nuanced — trending toward breach but no hard outage yet)
11
+ - root_cause: payment-db
12
+ - remediation: flush-cache:payment-db OR kill-query:payment-db
13
+ - correct_teams: dba-team
14
+ - noise_ratio: 60% (hardest noise ratio of all tasks)
15
+ """
16
+ from __future__ import annotations
17
+ import random
18
+ from datetime import datetime
19
+ from server.models import LogLine, ServiceStatus
20
+ from server.log_generator import (
21
+ generate_log_batch,
22
+ generate_healthy_system_state,
23
+ _make_timestamp,
24
+ )
25
+
26
+ # Ground Truth
27
+
28
# Expected answers and episode parameters for the silent-degradation task.
GROUND_TRUTH = {
    # Correct severity classification.
    "severity": "P2",
    # Service whose latency is degrading.
    "root_cause": "payment-db",
    # Accepted remediation verbs, combined with "remediation_service"
    # (e.g. "flush-cache:payment-db" or "kill-query:payment-db").
    "remediation_prefixes": {"flush-cache", "kill-query"},
    "remediation_service": "payment-db",
    # Teams that are valid escalation targets.
    "correct_teams": {"dba-team"},
    # Episode length budget.
    "max_steps": 15,
    # Noise ratio handed to the log generator (60%).
    "noise_ratio": 0.60,
}
37
+
38
+ # Step signals: (service, level, message)
39
# Scripted incident signals, one inner list per step (index == step number).
# Each entry is a (service, level, message) tuple. The message text uses a
# real em dash; the previously mis-decoded byte sequences have been repaired.
STEP_SIGNALS = [
    # Step 0: Very subtle
    [("payment-db", "WARN", "payment-db: query latency elevated 450ms (baseline: 12ms)")],
    # Step 1
    [("payment-db", "WARN", "payment-db: query latency 620ms")],
    # Step 2
    [("payment-db", "WARN", "payment-db: slow query: SELECT * FROM transactions WHERE user_id=? [890ms]")],
    # Step 3
    [("payment-db", "WARN", "payment-db: buffer cache hit ratio dropping: 89% to 71%")],
    # Step 4
    [
        ("payment-db", "WARN", "payment-db: query latency 1200ms"),
        ("payment-service", "WARN", "payment-service: error rate 2.1%"),
    ],
    # Step 5
    [("payment-db", "WARN", "payment-db: buffer cache hit ratio 54% — cache thrashing")],
    # Step 6
    [("payment-db", "WARN", "payment-db: slow query: SELECT * FROM transactions [2200ms]")],
    # Step 7
    [
        ("payment-db", "WARN", "payment-db: query latency 2800ms"),
        ("payment-service", "ERROR", "payment-service: 3.4% error rate"),
    ],
    # Step 8
    [("payment-db", "ERROR", "payment-db: slow query timeout: [3100ms] — query cancelled")],
    # Step 9
    [("payment-db", "WARN", "payment-db: query latency 4200ms — cache fully cold")],
    # Step 10
    [("payment-db", "ERROR", "payment-db: query latency 4500ms")],
    # Step 11
    [("payment-db", "WARN", "payment-db: buffer pool pages: 94% dirty")],
    # Step 12
    [
        ("payment-db", "ERROR", "payment-db: query latency 4600ms — timeouts beginning"),
        ("payment-service", "ERROR", "payment-service: error rate 4.9%"),
    ],
    # Step 13: P1 breached
    [
        ("payment-db", "ERROR", "payment-db: CRITICAL query latency 4950ms — P1 breached"),
        ("payment-service", "ERROR", "payment-service: error rate 5.1% — P1 exceeded"),
    ],
    # Step 14: Worst case
    [
        ("payment-db", "FATAL", "payment-db: query latency 5000ms+ — connection pool exhausted"),
        ("payment-service", "FATAL", "payment-service: P1 CRITICAL — 6.2% error rate"),
    ],
]
71
+
72
+
73
def get_system_state(step: int, base_time: datetime) -> dict[str, ServiceStatus]:
    """Build the per-service status map for ``step`` of the slow degradation.

    Starts from ``generate_healthy_system_state(base_time)`` and overwrites
    the ``payment-db`` and ``payment-service`` entries. payment-db metrics
    are read from per-step tables; payment-service metrics are derived from
    them. Both entries are stamped with
    ``_make_timestamp(base_time, step * 30)``.

    Args:
        step: Episode step. Steps past the end of the tables reuse the last
            entry; negative steps reuse the first entry.
        base_time: Reference time passed to the state/timestamp helpers.

    Returns:
        Mapping of service name to its ``ServiceStatus``.
    """
    now = _make_timestamp(base_time, step * 30)
    state = generate_healthy_system_state(base_time)

    # Per-step payment-db p99 latency (ms) and error rate; index == step.
    latencies = [450, 620, 890, 1200, 1400, 1800, 2200, 2800, 3100, 4200, 4500, 4600, 4600, 4950, 5000]
    error_rates = [0.0, 0.005, 0.01, 0.021, 0.021, 0.025, 0.028, 0.034, 0.038, 0.042, 0.047, 0.049, 0.049, 0.051, 0.062]

    # Clamp at both ends. Without the lower bound, a negative step would be
    # treated as a negative index and report worst-case metrics while the
    # status below still says "up".
    step_idx = max(0, min(step, len(latencies) - 1))
    db_latency = latencies[step_idx]
    db_error = error_rates[step_idx]

    # payment-service follows the database: half the DB latency on top of a
    # 340 ms base (capped at 5000 ms) and 80% of the DB error rate (capped
    # at 0.10).
    # NOTE(review): the payment-service percentages quoted in STEP_SIGNALS
    # (2.1%, 3.4%, 4.9%, 5.1%, 6.2%) equal entries of error_rates, but the
    # 0.8 factor keeps this value below 5% at every step. Confirm which of
    # the two is intended.
    psvc_latency = min(5000, 340 + db_latency // 2)
    psvc_error = min(0.10, db_error * 0.8)

    state["payment-db"] = ServiceStatus(
        name="payment-db",
        status="up" if step < 3 else "degraded",
        error_rate=db_error,
        latency_p99_ms=db_latency,
        last_updated=now,
    )
    state["payment-service"] = ServiceStatus(
        name="payment-service",
        status="degraded" if step >= 4 else "up",
        error_rate=psvc_error,
        latency_p99_ms=psvc_latency,
        last_updated=now,
    )
    return state
102
+
103
+
104
def get_step_data(step: int, base_time: datetime, rng: random.Random) -> tuple[list[LogLine], dict[str, ServiceStatus]]:
    """Return ``(logs, system_state)`` for the given step.

    The scripted signals for the step come from ``STEP_SIGNALS`` and are
    passed to ``generate_log_batch`` together with this scenario's noise
    ratio and a batch size of 12. The system state comes from
    ``get_system_state``.

    Args:
        step: Episode step. Steps beyond the scripted plan reuse the last
            entry; negative steps reuse the first entry.
        base_time: Reference time forwarded to the generators.
        rng: Random source forwarded to ``generate_log_batch``.

    Returns:
        Tuple of the generated log lines and the service status mapping.
    """
    # Clamp at both ends. Without the lower bound, a negative step would be
    # treated as a negative index and serve late-episode signals.
    signal_idx = max(0, min(step, len(STEP_SIGNALS) - 1))

    logs = generate_log_batch(
        scenario_signals=STEP_SIGNALS[signal_idx],
        step=step,
        base_time=base_time,
        noise_ratio=GROUND_TRUTH["noise_ratio"],
        batch_size=12,
        rng=rng,
    )
    return logs, get_system_state(step, base_time)
118
+
119
+
120
def get_active_alerts(step: int) -> list[str]:
    """Return the alerts that are active at ``step``, in firing order.

    Alerts are cumulative: once the step at which an alert first fires has
    been reached, the alert stays in the list for every later step. No
    alert fires before step 4, so earlier steps yield an empty list.

    The alert text uses a real em dash; the previously mis-decoded
    characters in two of the messages have been repaired.
    """
    # (first step at which the alert fires, alert text)
    schedule = (
        (4, "payment-service: error rate 2%+ — watching"),
        (6, "payment-service: p99 latency above threshold"),
        (9, "payment-db: query latency 4000ms+ — approaching P1 threshold"),
        (12, "WARNING: payment error rate approaching 5% P1 threshold"),
        (13, "ALERT: P1 threshold BREACHED for payment-service"),
    )
    return [text for first_step, text in schedule if step >= first_step]