Spaces:
Running
Running
Day 4: grader system complete
Browse files- scripts/run_grader.py +131 -0
- server/app.py +9 -7
- server/environment.py +15 -4
- server/graders/__init__.py +50 -0
- server/graders/base_grader.py +68 -0
- server/graders/cascade_grader.py +123 -0
- server/graders/crash_grader.py +110 -0
- server/graders/noise_grader.py +155 -0
- server/models.py +4 -0
- server/scenarios/cascading.py +211 -0
- server/scenarios/silent_degrade.py +132 -0
scripts/run_grader.py
CHANGED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Manual grader testing CLI.
|
| 3 |
+
Run a simulated episode and score it with the official grader.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python scripts/run_grader.py --task single_crash --agent correct
|
| 7 |
+
python scripts/run_grader.py --task cascading_failure --agent wrong
|
| 8 |
+
python scripts/run_grader.py --task silent_degradation --agent correct
|
| 9 |
+
python scripts/run_grader.py --all
|
| 10 |
+
"""
|
| 11 |
+
import argparse
|
| 12 |
+
import sys
|
| 13 |
+
import os
|
| 14 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 15 |
+
|
| 16 |
+
from server.environment import LogTriageEnvironment
|
| 17 |
+
from server.models import TriageAction
|
| 18 |
+
from server.graders import score_episode
|
| 19 |
+
|
| 20 |
+
# ─── CORRECT AGENT SCRIPTS ────────────────────────────────────────────────────
|
| 21 |
+
|
| 22 |
+
CORRECT_ACTIONS = {
|
| 23 |
+
"single_crash": [
|
| 24 |
+
TriageAction(action_type="classify_severity", value="P1", confidence=0.95),
|
| 25 |
+
TriageAction(action_type="identify_root_cause", value="payment-service", confidence=0.90),
|
| 26 |
+
TriageAction(action_type="remediate", value="restart:payment-service", confidence=0.85),
|
| 27 |
+
TriageAction(action_type="resolve", value="resolved", confidence=1.00),
|
| 28 |
+
],
|
| 29 |
+
"cascading_failure": [
|
| 30 |
+
TriageAction(action_type="classify_severity", value="P1", confidence=0.90),
|
| 31 |
+
TriageAction(action_type="identify_root_cause", value="user-db", confidence=0.85),
|
| 32 |
+
TriageAction(action_type="remediate", value="kill-query:user-db", confidence=0.90),
|
| 33 |
+
TriageAction(action_type="resolve", value="resolved", confidence=1.00),
|
| 34 |
+
],
|
| 35 |
+
"silent_degradation": [
|
| 36 |
+
TriageAction(action_type="request_more_logs", value="payment-db", confidence=0.70),
|
| 37 |
+
TriageAction(action_type="classify_severity", value="P2", confidence=0.80),
|
| 38 |
+
TriageAction(action_type="identify_root_cause", value="payment-db", confidence=0.85),
|
| 39 |
+
TriageAction(action_type="remediate", value="flush-cache:payment-db", confidence=0.80),
|
| 40 |
+
TriageAction(action_type="resolve", value="resolved", confidence=1.00),
|
| 41 |
+
],
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
# ─── WRONG AGENT SCRIPTS ──────────────────────────────────────────────────────
|
| 45 |
+
|
| 46 |
+
WRONG_ACTIONS = {
|
| 47 |
+
"single_crash": [
|
| 48 |
+
TriageAction(action_type="classify_severity", value="P3", confidence=0.50),
|
| 49 |
+
TriageAction(action_type="identify_root_cause", value="api-gateway", confidence=0.50),
|
| 50 |
+
TriageAction(action_type="remediate", value="restart:api-gateway", confidence=0.50),
|
| 51 |
+
TriageAction(action_type="resolve", value="resolved", confidence=1.00),
|
| 52 |
+
],
|
| 53 |
+
"cascading_failure": [
|
| 54 |
+
TriageAction(action_type="classify_severity", value="P2", confidence=0.60),
|
| 55 |
+
TriageAction(action_type="identify_root_cause", value="api-gateway", confidence=0.60),
|
| 56 |
+
TriageAction(action_type="remediate", value="restart:api-gateway", confidence=0.60),
|
| 57 |
+
TriageAction(action_type="resolve", value="resolved", confidence=1.00),
|
| 58 |
+
],
|
| 59 |
+
"silent_degradation": [
|
| 60 |
+
TriageAction(action_type="classify_severity", value="P1", confidence=0.90),
|
| 61 |
+
TriageAction(action_type="identify_root_cause", value="api-gateway", confidence=0.70),
|
| 62 |
+
TriageAction(action_type="remediate", value="restart:api-gateway", confidence=0.70),
|
| 63 |
+
TriageAction(action_type="resolve", value="resolved", confidence=1.00),
|
| 64 |
+
],
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def run_test(task_id: str, agent_type: str, seed: int = 42) -> dict:
    """Play one scripted episode and return the grader's result.

    A fresh ``LogTriageEnvironment`` is reset for ``task_id`` with ``seed``.
    The scripted actions are then stepped through until the script is
    exhausted or an observation reports ``done``, and the final environment
    state is passed to ``score_episode``.

    Args:
        task_id: Key into ``CORRECT_ACTIONS`` / ``WRONG_ACTIONS``.
        agent_type: ``"correct"`` selects the correct-agent script; any other
            value selects the wrong-agent script.
        seed: Seed forwarded to ``env.reset``.

    Returns:
        The dict returned by ``score_episode``.
    """
    env = LogTriageEnvironment()
    env.reset(task_id=task_id, seed=seed)

    # Pick the script table first, then the per-task action list.
    script_table = CORRECT_ACTIONS if agent_type == "correct" else WRONG_ACTIONS
    script = script_table[task_id]

    for scripted_action in script:
        # Stop feeding actions as soon as the environment ends the episode.
        if env.step(scripted_action).done:
            break

    return score_episode(task_id, env.state)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def print_result(task_id: str, agent_type: str, result: dict):
    """Print a human-readable summary of one grader result to stdout.

    Args:
        task_id: Task name shown on the ``Task:`` line.
        agent_type: Agent label shown on the ``Agent:`` line.
        result: Mapping that must contain ``"score"``, ``"steps_taken"``,
            ``"max_steps"`` and ``"resolved"``; an optional ``"breakdown"``
            mapping is listed one entry per line.
    """
    final_score = result["score"]
    rule = "=" * 60

    print("\n" + rule)
    print("Task:", task_id)
    print("Agent:", agent_type)
    print(f"Score: {final_score:.4f}")

    steps, step_cap = result["steps_taken"], result["max_steps"]
    print(f"Steps: {steps}/{step_cap}")
    print("Resolved:", result["resolved"])

    print("\nBreakdown:")
    # A missing breakdown simply prints no rows.
    for label, detail in result.get("breakdown", {}).items():
        print(f" {label:<20} {detail}")
    print(rule)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def main():
    """Command-line entry point for manual grader testing.

    With ``--all``, every task is run with both the correct and the wrong
    scripted agent and a comparison table is printed. With ``--task``, one
    task is run with the agent chosen by ``--agent`` and its detailed result
    is printed. With neither flag, the help text is shown.
    """
    task_names = ["single_crash", "cascading_failure", "silent_degradation"]

    parser = argparse.ArgumentParser(description="Test LogTriageEnv graders")
    parser.add_argument("--task", choices=task_names, help="Task to test")
    parser.add_argument(
        "--agent",
        choices=["correct", "wrong"],
        default="correct",
        help="Agent type to simulate",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Run all tasks with both correct and wrong agents",
    )
    args = parser.parse_args()

    # Single-task and no-argument modes are handled first.
    if not args.all:
        if args.task:
            outcome = run_test(args.task, args.agent)
            print_result(args.task, args.agent, outcome)
        else:
            parser.print_help()
        return

    print("\n[TEST] Running all tasks with correct and wrong agents...\n")
    print(f"{'Task':<25} {'Agent':<10} {'Score':<8} {'Variance'}")
    print("-" * 60)
    for task in task_names:
        good_run = run_test(task, "correct")
        bad_run = run_test(task, "wrong")
        good_score = good_run["score"]
        bad_score = bad_run["score"]

        # The grader should separate the two scripted agents by > 0.10.
        delta = good_score - bad_score
        if delta > 0.10:
            status = "[OK]"
        else:
            status = "[LOW]"

        print(f"{task:<25} correct {good_score:.4f}")
        print(f"{task:<25} wrong {bad_score:.4f} delta={delta:.4f} {status}")
        # NOTE(review): blank separator assumed to be per task - confirm
        # against the original file's indentation.
        print()
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
if __name__ == "__main__":
|
| 131 |
+
main()
|
server/app.py
CHANGED
|
@@ -101,13 +101,15 @@ def get_tasks():
|
|
| 101 |
|
| 102 |
@app.post("/grader")
|
| 103 |
def grader():
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
| 111 |
|
| 112 |
|
| 113 |
@app.post("/baseline")
|
|
|
|
| 101 |
|
| 102 |
@app.post("/grader")
def grader():
    """Score the environment's current episode with its task's grader.

    Returns:
        The result dict from ``score_episode`` on success. If a
        ``RuntimeError`` or ``ValueError`` is raised while importing, reading
        the state or scoring, a 400 ``JSONResponse`` whose body is
        ``{"error": <message>}`` is returned instead.
    """
    try:
        # Imported at call time and kept inside the try, as before.
        from server.graders import score_episode

        episode_state = env.state
        return score_episode(episode_state.task_id, episode_state)
    except (RuntimeError, ValueError) as exc:
        # Both error types map to the same 400 payload.
        return JSONResponse(status_code=400, content={"error": str(exc)})
|
| 113 |
|
| 114 |
|
| 115 |
@app.post("/baseline")
|
server/environment.py
CHANGED
|
@@ -15,6 +15,8 @@ from server.models import (
|
|
| 15 |
ServiceStatus,
|
| 16 |
)
|
| 17 |
from server.scenarios import single_crash
|
|
|
|
|
|
|
| 18 |
from server.log_generator import generate_healthy_system_state, _make_timestamp
|
| 19 |
|
| 20 |
# ─── TASK REGISTRY ─────────────────────────────────────────────────────────────
|
|
@@ -77,9 +79,10 @@ class LogTriageEnvironment:
|
|
| 77 |
# Load ground truth for this task
|
| 78 |
if task_id == "single_crash":
|
| 79 |
self._ground_truth = single_crash.GROUND_TRUTH
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
|
|
|
| 83 |
|
| 84 |
# Initialize episode state
|
| 85 |
self._state = EpisodeState(
|
|
@@ -141,6 +144,7 @@ class LogTriageEnvironment:
|
|
| 141 |
self._state.cumulative_score + reward, 4
|
| 142 |
)
|
| 143 |
self._state.actions_taken.append(action.action_type)
|
|
|
|
| 144 |
self._state.step_count += 1
|
| 145 |
|
| 146 |
# Check if episode should end
|
|
@@ -293,13 +297,20 @@ class LogTriageEnvironment:
|
|
| 293 |
"""Get logs and system state for the current step."""
|
| 294 |
if self._task_id == "single_crash":
|
| 295 |
return single_crash.get_step_data(step, self._base_time, self._rng)
|
| 296 |
-
|
|
|
|
|
|
|
|
|
|
| 297 |
return [], generate_healthy_system_state(self._base_time)
|
| 298 |
|
| 299 |
def _get_alerts(self, step: int) -> list[str]:
|
| 300 |
"""Get active alerts for the current step."""
|
| 301 |
if self._task_id == "single_crash":
|
| 302 |
return single_crash.get_active_alerts(step)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
return []
|
| 304 |
|
| 305 |
def _make_obs(
|
|
|
|
| 15 |
ServiceStatus,
|
| 16 |
)
|
| 17 |
from server.scenarios import single_crash
|
| 18 |
+
from server.scenarios import cascading
|
| 19 |
+
from server.scenarios import silent_degrade
|
| 20 |
from server.log_generator import generate_healthy_system_state, _make_timestamp
|
| 21 |
|
| 22 |
# ─── TASK REGISTRY ─────────────────────────────────────────────────────────────
|
|
|
|
| 79 |
# Load ground truth for this task
|
| 80 |
if task_id == "single_crash":
|
| 81 |
self._ground_truth = single_crash.GROUND_TRUTH
|
| 82 |
+
elif task_id == "cascading_failure":
|
| 83 |
+
self._ground_truth = cascading.GROUND_TRUTH
|
| 84 |
+
elif task_id == "silent_degradation":
|
| 85 |
+
self._ground_truth = silent_degrade.GROUND_TRUTH
|
| 86 |
|
| 87 |
# Initialize episode state
|
| 88 |
self._state = EpisodeState(
|
|
|
|
| 144 |
self._state.cumulative_score + reward, 4
|
| 145 |
)
|
| 146 |
self._state.actions_taken.append(action.action_type)
|
| 147 |
+
self._state.action_history.append(action.model_dump())
|
| 148 |
self._state.step_count += 1
|
| 149 |
|
| 150 |
# Check if episode should end
|
|
|
|
| 297 |
"""Get logs and system state for the current step."""
|
| 298 |
if self._task_id == "single_crash":
|
| 299 |
return single_crash.get_step_data(step, self._base_time, self._rng)
|
| 300 |
+
elif self._task_id == "cascading_failure":
|
| 301 |
+
return cascading.get_step_data(step, self._base_time, self._rng)
|
| 302 |
+
elif self._task_id == "silent_degradation":
|
| 303 |
+
return silent_degrade.get_step_data(step, self._base_time, self._rng)
|
| 304 |
return [], generate_healthy_system_state(self._base_time)
|
| 305 |
|
| 306 |
def _get_alerts(self, step: int) -> list[str]:
|
| 307 |
"""Get active alerts for the current step."""
|
| 308 |
if self._task_id == "single_crash":
|
| 309 |
return single_crash.get_active_alerts(step)
|
| 310 |
+
elif self._task_id == "cascading_failure":
|
| 311 |
+
return cascading.get_active_alerts(step)
|
| 312 |
+
elif self._task_id == "silent_degradation":
|
| 313 |
+
return silent_degrade.get_active_alerts(step)
|
| 314 |
return []
|
| 315 |
|
| 316 |
def _make_obs(
|
server/graders/__init__.py
CHANGED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grader registry for LogTriageEnv.
|
| 3 |
+
Maps task_id strings to grader class instances.
|
| 4 |
+
"""
|
| 5 |
+
from server.graders.crash_grader import CrashGrader
|
| 6 |
+
from server.graders.cascade_grader import CascadeGrader
|
| 7 |
+
from server.graders.noise_grader import NoiseGrader
|
| 8 |
+
|
| 9 |
+
# Registry: task_id → grader instance
|
| 10 |
+
GRADER_REGISTRY = {
|
| 11 |
+
"single_crash": CrashGrader(),
|
| 12 |
+
"cascading_failure": CascadeGrader(),
|
| 13 |
+
"silent_degradation": NoiseGrader(),
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def get_grader(task_id: str):
    """Look up the grader registered for ``task_id``.

    Args:
        task_id: Key into ``GRADER_REGISTRY``.

    Returns:
        The grader instance stored in the registry for that task.

    Raises:
        ValueError: If ``task_id`` has no registry entry; the message lists
            the valid task ids.
    """
    grader = GRADER_REGISTRY.get(task_id)
    if grader is not None:
        return grader

    valid_tasks = list(GRADER_REGISTRY.keys())
    raise ValueError(
        f"No grader registered for task '{task_id}'. "
        f"Valid tasks: {valid_tasks}"
    )
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def score_episode(task_id: str, state) -> dict:
    """Grade an episode and package the outcome as a result dict.

    Args:
        task_id: Task whose grader should be used (see ``get_grader``).
        state: Episode state object; this function reads ``episode_id``,
            ``step_count``, ``max_steps`` and ``action_history`` from it.

    Returns:
        A dict with keys ``score``, ``task_id``, ``episode_id``,
        ``steps_taken``, ``max_steps``, ``breakdown`` and ``resolved``.
        ``resolved`` is True when any recorded action has
        ``action_type == "resolve"``.

    Raises:
        ValueError: Propagated from ``get_grader`` for an unknown task.
    """
    grader = get_grader(task_id)
    final_score = grader.score(state)

    # Graders without a get_breakdown() method yield an empty breakdown.
    try:
        fetch_breakdown = grader.get_breakdown
    except AttributeError:
        details = {}
    else:
        details = fetch_breakdown()

    result = {
        "score": final_score,
        "task_id": task_id,
        "episode_id": state.episode_id,
        "steps_taken": state.step_count,
        "max_steps": state.max_steps,
        "breakdown": details,
    }

    was_resolved = False
    for entry in state.action_history:
        if entry.get("action_type") == "resolve":
            was_resolved = True
            break
    result["resolved"] = was_resolved
    return result
|
server/graders/base_grader.py
CHANGED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Abstract base grader interface.
|
| 3 |
+
All task graders must inherit from this and implement score().
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
from abc import ABC, abstractmethod
|
| 7 |
+
from server.models import EpisodeState
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class BaseGrader(ABC):
    """Abstract interface shared by every task grader.

    A grader looks at the finished episode (its ``action_history`` and
    ``step_count``) and turns it into one final score in [0.0, 1.0].
    Subclasses implement ``score()``; the underscore-prefixed helpers below
    cover the common queries against the episode state.
    """

    @abstractmethod
    def score(self, state: EpisodeState) -> float:
        """Return the official score for a completed episode.

        Args:
            state: Final episode state, including the full action history.

        Returns:
            A float in [0.0, 1.0].
        """
        raise NotImplementedError

    def _clamp(self, value: float) -> float:
        """Bound ``value`` to [0.0, 1.0] and round it to four decimals."""
        capped = min(1.0, value)
        floored = max(0.0, capped)
        return round(floored, 4)

    def _get_actions_of_type(
        self, state: EpisodeState, action_type: str
    ) -> list[dict]:
        """Collect, in order, every recorded action with this ``action_type``."""
        matching = []
        for entry in state.action_history:
            if entry.get("action_type") == action_type:
                matching.append(entry)
        return matching

    def _was_action_taken(self, state: EpisodeState, action_type: str) -> bool:
        """Tell whether at least one action of ``action_type`` was recorded."""
        for entry in state.action_history:
            if entry.get("action_type") == action_type:
                return True
        return False

    def _get_first_value(
        self, state: EpisodeState, action_type: str
    ) -> str | None:
        """Return the ``value`` of the earliest matching action, else None."""
        matching = self._get_actions_of_type(state, action_type)
        if not matching:
            return None
        return matching[0].get("value")

    def _episode_resolved(self, state: EpisodeState) -> bool:
        """Tell whether the agent issued an explicit ``resolve`` action."""
        return self._was_action_taken(state, "resolve")

    def _steps_used(self, state: EpisodeState) -> int:
        """Return the episode's recorded step count."""
        return state.step_count
|
server/graders/cascade_grader.py
CHANGED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grader for Task 2 — Cascading Failure (Medium)
|
| 3 |
+
|
| 4 |
+
Scoring breakdown:
|
| 5 |
+
Correct severity (P1) → +0.20
|
| 6 |
+
Correct root cause (user-db) → +0.35
|
| 7 |
+
Correct remediation (kill-query/restart) → +0.25
|
| 8 |
+
Ordering bonus (no symptom fix first) → +0.10
|
| 9 |
+
Speed bonus (resolved ≤ 8 steps) → +0.10
|
| 10 |
+
─────────────────────────────────────────────────
|
| 11 |
+
Maximum possible score → 1.00
|
| 12 |
+
|
| 13 |
+
Penalties:
|
| 14 |
+
Identified symptom as root cause → 0.00 (no credit)
|
| 15 |
+
Remediated symptom service first → -0.10 (ordering penalty)
|
| 16 |
+
Never resolved → -0.10
|
| 17 |
+
"""
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
from server.models import EpisodeState
|
| 20 |
+
from server.graders.base_grader import BaseGrader
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class CascadeGrader(BaseGrader):
    """Official grader for Task 2 - Cascading Failure.

    Points awarded by ``score()``:
        severity     +0.20 for P1, +0.08 for P2
        root cause   +0.35 for user-db, +0.10 for another value containing
                     "db", nothing for a symptom service
        remediation  +0.25 for kill-query/restart on user-db
        ordering     +0.10 if the root cause was fixed without touching a
                     symptom service first, -0.10 if a symptom was touched
        speed        +0.10 if resolved within SPEED_THRESHOLD steps
        resolution   -0.10 if the episode was never resolved
    The sum is clamped to [0.0, 1.0].
    """

    CORRECT_SEVERITY = "P1"
    CORRECT_ROOT_CAUSE = "user-db"
    CORRECT_REMEDIATION_PREFIXES = {"kill-query", "restart"}
    CORRECT_REMEDIATION_SERVICE = "user-db"
    SYMPTOM_SERVICES = {"api-gateway", "auth-service"}  # wrong answers
    MAX_STEPS = 12
    SPEED_THRESHOLD = 8

    def score(self, state: EpisodeState) -> float:
        """Score a completed Task 2 episode.

        Only the first ``classify_severity`` and ``identify_root_cause``
        actions count. Every ``remediate`` action is scanned in order, and
        only values of the form ``"<prefix>:<service>"`` are considered. The
        per-component explanation is stored for ``get_breakdown()``.

        Args:
            state: Final episode state (action history and step count).

        Returns:
            The clamped total in [0.0, 1.0], rounded to four decimals.
        """
        points = 0.0
        notes = {}

        # -- 1. Severity classification -----------------------------------
        severity = self._get_first_value(state, "classify_severity")
        if severity is None:
            notes["severity"] = "+0.00 (never classified)"
        elif severity == self.CORRECT_SEVERITY:
            points += 0.20
            notes["severity"] = "+0.20 (correct: P1)"
        elif severity == "P2":
            points += 0.08
            notes["severity"] = "+0.08 (partial: P2 given, P1 expected)"
        else:
            notes["severity"] = f"+0.00 (wrong: {severity})"

        # -- 2. Root cause identification ---------------------------------
        culprit = self._get_first_value(state, "identify_root_cause")
        if culprit is None:
            notes["root_cause"] = "+0.00 (never identified)"
        elif culprit == self.CORRECT_ROOT_CAUSE:
            points += 0.35
            notes["root_cause"] = "+0.35 (correct: user-db)"
        elif culprit in self.SYMPTOM_SERVICES:
            # Naming a symptom service earns nothing.
            notes["root_cause"] = f"+0.00 (wrong: {culprit} is a symptom, not root cause)"
        elif culprit and "db" in culprit:
            points += 0.10  # right tier (database), wrong specific service
            notes["root_cause"] = f"+0.10 (partial: {culprit}, right tier)"
        else:
            notes["root_cause"] = f"+0.00 (wrong: {culprit})"

        # -- 3. Remediation + ordering evidence ---------------------------
        fixed_root_cause = False
        touched_symptom_first = False
        for attempt in self._get_actions_of_type(state, "remediate"):
            raw = attempt.get("value", "")
            pieces = raw.split(":")
            if len(pieces) != 2:
                continue
            verb, target = pieces

            # Once the correct fix is credited, later remediations change
            # neither the score nor the ordering verdict.
            if fixed_root_cause:
                continue

            if target in self.SYMPTOM_SERVICES:
                touched_symptom_first = True

            if (
                verb in self.CORRECT_REMEDIATION_PREFIXES
                and target == self.CORRECT_REMEDIATION_SERVICE
            ):
                points += 0.25
                notes["remediation"] = f"+0.25 (correct: {raw})"
                fixed_root_cause = True

        if not fixed_root_cause:
            notes["remediation"] = "+0.00 (no correct remediation)"

        # -- 4. Ordering bonus / penalty ----------------------------------
        if touched_symptom_first:
            points -= 0.10
            notes["ordering"] = "-0.10 (remediated symptom service before root cause)"
        elif fixed_root_cause:
            points += 0.10
            notes["ordering"] = "+0.10 (correctly targeted root cause, not symptoms)"

        # -- 5. Speed bonus / unresolved penalty --------------------------
        if not self._episode_resolved(state):
            points -= 0.10
            notes["resolution"] = "-0.10 (never resolved)"
        else:
            steps = self._steps_used(state)
            if steps <= self.SPEED_THRESHOLD:
                points += 0.10
                notes["speed"] = f"+0.10 (resolved in {steps} steps)"
            else:
                notes["speed"] = f"+0.00 (resolved but used {steps} steps)"

        self._breakdown = notes
        return self._clamp(points)

    def get_breakdown(self) -> dict:
        """Return the breakdown saved by the last ``score()`` call, or {}."""
        try:
            return self._breakdown
        except AttributeError:
            return {}
|
server/graders/crash_grader.py
CHANGED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grader for Task 1 — Single Service Crash (Easy)
|
| 3 |
+
|
| 4 |
+
Scoring breakdown:
|
| 5 |
+
Correct severity (P1) → +0.30
|
| 6 |
+
Correct root cause (payment-service) → +0.35
|
| 7 |
+
Correct remediation (restart:payment-*) → +0.25
|
| 8 |
+
Speed bonus (resolved ≤ 5 steps) → +0.10
|
| 9 |
+
─────────────────────────────────────────────────
|
| 10 |
+
Maximum possible score → 1.00
|
| 11 |
+
|
| 12 |
+
Penalties:
|
| 13 |
+
Ignored P1 incident → -0.30 (from base)
|
| 14 |
+
Wrong root cause identified → 0.00 (no credit)
|
| 15 |
+
Never resolved → -0.10
|
| 16 |
+
"""
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
from server.models import EpisodeState
|
| 19 |
+
from server.graders.base_grader import BaseGrader
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class CrashGrader(BaseGrader):
    """Official grader for Task 1 - Single Service Crash.

    Points awarded by ``score()``:
        severity     +0.30 for P1, +0.10 for P2
        root cause   +0.35 for payment-service, +0.10 for another value
                     starting with "payment"
        remediation  +0.25 for restart:payment-service, +0.08 for any other
                     action on payment-service
        speed        +0.10 if resolved within SPEED_THRESHOLD steps
        resolution   -0.10 if the episode was never resolved
        ignore       -0.30 if an "ignore" action was taken
    The sum is clamped to [0.0, 1.0].
    """

    # Ground truth constants
    CORRECT_SEVERITY = "P1"
    CORRECT_ROOT_CAUSE = "payment-service"
    CORRECT_REMEDIATION_PREFIX = "restart"
    CORRECT_REMEDIATION_SERVICE = "payment-service"
    MAX_STEPS = 8
    SPEED_THRESHOLD = 5  # must resolve within this many steps for speed bonus

    def score(self, state: EpisodeState) -> float:
        """Score a completed Task 1 episode.

        Only the first ``classify_severity`` and ``identify_root_cause``
        actions count. For remediation, the first ``remediate`` action of the
        form ``"<prefix>:payment-service"`` decides the credit and ends the
        scan. The per-component explanation is stored for ``get_breakdown()``.

        Args:
            state: Final episode state (action history and step count).

        Returns:
            The clamped total in [0.0, 1.0], rounded to four decimals.
        """
        points = 0.0
        notes = {}

        # -- 1. Severity classification -----------------------------------
        severity = self._get_first_value(state, "classify_severity")
        if severity is None:
            notes["severity"] = "+0.00 (never classified)"
        elif severity == self.CORRECT_SEVERITY:
            points += 0.30
            notes["severity"] = "+0.30 (correct: P1)"
        elif severity == "P2":
            points += 0.10  # partial credit: one level off
            notes["severity"] = "+0.10 (partial: P2 given, P1 expected)"
        else:
            notes["severity"] = f"+0.00 (wrong: {severity})"

        # -- 2. Root cause identification ---------------------------------
        culprit = self._get_first_value(state, "identify_root_cause")
        if culprit is None:
            notes["root_cause"] = "+0.00 (never identified)"
        elif culprit == self.CORRECT_ROOT_CAUSE:
            points += 0.35
            notes["root_cause"] = "+0.35 (correct: payment-service)"
        elif culprit and culprit.startswith("payment"):
            points += 0.10  # partial: right service family
            notes["root_cause"] = f"+0.10 (partial: {culprit}, right family)"
        else:
            notes["root_cause"] = f"+0.00 (wrong: {culprit})"

        # -- 3. Remediation -----------------------------------------------
        remediation_credited = False
        for attempt in self._get_actions_of_type(state, "remediate"):
            raw = attempt.get("value", "")
            pieces = raw.split(":")
            if len(pieces) != 2:
                continue
            verb, target = pieces
            if target != self.CORRECT_REMEDIATION_SERVICE:
                continue

            # First action aimed at the right service settles the credit.
            if verb == self.CORRECT_REMEDIATION_PREFIX:
                points += 0.25
                notes["remediation"] = f"+0.25 (correct: {raw})"
            else:
                points += 0.08  # right service, wrong action type
                notes["remediation"] = "+0.08 (partial: right service, wrong action)"
            remediation_credited = True
            break

        if not remediation_credited:
            notes["remediation"] = "+0.00 (no correct remediation)"

        # -- 4. Speed bonus / unresolved penalty --------------------------
        if not self._episode_resolved(state):
            points -= 0.10  # penalty for not resolving
            notes["resolution"] = "-0.10 (never resolved)"
        else:
            steps = self._steps_used(state)
            if steps <= self.SPEED_THRESHOLD:
                points += 0.10
                notes["speed"] = f"+0.10 (resolved in {steps} steps)"
            else:
                notes["speed"] = f"+0.00 (resolved but slow: {steps} steps)"

        # -- 5. Ignore penalty --------------------------------------------
        if self._was_action_taken(state, "ignore"):
            points -= 0.30
            notes["ignore_penalty"] = "-0.30 (ignored P1 incident)"

        self._breakdown = notes
        return self._clamp(points)

    def get_breakdown(self) -> dict:
        """Return the breakdown saved by the last ``score()`` call, or {}."""
        try:
            return self._breakdown
        except AttributeError:
            return {}
|
server/graders/noise_grader.py
CHANGED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grader for Task 3 — Silent Degradation with Noise (Hard)
|
| 3 |
+
|
| 4 |
+
Scoring breakdown:
|
| 5 |
+
Correct severity (P2) → +0.30
|
| 6 |
+
Correct root cause (payment-db) → +0.30
|
| 7 |
+
Correct remediation (flush-cache/kill-q) → +0.20
|
| 8 |
+
No spurious actions on noise services → +0.10
|
| 9 |
+
Speed bonus (resolved ≤ 10 steps) → +0.10
|
| 10 |
+
──────────────────────────────────────────────────
|
| 11 |
+
Maximum possible score → 1.00
|
| 12 |
+
|
| 13 |
+
Penalties:
|
| 14 |
+
Over-escalation (classified P1) → -0.15
|
| 15 |
+
Under-escalation (classified P3) → -0.15
|
| 16 |
+
Remediated wrong service → 0.00 (no credit)
|
| 17 |
+
Never resolved/escalated → -0.10
|
| 18 |
+
"""
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
from server.models import EpisodeState
|
| 21 |
+
from server.graders.base_grader import BaseGrader
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class NoiseGrader(BaseGrader):
    """Official grader for Task 3 — Silent Degradation with Noise.

    The class attributes hold the expected answers (severity, root cause,
    remediation prefixes/service, escalation team), the set of services that
    count as noise, and the step threshold used for the speed bonus.
    """

    CORRECT_SEVERITY = "P2"
    CORRECT_ROOT_CAUSE = "payment-db"
    CORRECT_REMEDIATION_PREFIXES = {"flush-cache", "kill-query"}
    CORRECT_REMEDIATION_SERVICE = "payment-db"
    CORRECT_ESCALATION_TEAMS = {"dba-team"}
    NOISE_SERVICES = {
        "api-gateway", "auth-service", "user-db",
        "notification-service", "email-queue"
    }  # irrelevant to this scenario
    MAX_STEPS = 15
    SPEED_THRESHOLD = 10

    @staticmethod
    def _action_value(action: dict) -> str:
        """Return the action's ``value`` if it is a string, otherwise ``""``.

        ``dict.get("value", "")`` still yields ``None`` when the key exists
        with a null value, which would break the string parsing in ``score``.
        """
        value = action.get("value")
        return value if isinstance(value, str) else ""

    def score(self, state: EpisodeState) -> float:
        """
        Score the completed Task 3 episode.

        Adds or subtracts credit for severity, root cause, remediation (or
        escalation), noise filtering and speed. A human-readable explanation
        of every component is stored in ``self._breakdown``.

        Args:
            state: Episode state; ``state.action_history`` is read as a list of
                dicts with ``action_type`` and ``value`` keys.

        Returns:
            The accumulated total after being passed through ``self._clamp``.
        """
        total = 0.0
        breakdown = {}

        # -- 1. Severity classification ---------------------------------------
        severity_value = self._get_first_value(state, "classify_severity")
        if severity_value == self.CORRECT_SEVERITY:
            total += 0.30
            breakdown["severity"] = "+0.30 (correct: P2)"
        elif severity_value == "P1":
            total -= 0.15
            breakdown["severity"] = "-0.15 (over-escalation: P1 given, P2 expected — no outage yet)"
        elif severity_value == "P3":
            total -= 0.15
            breakdown["severity"] = "-0.15 (under-escalation: P3 given, P2 expected — trend is serious)"
        elif severity_value is None:
            breakdown["severity"] = "+0.00 (never classified)"
        else:
            breakdown["severity"] = f"+0.00 (wrong: {severity_value})"

        # -- 2. Root cause identification -------------------------------------
        root_cause_value = self._get_first_value(state, "identify_root_cause")
        if root_cause_value == self.CORRECT_ROOT_CAUSE:
            total += 0.30
            breakdown["root_cause"] = "+0.30 (correct: payment-db)"
        elif root_cause_value == "payment-service":
            total += 0.10  # close — right payment tier, wrong component
            breakdown["root_cause"] = "+0.10 (partial: payment-service, but root is payment-db)"
        elif root_cause_value in self.NOISE_SERVICES:
            breakdown["root_cause"] = f"+0.00 (wrong: {root_cause_value} is a noise service)"
        elif root_cause_value is None:
            breakdown["root_cause"] = "+0.00 (never identified)"
        else:
            breakdown["root_cause"] = f"+0.00 (wrong: {root_cause_value})"

        # -- 3. Remediation ---------------------------------------------------
        # The first remediation aimed at the correct service decides the credit;
        # remediations of other services or malformed values are skipped here.
        remediation_scored = False
        for action in self._get_actions_of_type(state, "remediate"):
            value = self._action_value(action)
            parts = value.split(":")
            if len(parts) != 2:
                continue
            prefix, service = parts
            if service != self.CORRECT_REMEDIATION_SERVICE:
                continue
            if prefix in self.CORRECT_REMEDIATION_PREFIXES:
                total += 0.20
                breakdown["remediation"] = f"+0.20 (correct: {value})"
            else:
                total += 0.05  # right service, suboptimal action
                breakdown["remediation"] = "+0.05 (partial: right service, suboptimal action)"
            remediation_scored = True
            break

        # Also accept correct escalation to dba-team as valid resolution
        if not remediation_scored:
            for action in self._get_actions_of_type(state, "escalate"):
                if self._action_value(action) in self.CORRECT_ESCALATION_TEAMS:
                    total += 0.15  # escalation is slightly less credit than direct fix
                    breakdown["remediation"] = "+0.15 (escalated to dba-team — acceptable)"
                    remediation_scored = True
                    break

        if not remediation_scored:
            breakdown["remediation"] = "+0.00 (no correct remediation or escalation)"

        # -- 4. Noise filtering bonus -----------------------------------------
        # Count actions that target noise services or unrelated teams.
        spurious_actions = 0
        for action in state.action_history:
            action_type = action.get("action_type")
            value = self._action_value(action)
            if action_type == "identify_root_cause" and value in self.NOISE_SERVICES:
                spurious_actions += 1
            elif action_type == "remediate":
                service = value.split(":")[-1] if ":" in value else ""
                if service in self.NOISE_SERVICES:
                    spurious_actions += 1
            elif (
                action_type == "escalate"
                and value not in self.CORRECT_ESCALATION_TEAMS
                and value != "sre-team"
            ):
                spurious_actions += 1

        if spurious_actions == 0:
            total += 0.10
            breakdown["noise_filtering"] = "+0.10 (no spurious actions on noise services)"
        elif spurious_actions == 1:
            breakdown["noise_filtering"] = f"+0.00 ({spurious_actions} spurious action)"
        else:
            total -= 0.05
            breakdown["noise_filtering"] = f"-0.05 ({spurious_actions} spurious actions — poor noise filtering)"

        # -- 5. Speed bonus ---------------------------------------------------
        if self._episode_resolved(state) or remediation_scored:
            steps_used = self._steps_used(state)
            if steps_used <= self.SPEED_THRESHOLD:
                total += 0.10
                breakdown["speed"] = f"+0.10 (acted within {steps_used} steps)"
            else:
                breakdown["speed"] = f"+0.00 (acted but used {steps_used} steps)"
        else:
            total -= 0.10
            breakdown["resolution"] = "-0.10 (never acted on the degradation)"

        self._breakdown = breakdown
        return self._clamp(total)

    def get_breakdown(self) -> dict:
        """Return the breakdown stored by the last ``score()`` call, or ``{}`` if none exists."""
        return getattr(self, "_breakdown", {})
|
server/models.py
CHANGED
|
@@ -206,6 +206,10 @@ class EpisodeState(BaseModel):
|
|
| 206 |
default_factory=list,
|
| 207 |
description="List of action_type values taken so far this episode"
|
| 208 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
correct_severity: Optional[str] = Field(
|
| 210 |
None,
|
| 211 |
description="Whether agent has correctly classified severity yet"
|
|
|
|
| 206 |
default_factory=list,
|
| 207 |
description="List of action_type values taken so far this episode"
|
| 208 |
)
|
| 209 |
+
action_history: list[dict] = Field(
|
| 210 |
+
default_factory=list,
|
| 211 |
+
description="Full action objects taken this episode (for grader evaluation)"
|
| 212 |
+
)
|
| 213 |
correct_severity: Optional[str] = Field(
|
| 214 |
None,
|
| 215 |
description="Whether agent has correctly classified severity yet"
|
server/scenarios/cascading.py
CHANGED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Task 2 — Cascading Failure (Medium)

Scenario: user-db develops a slow query that exhausts the auth-service connection pool,
which then causes the api-gateway to return timeouts to all users.

Surface logs show gateway errors most loudly (symptom), but root cause is hidden (user-db).
Agent must trace backward through the cascade chain — NOT treat symptoms as root cause.

Ground truth:
- severity: P1
- root_cause: user-db
- remediation: kill-query:user-db OR restart:user-db
- correct_teams: dba-team, sre-team
- noise_ratio: 30%
"""
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
import random
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
from server.models import LogLine, ServiceStatus
|
| 21 |
+
from server.log_generator import (
|
| 22 |
+
generate_log_batch,
|
| 23 |
+
generate_healthy_system_state,
|
| 24 |
+
_make_timestamp,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
# βββ GROUND TRUTH βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
|
| 29 |
+
# Expected answers and tuning knobs for the cascading-failure scenario.
GROUND_TRUTH = {
    "severity": "P1",
    "root_cause": "user-db",
    "remediation_prefixes": {"kill-query", "restart"},
    "remediation_service": "user-db",
    "correct_teams": {"dba-team", "sre-team"},
    "max_steps": 12,
    "noise_ratio": 0.30,
}

# --- STEP-BY-STEP SIGNAL PLAN -------------------------------------------------
# Cascade chain: user-db slow query → auth-service pool exhausted → api-gateway timeouts
# Steps 0-1: Gateway errors surface (symptoms only — most visible)
# Steps 2-3: Auth-service DB pressure becomes visible
# Steps 4-5: user-db slow queries exposed; circuit breaker opens
# Steps 6-7: Full cascade — all 3 services degraded/down
# Steps 8-11: Escalating alerts; root cause becomes unmistakable

# One list per step; each entry is a (service, level, message) tuple.
STEP_SIGNALS = [
    # Step 0: Gateway errors first to appear (surface symptom)
    [
        ("api-gateway", "ERROR", "upstream timeout from auth-service: 5002ms"),
        ("api-gateway", "WARN", "error rate: 8.3% on /auth/* routes"),
    ],
    # Step 1: More gateway errors; first hints of auth-service pressure
    [
        ("api-gateway", "ERROR", "upstream timeout from auth-service: 30007ms"),
        ("api-gateway", "WARN", "error rate: 15.7% — auth-service latency climbing"),
    ],
    # Step 2: Auth-service connection pool pressure visible
    [
        ("auth-service", "WARN", "db connection pool at 42/50 — pressure building"),
        ("api-gateway", "ERROR", "upstream timeout from auth-service: 30005ms"),
        ("auth-service", "ERROR", "db query timeout: SELECT session WHERE user_id=? [5001ms]"),
    ],
    # Step 3: Auth-service pool nearly exhausted
    [
        ("auth-service", "ERROR", "db connection pool EXHAUSTED (50/50) — blocking new requests"),
        ("api-gateway", "ERROR", "auth-service unavailable: connection pool full"),
        ("auth-service", "WARN", "request queue depth: 127 — approaching overflow"),
    ],
    # Step 4: user-db slow query finally exposed
    [
        ("user-db", "WARN", "slow query detected: SELECT * FROM sessions WHERE user_id=? [2847ms]"),
        ("auth-service", "ERROR", "db connection timeout after 5000ms — query hanging"),
        ("user-db", "ERROR", "lock wait timeout: session table — blocking reads"),
    ],
    # Step 5: user-db circuit breaker opens; auth-service starts failing fast
    [
        ("user-db", "WARN", "slow query: 4500ms — circuit breaker approaching threshold"),
        ("auth-service", "ERROR", "circuit breaker OPEN for user-db: latency exceeded 5000ms"),
        ("api-gateway", "ERROR", "all /auth/* requests failing — upstream unavailable"),
    ],
    # Step 6: Full cascade — all 3 services degraded
    [
        ("api-gateway", "ERROR", "error rate: 67.4% — multiple upstreams timing out"),
        ("auth-service", "ERROR", "health check FAILED: cannot reach user-db"),
        ("user-db", "ERROR", "connection pool saturated: 95/100 connections in use"),
    ],
    # Step 7: api-gateway now fully symptomatic
    [
        ("api-gateway", "FATAL", "SLA breach: /auth endpoint availability < 95%"),
        ("auth-service", "ERROR", "auth-service DOWN: 3/3 health checks failed"),
        ("user-db", "WARN", "slow query count: 847 in last 60s — severe degradation"),
    ],
    # Step 8: Database fully exposed as root cause
    [
        ("user-db", "ERROR", "CRITICAL: user-db query latency 8000ms+ — active sessions timing out"),
        ("auth-service", "ERROR", "rejected: user-db connection pool exhausted"),
        ("api-gateway", "ERROR", "user-auth endpoint returning 503 — cascade failure"),
    ],
    # Step 9: Escalating
    [
        ("user-db", "FATAL", "user-db DOWN: connection pool 100/100 — no connections available"),
        ("api-gateway", "ERROR", "error rate: 89.2% — auth-service and user-db both unreachable"),
    ],
    # Step 10: Critical
    [
        ("api-gateway", "FATAL", "CRITICAL: auth-service DOWN for 90s — 100% of login attempts failing"),
        ("user-db", "ERROR", "lock contention: session table fully locked — queries timing out"),
    ],
    # Step 11: Maximum severity
    [
        ("user-db", "FATAL", "user-db unresponsive for 180s — database crisis"),
        ("api-gateway", "FATAL", "SLA_BREACH: auth availability 0% — complete user-auth outage"),
    ],
]
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def get_system_state(step: int, base_time: datetime) -> dict[str, ServiceStatus]:
    """Build the per-service status snapshot for ``step`` of the cascade.

    Starts from ``generate_healthy_system_state(base_time)`` and overrides the
    entries of the services affected in the current phase
    (user-db -> auth-service -> api-gateway).

    Args:
        step: Episode step; selects which degradation phase applies.
        base_time: Episode start time; the snapshot timestamp is derived from
            it via ``_make_timestamp(base_time, step * 30)``.

    Returns:
        Mapping of service name to its ``ServiceStatus`` for this step.
    """
    now = _make_timestamp(base_time, step * 30)
    state = generate_healthy_system_state(base_time)

    # Each phase: (last step it covers, {service: (status, error_rate, latency_p99_ms)}).
    phases = (
        # Gateway just starting to see issues
        (1, {"api-gateway": ("degraded", 0.083, 2500)}),
        # Auth-service pool pressure
        (3, {
            "api-gateway": ("degraded", 0.157, 5000),
            "auth-service": ("degraded", 0.15, 5000),
        }),
        # user-db slow queries exposed
        (5, {
            "api-gateway": ("degraded", 0.45, 8000),
            "auth-service": ("down", 0.85, 10000),
            "user-db": ("degraded", 0.30, 4500),
        }),
        # Full cascade
        (7, {
            "api-gateway": ("down", 0.89, 10000),
            "auth-service": ("down", 0.95, 10000),
            "user-db": ("down", 0.50, 8000),
        }),
    )
    # Maximum severity: applies to every step beyond the last phase above.
    worst_case = {
        "api-gateway": ("down", 0.99, 10000),
        "auth-service": ("down", 1.0, 10000),
        "user-db": ("down", 0.75, 10000),
    }

    overrides = next(
        (services for last_step, services in phases if step <= last_step),
        worst_case,
    )
    for service_name, (status, error_rate, latency) in overrides.items():
        state[service_name] = ServiceStatus(
            name=service_name,
            status=status,
            error_rate=error_rate,
            latency_p99_ms=latency,
            last_updated=now,
        )

    return state
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def get_step_data(step: int, base_time: datetime, rng: random.Random) -> tuple[list[LogLine], dict[str, ServiceStatus]]:
    """
    Produce the log batch and system state for one step of the scenario.

    The scripted signals for ``step`` are taken from ``STEP_SIGNALS``; once
    ``step`` runs past the end of the script the last entry is reused.

    Args:
        step: Episode step to generate data for.
        base_time: Episode start time forwarded to the generators.
        rng: Random source forwarded to ``generate_log_batch``.

    Returns:
        ``(logs, system_state)`` for the given step.
    """
    last_scripted = len(STEP_SIGNALS) - 1
    batch_kwargs = dict(
        scenario_signals=STEP_SIGNALS[min(step, last_scripted)],
        step=step,
        base_time=base_time,
        noise_ratio=GROUND_TRUTH["noise_ratio"],
        batch_size=10,
        rng=rng,
    )
    log_lines = generate_log_batch(**batch_kwargs)
    return log_lines, get_system_state(step, base_time)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def get_active_alerts(step: int) -> list[str]:
    """Return the alerts that are active at ``step``, in order of first appearance.

    An alert becomes active once ``step`` reaches its threshold and stays
    active for all later steps.

    Args:
        step: Episode step.

    Returns:
        Alert messages whose threshold is less than or equal to ``step``.
    """
    # (first step at which the alert fires, alert text)
    alert_schedule = (
        (0, "api-gateway: elevated error rate on /auth/* routes"),
        (2, "auth-service: db connection pool pressure"),
        (4, "user-db: slow queries detected — latency 2000ms+"),
        (5, "auth-service: circuit breaker OPEN for user-db"),
        (6, "SLA_BREACH: /auth availability < 90%"),
        (8, "CRITICAL: user-db connection pool saturated"),
        (10, "CRITICAL: full auth cascade failure — P1 incident"),
    )
    return [message for first_step, message in alert_schedule if step >= first_step]
|
server/scenarios/silent_degrade.py
CHANGED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Task 3 — Silent Degradation with Noise (Hard)

Scenario: payment-db query latency slowly increases over time.
No service crashes. Error rate stays below P1 threshold (5%).
60% of logs are irrelevant noise from unrelated services.
Agent must filter noise, identify subtle signal, classify as P2 (NOT P1, NOT P3).

Ground truth:
- severity: P2 (nuanced — trending toward breach but no hard outage yet)
- root_cause: payment-db
- remediation: flush-cache:payment-db OR kill-query:payment-db
- correct_teams: dba-team
- noise_ratio: 60% (hardest noise ratio of all tasks)
"""
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
import random
|
| 18 |
+
from datetime import datetime
|
| 19 |
+
from server.models import LogLine, ServiceStatus
|
| 20 |
+
from server.log_generator import (
|
| 21 |
+
generate_log_batch,
|
| 22 |
+
generate_healthy_system_state,
|
| 23 |
+
_make_timestamp,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# Ground Truth
|
| 27 |
+
|
| 28 |
+
# Expected answers and tuning knobs for the silent-degradation scenario.
GROUND_TRUTH = {
    "severity": "P2",
    "root_cause": "payment-db",
    "remediation_prefixes": {"flush-cache", "kill-query"},
    "remediation_service": "payment-db",
    "correct_teams": {"dba-team"},
    "max_steps": 15,
    "noise_ratio": 0.60,
}

# Step signals: one list per step, each entry a (service, level, message) tuple.
STEP_SIGNALS = [
    # Step 0: Very subtle
    [("payment-db", "WARN", "payment-db: query latency elevated 450ms (baseline: 12ms)")],
    # Step 1
    [("payment-db", "WARN", "payment-db: query latency 620ms")],
    # Step 2
    [("payment-db", "WARN", "payment-db: slow query: SELECT * FROM transactions WHERE user_id=? [890ms]")],
    # Step 3
    [("payment-db", "WARN", "payment-db: buffer cache hit ratio dropping: 89% to 71%")],
    # Step 4
    [
        ("payment-db", "WARN", "payment-db: query latency 1200ms"),
        ("payment-service", "WARN", "payment-service: error rate 2.1%"),
    ],
    # Step 5
    [("payment-db", "WARN", "payment-db: buffer cache hit ratio 54% — cache thrashing")],
    # Step 6
    [("payment-db", "WARN", "payment-db: slow query: SELECT * FROM transactions [2200ms]")],
    # Step 7
    [
        ("payment-db", "WARN", "payment-db: query latency 2800ms"),
        ("payment-service", "ERROR", "payment-service: 3.4% error rate"),
    ],
    # Step 8
    [("payment-db", "ERROR", "payment-db: slow query timeout: [3100ms] — query cancelled")],
    # Step 9
    [("payment-db", "WARN", "payment-db: query latency 4200ms — cache fully cold")],
    # Step 10
    [("payment-db", "ERROR", "payment-db: query latency 4500ms")],
    # Step 11
    [("payment-db", "WARN", "payment-db: buffer pool pages: 94% dirty")],
    # Step 12
    [
        ("payment-db", "ERROR", "payment-db: query latency 4600ms — timeouts beginning"),
        ("payment-service", "ERROR", "payment-service: error rate 4.9%"),
    ],
    # Step 13: P1 breached
    [
        ("payment-db", "ERROR", "payment-db: CRITICAL query latency 4950ms — P1 breached"),
        ("payment-service", "ERROR", "payment-service: error rate 5.1% — P1 exceeded"),
    ],
    # Step 14: Worst case
    [
        ("payment-db", "FATAL", "payment-db: query latency 5000ms+ — connection pool exhausted"),
        ("payment-service", "FATAL", "payment-service: P1 CRITICAL — 6.2% error rate"),
    ],
]
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def get_system_state(step: int, base_time: datetime) -> dict[str, ServiceStatus]:
    """Build the per-service status snapshot for ``step`` of the slow degradation.

    Starts from ``generate_healthy_system_state(base_time)`` and overrides the
    ``payment-db`` and ``payment-service`` entries. The payment-service figures
    are derived from the payment-db figures for the same step.

    Args:
        step: Episode step; steps past the end of the profile reuse its last row.
        base_time: Episode start time; the snapshot timestamp is derived from
            it via ``_make_timestamp(base_time, step * 30)``.

    Returns:
        Mapping of service name to its ``ServiceStatus`` for this step.
    """
    now = _make_timestamp(base_time, step * 30)
    state = generate_healthy_system_state(base_time)

    # payment-db profile, one row per step: (latency_p99_ms, error_rate)
    db_profile = (
        (450, 0.0),
        (620, 0.005),
        (890, 0.01),
        (1200, 0.021),
        (1400, 0.021),
        (1800, 0.025),
        (2200, 0.028),
        (2800, 0.034),
        (3100, 0.038),
        (4200, 0.042),
        (4500, 0.047),
        (4600, 0.049),
        (4600, 0.049),
        (4950, 0.051),
        (5000, 0.062),
    )
    db_latency, db_error = db_profile[min(step, len(db_profile) - 1)]

    if step < 3:
        db_status = "up"
    else:
        db_status = "degraded"
    state["payment-db"] = ServiceStatus(
        name="payment-db",
        status=db_status,
        error_rate=db_error,
        latency_p99_ms=db_latency,
        last_updated=now,
    )

    if step >= 4:
        psvc_status = "degraded"
    else:
        psvc_status = "up"
    state["payment-service"] = ServiceStatus(
        name="payment-service",
        status=psvc_status,
        error_rate=min(0.10, db_error * 0.8),
        latency_p99_ms=min(5000, 340 + db_latency // 2),
        last_updated=now,
    )
    return state
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def get_step_data(step: int, base_time: datetime, rng: random.Random) -> tuple[list[LogLine], dict[str, ServiceStatus]]:
    """
    Produce the log batch and system state for one step of the scenario.

    The scripted signals for ``step`` are taken from ``STEP_SIGNALS``; once
    ``step`` runs past the end of the script the last entry is reused.

    Args:
        step: Episode step to generate data for.
        base_time: Episode start time forwarded to the generators.
        rng: Random source forwarded to ``generate_log_batch``.

    Returns:
        ``(logs, system_state)`` for the given step.
    """
    last_scripted = len(STEP_SIGNALS) - 1
    batch_kwargs = dict(
        scenario_signals=STEP_SIGNALS[min(step, last_scripted)],
        step=step,
        base_time=base_time,
        noise_ratio=GROUND_TRUTH["noise_ratio"],
        batch_size=12,
        rng=rng,
    )
    log_lines = generate_log_batch(**batch_kwargs)
    return log_lines, get_system_state(step, base_time)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def get_active_alerts(step: int) -> list[str]:
    """Return the alerts that are active at ``step``, in order of first appearance.

    An alert becomes active once ``step`` reaches its threshold and stays
    active for all later steps; no alert fires before step 4.

    Args:
        step: Episode step.

    Returns:
        Alert messages whose threshold is less than or equal to ``step``.
    """
    # (first step at which the alert fires, alert text)
    alert_schedule = (
        (4, "payment-service: error rate 2%+ — watching"),
        (6, "payment-service: p99 latency above threshold"),
        (9, "payment-db: query latency 4000ms+ — approaching P1 threshold"),
        (12, "WARNING: payment error rate approaching 5% P1 threshold"),
        (13, "ALERT: P1 threshold BREACHED for payment-service"),
    )
    return [message for first_step, message in alert_schedule if step >= first_step]
|