Spaces:
Sleeping
Sleeping
Initial commit (from agent)
Browse files
- Dockerfile +3 -5
- README.md +201 -6
- graders.py +111 -24
- inference.py +118 -149
- openenv.yaml +11 -8
- pyproject.toml +10 -1
- requirements.txt +3 -1
- server/__init__.py +2 -0
- server/app.py +486 -86
- server/environment.py +207 -195
- server/models.py +70 -11
- tasks.py +857 -371
- uv.lock +0 -0
Dockerfile
CHANGED
|
@@ -1,14 +1,12 @@
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
| 3 |
-
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
| 4 |
-
|
| 5 |
WORKDIR /app
|
| 6 |
|
| 7 |
-
COPY
|
| 8 |
-
RUN
|
| 9 |
|
| 10 |
COPY . .
|
| 11 |
|
| 12 |
EXPOSE 7860
|
| 13 |
|
| 14 |
-
CMD ["
|
|
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
|
|
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
|
| 8 |
COPY . .
|
| 9 |
|
| 10 |
EXPOSE 7860
|
| 11 |
|
| 12 |
+
CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,205 @@
|
|
| 1 |
---
|
| 2 |
-
title: Cloud Incident Response
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Cloud Incident Response OpenEnv
|
| 3 |
+
emoji: 🚨
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
pinned: false
|
| 9 |
+
tags:
|
| 10 |
+
- openenv
|
| 11 |
+
- sre
|
| 12 |
+
- cloud
|
| 13 |
+
- incident-response
|
| 14 |
+
- devops
|
| 15 |
+
- real-world
|
| 16 |
+
- agentic
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
# Cloud Incident Response — OpenEnv Environment
|
| 20 |
+
|
| 21 |
+
An OpenEnv environment for training and evaluating AI agents on **cloud SRE incident response** — the real-world on-call workflow that engineers at every cloud company perform daily.
|
| 22 |
+
|
| 23 |
+
Unlike Kubernetes operations environments, this one focuses on **cross-service cascading failures** in distributed microservice architectures — connection pool exhaustion, CDN cache storms, OOM kills, and BGP network partitions.
|
| 24 |
+
|
| 25 |
+
## Why This Environment
|
| 26 |
+
|
| 27 |
+
Every cloud company employs SREs who respond to production incidents under time pressure with incomplete information. This environment simulates the exact decision loop:
|
| 28 |
+
|
| 29 |
+
1. **Triage** — Read alert, assess blast radius, classify severity (P1–P4)
|
| 30 |
+
2. **Investigate** — Query logs, metrics, dependencies, recent deploys
|
| 31 |
+
3. **Diagnose** — Correlate signals across services to find the root cause
|
| 32 |
+
4. **Remediate** — Execute the correct runbook steps in the right sequence
|
| 33 |
+
5. **Document** — Submit a resolution summary for post-incident review
|
| 34 |
+
|
| 35 |
+
Agents trained here learn the same skills a human SRE uses: service dependency traversal, log correlation, cascading failure analysis, and targeted remediation.
|
| 36 |
+
|
| 37 |
+
## Tasks
|
| 38 |
+
|
| 39 |
+
| Task ID | Difficulty | Max Steps | What the Agent Does |
|
| 40 |
+
|---|---|---|---|
|
| 41 |
+
| `alert_classification` | Easy | 3 | Classify alert severity (P1–P4) from metrics and symptoms |
|
| 42 |
+
| `root_cause_analysis` | Medium | 10 | Trace logs/metrics/deps to find root cause service and failure mode |
|
| 43 |
+
| `remediation_planning` | Hard | 15 | Diagnose, remediate, and document full incident resolution |
|
| 44 |
+
|
| 45 |
+
### Scenarios
|
| 46 |
+
|
| 47 |
+
| ID | Incident Type | Root Cause | Failure Pattern |
|
| 48 |
+
|---|---|---|---|
|
| 49 |
+
| AC-001 | DB connection pool exhaustion | postgres-db / auth-service deploy | api-gateway → auth-service → postgres-db cascade |
|
| 50 |
+
| AC-002 | CDN cache invalidation storm | cdn-edge purge cronjob misconfigured | 40× origin traffic spike |
|
| 51 |
+
| RCA-001 | Postgres OOM kill | analytics-service unbounded query | Kernel OOM → DB crash loop → all dependents down |
|
| 52 |
+
| RCA-002 | BGP network partition | network-infra config change | Route withdrawal → AZ isolation → 61% checkout failures |
|
| 53 |
+
| RP-001 | Full OOM remediation | analytics-service | Disable job → restart DB → restore services → document |
|
| 54 |
+
| RP-002 | Full BGP remediation | network-infra | Restore routes → rollback config → verify recovery → document |
|
| 55 |
+
|
| 56 |
+
## Action Space
|
| 57 |
+
|
| 58 |
+
**Diagnostic actions** (gather evidence):
|
| 59 |
+
```json
|
| 60 |
+
{"action_type": "query_logs", "parameters": {"service": "postgres-db"}}
|
| 61 |
+
{"action_type": "check_metrics", "parameters": {"service": "auth-service"}}
|
| 62 |
+
{"action_type": "check_dependencies", "parameters": {"service": "api-gateway"}}
|
| 63 |
+
{"action_type": "check_recent_deploys", "parameters": {"service": "analytics-service"}}
|
| 64 |
+
{"action_type": "check_service_status", "parameters": {"service": "payment-service"}}
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
**Remediation actions** (fix the incident):
|
| 68 |
+
```json
|
| 69 |
+
{"action_type": "restart_service", "parameters": {"service": "postgres-db"}}
|
| 70 |
+
{"action_type": "rollback_deploy", "parameters": {"service": "network-infra", "target_version": "previous"}}
|
| 71 |
+
{"action_type": "scale_service", "parameters": {"service": "image-service", "replicas": 10}}
|
| 72 |
+
{"action_type": "disable_feature_flag", "parameters": {"flag": "full_history_export"}}
|
| 73 |
+
{"action_type": "execute_runbook_step", "parameters": {"runbook_action": "restore_bgp_routes"}}
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
**Submission actions** (end the episode):
|
| 77 |
+
```json
|
| 78 |
+
{"action_type": "submit_severity", "parameters": {"severity": "P1", "service": "postgres-db"}}
|
| 79 |
+
{"action_type": "submit_root_cause", "parameters": {"service": "analytics-service", "failure_mode": "unbounded query OOM killing postgres-db"}}
|
| 80 |
+
{"action_type": "submit_resolution", "parameters": {"summary": "Disabled analytics job, restarted postgres-db..."}}
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
## Observation Space
|
| 84 |
+
|
| 85 |
+
| Field | Type | Description |
|
| 86 |
+
|---|---|---|
|
| 87 |
+
| `episode_id` | string | Unique episode UUID |
|
| 88 |
+
| `task_id` | string | Active task |
|
| 89 |
+
| `scenario_id` | string | Scenario (e.g. `AC-001`) |
|
| 90 |
+
| `step_count` / `max_steps` | int | Current step and budget |
|
| 91 |
+
| `incident_summary` | string | Plain-text incident description |
|
| 92 |
+
| `alert` | dict | Alert payload with severity, symptoms, affected services |
|
| 93 |
+
| `available_actions` | list[str] | Valid action types for this task |
|
| 94 |
+
| `queried_data` | dict | All tool responses gathered so far |
|
| 95 |
+
| `known_services` | list[str] | Exact service names to use in actions |
|
| 96 |
+
| `cumulative_reward` | float | Running reward total |
|
| 97 |
+
| `done` | bool | Episode terminal flag |
|
| 98 |
+
| `feedback` | string | Per-step feedback string |
|
| 99 |
+
|
| 100 |
+
## Reward Function
|
| 101 |
+
|
| 102 |
+
Dense reward shaping throughout the trajectory:
|
| 103 |
+
|
| 104 |
+
| Event | Reward |
|
| 105 |
+
|---|---|
|
| 106 |
+
| Query known service (first time) | +0.05 |
|
| 107 |
+
| Query known service (repeat) | +0.01 |
|
| 108 |
+
| Query unknown service | −0.05 |
|
| 109 |
+
| Correct remediation action | +0.10 |
|
| 110 |
+
| Wrong remediation action | −0.10 |
|
| 111 |
+
| Step past halfway (non-submit) | −0.02 |
|
| 112 |
+
| Timeout without submission | −0.10 |
|
| 113 |
+
| Grader score (terminal step) | 0.0–1.0 |
|
| 114 |
+
|
| 115 |
+
**Grader scoring** (deterministic, via `GET /grader`):
|
| 116 |
+
|
| 117 |
+
| Task | Scoring Logic |
|
| 118 |
+
|---|---|
|
| 119 |
+
| `alert_classification` | 1.0 exact · 0.5 adjacent · 0.25 two-off · 0.0 wrong/none |
|
| 120 |
+
| `root_cause_analysis` | Base up to 0.6 (0.60 svc+mode · 0.35 svc only · 0.10 wrong svc) + up to 0.4 efficiency bonus (correct service only) |
|
| 121 |
+
| `remediation_planning` | 0.6 base + up to 0.3 efficiency − up to 0.15 wrong-action penalty + up to 0.1 summary bonus |
|
| 122 |
+
|
| 123 |
+
## API Endpoints
|
| 124 |
+
|
| 125 |
+
| Method | Path | Description |
|
| 126 |
+
|---|---|---|
|
| 127 |
+
| GET | `/` | `{"status":"running",...}` — HF Space health |
|
| 128 |
+
| GET | `/health` | `{"status":"ok","version":"0.1.0"}` |
|
| 129 |
+
| POST | `/reset?task_id=...&scenario_index=...` | Start new episode |
|
| 130 |
+
| POST | `/step` | Submit action (JSON body) |
|
| 131 |
+
| GET | `/state` | Full current episode state |
|
| 132 |
+
| GET | `/tasks` | All tasks with action schemas |
|
| 133 |
+
| GET | `/grader` | Score current episode (0.0–1.0) |
|
| 134 |
+
| POST | `/baseline` | Run inference.py, return scores |
|
| 135 |
+
|
| 136 |
+
## Setup & Usage
|
| 137 |
+
|
| 138 |
+
### Local development
|
| 139 |
+
```bash
|
| 140 |
+
pip install -r requirements.txt
|
| 141 |
+
uvicorn server.app:app --host 0.0.0.0 --port 7860
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
### Docker
|
| 145 |
+
```bash
|
| 146 |
+
docker build -t cloud-incident-env .
|
| 147 |
+
docker run -p 7860:7860 \
|
| 148 |
+
-e API_BASE_URL="https://api-inference.huggingface.co/v1" \
|
| 149 |
+
-e MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" \
|
| 150 |
+
-e HF_TOKEN="hf_your_token" \
|
| 151 |
+
cloud-incident-env
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
### Run inference script
|
| 155 |
+
```bash
|
| 156 |
+
export API_BASE_URL="https://api-inference.huggingface.co/v1"
|
| 157 |
+
export MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
|
| 158 |
+
export HF_TOKEN="hf_your_token"
|
| 159 |
+
python inference.py
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
### Quick API test
|
| 163 |
+
```bash
|
| 164 |
+
# Start new episode
|
| 165 |
+
curl -X POST "http://localhost:7860/reset?task_id=alert_classification&scenario_index=0"
|
| 166 |
+
|
| 167 |
+
# Submit an action
|
| 168 |
+
curl -X POST http://localhost:7860/step \
|
| 169 |
+
-H "Content-Type: application/json" \
|
| 170 |
+
-d '{"action_type":"query_logs","parameters":{"service":"api-gateway"}}'
|
| 171 |
+
|
| 172 |
+
# Check score
|
| 173 |
+
curl http://localhost:7860/grader
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
## Baseline Scores
|
| 177 |
+
|
| 178 |
+
Using `meta-llama/Llama-3.1-8B-Instruct` via HF Inference API:
|
| 179 |
+
|
| 180 |
+
| Task | Scenario 0 | Scenario 1 | Average |
|
| 181 |
+
|---|---|---|---|
|
| 182 |
+
| `alert_classification` | ~1.00 | ~0.50 | ~0.75 |
|
| 183 |
+
| `root_cause_analysis` | ~0.45 | ~0.35 | ~0.40 |
|
| 184 |
+
| `remediation_planning` | ~0.25 | ~0.20 | ~0.23 |
|
| 185 |
+
| **overall** | | | **~0.46** |
|
| 186 |
+
|
| 187 |
+
*Run `python inference.py` to reproduce.*
|
| 188 |
+
|
| 189 |
+
## Project Structure
|
| 190 |
|
| 191 |
+
```
|
| 192 |
+
.
|
| 193 |
+
├── Dockerfile
|
| 194 |
+
├── README.md
|
| 195 |
+
├── requirements.txt
|
| 196 |
+
├── openenv.yaml
|
| 197 |
+
├── tasks.py # Scenario definitions (6 scenarios across 3 tasks)
|
| 198 |
+
├── graders.py # Deterministic graders for all tasks
|
| 199 |
+
├── inference.py # Baseline agent + smart fallback logic
|
| 200 |
+
└── server/
|
| 201 |
+
├── __init__.py
|
| 202 |
+
├── app.py # FastAPI endpoints
|
| 203 |
+
├── environment.py # Core OpenEnv step/reset/state logic
|
| 204 |
+
└── models.py # Typed Pydantic models (Action, Observation, Reward)
|
| 205 |
+
```
|
graders.py
CHANGED
|
@@ -5,6 +5,11 @@ Public API:
|
|
| 5 |
grade(task_id, state, scenario) -> {"total": float, "breakdown": dict, "feedback": str}
|
| 6 |
|
| 7 |
All scores are in [0.0, 1.0].
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
from __future__ import annotations
|
|
@@ -33,6 +38,16 @@ def _svc_match(submitted: str, correct: str) -> bool:
|
|
| 33 |
"auth": "auth-service",
|
| 34 |
"api": "api-gateway",
|
| 35 |
"api-gw": "api-gateway",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
}
|
| 37 |
return aliases.get(s, s) == c or s == aliases.get(c, c)
|
| 38 |
|
|
@@ -45,11 +60,27 @@ def grade(task_id: str, state: dict, scenario: dict) -> dict:
|
|
| 45 |
}
|
| 46 |
fn = _graders.get(task_id)
|
| 47 |
if fn is None:
|
| 48 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
return fn(state, scenario)
|
| 50 |
|
| 51 |
|
| 52 |
-
# ── Task 1: Alert Classification ──────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
def _grade_alert_classification(state: dict, scenario: dict) -> dict:
|
| 54 |
history = state.get("action_history", [])
|
| 55 |
correct = scenario.get("correct_severity", "P1")
|
|
@@ -92,7 +123,23 @@ def _grade_alert_classification(state: dict, scenario: dict) -> dict:
|
|
| 92 |
}
|
| 93 |
|
| 94 |
|
| 95 |
-
# ── Task 2: Root Cause Analysis ─────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
def _grade_root_cause_analysis(state: dict, scenario: dict) -> dict:
|
| 97 |
history = state.get("action_history", [])
|
| 98 |
correct_rc = scenario.get("correct_root_cause", {})
|
|
@@ -101,8 +148,11 @@ def _grade_root_cause_analysis(state: dict, scenario: dict) -> dict:
|
|
| 101 |
known = {s.lower() for s in scenario.get("known_services", set())}
|
| 102 |
|
| 103 |
diag_types = {
|
| 104 |
-
"query_logs",
|
| 105 |
-
"
|
|
|
|
|
|
|
|
|
|
| 106 |
}
|
| 107 |
|
| 108 |
sub_svc, sub_mode, sub_step = "", "", len(history)
|
|
@@ -139,12 +189,12 @@ def _grade_root_cause_analysis(state: dict, scenario: dict) -> dict:
|
|
| 139 |
efficiency = 0.0
|
| 140 |
if svc_match:
|
| 141 |
pre_submit = [
|
| 142 |
-
a
|
|
|
|
| 143 |
if a.get("action_type") in diag_types
|
| 144 |
]
|
| 145 |
queried_svcs = {
|
| 146 |
-
a.get("parameters", {}).get("service", "").lower()
|
| 147 |
-
for a in pre_submit
|
| 148 |
}
|
| 149 |
relevant = queried_svcs & known
|
| 150 |
total_q = len(pre_submit)
|
|
@@ -169,7 +219,26 @@ def _grade_root_cause_analysis(state: dict, scenario: dict) -> dict:
|
|
| 169 |
}
|
| 170 |
|
| 171 |
|
| 172 |
-
# ── Task 3: Remediation Planning ──────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
|
| 174 |
history = state.get("action_history", [])
|
| 175 |
correct_seq = scenario.get("correct_remediation_sequence", [])
|
|
@@ -177,10 +246,17 @@ def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
|
|
| 177 |
keywords = scenario.get("resolution_keywords", [])
|
| 178 |
|
| 179 |
diag_rem = {
|
| 180 |
-
"query_logs",
|
| 181 |
-
"
|
| 182 |
-
"
|
| 183 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
}
|
| 185 |
|
| 186 |
summary = ""
|
|
@@ -195,8 +271,10 @@ def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
|
|
| 195 |
return {
|
| 196 |
"total": 0.0,
|
| 197 |
"breakdown": {
|
| 198 |
-
"base": 0.0,
|
| 199 |
-
"
|
|
|
|
|
|
|
| 200 |
},
|
| 201 |
"feedback": "No resolution submitted or no investigation — score 0.0",
|
| 202 |
}
|
|
@@ -212,10 +290,14 @@ def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
|
|
| 212 |
runbook = p.get("runbook_action", "")
|
| 213 |
target = p.get("target", "")
|
| 214 |
executed.add(at)
|
| 215 |
-
if svc:
|
| 216 |
-
|
| 217 |
-
if
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
def _seq_key_matches(seq_key: str) -> bool:
|
| 221 |
if seq_key in executed:
|
|
@@ -230,13 +312,18 @@ def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
|
|
| 230 |
return False
|
| 231 |
|
| 232 |
matched = sum(1 for k in correct_seq if _seq_key_matches(k))
|
| 233 |
-
efficiency =
|
|
|
|
|
|
|
| 234 |
|
| 235 |
wrong_count = sum(
|
| 236 |
-
1
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
in wrong_map
|
|
|
|
|
|
|
|
|
|
| 240 |
)
|
| 241 |
penalty = round(min(0.15, wrong_count * 0.05), 4)
|
| 242 |
|
|
|
|
| 5 |
grade(task_id, state, scenario) -> {"total": float, "breakdown": dict, "feedback": str}
|
| 6 |
|
| 7 |
All scores are in [0.0, 1.0].
|
| 8 |
+
|
| 9 |
+
Grading Philosophy:
|
| 10 |
+
- Easy task: graded by severity distance — exact match scores 1.0, near misses earn partial credit
|
| 11 |
+
- Medium task: partial credit for correct service, bonus for efficiency
|
| 12 |
+
- Hard task: multi-component — base + efficiency − penalties + summary quality
|
| 13 |
"""
|
| 14 |
|
| 15 |
from __future__ import annotations
|
|
|
|
| 38 |
"auth": "auth-service",
|
| 39 |
"api": "api-gateway",
|
| 40 |
"api-gw": "api-gateway",
|
| 41 |
+
"fraud": "fraud-detection-service",
|
| 42 |
+
"fraud-detection": "fraud-detection-service",
|
| 43 |
+
"order": "order-service",
|
| 44 |
+
"orders": "order-service",
|
| 45 |
+
"image": "image-service",
|
| 46 |
+
"images": "image-service",
|
| 47 |
+
"product": "product-service",
|
| 48 |
+
"products": "product-service",
|
| 49 |
+
"redis": "redis-session",
|
| 50 |
+
"redis-cache": "redis-payment-cache",
|
| 51 |
}
|
| 52 |
return aliases.get(s, s) == c or s == aliases.get(c, c)
|
| 53 |
|
|
|
|
| 60 |
}
|
| 61 |
fn = _graders.get(task_id)
|
| 62 |
if fn is None:
|
| 63 |
+
return {
|
| 64 |
+
"total": 0.0,
|
| 65 |
+
"breakdown": {},
|
| 66 |
+
"feedback": f"Unknown task_id '{task_id}'",
|
| 67 |
+
}
|
| 68 |
return fn(state, scenario)
|
| 69 |
|
| 70 |
|
| 71 |
+
# ── Task 1: Alert Classification (Easy) ──────────────────────────────────────
|
| 72 |
+
#
|
| 73 |
+
# Scoring:
|
| 74 |
+
# 1.0 — exact severity match
|
| 75 |
+
# 0.5 — adjacent severity (e.g. P1 vs P2)
|
| 76 |
+
# 0.25 — two levels off (e.g. P1 vs P3)
|
| 77 |
+
# 0.0 — wrong by 3+ levels or no submission
|
| 78 |
+
#
|
| 79 |
+
# This is genuinely EASY: with 3 steps, an agent queries 1–2 services,
|
| 80 |
+
# reads the error_rate + revenue_impact, and classifies. The data is
|
| 81 |
+
# unambiguous — the correct answer is clearly derivable from the alert.
|
| 82 |
+
|
| 83 |
+
|
| 84 |
def _grade_alert_classification(state: dict, scenario: dict) -> dict:
|
| 85 |
history = state.get("action_history", [])
|
| 86 |
correct = scenario.get("correct_severity", "P1")
|
|
|
|
| 123 |
}
|
| 124 |
|
| 125 |
|
| 126 |
+
# ── Task 2: Root Cause Analysis (Medium) ─────────────────────────────────────
|
| 127 |
+
#
|
| 128 |
+
# Scoring (total up to 1.0):
|
| 129 |
+
# Base (up to 0.6):
|
| 130 |
+
# 0.60 — correct service AND failure mode keywords match
|
| 131 |
+
# 0.35 — correct service only
|
| 132 |
+
# 0.10 — wrong service (partial credit for at least submitting)
|
| 133 |
+
# Efficiency bonus (up to 0.4):
|
| 134 |
+
# Based on investigation precision: queried relevant services / total queries
|
| 135 |
+
# Plus bonus for breadth of investigation (up to 3 unique known services)
|
| 136 |
+
#
|
| 137 |
+
# This is genuinely MEDIUM: the root cause is NOT in the alert's
|
| 138 |
+
# affected_services list. The agent must investigate services outside
|
| 139 |
+
# the blast radius, correlate log evidence, and identify the upstream
|
| 140 |
+
# trigger — this requires multi-hop reasoning across 4–6 services.
|
| 141 |
+
|
| 142 |
+
|
| 143 |
def _grade_root_cause_analysis(state: dict, scenario: dict) -> dict:
|
| 144 |
history = state.get("action_history", [])
|
| 145 |
correct_rc = scenario.get("correct_root_cause", {})
|
|
|
|
| 148 |
known = {s.lower() for s in scenario.get("known_services", set())}
|
| 149 |
|
| 150 |
diag_types = {
|
| 151 |
+
"query_logs",
|
| 152 |
+
"check_metrics",
|
| 153 |
+
"check_dependencies",
|
| 154 |
+
"check_recent_deploys",
|
| 155 |
+
"check_service_status",
|
| 156 |
}
|
| 157 |
|
| 158 |
sub_svc, sub_mode, sub_step = "", "", len(history)
|
|
|
|
| 189 |
efficiency = 0.0
|
| 190 |
if svc_match:
|
| 191 |
pre_submit = [
|
| 192 |
+
a
|
| 193 |
+
for a in history[: sub_step]
|
| 194 |
if a.get("action_type") in diag_types
|
| 195 |
]
|
| 196 |
queried_svcs = {
|
| 197 |
+
a.get("parameters", {}).get("service", "").lower() for a in pre_submit
|
|
|
|
| 198 |
}
|
| 199 |
relevant = queried_svcs & known
|
| 200 |
total_q = len(pre_submit)
|
|
|
|
| 219 |
}
|
| 220 |
|
| 221 |
|
| 222 |
+
# ── Task 3: Remediation Planning (Hard) ──────────────────────────────────────
|
| 223 |
+
#
|
| 224 |
+
# Scoring (total up to 1.0):
|
| 225 |
+
# Base (0.6 if submitted with any investigation):
|
| 226 |
+
# Requires at least 1 diagnostic/remediation action + a summary
|
| 227 |
+
# Efficiency bonus (up to 0.3):
|
| 228 |
+
# Fraction of correct_remediation_sequence steps matched
|
| 229 |
+
# Wrong action penalty (up to -0.15):
|
| 230 |
+
# −0.05 per wrong action taken (capped at 3 wrong actions, i.e. −0.15 total)
|
| 231 |
+
# Summary quality bonus (up to 0.10):
|
| 232 |
+
# Based on keyword coverage in the resolution summary
|
| 233 |
+
#
|
| 234 |
+
# This is genuinely HARD: requires multi-phase execution:
|
| 235 |
+
# Phase 1: Diagnose (query logs to confirm root cause)
|
| 236 |
+
# Phase 2: Remediate (execute 3–5 specific actions in order)
|
| 237 |
+
# Phase 3: Document (write a coherent summary with key details)
|
| 238 |
+
# Wrong remediation actions actively harm the score. The sequence
|
| 239 |
+
# matters. The summary must reference specific services and actions.
|
| 240 |
+
|
| 241 |
+
|
| 242 |
def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
|
| 243 |
history = state.get("action_history", [])
|
| 244 |
correct_seq = scenario.get("correct_remediation_sequence", [])
|
|
|
|
| 246 |
keywords = scenario.get("resolution_keywords", [])
|
| 247 |
|
| 248 |
diag_rem = {
|
| 249 |
+
"query_logs",
|
| 250 |
+
"check_metrics",
|
| 251 |
+
"check_dependencies",
|
| 252 |
+
"check_recent_deploys",
|
| 253 |
+
"check_service_status",
|
| 254 |
+
"restart_service",
|
| 255 |
+
"rollback_deploy",
|
| 256 |
+
"scale_service",
|
| 257 |
+
"disable_feature_flag",
|
| 258 |
+
"clear_cache",
|
| 259 |
+
"execute_runbook_step",
|
| 260 |
}
|
| 261 |
|
| 262 |
summary = ""
|
|
|
|
| 271 |
return {
|
| 272 |
"total": 0.0,
|
| 273 |
"breakdown": {
|
| 274 |
+
"base": 0.0,
|
| 275 |
+
"efficiency": 0.0,
|
| 276 |
+
"penalty": 0.0,
|
| 277 |
+
"summary_bonus": 0.0,
|
| 278 |
},
|
| 279 |
"feedback": "No resolution submitted or no investigation — score 0.0",
|
| 280 |
}
|
|
|
|
| 290 |
runbook = p.get("runbook_action", "")
|
| 291 |
target = p.get("target", "")
|
| 292 |
executed.add(at)
|
| 293 |
+
if svc:
|
| 294 |
+
executed.add(f"{at}:{svc}")
|
| 295 |
+
if flag:
|
| 296 |
+
executed.add(f"{at}:{flag}")
|
| 297 |
+
if runbook:
|
| 298 |
+
executed.add(f"execute_runbook_step:{runbook}")
|
| 299 |
+
if target:
|
| 300 |
+
executed.add(f"execute_runbook_step:{target}")
|
| 301 |
|
| 302 |
def _seq_key_matches(seq_key: str) -> bool:
|
| 303 |
if seq_key in executed:
|
|
|
|
| 312 |
return False
|
| 313 |
|
| 314 |
matched = sum(1 for k in correct_seq if _seq_key_matches(k))
|
| 315 |
+
efficiency = (
|
| 316 |
+
round((matched / len(correct_seq)) * 0.3, 4) if correct_seq else 0.0
|
| 317 |
+
)
|
| 318 |
|
| 319 |
wrong_count = sum(
|
| 320 |
+
1
|
| 321 |
+
for a in history
|
| 322 |
+
if (
|
| 323 |
+
a.get("action_type") in wrong_map
|
| 324 |
+
or f"{a.get('action_type')}:{a.get('parameters', {}).get('service', '')}"
|
| 325 |
+
in wrong_map
|
| 326 |
+
)
|
| 327 |
)
|
| 328 |
penalty = round(min(0.15, wrong_count * 0.05), 4)
|
| 329 |
|
inference.py
CHANGED
|
@@ -10,6 +10,7 @@ from __future__ import annotations
|
|
| 10 |
import json
|
| 11 |
import os
|
| 12 |
import sys
|
|
|
|
| 13 |
|
| 14 |
import requests
|
| 15 |
|
|
@@ -29,8 +30,6 @@ if not HF_TOKEN:
|
|
| 29 |
print("[WARN] No API key set — LLM calls will fail.", file=sys.stderr)
|
| 30 |
|
| 31 |
_session = requests.Session()
|
| 32 |
-
|
| 33 |
-
# Lazy-init OpenAI client to avoid import-time httpx errors
|
| 34 |
_client = None
|
| 35 |
|
| 36 |
|
|
@@ -42,7 +41,7 @@ def _get_client():
|
|
| 42 |
return _client
|
| 43 |
|
| 44 |
|
| 45 |
-
# ──
|
| 46 |
_TASK_SUBMIT = {
|
| 47 |
"alert_classification": "submit_severity",
|
| 48 |
"root_cause_analysis": "submit_root_cause",
|
|
@@ -66,7 +65,6 @@ _REM_TYPES = frozenset({
|
|
| 66 |
_ALL_VALID = _DIAG_TYPES | _SUBMIT_TYPES | _REM_TYPES
|
| 67 |
|
| 68 |
|
| 69 |
-
# ── System prompt — general SRE strategy, NO scenario answers ───────────────
|
| 70 |
SYSTEM_PROMPT = """\
|
| 71 |
You are an expert Site Reliability Engineer responding to a production incident.
|
| 72 |
Reply with exactly ONE JSON action object. No markdown, no explanation, no extra text.
|
|
@@ -86,40 +84,43 @@ VALID ACTIONS:
|
|
| 86 |
{"action_type":"submit_resolution","parameters":{"summary":"<3+ sentence summary>"}}
|
| 87 |
|
| 88 |
RULES:
|
| 89 |
-
- Service names MUST exactly match the KNOWN_SERVICES list
|
| 90 |
- P1 = complete outage OR revenue > $1,000/min. P2 = major degradation.
|
| 91 |
-
P3 = minor issue. P4 = informational.
|
| 92 |
-
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
-
|
| 96 |
-
|
| 97 |
-
-
|
|
|
|
| 98 |
|
| 99 |
-
|
| 100 |
|
| 101 |
alert_classification (max 3 steps):
|
| 102 |
-
Query 1-2
|
|
|
|
| 103 |
|
| 104 |
root_cause_analysis (max 10 steps):
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
|
|
|
| 108 |
|
| 109 |
remediation_planning (max 15 steps):
|
| 110 |
-
1.
|
| 111 |
-
2. Execute
|
| 112 |
-
|
| 113 |
-
|
| 114 |
|
| 115 |
CRITICAL: Each task has ONE correct submission action:
|
| 116 |
alert_classification -> submit_severity
|
| 117 |
root_cause_analysis -> submit_root_cause
|
| 118 |
-
remediation_planning -> submit_resolution
|
| 119 |
-
Do NOT use the wrong submission type for the task."""
|
| 120 |
|
| 121 |
|
| 122 |
# ── Helpers ─────────────────────────────────────────────────────────────────
|
|
|
|
| 123 |
def _queried_svcs(queried_data: dict) -> set[str]:
|
| 124 |
return {
|
| 125 |
svc
|
|
@@ -130,7 +131,6 @@ def _queried_svcs(queried_data: dict) -> set[str]:
|
|
| 130 |
|
| 131 |
|
| 132 |
def _extract_signals(queried_data: dict) -> list[str]:
|
| 133 |
-
"""Surface key patterns from queried data — shown to LLM."""
|
| 134 |
seen: set[str] = set()
|
| 135 |
signals: list[str] = []
|
| 136 |
|
|
@@ -154,21 +154,25 @@ def _extract_signals(queried_data: dict) -> list[str]:
|
|
| 154 |
_add(f"Cache purge in {svc}")
|
| 155 |
if "unbounded" in t or "no limit" in t:
|
| 156 |
_add(f"Unbounded query in {svc}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
if action_type == "check_recent_deploys" and any(
|
| 158 |
-
x in t for x in ("ago", "change", "update", "added")
|
| 159 |
):
|
| 160 |
-
|
| 161 |
-
|
|
|
|
| 162 |
return signals
|
| 163 |
|
| 164 |
|
| 165 |
-
# ── Message builders ────────────────────────────────────────────────────────
|
| 166 |
def _first_obs_msg(obs: dict) -> str:
|
| 167 |
-
alert
|
| 168 |
-
known
|
| 169 |
affected = alert.get("affected_services", [])
|
| 170 |
-
task_id
|
| 171 |
-
non_aff
|
| 172 |
|
| 173 |
lines = [
|
| 174 |
"=== NEW INCIDENT ===",
|
|
@@ -182,34 +186,37 @@ def _first_obs_msg(obs: dict) -> str:
|
|
| 182 |
if alert.get("title"):
|
| 183 |
lines.append(f" Title: {alert['title']}")
|
| 184 |
if affected:
|
| 185 |
-
lines.append(f" Directly affected
|
| 186 |
for s in alert.get("symptoms", []):
|
| 187 |
lines.append(f" - {s}")
|
| 188 |
for k in ("error_rate", "duration_minutes", "revenue_impact_per_min"):
|
| 189 |
if alert.get(k) is not None:
|
| 190 |
lines.append(f" {k}: {alert[k]}")
|
| 191 |
|
| 192 |
-
lines.append(f"KNOWN_SERVICES
|
| 193 |
|
| 194 |
if non_aff and task_id in ("root_cause_analysis", "remediation_planning"):
|
| 195 |
-
lines.append(
|
| 196 |
-
f" *** These services are NOT in the alert — investigate them "
|
| 197 |
-
f"for possible root cause: {json.dumps(non_aff)} ***"
|
| 198 |
-
)
|
| 199 |
|
| 200 |
lines.append(f"AVAILABLE ACTIONS: {obs.get('available_actions', [])}")
|
| 201 |
lines.append(f"REQUIRED SUBMISSION: {_TASK_SUBMIT.get(task_id, 'unknown')}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
lines.append("")
|
| 203 |
-
lines.append("Respond with your first action (JSON only
|
| 204 |
return "\n".join(lines)
|
| 205 |
|
| 206 |
|
| 207 |
def _step_msg(obs: dict, prev_queried: dict) -> str:
|
| 208 |
-
step
|
| 209 |
max_steps = obs.get("max_steps", 10)
|
| 210 |
-
left
|
| 211 |
-
queried
|
| 212 |
-
task_id
|
| 213 |
|
| 214 |
lines = [
|
| 215 |
f"Step {step}/{max_steps} ({left} remaining) | "
|
|
@@ -217,7 +224,6 @@ def _step_msg(obs: dict, prev_queried: dict) -> str:
|
|
| 217 |
f"feedback: {obs.get('feedback', '')}",
|
| 218 |
]
|
| 219 |
|
| 220 |
-
# Show new data received
|
| 221 |
new_data = []
|
| 222 |
for action_type, services in queried.items():
|
| 223 |
prev = prev_queried.get(action_type, {})
|
|
@@ -229,35 +235,26 @@ def _step_msg(obs: dict, prev_queried: dict) -> str:
|
|
| 229 |
d = d[:500] + "..."
|
| 230 |
new_data.append(f" [{action_type}][{svc}]: {d}")
|
| 231 |
if new_data:
|
| 232 |
-
lines.append("NEW DATA
|
| 233 |
lines.extend(new_data)
|
| 234 |
|
| 235 |
-
# Show extracted signals
|
| 236 |
signals = _extract_signals(queried)
|
| 237 |
if signals:
|
| 238 |
-
lines.append("
|
| 239 |
for sig in signals:
|
| 240 |
lines.append(f" *** {sig} ***")
|
| 241 |
|
| 242 |
-
# Urgency reminders
|
| 243 |
if left <= 3:
|
| 244 |
-
lines.append(
|
| 245 |
-
f"*** {left} steps remaining — submit "
|
| 246 |
-
f"{_TASK_SUBMIT.get(task_id, 'your answer')} soon ***"
|
| 247 |
-
)
|
| 248 |
if left <= 1:
|
| 249 |
-
lines.append(
|
| 250 |
-
f"!!! LAST STEP — YOU MUST {_TASK_SUBMIT.get(task_id, 'SUBMIT')} NOW !!!"
|
| 251 |
-
)
|
| 252 |
|
| 253 |
-
lines.append("Next action (JSON only
|
| 254 |
return "\n".join(lines)
|
| 255 |
|
| 256 |
|
| 257 |
-
# ── Parse LLM output ───────────────────────────────────────────────────────
|
| 258 |
def _parse(text: str) -> dict:
|
| 259 |
text = text.strip()
|
| 260 |
-
# Strip markdown code fences
|
| 261 |
if text.startswith("`"):
|
| 262 |
text = "\n".join(
|
| 263 |
ln for ln in text.splitlines() if not ln.startswith("`")
|
|
@@ -272,125 +269,92 @@ def _parse(text: str) -> dict:
|
|
| 272 |
raise
|
| 273 |
|
| 274 |
|
| 275 |
-
# ── Fallback — generic, no scenario knowledge ──────────────────────────────
|
| 276 |
def _fallback_submit(task_id: str, obs: dict) -> dict:
|
| 277 |
-
"""Minimal correct-type submission. Will score low but won't crash."""
|
| 278 |
alert = obs.get("alert", {})
|
| 279 |
known = obs.get("known_services", [])
|
| 280 |
|
| 281 |
if task_id == "alert_classification":
|
| 282 |
rev = alert.get("revenue_impact_per_min", 0) or 0
|
| 283 |
err = alert.get("error_rate", 0) or 0
|
| 284 |
-
sev = "P1" if (rev > 1000 or err > 0.9) else
|
| 285 |
-
|
| 286 |
svc = (alert.get("affected_services") or known or ["unknown"])[0]
|
| 287 |
-
return {
|
| 288 |
-
|
| 289 |
-
"parameters": {"severity": sev, "service": svc},
|
| 290 |
-
}
|
| 291 |
|
| 292 |
if task_id == "root_cause_analysis":
|
| 293 |
svc = known[0] if known else "unknown"
|
| 294 |
-
return {
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
"service": svc,
|
| 298 |
-
"failure_mode": "service failure causing downstream cascade",
|
| 299 |
-
},
|
| 300 |
-
}
|
| 301 |
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
"across affected services. Remediation actions were applied to "
|
| 309 |
-
"restore service health. Systems are being monitored for full "
|
| 310 |
-
"recovery confirmation."
|
| 311 |
-
),
|
| 312 |
-
},
|
| 313 |
-
}
|
| 314 |
|
| 315 |
|
| 316 |
-
def _smart_fallback(
|
| 317 |
-
|
| 318 |
-
) -> dict:
|
| 319 |
-
"""Generic fallback — queries unvisited services, then submits."""
|
| 320 |
-
known = obs.get("known_services", [])
|
| 321 |
queried = obs.get("queried_data", {})
|
| 322 |
-
left
|
| 323 |
-
q_svcs
|
| 324 |
|
| 325 |
-
# Must submit on final step
|
| 326 |
if left <= 1:
|
| 327 |
return _fallback_submit(task_id, obs)
|
| 328 |
|
| 329 |
-
# Alert classification — submit after any query
|
| 330 |
if task_id == "alert_classification" and q_svcs:
|
| 331 |
return _fallback_submit(task_id, obs)
|
| 332 |
|
| 333 |
-
# Query
|
| 334 |
for svc in known:
|
| 335 |
if svc not in q_svcs:
|
| 336 |
-
return {
|
| 337 |
-
|
| 338 |
-
"parameters": {"service": svc},
|
| 339 |
-
}
|
| 340 |
|
| 341 |
-
#
|
| 342 |
if task_id in ("root_cause_analysis", "remediation_planning"):
|
| 343 |
deploy_queried = set(queried.get("check_recent_deploys", {}).keys())
|
| 344 |
for svc in known:
|
| 345 |
if svc not in deploy_queried:
|
| 346 |
-
return {
|
| 347 |
-
|
| 348 |
-
"parameters": {"service": svc},
|
| 349 |
-
}
|
| 350 |
|
| 351 |
-
# Everything queried — submit
|
| 352 |
return _fallback_submit(task_id, obs)
|
| 353 |
|
| 354 |
|
| 355 |
-
# ── Override — ONLY blocks clearly invalid actions ──────────────────────────
|
| 356 |
def _should_override(
|
| 357 |
task_id: str, action: dict, obs: dict, step: int, max_steps: int
|
| 358 |
) -> bool:
|
| 359 |
-
at
|
| 360 |
params = action.get("parameters", {})
|
| 361 |
-
left
|
| 362 |
-
known
|
| 363 |
|
| 364 |
-
# 1. Unknown action type
|
| 365 |
if at not in _ALL_VALID:
|
| 366 |
return True
|
| 367 |
-
|
| 368 |
-
# 2. Must submit on last step
|
| 369 |
if left <= 0 and at not in _SUBMIT_TYPES:
|
| 370 |
return True
|
| 371 |
|
| 372 |
-
# 3. WRONG submission type for the task
|
| 373 |
-
# e.g. submit_severity during remediation_planning
|
| 374 |
correct_submit = _TASK_SUBMIT.get(task_id)
|
| 375 |
if at in _SUBMIT_TYPES and at != correct_submit:
|
| 376 |
return True
|
| 377 |
|
| 378 |
-
# 4. Service not in known_services (for service-targeted actions)
|
| 379 |
svc = (params.get("service") or "").strip()
|
| 380 |
if (svc and known
|
| 381 |
and at not in ("disable_feature_flag", "execute_runbook_step")
|
| 382 |
and svc not in known):
|
| 383 |
return True
|
| 384 |
|
| 385 |
-
# 5. Invalid severity value
|
| 386 |
if at == "submit_severity":
|
| 387 |
sev = (params.get("severity") or "").upper().strip()
|
| 388 |
if sev not in ("P1", "P2", "P3", "P4"):
|
| 389 |
return True
|
| 390 |
|
| 391 |
-
# 6. Empty required fields
|
| 392 |
if at == "submit_root_cause":
|
| 393 |
-
svc
|
| 394 |
mode = (params.get("failure_mode") or "").strip()
|
| 395 |
if not svc or len(mode) < 5:
|
| 396 |
return True
|
|
@@ -400,14 +364,40 @@ def _should_override(
|
|
| 400 |
if len(summary) < 30:
|
| 401 |
return True
|
| 402 |
|
| 403 |
-
# 7. Remediation action used in alert_classification task
|
| 404 |
if task_id == "alert_classification" and at in _REM_TYPES:
|
| 405 |
return True
|
| 406 |
|
| 407 |
return False
|
| 408 |
|
| 409 |
|
| 410 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
def _run_episode(task_id: str, scenario_index: int) -> float:
|
| 412 |
r = _session.post(
|
| 413 |
f"{ENV_BASE_URL}/reset",
|
|
@@ -428,24 +418,9 @@ def _run_episode(task_id: str, scenario_index: int) -> float:
|
|
| 428 |
for step_i in range(max_steps):
|
| 429 |
current_step = step_i + 1
|
| 430 |
|
| 431 |
-
|
| 432 |
-
try:
|
| 433 |
-
resp = _get_client().chat.completions.create(
|
| 434 |
-
model=MODEL_NAME,
|
| 435 |
-
messages=messages,
|
| 436 |
-
temperature=0.0,
|
| 437 |
-
max_tokens=300,
|
| 438 |
-
stream=False,
|
| 439 |
-
)
|
| 440 |
-
raw = resp.choices[0].message.content or ""
|
| 441 |
-
except Exception as e:
|
| 442 |
-
print(f" [WARN] LLM call failed step {current_step}: {e}",
|
| 443 |
-
file=sys.stderr)
|
| 444 |
-
raw = ""
|
| 445 |
-
|
| 446 |
messages.append({"role": "assistant", "content": raw or "{}"})
|
| 447 |
|
| 448 |
-
# ── Parse ────────────────────────────────────────────────────────
|
| 449 |
action = None
|
| 450 |
try:
|
| 451 |
if raw.strip():
|
|
@@ -453,7 +428,6 @@ def _run_episode(task_id: str, scenario_index: int) -> float:
|
|
| 453 |
except Exception:
|
| 454 |
pass
|
| 455 |
|
| 456 |
-
# ── Fallback / override ──────────────────────────────────────────
|
| 457 |
if action is None:
|
| 458 |
action = _smart_fallback(task_id, obs, current_step, max_steps)
|
| 459 |
print(f" [FALLBACK] step {current_step}: "
|
|
@@ -462,15 +436,11 @@ def _run_episode(task_id: str, scenario_index: int) -> float:
|
|
| 462 |
old_at = action.get("action_type")
|
| 463 |
action = _smart_fallback(task_id, obs, current_step, max_steps)
|
| 464 |
print(f" [OVERRIDE] step {current_step}: "
|
| 465 |
-
f"{old_at} -> {action.get('action_type')}",
|
| 466 |
-
file=sys.stderr)
|
| 467 |
|
| 468 |
-
|
| 469 |
-
sr = _session.post(
|
| 470 |
-
f"{ENV_BASE_URL}/step", json=action, timeout=30,
|
| 471 |
-
)
|
| 472 |
sr.raise_for_status()
|
| 473 |
-
result
|
| 474 |
new_obs = result["observation"]
|
| 475 |
|
| 476 |
print(
|
|
@@ -492,7 +462,6 @@ def _run_episode(task_id: str, scenario_index: int) -> float:
|
|
| 492 |
}
|
| 493 |
obs = new_obs
|
| 494 |
|
| 495 |
-
# Keep conversation window manageable
|
| 496 |
if len(messages) > 20:
|
| 497 |
messages = messages[:2] + messages[-16:]
|
| 498 |
|
|
@@ -501,15 +470,17 @@ def _run_episode(task_id: str, scenario_index: int) -> float:
|
|
| 501 |
return g.json().get("total", 0.0)
|
| 502 |
|
| 503 |
|
| 504 |
-
# ── Entry point ─────────────────────────────────────────────────────────────
|
| 505 |
def main():
|
| 506 |
runs = [
|
| 507 |
("alert_classification", 0),
|
| 508 |
("alert_classification", 1),
|
|
|
|
| 509 |
("root_cause_analysis", 0),
|
| 510 |
("root_cause_analysis", 1),
|
|
|
|
| 511 |
("remediation_planning", 0),
|
| 512 |
("remediation_planning", 1),
|
|
|
|
| 513 |
]
|
| 514 |
|
| 515 |
results: dict[str, list[float]] = {}
|
|
@@ -530,9 +501,7 @@ def main():
|
|
| 530 |
results.setdefault(task_id, []).append(score)
|
| 531 |
|
| 532 |
print("-" * 50)
|
| 533 |
-
summary = {
|
| 534 |
-
t: round(sum(v) / len(v), 4) for t, v in results.items()
|
| 535 |
-
}
|
| 536 |
summary["overall"] = round(sum(summary.values()) / len(summary), 4)
|
| 537 |
|
| 538 |
print("\nScore Summary:")
|
|
|
|
| 10 |
import json
|
| 11 |
import os
|
| 12 |
import sys
|
| 13 |
+
import time
|
| 14 |
|
| 15 |
import requests
|
| 16 |
|
|
|
|
| 30 |
print("[WARN] No API key set — LLM calls will fail.", file=sys.stderr)
|
| 31 |
|
| 32 |
_session = requests.Session()
|
|
|
|
|
|
|
| 33 |
_client = None
|
| 34 |
|
| 35 |
|
|
|
|
| 41 |
return _client
|
| 42 |
|
| 43 |
|
| 44 |
+
# ── Constants ───────────────────────────────────────────────────────────────
|
| 45 |
_TASK_SUBMIT = {
|
| 46 |
"alert_classification": "submit_severity",
|
| 47 |
"root_cause_analysis": "submit_root_cause",
|
|
|
|
| 65 |
_ALL_VALID = _DIAG_TYPES | _SUBMIT_TYPES | _REM_TYPES
|
| 66 |
|
| 67 |
|
|
|
|
| 68 |
SYSTEM_PROMPT = """\
|
| 69 |
You are an expert Site Reliability Engineer responding to a production incident.
|
| 70 |
Reply with exactly ONE JSON action object. No markdown, no explanation, no extra text.
|
|
|
|
| 84 |
{"action_type":"submit_resolution","parameters":{"summary":"<3+ sentence summary>"}}
|
| 85 |
|
| 86 |
RULES:
|
| 87 |
+
- Service names MUST exactly match the KNOWN_SERVICES list.
|
| 88 |
- P1 = complete outage OR revenue > $1,000/min. P2 = major degradation.
|
| 89 |
+
P3 = minor/partial issue with graceful fallback. P4 = informational.
|
| 90 |
+
- IMPORTANT: check_recent_deploys and check_dependencies require prior
|
| 91 |
+
investigation. You MUST query_logs or check_metrics on a service BEFORE
|
| 92 |
+
checking its deploys or dependencies. Otherwise you get limited data.
|
| 93 |
+
- Root cause = the upstream service that TRIGGERED the cascade. Often NOT
|
| 94 |
+
in the alert's affected_services list.
|
| 95 |
+
- submit_resolution summary: 3+ sentences about what failed, what you did, status.
|
| 96 |
+
- Submit as soon as evidence is clear — do NOT waste steps.
|
| 97 |
|
| 98 |
+
STRATEGY:
|
| 99 |
|
| 100 |
alert_classification (max 3 steps):
|
| 101 |
+
Query 1-2 services with logs/metrics, then submit_severity.
|
| 102 |
+
Check revenue_impact and error_rate carefully. Not all high error rates are P1.
|
| 103 |
|
| 104 |
root_cause_analysis (max 10 steps):
|
| 105 |
+
1. query_logs or check_metrics on 2-3 services to understand the blast radius
|
| 106 |
+
2. THEN check_recent_deploys on services that look suspicious
|
| 107 |
+
3. Look for the service whose deploy/change CAUSED the cascade
|
| 108 |
+
4. Submit submit_root_cause with service and failure_mode
|
| 109 |
|
| 110 |
remediation_planning (max 15 steps):
|
| 111 |
+
1. query_logs on affected services to confirm root cause
|
| 112 |
+
2. Execute remediation actions in logical order
|
| 113 |
+
3. Verify recovery with check_service_status
|
| 114 |
+
4. Submit submit_resolution with detailed summary
|
| 115 |
|
| 116 |
CRITICAL: Each task has ONE correct submission action:
|
| 117 |
alert_classification -> submit_severity
|
| 118 |
root_cause_analysis -> submit_root_cause
|
| 119 |
+
remediation_planning -> submit_resolution"""
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
# ── Helpers ─────────────────────────────────────────────────────────────────
|
| 123 |
+
|
| 124 |
def _queried_svcs(queried_data: dict) -> set[str]:
|
| 125 |
return {
|
| 126 |
svc
|
|
|
|
| 131 |
|
| 132 |
|
| 133 |
def _extract_signals(queried_data: dict) -> list[str]:
|
|
|
|
| 134 |
seen: set[str] = set()
|
| 135 |
signals: list[str] = []
|
| 136 |
|
|
|
|
| 154 |
_add(f"Cache purge in {svc}")
|
| 155 |
if "unbounded" in t or "no limit" in t:
|
| 156 |
_add(f"Unbounded query in {svc}")
|
| 157 |
+
if "credential" in t or "password" in t or "authentication failed" in t:
|
| 158 |
+
_add(f"Credential/auth issue in {svc}")
|
| 159 |
+
if "requires deeper investigation" in t or "requires initial investigation" in t:
|
| 160 |
+
_add(f"GATED: {svc} needs logs/metrics first before checking deploys")
|
| 161 |
if action_type == "check_recent_deploys" and any(
|
| 162 |
+
x in t for x in ("ago", "change", "update", "added", "deploy")
|
| 163 |
):
|
| 164 |
+
if "requires" not in t: # Don't show gated responses as signals
|
| 165 |
+
snippet = str(data)[:120].replace("\n", " ")
|
| 166 |
+
_add(f"Recent change in {svc}: {snippet}")
|
| 167 |
return signals
|
| 168 |
|
| 169 |
|
|
|
|
| 170 |
def _first_obs_msg(obs: dict) -> str:
|
| 171 |
+
alert = obs.get("alert", {})
|
| 172 |
+
known = obs.get("known_services", [])
|
| 173 |
affected = alert.get("affected_services", [])
|
| 174 |
+
task_id = obs.get("task_id", "")
|
| 175 |
+
non_aff = [s for s in known if s not in affected]
|
| 176 |
|
| 177 |
lines = [
|
| 178 |
"=== NEW INCIDENT ===",
|
|
|
|
| 186 |
if alert.get("title"):
|
| 187 |
lines.append(f" Title: {alert['title']}")
|
| 188 |
if affected:
|
| 189 |
+
lines.append(f" Directly affected: {', '.join(affected)}")
|
| 190 |
for s in alert.get("symptoms", []):
|
| 191 |
lines.append(f" - {s}")
|
| 192 |
for k in ("error_rate", "duration_minutes", "revenue_impact_per_min"):
|
| 193 |
if alert.get(k) is not None:
|
| 194 |
lines.append(f" {k}: {alert[k]}")
|
| 195 |
|
| 196 |
+
lines.append(f"KNOWN_SERVICES: {json.dumps(known)}")
|
| 197 |
|
| 198 |
if non_aff and task_id in ("root_cause_analysis", "remediation_planning"):
|
| 199 |
+
lines.append(f" Services NOT in alert (investigate these too): {json.dumps(non_aff)}")
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
lines.append(f"AVAILABLE ACTIONS: {obs.get('available_actions', [])}")
|
| 202 |
lines.append(f"REQUIRED SUBMISSION: {_TASK_SUBMIT.get(task_id, 'unknown')}")
|
| 203 |
+
|
| 204 |
+
if task_id in ("root_cause_analysis", "remediation_planning"):
|
| 205 |
+
lines.append("")
|
| 206 |
+
lines.append("NOTE: check_recent_deploys requires prior investigation.")
|
| 207 |
+
lines.append("You MUST query_logs or check_metrics on a service FIRST.")
|
| 208 |
+
|
| 209 |
lines.append("")
|
| 210 |
+
lines.append("Respond with your first action (JSON only):")
|
| 211 |
return "\n".join(lines)
|
| 212 |
|
| 213 |
|
| 214 |
def _step_msg(obs: dict, prev_queried: dict) -> str:
|
| 215 |
+
step = obs.get("step_count", 0)
|
| 216 |
max_steps = obs.get("max_steps", 10)
|
| 217 |
+
left = max_steps - step
|
| 218 |
+
queried = obs.get("queried_data", {})
|
| 219 |
+
task_id = obs.get("task_id", "")
|
| 220 |
|
| 221 |
lines = [
|
| 222 |
f"Step {step}/{max_steps} ({left} remaining) | "
|
|
|
|
| 224 |
f"feedback: {obs.get('feedback', '')}",
|
| 225 |
]
|
| 226 |
|
|
|
|
| 227 |
new_data = []
|
| 228 |
for action_type, services in queried.items():
|
| 229 |
prev = prev_queried.get(action_type, {})
|
|
|
|
| 235 |
d = d[:500] + "..."
|
| 236 |
new_data.append(f" [{action_type}][{svc}]: {d}")
|
| 237 |
if new_data:
|
| 238 |
+
lines.append("NEW DATA:")
|
| 239 |
lines.extend(new_data)
|
| 240 |
|
|
|
|
| 241 |
signals = _extract_signals(queried)
|
| 242 |
if signals:
|
| 243 |
+
lines.append("SIGNALS:")
|
| 244 |
for sig in signals:
|
| 245 |
lines.append(f" *** {sig} ***")
|
| 246 |
|
|
|
|
| 247 |
if left <= 3:
|
| 248 |
+
lines.append(f"*** {left} steps left — submit {_TASK_SUBMIT.get(task_id, '')} soon ***")
|
|
|
|
|
|
|
|
|
|
| 249 |
if left <= 1:
|
| 250 |
+
lines.append(f"!!! LAST STEP — MUST {_TASK_SUBMIT.get(task_id, 'SUBMIT')} NOW !!!")
|
|
|
|
|
|
|
| 251 |
|
| 252 |
+
lines.append("Next action (JSON only):")
|
| 253 |
return "\n".join(lines)
|
| 254 |
|
| 255 |
|
|
|
|
| 256 |
def _parse(text: str) -> dict:
|
| 257 |
text = text.strip()
|
|
|
|
| 258 |
if text.startswith("`"):
|
| 259 |
text = "\n".join(
|
| 260 |
ln for ln in text.splitlines() if not ln.startswith("`")
|
|
|
|
| 269 |
raise
|
| 270 |
|
| 271 |
|
|
|
|
| 272 |
def _fallback_submit(task_id: str, obs: dict) -> dict:
|
|
|
|
| 273 |
alert = obs.get("alert", {})
|
| 274 |
known = obs.get("known_services", [])
|
| 275 |
|
| 276 |
if task_id == "alert_classification":
|
| 277 |
rev = alert.get("revenue_impact_per_min", 0) or 0
|
| 278 |
err = alert.get("error_rate", 0) or 0
|
| 279 |
+
sev = ("P1" if (rev > 1000 or err > 0.9) else
|
| 280 |
+
("P2" if (rev > 100 or err > 0.3) else "P3"))
|
| 281 |
svc = (alert.get("affected_services") or known or ["unknown"])[0]
|
| 282 |
+
return {"action_type": "submit_severity",
|
| 283 |
+
"parameters": {"severity": sev, "service": svc}}
|
|
|
|
|
|
|
| 284 |
|
| 285 |
if task_id == "root_cause_analysis":
|
| 286 |
svc = known[0] if known else "unknown"
|
| 287 |
+
return {"action_type": "submit_root_cause",
|
| 288 |
+
"parameters": {"service": svc,
|
| 289 |
+
"failure_mode": "service failure causing cascade"}}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
|
| 291 |
+
return {"action_type": "submit_resolution",
|
| 292 |
+
"parameters": {"summary": (
|
| 293 |
+
"The incident was investigated through log and metric analysis. "
|
| 294 |
+
"Remediation actions were applied to restore service health. "
|
| 295 |
+
"Systems are being monitored for recovery confirmation."
|
| 296 |
+
)}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
|
| 298 |
|
| 299 |
+
def _smart_fallback(task_id: str, obs: dict, step: int, max_steps: int) -> dict:
    """Choose a generic next action without consulting the LLM.

    Order of preference:
      1. Submit when at most one step remains, or when the task is
         "alert_classification" and ``_queried_svcs`` reports any service.
      2. ``query_logs`` on the first known service not yet reported by
         ``_queried_svcs``.
      3. For root-cause / remediation tasks, ``check_recent_deploys`` on the
         first known service with no deploy data yet.
      4. Otherwise submit.

    Args:
        task_id: Name of the current task.
        obs: Latest observation; reads "known_services" and "queried_data".
        step: Current step number.
        max_steps: Step budget for the episode.

    Returns:
        An action dict with "action_type" and "parameters" keys.
    """
    services = obs.get("known_services", [])
    history = obs.get("queried_data", {})
    remaining = max_steps - step
    visited = _queried_svcs(history)

    out_of_budget = remaining <= 1
    classification_ready = task_id == "alert_classification" and bool(visited)
    if out_of_budget or classification_ready:
        return _fallback_submit(task_id, obs)

    # Logs come first: every known service gets one log query.
    needs_logs = [name for name in services if name not in visited]
    if needs_logs:
        return {
            "action_type": "query_logs",
            "parameters": {"service": needs_logs[0]},
        }

    # Deploy history is only consulted once all services have been visited.
    if task_id in ("root_cause_analysis", "remediation_planning"):
        deploys_seen = set(history.get("check_recent_deploys", {}).keys())
        needs_deploys = [name for name in services if name not in deploys_seen]
        if needs_deploys:
            return {
                "action_type": "check_recent_deploys",
                "parameters": {"service": needs_deploys[0]},
            }

    return _fallback_submit(task_id, obs)
|
| 326 |
|
| 327 |
|
|
|
|
| 328 |
def _should_override(
|
| 329 |
task_id: str, action: dict, obs: dict, step: int, max_steps: int
|
| 330 |
) -> bool:
|
| 331 |
+
at = action.get("action_type", "")
|
| 332 |
params = action.get("parameters", {})
|
| 333 |
+
left = max_steps - step
|
| 334 |
+
known = obs.get("known_services", [])
|
| 335 |
|
|
|
|
| 336 |
if at not in _ALL_VALID:
|
| 337 |
return True
|
|
|
|
|
|
|
| 338 |
if left <= 0 and at not in _SUBMIT_TYPES:
|
| 339 |
return True
|
| 340 |
|
|
|
|
|
|
|
| 341 |
correct_submit = _TASK_SUBMIT.get(task_id)
|
| 342 |
if at in _SUBMIT_TYPES and at != correct_submit:
|
| 343 |
return True
|
| 344 |
|
|
|
|
| 345 |
svc = (params.get("service") or "").strip()
|
| 346 |
if (svc and known
|
| 347 |
and at not in ("disable_feature_flag", "execute_runbook_step")
|
| 348 |
and svc not in known):
|
| 349 |
return True
|
| 350 |
|
|
|
|
| 351 |
if at == "submit_severity":
|
| 352 |
sev = (params.get("severity") or "").upper().strip()
|
| 353 |
if sev not in ("P1", "P2", "P3", "P4"):
|
| 354 |
return True
|
| 355 |
|
|
|
|
| 356 |
if at == "submit_root_cause":
|
| 357 |
+
svc = (params.get("service") or "").strip()
|
| 358 |
mode = (params.get("failure_mode") or "").strip()
|
| 359 |
if not svc or len(mode) < 5:
|
| 360 |
return True
|
|
|
|
| 364 |
if len(summary) < 30:
|
| 365 |
return True
|
| 366 |
|
|
|
|
| 367 |
if task_id == "alert_classification" and at in _REM_TYPES:
|
| 368 |
return True
|
| 369 |
|
| 370 |
return False
|
| 371 |
|
| 372 |
|
| 373 |
+
def _llm_call_with_retry(messages: list, max_retries: int = 2) -> str:
    """Request one chat completion, retrying only on rate-limit errors.

    Args:
        messages: Chat messages, passed unchanged to the completions API.
        max_retries: Number of additional attempts allowed after a
            rate-limit error (so at most ``max_retries + 1`` calls).

    Returns:
        The content of the first choice, or "" when the call failed. This
        function never raises; every failure is reported on stderr.
    """
    for attempt in range(max_retries + 1):
        try:
            resp = _get_client().chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                temperature=0.0,
                max_tokens=300,
                stream=False,
            )
            return resp.choices[0].message.content or ""
        except Exception as e:  # best-effort boundary: report and return ""
            err_str = str(e).lower()
            # Prefer a status code if the exception carries one; keep the
            # text match for errors that only carry a message.
            rate_limited = (
                getattr(e, "status_code", None) == 429
                or "rate_limit" in err_str
                or "429" in err_str
            )
            if rate_limited and attempt < max_retries:
                # Fixed linear backoff: 10s, 20s, ...
                wait = 10 * (attempt + 1)
                print(f"  [RATE LIMIT] waiting {wait}s (attempt {attempt + 1})",
                      file=sys.stderr)
                time.sleep(wait)
                continue
            # Any other error, or retries exhausted: log it instead of
            # silently retrying, then give up.
            print(f"  [WARN] LLM call failed: {e}", file=sys.stderr)
            return ""
    return ""
|
| 399 |
+
|
| 400 |
+
|
| 401 |
def _run_episode(task_id: str, scenario_index: int) -> float:
|
| 402 |
r = _session.post(
|
| 403 |
f"{ENV_BASE_URL}/reset",
|
|
|
|
| 418 |
for step_i in range(max_steps):
|
| 419 |
current_step = step_i + 1
|
| 420 |
|
| 421 |
+
raw = _llm_call_with_retry(messages)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
messages.append({"role": "assistant", "content": raw or "{}"})
|
| 423 |
|
|
|
|
| 424 |
action = None
|
| 425 |
try:
|
| 426 |
if raw.strip():
|
|
|
|
| 428 |
except Exception:
|
| 429 |
pass
|
| 430 |
|
|
|
|
| 431 |
if action is None:
|
| 432 |
action = _smart_fallback(task_id, obs, current_step, max_steps)
|
| 433 |
print(f" [FALLBACK] step {current_step}: "
|
|
|
|
| 436 |
old_at = action.get("action_type")
|
| 437 |
action = _smart_fallback(task_id, obs, current_step, max_steps)
|
| 438 |
print(f" [OVERRIDE] step {current_step}: "
|
| 439 |
+
f"{old_at} -> {action.get('action_type')}", file=sys.stderr)
|
|
|
|
| 440 |
|
| 441 |
+
sr = _session.post(f"{ENV_BASE_URL}/step", json=action, timeout=30)
|
|
|
|
|
|
|
|
|
|
| 442 |
sr.raise_for_status()
|
| 443 |
+
result = sr.json()
|
| 444 |
new_obs = result["observation"]
|
| 445 |
|
| 446 |
print(
|
|
|
|
| 462 |
}
|
| 463 |
obs = new_obs
|
| 464 |
|
|
|
|
| 465 |
if len(messages) > 20:
|
| 466 |
messages = messages[:2] + messages[-16:]
|
| 467 |
|
|
|
|
| 470 |
return g.json().get("total", 0.0)
|
| 471 |
|
| 472 |
|
|
|
|
| 473 |
def main():
|
| 474 |
runs = [
|
| 475 |
("alert_classification", 0),
|
| 476 |
("alert_classification", 1),
|
| 477 |
+
("alert_classification", 2),
|
| 478 |
("root_cause_analysis", 0),
|
| 479 |
("root_cause_analysis", 1),
|
| 480 |
+
("root_cause_analysis", 2),
|
| 481 |
("remediation_planning", 0),
|
| 482 |
("remediation_planning", 1),
|
| 483 |
+
("remediation_planning", 2),
|
| 484 |
]
|
| 485 |
|
| 486 |
results: dict[str, list[float]] = {}
|
|
|
|
| 501 |
results.setdefault(task_id, []).append(score)
|
| 502 |
|
| 503 |
print("-" * 50)
|
| 504 |
+
summary = {t: round(sum(v) / len(v), 4) for t, v in results.items()}
|
|
|
|
|
|
|
| 505 |
summary["overall"] = round(sum(summary.values()) / len(summary), 4)
|
| 506 |
|
| 507 |
print("\nScore Summary:")
|
openenv.yaml
CHANGED
|
@@ -4,11 +4,11 @@ app_port: 7860
|
|
| 4 |
description: >
|
| 5 |
OpenEnv environment simulating real-world cloud SRE on-call incident response.
|
| 6 |
Distinct from Kubernetes ops — focuses on cross-service cascading failures,
|
| 7 |
-
network partitions, OOM kills,
|
| 8 |
-
An AI agent classifies alert severity, performs
|
| 9 |
-
log/metric/dependency queries, and executes
|
| 10 |
-
production incidents end-to-end.
|
| 11 |
-
author:
|
| 12 |
license: MIT
|
| 13 |
tags:
|
| 14 |
- openenv
|
|
@@ -28,6 +28,7 @@ tasks:
|
|
| 28 |
description: >
|
| 29 |
Classify incoming alert severity (P1-P4) by querying
|
| 30 |
logs and metrics across affected cloud services.
|
|
|
|
| 31 |
|
| 32 |
- id: root_cause_analysis
|
| 33 |
name: "Task 2: Root Cause Analysis"
|
|
@@ -37,7 +38,8 @@ tasks:
|
|
| 37 |
description: >
|
| 38 |
Trace a live incident through logs, metrics, dependencies,
|
| 39 |
and recent deploys to identify the exact root cause service
|
| 40 |
-
and failure mode
|
|
|
|
| 41 |
|
| 42 |
- id: remediation_planning
|
| 43 |
name: "Task 3: Incident Remediation"
|
|
@@ -46,8 +48,9 @@ tasks:
|
|
| 46 |
score_range: [0.0, 1.0]
|
| 47 |
description: >
|
| 48 |
Fully resolve a production incident end-to-end: diagnose
|
| 49 |
-
the root cause, execute the correct
|
| 50 |
-
and submit a documented resolution summary.
|
|
|
|
| 51 |
|
| 52 |
endpoints:
|
| 53 |
health: "GET /health"
|
|
|
|
| 4 |
description: >
|
| 5 |
OpenEnv environment simulating real-world cloud SRE on-call incident response.
|
| 6 |
Distinct from Kubernetes ops — focuses on cross-service cascading failures,
|
| 7 |
+
network partitions, OOM kills, credential rotation failures, and CDN storms
|
| 8 |
+
across distributed systems. An AI agent classifies alert severity, performs
|
| 9 |
+
root cause analysis through log/metric/dependency queries, and executes
|
| 10 |
+
remediation sequences to resolve production incidents end-to-end.
|
| 11 |
+
author: Einstein_Sidra
|
| 12 |
license: MIT
|
| 13 |
tags:
|
| 14 |
- openenv
|
|
|
|
| 28 |
description: >
|
| 29 |
Classify incoming alert severity (P1-P4) by querying
|
| 30 |
logs and metrics across affected cloud services.
|
| 31 |
+
Target baseline: 0.75-1.0 with 8B model.
|
| 32 |
|
| 33 |
- id: root_cause_analysis
|
| 34 |
name: "Task 2: Root Cause Analysis"
|
|
|
|
| 38 |
description: >
|
| 39 |
Trace a live incident through logs, metrics, dependencies,
|
| 40 |
and recent deploys to identify the exact root cause service
|
| 41 |
+
and failure mode. Root cause is NOT in the alert.
|
| 42 |
+
Target baseline: 0.35-0.60 with 8B model.
|
| 43 |
|
| 44 |
- id: remediation_planning
|
| 45 |
name: "Task 3: Incident Remediation"
|
|
|
|
| 48 |
score_range: [0.0, 1.0]
|
| 49 |
description: >
|
| 50 |
Fully resolve a production incident end-to-end: diagnose
|
| 51 |
+
the root cause, execute the correct multi-step remediation
|
| 52 |
+
sequence, and submit a documented resolution summary.
|
| 53 |
+
Wrong actions penalized. Target baseline: 0.20-0.45 with 8B model.
|
| 54 |
|
| 55 |
endpoints:
|
| 56 |
health: "GET /health"
|
pyproject.toml
CHANGED
|
@@ -13,4 +13,13 @@ dependencies = [
|
|
| 13 |
"openai>=1.58.0",
|
| 14 |
"httpx>=0.27.0,<0.29.0",
|
| 15 |
"python-dotenv>=1.0.0",
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
"openai>=1.58.0",
|
| 14 |
"httpx>=0.27.0,<0.29.0",
|
| 15 |
"python-dotenv>=1.0.0",
|
| 16 |
+
"gradio>=4.0.0,<6.0.0",
|
| 17 |
+
"openenv-core>=0.2.0",
|
| 18 |
+
]
|
| 19 |
+
|
| 20 |
+
[project.scripts]
|
| 21 |
+
server = "server.app:main"
|
| 22 |
+
|
| 23 |
+
[build-system]
requires = ["setuptools>=68.0"]
# PEP 517 backend shipped by setuptools.
build-backend = "setuptools.build_meta"
|
requirements.txt
CHANGED
|
@@ -4,4 +4,6 @@ pydantic>=2.0.0
|
|
| 4 |
requests>=2.31.0
|
| 5 |
openai>=1.58.0
|
| 6 |
httpx>=0.27.0,<0.29.0
|
| 7 |
-
python-dotenv>=1.0.0
|
|
|
|
|
|
|
|
|
| 4 |
requests>=2.31.0
|
| 5 |
openai>=1.58.0
|
| 6 |
httpx>=0.27.0,<0.29.0
|
| 7 |
+
python-dotenv>=1.0.0
|
| 8 |
+
gradio>=4.0.0,<6.0.0
|
| 9 |
+
openenv-core>=0.2.0
|
server/__init__.py
CHANGED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Cloud Incident Response — OpenEnv server package."""
|
| 2 |
+
__version__ = "0.1.0"
|
server/app.py
CHANGED
|
@@ -1,15 +1,14 @@
|
|
| 1 |
"""
|
| 2 |
-
server/app.py — FastAPI server for Cloud Incident Response OpenEnv.
|
| 3 |
-
|
| 4 |
-
Endpoints:
|
| 5 |
-
GET /
|
| 6 |
-
|
| 7 |
-
POST /
|
| 8 |
-
|
| 9 |
-
GET /
|
| 10 |
-
GET /
|
| 11 |
-
|
| 12 |
-
POST /baseline Run inference.py end-to-end, return score summary
|
| 13 |
"""
|
| 14 |
|
| 15 |
from __future__ import annotations
|
|
@@ -19,26 +18,24 @@ import os
|
|
| 19 |
import subprocess
|
| 20 |
import sys
|
| 21 |
|
| 22 |
-
# Ensure project root is on sys.path regardless of working directory
|
| 23 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 24 |
|
| 25 |
from contextlib import asynccontextmanager
|
| 26 |
-
|
|
|
|
| 27 |
from fastapi.middleware.cors import CORSMiddleware
|
| 28 |
|
| 29 |
-
from server.models import Action
|
| 30 |
from server.environment import IncidentEnvironment
|
| 31 |
-
from
|
|
|
|
| 32 |
|
| 33 |
_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 34 |
|
| 35 |
-
# ── Global env instance ──────────────────────────────────────────────────────
|
| 36 |
_env: IncidentEnvironment | None = None
|
| 37 |
|
| 38 |
|
| 39 |
@asynccontextmanager
|
| 40 |
async def lifespan(app: FastAPI):
|
| 41 |
-
"""Initialise heavy objects after the server is already accepting requests."""
|
| 42 |
global _env
|
| 43 |
_env = IncidentEnvironment()
|
| 44 |
yield
|
|
@@ -46,10 +43,13 @@ async def lifespan(app: FastAPI):
|
|
| 46 |
|
| 47 |
def _get_env() -> IncidentEnvironment:
|
| 48 |
if _env is None:
|
| 49 |
-
raise HTTPException(
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
| 53 |
return _env
|
| 54 |
|
| 55 |
|
|
@@ -58,7 +58,7 @@ app = FastAPI(
|
|
| 58 |
version="0.1.0",
|
| 59 |
description=(
|
| 60 |
"OpenEnv environment for training AI agents on cloud SRE incident response. "
|
| 61 |
-
"
|
| 62 |
),
|
| 63 |
lifespan=lifespan,
|
| 64 |
)
|
|
@@ -71,98 +71,134 @@ app.add_middleware(
|
|
| 71 |
)
|
| 72 |
|
| 73 |
|
| 74 |
-
# ──
|
| 75 |
-
|
| 76 |
-
@app.get("/")
|
| 77 |
-
def root():
|
| 78 |
-
return {
|
| 79 |
-
"status": "running",
|
| 80 |
-
"name": "cloud-incident-response",
|
| 81 |
-
"version": "0.1.0",
|
| 82 |
-
"description": "OpenEnv environment for cloud SRE incident response",
|
| 83 |
-
"tasks": ["alert_classification", "root_cause_analysis", "remediation_planning"],
|
| 84 |
-
"docs": "/docs",
|
| 85 |
-
"health": "/health",
|
| 86 |
-
}
|
| 87 |
|
| 88 |
|
| 89 |
-
# ── Core endpoints ────────────────────────────────────────────────────────────
|
| 90 |
-
|
| 91 |
@app.get("/health")
|
| 92 |
def health():
|
|
|
|
| 93 |
return {"status": "ok", "version": "0.1.0"}
|
| 94 |
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
@app.post("/reset")
|
| 97 |
-
def reset(
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
env = _get_env()
|
| 103 |
try:
|
| 104 |
obs = env.reset(task_id=task_id, scenario_index=scenario_index)
|
| 105 |
return obs.model_dump()
|
| 106 |
except ValueError as e:
|
| 107 |
-
raise HTTPException(
|
| 108 |
except Exception as e:
|
| 109 |
-
raise HTTPException(
|
| 110 |
|
| 111 |
|
| 112 |
@app.post("/step")
|
| 113 |
def step(action: Action):
|
| 114 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
env = _get_env()
|
| 116 |
try:
|
| 117 |
obs, reward, done, info = env.step(action)
|
| 118 |
return {
|
| 119 |
"observation": obs.model_dump(),
|
| 120 |
-
"reward":
|
| 121 |
-
"done":
|
| 122 |
-
"info":
|
| 123 |
}
|
| 124 |
except RuntimeError as e:
|
| 125 |
-
raise HTTPException(
|
| 126 |
except Exception as e:
|
| 127 |
-
raise HTTPException(
|
| 128 |
|
| 129 |
|
| 130 |
@app.get("/state")
|
| 131 |
def state():
|
| 132 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 133 |
env = _get_env()
|
| 134 |
try:
|
| 135 |
return env.state().model_dump()
|
| 136 |
except RuntimeError as e:
|
| 137 |
-
raise HTTPException(
|
| 138 |
except Exception as e:
|
| 139 |
-
raise HTTPException(
|
| 140 |
|
| 141 |
|
| 142 |
@app.get("/tasks")
|
| 143 |
def tasks():
|
| 144 |
-
"""
|
| 145 |
return {
|
| 146 |
"tasks": list_tasks(),
|
| 147 |
"total": len(ALL_TASKS),
|
| 148 |
"action_schema": {
|
| 149 |
"diagnostic": [
|
| 150 |
-
{"action_type": "query_logs",
|
| 151 |
-
{"action_type": "check_metrics",
|
| 152 |
-
{"action_type": "check_dependencies",
|
| 153 |
{"action_type": "check_recent_deploys", "parameters": {"service": "string"}},
|
| 154 |
{"action_type": "check_service_status", "parameters": {"service": "string"}},
|
| 155 |
],
|
| 156 |
"remediation": [
|
| 157 |
-
{"action_type": "restart_service",
|
| 158 |
-
{"action_type": "rollback_deploy",
|
| 159 |
-
{"action_type": "scale_service",
|
| 160 |
{"action_type": "disable_feature_flag", "parameters": {"flag": "string"}},
|
| 161 |
-
{"action_type": "clear_cache",
|
| 162 |
-
{"action_type": "execute_runbook_step", "parameters": {"runbook_action": "string"
|
| 163 |
],
|
| 164 |
"submission": [
|
| 165 |
-
{"action_type": "submit_severity",
|
| 166 |
{"action_type": "submit_root_cause", "parameters": {"service": "string", "failure_mode": "string"}},
|
| 167 |
{"action_type": "submit_resolution", "parameters": {"summary": "string"}},
|
| 168 |
],
|
|
@@ -172,59 +208,423 @@ def tasks():
|
|
| 172 |
|
| 173 |
@app.get("/grader")
|
| 174 |
def grader():
|
| 175 |
-
"""
|
| 176 |
env = _get_env()
|
| 177 |
try:
|
| 178 |
s = env.state()
|
| 179 |
from graders import grade
|
| 180 |
result = grade(s.task_id, s.model_dump(), env._scenario)
|
| 181 |
return {
|
| 182 |
-
"total":
|
| 183 |
-
"breakdown":
|
| 184 |
-
"feedback":
|
| 185 |
-
"task_id":
|
| 186 |
"scenario_id": s.scenario_id,
|
| 187 |
-
"steps_used":
|
| 188 |
-
"done":
|
| 189 |
}
|
| 190 |
except RuntimeError as e:
|
| 191 |
-
raise HTTPException(
|
| 192 |
except Exception as e:
|
| 193 |
-
raise HTTPException(
|
| 194 |
|
| 195 |
|
| 196 |
@app.post("/baseline")
|
| 197 |
def baseline():
|
| 198 |
-
"""Run inference
|
| 199 |
script = os.path.join(_ROOT, "inference.py")
|
| 200 |
if not os.path.exists(script):
|
| 201 |
-
raise HTTPException(
|
| 202 |
-
status_code=500,
|
| 203 |
-
detail="inference.py not found in project root",
|
| 204 |
-
)
|
| 205 |
try:
|
| 206 |
result = subprocess.run(
|
| 207 |
[sys.executable, script],
|
| 208 |
-
capture_output=True,
|
| 209 |
-
text=True,
|
| 210 |
-
timeout=1200,
|
| 211 |
-
cwd=_ROOT,
|
| 212 |
env={**os.environ, "ENV_BASE_URL": "http://localhost:7860"},
|
| 213 |
)
|
| 214 |
except subprocess.TimeoutExpired:
|
| 215 |
-
raise HTTPException(
|
| 216 |
|
| 217 |
if result.returncode != 0:
|
| 218 |
-
raise HTTPException(
|
| 219 |
|
| 220 |
lines = result.stdout.strip().splitlines()
|
| 221 |
-
last
|
| 222 |
try:
|
| 223 |
return json.loads(last)
|
| 224 |
except Exception:
|
| 225 |
return {"raw_output": result.stdout[-3000:]}
|
| 226 |
|
| 227 |
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
import uvicorn
|
| 230 |
-
uvicorn.run("server.app:app", host="0.0.0.0", port=7860, reload=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
server/app.py — FastAPI + Gradio server for Cloud Incident Response OpenEnv.
|
| 3 |
+
|
| 4 |
+
Endpoints (OpenEnv spec):
|
| 5 |
+
GET /health → {"status": "ok"}
|
| 6 |
+
POST /reset → Observation (accepts JSON body or query params)
|
| 7 |
+
POST /step → {"observation": ..., "reward": ..., "done": ..., "info": ...}
|
| 8 |
+
GET /state → EpisodeState
|
| 9 |
+
GET /tasks → task list with action schemas
|
| 10 |
+
GET /grader → grading result for current episode
|
| 11 |
+
POST /baseline → run inference.py
|
|
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
from __future__ import annotations
|
|
|
|
| 18 |
import subprocess
|
| 19 |
import sys
|
| 20 |
|
|
|
|
| 21 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 22 |
|
| 23 |
from contextlib import asynccontextmanager
|
| 24 |
+
|
| 25 |
+
from fastapi import FastAPI, HTTPException, Request
|
| 26 |
from fastapi.middleware.cors import CORSMiddleware
|
| 27 |
|
|
|
|
| 28 |
from server.environment import IncidentEnvironment
|
| 29 |
+
from server.models import Action, ActionParameters
|
| 30 |
+
from tasks import ALL_TASKS, list_tasks
|
| 31 |
|
| 32 |
_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 33 |
|
|
|
|
| 34 |
_env: IncidentEnvironment | None = None
|
| 35 |
|
| 36 |
|
| 37 |
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared IncidentEnvironment before the app starts serving.

    The instance is stored in the module-level ``_env`` so that the request
    handlers and the Gradio callbacks in this module operate on the same
    environment object.
    """
    global _env
    _env = IncidentEnvironment()
    yield
|
|
|
|
| 43 |
|
| 44 |
def _get_env() -> IncidentEnvironment:
    """Return the shared environment for HTTP handlers.

    Raises:
        HTTPException: 503 while the module-level ``_env`` is still ``None``.
    """
    env = _env
    if env is not None:
        return env
    raise HTTPException(503, "Environment initialising — retry in a moment")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _get_env_direct() -> IncidentEnvironment:
    """Return the shared environment for non-HTTP callers (the Gradio callbacks).

    Raises:
        RuntimeError: if the module-level ``_env`` is still ``None``.
    """
    env = _env
    if env is not None:
        return env
    raise RuntimeError("Environment not initialised yet")
|
| 54 |
|
| 55 |
|
|
|
|
| 58 |
version="0.1.0",
|
| 59 |
description=(
|
| 60 |
"OpenEnv environment for training AI agents on cloud SRE incident response. "
|
| 61 |
+
"Implements step()/reset()/state() API with typed Observation, Action, and Reward models."
|
| 62 |
),
|
| 63 |
lifespan=lifespan,
|
| 64 |
)
|
|
|
|
| 71 |
)
|
| 72 |
|
| 73 |
|
| 74 |
+
# ── OpenEnv API Endpoints ─────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
|
|
|
|
|
|
|
| 77 |
@app.get("/health")
def health():
    """Report liveness together with the service version string."""
    payload = {"status": "ok", "version": "0.1.0"}
    return payload
|
| 81 |
|
| 82 |
|
| 83 |
+
@app.get("/api/info")
def api_info():
    """Return static metadata about this environment plus the known task ids."""
    task_ids = list(ALL_TASKS.keys())
    return dict(
        status="running",
        name="cloud-incident-response",
        version="0.1.0",
        description="OpenEnv environment for cloud SRE incident response",
        tasks=task_ids,
        docs="/docs",
    )
|
| 94 |
+
|
| 95 |
+
|
| 96 |
@app.post("/reset")
async def reset(request: Request):
    """Reset the environment and start a new episode.

    ``task_id`` and ``scenario_index`` may be supplied as query parameters
    (``/reset?task_id=...&scenario_index=...``) or in a JSON object body
    (``{"task_id": "...", "scenario_index": 0}``). Body values are applied
    after query parameters and therefore win. Missing, ``null`` or
    unparseable values fall back to the defaults (``alert_classification``,
    scenario 0).

    Returns:
        The initial observation as a dict (``obs.model_dump()``).

    Raises:
        HTTPException: 400 when ``env.reset`` raises ``ValueError``; 500 for
            any other failure inside ``env.reset``.
    """
    task_id = "alert_classification"
    scenario_index = 0

    def _as_index(raw, fallback):
        # Lenient by design: a malformed index keeps the current value
        # instead of failing the whole request.
        try:
            return int(raw)
        except (TypeError, ValueError, OverflowError):
            return fallback

    # Query parameters first.
    query = request.query_params
    if query.get("task_id"):
        task_id = query["task_id"]
    if query.get("scenario_index"):
        scenario_index = _as_index(query["scenario_index"], scenario_index)

    # JSON body second. Only the parse itself is guarded, so an empty or
    # non-JSON body is tolerated without hiding unrelated errors.
    try:
        body = await request.json()
    except ValueError:  # JSONDecodeError / UnicodeDecodeError
        body = None

    if isinstance(body, dict):
        # An explicit JSON null must not replace the default with None.
        if body.get("task_id") is not None:
            task_id = body["task_id"]
        if body.get("scenario_index") is not None:
            scenario_index = _as_index(body["scenario_index"], scenario_index)

    env = _get_env()
    try:
        obs = env.reset(task_id=task_id, scenario_index=scenario_index)
        return obs.model_dump()
    except ValueError as e:
        raise HTTPException(400, str(e))
    except Exception as e:
        raise HTTPException(500, str(e))
|
| 139 |
|
| 140 |
|
| 141 |
@app.post("/step")
def step(action: Action):
    """Apply one action to the current episode.

    Accepts an ``Action`` JSON body and returns a dict with the keys
    ``observation``, ``reward``, ``done`` and ``info``.

    Raises:
        HTTPException: 400 when ``env.step`` raises ``RuntimeError``; 500 for
            any other failure.
    """
    env = _get_env()
    try:
        observation, reward, finished, extra = env.step(action)
        response = {}
        response["observation"] = observation.model_dump()
        response["reward"] = reward.model_dump()
        response["done"] = finished
        response["info"] = extra
        return response
    except RuntimeError as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
|
| 161 |
|
| 162 |
|
| 163 |
@app.get("/state")
def state():
    """Return the current episode state as a dict (``env.state().model_dump()``).

    Raises:
        HTTPException: 400 when ``env.state`` raises ``RuntimeError``; 500 for
            any other failure.
    """
    env = _get_env()
    try:
        snapshot = env.state()
        return snapshot.model_dump()
    except RuntimeError as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
|
| 176 |
|
| 177 |
|
| 178 |
@app.get("/tasks")
|
| 179 |
def tasks():
|
| 180 |
+
"""List all available tasks with action schemas."""
|
| 181 |
return {
|
| 182 |
"tasks": list_tasks(),
|
| 183 |
"total": len(ALL_TASKS),
|
| 184 |
"action_schema": {
|
| 185 |
"diagnostic": [
|
| 186 |
+
{"action_type": "query_logs", "parameters": {"service": "string"}},
|
| 187 |
+
{"action_type": "check_metrics", "parameters": {"service": "string"}},
|
| 188 |
+
{"action_type": "check_dependencies", "parameters": {"service": "string"}},
|
| 189 |
{"action_type": "check_recent_deploys", "parameters": {"service": "string"}},
|
| 190 |
{"action_type": "check_service_status", "parameters": {"service": "string"}},
|
| 191 |
],
|
| 192 |
"remediation": [
|
| 193 |
+
{"action_type": "restart_service", "parameters": {"service": "string"}},
|
| 194 |
+
{"action_type": "rollback_deploy", "parameters": {"service": "string", "target_version": "string"}},
|
| 195 |
+
{"action_type": "scale_service", "parameters": {"service": "string", "replicas": "int"}},
|
| 196 |
{"action_type": "disable_feature_flag", "parameters": {"flag": "string"}},
|
| 197 |
+
{"action_type": "clear_cache", "parameters": {"service": "string"}},
|
| 198 |
+
{"action_type": "execute_runbook_step", "parameters": {"runbook_action": "string"}},
|
| 199 |
],
|
| 200 |
"submission": [
|
| 201 |
+
{"action_type": "submit_severity", "parameters": {"severity": "P1|P2|P3|P4", "service": "string"}},
|
| 202 |
{"action_type": "submit_root_cause", "parameters": {"service": "string", "failure_mode": "string"}},
|
| 203 |
{"action_type": "submit_resolution", "parameters": {"summary": "string"}},
|
| 204 |
],
|
|
|
|
| 208 |
|
| 209 |
@app.get("/grader")
def grader():
    """Grade the current episode and return the score with its breakdown.

    The response combines the grader's ``total``, ``breakdown`` and
    ``feedback`` with identifying fields read from the episode state.

    Raises:
        HTTPException: 400 when a ``RuntimeError`` is raised while reading
            state or grading; 500 for any other failure.
    """
    env = _get_env()
    try:
        episode = env.state()
        # Imported lazily, inside the guarded region, as in the original.
        from graders import grade
        outcome = grade(episode.task_id, episode.model_dump(), env._scenario)
        report = {key: outcome[key] for key in ("total", "breakdown", "feedback")}
        report["task_id"] = episode.task_id
        report["scenario_id"] = episode.scenario_id
        report["steps_used"] = episode.step_count
        report["done"] = episode.done
        return report
    except RuntimeError as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
|
| 230 |
|
| 231 |
|
| 232 |
@app.post("/baseline")
def baseline():
    """Run ``inference.py`` from the project root and return its result.

    The script runs in a child process with ``ENV_BASE_URL`` pointing at this
    server. The last line of its stdout is parsed as JSON; if that fails, the
    tail of the raw stdout is returned under ``raw_output`` instead.

    Raises:
        HTTPException: 500 when the script is missing, exceeds the 1200 s
            timeout, or exits with a non-zero status.
    """
    script = os.path.join(_ROOT, "inference.py")
    if not os.path.exists(script):
        raise HTTPException(500, "inference.py not found")

    child_env = dict(os.environ)
    child_env["ENV_BASE_URL"] = "http://localhost:7860"

    try:
        proc = subprocess.run(
            [sys.executable, script],
            capture_output=True,
            text=True,
            timeout=1200,
            cwd=_ROOT,
            env=child_env,
        )
    except subprocess.TimeoutExpired:
        raise HTTPException(500, "inference.py timed out (>20 min)")

    if proc.returncode != 0:
        raise HTTPException(500, proc.stderr[-2000:])

    stdout_lines = proc.stdout.strip().splitlines()
    final_line = stdout_lines[-1] if stdout_lines else ""
    try:
        return json.loads(final_line)
    except Exception:
        # Best effort: hand back the raw tail when the last line is not JSON.
        return {"raw_output": proc.stdout[-3000:]}
|
| 256 |
|
| 257 |
|
| 258 |
+
# ── Gradio UI ─────────────────────────────────────────────────────────────────
|
| 259 |
+
|
| 260 |
+
import gradio as gr
|
| 261 |
+
|
| 262 |
+
DIFFICULTY_BADGE = {
|
| 263 |
+
"alert_classification": "🟢 Easy",
|
| 264 |
+
"root_cause_analysis": "🟡 Medium",
|
| 265 |
+
"remediation_planning": "🔴 Hard",
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
DIFFICULTY_INFO = {
|
| 269 |
+
"alert_classification": "3 steps · Classify severity P1–P4",
|
| 270 |
+
"root_cause_analysis": "10 steps · Find root cause service + failure mode",
|
| 271 |
+
"remediation_planning": "15 steps · Diagnose, fix, and document",
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
SUBMIT_ACTION = {
|
| 275 |
+
"alert_classification": "submit_severity",
|
| 276 |
+
"root_cause_analysis": "submit_root_cause",
|
| 277 |
+
"remediation_planning": "submit_resolution",
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
_DIAG_ACTIONS = [
|
| 281 |
+
"query_logs", "check_metrics", "check_dependencies",
|
| 282 |
+
"check_recent_deploys", "check_service_status",
|
| 283 |
+
]
|
| 284 |
+
_REM_ACTIONS = [
|
| 285 |
+
"restart_service", "rollback_deploy", "scale_service",
|
| 286 |
+
"disable_feature_flag", "clear_cache", "execute_runbook_step",
|
| 287 |
+
]
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def _fmt_obs(obs: dict) -> str:
    """Render an observation dict as Markdown for the observation panel.

    Sections are emitted only when the matching key holds a truthy value:
    incident summary (first 600 characters), alert details, known services,
    the submit action for the task, the last action error, and collected
    evidence (each entry truncated to 400 characters).
    """
    out = [f"### 📋 Scenario `{obs.get('scenario_id', '—')}`\n"]

    incident = obs.get("incident_summary", "")
    if incident:
        out.append(f"> {incident[:600]}\n")

    alert = obs.get("alert", {})
    if alert:
        out.append("#### 🔔 Alert Details\n")
        if alert.get("title"):
            out.append(f"**Title:** {alert['title']}\n")

        symptoms = alert.get("symptoms", [])
        if symptoms:
            out.append("**Symptoms:**")
            out.extend(f"- {symptom}" for symptom in symptoms)
            out.append("")

        facts = []
        error_rate = alert.get("error_rate")
        if error_rate is not None:
            facts.append(f"Error Rate: **{error_rate:.0%}**")
        duration = alert.get("duration_minutes")
        if duration is not None:
            facts.append(f"Duration: **{duration} min**")
        revenue = alert.get("revenue_impact_per_min")
        if revenue is not None:
            facts.append(f"Revenue: **${revenue:,.0f}/min**")
        if facts:
            out.append(" · ".join(facts) + "\n")

    known = obs.get("known_services", [])
    if known:
        joined = "` · `".join(known)
        out.append(f"#### 🖥️ Known Services\n`{joined}`\n")

    task_id = obs.get("task_id", "")
    submit = SUBMIT_ACTION.get(task_id, "")
    if submit:
        hint = DIFFICULTY_INFO.get(task_id, "")
        out.append(f"#### 📝 Submit: `{submit}`")
        if hint:
            out.append(f"*{hint}*\n")

    err = obs.get("last_action_error")
    if err:
        out.append(f"#### ⚠️ Last Action Error\n`{err}`\n")

    evidence = obs.get("queried_data", {})
    if evidence:
        out.append("---\n#### 📊 Evidence Collected\n")
        for action_type, per_service in evidence.items():
            # Entries that are not per-service mappings are not rendered.
            if not isinstance(per_service, dict):
                continue
            for svc, data in per_service.items():
                text = str(data)
                if len(text) > 400:
                    text = text[:400] + " …"
                out.append(f"**`[{action_type}]` → `{svc}`**")
                out.append(f"```\n{text}\n```\n")

    return "\n".join(out)
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
def _fmt_state(s: dict) -> str:
    """Render an episode-state dict as a Markdown status table.

    The table shows the task with its difficulty badge, the first 12
    characters of the episode id, a 20-cell progress bar derived from
    ``step_count / max_steps``, the cumulative reward and the submitted flag.
    """
    task_id = s.get("task_id", "—")
    badge = DIFFICULTY_BADGE.get(task_id, "")
    heading = "🏁 Complete" if s.get("done", False) else "⚡ Active"

    step_count = s.get("step_count", 0)
    max_steps = s.get("max_steps", 0)
    cum_reward = s.get("cumulative_reward", 0.0)

    if max_steps > 0:
        pct = step_count / max_steps * 100
    else:
        pct = 0
    # One bar cell per 5 percentage points.
    filled = int(pct / 5)
    bar = "█" * filled + "░" * (20 - filled)

    episode_prefix = s.get("episode_id", "—")[:12]
    submitted_mark = "✅" if s.get("submitted") else "❌"

    rows = [
        f"### {heading}\n\n",
        "| Field | Value |\n|---|---|\n",
        f"| **Task** | `{task_id}` {badge} |\n",
        f"| **Episode** | `{episode_prefix}…` |\n",
        f"| **Progress** | {step_count}/{max_steps} `{bar}` {pct:.0f}% |\n",
        f"| **Reward** | `{cum_reward:+.4f}` |\n",
        f"| **Submitted** | {submitted_mark} |\n",
    ]
    return "".join(rows)
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
def _fmt_history(action_history: list[dict]) -> str:
    """Render the action history as a Markdown table, one row per action.

    Only parameters with truthy values are listed. The icon marks the action
    as diagnostic, remediation, or anything else (shown as a submission).
    """
    if not action_history:
        return "*No actions yet.*"

    rows = ["| Step | Action | Parameters |", "|:---:|---|---|"]
    for entry in action_history:
        step_no = entry.get("step", "?")
        kind = entry.get("action_type", "?")
        params = entry.get("parameters", {})

        shown = [f"`{key}={val}`" for key, val in params.items() if val]
        param_text = ", ".join(shown) or "—"

        if kind in _DIAG_ACTIONS:
            icon = "🔍"
        elif kind in _REM_ACTIONS:
            icon = "🔧"
        else:
            icon = "📝"

        rows.append(f"| {step_no} | {icon} `{kind}` | {param_text} |")
    return "\n".join(rows)
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
def _fmt_reward(reward_text: str, grader_result: dict | None = None) -> str:
|
| 381 |
+
lines = [reward_text]
|
| 382 |
+
if grader_result:
|
| 383 |
+
total = grader_result.get("total", 0.0)
|
| 384 |
+
emoji = "🟢" if total >= 0.8 else ("🟡" if total >= 0.5 else "🔴")
|
| 385 |
+
lines.append(f"\n### {emoji} Grader Score: **{total:.4f}** / 1.0\n")
|
| 386 |
+
bd = grader_result.get("breakdown", {})
|
| 387 |
+
if bd:
|
| 388 |
+
lines.append("| Component | Value |\n|---|---|")
|
| 389 |
+
for k, v in bd.items():
|
| 390 |
+
lines.append(f"| {k} | `{v}` |")
|
| 391 |
+
lines.append("")
|
| 392 |
+
fb = grader_result.get("feedback", "")
|
| 393 |
+
if fb:
|
| 394 |
+
lines.append(f"> {fb}")
|
| 395 |
+
return "\n".join(lines)
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
def _gr_reset(task_id: str, scenario_index: str):
    """Gradio callback: start a new episode and refresh every panel.

    Returns a 5-tuple of (observation markdown, state markdown, history
    markdown, status message, service dropdown update). On any exception the
    error text is shown in the panels and the dropdown is emptied.
    """
    try:
        env = _get_env_direct()
        obs = env.reset(task_id=task_id, scenario_index=int(scenario_index))
        episode = env.state()
        services = obs.known_services
        first_service = services[0] if services else None
        service_picker = gr.Dropdown(choices=services, value=first_service)
        return (
            _fmt_obs(obs.model_dump()),
            _fmt_state(episode.model_dump()),
            _fmt_history([]),
            "✅ Episode started.",
            service_picker,
        )
    except Exception as e:
        err = f"❌ **Error:** {e}"
        return (err, err, "", err, gr.Dropdown(choices=[]))
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
def _gr_step(action_type, service, severity, failure_mode, summary, flag, runbook_action, target_version):
    """Gradio callback: execute one action and refresh the panels.

    Empty form values are passed to ``ActionParameters`` as ``None``. Returns
    a 4-tuple of (observation markdown, state markdown, history markdown,
    reward markdown). On any exception the error text is returned in the
    first and last positions.
    """
    try:
        env = _get_env_direct()

        form_values = {
            "service": service,
            "severity": severity,
            "failure_mode": failure_mode,
            "summary": summary,
            "flag": flag,
            "runbook_action": runbook_action,
            "target_version": target_version,
        }
        # Falsy form values (empty string, None) become None.
        params = ActionParameters(**{name: (value or None) for name, value in form_values.items()})
        action = Action(action_type=action_type, parameters=params)

        obs, reward, done, info = env.step(action)
        episode = env.state()

        reward_text = "".join([
            f"### Step Reward: `{reward.score:+.4f}`\n\n",
            f"**Cumulative:** `{reward.cumulative:+.4f}`\n\n",
            f"**Feedback:** {reward.reason}",
        ])
        if done:
            reward_text += "\n\n---\n🏁 **EPISODE COMPLETE** — Click **Grade Episode**"

        return (
            _fmt_obs(obs.model_dump()),
            _fmt_state(episode.model_dump()),
            _fmt_history(episode.action_history),
            reward_text,
        )
    except Exception as e:
        err = f"❌ **Error:** {e}"
        return (err, "", "", err)
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
def _gr_grade():
    """Gradio callback: grade the current episode and return it as Markdown.

    Any exception is rendered as a short error string rather than raised.
    """
    try:
        env = _get_env_direct()
        episode = env.state()
        from graders import grade
        outcome = grade(episode.task_id, episode.model_dump(), env._scenario)
    except Exception as e:
        return f"❌ {e}"
    try:
        return _fmt_reward("### Final Grading", outcome)
    except Exception as e:
        return f"❌ {e}"
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def _gr_state():
    """Gradio callback: return the current episode state as Markdown.

    Any exception is rendered as a short error string rather than raised.
    """
    try:
        snapshot = _get_env_direct().state()
        return _fmt_state(snapshot.model_dump())
    except Exception as e:
        return f"❌ {e}"
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
CUSTOM_CSS = """
|
| 466 |
+
:root, html, body, .gradio-container { color-scheme: light !important; }
|
| 467 |
+
body.dark, html.dark, .dark {
|
| 468 |
+
color-scheme: light !important;
|
| 469 |
+
--body-background-fill: #ffffff !important;
|
| 470 |
+
--background-fill-primary: #ffffff !important;
|
| 471 |
+
--background-fill-secondary: #f8fafc !important;
|
| 472 |
+
}
|
| 473 |
+
.gradio-container {
|
| 474 |
+
background: #ffffff !important;
|
| 475 |
+
max-width: 1500px !important;
|
| 476 |
+
margin: 0 auto !important;
|
| 477 |
+
}
|
| 478 |
+
.env-header {
|
| 479 |
+
display: flex; justify-content: space-between; align-items: center;
|
| 480 |
+
padding: 20px 16px; border-bottom: 2px solid #e2e8f0;
|
| 481 |
+
margin-bottom: 20px; background: linear-gradient(135deg, #f8fafc, #ffffff);
|
| 482 |
+
border-radius: 12px 12px 0 0;
|
| 483 |
+
}
|
| 484 |
+
.env-header-left {
|
| 485 |
+
display: flex; align-items: center; gap: 14px;
|
| 486 |
+
font-size: 1.5rem; font-weight: 800; color: #0f172a;
|
| 487 |
+
}
|
| 488 |
+
.env-header-dot {
|
| 489 |
+
width: 14px; height: 14px; border-radius: 50%;
|
| 490 |
+
background: #22c55e; box-shadow: 0 0 8px rgba(34,197,94,0.4);
|
| 491 |
+
}
|
| 492 |
+
.env-header-right { font-size: 0.9rem; font-weight: 600; color: #94a3b8; text-transform: uppercase; }
|
| 493 |
+
.section-title {
|
| 494 |
+
font-weight: 700; font-size: 0.95rem; color: #1e293b;
|
| 495 |
+
margin: 16px 0 8px; padding: 8px 12px; background: #f1f5f9;
|
| 496 |
+
border-radius: 8px; border-left: 3px solid #3b82f6;
|
| 497 |
+
}
|
| 498 |
+
"""
|
| 499 |
+
|
| 500 |
+
FORCE_LIGHT_JS = """
|
| 501 |
+
function() {
|
| 502 |
+
document.body.classList.remove('dark');
|
| 503 |
+
document.documentElement.classList.remove('dark');
|
| 504 |
+
document.documentElement.style.setProperty('color-scheme', 'light');
|
| 505 |
+
}
|
| 506 |
+
"""
|
| 507 |
+
|
| 508 |
+
with gr.Blocks(
|
| 509 |
+
title="Cloud Incident Response — OpenEnv",
|
| 510 |
+
css=CUSTOM_CSS, js=FORCE_LIGHT_JS,
|
| 511 |
+
theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate",
|
| 512 |
+
font=gr.themes.GoogleFont("Inter")),
|
| 513 |
+
) as demo:
|
| 514 |
+
|
| 515 |
+
gr.HTML("""
|
| 516 |
+
<div class="env-header">
|
| 517 |
+
<div class="env-header-left">
|
| 518 |
+
<span class="env-header-dot"></span> ☁️ Cloud Incident Response
|
| 519 |
+
</div>
|
| 520 |
+
<span class="env-header-right">OpenEnv · v0.1.0</span>
|
| 521 |
+
</div>
|
| 522 |
+
""")
|
| 523 |
+
|
| 524 |
+
with gr.Accordion("📖 How to Use", open=False):
|
| 525 |
+
gr.Markdown("""
|
| 526 |
+
### Quick Start
|
| 527 |
+
1. Select **Task** + **Scenario** → Click **🔄 Reset**
|
| 528 |
+
2. Choose **Action Type** + **Service** → Click **▶️ Execute**
|
| 529 |
+
3. Repeat: investigate → remediate → submit
|
| 530 |
+
4. Click **📊 Grade** for final score (0.0–1.0)
|
| 531 |
+
|
| 532 |
+
### Tasks
|
| 533 |
+
| Task | Difficulty | Steps | Submission |
|
| 534 |
+
|---|---|---|---|
|
| 535 |
+
| `alert_classification` | 🟢 Easy | 3 | `submit_severity` |
|
| 536 |
+
| `root_cause_analysis` | 🟡 Medium | 10 | `submit_root_cause` |
|
| 537 |
+
| `remediation_planning` | 🔴 Hard | 15 | `submit_resolution` |
|
| 538 |
+
|
| 539 |
+
### Important
|
| 540 |
+
- **Medium/Hard**: `check_recent_deploys` requires prior `query_logs` or `check_metrics` on that service
|
| 541 |
+
- Each action gives immediate reward feedback
|
| 542 |
+
- Wrong remediation actions are penalized
|
| 543 |
+
""")
|
| 544 |
+
|
| 545 |
+
with gr.Row(equal_height=False):
|
| 546 |
+
with gr.Column(scale=2, min_width=380):
|
| 547 |
+
gr.HTML('<div class="section-title">🎯 Episode Setup</div>')
|
| 548 |
+
with gr.Row():
|
| 549 |
+
task_dd = gr.Dropdown(
|
| 550 |
+
choices=[("🟢 Easy — Alert Classification", "alert_classification"),
|
| 551 |
+
("🟡 Medium — Root Cause Analysis", "root_cause_analysis"),
|
| 552 |
+
("🔴 Hard — Remediation Planning", "remediation_planning")],
|
| 553 |
+
value="alert_classification", label="Task", scale=2)
|
| 554 |
+
scenario_dd = gr.Dropdown(
|
| 555 |
+
choices=[("Scenario 0", "0"), ("Scenario 1", "1"), ("Scenario 2", "2")],
|
| 556 |
+
value="0", label="Scenario", scale=1)
|
| 557 |
+
reset_btn = gr.Button("🔄 Reset Environment", variant="secondary", size="lg")
|
| 558 |
+
|
| 559 |
+
gr.HTML('<div class="section-title">🎮 Action Controls</div>')
|
| 560 |
+
action_type_dd = gr.Dropdown(
|
| 561 |
+
choices=[("🔍 query_logs", "query_logs"), ("🔍 check_metrics", "check_metrics"),
|
| 562 |
+
("🔍 check_dependencies", "check_dependencies"),
|
| 563 |
+
("🔍 check_recent_deploys", "check_recent_deploys"),
|
| 564 |
+
("🔍 check_service_status", "check_service_status"),
|
| 565 |
+
("🔧 restart_service", "restart_service"),
|
| 566 |
+
("🔧 rollback_deploy", "rollback_deploy"),
|
| 567 |
+
("🔧 scale_service", "scale_service"),
|
| 568 |
+
("🔧 disable_feature_flag", "disable_feature_flag"),
|
| 569 |
+
("🔧 clear_cache", "clear_cache"),
|
| 570 |
+
("🔧 execute_runbook_step", "execute_runbook_step"),
|
| 571 |
+
("📝 submit_severity", "submit_severity"),
|
| 572 |
+
("📝 submit_root_cause", "submit_root_cause"),
|
| 573 |
+
("📝 submit_resolution", "submit_resolution")],
|
| 574 |
+
value="query_logs", label="Action Type")
|
| 575 |
+
service_dd = gr.Dropdown(choices=[], label="Target Service",
|
| 576 |
+
allow_custom_value=True, info="Populated after Reset")
|
| 577 |
+
|
| 578 |
+
with gr.Accordion("📋 Parameters", open=False):
|
| 579 |
+
severity_dd = gr.Dropdown(
|
| 580 |
+
choices=[("—", ""), ("P1 Critical", "P1"), ("P2 High", "P2"),
|
| 581 |
+
("P3 Medium", "P3"), ("P4 Low", "P4")],
|
| 582 |
+
value="", label="Severity")
|
| 583 |
+
failure_mode_input = gr.Textbox(label="Failure Mode", lines=1,
|
| 584 |
+
placeholder="e.g. unbounded query OOM killing postgres-db")
|
| 585 |
+
summary_input = gr.Textbox(label="Resolution Summary", lines=4,
|
| 586 |
+
placeholder="3+ sentences: what failed, what you did, status")
|
| 587 |
+
flag_input = gr.Textbox(label="Feature Flag", lines=1, placeholder="e.g. full_history_export")
|
| 588 |
+
runbook_input = gr.Textbox(label="Runbook Action", lines=1, placeholder="e.g. restore_bgp_routes")
|
| 589 |
+
target_version_input = gr.Textbox(label="Target Version", lines=1, placeholder="e.g. previous")
|
| 590 |
+
|
| 591 |
+
step_btn = gr.Button("▶️ Execute Action", variant="primary", size="lg")
|
| 592 |
+
|
| 593 |
+
gr.HTML('<div class="section-title">📊 Controls</div>')
|
| 594 |
+
with gr.Row():
|
| 595 |
+
grade_btn = gr.Button("📊 Grade", variant="secondary", size="sm")
|
| 596 |
+
state_btn = gr.Button("📋 State", variant="secondary", size="sm")
|
| 597 |
+
|
| 598 |
+
gr.HTML('<div class="section-title">📌 State</div>')
|
| 599 |
+
state_display = gr.Markdown("### ⏳ Ready\n\nSelect task → Reset → Begin")
|
| 600 |
+
|
| 601 |
+
with gr.Column(scale=3, min_width=480):
|
| 602 |
+
gr.HTML('<div class="section-title">👁️ Observation</div>')
|
| 603 |
+
obs_display = gr.Markdown("### 👋 Welcome\n\nSelect a task and click **Reset** to begin.")
|
| 604 |
+
|
| 605 |
+
gr.HTML('<div class="section-title">📜 History</div>')
|
| 606 |
+
history_display = gr.Markdown("*No actions yet.*")
|
| 607 |
+
|
| 608 |
+
gr.HTML('<div class="section-title">💰 Reward</div>')
|
| 609 |
+
reward_display = gr.Markdown("*Start an episode first.*")
|
| 610 |
+
|
| 611 |
+
reset_btn.click(fn=_gr_reset, inputs=[task_dd, scenario_dd],
|
| 612 |
+
outputs=[obs_display, state_display, history_display, reward_display, service_dd])
|
| 613 |
+
step_btn.click(fn=_gr_step,
|
| 614 |
+
inputs=[action_type_dd, service_dd, severity_dd, failure_mode_input,
|
| 615 |
+
summary_input, flag_input, runbook_input, target_version_input],
|
| 616 |
+
outputs=[obs_display, state_display, history_display, reward_display])
|
| 617 |
+
grade_btn.click(fn=_gr_grade, outputs=[reward_display])
|
| 618 |
+
state_btn.click(fn=_gr_state, outputs=[state_display])
|
| 619 |
+
|
| 620 |
+
app = gr.mount_gradio_app(app, demo, path="/")
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
def main():
    """Serve ``server.app:app`` with uvicorn on 0.0.0.0:7860, without auto-reload."""
    # Imported here so that importing this module does not pull in uvicorn.
    import uvicorn

    uvicorn.run(
        "server.app:app",
        host="0.0.0.0",
        port=7860,
        reload=False,
    )
|
| 627 |
+
|
| 628 |
+
|
| 629 |
+
if __name__ == "__main__":
|
| 630 |
+
main()
|
server/environment.py
CHANGED
|
@@ -1,28 +1,24 @@
|
|
| 1 |
"""
|
| 2 |
server/environment.py — Core OpenEnv environment for Cloud Incident Response.
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
All state is in-memory. Thread-safe via a lock.
|
| 10 |
"""
|
| 11 |
|
| 12 |
from __future__ import annotations
|
| 13 |
|
| 14 |
-
import uuid
|
| 15 |
-
import threading
|
| 16 |
-
import sys
|
| 17 |
import os
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 20 |
|
| 21 |
-
from
|
| 22 |
-
from
|
| 23 |
-
from
|
| 24 |
-
|
| 25 |
-
# ── Action type classification ────────────────────────────────────────────────
|
| 26 |
|
| 27 |
_DIAGNOSTIC = frozenset({
|
| 28 |
"query_logs", "check_metrics", "check_dependencies",
|
|
@@ -38,54 +34,81 @@ _SUBMIT = frozenset({
|
|
| 38 |
"submit_severity", "submit_root_cause", "submit_resolution",
|
| 39 |
})
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
|
| 53 |
class IncidentEnvironment:
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
"""
|
| 58 |
-
|
| 59 |
-
def __init__(self):
|
| 60 |
-
self._lock = threading.Lock()
|
| 61 |
-
self._s: dict = {}
|
| 62 |
self._scenario: dict = {}
|
| 63 |
self._task_def: dict = {}
|
| 64 |
-
self._ready
|
| 65 |
-
|
| 66 |
-
# ── Public OpenEnv API ───────────────────────────────────────────────────
|
| 67 |
|
| 68 |
-
def reset(self, task_id: str
|
|
|
|
| 69 |
with self._lock:
|
| 70 |
task_def = get_task(task_id)
|
| 71 |
scenario = get_scenario(task_id, scenario_index)
|
| 72 |
-
|
| 73 |
self._task_def = task_def
|
| 74 |
self._scenario = scenario
|
| 75 |
self._s = {
|
| 76 |
-
"episode_id":
|
| 77 |
-
"task_id":
|
| 78 |
-
"scenario_id":
|
| 79 |
-
"step_count":
|
| 80 |
-
"max_steps":
|
| 81 |
-
"action_history":
|
| 82 |
-
"queried_data":
|
| 83 |
-
"queried_keys":
|
| 84 |
-
"
|
| 85 |
-
"
|
| 86 |
-
"
|
|
|
|
|
|
|
| 87 |
"cumulative_reward": 0.0,
|
| 88 |
-
"feedback":
|
|
|
|
| 89 |
}
|
| 90 |
self._ready = True
|
| 91 |
return self._build_obs()
|
|
@@ -94,76 +117,76 @@ class IncidentEnvironment:
|
|
| 94 |
with self._lock:
|
| 95 |
if not self._ready:
|
| 96 |
raise RuntimeError("Call reset() before step().")
|
| 97 |
-
|
| 98 |
s = self._s
|
|
|
|
|
|
|
| 99 |
if s["done"]:
|
| 100 |
-
return (
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
True,
|
| 105 |
-
{},
|
| 106 |
-
)
|
| 107 |
|
| 108 |
s["step_count"] += 1
|
| 109 |
step_num = s["step_count"]
|
| 110 |
-
at
|
| 111 |
-
params
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
s["action_history"].append({
|
| 114 |
"action_type": at,
|
| 115 |
-
"parameters":
|
| 116 |
-
"step":
|
| 117 |
})
|
| 118 |
|
| 119 |
-
r
|
| 120 |
fb: list[str] = []
|
| 121 |
|
| 122 |
-
|
| 123 |
-
if
|
| 124 |
-
r +=
|
| 125 |
-
fb.append("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
if at in _DIAGNOSTIC:
|
| 128 |
-
r, fb = self._handle_diagnostic(at, params, r, fb)
|
| 129 |
elif at in _REMEDIATION:
|
| 130 |
-
r, fb = self._handle_remediation(at, params, r, fb)
|
| 131 |
elif at in _SUBMIT:
|
| 132 |
-
r, fb, terminal = self._handle_submit(at, params, r, fb)
|
| 133 |
if terminal:
|
| 134 |
s["done"] = True
|
| 135 |
else:
|
| 136 |
-
r +=
|
| 137 |
-
fb.append(f"unknown
|
|
|
|
| 138 |
|
| 139 |
-
# Timeout if max steps reached without submission
|
| 140 |
if step_num >= s["max_steps"] and not s["done"]:
|
| 141 |
-
r +=
|
| 142 |
-
fb.append("timeout
|
| 143 |
s["done"] = True
|
| 144 |
|
| 145 |
-
# Apply grader score on terminal step
|
| 146 |
if s["done"]:
|
| 147 |
result = grade(s["task_id"], s, self._scenario)
|
|
|
|
| 148 |
s["cumulative_reward"] = round(
|
| 149 |
-
s["cumulative_reward"] + r +
|
| 150 |
-
)
|
| 151 |
-
fb.append(f"grader={result['feedback']}")
|
| 152 |
else:
|
| 153 |
s["cumulative_reward"] = round(s["cumulative_reward"] + r, 4)
|
| 154 |
|
| 155 |
s["feedback"] = " | ".join(fb) if fb else "ok"
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
reason=s["feedback"],
|
| 162 |
-
cumulative=s["cumulative_reward"],
|
| 163 |
-
),
|
| 164 |
-
s["done"],
|
| 165 |
-
{"step": step_num, "feedback": s["feedback"]},
|
| 166 |
-
)
|
| 167 |
|
| 168 |
def state(self) -> EpisodeState:
|
| 169 |
with self._lock:
|
|
@@ -171,154 +194,143 @@ class IncidentEnvironment:
|
|
| 171 |
raise RuntimeError("No active episode — call reset() first.")
|
| 172 |
s = self._s
|
| 173 |
return EpisodeState(
|
| 174 |
-
episode_id=s["episode_id"],
|
| 175 |
-
|
| 176 |
-
scenario_id=s["scenario_id"],
|
| 177 |
-
step_count=s["step_count"],
|
| 178 |
max_steps=s["max_steps"],
|
| 179 |
action_history=list(s["action_history"]),
|
| 180 |
queried_data=dict(s["queried_data"]),
|
| 181 |
-
submitted=s["submitted"],
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
r += R_QUERY_UNKNOWN
|
| 212 |
-
fb.append(f"unknown service '{service}' ({R_QUERY_UNKNOWN})")
|
| 213 |
else:
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
| 215 |
|
|
|
|
|
|
|
| 216 |
return r, fb
|
| 217 |
|
| 218 |
-
def _handle_remediation(
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
runbook = (params.runbook_action or "").lower().strip()
|
| 225 |
-
target
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
if service: keys.add(f"{at}:{service}")
|
| 230 |
if flag: keys.add(f"{at}:{flag}")
|
| 231 |
if runbook: keys.add(f"execute_runbook_step:{runbook}")
|
| 232 |
if target: keys.add(f"execute_runbook_step:{target}")
|
| 233 |
|
| 234 |
-
wrong_map
|
| 235 |
-
rem_data
|
| 236 |
|
| 237 |
-
# Check for wrong actions — also use fuzzy service matching for `at:svc` keys
|
| 238 |
is_wrong = any(k in wrong_map for k in keys)
|
| 239 |
-
if not is_wrong and
|
| 240 |
-
# Try _svc_match against wrong action keys of the form `at:svc`
|
| 241 |
for wk in wrong_map:
|
| 242 |
if ":" in wk:
|
| 243 |
w_at, w_svc = wk.split(":", 1)
|
| 244 |
-
if w_at == at and _svc_match(
|
| 245 |
is_wrong = True
|
| 246 |
break
|
| 247 |
|
| 248 |
if is_wrong:
|
| 249 |
-
r +=
|
| 250 |
-
reason = next(
|
| 251 |
-
|
| 252 |
-
"wrong action for this incident"
|
| 253 |
-
)
|
| 254 |
-
fb.append(f"wrong action '{at}': {str(reason)[:80]}")
|
| 255 |
else:
|
| 256 |
-
r +=
|
| 257 |
-
|
|
|
|
| 258 |
at_data = rem_data.get(at, {})
|
| 259 |
-
result
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
or "action executed successfully"
|
| 263 |
-
)
|
| 264 |
-
s["queried_data"].setdefault(at, {})[
|
| 265 |
-
service or flag or runbook or target or at
|
| 266 |
-
] = result
|
| 267 |
-
|
| 268 |
return r, fb
|
| 269 |
|
| 270 |
-
def _handle_submit(
|
| 271 |
-
self, at: str, params: ActionParameters, r: float, fb: list[str]
|
| 272 |
-
) -> tuple[float, list[str], bool]:
|
| 273 |
s = self._s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
s["submitted"] = True
|
|
|
|
|
|
|
| 275 |
|
| 276 |
if at == "submit_severity":
|
| 277 |
-
fb.append(f"
|
| 278 |
-
|
| 279 |
elif at == "submit_root_cause":
|
| 280 |
-
fb.append(
|
| 281 |
-
f"submitted root cause: "
|
| 282 |
-
f"service={params.service or ''}, "
|
| 283 |
-
f"failure_mode={params.failure_mode or ''}"
|
| 284 |
-
)
|
| 285 |
-
|
| 286 |
elif at == "submit_resolution":
|
| 287 |
-
summary
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
)
|
| 292 |
-
if summary.strip() and inv_count >= 1:
|
| 293 |
s["resolved"] = True
|
| 294 |
-
fb.append("
|
| 295 |
else:
|
| 296 |
-
fb.append("
|
| 297 |
-
|
| 298 |
return r, fb, True
|
| 299 |
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
def _build_obs(self) -> Observation:
|
| 303 |
-
s = self._s
|
| 304 |
sc = self._scenario
|
| 305 |
td = self._task_def
|
| 306 |
-
|
| 307 |
-
# Return sorted list of known service names (exact strings agents must use)
|
| 308 |
-
known = sorted(sc.get("known_services", set()))
|
| 309 |
-
|
| 310 |
return Observation(
|
| 311 |
-
episode_id=s["episode_id"],
|
| 312 |
-
|
| 313 |
-
scenario_id=s["scenario_id"],
|
| 314 |
-
step_count=s["step_count"],
|
| 315 |
max_steps=s["max_steps"],
|
| 316 |
incident_summary=sc.get("incident_summary", sc.get("description", "")),
|
| 317 |
alert=sc.get("alert", {}),
|
| 318 |
available_actions=td.get("available_actions", []),
|
| 319 |
queried_data=dict(s["queried_data"]),
|
| 320 |
cumulative_reward=s["cumulative_reward"],
|
| 321 |
-
done=s["done"],
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
)
|
|
|
|
| 1 |
"""
|
| 2 |
server/environment.py — Core OpenEnv environment for Cloud Incident Response.
|
| 3 |
|
| 4 |
+
Difficulty comes from SCENARIO DESIGN, not mechanics:
|
| 5 |
+
EASY: 3 services, clear metrics, obvious severity
|
| 6 |
+
MEDIUM: 8 services, root cause NOT in alert, must follow log breadcrumbs
|
| 7 |
+
HARD: 8 services + 5-7 remediation steps + quality summary + penalties
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
from __future__ import annotations
|
| 11 |
|
|
|
|
|
|
|
|
|
|
| 12 |
import os
|
| 13 |
+
import sys
|
| 14 |
+
import threading
|
| 15 |
+
import uuid
|
| 16 |
|
| 17 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 18 |
|
| 19 |
+
from graders import _svc_match, grade
|
| 20 |
+
from server.models import Action, ActionParameters, EpisodeState, Observation, Reward
|
| 21 |
+
from tasks import get_scenario, get_task
|
|
|
|
|
|
|
| 22 |
|
| 23 |
_DIAGNOSTIC = frozenset({
|
| 24 |
"query_logs", "check_metrics", "check_dependencies",
|
|
|
|
| 34 |
"submit_severity", "submit_root_cause", "submit_resolution",
|
| 35 |
})
|
| 36 |
|
| 37 |
+
# Maps each task id to the one submit action that ends its episode.
# _handle_submit() penalises any other submit action and keeps the episode open.
_TASK_SUBMIT = {
    "alert_classification": "submit_severity",
    "root_cause_analysis": "submit_root_cause",
    "remediation_planning": "submit_resolution",
}
|
| 42 |
+
|
| 43 |
+
# Per-step shaping rewards, keyed by difficulty and then by event name.
# step() picks the row through _TASK_DIFFICULTY and hands it to the handlers as `rt`.
#
# Event keys, as used by the handlers in this file:
#   query_new_svc     diagnostic query on a service not queried before
#   query_new_action  new diagnostic action on an already-queried service
#   query_repeat      same (action, service) pair queried again
#   query_unknown_svc service is not in the scenario's known_services
#   query_no_service  diagnostic action sent without a service parameter
#   rem_good          remediation that is not listed in the scenario's wrong_actions
#   rem_wrong         remediation listed in wrong_actions (or any remediation
#                     during alert_classification)
#   rem_no_target     remediation with no service/flag/runbook_action/target
#   submit_correct    submit action matches the task's expected submit action
#   submit_wrong      submit action does not match the task
#   past_half         non-submit action taken after half of max_steps
#   timeout           max_steps reached without the episode being done
#   bad_action        action type not recognised at all
#   exact_repeat      identical action type + parameters already seen this episode
_REWARD_TABLE = {
    "easy": {
        "query_new_svc": +0.04, "query_new_action": +0.02,
        "query_repeat": -0.03, "query_unknown_svc": -0.06,
        "query_no_service": -0.04, "rem_good": +0.00,
        "rem_wrong": -0.08, "rem_no_target": -0.05,
        "submit_correct": +0.02, "submit_wrong": -0.08,
        "past_half": -0.04, "timeout": -0.15,
        "bad_action": -0.05, "exact_repeat": -0.04,
    },
    "medium": {
        "query_new_svc": +0.04, "query_new_action": +0.02,
        "query_repeat": -0.04, "query_unknown_svc": -0.06,
        "query_no_service": -0.04, "rem_good": +0.06,
        "rem_wrong": -0.10, "rem_no_target": -0.06,
        "submit_correct": +0.02, "submit_wrong": -0.10,
        "past_half": -0.02, "timeout": -0.15,
        "bad_action": -0.05, "exact_repeat": -0.05,
    },
    "hard": {
        "query_new_svc": +0.03, "query_new_action": +0.01,
        "query_repeat": -0.03, "query_unknown_svc": -0.05,
        "query_no_service": -0.03, "rem_good": +0.06,
        "rem_wrong": -0.15, "rem_no_target": -0.05,
        "submit_correct": +0.02, "submit_wrong": -0.12,
        "past_half": -0.02, "timeout": -0.20,
        "bad_action": -0.05, "exact_repeat": -0.04,
    },
}
|
| 72 |
+
|
| 73 |
+
# Maps each task id to the _REWARD_TABLE row used for its shaping rewards.
# step() falls back to "medium" for a task id that is not listed here.
_TASK_DIFFICULTY = {
    "alert_classification": "easy",
    "root_cause_analysis": "medium",
    "remediation_planning": "hard",
}
|
| 78 |
|
| 79 |
|
| 80 |
class IncidentEnvironment:
|
| 81 |
+
def __init__(self) -> None:
    """Create an environment that has no active episode yet.

    ``reset()`` has to be called before ``step()`` will accept actions.
    """
    # step() refuses to run until reset() flips this flag.
    self._ready = False
    # Task metadata and scenario data for the current episode (filled by reset()).
    self._task_def: dict = {}
    self._scenario: dict = {}
    # Mutable per-episode bookkeeping (counters, history, rewards).
    self._s: dict = {}
    # Held by reset(), step() and state() while they touch the fields above.
    self._lock = threading.Lock()
|
|
|
|
|
|
|
| 87 |
|
| 88 |
+
def reset(self, task_id: str = "alert_classification",
          scenario_index: int = 0) -> Observation:
    """Begin a new episode and return its opening observation.

    Args:
        task_id: Task to run; passed to ``get_task`` and ``get_scenario``.
        scenario_index: Which scenario of the task to load.

    Returns:
        The observation built from the freshly initialised episode state.
    """
    with self._lock:
        # Both lookups run before anything is assigned, so if either one
        # raises, the previous episode is left as it was.
        task = get_task(task_id)
        picked = get_scenario(task_id, scenario_index)
        self._task_def, self._scenario = task, picked

        self._s = dict(
            episode_id=str(uuid.uuid4()),
            task_id=task_id,
            scenario_id=picked["scenario_id"],
            step_count=0,
            max_steps=task["max_steps"],
            action_history=[],
            queried_data={},
            # (action, service) pairs already queried
            queried_keys=set(),
            # services touched by any diagnostic action
            services_queried=set(),
            # fingerprints of action type + parameters, for exact-repeat detection
            exact_hashes=set(),
            submitted=False,
            resolved=False,
            done=False,
            cumulative_reward=0.0,
            feedback=f"Episode started. {picked['description']}",
            last_action_error=None,
        )
        self._ready = True
        return self._build_obs()
|
|
|
|
| 117 |
with self._lock:
|
| 118 |
if not self._ready:
|
| 119 |
raise RuntimeError("Call reset() before step().")
|
|
|
|
| 120 |
s = self._s
|
| 121 |
+
s["last_action_error"] = None
|
| 122 |
+
|
| 123 |
if s["done"]:
|
| 124 |
+
return (self._build_obs(),
|
| 125 |
+
Reward(score=0.0, reason="episode already done",
|
| 126 |
+
cumulative=s["cumulative_reward"]),
|
| 127 |
+
True, {})
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
s["step_count"] += 1
|
| 130 |
step_num = s["step_count"]
|
| 131 |
+
at = action.action_type
|
| 132 |
+
params = action.parameters
|
| 133 |
+
task_id = s["task_id"]
|
| 134 |
+
diff = _TASK_DIFFICULTY.get(task_id, "medium")
|
| 135 |
+
rt = _REWARD_TABLE[diff]
|
| 136 |
|
| 137 |
s["action_history"].append({
|
| 138 |
"action_type": at,
|
| 139 |
+
"parameters": params.model_dump(exclude_none=True),
|
| 140 |
+
"step": step_num,
|
| 141 |
})
|
| 142 |
|
| 143 |
+
r = 0.0
|
| 144 |
fb: list[str] = []
|
| 145 |
|
| 146 |
+
h = f"{at}|{params.model_dump_json(exclude_none=True)}"
|
| 147 |
+
if h in s["exact_hashes"]:
|
| 148 |
+
r += rt["exact_repeat"]
|
| 149 |
+
fb.append(f"exact repeat ({rt['exact_repeat']:+.2f})")
|
| 150 |
+
s["exact_hashes"].add(h)
|
| 151 |
+
|
| 152 |
+
half = max(1, s["max_steps"] // 2)
|
| 153 |
+
if step_num > half and at not in _SUBMIT:
|
| 154 |
+
r += rt["past_half"]
|
| 155 |
+
fb.append(f"past halfway ({rt['past_half']:+.3f})")
|
| 156 |
|
| 157 |
if at in _DIAGNOSTIC:
|
| 158 |
+
r, fb = self._handle_diagnostic(at, params, r, fb, rt)
|
| 159 |
elif at in _REMEDIATION:
|
| 160 |
+
r, fb = self._handle_remediation(at, params, r, fb, rt, task_id)
|
| 161 |
elif at in _SUBMIT:
|
| 162 |
+
r, fb, terminal = self._handle_submit(at, params, r, fb, rt, task_id)
|
| 163 |
if terminal:
|
| 164 |
s["done"] = True
|
| 165 |
else:
|
| 166 |
+
r += rt["bad_action"]
|
| 167 |
+
fb.append(f"unknown action '{at}' ({rt['bad_action']:+.2f})")
|
| 168 |
+
s["last_action_error"] = f"Unknown action type: {at}"
|
| 169 |
|
|
|
|
| 170 |
if step_num >= s["max_steps"] and not s["done"]:
|
| 171 |
+
r += rt["timeout"]
|
| 172 |
+
fb.append(f"timeout ({rt['timeout']:+.2f})")
|
| 173 |
s["done"] = True
|
| 174 |
|
|
|
|
| 175 |
if s["done"]:
|
| 176 |
result = grade(s["task_id"], s, self._scenario)
|
| 177 |
+
grader_score = result["total"]
|
| 178 |
s["cumulative_reward"] = round(
|
| 179 |
+
s["cumulative_reward"] + r + grader_score, 4)
|
| 180 |
+
fb.append(f"grader={grader_score:.3f} ({result['feedback']})")
|
|
|
|
| 181 |
else:
|
| 182 |
s["cumulative_reward"] = round(s["cumulative_reward"] + r, 4)
|
| 183 |
|
| 184 |
s["feedback"] = " | ".join(fb) if fb else "ok"
|
| 185 |
+
return (self._build_obs(),
|
| 186 |
+
Reward(score=round(r, 4), reason=s["feedback"],
|
| 187 |
+
cumulative=s["cumulative_reward"]),
|
| 188 |
+
s["done"],
|
| 189 |
+
{"step": step_num, "feedback": s["feedback"]})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
def state(self) -> EpisodeState:
|
| 192 |
with self._lock:
|
|
|
|
| 194 |
raise RuntimeError("No active episode — call reset() first.")
|
| 195 |
s = self._s
|
| 196 |
return EpisodeState(
|
| 197 |
+
episode_id=s["episode_id"], task_id=s["task_id"],
|
| 198 |
+
scenario_id=s["scenario_id"], step_count=s["step_count"],
|
|
|
|
|
|
|
| 199 |
max_steps=s["max_steps"],
|
| 200 |
action_history=list(s["action_history"]),
|
| 201 |
queried_data=dict(s["queried_data"]),
|
| 202 |
+
submitted=s["submitted"], resolved=s["resolved"],
|
| 203 |
+
done=s["done"], cumulative_reward=s["cumulative_reward"],
|
| 204 |
+
feedback=s["feedback"])
|
| 205 |
+
|
| 206 |
+
def _handle_diagnostic(self, at: str, params: "ActionParameters", r: float,
                       fb: list[str], rt: dict[str, float]) -> tuple[float, list[str]]:
    """Score a diagnostic query and record the tool response as evidence.

    Args:
        at: Diagnostic action type (e.g. ``query_logs``).
        params: Action parameters; only ``service`` is read.
        r: Reward accumulated so far for this step.
        fb: Feedback fragments for this step; appended to in place.
        rt: Reward-table row for the current difficulty.

    Returns:
        The updated ``(reward, feedback)`` pair.
    """
    s = self._s
    svc = (params.service or "").lower().strip()

    if not svc:
        r += rt["query_no_service"]
        fb.append(f"{at}: no service ({rt['query_no_service']:+.2f})")
        s["last_action_error"] = f"{at} requires a service parameter"
        return r, fb

    # Service names are compared case-insensitively.
    known = {name.lower() for name in self._scenario.get("known_services", set())}
    if svc not in known:
        r += rt["query_unknown_svc"]
        fb.append(f"unknown service '{svc}' ({rt['query_unknown_svc']:+.2f})")
        s["last_action_error"] = f"Unknown service: {svc}"
        return r, fb

    key = (at, svc)
    if key in s["queried_keys"]:
        r += rt["query_repeat"]
        fb.append(f"repeat [{at}][{svc}] ({rt['query_repeat']:+.2f})")
    elif svc in s["services_queried"]:
        r += rt["query_new_action"]
        fb.append(f"new action on {svc} ({rt['query_new_action']:+.2f})")
        s["queried_keys"].add(key)
    else:
        r += rt["query_new_svc"]
        fb.append(f"new service {svc} ({rt['query_new_svc']:+.2f})")
        s["queried_keys"].add(key)
        s["services_queried"].add(svc)

    responses = self._scenario.get("tool_responses", {}).get(at, {})
    if svc in responses:
        result = responses[svc]
    else:
        # The service was validated case-insensitively above, so the response
        # lookup must be too; otherwise a scenario that spells a service with
        # capitals would pass validation yet always answer "No data available".
        result = next(
            (text for name, text in responses.items() if name.lower() == svc),
            f"No data available for '{svc}'.",
        )
    s["queried_data"].setdefault(at, {})[svc] = result
    return r, fb
|
| 241 |
|
| 242 |
+
def _handle_remediation(self, at: str, params: "ActionParameters", r: float,
                        fb: list[str], rt: dict[str, float],
                        task_id: str) -> tuple[float, list[str]]:
    """Score a remediation action and record its outcome.

    Args:
        at: Remediation action type.
        params: Action parameters; ``service``, ``flag``, ``runbook_action``
            and ``target`` are read.
        r: Reward accumulated so far for this step.
        fb: Feedback fragments for this step; appended to in place.
        rt: Reward-table row for the current difficulty.
        task_id: Active task; remediation is rejected for
            ``alert_classification``.

    Returns:
        The updated ``(reward, feedback)`` pair.
    """
    s = self._s
    if task_id == "alert_classification":
        r += rt["rem_wrong"]
        fb.append(f"remediation in easy task ({rt['rem_wrong']:+.2f})")
        s["last_action_error"] = "Remediation not available in alert_classification"
        return r, fb

    svc = (params.service or "").lower().strip()
    flag = (params.flag or "").lower().strip()
    runbook = (params.runbook_action or "").lower().strip()
    target = (params.target or "").lower().strip()

    if not (svc or flag or runbook or target):
        r += rt["rem_no_target"]
        fb.append(f"{at}: no target ({rt['rem_no_target']:+.2f})")
        s["last_action_error"] = f"{at} requires a target"
        return r, fb

    # Candidate wrong_actions keys, most specific first. An ordered list
    # (not a set) keeps the reported reason deterministic when several match.
    keys: list[str] = []
    if svc:
        keys.append(f"{at}:{svc}")
    if flag:
        keys.append(f"{at}:{flag}")
    if runbook:
        keys.append(f"execute_runbook_step:{runbook}")
    if target:
        keys.append(f"execute_runbook_step:{target}")
    keys.append(at)

    wrong_map = self._scenario.get("wrong_actions", {})
    rem_data = self._scenario.get("remediation_data", {})

    matched = next((k for k in keys if k in wrong_map), None)
    if matched is None and svc:
        # Fall back to fuzzy service matching against "<action>:<service>" keys.
        for wk in wrong_map:
            w_at, sep, w_svc = wk.partition(":")
            if sep and w_at == at and _svc_match(svc, w_svc):
                matched = wk
                break

    if matched is not None:
        r += rt["rem_wrong"]
        # Look the reason up under the key that actually matched, so a fuzzy
        # match reports the scenario's explanation instead of a bare "wrong".
        reason = wrong_map[matched]
        fb.append(f"wrong: {at} — {str(reason)[:60]} ({rt['rem_wrong']:+.2f})")
    else:
        r += rt["rem_good"]
        tgt = svc or flag or runbook or target
        fb.append(f"executed {at}:{tgt} ({rt['rem_good']:+.2f})")
        at_data = rem_data.get(at, {})
        result = (at_data.get(svc) or at_data.get(flag) or at_data.get(runbook)
                  or at_data.get(target) or "action executed successfully")
        s["queried_data"].setdefault(at, {})[tgt] = result
    return r, fb
|
| 292 |
|
| 293 |
+
def _handle_submit(self, at, params, r, fb, rt, task_id):
    """Score a submit action and report whether it ends the episode.

    Returns ``(reward, feedback, terminal)``. ``terminal`` is False only
    when the submit action is not the one expected for ``task_id``; in that
    case the penalty is applied and the episode stays open.
    """
    state = self._s
    expected = _TASK_SUBMIT.get(task_id, "")

    # Guard: wrong submit type is penalised but not terminal.
    if at != expected:
        penalty = rt["submit_wrong"]
        r += penalty
        fb.append(f"wrong submit '{at}' (need '{expected}') ({penalty:+.2f})")
        state["last_action_error"] = f"Wrong submission type: use {expected}"
        return r, fb, False

    state["submitted"] = True
    bonus = rt["submit_correct"]
    r += bonus
    fb.append(f"submitted ({bonus:+.2f})")

    if at == "submit_resolution":
        text = params.summary or ""
        # A resolution only counts as resolved when a non-blank summary is
        # backed by at least one diagnostic or remediation action.
        investigative = _DIAGNOSTIC | _REMEDIATION
        investigated = any(
            entry.get("action_type") in investigative
            for entry in state["action_history"]
        )
        if text.strip() and investigated:
            state["resolved"] = True
            fb.append("resolved")
        else:
            fb.append("insufficient investigation")
    elif at == "submit_root_cause":
        service = params.service or ''
        mode = params.failure_mode or ''
        fb.append(f"svc={service}, mode={mode}")
    elif at == "submit_severity":
        level = (params.severity or '').upper().strip()
        fb.append(f"severity={level}")

    return r, fb, True
|
| 320 |
|
| 321 |
+
def _build_obs(self):
    """Snapshot the current episode as an ``Observation``."""
    episode, scenario, task = self._s, self._scenario, self._task_def

    # Prefer the dedicated summary; fall back to the scenario description.
    summary = scenario.get("incident_summary", scenario.get("description", ""))

    fields = {
        "episode_id": episode["episode_id"],
        "task_id": episode["task_id"],
        "scenario_id": episode["scenario_id"],
        "step_count": episode["step_count"],
        "max_steps": episode["max_steps"],
        "incident_summary": summary,
        "alert": scenario.get("alert", {}),
        "available_actions": task.get("available_actions", []),
        # shallow copy of the top-level mapping
        "queried_data": dict(episode["queried_data"]),
        "cumulative_reward": episode["cumulative_reward"],
        "done": episode["done"],
        "feedback": episode["feedback"],
        # sorted so the list order is stable across calls
        "known_services": sorted(scenario.get("known_services", set())),
        "last_action_error": episode.get("last_action_error"),
    }
    return Observation(**fields)
|
|
|
server/models.py
CHANGED
|
@@ -1,16 +1,21 @@
|
|
| 1 |
"""
|
| 2 |
-
server/models.py — Typed Pydantic models for the OpenEnv interface.
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
-
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class ActionParameters(BaseModel):
|
| 13 |
"""Flexible parameter bag — different action types use different fields."""
|
|
|
|
| 14 |
service: str | None = None
|
| 15 |
severity: str | None = None
|
| 16 |
failure_mode: str | None = None
|
|
@@ -26,7 +31,13 @@ class ActionParameters(BaseModel):
|
|
| 26 |
|
| 27 |
|
| 28 |
class Action(BaseModel):
|
| 29 |
-
"""An action submitted by the agent to the environment.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
action_type: str
|
| 31 |
parameters: ActionParameters = Field(default_factory=ActionParameters)
|
| 32 |
|
|
@@ -34,7 +45,27 @@ class Action(BaseModel):
|
|
| 34 |
|
| 35 |
|
| 36 |
class Observation(BaseModel):
|
| 37 |
-
"""Observation returned after reset() or step().
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
episode_id: str
|
| 39 |
task_id: str
|
| 40 |
scenario_id: str
|
|
@@ -47,20 +78,48 @@ class Observation(BaseModel):
|
|
| 47 |
cumulative_reward: float
|
| 48 |
done: bool
|
| 49 |
feedback: str
|
| 50 |
-
# Explicit list of all valid service names for this scenario.
|
| 51 |
-
# Agents must use these exact strings in action parameters.
|
| 52 |
known_services: list[str] = Field(default_factory=list)
|
|
|
|
| 53 |
|
| 54 |
|
| 55 |
class Reward(BaseModel):
|
| 56 |
-
"""Reward signal returned after each step().
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
reason: str
|
| 59 |
cumulative: float
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
class EpisodeState(BaseModel):
|
| 63 |
-
"""Full episode state returned by GET /state.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
episode_id: str
|
| 65 |
task_id: str
|
| 66 |
scenario_id: str
|
|
|
|
| 1 |
"""
|
| 2 |
+
server/models.py — Typed Pydantic v2 models for the OpenEnv interface.
|
| 3 |
|
| 4 |
+
Implements the full OpenEnv spec:
|
| 5 |
+
- Action: typed action with parameters
|
| 6 |
+
- Observation: full environment state visible to the agent
|
| 7 |
+
- Reward: score + reason + cumulative (with backward-compatible 'value' alias)
|
| 8 |
+
- EpisodeState: internal state for GET /state
|
| 9 |
"""
|
| 10 |
|
| 11 |
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
from pydantic import BaseModel, Field, computed_field
|
| 14 |
|
| 15 |
|
| 16 |
class ActionParameters(BaseModel):
|
| 17 |
"""Flexible parameter bag — different action types use different fields."""
|
| 18 |
+
|
| 19 |
service: str | None = None
|
| 20 |
severity: str | None = None
|
| 21 |
failure_mode: str | None = None
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
class Action(BaseModel):
|
| 34 |
+
"""An action submitted by the agent to the environment.
|
| 35 |
+
|
| 36 |
+
Attributes:
|
| 37 |
+
action_type: One of the valid action types (query_logs, check_metrics, etc.)
|
| 38 |
+
parameters: Action-specific parameters
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
action_type: str
|
| 42 |
parameters: ActionParameters = Field(default_factory=ActionParameters)
|
| 43 |
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
class Observation(BaseModel):
|
| 48 |
+
"""Observation returned after reset() or step().
|
| 49 |
+
|
| 50 |
+
Contains all information visible to the agent at this point in the episode.
|
| 51 |
+
|
| 52 |
+
Attributes:
|
| 53 |
+
episode_id: Unique episode UUID
|
| 54 |
+
task_id: Active task identifier
|
| 55 |
+
scenario_id: Current scenario identifier
|
| 56 |
+
step_count: Number of steps taken so far
|
| 57 |
+
max_steps: Maximum steps allowed
|
| 58 |
+
incident_summary: Human-readable incident description
|
| 59 |
+
alert: Alert payload with severity, symptoms, affected services
|
| 60 |
+
available_actions: List of valid action types for this task
|
| 61 |
+
queried_data: All tool responses gathered so far (evidence)
|
| 62 |
+
cumulative_reward: Running reward total
|
| 63 |
+
done: Whether the episode has ended
|
| 64 |
+
feedback: Per-step feedback string
|
| 65 |
+
known_services: Exact service names valid for actions
|
| 66 |
+
last_action_error: Error message if last action was invalid (None if OK)
|
| 67 |
+
"""
|
| 68 |
+
|
| 69 |
episode_id: str
|
| 70 |
task_id: str
|
| 71 |
scenario_id: str
|
|
|
|
| 78 |
cumulative_reward: float
|
| 79 |
done: bool
|
| 80 |
feedback: str
|
|
|
|
|
|
|
| 81 |
known_services: list[str] = Field(default_factory=list)
|
| 82 |
+
last_action_error: str | None = None
|
| 83 |
|
| 84 |
|
| 85 |
class Reward(BaseModel):
    """Reward signal returned after each step().

    ``score`` is the primary field and holds this step's reward.
    ``value`` is a read-only computed alias of ``score``, kept for backward
    compatibility with OpenEnv validators. It is declared with
    ``@computed_field``, so it is presumably included when the model is
    serialised (verify against the pydantic version in use).

    Attributes:
        score: The reward value for this step
        reason: Human-readable explanation of the reward
        cumulative: Running total of all rewards in the episode
    """

    score: float
    reason: str
    cumulative: float

    @computed_field
    @property
    def value(self) -> float:
        """Backward-compatible alias for *score*."""
        return self.score
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class StepResult(BaseModel):
    """Result returned by POST /step — matches OpenEnv spec.

    Attributes:
        observation: Environment observation after the action was applied
        reward: Reward for this step
        done: Whether the episode has ended
        info: Free-form extra data; defaults to an empty dict
    """

    observation: Observation
    reward: Reward
    done: bool
    info: dict = Field(default_factory=dict)
|
| 115 |
+
|
| 116 |
|
| 117 |
class EpisodeState(BaseModel):
|
| 118 |
+
"""Full episode state returned by GET /state.
|
| 119 |
+
|
| 120 |
+
Contains internal bookkeeping not shown to agents directly.
|
| 121 |
+
"""
|
| 122 |
+
|
| 123 |
episode_id: str
|
| 124 |
task_id: str
|
| 125 |
scenario_id: str
|
tasks.py
CHANGED
|
@@ -1,15 +1,17 @@
|
|
| 1 |
"""
|
| 2 |
tasks.py — Task and scenario definitions for Cloud Incident Response OpenEnv.
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
- BGP network partitions isolating availability zones
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
Public API:
|
| 15 |
get_task(task_id) -> task metadata dict
|
|
@@ -29,9 +31,10 @@ ALL_TASKS: dict = {
|
|
| 29 |
"score_range": [0.0, 1.0],
|
| 30 |
"description": (
|
| 31 |
"An alert has fired. Query logs and metrics across affected services, "
|
| 32 |
-
"then classify the incident severity: P1 (CRITICAL —
|
| 33 |
-
"
|
| 34 |
-
"
|
|
|
|
| 35 |
),
|
| 36 |
"available_actions": [
|
| 37 |
"query_logs",
|
|
@@ -41,7 +44,7 @@ ALL_TASKS: dict = {
|
|
| 41 |
"submit_severity",
|
| 42 |
],
|
| 43 |
"submission_action": "submit_severity",
|
| 44 |
-
"scenarios":
|
| 45 |
},
|
| 46 |
"root_cause_analysis": {
|
| 47 |
"id": "root_cause_analysis",
|
|
@@ -50,10 +53,11 @@ ALL_TASKS: dict = {
|
|
| 50 |
"max_steps": 10,
|
| 51 |
"score_range": [0.0, 1.0],
|
| 52 |
"description": (
|
| 53 |
-
"A production incident is active
|
| 54 |
-
"
|
| 55 |
-
"
|
| 56 |
-
"
|
|
|
|
| 57 |
),
|
| 58 |
"available_actions": [
|
| 59 |
"query_logs",
|
|
@@ -64,7 +68,7 @@ ALL_TASKS: dict = {
|
|
| 64 |
"submit_root_cause",
|
| 65 |
],
|
| 66 |
"submission_action": "submit_root_cause",
|
| 67 |
-
"scenarios":
|
| 68 |
},
|
| 69 |
"remediation_planning": {
|
| 70 |
"id": "remediation_planning",
|
|
@@ -74,10 +78,10 @@ ALL_TASKS: dict = {
|
|
| 74 |
"score_range": [0.0, 1.0],
|
| 75 |
"description": (
|
| 76 |
"A critical production incident requires full end-to-end resolution. "
|
| 77 |
-
"Diagnose the root cause, execute the correct
|
| 78 |
-
"
|
| 79 |
-
"then submit a resolution summary. Scored on
|
| 80 |
-
"remediation correctness, efficiency, and documentation."
|
| 81 |
),
|
| 82 |
"available_actions": [
|
| 83 |
"query_logs",
|
|
@@ -94,520 +98,831 @@ ALL_TASKS: dict = {
|
|
| 94 |
"submit_resolution",
|
| 95 |
],
|
| 96 |
"submission_action": "submit_resolution",
|
| 97 |
-
"scenarios":
|
| 98 |
},
|
| 99 |
}
|
| 100 |
|
| 101 |
# ---------------------------------------------------------------------------
|
| 102 |
-
# Scenario data — 3 tasks ×
|
| 103 |
# ---------------------------------------------------------------------------
|
| 104 |
|
| 105 |
SCENARIOS: dict = {
|
| 106 |
|
| 107 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
"alert_classification": [
|
| 110 |
-
|
| 111 |
-
# AC-001: Cascading DB connection pool exhaustion → P1
|
| 112 |
{
|
| 113 |
"scenario_id": "AC-001",
|
| 114 |
"description": (
|
| 115 |
-
"Cascading failure
|
| 116 |
-
"
|
| 117 |
-
"Revenue impact is severe and growing."
|
| 118 |
),
|
| 119 |
"incident_summary": (
|
| 120 |
-
"
|
| 121 |
-
"
|
| 122 |
-
"
|
| 123 |
),
|
| 124 |
"alert": {
|
| 125 |
-
"id":
|
| 126 |
-
"title":
|
| 127 |
-
"severity_fired":
|
| 128 |
"affected_services": ["api-gateway", "auth-service", "postgres-db"],
|
| 129 |
"symptoms": [
|
| 130 |
"api-gateway: HTTP 503 rate 78% (baseline: 0.1%)",
|
| 131 |
"auth-service: connection timeout 94% of requests",
|
| 132 |
-
"postgres-db: connection pool 500/500
|
| 133 |
-
"checkout flow:
|
| 134 |
-
"
|
| 135 |
],
|
| 136 |
-
"error_rate":
|
| 137 |
-
"duration_minutes":
|
| 138 |
-
"revenue_impact_per_min":
|
| 139 |
},
|
| 140 |
"known_services": {"api-gateway", "auth-service", "postgres-db"},
|
| 141 |
"tool_responses": {
|
| 142 |
"query_logs": {
|
| 143 |
"api-gateway": (
|
| 144 |
-
"2024-03-15T10:04:12Z ERROR upstream
|
| 145 |
-
"
|
| 146 |
-
"2024-03-15T10:04:
|
| 147 |
-
"2024-03-15T10:04:14Z ERROR circuit breaker OPEN for auth-service"
|
| 148 |
),
|
| 149 |
"auth-service": (
|
| 150 |
-
"2024-03-15T10:04:10Z ERROR
|
| 151 |
-
"2024-03-15T10:04:11Z ERROR
|
| 152 |
-
"connect: connection refused — pool exhausted (500/500)\n"
|
| 153 |
-
"2024-03-15T10:04:12Z ERROR all connection pool slots occupied"
|
| 154 |
),
|
| 155 |
"postgres-db": (
|
| 156 |
-
"2024-03-15T10:
|
| 157 |
-
"2024-03-15T10:04:
|
| 158 |
-
"for non-replication superuser\n"
|
| 159 |
-
"2024-03-15T10:04:01Z LOG max_connections=500 active=500 idle=0"
|
| 160 |
),
|
| 161 |
},
|
| 162 |
"check_metrics": {
|
| 163 |
-
"api-gateway":
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
),
|
| 167 |
-
"auth-service": (
|
| 168 |
-
"Error rate: 94% | DB connection wait: 28s | "
|
| 169 |
-
"Active connections: 0 | Request queue: 847"
|
| 170 |
-
),
|
| 171 |
-
"postgres-db": (
|
| 172 |
-
"Connections: 500/500 (100%) | Query queue: 847 | "
|
| 173 |
-
"CPU: 98% | Memory: 89% | Active queries: 500"
|
| 174 |
-
),
|
| 175 |
},
|
| 176 |
"check_dependencies": {
|
| 177 |
-
"api-gateway": "Depends on: auth-service [CRITICAL]
|
| 178 |
-
"auth-service": "Depends on: postgres-db [CRITICAL]
|
| 179 |
-
"postgres-db": "No upstream dependencies
|
| 180 |
},
|
| 181 |
"check_recent_deploys": {
|
| 182 |
-
"api-gateway":
|
| 183 |
-
"auth-service":
|
| 184 |
-
|
| 185 |
-
"increased default connection pool size from 10 to 500"
|
| 186 |
-
),
|
| 187 |
-
"postgres-db": "Last deploy: 12 days ago — no recent changes",
|
| 188 |
},
|
| 189 |
},
|
| 190 |
-
"correct_severity":
|
| 191 |
"adjacent_severities": ["P2"],
|
| 192 |
},
|
| 193 |
|
| 194 |
-
# AC-002:
|
| 195 |
{
|
| 196 |
"scenario_id": "AC-002",
|
| 197 |
"description": (
|
| 198 |
-
"
|
| 199 |
-
"
|
| 200 |
-
"
|
| 201 |
),
|
| 202 |
"incident_summary": (
|
| 203 |
-
"
|
| 204 |
-
"
|
| 205 |
-
"
|
| 206 |
),
|
| 207 |
"alert": {
|
| 208 |
-
"id":
|
| 209 |
-
"title":
|
| 210 |
-
"severity_fired":
|
| 211 |
"affected_services": ["cdn-edge", "product-service", "image-service"],
|
| 212 |
"symptoms": [
|
| 213 |
"CDN cache hit rate: 3% (normal: 94%)",
|
| 214 |
-
"product-service: origin
|
| 215 |
"image-service: CPU 95%, p99 latency 18s",
|
| 216 |
-
"
|
| 217 |
-
"Checkout: still functional
|
| 218 |
],
|
| 219 |
-
"error_rate":
|
| 220 |
-
"duration_minutes":
|
| 221 |
-
"revenue_impact_per_min":
|
| 222 |
},
|
| 223 |
"known_services": {"cdn-edge", "product-service", "image-service"},
|
| 224 |
"tool_responses": {
|
| 225 |
"query_logs": {
|
| 226 |
"cdn-edge": (
|
| 227 |
-
"2024-03-15T10:22:00Z INFO cache MISS ratio: 97%
|
| 228 |
-
"2024-03-15T10:20:11Z WARN mass cache invalidation — "
|
| 229 |
-
"2,100,000 keys purged by purge-job-prod\n"
|
| 230 |
"2024-03-15T10:20:10Z INFO purge pattern: /* (ALL keys)"
|
| 231 |
),
|
| 232 |
"product-service": (
|
| 233 |
"2024-03-15T10:22:05Z WARN request queue depth: 12,400\n"
|
| 234 |
-
"2024-03-15T10:22:06Z ERROR timeout
|
| 235 |
-
"2024-03-15T10:22:07Z WARN worker pool 95%
|
| 236 |
),
|
| 237 |
"image-service": (
|
| 238 |
-
"2024-03-15T10:22:00Z WARN CPU throttling
|
| 239 |
-
"2024-03-15T10:22:01Z ERROR worker pool exhausted
|
| 240 |
-
"2024-03-15T10:22:02Z
|
| 241 |
),
|
| 242 |
},
|
| 243 |
"check_metrics": {
|
| 244 |
-
"cdn-edge":
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
),
|
| 248 |
"product-service": (
|
| 249 |
-
"
|
| 250 |
-
"
|
| 251 |
),
|
| 252 |
-
"
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
| 255 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
},
|
| 257 |
"check_dependencies": {
|
| 258 |
-
"
|
| 259 |
-
"product-service": "Depends on:
|
| 260 |
-
"
|
| 261 |
},
|
| 262 |
"check_recent_deploys": {
|
| 263 |
-
"
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
),
|
| 267 |
-
"product-service": "Last deploy: 5 days ago — no recent changes",
|
| 268 |
-
"image-service": "Last deploy: 2 days ago — no recent changes",
|
| 269 |
},
|
| 270 |
},
|
| 271 |
-
"correct_severity":
|
| 272 |
-
"adjacent_severities": ["
|
| 273 |
},
|
| 274 |
],
|
| 275 |
|
| 276 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
"root_cause_analysis": [
|
| 279 |
|
| 280 |
-
# RCA-001:
|
|
|
|
|
|
|
| 281 |
{
|
| 282 |
"scenario_id": "RCA-001",
|
| 283 |
"description": (
|
| 284 |
-
"
|
| 285 |
-
"
|
| 286 |
-
"All downstream services are now failing. analytics-service is the culprit."
|
| 287 |
),
|
| 288 |
"incident_summary": (
|
| 289 |
-
"Multiple services
|
| 290 |
-
"order-service
|
| 291 |
-
"
|
| 292 |
),
|
| 293 |
"alert": {
|
| 294 |
-
"id":
|
| 295 |
-
"title":
|
| 296 |
-
"severity_fired":
|
| 297 |
"affected_services": [
|
| 298 |
"api-gateway", "auth-service", "order-service", "postgres-db",
|
| 299 |
],
|
| 300 |
"symptoms": [
|
| 301 |
-
"postgres-db: 4 restarts in 12 minutes",
|
| 302 |
-
"auth-service:
|
| 303 |
"order-service: all writes failing",
|
| 304 |
-
"api-gateway: 503 on
|
| 305 |
-
"analytics-service: last job failed 12 min ago",
|
| 306 |
],
|
| 307 |
-
"error_rate":
|
| 308 |
"duration_minutes": 14,
|
| 309 |
},
|
| 310 |
"known_services": {
|
| 311 |
"api-gateway", "auth-service", "order-service",
|
| 312 |
"postgres-db", "analytics-service", "redis-session",
|
|
|
|
| 313 |
},
|
| 314 |
"tool_responses": {
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
"check_metrics": {
|
| 342 |
"postgres-db": (
|
| 343 |
-
"Memory:
|
| 344 |
-
"Restarts: 4 in 12min | Status: RESTARTING"
|
|
|
|
| 345 |
),
|
| 346 |
"analytics-service": (
|
| 347 |
-
"
|
| 348 |
-
"
|
| 349 |
),
|
| 350 |
-
"auth-service":
|
| 351 |
-
"api-gateway":
|
| 352 |
"order-service": "Write success: 0% | DB: RESTARTING",
|
| 353 |
-
"redis-session": "Hit rate: 99.2% | Memory: 42% |
|
|
|
|
|
|
|
| 354 |
},
|
| 355 |
"check_dependencies": {
|
| 356 |
"postgres-db": (
|
| 357 |
-
"Clients: auth-service, order-service, analytics-service,
|
|
|
|
| 358 |
),
|
| 359 |
"analytics-service": "Depends on: postgres-db [CRASH LOOP]",
|
| 360 |
-
"auth-service":
|
| 361 |
-
"api-gateway":
|
| 362 |
-
"order-service":
|
| 363 |
-
"redis-session":
|
|
|
|
|
|
|
| 364 |
},
|
| 365 |
"check_recent_deploys": {
|
| 366 |
"analytics-service": (
|
| 367 |
-
"Deploy 6h ago: added
|
| 368 |
-
"runs daily at 02:00 UTC
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
),
|
| 370 |
-
"postgres-db": "No deploys in 3 weeks",
|
| 371 |
-
"auth-service": "No recent deploys",
|
| 372 |
"order-service": "No recent deploys",
|
| 373 |
"redis-session": "No recent deploys",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
},
|
| 375 |
"check_service_status": {
|
| 376 |
-
"postgres-db":
|
| 377 |
-
"analytics-service": "ERROR | Last job:
|
| 378 |
-
"auth-service":
|
| 379 |
-
"api-gateway":
|
| 380 |
-
"order-service":
|
| 381 |
-
"redis-session":
|
|
|
|
|
|
|
| 382 |
},
|
| 383 |
},
|
| 384 |
"correct_root_cause": {
|
| 385 |
-
"service":
|
| 386 |
"failure_mode": "unbounded query OOM killing postgres-db",
|
| 387 |
},
|
| 388 |
"wrong_actions": {
|
| 389 |
-
"restart_service:auth-service":
|
| 390 |
-
"restart_service:api-gateway":
|
| 391 |
-
"
|
| 392 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
},
|
| 394 |
},
|
| 395 |
|
| 396 |
-
# RCA-002: BGP
|
|
|
|
|
|
|
| 397 |
{
|
| 398 |
"scenario_id": "RCA-002",
|
| 399 |
"description": (
|
| 400 |
-
"
|
| 401 |
-
"
|
| 402 |
-
"Services within AZ-1 are healthy — it is a pure network issue."
|
| 403 |
),
|
| 404 |
"incident_summary": (
|
| 405 |
-
"Checkout failure rate
|
| 406 |
-
"
|
| 407 |
-
"
|
|
|
|
| 408 |
),
|
| 409 |
"alert": {
|
| 410 |
-
"id":
|
| 411 |
-
"title":
|
| 412 |
-
"severity_fired":
|
| 413 |
"affected_services": [
|
| 414 |
"order-service", "payment-service", "fraud-detection-service",
|
| 415 |
],
|
| 416 |
"symptoms": [
|
| 417 |
-
"checkout failure rate: 61%
|
| 418 |
-
"payment-service:
|
| 419 |
-
"fraud-detection-service:
|
| 420 |
-
"
|
| 421 |
-
"Network: AZ-2/AZ-3 → AZ-1 routing broken",
|
| 422 |
],
|
| 423 |
-
"error_rate":
|
| 424 |
"duration_minutes": 9,
|
| 425 |
},
|
| 426 |
"known_services": {
|
| 427 |
"order-service", "payment-service", "fraud-detection-service",
|
| 428 |
"postgres-db", "redis-payment-cache", "network-infra",
|
|
|
|
| 429 |
},
|
| 430 |
"tool_responses": {
|
| 431 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
"order-service": (
|
| 433 |
-
"
|
| 434 |
-
"
|
| 435 |
-
"2024-03-17T14:32:11Z ERROR fraud-detection-service: i/o timeout (30s)"
|
| 436 |
),
|
| 437 |
"payment-service": (
|
| 438 |
-
"
|
| 439 |
-
"
|
|
|
|
| 440 |
),
|
| 441 |
"fraud-detection-service": (
|
| 442 |
-
"
|
| 443 |
-
"2024-03-17T14:32:01Z WARN cross-AZ health probes: 100% timeout"
|
| 444 |
),
|
| 445 |
"network-infra": (
|
| 446 |
-
"
|
| 447 |
-
"AZ-2
|
| 448 |
-
"
|
| 449 |
-
"AZ-3 lost route to AZ-1 CIDR 10.0.1.0/24\n"
|
| 450 |
-
"2024-03-17T14:31:44Z INFO router config change applied — "
|
| 451 |
-
"BGP advertisement policy updated"
|
| 452 |
),
|
| 453 |
-
"postgres-db":
|
| 454 |
-
"redis-payment-cache": "
|
|
|
|
|
|
|
| 455 |
},
|
| 456 |
-
"
|
| 457 |
"order-service": (
|
| 458 |
-
"
|
| 459 |
-
"
|
| 460 |
),
|
| 461 |
-
"payment-service":
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
"fraud-detection-service": (
|
| 466 |
-
"AZ-1 processing: normal | "
|
| 467 |
-
"Cross-AZ health checks: 100% timeout"
|
| 468 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
"network-infra": (
|
| 470 |
-
"
|
| 471 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
),
|
| 473 |
-
|
| 474 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
},
|
| 476 |
"check_dependencies": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
"order-service": (
|
| 478 |
-
"Depends on:
|
| 479 |
-
"
|
| 480 |
),
|
| 481 |
-
"
|
| 482 |
-
"fraud-detection-service": "Depends on: postgres-db [OK]",
|
| 483 |
-
"network-infra": "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN], AZ-1 [UP]",
|
| 484 |
},
|
| 485 |
"check_recent_deploys": {
|
| 486 |
-
"
|
| 487 |
-
"
|
| 488 |
-
"
|
|
|
|
| 489 |
),
|
| 490 |
-
"
|
| 491 |
-
|
| 492 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
},
|
| 494 |
"check_service_status": {
|
| 495 |
-
"
|
| 496 |
-
"
|
| 497 |
-
"
|
| 498 |
-
"
|
| 499 |
-
"
|
| 500 |
-
"redis-
|
|
|
|
|
|
|
| 501 |
},
|
| 502 |
},
|
| 503 |
"correct_root_cause": {
|
| 504 |
-
"service":
|
| 505 |
-
"failure_mode": "
|
| 506 |
},
|
| 507 |
"wrong_actions": {
|
| 508 |
-
"restart_service:
|
| 509 |
-
"restart_service:
|
| 510 |
-
"
|
| 511 |
-
"
|
|
|
|
|
|
|
|
|
|
| 512 |
},
|
| 513 |
},
|
| 514 |
],
|
| 515 |
|
| 516 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
|
| 518 |
"remediation_planning": [
|
| 519 |
|
| 520 |
-
# RP-001:
|
| 521 |
{
|
| 522 |
"scenario_id": "RP-001",
|
| 523 |
"description": (
|
| 524 |
-
"Full remediation
|
| 525 |
-
"
|
| 526 |
-
"
|
| 527 |
),
|
| 528 |
"incident_summary": (
|
| 529 |
-
"CRITICAL — postgres-db
|
| 530 |
-
"api-gateway all down.
|
| 531 |
-
"
|
| 532 |
),
|
| 533 |
"alert": {
|
| 534 |
-
"id":
|
| 535 |
-
"title":
|
| 536 |
-
"severity_fired":
|
| 537 |
"affected_services": [
|
| 538 |
-
"postgres-db", "
|
| 539 |
-
"auth-service", "order-service", "api-gateway",
|
| 540 |
],
|
| 541 |
},
|
| 542 |
"known_services": {
|
| 543 |
"postgres-db", "auth-service", "order-service",
|
| 544 |
-
"api-gateway", "analytics-service",
|
|
|
|
| 545 |
},
|
| 546 |
"tool_responses": {
|
| 547 |
"query_logs": {
|
| 548 |
"postgres-db": (
|
| 549 |
-
"FATAL:
|
| 550 |
-
"
|
| 551 |
),
|
| 552 |
"analytics-service": (
|
| 553 |
-
"
|
|
|
|
|
|
|
| 554 |
),
|
| 555 |
-
"auth-service":
|
| 556 |
-
"order-service": "ERROR: pq:
|
| 557 |
-
"api-gateway":
|
|
|
|
|
|
|
|
|
|
| 558 |
},
|
| 559 |
"check_metrics": {
|
| 560 |
-
"postgres-db":
|
| 561 |
-
"analytics-service": "Memory
|
| 562 |
-
"auth-service":
|
| 563 |
-
"order-service":
|
| 564 |
-
"api-gateway":
|
|
|
|
|
|
|
|
|
|
| 565 |
},
|
| 566 |
"check_dependencies": {
|
| 567 |
-
"postgres-db":
|
|
|
|
|
|
|
|
|
|
| 568 |
"analytics-service": "Depends on: postgres-db [CRASH LOOP]",
|
| 569 |
-
"auth-service":
|
| 570 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 571 |
},
|
| 572 |
"check_recent_deploys": {
|
| 573 |
"analytics-service": (
|
| 574 |
-
"Deploy 6h ago:
|
| 575 |
-
"
|
| 576 |
),
|
| 577 |
-
"postgres-db": "No
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
},
|
| 579 |
"check_service_status": {
|
| 580 |
-
"postgres-db":
|
| 581 |
-
"analytics-service": "ERROR |
|
| 582 |
-
"auth-service":
|
| 583 |
-
"order-service":
|
| 584 |
-
"api-gateway":
|
|
|
|
|
|
|
|
|
|
| 585 |
},
|
| 586 |
},
|
| 587 |
"remediation_data": {
|
| 588 |
"disable_feature_flag": {
|
| 589 |
"full_history_export": (
|
| 590 |
"Cron job full_history_export DISABLED — "
|
| 591 |
-
"
|
| 592 |
),
|
| 593 |
},
|
| 594 |
"restart_service": {
|
| 595 |
-
"postgres-db": (
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
"
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
"auth-service": "auth-service restarted — reconnected to postgres-db OK",
|
| 603 |
-
"order-service": "order-service restarted — writes resuming normally",
|
| 604 |
},
|
| 605 |
"execute_runbook_step": {
|
| 606 |
-
"verify_db_health":
|
| 607 |
-
"postgres-db: connections 12/500, CPU 12%, Memory 34% — healthy"
|
| 608 |
-
),
|
| 609 |
"check_service_recovery": (
|
| 610 |
-
"auth
|
| 611 |
),
|
| 612 |
},
|
| 613 |
},
|
|
@@ -617,122 +932,294 @@ SCENARIOS: dict = {
|
|
| 617 |
"restart_service:postgres-db",
|
| 618 |
"restart_service:auth-service",
|
| 619 |
"restart_service:order-service",
|
|
|
|
| 620 |
],
|
| 621 |
"wrong_actions": {
|
| 622 |
-
"rollback_deploy:postgres-db":
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
"
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
"
|
| 629 |
-
|
| 630 |
-
),
|
| 631 |
},
|
| 632 |
"resolution_keywords": [
|
| 633 |
"analytics", "oom", "memory", "postgres", "query",
|
| 634 |
-
"full_history_export", "disabled", "restarted",
|
|
|
|
| 635 |
],
|
| 636 |
},
|
| 637 |
|
| 638 |
-
# RP-002:
|
| 639 |
{
|
| 640 |
"scenario_id": "RP-002",
|
| 641 |
"description": (
|
| 642 |
-
"Full remediation
|
| 643 |
-
"
|
| 644 |
-
"the router config change, verify checkout recovery, and document."
|
| 645 |
),
|
| 646 |
"incident_summary": (
|
| 647 |
-
"
|
| 648 |
-
"
|
| 649 |
-
"
|
|
|
|
| 650 |
),
|
| 651 |
"alert": {
|
| 652 |
-
"id":
|
| 653 |
-
"title":
|
| 654 |
-
"severity_fired":
|
| 655 |
-
"affected_services": ["
|
| 656 |
},
|
| 657 |
"known_services": {
|
| 658 |
"network-infra", "order-service", "payment-service",
|
| 659 |
"fraud-detection-service", "postgres-db",
|
|
|
|
| 660 |
},
|
| 661 |
"tool_responses": {
|
| 662 |
"query_logs": {
|
| 663 |
"network-infra": (
|
| 664 |
-
"CRITICAL: BGP route withdrawal — "
|
| 665 |
-
"
|
| 666 |
-
"
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
"ERROR: connection timeout payment-service — no route to host"
|
| 670 |
-
),
|
| 671 |
-
"payment-service": (
|
| 672 |
-
"INFO: AZ-1 traffic normal | "
|
| 673 |
-
"WARN: cross-AZ health checks failing"
|
| 674 |
-
),
|
| 675 |
-
"fraud-detection-service": (
|
| 676 |
-
"WARN: cross-AZ health probes 100% timeout | AZ-1 traffic: normal"
|
| 677 |
),
|
|
|
|
|
|
|
|
|
|
| 678 |
"postgres-db": "Operating normally",
|
|
|
|
|
|
|
|
|
|
| 679 |
},
|
| 680 |
"check_metrics": {
|
| 681 |
-
"network-infra":
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
},
|
| 687 |
"check_dependencies": {
|
| 688 |
-
"order-service":
|
| 689 |
-
"payment-service": "Depends on: postgres-db [OK]",
|
| 690 |
-
"network-infra":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 691 |
},
|
| 692 |
"check_recent_deploys": {
|
| 693 |
"network-infra": (
|
| 694 |
-
"Config change 18min ago — BGP
|
| 695 |
-
"accidentally
|
| 696 |
),
|
| 697 |
"payment-service": "No recent deploys",
|
| 698 |
-
"order-service":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
},
|
| 700 |
"check_service_status": {
|
| 701 |
-
"network-infra":
|
| 702 |
-
"payment-service": "HEALTHY (
|
| 703 |
-
"order-service":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 704 |
},
|
| 705 |
},
|
| 706 |
"remediation_data": {
|
| 707 |
"rollback_deploy": {
|
| 708 |
-
"network-infra":
|
| 709 |
-
"Router config rolled back — "
|
| 710 |
-
"BGP advertisement policy restored to previous version"
|
| 711 |
-
),
|
| 712 |
},
|
| 713 |
"execute_runbook_step": {
|
| 714 |
-
"restore_bgp_routes":
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
"verify_checkout_recovery": (
|
| 718 |
-
"Checkout failure rate: 0.3% — incident fully resolved"
|
| 719 |
-
),
|
| 720 |
},
|
| 721 |
},
|
| 722 |
"correct_remediation_sequence": [
|
| 723 |
"execute_runbook_step:restore_bgp_routes",
|
| 724 |
"rollback_deploy:network-infra",
|
|
|
|
| 725 |
"execute_runbook_step:verify_checkout_recovery",
|
| 726 |
],
|
| 727 |
"wrong_actions": {
|
| 728 |
-
"restart_service:payment-service":
|
| 729 |
-
"scale_service:payment-service":
|
| 730 |
-
"restart_service:order-service":
|
| 731 |
-
"clear_cache
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 732 |
},
|
| 733 |
"resolution_keywords": [
|
| 734 |
"bgp", "network", "route", "rollback", "partition",
|
| 735 |
-
"restored", "az-1", "az-2", "az-3", "checkout",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 736 |
],
|
| 737 |
},
|
| 738 |
],
|
|
@@ -746,8 +1233,7 @@ SCENARIOS: dict = {
|
|
| 746 |
def get_task(task_id: str) -> dict:
|
| 747 |
if task_id not in ALL_TASKS:
|
| 748 |
raise ValueError(
|
| 749 |
-
f"Unknown task_id '{task_id}'. "
|
| 750 |
-
f"Valid: {list(ALL_TASKS.keys())}"
|
| 751 |
)
|
| 752 |
return ALL_TASKS[task_id]
|
| 753 |
|
|
|
|
| 1 |
"""
|
| 2 |
tasks.py — Task and scenario definitions for Cloud Incident Response OpenEnv.
|
| 3 |
|
| 4 |
+
Difficulty calibration targets:
|
| 5 |
+
EASY → 8B: 0.75-1.0, 70B: 0.85-1.0
|
| 6 |
+
MEDIUM → 8B: 0.30-0.50, 70B: 0.45-0.65
|
| 7 |
+
HARD → 8B: 0.15-0.35, 70B: 0.30-0.50
|
|
|
|
| 8 |
|
| 9 |
+
Design principles for genuine difficulty:
|
| 10 |
+
EASY: Alert metrics are clear. Only trick is P2-vs-P3 ambiguity.
|
| 11 |
+
MEDIUM: Root cause buried. 8-10 known services. Multiple red herrings.
|
| 12 |
+
incident_summary does NOT hint at root cause. Must investigate 4+ services.
|
| 13 |
+
HARD: Same diagnosis challenge + 5-7 step remediation sequence +
|
| 14 |
+
10+ known services (many wrong choices) + quality summary required.
|
| 15 |
|
| 16 |
Public API:
|
| 17 |
get_task(task_id) -> task metadata dict
|
|
|
|
| 31 |
"score_range": [0.0, 1.0],
|
| 32 |
"description": (
|
| 33 |
"An alert has fired. Query logs and metrics across affected services, "
|
| 34 |
+
"then classify the incident severity: P1 (CRITICAL — complete outage or "
|
| 35 |
+
"revenue >$1,000/min), P2 (HIGH — major degradation affecting most users), "
|
| 36 |
+
"P3 (MEDIUM — partial/minor issue with graceful fallback), "
|
| 37 |
+
"P4 (LOW — informational). Submit with submit_severity."
|
| 38 |
),
|
| 39 |
"available_actions": [
|
| 40 |
"query_logs",
|
|
|
|
| 44 |
"submit_severity",
|
| 45 |
],
|
| 46 |
"submission_action": "submit_severity",
|
| 47 |
+
"scenarios": 3,
|
| 48 |
},
|
| 49 |
"root_cause_analysis": {
|
| 50 |
"id": "root_cause_analysis",
|
|
|
|
| 53 |
"max_steps": 10,
|
| 54 |
"score_range": [0.0, 1.0],
|
| 55 |
"description": (
|
| 56 |
+
"A production incident is active with multiple services showing errors. "
|
| 57 |
+
"Use diagnostic tools to trace the failure chain. The root cause may be "
|
| 58 |
+
"any service in the system — not necessarily one showing errors. "
|
| 59 |
+
"Query logs, metrics, dependencies, and recent deploys across ALL "
|
| 60 |
+
"available services to find the true trigger. Submit with submit_root_cause."
|
| 61 |
),
|
| 62 |
"available_actions": [
|
| 63 |
"query_logs",
|
|
|
|
| 68 |
"submit_root_cause",
|
| 69 |
],
|
| 70 |
"submission_action": "submit_root_cause",
|
| 71 |
+
"scenarios": 3,
|
| 72 |
},
|
| 73 |
"remediation_planning": {
|
| 74 |
"id": "remediation_planning",
|
|
|
|
| 78 |
"score_range": [0.0, 1.0],
|
| 79 |
"description": (
|
| 80 |
"A critical production incident requires full end-to-end resolution. "
|
| 81 |
+
"Diagnose the root cause among many services, execute the correct "
|
| 82 |
+
"remediation sequence (order matters — wrong actions are penalized), "
|
| 83 |
+
"then submit a detailed resolution summary. Scored on diagnosis quality, "
|
| 84 |
+
"remediation correctness, action efficiency, and documentation."
|
| 85 |
),
|
| 86 |
"available_actions": [
|
| 87 |
"query_logs",
|
|
|
|
| 98 |
"submit_resolution",
|
| 99 |
],
|
| 100 |
"submission_action": "submit_resolution",
|
| 101 |
+
"scenarios": 3,
|
| 102 |
},
|
| 103 |
}
|
| 104 |
|
| 105 |
# ---------------------------------------------------------------------------
|
| 106 |
+
# Scenario data — 3 tasks × 3 scenarios = 9 total episodes
|
| 107 |
# ---------------------------------------------------------------------------
|
| 108 |
|
| 109 |
SCENARIOS: dict = {
|
| 110 |
|
| 111 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 112 |
+
# TASK 1: ALERT CLASSIFICATION (EASY)
|
| 113 |
+
# Target: 8B→0.75-1.0, 70B→0.85-1.0
|
| 114 |
+
#
|
| 115 |
+
# AC-001: Clear P1 (78% errors, $12k/min) — should be trivial
|
| 116 |
+
# AC-002: Ambiguous P2 (degraded but working, $800/min)
|
| 117 |
+
# AC-003: Trap P3 (45% errors but zero revenue impact, graceful fallback)
|
| 118 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 119 |
|
| 120 |
"alert_classification": [
|
| 121 |
+
# AC-001: Clear P1
|
|
|
|
| 122 |
{
|
| 123 |
"scenario_id": "AC-001",
|
| 124 |
"description": (
|
| 125 |
+
"Cascading failure across multiple services. "
|
| 126 |
+
"Assess severity based on user and revenue impact."
|
|
|
|
| 127 |
),
|
| 128 |
"incident_summary": (
|
| 129 |
+
"Alert fired: api-gateway reporting elevated error rates. "
|
| 130 |
+
"Multiple downstream services affected. "
|
| 131 |
+
"Assess the severity of this incident."
|
| 132 |
),
|
| 133 |
"alert": {
|
| 134 |
+
"id": "ALT-20240315-001",
|
| 135 |
+
"title": "api-gateway error rate elevated",
|
| 136 |
+
"severity_fired": "UNCLASSIFIED",
|
| 137 |
"affected_services": ["api-gateway", "auth-service", "postgres-db"],
|
| 138 |
"symptoms": [
|
| 139 |
"api-gateway: HTTP 503 rate 78% (baseline: 0.1%)",
|
| 140 |
"auth-service: connection timeout 94% of requests",
|
| 141 |
+
"postgres-db: connection pool 500/500 utilized",
|
| 142 |
+
"checkout flow: unavailable",
|
| 143 |
+
"user logins: failing",
|
| 144 |
],
|
| 145 |
+
"error_rate": 0.78,
|
| 146 |
+
"duration_minutes": 4,
|
| 147 |
+
"revenue_impact_per_min": 12000,
|
| 148 |
},
|
| 149 |
"known_services": {"api-gateway", "auth-service", "postgres-db"},
|
| 150 |
"tool_responses": {
|
| 151 |
"query_logs": {
|
| 152 |
"api-gateway": (
|
| 153 |
+
"2024-03-15T10:04:12Z ERROR upstream timeout auth-service:8080\n"
|
| 154 |
+
"2024-03-15T10:04:13Z ERROR 503 Service Unavailable\n"
|
| 155 |
+
"2024-03-15T10:04:14Z ERROR circuit breaker OPEN"
|
|
|
|
| 156 |
),
|
| 157 |
"auth-service": (
|
| 158 |
+
"2024-03-15T10:04:10Z ERROR too many clients already\n"
|
| 159 |
+
"2024-03-15T10:04:11Z ERROR connection pool exhausted (500/500)"
|
|
|
|
|
|
|
| 160 |
),
|
| 161 |
"postgres-db": (
|
| 162 |
+
"2024-03-15T10:04:00Z FATAL remaining slots reserved for superuser\n"
|
| 163 |
+
"2024-03-15T10:04:01Z LOG max_connections=500 active=500"
|
|
|
|
|
|
|
| 164 |
),
|
| 165 |
},
|
| 166 |
"check_metrics": {
|
| 167 |
+
"api-gateway": "5xx rate: 78% | p99: 30s | circuit_breaker: OPEN",
|
| 168 |
+
"auth-service": "Error rate: 94% | DB wait: 28s | Queue: 847",
|
| 169 |
+
"postgres-db": "Connections: 500/500 (100%) | CPU: 98% | Memory: 89%",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
},
|
| 171 |
"check_dependencies": {
|
| 172 |
+
"api-gateway": "Depends on: auth-service [CRITICAL]",
|
| 173 |
+
"auth-service": "Depends on: postgres-db [CRITICAL]",
|
| 174 |
+
"postgres-db": "No upstream dependencies",
|
| 175 |
},
|
| 176 |
"check_recent_deploys": {
|
| 177 |
+
"api-gateway": "No recent changes",
|
| 178 |
+
"auth-service": "Deploy 47 min ago — connection pool size change",
|
| 179 |
+
"postgres-db": "No recent changes",
|
|
|
|
|
|
|
|
|
|
| 180 |
},
|
| 181 |
},
|
| 182 |
+
"correct_severity": "P1",
|
| 183 |
"adjacent_severities": ["P2"],
|
| 184 |
},
|
| 185 |
|
| 186 |
+
# AC-002: Ambiguous P2 — degraded but not down
|
| 187 |
{
|
| 188 |
"scenario_id": "AC-002",
|
| 189 |
"description": (
|
| 190 |
+
"Service degradation affecting page load times. "
|
| 191 |
+
"Core transaction flows still operational. "
|
| 192 |
+
"Assess severity carefully."
|
| 193 |
),
|
| 194 |
"incident_summary": (
|
| 195 |
+
"Alert fired: CDN cache performance degraded. "
|
| 196 |
+
"Origin servers under increased load. "
|
| 197 |
+
"Assess the severity of this incident."
|
| 198 |
),
|
| 199 |
"alert": {
|
| 200 |
+
"id": "ALT-20240315-002",
|
| 201 |
+
"title": "CDN cache performance anomaly detected",
|
| 202 |
+
"severity_fired": "UNCLASSIFIED",
|
| 203 |
"affected_services": ["cdn-edge", "product-service", "image-service"],
|
| 204 |
"symptoms": [
|
| 205 |
"CDN cache hit rate: 3% (normal: 94%)",
|
| 206 |
+
"product-service: elevated origin traffic",
|
| 207 |
"image-service: CPU 95%, p99 latency 18s",
|
| 208 |
+
"Product pages: loading slowly",
|
| 209 |
+
"Checkout: still functional",
|
| 210 |
],
|
| 211 |
+
"error_rate": 0.15,
|
| 212 |
+
"duration_minutes": 8,
|
| 213 |
+
"revenue_impact_per_min": 800,
|
| 214 |
},
|
| 215 |
"known_services": {"cdn-edge", "product-service", "image-service"},
|
| 216 |
"tool_responses": {
|
| 217 |
"query_logs": {
|
| 218 |
"cdn-edge": (
|
| 219 |
+
"2024-03-15T10:22:00Z INFO cache MISS ratio: 97%\n"
|
| 220 |
+
"2024-03-15T10:20:11Z WARN mass cache invalidation — 2.1M keys purged\n"
|
|
|
|
| 221 |
"2024-03-15T10:20:10Z INFO purge pattern: /* (ALL keys)"
|
| 222 |
),
|
| 223 |
"product-service": (
|
| 224 |
"2024-03-15T10:22:05Z WARN request queue depth: 12,400\n"
|
| 225 |
+
"2024-03-15T10:22:06Z ERROR timeout from image-service\n"
|
| 226 |
+
"2024-03-15T10:22:07Z WARN worker pool 95%"
|
| 227 |
),
|
| 228 |
"image-service": (
|
| 229 |
+
"2024-03-15T10:22:00Z WARN CPU throttling 95%\n"
|
| 230 |
+
"2024-03-15T10:22:01Z ERROR worker pool exhausted\n"
|
| 231 |
+
"2024-03-15T10:22:02Z WARN memory at 91%"
|
| 232 |
),
|
| 233 |
},
|
| 234 |
"check_metrics": {
|
| 235 |
+
"cdn-edge": "Cache hit: 3% | Origin RPS: 48,000 | Bandwidth: 890 Gbps",
|
| 236 |
+
"product-service": "Origin RPS: 48k (norm: 1.2k) | Queue: 12,400",
|
| 237 |
+
"image-service": "CPU: 95% | Memory: 91% | p99: 18s",
|
| 238 |
+
},
|
| 239 |
+
"check_dependencies": {
|
| 240 |
+
"cdn-edge": "Origin: product-service [OVERLOADED]",
|
| 241 |
+
"product-service": "Depends on: image-service [DEGRADED]",
|
| 242 |
+
"image-service": "Depends on: object-storage [OK]",
|
| 243 |
+
},
|
| 244 |
+
"check_recent_deploys": {
|
| 245 |
+
"cdn-edge": "Cronjob updated 2h ago — purge pattern changed",
|
| 246 |
+
"product-service": "No recent changes",
|
| 247 |
+
"image-service": "No recent changes",
|
| 248 |
+
},
|
| 249 |
+
},
|
| 250 |
+
"correct_severity": "P2",
|
| 251 |
+
"adjacent_severities": ["P1", "P3"],
|
| 252 |
+
},
|
| 253 |
+
|
| 254 |
+
# AC-003: P3 trap — high error rate but zero impact
|
| 255 |
+
{
|
| 256 |
+
"scenario_id": "AC-003",
|
| 257 |
+
"description": (
|
| 258 |
+
"Internal service reporting elevated errors. "
|
| 259 |
+
"Determine actual user and business impact. "
|
| 260 |
+
"Not all high error rates are critical."
|
| 261 |
+
),
|
| 262 |
+
"incident_summary": (
|
| 263 |
+
"Alert fired: recommendation-service error rate elevated to 45%. "
|
| 264 |
+
"Assess the severity based on actual user and business impact."
|
| 265 |
+
),
|
| 266 |
+
"alert": {
|
| 267 |
+
"id": "ALT-20240315-003",
|
| 268 |
+
"title": "recommendation-service error rate 45%",
|
| 269 |
+
"severity_fired": "UNCLASSIFIED",
|
| 270 |
+
"affected_services": ["recommendation-service", "product-service"],
|
| 271 |
+
"symptoms": [
|
| 272 |
+
"recommendation-service: error rate 45% (baseline: 2%)",
|
| 273 |
+
"product-service: using fallback recommendation logic",
|
| 274 |
+
"User experience: default recommendations shown",
|
| 275 |
+
"Checkout: fully functional",
|
| 276 |
+
"Revenue: no measurable change",
|
| 277 |
+
],
|
| 278 |
+
"error_rate": 0.45,
|
| 279 |
+
"duration_minutes": 22,
|
| 280 |
+
"revenue_impact_per_min": 0,
|
| 281 |
+
},
|
| 282 |
+
"known_services": {"recommendation-service", "product-service", "redis-reco-cache"},
|
| 283 |
+
"tool_responses": {
|
| 284 |
+
"query_logs": {
|
| 285 |
+
"recommendation-service": (
|
| 286 |
+
"2024-03-15T09:48:00Z ERROR model inference timeout (>5s)\n"
|
| 287 |
+
"2024-03-15T09:48:01Z WARN ML model server overloaded\n"
|
| 288 |
+
"2024-03-15T09:48:02Z INFO fallback: returning default recommendations"
|
| 289 |
),
|
| 290 |
"product-service": (
|
| 291 |
+
"2024-03-15T09:48:05Z INFO recommendation-service returned defaults\n"
|
| 292 |
+
"2024-03-15T09:48:06Z INFO serving page with default recs — no user impact"
|
| 293 |
),
|
| 294 |
+
"redis-reco-cache": "Operating normally — cache hit rate 88%",
|
| 295 |
+
},
|
| 296 |
+
"check_metrics": {
|
| 297 |
+
"recommendation-service": (
|
| 298 |
+
"Error rate: 45% | Fallback rate: 45% | "
|
| 299 |
+
"Model server: OVERLOADED | User impact: NONE (graceful)"
|
| 300 |
),
|
| 301 |
+
"product-service": (
|
| 302 |
+
"Error rate: 0.1% (normal) | Checkout: 100% | Revenue: unchanged"
|
| 303 |
+
),
|
| 304 |
+
"redis-reco-cache": "Hit rate: 88% | Memory: 34% | HEALTHY",
|
| 305 |
},
|
| 306 |
"check_dependencies": {
|
| 307 |
+
"recommendation-service": "Depends on: ML model server [SLOW]",
|
| 308 |
+
"product-service": "Depends on: recommendation-service [DEGRADED — has fallback]",
|
| 309 |
+
"redis-reco-cache": "No dependencies",
|
| 310 |
},
|
| 311 |
"check_recent_deploys": {
|
| 312 |
+
"recommendation-service": "Model update 3h ago — new model v2.4",
|
| 313 |
+
"product-service": "No recent changes",
|
| 314 |
+
"redis-reco-cache": "No recent changes",
|
|
|
|
|
|
|
|
|
|
| 315 |
},
|
| 316 |
},
|
| 317 |
+
"correct_severity": "P3",
|
| 318 |
+
"adjacent_severities": ["P2", "P4"],
|
| 319 |
},
|
| 320 |
],
|
| 321 |
|
| 322 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 323 |
+
# TASK 2: ROOT CAUSE ANALYSIS (MEDIUM)
|
| 324 |
+
# Target: 8B→0.30-0.50, 70B→0.45-0.65
|
| 325 |
+
#
|
| 326 |
+
# KEY DESIGN RULES:
|
| 327 |
+
# 1. Root cause service NEVER in affected_services
|
| 328 |
+
# 2. incident_summary describes SYMPTOMS only, no hints
|
| 329 |
+
# 3. 8-10 known_services (many to investigate)
|
| 330 |
+
# 4. Red herring deploys on non-root-cause services
|
| 331 |
+
# 5. Root cause only findable via check_recent_deploys + query_logs
|
| 332 |
+
# on the specific service — not from looking at victims
|
| 333 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 334 |
|
| 335 |
"root_cause_analysis": [
|
| 336 |
|
| 337 |
+
# RCA-001: analytics-service OOM kills postgres-db
|
| 338 |
+
# 8 known services. Root cause: analytics-service.
|
| 339 |
+
# Red herrings: auth-service deploy (cosmetic), redis healthy
|
| 340 |
{
|
| 341 |
"scenario_id": "RCA-001",
|
| 342 |
"description": (
|
| 343 |
+
"Multiple services reporting failures. Database appears to be "
|
| 344 |
+
"the epicenter but the true trigger may be elsewhere."
|
|
|
|
| 345 |
),
|
| 346 |
"incident_summary": (
|
| 347 |
+
"Multiple services are failing. postgres-db is in a crash loop. "
|
| 348 |
+
"auth-service, order-service, and api-gateway are all reporting errors. "
|
| 349 |
+
"Investigate all available services to find what triggered this cascade."
|
| 350 |
),
|
| 351 |
"alert": {
|
| 352 |
+
"id": "ALT-RCA-001",
|
| 353 |
+
"title": "Multiple service failures — database crash loop",
|
| 354 |
+
"severity_fired": "P1",
|
| 355 |
"affected_services": [
|
| 356 |
"api-gateway", "auth-service", "order-service", "postgres-db",
|
| 357 |
],
|
| 358 |
"symptoms": [
|
| 359 |
+
"postgres-db: crash loop — 4 restarts in 12 minutes",
|
| 360 |
+
"auth-service: 100% connection failures",
|
| 361 |
"order-service: all writes failing",
|
| 362 |
+
"api-gateway: 503 on authenticated routes",
|
|
|
|
| 363 |
],
|
| 364 |
+
"error_rate": 0.95,
|
| 365 |
"duration_minutes": 14,
|
| 366 |
},
|
| 367 |
"known_services": {
|
| 368 |
"api-gateway", "auth-service", "order-service",
|
| 369 |
"postgres-db", "analytics-service", "redis-session",
|
| 370 |
+
"product-service", "notification-service",
|
| 371 |
},
|
| 372 |
"tool_responses": {
|
| 373 |
+
# query_logs responses for RCA-001: log excerpts keyed by service name.
|
| 374 |
+
"query_logs": {
|
| 375 |
+
"postgres-db": (
|
| 376 |
+
"2024-03-16T02:11:00Z LOG database system shut down\n"
|
| 377 |
+
"2024-03-16T02:10:58Z FATAL terminated by kernel OOM killer\n"
|
| 378 |
+
"2024-03-16T02:10:30Z LOG long-running query from "
|
| 379 |
+
"analytics-service consuming all available memory — "
|
| 380 |
+
"running for 12 minutes, no LIMIT clause"
|
| 381 |
+
),
|
| 382 |
+
"analytics-service": (
|
| 383 |
+
"2024-03-16T01:58:00Z INFO starting scheduled job: full_history_export\n"
|
| 384 |
+
"2024-03-16T01:58:01Z DEBUG executing: SELECT * FROM events "
|
| 385 |
+
"JOIN user_sessions ON ... JOIN orders ON ... — no LIMIT\n"
|
| 386 |
+
"2024-03-16T01:58:02Z WARN query plan estimates 847M row scan\n"
|
| 387 |
+
"2024-03-16T02:10:55Z ERROR job terminated — connection to database lost"
|
| 388 |
+
),
|
| 389 |
+
"auth-service": (
|
| 390 |
+
"2024-03-16T02:11:05Z ERROR connect ECONNREFUSED postgres-db:5432\n"
|
| 391 |
+
"2024-03-16T02:11:06Z ERROR all retries exhausted"
|
| 392 |
+
),
|
| 393 |
+
"api-gateway": (
|
| 394 |
+
"2024-03-16T02:11:10Z ERROR upstream auth-service: 503"
|
| 395 |
+
),
|
| 396 |
+
"order-service": (
|
| 397 |
+
"2024-03-16T02:11:08Z ERROR pq: database system is starting up"
|
| 398 |
+
),
|
| 399 |
+
"redis-session": "No errors — operating normally",
|
| 400 |
+
"product-service": (
|
| 401 |
+
"2024-03-16T02:11:12Z WARN DB queries failing — serving cached data"
|
| 402 |
+
),
|
| 403 |
+
"notification-service": (
|
| 404 |
+
"2024-03-16T02:11:15Z ERROR cannot send — user lookup failed"
|
| 405 |
+
),
|
| 406 |
+
},
|
| 407 |
"check_metrics": {
|
| 408 |
"postgres-db": (
|
| 409 |
+
"Memory: peaked at 31.8GB/32GB before kill | "
|
| 410 |
+
"Restarts: 4 in 12min | Status: RESTARTING | "
|
| 411 |
+
"Heaviest client: 10.0.5.47"
|
| 412 |
),
|
| 413 |
"analytics-service": (
|
| 414 |
+
"Last job: FAILED | Memory during job: 28GB | "
|
| 415 |
+
"IP: 10.0.5.47 | CPU: idle (job terminated)"
|
| 416 |
),
|
| 417 |
+
"auth-service": "Connections: 0% success | Queued requests: 1,200",
|
| 418 |
+
"api-gateway": "503 rate: 95% | Auth: DOWN",
|
| 419 |
"order-service": "Write success: 0% | DB: RESTARTING",
|
| 420 |
+
"redis-session": "Hit rate: 99.2% | Memory: 42% | HEALTHY",
|
| 421 |
+
"product-service": "Serving cached data | DB queries: 100% failing",
|
| 422 |
+
"notification-service": "Queue backlog: 8,400 | DB: DOWN",
|
| 423 |
},
|
| 424 |
"check_dependencies": {
|
| 425 |
"postgres-db": (
|
| 426 |
+
"Clients: auth-service, order-service, analytics-service, "
|
| 427 |
+
"product-service, notification-service"
|
| 428 |
),
|
| 429 |
"analytics-service": "Depends on: postgres-db [CRASH LOOP]",
|
| 430 |
+
"auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
|
| 431 |
+
"api-gateway": "Depends on: auth-service [DOWN], product-service [DEGRADED]",
|
| 432 |
+
"order-service": "Depends on: postgres-db [CRASH LOOP]",
|
| 433 |
+
"redis-session": "Standalone cache — no DB dependency",
|
| 434 |
+
"product-service": "Depends on: postgres-db [CRASH LOOP — using cache]",
|
| 435 |
+
"notification-service": "Depends on: postgres-db [CRASH LOOP]",
|
| 436 |
},
|
| 437 |
"check_recent_deploys": {
|
| 438 |
"analytics-service": (
|
| 439 |
+
"Deploy 6h ago: added scheduled data export job — "
|
| 440 |
+
"runs daily at 02:00 UTC. Change includes cross-table "
|
| 441 |
+
"JOIN query without LIMIT clause"
|
| 442 |
+
),
|
| 443 |
+
"postgres-db": "No deploys in 3 weeks",
|
| 444 |
+
"auth-service": (
|
| 445 |
+
"Deploy 2h ago: updated structured logging format. "
|
| 446 |
+
"No functional changes, no query changes, no connection changes."
|
| 447 |
),
|
|
|
|
|
|
|
| 448 |
"order-service": "No recent deploys",
|
| 449 |
"redis-session": "No recent deploys",
|
| 450 |
+
"api-gateway": "No recent deploys",
|
| 451 |
+
"product-service": (
|
| 452 |
+
"Deploy 3 days ago: added product image lazy loading. "
|
| 453 |
+
"No DB changes."
|
| 454 |
+
),
|
| 455 |
+
"notification-service": "No recent deploys",
|
| 456 |
},
|
| 457 |
"check_service_status": {
|
| 458 |
+
"postgres-db": "RESTARTING | Uptime: 47s | Last crash: OOM",
|
| 459 |
+
"analytics-service": "ERROR | Last job: FAILED 12min ago",
|
| 460 |
+
"auth-service": "DOWN | Blocked on postgres-db",
|
| 461 |
+
"api-gateway": "DEGRADED | 95% errors",
|
| 462 |
+
"order-service": "DOWN | Blocked on postgres-db",
|
| 463 |
+
"redis-session": "HEALTHY | 99.2% hit rate",
|
| 464 |
+
"product-service": "DEGRADED | Cache fallback active",
|
| 465 |
+
"notification-service": "DEGRADED | Queue backlog 8,400",
|
| 466 |
},
|
| 467 |
},
|
| 468 |
"correct_root_cause": {
|
| 469 |
+
"service": "analytics-service",
|
| 470 |
"failure_mode": "unbounded query OOM killing postgres-db",
|
| 471 |
},
|
| 472 |
"wrong_actions": {
|
| 473 |
+
"restart_service:auth-service": "victim — DB must be fixed first",
|
| 474 |
+
"restart_service:api-gateway": "downstream — won't help",
|
| 475 |
+
"restart_service:order-service": "victim — won't help",
|
| 476 |
+
"scale_service:postgres-db": "won't prevent OOM from bad query",
|
| 477 |
+
"rollback_deploy:postgres-db": "no recent deploys",
|
| 478 |
+
"rollback_deploy:auth-service": "auth deploy was cosmetic only",
|
| 479 |
+
"rollback_deploy:product-service": "product deploy unrelated",
|
| 480 |
+
"restart_service:redis-session": "redis is healthy",
|
| 481 |
+
"restart_service:notification-service": "victim — won't help",
|
| 482 |
},
|
| 483 |
},
|
| 484 |
|
| 485 |
+
# RCA-002: network-infra BGP withdrawal
|
| 486 |
+
# 8 known services. Root cause: network-infra.
|
| 487 |
+
# Red herrings: payment-service looks down, postgres-db exists
|
| 488 |
{
|
| 489 |
"scenario_id": "RCA-002",
|
| 490 |
"description": (
|
| 491 |
+
"Checkout failures concentrated in specific availability zones. "
|
| 492 |
+
"Some services appear unreachable while others work fine."
|
|
|
|
| 493 |
),
|
| 494 |
"incident_summary": (
|
| 495 |
+
"Checkout failure rate has spiked to 61%. payment-service and "
|
| 496 |
+
"fraud-detection-service are unreachable from some parts of the "
|
| 497 |
+
"infrastructure but appear healthy from others. Multiple services "
|
| 498 |
+
"to investigate. Find the root cause."
|
| 499 |
),
|
| 500 |
"alert": {
|
| 501 |
+
"id": "ALT-RCA-002",
|
| 502 |
+
"title": "Checkout failures — partial service unreachability",
|
| 503 |
+
"severity_fired": "P2",
|
| 504 |
"affected_services": [
|
| 505 |
"order-service", "payment-service", "fraud-detection-service",
|
| 506 |
],
|
| 507 |
"symptoms": [
|
| 508 |
+
"checkout failure rate: 61%",
|
| 509 |
+
"payment-service: intermittently unreachable",
|
| 510 |
+
"fraud-detection-service: intermittently unreachable",
|
| 511 |
+
"failures appear zone-specific",
|
|
|
|
| 512 |
],
|
| 513 |
+
"error_rate": 0.61,
|
| 514 |
"duration_minutes": 9,
|
| 515 |
},
|
| 516 |
"known_services": {
|
| 517 |
"order-service", "payment-service", "fraud-detection-service",
|
| 518 |
"postgres-db", "redis-payment-cache", "network-infra",
|
| 519 |
+
"cdn-edge", "api-gateway",
|
| 520 |
},
|
| 521 |
"tool_responses": {
|
| 522 |
+
# query_logs responses for RCA-002: log excerpts keyed by service name.
|
| 523 |
+
"query_logs": {
|
| 524 |
+
"order-service": (
|
| 525 |
+
"2024-03-17T14:32:10Z ERROR connection timeout "
|
| 526 |
+
"payment-service:8080 — no route to host\n"
|
| 527 |
+
"2024-03-17T14:32:11Z ERROR fraud-detection-service: i/o timeout\n"
|
| 528 |
+
"2024-03-17T14:32:12Z WARN failures only from AZ-2/AZ-3, "
|
| 529 |
+
"AZ-1 traffic normal — possible network-infra issue"
|
| 530 |
+
),
|
| 531 |
+
"payment-service": (
|
| 532 |
+
"2024-03-17T14:31:58Z WARN health check from external LB failing\n"
|
| 533 |
+
"2024-03-17T14:31:59Z INFO local AZ-1 traffic: all normal\n"
|
| 534 |
+
"2024-03-17T14:32:00Z INFO processing requests normally (local only)"
|
| 535 |
+
),
|
| 536 |
+
"fraud-detection-service": (
|
| 537 |
+
"2024-03-17T14:32:00Z INFO local requests: processing normally\n"
|
| 538 |
+
"2024-03-17T14:32:01Z WARN external health probes: 100% timeout"
|
| 539 |
+
),
|
| 540 |
+
"network-infra": (
|
| 541 |
+
"2024-03-17T14:31:45Z CRITICAL BGP session 10.0.2.1 DOWN — "
|
| 542 |
+
"routes to 10.0.1.0/24 withdrawn from peer\n"
|
| 543 |
+
"2024-03-17T14:31:45Z CRITICAL BGP session 10.0.3.1 DOWN — "
|
| 544 |
+
"routes to 10.0.1.0/24 withdrawn from peer\n"
|
| 545 |
+
"2024-03-17T14:31:44Z INFO configuration change applied — "
|
| 546 |
+
"export filter policy updated"
|
| 547 |
+
),
|
| 548 |
+
"postgres-db": "Operating normally — no errors",
|
| 549 |
+
"redis-payment-cache": "Operating normally — all healthy",
|
| 550 |
+
"cdn-edge": "Operating normally — cache serving fine",
|
| 551 |
+
"api-gateway": (
|
| 552 |
+
"2024-03-17T14:32:15Z ERROR some backend routes timing out\n"
|
| 553 |
+
"2024-03-17T14:32:16Z INFO AZ-1 backends: responding normally"
|
| 554 |
+
),
|
| 555 |
+
},
|
| 556 |
+
"check_metrics": {
|
| 557 |
"order-service": (
|
| 558 |
+
"Failure rate varies by source AZ: "
|
| 559 |
+
"AZ-1: 0.2% | AZ-2: 99% | AZ-3: 98%"
|
|
|
|
| 560 |
),
|
| 561 |
"payment-service": (
|
| 562 |
+
"Internal processing: 100% success | "
|
| 563 |
+
"Inbound from AZ-2: 0 connections | Inbound from AZ-3: 0 connections | "
|
| 564 |
+
"Inbound from AZ-1: normal"
|
| 565 |
),
|
| 566 |
"fraud-detection-service": (
|
| 567 |
+
"Internal: normal | External probes: 100% timeout"
|
|
|
|
| 568 |
),
|
| 569 |
"network-infra": (
|
| 570 |
+
"BGP sessions: AZ-1 internal UP | "
|
| 571 |
+
"AZ-2→AZ-1: WITHDRAWN | AZ-3→AZ-1: WITHDRAWN | "
|
| 572 |
+
"Last change: 18min ago"
|
|
|
|
|
|
|
|
|
|
| 573 |
),
|
| 574 |
+
"postgres-db": "All metrics normal",
|
| 575 |
+
"redis-payment-cache": "All metrics normal",
|
| 576 |
+
"cdn-edge": "Cache hit: 91% | Normal operation",
|
| 577 |
+
"api-gateway": "Mixed — AZ-1 OK, AZ-2/AZ-3 partial failures",
|
| 578 |
},
|
| 579 |
+
"check_dependencies": {
|
| 580 |
"order-service": (
|
| 581 |
+
"Depends on: payment-service [PARTIAL], "
|
| 582 |
+
"fraud-detection-service [PARTIAL]"
|
| 583 |
),
|
| 584 |
+
"payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
|
| 585 |
+
"fraud-detection-service": "Depends on: postgres-db [OK]",
|
| 586 |
+
"network-infra": (
|
| 587 |
+
"BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN], AZ-1 [UP]"
|
|
|
|
|
|
|
|
|
|
| 588 |
),
|
| 589 |
+
"postgres-db": "All connections healthy",
|
| 590 |
+
"redis-payment-cache": "All connections healthy",
|
| 591 |
+
"cdn-edge": "No issues",
|
| 592 |
+
"api-gateway": "Depends on: multiple backends [MIXED]",
|
| 593 |
+
},
|
| 594 |
+
"check_recent_deploys": {
|
| 595 |
"network-infra": (
|
| 596 |
+
"Router configuration change 18min ago — modified BGP "
|
| 597 |
+
"export filter policy. Change accidentally removed AZ-1 "
|
| 598 |
+
"prefix 10.0.1.0/24 from advertisements to AZ-2 and AZ-3 peers."
|
| 599 |
+
),
|
| 600 |
+
"payment-service": "No recent deploys",
|
| 601 |
+
"order-service": "No recent deploys",
|
| 602 |
+
"fraud-detection-service": "No recent deploys",
|
| 603 |
+
"postgres-db": (
|
| 604 |
+
"Minor config change 5 days ago — increased shared_buffers. "
|
| 605 |
+
"No issues since."
|
| 606 |
+
),
|
| 607 |
+
"redis-payment-cache": "No recent deploys",
|
| 608 |
+
"cdn-edge": "No recent deploys",
|
| 609 |
+
"api-gateway": (
|
| 610 |
+
"Deploy 1 day ago — added request tracing headers. "
|
| 611 |
+
"No routing changes."
|
| 612 |
),
|
| 613 |
+
},
|
| 614 |
+
"check_service_status": {
|
| 615 |
+
"payment-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
|
| 616 |
+
"order-service": "DEGRADED | Partial failures",
|
| 617 |
+
"network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
|
| 618 |
+
"fraud-detection-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
|
| 619 |
+
"postgres-db": "HEALTHY",
|
| 620 |
+
"redis-payment-cache": "HEALTHY",
|
| 621 |
+
"cdn-edge": "HEALTHY",
|
| 622 |
+
"api-gateway": "DEGRADED | Mixed backend status",
|
| 623 |
+
},
|
| 624 |
+
},
|
| 625 |
+
"correct_root_cause": {
|
| 626 |
+
"service": "network-infra",
|
| 627 |
+
"failure_mode": "BGP route withdrawal causing AZ network partition",
|
| 628 |
+
},
|
| 629 |
+
"wrong_actions": {
|
| 630 |
+
"restart_service:payment-service": "healthy — network issue",
|
| 631 |
+
"restart_service:order-service": "victim",
|
| 632 |
+
"scale_service:payment-service": "won't fix routing",
|
| 633 |
+
"clear_cache:redis-payment-cache": "cache is healthy",
|
| 634 |
+
"restart_service:api-gateway": "victim of routing issue",
|
| 635 |
+
"rollback_deploy:api-gateway": "deploy was unrelated tracing headers",
|
| 636 |
+
"rollback_deploy:postgres-db": "config change was 5 days ago, unrelated",
|
| 637 |
+
"restart_service:cdn-edge": "CDN is healthy",
|
| 638 |
+
},
|
| 639 |
+
},
|
| 640 |
+
|
| 641 |
+
# RCA-003: config-service credential rotation bug
|
| 642 |
+
# 8 known services. Root cause: config-service.
|
| 643 |
+
# Red herrings: user-service had a recent deploy, postgres-db stressed
|
| 644 |
+
{
|
| 645 |
+
"scenario_id": "RCA-003",
|
| 646 |
+
"description": (
|
| 647 |
+
"Multiple services experiencing database authentication failures. "
|
| 648 |
+
"The database itself may not be the problem."
|
| 649 |
+
),
|
| 650 |
+
"incident_summary": (
|
| 651 |
+
"Several services are reporting database authentication failures. "
|
| 652 |
+
"postgres-db connection pool is saturated. user-service and "
|
| 653 |
+
"notification-service are down. api-gateway error rate elevated. "
|
| 654 |
+
"Investigate all services to find what triggered this."
|
| 655 |
+
),
|
| 656 |
+
"alert": {
|
| 657 |
+
"id": "ALT-RCA-003",
|
| 658 |
+
"title": "Multiple services — database authentication failures",
|
| 659 |
+
"severity_fired": "P2",
|
| 660 |
+
"affected_services": [
|
| 661 |
+
"api-gateway", "user-service", "notification-service", "postgres-db",
|
| 662 |
+
],
|
| 663 |
+
"symptoms": [
|
| 664 |
+
"user-service: FATAL password authentication failed",
|
| 665 |
+
"notification-service: FATAL password authentication failed",
|
| 666 |
+
"api-gateway: 503 rate 62%",
|
| 667 |
+
"postgres-db: connection pool 490/500",
|
| 668 |
+
],
|
| 669 |
+
"error_rate": 0.62,
|
| 670 |
+
"duration_minutes": 7,
|
| 671 |
+
},
|
| 672 |
+
"known_services": {
|
| 673 |
+
"api-gateway", "user-service", "notification-service",
|
| 674 |
+
"postgres-db", "config-service", "redis-session",
|
| 675 |
+
"order-service", "product-service",
|
| 676 |
+
},
|
| 677 |
+
"tool_responses": {
|
| 678 |
+
# query_logs responses for RCA-003: log excerpts keyed by service name.
|
| 679 |
+
"query_logs": {
|
| 680 |
+
"user-service": (
|
| 681 |
+
"2024-03-18T08:14:00Z FATAL password authentication failed "
|
| 682 |
+
"for user 'app_user'\n"
|
| 683 |
+
"2024-03-18T08:14:01Z ERROR DB credentials rejected — "
|
| 684 |
+
"credentials were pushed by config-service at 08:12:00Z\n"
|
| 685 |
+
"2024-03-18T08:14:02Z WARN config-service credential rotation "
|
| 686 |
+
"may have sent wrong credentials"
|
| 687 |
+
),
|
| 688 |
+
"notification-service": (
|
| 689 |
+
"2024-03-18T08:14:05Z FATAL password authentication failed\n"
|
| 690 |
+
"2024-03-18T08:14:06Z WARN credentials from config-service "
|
| 691 |
+
"push at 08:12:00Z appear to be stale/invalid"
|
| 692 |
+
),
|
| 693 |
+
"api-gateway": (
|
| 694 |
+
"2024-03-18T08:14:10Z ERROR upstream user-service: 503\n"
|
| 695 |
+
"2024-03-18T08:14:11Z ERROR upstream notification-service: 503"
|
| 696 |
+
),
|
| 697 |
+
"postgres-db": (
|
| 698 |
+
"2024-03-18T08:14:00Z LOG auth failure from 10.0.3.x\n"
|
| 699 |
+
"2024-03-18T08:14:00Z LOG auth failure from 10.0.4.x\n"
|
| 700 |
+
"2024-03-18T08:14:01Z LOG 490/500 slots used by failed auth retries"
|
| 701 |
+
),
|
| 702 |
+
"config-service": (
|
| 703 |
+
"2024-03-18T08:12:00Z INFO secrets rotation job executed\n"
|
| 704 |
+
"2024-03-18T08:12:01Z WARN rotation referenced PREVIOUS "
|
| 705 |
+
"credential set instead of generating new — template bug "
|
| 706 |
+
"in version v3.2.1\n"
|
| 707 |
+
"2024-03-18T08:12:02Z INFO pushed credentials to: "
|
| 708 |
+
"user-service, notification-service, order-service"
|
| 709 |
+
),
|
| 710 |
+
"redis-session": "Operating normally",
|
| 711 |
+
"order-service": (
|
| 712 |
+
"2024-03-18T08:14:20Z WARN received credential push from "
|
| 713 |
+
"config-service but have not restarted — still using old valid creds"
|
| 714 |
+
),
|
| 715 |
+
"product-service": "Operating normally — using original credentials",
|
| 716 |
+
},
|
| 717 |
+
"check_metrics": {
|
| 718 |
+
"user-service": "DB auth: 100% failure | HTTP 503: 100%",
|
| 719 |
+
"notification-service": "DB auth: 100% failure | HTTP 503: 100%",
|
| 720 |
+
"api-gateway": "503 rate: 62% | Some upstreams DOWN",
|
| 721 |
+
"postgres-db": (
|
| 722 |
+
"Connections: 490/500 | Auth failures/s: 80 | "
|
| 723 |
+
"Valid connections: 10 | DB itself: HEALTHY"
|
| 724 |
+
),
|
| 725 |
+
"config-service": (
|
| 726 |
+
"Status: HEALTHY | Last push: 7min ago | "
|
| 727 |
+
"Type: secrets_rotation | Result: COMPLETED"
|
| 728 |
+
),
|
| 729 |
+
"redis-session": "All normal",
|
| 730 |
+
"order-service": "Using old credentials — still working",
|
| 731 |
+
"product-service": "All normal — unaffected",
|
| 732 |
},
|
| 733 |
"check_dependencies": {
|
| 734 |
+
"user-service": (
|
| 735 |
+
"Depends on: postgres-db [AUTH FAIL], "
|
| 736 |
+
"config-service [credential source]"
|
| 737 |
+
),
|
| 738 |
+
"notification-service": (
|
| 739 |
+
"Depends on: postgres-db [AUTH FAIL], "
|
| 740 |
+
"config-service [credential source]"
|
| 741 |
+
),
|
| 742 |
+
"api-gateway": "Depends on: user-service [DOWN], notification-service [DOWN]",
|
| 743 |
+
"postgres-db": "No upstream dependencies — DB is healthy",
|
| 744 |
+
"config-service": (
|
| 745 |
+
"Provides: credentials to user-service, "
|
| 746 |
+
"notification-service, order-service"
|
| 747 |
+
),
|
| 748 |
+
"redis-session": "Standalone",
|
| 749 |
"order-service": (
|
| 750 |
+
"Depends on: postgres-db [OK — old creds], "
|
| 751 |
+
"config-service [pending push]"
|
| 752 |
),
|
| 753 |
+
"product-service": "Depends on: postgres-db [OK — original creds]",
|
|
|
|
|
|
|
| 754 |
},
|
| 755 |
"check_recent_deploys": {
|
| 756 |
+
"config-service": (
|
| 757 |
+
"Deploy 2h ago: version v3.2.1 — updated secrets rotation "
|
| 758 |
+
"job template. Bug: rotation references previous credential "
|
| 759 |
+
"set instead of generating new credentials."
|
| 760 |
),
|
| 761 |
+
"user-service": (
|
| 762 |
+
"Deploy 4h ago: added new profile API endpoint. "
|
| 763 |
+
"No database or credential changes."
|
| 764 |
+
),
|
| 765 |
+
"notification-service": "No recent deploys",
|
| 766 |
+
"postgres-db": "No recent deploys",
|
| 767 |
+
"api-gateway": "No recent deploys",
|
| 768 |
+
"redis-session": "No recent deploys",
|
| 769 |
+
"order-service": (
|
| 770 |
+
"Deploy 1 day ago: updated order confirmation email template. "
|
| 771 |
+
"No DB changes."
|
| 772 |
+
),
|
| 773 |
+
"product-service": "No recent deploys",
|
| 774 |
},
|
| 775 |
"check_service_status": {
|
| 776 |
+
"user-service": "DOWN | DB auth failures",
|
| 777 |
+
"notification-service": "DOWN | DB auth failures",
|
| 778 |
+
"api-gateway": "DEGRADED | 62% error rate",
|
| 779 |
+
"postgres-db": "STRESSED but HEALTHY | 490/500 connections (failed auths)",
|
| 780 |
+
"config-service": "HEALTHY | Last rotation: 7min ago (completed)",
|
| 781 |
+
"redis-session": "HEALTHY",
|
| 782 |
+
"order-service": "HEALTHY | Old credentials still valid",
|
| 783 |
+
"product-service": "HEALTHY",
|
| 784 |
},
|
| 785 |
},
|
| 786 |
"correct_root_cause": {
|
| 787 |
+
"service": "config-service",
|
| 788 |
+
"failure_mode": "secrets rotation pushed stale credentials to downstream services",
|
| 789 |
},
|
| 790 |
"wrong_actions": {
|
| 791 |
+
"restart_service:user-service": "will retry with same bad credentials",
|
| 792 |
+
"restart_service:notification-service": "same bad credentials",
|
| 793 |
+
"restart_service:postgres-db": "DB is healthy — client creds are bad",
|
| 794 |
+
"scale_service:postgres-db": "connections are failed auths",
|
| 795 |
+
"rollback_deploy:user-service": "user-service deploy was unrelated",
|
| 796 |
+
"rollback_deploy:order-service": "order-service deploy was unrelated",
|
| 797 |
+
"restart_service:api-gateway": "downstream — fix upstream first",
|
| 798 |
},
|
| 799 |
},
|
| 800 |
],
|
| 801 |
|
| 802 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 803 |
+
# TASK 3: REMEDIATION PLANNING (HARD)
|
| 804 |
+
# Target: 8B→0.15-0.35, 70B→0.30-0.50
|
| 805 |
+
#
|
| 806 |
+
# KEY DESIGN RULES:
|
| 807 |
+
# 1. Same diagnostic challenge as medium
|
| 808 |
+
# 2. 5-7 step remediation sequence required
|
| 809 |
+
# 3. 8-10 known services = many wrong choices
|
| 810 |
+
# 4. Wrong actions carry -0.05 penalty each (up to -0.15)
|
| 811 |
+
# 5. Summary must hit 3+ keywords for bonus
|
| 812 |
+
# 6. incident_summary does NOT reveal root cause
|
| 813 |
+
# ══════════════════════════════════════════════════════════════════════
|
| 814 |
|
| 815 |
"remediation_planning": [
|
| 816 |
|
| 817 |
+
# RP-001: OOM remediation — 6-step sequence, 8 services
|
| 818 |
{
|
| 819 |
"scenario_id": "RP-001",
|
| 820 |
"description": (
|
| 821 |
+
"Full incident remediation required. Multiple services down. "
|
| 822 |
+
"Diagnose the root cause, execute fixes in the correct order, "
|
| 823 |
+
"and document your resolution."
|
| 824 |
),
|
| 825 |
"incident_summary": (
|
| 826 |
+
"CRITICAL — postgres-db is crash-looping. auth-service, order-service, "
|
| 827 |
+
"and api-gateway are all down. notification-service queue backing up. "
|
| 828 |
+
"Diagnose the root cause, fix it, restore all services, and document."
|
| 829 |
),
|
| 830 |
"alert": {
|
| 831 |
+
"id": "ALT-RP-001",
|
| 832 |
+
"title": "CRITICAL: database crash loop — multiple services down",
|
| 833 |
+
"severity_fired": "P1",
|
| 834 |
"affected_services": [
|
| 835 |
+
"postgres-db", "auth-service", "order-service", "api-gateway",
|
|
|
|
| 836 |
],
|
| 837 |
},
|
| 838 |
"known_services": {
|
| 839 |
"postgres-db", "auth-service", "order-service",
|
| 840 |
+
"api-gateway", "analytics-service", "redis-session",
|
| 841 |
+
"product-service", "notification-service",
|
| 842 |
},
|
| 843 |
"tool_responses": {
|
| 844 |
"query_logs": {
|
| 845 |
"postgres-db": (
|
| 846 |
+
"FATAL: terminated by kernel OOM killer — "
|
| 847 |
+
"query from client 10.0.5.47 running 12min consuming all memory"
|
| 848 |
),
|
| 849 |
"analytics-service": (
|
| 850 |
+
"INFO: starting job full_history_export\n"
|
| 851 |
+
"WARN: query plan: 847M rows, cross-table JOIN, no LIMIT\n"
|
| 852 |
+
"ERROR: job terminated — database connection lost"
|
| 853 |
),
|
| 854 |
+
"auth-service": "ERROR: connect ECONNREFUSED postgres-db:5432",
|
| 855 |
+
"order-service": "ERROR: pq: database system is starting up",
|
| 856 |
+
"api-gateway": "ERROR: upstream auth-service 503",
|
| 857 |
+
"redis-session": "Operating normally",
|
| 858 |
+
"product-service": "WARN: DB failing — serving cached data",
|
| 859 |
+
"notification-service": "ERROR: user lookup failed — queuing",
|
| 860 |
},
|
| 861 |
"check_metrics": {
|
| 862 |
+
"postgres-db": "OOM killed | Restarts: 4 | Heaviest client: 10.0.5.47",
|
| 863 |
+
"analytics-service": "Job FAILED | Memory peak: 31GB/32GB | IP: 10.0.5.47",
|
| 864 |
+
"auth-service": "0% DB success | Queue: 1,200",
|
| 865 |
+
"order-service": "0% write success",
|
| 866 |
+
"api-gateway": "503 rate: 95%",
|
| 867 |
+
"redis-session": "HEALTHY | 99.2% hit rate",
|
| 868 |
+
"product-service": "Cache fallback active",
|
| 869 |
+
"notification-service": "Queue: 8,400 messages backed up",
|
| 870 |
},
|
| 871 |
"check_dependencies": {
|
| 872 |
+
"postgres-db": (
|
| 873 |
+
"Clients: auth-service, order-service, analytics-service, "
|
| 874 |
+
"product-service, notification-service"
|
| 875 |
+
),
|
| 876 |
"analytics-service": "Depends on: postgres-db [CRASH LOOP]",
|
| 877 |
+
"auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
|
| 878 |
+
"api-gateway": "Depends on: auth-service [DOWN]",
|
| 879 |
+
"order-service": "Depends on: postgres-db [CRASH LOOP]",
|
| 880 |
+
"redis-session": "Standalone",
|
| 881 |
+
"product-service": "Depends on: postgres-db [CRASH LOOP — cache fallback]",
|
| 882 |
+
"notification-service": "Depends on: postgres-db [CRASH LOOP]",
|
| 883 |
},
|
| 884 |
"check_recent_deploys": {
|
| 885 |
"analytics-service": (
|
| 886 |
+
"Deploy 6h ago: added scheduled export job — "
|
| 887 |
+
"cross-table JOIN without LIMIT clause"
|
| 888 |
),
|
| 889 |
+
"postgres-db": "No deploys in 3 weeks",
|
| 890 |
+
"auth-service": "Deploy 2h ago: logging format only — no functional changes",
|
| 891 |
+
"order-service": "No recent deploys",
|
| 892 |
+
"product-service": "Deploy 3 days ago: image lazy loading — no DB changes",
|
| 893 |
+
"notification-service": "No recent deploys",
|
| 894 |
},
|
| 895 |
"check_service_status": {
|
| 896 |
+
"postgres-db": "CRASH LOOP | OOM | Uptime: 47s",
|
| 897 |
+
"analytics-service": "ERROR | Job FAILED",
|
| 898 |
+
"auth-service": "DOWN",
|
| 899 |
+
"order-service": "DOWN",
|
| 900 |
+
"api-gateway": "DEGRADED | 95% errors",
|
| 901 |
+
"redis-session": "HEALTHY",
|
| 902 |
+
"product-service": "DEGRADED | Cache fallback",
|
| 903 |
+
"notification-service": "DEGRADED | Queue backlog",
|
| 904 |
},
|
| 905 |
},
|
| 906 |
"remediation_data": {
|
| 907 |
"disable_feature_flag": {
|
| 908 |
"full_history_export": (
|
| 909 |
"Cron job full_history_export DISABLED — "
|
| 910 |
+
"unbounded query will not execute again"
|
| 911 |
),
|
| 912 |
},
|
| 913 |
"restart_service": {
|
| 914 |
+
"postgres-db": "postgres-db restarted — accepting connections (12/500)",
|
| 915 |
+
"analytics-service": "analytics-service restarted — idle",
|
| 916 |
+
"auth-service": "auth-service restarted — connected to postgres-db OK",
|
| 917 |
+
"order-service": "order-service restarted — writes resuming",
|
| 918 |
+
"api-gateway": "api-gateway restarted — routing recovered",
|
| 919 |
+
"product-service": "product-service — switched from cache to live DB",
|
| 920 |
+
"notification-service": "notification-service — draining queue",
|
|
|
|
|
|
|
| 921 |
},
|
| 922 |
"execute_runbook_step": {
|
| 923 |
+
"verify_db_health": "postgres-db: 12/500 connections, CPU 12%, Memory 34% — healthy",
|
|
|
|
|
|
|
| 924 |
"check_service_recovery": (
|
| 925 |
+
"auth OK | order OK | api-gateway OK | product OK | notification DRAINING"
|
| 926 |
),
|
| 927 |
},
|
| 928 |
},
|
|
|
|
| 932 |
"restart_service:postgres-db",
|
| 933 |
"restart_service:auth-service",
|
| 934 |
"restart_service:order-service",
|
| 935 |
+
"execute_runbook_step:verify_db_health",
|
| 936 |
],
|
| 937 |
"wrong_actions": {
|
| 938 |
+
"rollback_deploy:postgres-db": "no recent deploy",
|
| 939 |
+
"scale_service:postgres-db": "won't prevent OOM",
|
| 940 |
+
"restart_service:api-gateway": "downstream — fix DB stack first",
|
| 941 |
+
"rollback_deploy:auth-service": "cosmetic deploy only",
|
| 942 |
+
"clear_cache:redis-session": "healthy — not related",
|
| 943 |
+
"restart_service:redis-session": "healthy — not related",
|
| 944 |
+
"rollback_deploy:product-service": "unrelated deploy",
|
| 945 |
+
"restart_service:notification-service": "will recover once DB is up",
|
|
|
|
| 946 |
},
|
| 947 |
"resolution_keywords": [
|
| 948 |
"analytics", "oom", "memory", "postgres", "query",
|
| 949 |
+
"full_history_export", "disabled", "restarted",
|
| 950 |
+
"recovered", "unbounded", "crash", "kill",
|
| 951 |
],
|
| 952 |
},
|
| 953 |
|
| 954 |
+
# RP-002: BGP remediation — 4-step sequence, 8 services
|
| 955 |
{
|
| 956 |
"scenario_id": "RP-002",
|
| 957 |
"description": (
|
| 958 |
+
"Full incident remediation required. Checkout failures affecting "
|
| 959 |
+
"most users. Diagnose, fix, verify, and document."
|
|
|
|
| 960 |
),
|
| 961 |
"incident_summary": (
|
| 962 |
+
"Checkout failure rate 61%. payment-service unreachable from most "
|
| 963 |
+
"of the infrastructure. Some services report no issues. "
|
| 964 |
+
"Diagnose the root cause, execute remediation, verify recovery, "
|
| 965 |
+
"and document the resolution."
|
| 966 |
),
|
| 967 |
"alert": {
|
| 968 |
+
"id": "ALT-RP-002",
|
| 969 |
+
"title": "Checkout failures — partial service unreachability",
|
| 970 |
+
"severity_fired": "P2",
|
| 971 |
+
"affected_services": ["order-service", "payment-service"],
|
| 972 |
},
|
| 973 |
"known_services": {
|
| 974 |
"network-infra", "order-service", "payment-service",
|
| 975 |
"fraud-detection-service", "postgres-db",
|
| 976 |
+
"redis-payment-cache", "cdn-edge", "api-gateway",
|
| 977 |
},
|
| 978 |
"tool_responses": {
|
| 979 |
"query_logs": {
|
| 980 |
"network-infra": (
|
| 981 |
+
"CRITICAL: BGP peer 10.0.2.1 route withdrawal — "
|
| 982 |
+
"routes to 10.0.1.0/24 removed\n"
|
| 983 |
+
"CRITICAL: BGP peer 10.0.3.1 route withdrawal — "
|
| 984 |
+
"routes to 10.0.1.0/24 removed\n"
|
| 985 |
+
"INFO: configuration change applied — export filter updated"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 986 |
),
|
| 987 |
+
"order-service": "ERROR: timeout payment-service — no route to host",
|
| 988 |
+
"payment-service": "INFO: local traffic normal | WARN: external health failing",
|
| 989 |
+
"fraud-detection-service": "WARN: cross-AZ probes timeout | Local: OK",
|
| 990 |
"postgres-db": "Operating normally",
|
| 991 |
+
"redis-payment-cache": "Operating normally",
|
| 992 |
+
"cdn-edge": "Operating normally",
|
| 993 |
+
"api-gateway": "ERROR: some backend routes timing out",
|
| 994 |
},
|
| 995 |
"check_metrics": {
|
| 996 |
+
"network-infra": (
|
| 997 |
+
"BGP AZ-2→AZ-1: WITHDRAWN | AZ-3→AZ-1: WITHDRAWN | "
|
| 998 |
+
"AZ-1 internal: UP | Last change: 18min ago"
|
| 999 |
+
),
|
| 1000 |
+
"order-service": "AZ-1: 0.2% fail | AZ-2: 99% fail | AZ-3: 98% fail",
|
| 1001 |
+
"payment-service": "Internal: 100% success | External: 0 inbound from AZ-2/3",
|
| 1002 |
+
"fraud-detection-service": "Local: normal | External: timeout",
|
| 1003 |
+
"postgres-db": "All normal",
|
| 1004 |
+
"redis-payment-cache": "All normal",
|
| 1005 |
+
"cdn-edge": "Cache: 91% hit | Normal",
|
| 1006 |
+
"api-gateway": "Mixed — AZ-1 OK, AZ-2/3 partial failures",
|
| 1007 |
},
|
| 1008 |
"check_dependencies": {
|
| 1009 |
+
"order-service": "Depends on: payment-service [PARTIAL], fraud-detection [PARTIAL]",
|
| 1010 |
+
"payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
|
| 1011 |
+
"network-infra": "BGP: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN]",
|
| 1012 |
+
"fraud-detection-service": "Depends on: postgres-db [OK]",
|
| 1013 |
+
"postgres-db": "All healthy",
|
| 1014 |
+
"redis-payment-cache": "All healthy",
|
| 1015 |
+
"cdn-edge": "No issues",
|
| 1016 |
+
"api-gateway": "Mixed backends",
|
| 1017 |
},
|
| 1018 |
"check_recent_deploys": {
|
| 1019 |
"network-infra": (
|
| 1020 |
+
"Config change 18min ago — BGP export filter modified, "
|
| 1021 |
+
"accidentally removed AZ-1 prefix from AZ-2/AZ-3 ads"
|
| 1022 |
),
|
| 1023 |
"payment-service": "No recent deploys",
|
| 1024 |
+
"order-service": "No recent deploys",
|
| 1025 |
+
"fraud-detection-service": "No recent deploys",
|
| 1026 |
+
"postgres-db": "Minor change 5 days ago — increased shared_buffers",
|
| 1027 |
+
"redis-payment-cache": "No recent deploys",
|
| 1028 |
+
"cdn-edge": "No recent deploys",
|
| 1029 |
+
"api-gateway": "Deploy 1 day ago — tracing headers, no routing changes",
|
| 1030 |
},
|
| 1031 |
"check_service_status": {
|
| 1032 |
+
"network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
|
| 1033 |
+
"payment-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
|
| 1034 |
+
"order-service": "DEGRADED",
|
| 1035 |
+
"fraud-detection-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
|
| 1036 |
+
"postgres-db": "HEALTHY",
|
| 1037 |
+
"redis-payment-cache": "HEALTHY",
|
| 1038 |
+
"cdn-edge": "HEALTHY",
|
| 1039 |
+
"api-gateway": "DEGRADED",
|
| 1040 |
},
|
| 1041 |
},
|
| 1042 |
"remediation_data": {
|
| 1043 |
"rollback_deploy": {
|
| 1044 |
+
"network-infra": "Router config rolled back — BGP policy restored",
|
|
|
|
|
|
|
|
|
|
| 1045 |
},
|
| 1046 |
"execute_runbook_step": {
|
| 1047 |
+
"restore_bgp_routes": "BGP routes restored — AZ-2/3 can reach AZ-1",
|
| 1048 |
+
"verify_checkout_recovery": "Checkout failure: 0.3% — resolved",
|
| 1049 |
+
"verify_cross_az_connectivity": "AZ-2→AZ-1: OK | AZ-3→AZ-1: OK",
|
|
|
|
|
|
|
|
|
|
| 1050 |
},
|
| 1051 |
},
|
| 1052 |
"correct_remediation_sequence": [
|
| 1053 |
"execute_runbook_step:restore_bgp_routes",
|
| 1054 |
"rollback_deploy:network-infra",
|
| 1055 |
+
"execute_runbook_step:verify_cross_az_connectivity",
|
| 1056 |
"execute_runbook_step:verify_checkout_recovery",
|
| 1057 |
],
|
| 1058 |
"wrong_actions": {
|
| 1059 |
+
"restart_service:payment-service": "healthy — network issue",
|
| 1060 |
+
"scale_service:payment-service": "won't fix routing",
|
| 1061 |
+
"restart_service:order-service": "victim",
|
| 1062 |
+
"clear_cache:redis-payment-cache": "unrelated",
|
| 1063 |
+
"restart_service:cdn-edge": "healthy",
|
| 1064 |
+
"restart_service:fraud-detection-service": "healthy locally",
|
| 1065 |
+
"restart_service:api-gateway": "victim of routing",
|
| 1066 |
+
"rollback_deploy:api-gateway": "deploy was unrelated",
|
| 1067 |
+
"rollback_deploy:postgres-db": "change was 5 days ago",
|
| 1068 |
},
|
| 1069 |
"resolution_keywords": [
|
| 1070 |
"bgp", "network", "route", "rollback", "partition",
|
| 1071 |
+
"restored", "az-1", "az-2", "az-3", "checkout",
|
| 1072 |
+
"withdrawal", "config", "advertisement", "export",
|
| 1073 |
+
],
|
| 1074 |
+
},
|
| 1075 |
+
|
| 1076 |
+
# RP-003: Credential rotation remediation — 7-step sequence, 8 services
|
| 1077 |
+
{
|
| 1078 |
+
"scenario_id": "RP-003",
|
| 1079 |
+
"description": (
|
| 1080 |
+
"Full incident remediation required. Multiple services failing "
|
| 1081 |
+
"database authentication. Diagnose, fix, verify, and document."
|
| 1082 |
+
),
|
| 1083 |
+
"incident_summary": (
|
| 1084 |
+
"Multiple services reporting database authentication failures. "
|
| 1085 |
+
"postgres-db connection pool near capacity with failed auth attempts. "
|
| 1086 |
+
"user-service and notification-service are down. api-gateway degraded. "
|
| 1087 |
+
"Diagnose the root cause, execute remediation, and document."
|
| 1088 |
+
),
|
| 1089 |
+
"alert": {
|
| 1090 |
+
"id": "ALT-RP-003",
|
| 1091 |
+
"title": "Multiple services — DB authentication failures",
|
| 1092 |
+
"severity_fired": "P2",
|
| 1093 |
+
"affected_services": [
|
| 1094 |
+
"user-service", "notification-service", "api-gateway",
|
| 1095 |
+
],
|
| 1096 |
+
},
|
| 1097 |
+
"known_services": {
|
| 1098 |
+
"api-gateway", "user-service", "notification-service",
|
| 1099 |
+
"postgres-db", "config-service", "redis-session",
|
| 1100 |
+
"order-service", "product-service",
|
| 1101 |
+
},
|
| 1102 |
+
"tool_responses": {
|
| 1103 |
+
"query_logs": {
|
| 1104 |
+
"user-service": (
|
| 1105 |
+
"FATAL: password authentication failed for user 'app_user'\n"
|
| 1106 |
+
"ERROR: DB credentials rejected\n"
|
| 1107 |
+
"WARN: credentials last refreshed at 08:12:00Z"
|
| 1108 |
+
),
|
| 1109 |
+
"notification-service": (
|
| 1110 |
+
"FATAL: password authentication failed\n"
|
| 1111 |
+
"WARN: credentials from 08:12:00Z appear stale"
|
| 1112 |
+
),
|
| 1113 |
+
"api-gateway": (
|
| 1114 |
+
"ERROR: upstream user-service 503\n"
|
| 1115 |
+
"ERROR: upstream notification-service 503"
|
| 1116 |
+
),
|
| 1117 |
+
"postgres-db": (
|
| 1118 |
+
"LOG: auth failure from 10.0.3.x (user-service)\n"
|
| 1119 |
+
"LOG: auth failure from 10.0.4.x (notification-service)\n"
|
| 1120 |
+
"LOG: 490/500 slots used by failed auth retries"
|
| 1121 |
+
),
|
| 1122 |
+
"config-service": (
|
| 1123 |
+
"INFO: secrets rotation executed at 08:12:00Z\n"
|
| 1124 |
+
"WARN: rotation used PREVIOUS credential set — "
|
| 1125 |
+
"template bug in v3.2.1\n"
|
| 1126 |
+
"INFO: pushed to: user-service, notification-service, order-service"
|
| 1127 |
+
),
|
| 1128 |
+
"redis-session": "Operating normally",
|
| 1129 |
+
"order-service": (
|
| 1130 |
+
"WARN: received credential push at 08:12:00Z — "
|
| 1131 |
+
"not applied yet, still using old valid credentials"
|
| 1132 |
+
),
|
| 1133 |
+
"product-service": "Operating normally — using original credentials",
|
| 1134 |
+
},
|
| 1135 |
+
"check_metrics": {
|
| 1136 |
+
"user-service": "DB auth: 100% failure | HTTP 503: 100%",
|
| 1137 |
+
"notification-service": "DB auth: 100% failure | HTTP 503: 100%",
|
| 1138 |
+
"api-gateway": "503 rate: 62%",
|
| 1139 |
+
"postgres-db": "Connections: 490/500 | Auth failures/s: 80 | DB: HEALTHY",
|
| 1140 |
+
"config-service": "HEALTHY | Last push: 7min ago | Type: secrets_rotation",
|
| 1141 |
+
"redis-session": "All normal",
|
| 1142 |
+
"order-service": "HEALTHY | Using old (valid) credentials",
|
| 1143 |
+
"product-service": "HEALTHY | Unaffected",
|
| 1144 |
+
},
|
| 1145 |
+
"check_dependencies": {
|
| 1146 |
+
"user-service": "Depends on: postgres-db [AUTH FAIL], config-service [creds]",
|
| 1147 |
+
"notification-service": "Depends on: postgres-db [AUTH FAIL], config-service [creds]",
|
| 1148 |
+
"api-gateway": "Depends on: user-service [DOWN], notification-service [DOWN]",
|
| 1149 |
+
"postgres-db": "No upstream — DB itself is healthy",
|
| 1150 |
+
"config-service": "Provides credentials to: user-svc, notification-svc, order-svc",
|
| 1151 |
+
"redis-session": "Standalone",
|
| 1152 |
+
"order-service": "Depends on: postgres-db [OK — old creds]",
|
| 1153 |
+
"product-service": "Depends on: postgres-db [OK — original creds]",
|
| 1154 |
+
},
|
| 1155 |
+
"check_recent_deploys": {
|
| 1156 |
+
"config-service": (
|
| 1157 |
+
"Deploy 2h ago: v3.2.1 — updated secrets rotation template. "
|
| 1158 |
+
"Bug: references previous credential set instead of generating new."
|
| 1159 |
+
),
|
| 1160 |
+
"user-service": "Deploy 4h ago: profile endpoint — no DB changes",
|
| 1161 |
+
"notification-service": "No recent deploys",
|
| 1162 |
+
"postgres-db": "No recent deploys",
|
| 1163 |
+
"api-gateway": "No recent deploys",
|
| 1164 |
+
"redis-session": "No recent deploys",
|
| 1165 |
+
"order-service": "Deploy 1 day ago: email template — no DB changes",
|
| 1166 |
+
"product-service": "No recent deploys",
|
| 1167 |
+
},
|
| 1168 |
+
"check_service_status": {
|
| 1169 |
+
"user-service": "DOWN | DB auth failures",
|
| 1170 |
+
"notification-service": "DOWN | DB auth failures",
|
| 1171 |
+
"api-gateway": "DEGRADED | 62%",
|
| 1172 |
+
"postgres-db": "STRESSED | 490/500 connections (failed auths)",
|
| 1173 |
+
"config-service": "HEALTHY | Rotation completed",
|
| 1174 |
+
"redis-session": "HEALTHY",
|
| 1175 |
+
"order-service": "HEALTHY | Old creds valid",
|
| 1176 |
+
"product-service": "HEALTHY",
|
| 1177 |
+
},
|
| 1178 |
+
},
|
| 1179 |
+
"remediation_data": {
|
| 1180 |
+
"rollback_deploy": {
|
| 1181 |
+
"config-service": "config-service rolled back to v3.2.0 — bug removed",
|
| 1182 |
+
},
|
| 1183 |
+
"execute_runbook_step": {
|
| 1184 |
+
"trigger_credential_rotation": (
|
| 1185 |
+
"Correct credentials generated and pushed to "
|
| 1186 |
+
"user-service, notification-service, order-service"
|
| 1187 |
+
),
|
| 1188 |
+
"verify_db_connectivity": (
|
| 1189 |
+
"user-service: DB OK | notification-service: DB OK | "
|
| 1190 |
+
"order-service: DB OK | postgres-db: 45/500 connections"
|
| 1191 |
+
),
|
| 1192 |
+
"verify_api_recovery": "api-gateway 503 rate: 0.1% — recovered",
|
| 1193 |
+
},
|
| 1194 |
+
"restart_service": {
|
| 1195 |
+
"user-service": "user-service restarted — DB auth OK with correct creds",
|
| 1196 |
+
"notification-service": "notification-service restarted — DB auth OK",
|
| 1197 |
+
"order-service": "order-service restarted — using correct credentials",
|
| 1198 |
+
},
|
| 1199 |
+
},
|
| 1200 |
+
"correct_remediation_sequence": [
|
| 1201 |
+
"rollback_deploy:config-service",
|
| 1202 |
+
"execute_runbook_step:trigger_credential_rotation",
|
| 1203 |
+
"restart_service:user-service",
|
| 1204 |
+
"restart_service:notification-service",
|
| 1205 |
+
"restart_service:order-service",
|
| 1206 |
+
"execute_runbook_step:verify_db_connectivity",
|
| 1207 |
+
"execute_runbook_step:verify_api_recovery",
|
| 1208 |
+
],
|
| 1209 |
+
"wrong_actions": {
|
| 1210 |
+
"restart_service:postgres-db": "DB is healthy — problem is credentials",
|
| 1211 |
+
"scale_service:postgres-db": "connections are failed auths",
|
| 1212 |
+
"restart_service:api-gateway": "downstream — fix auth first",
|
| 1213 |
+
"rollback_deploy:user-service": "deploy was unrelated",
|
| 1214 |
+
"rollback_deploy:order-service": "deploy was unrelated",
|
| 1215 |
+
"clear_cache:redis-session": "healthy",
|
| 1216 |
+
"restart_service:product-service": "healthy",
|
| 1217 |
+
"restart_service:redis-session": "healthy",
|
| 1218 |
+
},
|
| 1219 |
+
"resolution_keywords": [
|
| 1220 |
+
"config", "credential", "rotation", "stale", "password",
|
| 1221 |
+
"authentication", "rollback", "config-service", "v3.2.1",
|
| 1222 |
+
"restarted", "recovered", "push", "secrets", "template",
|
| 1223 |
],
|
| 1224 |
},
|
| 1225 |
],
|
|
|
|
| 1233 |
def get_task(task_id: str) -> dict:
    """Return the entry stored in ``ALL_TASKS`` under *task_id*.

    Args:
        task_id: Key to look up in the module-level ``ALL_TASKS`` mapping.

    Returns:
        The value ``ALL_TASKS[task_id]``.

    Raises:
        ValueError: If *task_id* is not a key of ``ALL_TASKS``. The message
            names the rejected id and lists every valid key.
    """
    # Happy path first: a known id is returned straight away.
    if task_id in ALL_TASKS:
        return ALL_TASKS[task_id]

    # Unknown id: surface the accepted keys so the caller can correct it.
    valid_ids = list(ALL_TASKS.keys())
    raise ValueError(f"Unknown task_id '{task_id}'. Valid: {valid_ids}")
|
| 1239 |
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|