Spaces:
Sleeping
Sleeping
Cloud Incident Response OpenEnv - initial submission
Browse files- .gitignore +12 -0
- Dockerfile +14 -0
- README.md +200 -5
- graders.py +267 -0
- inference.py +546 -0
- openenv.yaml +59 -0
- pyproject.toml +16 -0
- requirements.txt +7 -0
- server/__init__.py +0 -0
- server/app.py +230 -0
- server/environment.py +324 -0
- server/models.py +75 -0
- tasks.py +768 -0
- uv.lock +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
.env
|
| 5 |
+
.env.*
|
| 6 |
+
*.egg-info/
|
| 7 |
+
dist/
|
| 8 |
+
build/
|
| 9 |
+
.pytest_cache/
|
| 10 |
+
.mypy_cache/
|
| 11 |
+
.ruff_cache/
|
| 12 |
+
*.log
|
Dockerfile
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
| 4 |
+
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
COPY pyproject.toml uv.lock ./
|
| 8 |
+
RUN uv sync --frozen --no-dev
|
| 9 |
+
|
| 10 |
+
COPY . .
|
| 11 |
+
|
| 12 |
+
EXPOSE 7860
|
| 13 |
+
|
| 14 |
+
CMD ["uv", "run", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,205 @@
|
|
| 1 |
---
|
| 2 |
-
title: Cloud Incident Response
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Cloud Incident Response OpenEnv
|
| 3 |
+
emoji: 🚨
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
pinned: false
|
| 9 |
+
tags:
|
| 10 |
+
- openenv
|
| 11 |
+
- sre
|
| 12 |
+
- cloud
|
| 13 |
+
- incident-response
|
| 14 |
+
- devops
|
| 15 |
+
- real-world
|
| 16 |
+
- agentic
|
| 17 |
---
|
| 18 |
|
| 19 |
+
# Cloud Incident Response β OpenEnv Environment
|
| 20 |
+
|
| 21 |
+
An OpenEnv environment for training and evaluating AI agents on **cloud SRE incident response** β the real-world on-call workflow that engineers at every cloud company perform daily.
|
| 22 |
+
|
| 23 |
+
Distinct from Kubernetes operations environments: this focuses on **cross-service cascading failures** in distributed microservice architectures β connection pool exhaustion, CDN cache storms, OOM kills, and BGP network partitions.
|
| 24 |
+
|
| 25 |
+
## Why This Environment
|
| 26 |
+
|
| 27 |
+
Every cloud company employs SREs who respond to production incidents under time pressure with incomplete information. This environment simulates the exact decision loop:
|
| 28 |
+
|
| 29 |
+
1. **Triage** β Read alert, assess blast radius, classify severity (P1βP4)
|
| 30 |
+
2. **Investigate** β Query logs, metrics, dependencies, recent deploys
|
| 31 |
+
3. **Diagnose** β Correlate signals across services to find the root cause
|
| 32 |
+
4. **Remediate** β Execute the correct runbook steps in the right sequence
|
| 33 |
+
5. **Document** β Submit a resolution summary for post-incident review
|
| 34 |
+
|
| 35 |
+
Agents trained here learn the same skills a human SRE uses: service dependency traversal, log correlation, cascading failure analysis, and targeted remediation.
|
| 36 |
+
|
| 37 |
+
## Tasks
|
| 38 |
+
|
| 39 |
+
| Task ID | Difficulty | Max Steps | What the Agent Does |
|
| 40 |
+
|---|---|---|---|
|
| 41 |
+
| `alert_classification` | Easy | 3 | Classify alert severity (P1βP4) from metrics and symptoms |
|
| 42 |
+
| `root_cause_analysis` | Medium | 10 | Trace logs/metrics/deps to find root cause service and failure mode |
|
| 43 |
+
| `remediation_planning` | Hard | 15 | Diagnose, remediate, and document full incident resolution |
|
| 44 |
+
|
| 45 |
+
### Scenarios
|
| 46 |
+
|
| 47 |
+
| ID | Incident Type | Root Cause | Failure Pattern |
|
| 48 |
+
|---|---|---|---|
|
| 49 |
+
| AC-001 | DB connection pool exhaustion | postgres-db / auth-service deploy | api-gateway β auth-service β postgres-db cascade |
|
| 50 |
+
| AC-002 | CDN cache invalidation storm | cdn-edge purge cronjob misconfigured | 40Γ origin traffic spike |
|
| 51 |
+
| RCA-001 | Postgres OOM kill | analytics-service unbounded query | Kernel OOM β DB crash loop β all dependents down |
|
| 52 |
+
| RCA-002 | BGP network partition | network-infra config change | Route withdrawal β AZ isolation β 61% checkout failures |
|
| 53 |
+
| RP-001 | Full OOM remediation | analytics-service | Disable job β restart DB β restore services β document |
|
| 54 |
+
| RP-002 | Full BGP remediation | network-infra | Restore routes β rollback config β verify recovery β document |
|
| 55 |
+
|
| 56 |
+
## Action Space
|
| 57 |
+
|
| 58 |
+
**Diagnostic actions** (gather evidence):
|
| 59 |
+
```json
|
| 60 |
+
{"action_type": "query_logs", "parameters": {"service": "postgres-db"}}
|
| 61 |
+
{"action_type": "check_metrics", "parameters": {"service": "auth-service"}}
|
| 62 |
+
{"action_type": "check_dependencies", "parameters": {"service": "api-gateway"}}
|
| 63 |
+
{"action_type": "check_recent_deploys", "parameters": {"service": "analytics-service"}}
|
| 64 |
+
{"action_type": "check_service_status", "parameters": {"service": "payment-service"}}
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
**Remediation actions** (fix the incident):
|
| 68 |
+
```json
|
| 69 |
+
{"action_type": "restart_service", "parameters": {"service": "postgres-db"}}
|
| 70 |
+
{"action_type": "rollback_deploy", "parameters": {"service": "network-infra", "target_version": "previous"}}
|
| 71 |
+
{"action_type": "scale_service", "parameters": {"service": "image-service", "replicas": 10}}
|
| 72 |
+
{"action_type": "disable_feature_flag", "parameters": {"flag": "full_history_export"}}
|
| 73 |
+
{"action_type": "execute_runbook_step", "parameters": {"runbook_action": "restore_bgp_routes"}}
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
**Submission actions** (end the episode):
|
| 77 |
+
```json
|
| 78 |
+
{"action_type": "submit_severity", "parameters": {"severity": "P1", "service": "postgres-db"}}
|
| 79 |
+
{"action_type": "submit_root_cause", "parameters": {"service": "analytics-service", "failure_mode": "unbounded query OOM killing postgres-db"}}
|
| 80 |
+
{"action_type": "submit_resolution", "parameters": {"summary": "Disabled analytics job, restarted postgres-db..."}}
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
## Observation Space
|
| 84 |
+
|
| 85 |
+
| Field | Type | Description |
|
| 86 |
+
|---|---|---|
|
| 87 |
+
| `episode_id` | string | Unique episode UUID |
|
| 88 |
+
| `task_id` | string | Active task |
|
| 89 |
+
| `scenario_id` | string | Scenario (e.g. `AC-001`) |
|
| 90 |
+
| `step_count` / `max_steps` | int | Current step and budget |
|
| 91 |
+
| `incident_summary` | string | Plain-text incident description |
|
| 92 |
+
| `alert` | dict | Alert payload with severity, symptoms, affected services |
|
| 93 |
+
| `available_actions` | list[str] | Valid action types for this task |
|
| 94 |
+
| `queried_data` | dict | All tool responses gathered so far |
|
| 95 |
+
| `known_services` | list[str] | Exact service names to use in actions |
|
| 96 |
+
| `cumulative_reward` | float | Running reward total |
|
| 97 |
+
| `done` | bool | Episode terminal flag |
|
| 98 |
+
| `feedback` | string | Per-step feedback string |
|
| 99 |
+
|
| 100 |
+
## Reward Function
|
| 101 |
+
|
| 102 |
+
Dense reward shaping throughout the trajectory:
|
| 103 |
+
|
| 104 |
+
| Event | Reward |
|
| 105 |
+
|---|---|
|
| 106 |
+
| Query known service (first time) | +0.05 |
|
| 107 |
+
| Query known service (repeat) | +0.01 |
|
| 108 |
+
| Query unknown service | β0.05 |
|
| 109 |
+
| Correct remediation action | +0.10 |
|
| 110 |
+
| Wrong remediation action | β0.10 |
|
| 111 |
+
| Step past halfway (non-submit) | β0.02 |
|
| 112 |
+
| Timeout without submission | β0.10 |
|
| 113 |
+
| Grader score (terminal step) | 0.0β1.0 |
|
| 114 |
+
|
| 115 |
+
**Grader scoring** (deterministic, via `GET /grader`):
|
| 116 |
+
|
| 117 |
+
| Task | Scoring Logic |
|
| 118 |
+
|---|---|
|
| 119 |
+
| `alert_classification` | 1.0 exact Β· 0.5 adjacent Β· 0.25 two-off Β· 0.0 wrong/none |
|
| 120 |
+
| `root_cause_analysis` | 0.6 base (svc+mode) + up to 0.4 efficiency bonus |
|
| 121 |
+
| `remediation_planning` | 0.6 base + 0.3 efficiency β 0.15 wrong penalty + 0.1 summary |
|
| 122 |
+
|
| 123 |
+
## API Endpoints
|
| 124 |
+
|
| 125 |
+
| Method | Path | Description |
|
| 126 |
+
|---|---|---|
|
| 127 |
+
| GET | `/` | `{"status":"running",...}` β HF Space health |
|
| 128 |
+
| GET | `/health` | `{"status":"ok","version":"0.1.0"}` |
|
| 129 |
+
| POST | `/reset?task_id=...&scenario_index=...` | Start new episode |
|
| 130 |
+
| POST | `/step` | Submit action (JSON body) |
|
| 131 |
+
| GET | `/state` | Full current episode state |
|
| 132 |
+
| GET | `/tasks` | All tasks with action schemas |
|
| 133 |
+
| GET | `/grader` | Score current episode (0.0β1.0) |
|
| 134 |
+
| POST | `/baseline` | Run inference.py, return scores |
|
| 135 |
+
|
| 136 |
+
## Setup & Usage
|
| 137 |
+
|
| 138 |
+
### Local development
|
| 139 |
+
```bash
|
| 140 |
+
pip install -r requirements.txt
|
| 141 |
+
uvicorn server.app:app --host 0.0.0.0 --port 7860
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
### Docker
|
| 145 |
+
```bash
|
| 146 |
+
docker build -t cloud-incident-env .
|
| 147 |
+
docker run -p 7860:7860 \
|
| 148 |
+
-e API_BASE_URL="https://api-inference.huggingface.co/v1" \
|
| 149 |
+
-e MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" \
|
| 150 |
+
-e HF_TOKEN="hf_your_token" \
|
| 151 |
+
cloud-incident-env
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
### Run inference script
|
| 155 |
+
```bash
|
| 156 |
+
export API_BASE_URL="https://api-inference.huggingface.co/v1"
|
| 157 |
+
export MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
|
| 158 |
+
export HF_TOKEN="hf_your_token"
|
| 159 |
+
python inference.py
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
### Quick API test
|
| 163 |
+
```bash
|
| 164 |
+
# Start new episode
|
| 165 |
+
curl -X POST "http://localhost:7860/reset?task_id=alert_classification&scenario_index=0"
|
| 166 |
+
|
| 167 |
+
# Submit an action
|
| 168 |
+
curl -X POST http://localhost:7860/step \
|
| 169 |
+
-H "Content-Type: application/json" \
|
| 170 |
+
-d '{"action_type":"query_logs","parameters":{"service":"api-gateway"}}'
|
| 171 |
+
|
| 172 |
+
# Check score
|
| 173 |
+
curl http://localhost:7860/grader
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
## Baseline Scores
|
| 177 |
+
|
| 178 |
+
Using `meta-llama/Llama-3.1-8B-Instruct` via HF Inference API:
|
| 179 |
+
|
| 180 |
+
| Task | Scenario 0 | Scenario 1 | Average |
|
| 181 |
+
|---|---|---|---|
|
| 182 |
+
| `alert_classification` | ~1.00 | ~0.50 | ~0.75 |
|
| 183 |
+
| `root_cause_analysis` | ~0.45 | ~0.35 | ~0.40 |
|
| 184 |
+
| `remediation_planning` | ~0.25 | ~0.20 | ~0.23 |
|
| 185 |
+
| **overall** | | | **~0.46** |
|
| 186 |
+
|
| 187 |
+
*Run `python inference.py` to reproduce.*
|
| 188 |
+
|
| 189 |
+
## Project Structure
|
| 190 |
+
|
| 191 |
+
```
|
| 192 |
+
.
|
| 193 |
+
βββ Dockerfile
|
| 194 |
+
βββ README.md
|
| 195 |
+
βββ requirements.txt
|
| 196 |
+
βββ openenv.yaml
|
| 197 |
+
βββ tasks.py # Scenario definitions (6 scenarios across 3 tasks)
|
| 198 |
+
βββ graders.py # Deterministic graders for all tasks
|
| 199 |
+
βββ inference.py # Baseline agent + smart fallback logic
|
| 200 |
+
βββ server/
|
| 201 |
+
βββ __init__.py
|
| 202 |
+
βββ app.py # FastAPI endpoints
|
| 203 |
+
βββ environment.py # Core OpenEnv step/reset/state logic
|
| 204 |
+
βββ models.py # Typed Pydantic models (Action, Observation, Reward)
|
| 205 |
+
```
|
graders.py
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders.py β Deterministic graders for all 3 Cloud Incident Response tasks.
|
| 3 |
+
|
| 4 |
+
Public API:
|
| 5 |
+
grade(task_id, state, scenario) -> {"total": float, "breakdown": dict, "feedback": str}
|
| 6 |
+
|
| 7 |
+
All scores are in [0.0, 1.0].
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _normalise(s: str) -> str:
|
| 14 |
+
"""Lowercase, strip whitespace, collapse hyphens/underscores."""
|
| 15 |
+
return s.lower().strip().replace("_", "-").replace(" ", "-")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _svc_match(submitted: str, correct: str) -> bool:
    """Return True when *submitted* names the same service as *correct*.

    Matching is deliberately forgiving: exact equality after
    normalisation, substring containment in either direction, or a known
    shorthand alias (e.g. "postgres" for "postgres-db").
    """
    left, right = _normalise(submitted), _normalise(correct)
    if left == right or left in right or right in left:
        return True
    # Common shorthand names engineers use for these services.
    aliases = {
        "network": "network-infra",
        "network-infrastructure": "network-infra",
        "cdn": "cdn-edge",
        "postgres": "postgres-db",
        "postgresql": "postgres-db",
        "analytics": "analytics-service",
        "payment": "payment-service",
        "auth": "auth-service",
        "api": "api-gateway",
        "api-gw": "api-gateway",
    }
    return aliases.get(left, left) == right or left == aliases.get(right, right)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def grade(task_id: str, state: dict, scenario: dict) -> dict:
    """Dispatch to the task-specific grader.

    Returns ``{"total": float in [0, 1], "breakdown": dict, "feedback": str}``.
    Unknown task ids score 0.0 with an explanatory message.
    """
    dispatch = {
        "alert_classification": _grade_alert_classification,
        "root_cause_analysis": _grade_root_cause_analysis,
        "remediation_planning": _grade_remediation_planning,
    }
    try:
        grader = dispatch[task_id]
    except KeyError:
        return {"total": 0.0, "breakdown": {}, "feedback": f"Unknown task_id '{task_id}'"}
    return grader(state, scenario)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# ββ Task 1: Alert Classification βββββββββββββββββββββββββββββββββββββββββββββ
|
| 53 |
+
def _grade_alert_classification(state: dict, scenario: dict) -> dict:
|
| 54 |
+
history = state.get("action_history", [])
|
| 55 |
+
correct = scenario.get("correct_severity", "P1")
|
| 56 |
+
adjacent = scenario.get("adjacent_severities", [])
|
| 57 |
+
order = ["P1", "P2", "P3", "P4"]
|
| 58 |
+
|
| 59 |
+
submitted = None
|
| 60 |
+
for a in history:
|
| 61 |
+
if a.get("action_type") == "submit_severity":
|
| 62 |
+
submitted = a.get("parameters", {}).get("severity", "").upper().strip()
|
| 63 |
+
break
|
| 64 |
+
|
| 65 |
+
if not submitted:
|
| 66 |
+
return {
|
| 67 |
+
"total": 0.0,
|
| 68 |
+
"breakdown": {"submitted": False, "severity_match": 0.0},
|
| 69 |
+
"feedback": "No severity submitted β score 0.0",
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
if submitted == correct:
|
| 73 |
+
score, msg = 1.0, f"Exact match: {submitted}"
|
| 74 |
+
elif submitted in adjacent:
|
| 75 |
+
score, msg = 0.5, f"Adjacent: submitted {submitted}, correct {correct}"
|
| 76 |
+
else:
|
| 77 |
+
try:
|
| 78 |
+
dist = abs(order.index(submitted) - order.index(correct))
|
| 79 |
+
except ValueError:
|
| 80 |
+
dist = 4
|
| 81 |
+
score = 0.25 if dist == 2 else 0.0
|
| 82 |
+
msg = f"Wrong: submitted {submitted}, correct {correct} (distance={dist})"
|
| 83 |
+
|
| 84 |
+
return {
|
| 85 |
+
"total": score,
|
| 86 |
+
"breakdown": {
|
| 87 |
+
"submitted_severity": submitted,
|
| 88 |
+
"correct_severity": correct,
|
| 89 |
+
"severity_match": score,
|
| 90 |
+
},
|
| 91 |
+
"feedback": msg,
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ββ Task 2: Root Cause Analysis ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 96 |
+
def _grade_root_cause_analysis(state: dict, scenario: dict) -> dict:
    """Grade the root_cause_analysis task.

    Base credit (0.6 / 0.35 / 0.10) depends on whether the submitted
    service and failure mode match the scenario's ground truth; up to a
    0.4 efficiency bonus rewards precise diagnostic querying before the
    submission.
    """
    history = state.get("action_history", [])
    truth = scenario.get("correct_root_cause", {})
    want_svc = truth.get("service", "").lower().strip()
    want_mode = truth.get("failure_mode", "").lower().strip()
    known_svcs = {s.lower() for s in scenario.get("known_services", set())}

    diagnostic = {
        "query_logs", "check_metrics", "check_dependencies",
        "check_recent_deploys", "check_service_status",
    }

    # Locate the first root-cause submission, if any.
    got_svc = got_mode = ""
    submit_at = len(history)
    for entry in history:
        if entry.get("action_type") != "submit_root_cause":
            continue
        params = entry.get("parameters", {})
        got_svc = params.get("service", "").lower().strip()
        got_mode = params.get("failure_mode", "").lower().strip()
        submit_at = entry.get("step", len(history))
        break

    if not got_svc:
        return {
            "total": 0.0,
            "breakdown": {"base": 0.0, "efficiency": 0.0, "submitted": False},
            "feedback": "No root cause submitted β score 0.0",
        }

    svc_ok = _svc_match(got_svc, want_svc)
    # Failure mode only counts with the right service; keyword overlap
    # (words longer than 3 chars) is enough — exact phrasing isn't required.
    keywords = [w for w in want_mode.split() if len(w) > 3]
    mode_ok = svc_ok and (not keywords or any(kw in got_mode for kw in keywords))

    if mode_ok:
        base, base_fb = 0.6, "Correct service + failure mode"
    elif svc_ok:
        base, base_fb = 0.35, "Correct service only β failure mode unclear"
    else:
        base = 0.10
        base_fb = f"Wrong service: '{got_svc}' (correct: '{want_svc}') β partial credit"

    # Efficiency bonus: precision of diagnostic queries issued pre-submit,
    # plus a small per-relevant-service bump (capped at 3 services / 0.4 total).
    bonus = 0.0
    if svc_ok:
        probes = [e for e in history[:submit_at] if e.get("action_type") in diagnostic]
        probed = {e.get("parameters", {}).get("service", "").lower() for e in probes}
        on_target = probed & known_svcs
        if probes:
            precision = len(on_target) / len(probes)
            bonus = round(min(0.4, precision * 0.4 + min(len(on_target), 3) * 0.05), 4)

    total = round(min(1.0, base + bonus), 4)
    return {
        "total": total,
        "breakdown": {
            "base": base,
            "efficiency_bonus": bonus,
            "service_match": svc_ok,
            "mode_match": mode_ok,
            "submitted_service": got_svc,
            "correct_service": want_svc,
        },
        "feedback": f"{base_fb} | efficiency={bonus:.2f} | total={total:.2f}",
    }
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# ββ Task 3: Remediation Planning βββββββββββββββββββββββββββββββββββββββββββββ
|
| 173 |
+
def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
    """Grade the remediation_planning task.

    Score = 0.6 base + up to 0.3 for executing the correct remediation
    sequence, minus up to 0.15 for known-wrong actions, plus up to 0.1
    for a keyword-rich resolution summary. Requires both a submitted
    summary and at least one diagnostic/remediation action.
    """
    history = state.get("action_history", [])
    expected = scenario.get("correct_remediation_sequence", [])
    bad_actions = scenario.get("wrong_actions", {})
    summary_kws = scenario.get("resolution_keywords", [])

    countable = {
        "query_logs", "check_metrics", "check_dependencies",
        "check_recent_deploys", "check_service_status",
        "restart_service", "rollback_deploy", "scale_service",
        "disable_feature_flag", "clear_cache", "execute_runbook_step",
    }

    # First submitted resolution summary, if any.
    summary = next(
        (
            a.get("parameters", {}).get("summary", "")
            for a in history
            if a.get("action_type") == "submit_resolution"
        ),
        "",
    )
    worked = sum(1 for a in history if a.get("action_type") in countable)

    if not summary or worked < 1:
        return {
            "total": 0.0,
            "breakdown": {
                "base": 0.0, "efficiency": 0.0,
                "penalty": 0.0, "summary_bonus": 0.0,
            },
            "feedback": "No resolution submitted or no investigation β score 0.0",
        }

    base = 0.6

    # Everything the agent executed, as "action" and "action:target" keys.
    executed: set[str] = set()
    for a in history:
        kind = a.get("action_type", "")
        params = a.get("parameters", {})
        executed.add(kind)
        for field in ("service", "flag"):
            value = params.get(field, "")
            if value:
                executed.add(f"{kind}:{value}")
        # Runbook targets are always keyed under execute_runbook_step.
        for field in ("runbook_action", "target"):
            value = params.get(field, "")
            if value:
                executed.add(f"execute_runbook_step:{value}")

    def _hit(expected_key: str) -> bool:
        """True if the agent executed this step (fuzzy service match allowed)."""
        if expected_key in executed:
            return True
        if ":" not in expected_key:
            return False
        want_kind, want_target = expected_key.split(":", 1)
        for done in executed:
            if ":" not in done:
                continue
            done_kind, done_target = done.split(":", 1)
            if done_kind == want_kind and _svc_match(done_target, want_target):
                return True
        return False

    matched = sum(1 for key in expected if _hit(key))
    efficiency = round((matched / len(expected)) * 0.3, 4) if expected else 0.0

    # Penalise actions the scenario marks as actively wrong.
    wrong_count = 0
    for a in history:
        kind = a.get("action_type")
        keyed = f"{kind}:{a.get('parameters', {}).get('service', '')}"
        if kind in bad_actions or keyed in bad_actions:
            wrong_count += 1
    penalty = round(min(0.15, wrong_count * 0.05), 4)

    lowered = summary.lower()
    hits = sum(1 for kw in summary_kws if kw in lowered)
    summary_bonus = 0.10 if hits >= 3 else (0.05 if hits >= 1 else 0.0)

    total = round(max(0.0, min(1.0, base + efficiency - penalty + summary_bonus)), 4)

    return {
        "total": total,
        "breakdown": {
            "base": base,
            "efficiency_bonus": efficiency,
            "wrong_action_penalty": -penalty,
            "summary_bonus": summary_bonus,
            "correct_actions_matched": matched,
            "correct_actions_total": len(expected),
            "wrong_actions_count": wrong_count,
            "summary_keywords_hit": hits,
        },
        "feedback": (
            f"base={base} | efficiency={efficiency:.2f} "
            f"({matched}/{len(expected)} correct) | "
            f"penalty=-{penalty:.2f} | summary={summary_bonus:.2f} | "
            f"total={total:.2f}"
        ),
    }
|
inference.py
ADDED
|
@@ -0,0 +1,546 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
inference.py β Cloud Incident Response OpenEnv baseline inference script.
|
| 3 |
+
|
| 4 |
+
The LLM reasons from evidence. Fallback is a dumb safety net that scores low.
|
| 5 |
+
Override only blocks clearly invalid actions (wrong task submission, bad params).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
|
| 14 |
+
import requests
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
from dotenv import load_dotenv
|
| 18 |
+
load_dotenv()
|
| 19 |
+
except ImportError:
|
| 20 |
+
pass
|
| 21 |
+
|
| 22 |
+
# ββ Config ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 23 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1")
|
| 24 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "llama-3.1-8b-instant")
|
| 25 |
+
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY") or ""
|
| 26 |
+
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:7860")
|
| 27 |
+
|
| 28 |
+
if not HF_TOKEN:
|
| 29 |
+
print("[WARN] No API key set β LLM calls will fail.", file=sys.stderr)
|
| 30 |
+
|
| 31 |
+
_session = requests.Session()
|
| 32 |
+
|
| 33 |
+
# Lazy-init OpenAI client to avoid import-time httpx errors
|
| 34 |
+
_client = None
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _get_client():
    """Return the shared OpenAI client, constructing it on first use.

    The openai import is deferred so that merely importing this module
    does not require the package (or a valid API key) to be present.
    """
    global _client
    if _client is not None:
        return _client
    from openai import OpenAI
    _client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
    return _client
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ββ Which submission action belongs to which task βββββββββββββββββββββββββββ
|
| 46 |
+
_TASK_SUBMIT = {
|
| 47 |
+
"alert_classification": "submit_severity",
|
| 48 |
+
"root_cause_analysis": "submit_root_cause",
|
| 49 |
+
"remediation_planning": "submit_resolution",
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
_DIAG_TYPES = frozenset({
|
| 53 |
+
"query_logs", "check_metrics", "check_dependencies",
|
| 54 |
+
"check_recent_deploys", "check_service_status",
|
| 55 |
+
})
|
| 56 |
+
|
| 57 |
+
_SUBMIT_TYPES = frozenset({
|
| 58 |
+
"submit_severity", "submit_root_cause", "submit_resolution",
|
| 59 |
+
})
|
| 60 |
+
|
| 61 |
+
_REM_TYPES = frozenset({
|
| 62 |
+
"restart_service", "rollback_deploy", "scale_service",
|
| 63 |
+
"disable_feature_flag", "clear_cache", "execute_runbook_step",
|
| 64 |
+
})
|
| 65 |
+
|
| 66 |
+
_ALL_VALID = _DIAG_TYPES | _SUBMIT_TYPES | _REM_TYPES
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ββ System prompt β general SRE strategy, NO scenario answers βββββββββββββββ
|
| 70 |
+
SYSTEM_PROMPT = """\
|
| 71 |
+
You are an expert Site Reliability Engineer responding to a production incident.
|
| 72 |
+
Reply with exactly ONE JSON action object. No markdown, no explanation, no extra text.
|
| 73 |
+
|
| 74 |
+
VALID ACTIONS:
|
| 75 |
+
{"action_type":"query_logs","parameters":{"service":"<name>"}}
|
| 76 |
+
{"action_type":"check_metrics","parameters":{"service":"<name>"}}
|
| 77 |
+
{"action_type":"check_dependencies","parameters":{"service":"<name>"}}
|
| 78 |
+
{"action_type":"check_recent_deploys","parameters":{"service":"<name>"}}
|
| 79 |
+
{"action_type":"check_service_status","parameters":{"service":"<name>"}}
|
| 80 |
+
{"action_type":"restart_service","parameters":{"service":"<name>"}}
|
| 81 |
+
{"action_type":"rollback_deploy","parameters":{"service":"<name>","target_version":"previous"}}
|
| 82 |
+
{"action_type":"disable_feature_flag","parameters":{"flag":"<flag_name>"}}
|
| 83 |
+
{"action_type":"execute_runbook_step","parameters":{"runbook_action":"<action>"}}
|
| 84 |
+
{"action_type":"submit_severity","parameters":{"severity":"P1|P2|P3|P4","service":"<name>"}}
|
| 85 |
+
{"action_type":"submit_root_cause","parameters":{"service":"<name>","failure_mode":"<description>"}}
|
| 86 |
+
{"action_type":"submit_resolution","parameters":{"summary":"<3+ sentence summary>"}}
|
| 87 |
+
|
| 88 |
+
RULES:
|
| 89 |
+
- Service names MUST exactly match the KNOWN_SERVICES list in the observation.
|
| 90 |
+
- P1 = complete outage OR revenue > $1,000/min. P2 = major degradation.
|
| 91 |
+
P3 = minor issue. P4 = informational.
|
| 92 |
+
- Root cause = the upstream service that TRIGGERED the cascade. This is often
|
| 93 |
+
NOT listed in the alert's affected_services. Investigate services not in the
|
| 94 |
+
alert first.
|
| 95 |
+
- submit_resolution summary must be 3+ sentences: (1) what failed and why,
|
| 96 |
+
(2) actions you took to fix it, (3) current recovery status.
|
| 97 |
+
- Submit as soon as evidence is clear β do NOT waste steps querying more.
|
| 98 |
+
|
| 99 |
+
TASK-SPECIFIC STRATEGY:
|
| 100 |
+
|
| 101 |
+
alert_classification (max 3 steps):
|
| 102 |
+
Query 1-2 affected services for evidence, then submit_severity.
|
| 103 |
+
|
| 104 |
+
root_cause_analysis (max 10 steps):
|
| 105 |
+
Investigate services NOT in the alert first (check logs + recent deploys).
|
| 106 |
+
Look for: OOM kills, BGP withdrawals, config changes, unbounded queries.
|
| 107 |
+
Submit submit_root_cause with the triggering service and failure mode.
|
| 108 |
+
|
| 109 |
+
remediation_planning (max 15 steps):
|
| 110 |
+
1. Query logs to confirm root cause.
|
| 111 |
+
2. Execute fixes: disable bad jobs, restart crashed services, rollback configs,
|
| 112 |
+
run runbook steps.
|
| 113 |
+
3. Submit submit_resolution with a detailed 3-sentence summary.
|
| 114 |
+
|
| 115 |
+
CRITICAL: Each task has ONE correct submission action:
|
| 116 |
+
alert_classification -> submit_severity
|
| 117 |
+
root_cause_analysis -> submit_root_cause
|
| 118 |
+
remediation_planning -> submit_resolution
|
| 119 |
+
Do NOT use the wrong submission type for the task."""
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# ββ Helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 123 |
+
def _queried_svcs(queried_data: dict) -> set[str]:
    """Return every service name that appears under a diagnostic action type."""
    services: set[str] = set()
    for action_type, svcs in queried_data.items():
        # Only diagnostic entries with a dict payload count as "queried".
        if action_type not in _DIAG_TYPES or not isinstance(svcs, dict):
            continue
        services.update(svcs)
    return services
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _extract_signals(queried_data: dict) -> list[str]:
|
| 133 |
+
"""Surface key patterns from queried data β shown to LLM."""
|
| 134 |
+
seen: set[str] = set()
|
| 135 |
+
signals: list[str] = []
|
| 136 |
+
|
| 137 |
+
def _add(msg: str) -> None:
|
| 138 |
+
if msg not in seen:
|
| 139 |
+
seen.add(msg)
|
| 140 |
+
signals.append(msg)
|
| 141 |
+
|
| 142 |
+
for action_type, services in queried_data.items():
|
| 143 |
+
if not isinstance(services, dict):
|
| 144 |
+
continue
|
| 145 |
+
for svc, data in services.items():
|
| 146 |
+
t = str(data).lower()
|
| 147 |
+
if "out of memory" in t or "oom" in t:
|
| 148 |
+
_add(f"OOM detected in {svc}")
|
| 149 |
+
if "bgp" in t and ("withdrawal" in t or "withdrawn" in t):
|
| 150 |
+
_add(f"BGP route issue in {svc}")
|
| 151 |
+
if "pool" in t and ("exhaust" in t or "too many clients" in t):
|
| 152 |
+
_add(f"Connection pool issue in {svc}")
|
| 153 |
+
if "cache" in t and ("purge" in t or "invalidat" in t):
|
| 154 |
+
_add(f"Cache purge in {svc}")
|
| 155 |
+
if "unbounded" in t or "no limit" in t:
|
| 156 |
+
_add(f"Unbounded query in {svc}")
|
| 157 |
+
if action_type == "check_recent_deploys" and any(
|
| 158 |
+
x in t for x in ("ago", "change", "update", "added")
|
| 159 |
+
):
|
| 160 |
+
snippet = str(data)[:120].replace("\n", " ")
|
| 161 |
+
_add(f"Recent change in {svc}: {snippet}")
|
| 162 |
+
return signals
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
# ββ Message builders ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 166 |
+
def _first_obs_msg(obs: dict) -> str:
    """Build the opening user message that describes a fresh incident."""
    alert = obs.get("alert", {})
    known = obs.get("known_services", [])
    affected = alert.get("affected_services", [])
    task_id = obs.get("task_id", "")
    unlisted = [s for s in known if s not in affected]

    out: list[str] = [
        "=== NEW INCIDENT ===",
        f"Task: {task_id} | Max steps: {obs.get('max_steps')}",
        f"Scenario: {obs.get('scenario_id', '')}",
        f"INCIDENT: {obs.get('incident_summary', '')}",
    ]

    if alert:
        out.append("ALERT DETAILS:")
        if alert.get("title"):
            out.append(f"  Title: {alert['title']}")
        if affected:
            out.append(f"  Directly affected services: {', '.join(affected)}")
        out.extend(f"  - {s}" for s in alert.get("symptoms", []))
        for key in ("error_rate", "duration_minutes", "revenue_impact_per_min"):
            if alert.get(key) is not None:
                out.append(f"  {key}: {alert[key]}")

    out.append(f"KNOWN_SERVICES (use these EXACT names): {json.dumps(known)}")

    # For investigation tasks, nudge the agent toward services the alert
    # does not mention — the likely cascade trigger lives there.
    if unlisted and task_id in ("root_cause_analysis", "remediation_planning"):
        out.append(
            f"  *** These services are NOT in the alert — investigate them "
            f"for possible root cause: {json.dumps(unlisted)} ***"
        )

    out.append(f"AVAILABLE ACTIONS: {obs.get('available_actions', [])}")
    out.append(f"REQUIRED SUBMISSION: {_TASK_SUBMIT.get(task_id, 'unknown')}")
    out.append("")
    out.append("Respond with your first action (JSON only, no markdown):")
    return "\n".join(out)
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def _step_msg(obs: dict, prev_queried: dict) -> str:
    """Build the per-step user message: feedback, new data, signals, urgency."""
    step = obs.get("step_count", 0)
    max_steps = obs.get("max_steps", 10)
    remaining = max_steps - step
    queried = obs.get("queried_data", {})
    task_id = obs.get("task_id", "")

    parts: list[str] = [
        f"Step {step}/{max_steps} ({remaining} remaining) | "
        f"reward={obs.get('cumulative_reward', 0.0):.3f} | "
        f"feedback: {obs.get('feedback', '')}",
    ]

    # Show only payloads that were not present after the previous step.
    fresh: list[str] = []
    for action_type, services in queried.items():
        if not isinstance(services, dict):
            continue
        already_seen = prev_queried.get(action_type, {})
        for svc, payload in services.items():
            if svc in already_seen:
                continue
            text = str(payload)
            if len(text) > 500:
                text = text[:500] + "..."
            fresh.append(f"  [{action_type}][{svc}]: {text}")
    if fresh:
        parts.append("NEW DATA RECEIVED:")
        parts.extend(fresh)

    # Highlight recognised failure signatures across everything seen so far.
    signals = _extract_signals(queried)
    if signals:
        parts.append("KEY SIGNALS DETECTED:")
        parts.extend(f"  *** {sig} ***" for sig in signals)

    # Escalating urgency reminders as the step budget runs out.
    if remaining <= 3:
        parts.append(
            f"*** {remaining} steps remaining — submit "
            f"{_TASK_SUBMIT.get(task_id, 'your answer')} soon ***"
        )
    if remaining <= 1:
        parts.append(
            f"!!! LAST STEP — YOU MUST {_TASK_SUBMIT.get(task_id, 'SUBMIT')} NOW !!!"
        )

    parts.append("Next action (JSON only, no markdown):")
    return "\n".join(parts)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# ββ Parse LLM output βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 258 |
+
def _parse(text: str) -> dict:
|
| 259 |
+
text = text.strip()
|
| 260 |
+
# Strip markdown code fences
|
| 261 |
+
if text.startswith("`"):
|
| 262 |
+
text = "\n".join(
|
| 263 |
+
ln for ln in text.splitlines() if not ln.startswith("`")
|
| 264 |
+
).strip()
|
| 265 |
+
try:
|
| 266 |
+
return json.loads(text)
|
| 267 |
+
except json.JSONDecodeError:
|
| 268 |
+
s = text.find("{")
|
| 269 |
+
e = text.rfind("}") + 1
|
| 270 |
+
if s != -1 and e > s:
|
| 271 |
+
return json.loads(text[s:e])
|
| 272 |
+
raise
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
# ββ Fallback β generic, no scenario knowledge ββββββββββββββββββββββββββββββ
|
| 276 |
+
def _fallback_submit(task_id: str, obs: dict) -> dict:
|
| 277 |
+
"""Minimal correct-type submission. Will score low but won't crash."""
|
| 278 |
+
alert = obs.get("alert", {})
|
| 279 |
+
known = obs.get("known_services", [])
|
| 280 |
+
|
| 281 |
+
if task_id == "alert_classification":
|
| 282 |
+
rev = alert.get("revenue_impact_per_min", 0) or 0
|
| 283 |
+
err = alert.get("error_rate", 0) or 0
|
| 284 |
+
sev = "P1" if (rev > 1000 or err > 0.9) else (
|
| 285 |
+
"P2" if (rev > 100 or err > 0.3) else "P3")
|
| 286 |
+
svc = (alert.get("affected_services") or known or ["unknown"])[0]
|
| 287 |
+
return {
|
| 288 |
+
"action_type": "submit_severity",
|
| 289 |
+
"parameters": {"severity": sev, "service": svc},
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
if task_id == "root_cause_analysis":
|
| 293 |
+
svc = known[0] if known else "unknown"
|
| 294 |
+
return {
|
| 295 |
+
"action_type": "submit_root_cause",
|
| 296 |
+
"parameters": {
|
| 297 |
+
"service": svc,
|
| 298 |
+
"failure_mode": "service failure causing downstream cascade",
|
| 299 |
+
},
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
# remediation_planning
|
| 303 |
+
return {
|
| 304 |
+
"action_type": "submit_resolution",
|
| 305 |
+
"parameters": {
|
| 306 |
+
"summary": (
|
| 307 |
+
"The incident was investigated through log and metric analysis "
|
| 308 |
+
"across affected services. Remediation actions were applied to "
|
| 309 |
+
"restore service health. Systems are being monitored for full "
|
| 310 |
+
"recovery confirmation."
|
| 311 |
+
),
|
| 312 |
+
},
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def _smart_fallback(
    task_id: str, obs: dict, step: int, max_steps: int
) -> dict:
    """Generic fallback policy: explore unvisited services, then submit."""
    known = obs.get("known_services", [])
    queried = obs.get("queried_data", {})
    visited = _queried_svcs(queried)

    # Final step: submitting is the only move that can score.
    if max_steps - step <= 1:
        return _fallback_submit(task_id, obs)

    # Classification needs minimal evidence — submit after any query.
    if task_id == "alert_classification" and visited:
        return _fallback_submit(task_id, obs)

    # Pull logs for the first service we have not looked at yet.
    for svc in known:
        if svc not in visited:
            return {"action_type": "query_logs", "parameters": {"service": svc}}

    # Deeper tasks: also sweep deploy history per service.
    if task_id in ("root_cause_analysis", "remediation_planning"):
        deploys_seen = set(queried.get("check_recent_deploys", {}).keys())
        for svc in known:
            if svc not in deploys_seen:
                return {
                    "action_type": "check_recent_deploys",
                    "parameters": {"service": svc},
                }

    # Nothing left to explore — submit whatever we have.
    return _fallback_submit(task_id, obs)
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
# ββ Override β ONLY blocks clearly invalid actions ββββββββββββββββββββββββββ
|
| 356 |
+
def _should_override(
    task_id: str, action: dict, obs: dict, step: int, max_steps: int
) -> bool:
    """Return True when the LLM's action is clearly invalid and must be replaced."""
    action_type = action.get("action_type", "")
    params = action.get("parameters", {})
    remaining = max_steps - step
    known = obs.get("known_services", [])

    # 1. Unknown action type.
    if action_type not in _ALL_VALID:
        return True

    # 2. Budget exhausted: anything but a submission is wasted.
    if remaining <= 0 and action_type not in _SUBMIT_TYPES:
        return True

    # 3. Submission type must match the task
    #    (e.g. no submit_severity during remediation_planning).
    if action_type in _SUBMIT_TYPES and action_type != _TASK_SUBMIT.get(task_id):
        return True

    # 4. Service-targeted action aimed at a name outside known_services.
    service = (params.get("service") or "").strip()
    if (
        service
        and known
        and action_type not in ("disable_feature_flag", "execute_runbook_step")
        and service not in known
    ):
        return True

    # 5. Severity must be one of the four priority levels.
    if action_type == "submit_severity":
        severity = (params.get("severity") or "").upper().strip()
        if severity not in ("P1", "P2", "P3", "P4"):
            return True

    # 6. Root-cause submission needs a service and a non-trivial failure mode.
    if action_type == "submit_root_cause":
        service = (params.get("service") or "").strip()
        failure_mode = (params.get("failure_mode") or "").strip()
        if not service or len(failure_mode) < 5:
            return True

    # 6b. Resolution summary must carry real content.
    if action_type == "submit_resolution":
        if len((params.get("summary") or "").strip()) < 30:
            return True

    # 7. No remediation actions during the classification task.
    return task_id == "alert_classification" and action_type in _REM_TYPES
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
# ββ Episode runner ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 411 |
+
def _run_episode(task_id: str, scenario_index: int) -> float:
    """Run one full episode against the environment server and return its score.

    Flow: POST /reset, then up to max_steps iterations of (LLM call -> parse ->
    fallback/override guard -> POST /step), then GET /grader for the final
    total. Falls back to a deterministic policy whenever the LLM call or its
    output fails validation, so the episode always completes.
    """
    # Start a fresh episode on the environment server.
    r = _session.post(
        f"{ENV_BASE_URL}/reset",
        params={"task_id": task_id, "scenario_index": scenario_index},
        timeout=30,
    )
    r.raise_for_status()
    obs = r.json()

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": _first_obs_msg(obs)},
    ]

    # queried_data snapshot from the previous step, used by _step_msg to show
    # only newly received payloads.
    prev_queried: dict = {}
    max_steps = obs.get("max_steps", 10)

    for step_i in range(max_steps):
        current_step = step_i + 1

        # ── Call LLM ─────────────────────────────────────────────────────
        try:
            resp = _get_client().chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                temperature=0.0,
                max_tokens=300,
                stream=False,
            )
            raw = resp.choices[0].message.content or ""
        except Exception as e:
            # An empty raw triggers the fallback policy below.
            print(f"  [WARN] LLM call failed step {current_step}: {e}",
                  file=sys.stderr)
            raw = ""

        # Record the assistant turn even on failure so the transcript stays
        # well-formed ("{}" as a placeholder).
        messages.append({"role": "assistant", "content": raw or "{}"})

        # ── Parse ────────────────────────────────────────────────────────
        action = None
        try:
            if raw.strip():
                action = _parse(raw)
        except Exception:
            # Unparseable output -> treat as no action; fallback handles it.
            pass

        # ── Fallback / override ──────────────────────────────────────────
        if action is None:
            action = _smart_fallback(task_id, obs, current_step, max_steps)
            print(f"  [FALLBACK] step {current_step}: "
                  f"{action.get('action_type')}", file=sys.stderr)
        elif _should_override(task_id, action, obs, current_step, max_steps):
            old_at = action.get("action_type")
            action = _smart_fallback(task_id, obs, current_step, max_steps)
            print(f"  [OVERRIDE] step {current_step}: "
                  f"{old_at} -> {action.get('action_type')}",
                  file=sys.stderr)

        # ── Step ─────────────────────────────────────────────────────────
        sr = _session.post(
            f"{ENV_BASE_URL}/step", json=action, timeout=30,
        )
        sr.raise_for_status()
        result = sr.json()
        new_obs = result["observation"]

        print(
            f"  step {current_step:>2}: {action.get('action_type'):<28} "
            f"reward={result['reward']['value']:+.3f} "
            f"done={result['done']}",
            file=sys.stderr,
        )

        if result.get("done"):
            break

        # Feed the next observation back to the LLM and remember what data it
        # has already seen.
        step_msg = _step_msg(new_obs, prev_queried)
        messages.append({"role": "user", "content": step_msg})
        prev_queried = {
            k: dict(v)
            for k, v in new_obs.get("queried_data", {}).items()
            if isinstance(v, dict)
        }
        obs = new_obs

        # Keep conversation window manageable: retain the system prompt plus
        # first observation (messages[:2]) and the most recent 16 turns.
        if len(messages) > 20:
            messages = messages[:2] + messages[-16:]

    # Final score for this episode, as judged by the server-side grader.
    g = _session.get(f"{ENV_BASE_URL}/grader", timeout=30)
    g.raise_for_status()
    return g.json().get("total", 0.0)
|
| 502 |
+
|
| 503 |
+
|
| 504 |
+
# ββ Entry point βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 505 |
+
def main():
    """Run the baseline agent over every task/scenario pair and print a summary."""
    runs = [
        (task, idx)
        for task in (
            "alert_classification",
            "root_cause_analysis",
            "remediation_planning",
        )
        for idx in (0, 1)
    ]

    results: dict[str, list[float]] = {}

    print(f"{'Task':<36} {'S':>2} {'Score':>7}")
    print("-" * 50)

    for task_id, scenario_index in runs:
        try:
            score = _run_episode(task_id, scenario_index)
        except Exception as e:
            # A failed episode scores zero rather than aborting the run.
            print(f"  [ERROR] {task_id} s{scenario_index}: {e}",
                  file=sys.stderr)
            score = 0.0

        label = f"{task_id} [s{scenario_index}]"
        print(f"{label:<36} {scenario_index:>2} {score:>7.4f}")
        results.setdefault(task_id, []).append(score)

    print("-" * 50)
    # Per-task mean, then a grand mean over the per-task means.
    summary = {
        task: round(sum(scores) / len(scores), 4)
        for task, scores in results.items()
    }
    summary["overall"] = round(sum(summary.values()) / len(summary), 4)

    print("\nScore Summary:")
    for name, value in summary.items():
        print(f"  {name:<36}: {value:.4f}")

    # Machine-readable last line, consumed by the /baseline endpoint.
    print(json.dumps(summary))


if __name__ == "__main__":
    main()
|
openenv.yaml
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: cloud-incident-response
|
| 2 |
+
version: "0.1.0"
|
| 3 |
+
app_port: 7860
|
| 4 |
+
description: >
|
| 5 |
+
OpenEnv environment simulating real-world cloud SRE on-call incident response.
|
| 6 |
+
  Distinct from Kubernetes ops — focuses on cross-service cascading failures,
|
| 7 |
+
network partitions, OOM kills, and CDN storms across distributed systems.
|
| 8 |
+
An AI agent classifies alert severity, performs root cause analysis through
|
| 9 |
+
log/metric/dependency queries, and executes remediation sequences to resolve
|
| 10 |
+
production incidents end-to-end.
|
| 11 |
+
author: Elliot89
|
| 12 |
+
license: MIT
|
| 13 |
+
tags:
|
| 14 |
+
- openenv
|
| 15 |
+
- sre
|
| 16 |
+
- cloud
|
| 17 |
+
- incident-response
|
| 18 |
+
- devops
|
| 19 |
+
- real-world
|
| 20 |
+
- agentic
|
| 21 |
+
|
| 22 |
+
tasks:
|
| 23 |
+
- id: alert_classification
|
| 24 |
+
name: "Task 1: Alert Severity Classification"
|
| 25 |
+
difficulty: easy
|
| 26 |
+
max_steps: 3
|
| 27 |
+
score_range: [0.0, 1.0]
|
| 28 |
+
description: >
|
| 29 |
+
Classify incoming alert severity (P1-P4) by querying
|
| 30 |
+
logs and metrics across affected cloud services.
|
| 31 |
+
|
| 32 |
+
- id: root_cause_analysis
|
| 33 |
+
name: "Task 2: Root Cause Analysis"
|
| 34 |
+
difficulty: medium
|
| 35 |
+
max_steps: 10
|
| 36 |
+
score_range: [0.0, 1.0]
|
| 37 |
+
description: >
|
| 38 |
+
Trace a live incident through logs, metrics, dependencies,
|
| 39 |
+
and recent deploys to identify the exact root cause service
|
| 40 |
+
and failure mode across a distributed system.
|
| 41 |
+
|
| 42 |
+
- id: remediation_planning
|
| 43 |
+
name: "Task 3: Incident Remediation"
|
| 44 |
+
difficulty: hard
|
| 45 |
+
max_steps: 15
|
| 46 |
+
score_range: [0.0, 1.0]
|
| 47 |
+
description: >
|
| 48 |
+
Fully resolve a production incident end-to-end: diagnose
|
| 49 |
+
the root cause, execute the correct remediation sequence,
|
| 50 |
+
and submit a documented resolution summary.
|
| 51 |
+
|
| 52 |
+
endpoints:
|
| 53 |
+
health: "GET /health"
|
| 54 |
+
reset: "POST /reset"
|
| 55 |
+
step: "POST /step"
|
| 56 |
+
state: "GET /state"
|
| 57 |
+
tasks: "GET /tasks"
|
| 58 |
+
grader: "GET /grader"
|
| 59 |
+
baseline: "POST /baseline"
|
pyproject.toml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "cloud-incident-response-openenv"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "OpenEnv environment for cloud SRE incident response"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
|
| 8 |
+
dependencies = [
|
| 9 |
+
"fastapi>=0.104.0",
|
| 10 |
+
"uvicorn[standard]>=0.24.0",
|
| 11 |
+
"pydantic>=2.0.0",
|
| 12 |
+
"requests>=2.31.0",
|
| 13 |
+
"openai>=1.58.0",
|
| 14 |
+
"httpx>=0.27.0,<0.29.0",
|
| 15 |
+
"python-dotenv>=1.0.0",
|
| 16 |
+
]
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.104.0
|
| 2 |
+
uvicorn[standard]>=0.24.0
|
| 3 |
+
pydantic>=2.0.0
|
| 4 |
+
requests>=2.31.0
|
| 5 |
+
openai>=1.58.0
|
| 6 |
+
httpx>=0.27.0,<0.29.0
|
| 7 |
+
python-dotenv>=1.0.0
|
server/__init__.py
ADDED
|
File without changes
|
server/app.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/app.py β FastAPI server for Cloud Incident Response OpenEnv.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
GET / JSON health/status (triggers HF Space "Running" badge)
|
| 6 |
+
GET /health Lightweight health check
|
| 7 |
+
POST /reset Start new episode
|
| 8 |
+
POST /step Submit action
|
| 9 |
+
GET /state Current episode state
|
| 10 |
+
GET /tasks All tasks with action schemas
|
| 11 |
+
GET /grader Score current episode
|
| 12 |
+
POST /baseline Run inference.py end-to-end, return score summary
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import subprocess
|
| 20 |
+
import sys
|
| 21 |
+
|
| 22 |
+
# Ensure project root is on sys.path regardless of working directory
|
| 23 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 24 |
+
|
| 25 |
+
from contextlib import asynccontextmanager
|
| 26 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 27 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 28 |
+
|
| 29 |
+
from server.models import Action
|
| 30 |
+
from server.environment import IncidentEnvironment
|
| 31 |
+
from tasks import list_tasks, ALL_TASKS
|
| 32 |
+
|
| 33 |
+
_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 34 |
+
|
| 35 |
+
# ββ Global env instance ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 36 |
+
_env: IncidentEnvironment | None = None
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialise heavy objects after the server is already accepting requests."""
    # Constructed here rather than at import time; until this runs,
    # _get_env() answers 503 so clients know to retry.
    global _env
    _env = IncidentEnvironment()
    yield
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _get_env() -> IncidentEnvironment:
    """Return the live environment, or 503 while startup is still in progress."""
    env = _env
    if env is None:
        # lifespan() has not finished constructing the environment yet.
        raise HTTPException(
            status_code=503,
            detail="Environment initialising — retry in a moment",
        )
    return env
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# ASGI application object; uvicorn loads it as "server.app:app".
app = FastAPI(
    title="Cloud Incident Response — OpenEnv",
    version="0.1.0",
    description=(
        "OpenEnv environment for training AI agents on cloud SRE incident response. "
        "Covers cascading failures, OOM kills, CDN storms, and network partitions."
    ),
    # Deferred initialisation of the environment (see lifespan above).
    lifespan=lifespan,
)

# Wide-open CORS so browser-based clients / HF Space frontends can call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# ββ Root β plain JSON so HF Space flips badge to Running βββββββββββββββββββββ
|
| 75 |
+
|
| 76 |
+
@app.get("/")
|
| 77 |
+
def root():
|
| 78 |
+
return {
|
| 79 |
+
"status": "running",
|
| 80 |
+
"name": "cloud-incident-response",
|
| 81 |
+
"version": "0.1.0",
|
| 82 |
+
"description": "OpenEnv environment for cloud SRE incident response",
|
| 83 |
+
"tasks": ["alert_classification", "root_cause_analysis", "remediation_planning"],
|
| 84 |
+
"docs": "/docs",
|
| 85 |
+
"health": "/health",
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ββ Core endpoints ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
|
| 91 |
+
@app.get("/health")
|
| 92 |
+
def health():
|
| 93 |
+
return {"status": "ok", "version": "0.1.0"}
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@app.post("/reset")
|
| 97 |
+
def reset(
|
| 98 |
+
task_id: str = Query(default="alert_classification"),
|
| 99 |
+
scenario_index: int = Query(default=0),
|
| 100 |
+
):
|
| 101 |
+
"""Start a new episode. Returns the initial observation."""
|
| 102 |
+
env = _get_env()
|
| 103 |
+
try:
|
| 104 |
+
obs = env.reset(task_id=task_id, scenario_index=scenario_index)
|
| 105 |
+
return obs.model_dump()
|
| 106 |
+
except ValueError as e:
|
| 107 |
+
raise HTTPException(status_code=400, detail=str(e))
|
| 108 |
+
except Exception as e:
|
| 109 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
@app.post("/step")
|
| 113 |
+
def step(action: Action):
|
| 114 |
+
"""Submit one action. Returns observation, reward, done, info."""
|
| 115 |
+
env = _get_env()
|
| 116 |
+
try:
|
| 117 |
+
obs, reward, done, info = env.step(action)
|
| 118 |
+
return {
|
| 119 |
+
"observation": obs.model_dump(),
|
| 120 |
+
"reward": reward.model_dump(),
|
| 121 |
+
"done": done,
|
| 122 |
+
"info": info,
|
| 123 |
+
}
|
| 124 |
+
except RuntimeError as e:
|
| 125 |
+
raise HTTPException(status_code=400, detail=str(e))
|
| 126 |
+
except Exception as e:
|
| 127 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
@app.get("/state")
|
| 131 |
+
def state():
|
| 132 |
+
"""Return the full current episode state."""
|
| 133 |
+
env = _get_env()
|
| 134 |
+
try:
|
| 135 |
+
return env.state().model_dump()
|
| 136 |
+
except RuntimeError as e:
|
| 137 |
+
raise HTTPException(status_code=400, detail=str(e))
|
| 138 |
+
except Exception as e:
|
| 139 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
@app.get("/tasks")
|
| 143 |
+
def tasks():
|
| 144 |
+
"""Return all tasks with descriptions and action schemas."""
|
| 145 |
+
return {
|
| 146 |
+
"tasks": list_tasks(),
|
| 147 |
+
"total": len(ALL_TASKS),
|
| 148 |
+
"action_schema": {
|
| 149 |
+
"diagnostic": [
|
| 150 |
+
{"action_type": "query_logs", "parameters": {"service": "string"}},
|
| 151 |
+
{"action_type": "check_metrics", "parameters": {"service": "string"}},
|
| 152 |
+
{"action_type": "check_dependencies", "parameters": {"service": "string"}},
|
| 153 |
+
{"action_type": "check_recent_deploys", "parameters": {"service": "string"}},
|
| 154 |
+
{"action_type": "check_service_status", "parameters": {"service": "string"}},
|
| 155 |
+
],
|
| 156 |
+
"remediation": [
|
| 157 |
+
{"action_type": "restart_service", "parameters": {"service": "string"}},
|
| 158 |
+
{"action_type": "rollback_deploy", "parameters": {"service": "string", "target_version": "string"}},
|
| 159 |
+
{"action_type": "scale_service", "parameters": {"service": "string", "replicas": "int"}},
|
| 160 |
+
{"action_type": "disable_feature_flag", "parameters": {"flag": "string"}},
|
| 161 |
+
{"action_type": "clear_cache", "parameters": {"service": "string"}},
|
| 162 |
+
{"action_type": "execute_runbook_step", "parameters": {"runbook_action": "string", "target": "string"}},
|
| 163 |
+
],
|
| 164 |
+
"submission": [
|
| 165 |
+
{"action_type": "submit_severity", "parameters": {"severity": "P1|P2|P3|P4", "service": "string"}},
|
| 166 |
+
{"action_type": "submit_root_cause", "parameters": {"service": "string", "failure_mode": "string"}},
|
| 167 |
+
{"action_type": "submit_resolution", "parameters": {"summary": "string"}},
|
| 168 |
+
],
|
| 169 |
+
},
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
@app.get("/grader")
|
| 174 |
+
def grader():
|
| 175 |
+
"""Score the current episode. Returns total in [0.0, 1.0]."""
|
| 176 |
+
env = _get_env()
|
| 177 |
+
try:
|
| 178 |
+
s = env.state()
|
| 179 |
+
from graders import grade
|
| 180 |
+
result = grade(s.task_id, s.model_dump(), env._scenario)
|
| 181 |
+
return {
|
| 182 |
+
"total": result["total"],
|
| 183 |
+
"breakdown": result["breakdown"],
|
| 184 |
+
"feedback": result["feedback"],
|
| 185 |
+
"task_id": s.task_id,
|
| 186 |
+
"scenario_id": s.scenario_id,
|
| 187 |
+
"steps_used": s.step_count,
|
| 188 |
+
"done": s.done,
|
| 189 |
+
}
|
| 190 |
+
except RuntimeError as e:
|
| 191 |
+
raise HTTPException(status_code=400, detail=str(e))
|
| 192 |
+
except Exception as e:
|
| 193 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
@app.post("/baseline")
|
| 197 |
+
def baseline():
|
| 198 |
+
"""Run inference.py and return the JSON score summary."""
|
| 199 |
+
script = os.path.join(_ROOT, "inference.py")
|
| 200 |
+
if not os.path.exists(script):
|
| 201 |
+
raise HTTPException(
|
| 202 |
+
status_code=500,
|
| 203 |
+
detail="inference.py not found in project root",
|
| 204 |
+
)
|
| 205 |
+
try:
|
| 206 |
+
result = subprocess.run(
|
| 207 |
+
[sys.executable, script],
|
| 208 |
+
capture_output=True,
|
| 209 |
+
text=True,
|
| 210 |
+
timeout=1200,
|
| 211 |
+
cwd=_ROOT,
|
| 212 |
+
env={**os.environ, "ENV_BASE_URL": "http://localhost:7860"},
|
| 213 |
+
)
|
| 214 |
+
except subprocess.TimeoutExpired:
|
| 215 |
+
raise HTTPException(status_code=500, detail="inference.py timed out (>20 min)")
|
| 216 |
+
|
| 217 |
+
if result.returncode != 0:
|
| 218 |
+
raise HTTPException(status_code=500, detail=result.stderr[-2000:])
|
| 219 |
+
|
| 220 |
+
lines = result.stdout.strip().splitlines()
|
| 221 |
+
last = lines[-1] if lines else ""
|
| 222 |
+
try:
|
| 223 |
+
return json.loads(last)
|
| 224 |
+
except Exception:
|
| 225 |
+
return {"raw_output": result.stdout[-3000:]}
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
if __name__ == "__main__":
|
| 229 |
+
import uvicorn
|
| 230 |
+
uvicorn.run("server.app:app", host="0.0.0.0", port=7860, reload=False)
|
server/environment.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/environment.py β Core OpenEnv environment for Cloud Incident Response.
|
| 3 |
+
|
| 4 |
+
Implements the full OpenEnv interface:
|
| 5 |
+
reset(task_id, scenario_index) -> Observation
|
| 6 |
+
step(action) -> (Observation, Reward, done, info)
|
| 7 |
+
state() -> EpisodeState
|
| 8 |
+
|
| 9 |
+
All state is in-memory. Thread-safe via a lock.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import uuid
|
| 15 |
+
import threading
|
| 16 |
+
import sys
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 20 |
+
|
| 21 |
+
from tasks import get_task, get_scenario
|
| 22 |
+
from graders import grade, _svc_match
|
| 23 |
+
from server.models import Action, ActionParameters, Observation, Reward, EpisodeState
|
| 24 |
+
|
| 25 |
+
# ββ Action type classification ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
|
| 27 |
+
_DIAGNOSTIC = frozenset({
|
| 28 |
+
"query_logs", "check_metrics", "check_dependencies",
|
| 29 |
+
"check_recent_deploys", "check_service_status",
|
| 30 |
+
})
|
| 31 |
+
|
| 32 |
+
_REMEDIATION = frozenset({
|
| 33 |
+
"restart_service", "rollback_deploy", "scale_service",
|
| 34 |
+
"disable_feature_flag", "clear_cache", "execute_runbook_step",
|
| 35 |
+
})
|
| 36 |
+
|
| 37 |
+
_SUBMIT = frozenset({
|
| 38 |
+
"submit_severity", "submit_root_cause", "submit_resolution",
|
| 39 |
+
})
|
| 40 |
+
|
| 41 |
+
# ββ Reward constants ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 42 |
+
|
| 43 |
+
R_QUERY_FIRST = +0.05
|
| 44 |
+
R_QUERY_REPEAT = +0.01
|
| 45 |
+
R_QUERY_UNKNOWN = -0.05
|
| 46 |
+
R_REM_GOOD = +0.10
|
| 47 |
+
R_REM_WRONG = -0.10
|
| 48 |
+
R_PAST_HALF = -0.02
|
| 49 |
+
R_TIMEOUT = -0.10
|
| 50 |
+
R_BAD_ACTION = -0.03
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class IncidentEnvironment:
    """
    OpenEnv environment for Cloud Incident Response.
    One instance handles one episode at a time (thread-safe).

    Lifecycle: ``reset()`` starts an episode, ``step()`` advances it one
    action at a time, ``state()`` reads the full episode state. All mutable
    episode state lives in the dict ``self._s``; every public method holds
    ``self._lock`` for its whole duration, so the private ``_handle_*`` /
    ``_build_obs`` helpers are always called with the lock already held.
    """

    def __init__(self):
        # Serializes reset()/step()/state(); one episode at a time.
        self._lock = threading.Lock()
        self._s: dict = {}          # mutable per-episode state
        self._scenario: dict = {}   # scenario data (tool responses, wrong actions, ...)
        self._task_def: dict = {}   # task metadata (max_steps, available_actions, ...)
        self._ready = False         # True once reset() has been called

    # ── Public OpenEnv API ───────────────────────────────────────────────────

    def reset(self, task_id: str, scenario_index: int = 0) -> Observation:
        """Start a new episode for (task_id, scenario_index) and return the
        initial observation. Any in-progress episode is discarded.

        Raises whatever ``get_task`` / ``get_scenario`` raise for unknown
        ids (delegated to tasks.py).
        """
        with self._lock:
            task_def = get_task(task_id)
            scenario = get_scenario(task_id, scenario_index)

            self._task_def = task_def
            self._scenario = scenario
            self._s = {
                "episode_id": str(uuid.uuid4()),
                "task_id": task_id,
                "scenario_id": scenario["scenario_id"],
                "step_count": 0,
                "max_steps": task_def["max_steps"],
                "action_history": [],   # every action taken, in order
                "queried_data": {},     # action_type -> {key -> tool result}
                "queried_keys": set(),  # (action_type, service) pairs already queried
                "submitted": False,
                "resolved": False,
                "done": False,
                "cumulative_reward": 0.0,
                "feedback": f"Episode started. {scenario['description']}",
            }
            self._ready = True
            return self._build_obs()

    def step(self, action: Action) -> tuple[Observation, Reward, bool, dict]:
        """Apply one agent action; return (observation, reward, done, info).

        Per-step reward shaping, in order: an efficiency penalty once past
        the halfway point, then the handler reward for the action's class
        (diagnostic / remediation / submit / unknown), then a timeout penalty
        if max_steps is reached without a submission. On the terminal step the
        grader's score (``grade(...)["total"]``) is folded into the cumulative
        reward; the per-step ``Reward.value`` does NOT include the grader score.

        Raises RuntimeError if called before reset().
        """
        with self._lock:
            if not self._ready:
                raise RuntimeError("Call reset() before step().")

            s = self._s
            if s["done"]:
                # Stepping a finished episode is a no-op with zero reward.
                return (
                    self._build_obs(),
                    Reward(value=0.0, reason="episode already done",
                           cumulative=s["cumulative_reward"]),
                    True,
                    {},
                )

            s["step_count"] += 1
            step_num = s["step_count"]
            at = action.action_type
            params = action.parameters

            # Record the action before handling so history includes this step.
            s["action_history"].append({
                "action_type": at,
                "parameters": params.model_dump(exclude_none=True),
                "step": step_num,
            })

            r = 0.0             # reward accumulated for this step only
            fb: list[str] = []  # feedback fragments, joined with " | " below

            # Efficiency penalty after halfway point
            if step_num > s["max_steps"] // 2:
                r += R_PAST_HALF
                fb.append("efficiency penalty")

            if at in _DIAGNOSTIC:
                r, fb = self._handle_diagnostic(at, params, r, fb)
            elif at in _REMEDIATION:
                r, fb = self._handle_remediation(at, params, r, fb)
            elif at in _SUBMIT:
                # Submit handlers report whether they terminate the episode
                # (currently all of them do — see _handle_submit).
                r, fb, terminal = self._handle_submit(at, params, r, fb)
                if terminal:
                    s["done"] = True
            else:
                r += R_BAD_ACTION
                fb.append(f"unknown action_type '{at}'")

            # Timeout if max steps reached without submission
            if step_num >= s["max_steps"] and not s["done"]:
                r += R_TIMEOUT
                fb.append("timeout — no submission made")
                s["done"] = True

            # Apply grader score on terminal step.
            # grade() returns a dict with at least "total" and "feedback".
            if s["done"]:
                result = grade(s["task_id"], s, self._scenario)
                s["cumulative_reward"] = round(
                    s["cumulative_reward"] + r + result["total"], 4
                )
                fb.append(f"grader={result['feedback']}")
            else:
                s["cumulative_reward"] = round(s["cumulative_reward"] + r, 4)

            s["feedback"] = " | ".join(fb) if fb else "ok"

            return (
                self._build_obs(),
                Reward(
                    value=round(r, 4),
                    reason=s["feedback"],
                    cumulative=s["cumulative_reward"],
                ),
                s["done"],
                {"step": step_num, "feedback": s["feedback"]},
            )

    def state(self) -> EpisodeState:
        """Return a snapshot of the full episode state (for GET /state).

        History and queried data are shallow-copied so callers cannot mutate
        internal state through the returned model.

        Raises RuntimeError if no episode has been started.
        """
        with self._lock:
            if not self._ready:
                raise RuntimeError("No active episode — call reset() first.")
            s = self._s
            return EpisodeState(
                episode_id=s["episode_id"],
                task_id=s["task_id"],
                scenario_id=s["scenario_id"],
                step_count=s["step_count"],
                max_steps=s["max_steps"],
                action_history=list(s["action_history"]),
                queried_data=dict(s["queried_data"]),
                submitted=s["submitted"],
                resolved=s["resolved"],
                done=s["done"],
                cumulative_reward=s["cumulative_reward"],
                feedback=s["feedback"],
            )

    # ── Action handlers ──────────────────────────────────────────────────────

    def _handle_diagnostic(
        self, at: str, params: ActionParameters, r: float, fb: list[str]
    ) -> tuple[float, list[str]]:
        """Handle a read-only diagnostic action; return updated (r, fb).

        First query of a known (action, service) pair earns R_QUERY_FIRST;
        repeats earn R_QUERY_REPEAT; an unknown service is penalized with
        R_QUERY_UNKNOWN; a missing service param gets no reward change.
        Tool output for known services is cached into s["queried_data"] so
        it shows up in subsequent observations.
        """
        s = self._s
        service = (params.service or "").lower().strip()
        # known_services may be any iterable of names; compare lowercased.
        known = {sv.lower() for sv in self._scenario.get("known_services", set())}
        tool_data = self._scenario.get("tool_responses", {}).get(at, {})
        key = (at, service)

        if service and service in known:
            if key not in s["queried_keys"]:
                r += R_QUERY_FIRST
                fb.append(f"queried {service} (+{R_QUERY_FIRST})")
                s["queried_keys"].add(key)
            else:
                r += R_QUERY_REPEAT
                fb.append(f"re-queried {service} (+{R_QUERY_REPEAT})")
            # Cache the canned tool response (or a fallback message).
            result = tool_data.get(service, f"No data available for '{service}'.")
            s["queried_data"].setdefault(at, {})[service] = result

        elif service:
            r += R_QUERY_UNKNOWN
            fb.append(f"unknown service '{service}' ({R_QUERY_UNKNOWN})")
        else:
            fb.append(f"{at}: no service specified")

        return r, fb

    def _handle_remediation(
        self, at: str, params: ActionParameters, r: float, fb: list[str]
    ) -> tuple[float, list[str]]:
        """Handle a state-changing remediation action; return updated (r, fb).

        An action is "wrong" when any candidate key matches the scenario's
        ``wrong_actions`` map — either exactly, or via fuzzy service matching
        (graders._svc_match) on keys of the form "<action_type>:<service>".
        Wrong actions earn R_REM_WRONG; anything else earns R_REM_GOOD and
        its result text is cached into s["queried_data"].
        """
        s = self._s
        service = (params.service or "").lower().strip()
        flag = (params.flag or "").lower().strip()
        runbook = (params.runbook_action or "").lower().strip()
        target = (params.target or "").lower().strip()

        # Build candidate keys for wrong-action matching
        keys: set[str] = {at}
        if service: keys.add(f"{at}:{service}")
        if flag: keys.add(f"{at}:{flag}")
        if runbook: keys.add(f"execute_runbook_step:{runbook}")
        if target: keys.add(f"execute_runbook_step:{target}")

        wrong_map = self._scenario.get("wrong_actions", {})
        rem_data = self._scenario.get("remediation_data", {})

        # Check for wrong actions — also use fuzzy service matching for `at:svc` keys
        is_wrong = any(k in wrong_map for k in keys)
        if not is_wrong and service:
            # Try _svc_match against wrong action keys of the form `at:svc`
            for wk in wrong_map:
                if ":" in wk:
                    w_at, w_svc = wk.split(":", 1)
                    if w_at == at and _svc_match(service, w_svc):
                        is_wrong = True
                        break

        if is_wrong:
            r += R_REM_WRONG
            # NOTE: a fuzzy-only match won't be in `keys`, so the generic
            # fallback reason is used in that case.
            reason = next(
                (wrong_map[k] for k in keys if k in wrong_map),
                "wrong action for this incident"
            )
            fb.append(f"wrong action '{at}': {str(reason)[:80]}")
        else:
            r += R_REM_GOOD
            fb.append(f"executed {at}" + (f" on '{service}'" if service else ""))
            at_data = rem_data.get(at, {})
            # First non-empty lookup wins: service, flag, runbook, target.
            result = (
                at_data.get(service) or at_data.get(flag)
                or at_data.get(runbook) or at_data.get(target)
                or "action executed successfully"
            )
            s["queried_data"].setdefault(at, {})[
                service or flag or runbook or target or at
            ] = result

        return r, fb

    def _handle_submit(
        self, at: str, params: ActionParameters, r: float, fb: list[str]
    ) -> tuple[float, list[str], bool]:
        """Handle a submission action; return (r, fb, terminal).

        Any submission terminates the episode (terminal is always True) and
        marks s["submitted"]. Scoring of the submitted answer itself happens
        in the grader, not here — this method only updates feedback and, for
        submit_resolution, the "resolved" flag (requires a non-empty summary
        plus at least one prior diagnostic/remediation action).
        """
        s = self._s
        s["submitted"] = True

        if at == "submit_severity":
            fb.append(f"submitted severity: {(params.severity or '').upper()}")

        elif at == "submit_root_cause":
            fb.append(
                f"submitted root cause: "
                f"service={params.service or ''}, "
                f"failure_mode={params.failure_mode or ''}"
            )

        elif at == "submit_resolution":
            summary = params.summary or ""
            # Count prior investigative actions (the current submit is in
            # history too, but it is neither diagnostic nor remediation).
            inv_count = sum(
                1 for a in s["action_history"]
                if a.get("action_type") in _DIAGNOSTIC | _REMEDIATION
            )
            if summary.strip() and inv_count >= 1:
                s["resolved"] = True
                fb.append("resolution submitted — incident resolved")
            else:
                fb.append("resolution submitted — insufficient investigation")

        return r, fb, True

    # ── Build observation ────────────────────────────────────────────────────

    def _build_obs(self) -> Observation:
        """Assemble the Observation for the current episode state.

        queried_data is shallow-copied so the caller cannot mutate internal
        state through the returned model.
        """
        s = self._s
        sc = self._scenario
        td = self._task_def

        # Return sorted list of known service names (exact strings agents must use)
        known = sorted(sc.get("known_services", set()))

        return Observation(
            episode_id=s["episode_id"],
            task_id=s["task_id"],
            scenario_id=s["scenario_id"],
            step_count=s["step_count"],
            max_steps=s["max_steps"],
            incident_summary=sc.get("incident_summary", sc.get("description", "")),
            alert=sc.get("alert", {}),
            available_actions=td.get("available_actions", []),
            queried_data=dict(s["queried_data"]),
            cumulative_reward=s["cumulative_reward"],
            done=s["done"],
            feedback=s["feedback"],
            known_services=known,
        )
|
server/models.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/models.py β Typed Pydantic models for the OpenEnv interface.
|
| 3 |
+
|
| 4 |
+
OpenEnv requires three typed models: Action, Observation, Reward.
|
| 5 |
+
All models use Pydantic v2.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
from pydantic import BaseModel, Field
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ActionParameters(BaseModel):
    """Flexible parameter bag — different action types use different fields.

    All fields default to ``None``; each action type reads only the subset
    it needs (diagnostics read ``service``, flag toggles read ``flag``, and
    so on). ``extra = "allow"`` means unrecognized keys are kept rather
    than rejected.
    """
    service: str | None = None         # target service name
    severity: str | None = None        # severity label for submit_severity
    failure_mode: str | None = None    # failure mode for submit_root_cause
    summary: str | None = None         # free-text summary for submit_resolution
    target_version: str | None = None  # version for rollback_deploy
    replicas: int | None = None        # replica count for scale_service
    flag: str | None = None            # feature flag for disable_feature_flag
    runbook_action: str | None = None  # step name for execute_runbook_step
    target: str | None = None          # alternate target for runbook steps
    reasoning: str | None = None       # optional agent rationale (not graded here)

    model_config = {"extra": "allow"}
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class Action(BaseModel):
    """An action submitted by the agent to the environment.

    ``action_type`` selects the handler (diagnostic / remediation / submit);
    ``parameters`` carries its arguments and defaults to an empty bag.
    Extra keys are allowed and preserved.
    """
    action_type: str
    parameters: ActionParameters = Field(default_factory=ActionParameters)

    model_config = {"extra": "allow"}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class Observation(BaseModel):
    """Observation returned after reset() or step()."""
    episode_id: str            # unique id for this episode
    task_id: str
    scenario_id: str
    step_count: int            # steps taken so far
    max_steps: int             # step budget for this task
    incident_summary: str      # scenario description shown to the agent
    alert: dict                # the triggering alert payload
    available_actions: list[str]   # action_type values valid for this task
    queried_data: dict         # accumulated tool outputs: action -> {key -> result}
    cumulative_reward: float
    done: bool
    feedback: str              # feedback string from the most recent step
    # Explicit list of all valid service names for this scenario.
    # Agents must use these exact strings in action parameters.
    known_services: list[str] = Field(default_factory=list)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class Reward(BaseModel):
    """Reward signal returned after each step()."""
    value: float       # reward earned on this step alone
    reason: str        # human-readable explanation of the step reward
    cumulative: float  # running total for the episode (includes grader score when done)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class EpisodeState(BaseModel):
    """Full episode state returned by GET /state."""
    episode_id: str
    task_id: str
    scenario_id: str
    step_count: int
    max_steps: int
    action_history: list[dict]  # every action taken, in order, with step numbers
    queried_data: dict          # accumulated tool outputs
    submitted: bool             # True once any submit_* action was taken
    resolved: bool              # True when a valid resolution was submitted
    done: bool
    cumulative_reward: float
    feedback: str               # feedback string from the most recent step
|
tasks.py
ADDED
|
@@ -0,0 +1,768 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tasks.py β Task and scenario definitions for Cloud Incident Response OpenEnv.
|
| 3 |
+
|
| 4 |
+
Covers cross-service cascading failures in distributed cloud systems:
|
| 5 |
+
- DB connection pool exhaustion cascading through service mesh
|
| 6 |
+
- CDN cache invalidation storms causing origin overload
|
| 7 |
+
- OOM kills from runaway analytics queries
|
| 8 |
+
- BGP network partitions isolating availability zones
|
| 9 |
+
|
| 10 |
+
Distinct from Kubernetes ops environments β focuses on application-layer
|
| 11 |
+
incident response: log correlation, dependency tracing, and remediation
|
| 12 |
+
across microservice architectures.
|
| 13 |
+
|
| 14 |
+
Public API:
|
| 15 |
+
get_task(task_id) -> task metadata dict
|
| 16 |
+
get_scenario(task_id, index) -> scenario dict
|
| 17 |
+
list_tasks() -> list of task dicts
|
| 18 |
+
ALL_TASKS -> dict[task_id -> metadata]
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
# Registry of every task exposed by the environment.
# Keys are stable task ids; values are metadata dicts consumed by the server
# (see get_task / list_tasks at the bottom of this module).
#
# Shared schema per task:
#   id                 - same as the dict key (kept inline for convenience)
#   name / difficulty  - display metadata
#   max_steps          - episode step budget enforced by the environment
#   score_range        - [min, max] reward range produced by the graders
#   available_actions  - action names the agent may issue for this task
#   submission_action  - the terminal action that ends the episode
#   scenarios          - number of entries in SCENARIOS[task_id]
ALL_TASKS: dict = {
    # Easy: read the signals, pick a severity level.
    "alert_classification": {
        "id": "alert_classification",
        "name": "Task 1: Alert Severity Classification",
        "difficulty": "easy",
        "max_steps": 3,
        "score_range": [0.0, 1.0],
        "description": (
            "An alert has fired. Query logs and metrics across affected services, "
            "then classify the incident severity: P1 (CRITICAL — revenue/user impact, "
            "immediate action), P2 (HIGH — degraded service), P3 (MEDIUM — minor issue), "
            "P4 (LOW — informational). Submit severity with submit_severity."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "submit_severity",
        ],
        "submission_action": "submit_severity",
        "scenarios": 2,
    },
    # Medium: diagnostic-only toolset; identify the originating service/failure mode.
    "root_cause_analysis": {
        "id": "root_cause_analysis",
        "name": "Task 2: Root Cause Analysis",
        "difficulty": "medium",
        "max_steps": 10,
        "score_range": [0.0, 1.0],
        "description": (
            "A production incident is active. Use diagnostic tools to trace the failure "
            "chain across services. Query logs, metrics, dependency graphs, and recent "
            "deploys to identify which service is the root cause and what failure mode "
            "triggered the cascade. Submit findings with submit_root_cause."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "check_service_status",
            "submit_root_cause",
        ],
        "submission_action": "submit_root_cause",
        "scenarios": 2,
    },
    # Hard: diagnosis plus mutating remediation actions and a written resolution.
    "remediation_planning": {
        "id": "remediation_planning",
        "name": "Task 3: Incident Remediation",
        "difficulty": "hard",
        "max_steps": 15,
        "score_range": [0.0, 1.0],
        "description": (
            "A critical production incident requires full end-to-end resolution. "
            "Diagnose the root cause, execute the correct remediation sequence "
            "(disable feature flags, restart services, rollback deploys, run runbook steps), "
            "then submit a resolution summary. Scored on investigation quality, "
            "remediation correctness, efficiency, and documentation."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "check_service_status",
            "restart_service",
            "rollback_deploy",
            "scale_service",
            "disable_feature_flag",
            "clear_cache",
            "execute_runbook_step",
            "submit_resolution",
        ],
        "submission_action": "submit_resolution",
        "scenarios": 2,
    },
}
|
| 100 |
+
|
| 101 |
+
# ---------------------------------------------------------------------------
|
| 102 |
+
# Scenario data β 3 tasks Γ 2 scenarios = 6 total episodes
|
| 103 |
+
# ---------------------------------------------------------------------------
|
| 104 |
+
|
| 105 |
+
# Scenario fixtures, keyed by task id. 3 tasks x 2 scenarios = 6 episodes.
#
# Shared scenario schema:
#   scenario_id / description / incident_summary - identity and narrative
#   alert            - the triggering alert (id, title, fired severity, symptoms)
#   known_services   - set of service names valid as tool-call targets
#   tool_responses   - canned output per diagnostic action, per service
# Task-specific grading keys:
#   alert_classification: correct_severity, adjacent_severities (partial credit)
#   root_cause_analysis:  correct_root_cause, wrong_actions (penalized moves)
#   remediation_planning: remediation_data (canned outputs for mutating actions),
#                         correct_remediation_sequence, wrong_actions,
#                         resolution_keywords (scored against the free-text summary)
SCENARIOS: dict = {

    # ── TASK 1: ALERT CLASSIFICATION ────────────────────────────────────────

    "alert_classification": [

        # AC-001: Cascading DB connection pool exhaustion — P1
        {
            "scenario_id": "AC-001",
            "description": (
                "Cascading failure: postgres-db connection pool exhausted, "
                "causing auth-service timeouts, blocking api-gateway requests. "
                "Revenue impact is severe and growing."
            ),
            "incident_summary": (
                "P1 ALERT — api-gateway 5xx rate 78%, auth-service timeout rate 94%, "
                "postgres-db connection pool at 100% (500/500). "
                "Checkout completely down. Revenue impact: $12,000/min."
            ),
            "alert": {
                "id": "ALT-20240315-001",
                "title": "CRITICAL: api-gateway error rate spike 78%",
                "severity_fired": "P1",
                "affected_services": ["api-gateway", "auth-service", "postgres-db"],
                "symptoms": [
                    "api-gateway: HTTP 503 rate 78% (baseline: 0.1%)",
                    "auth-service: connection timeout 94% of requests",
                    "postgres-db: connection pool 500/500 — 100% utilized",
                    "checkout flow: completely unavailable",
                    "new user logins: 0% success rate",
                ],
                "error_rate": 0.78,
                "duration_minutes": 4,
                "revenue_impact_per_min": 12000,
            },
            "known_services": {"api-gateway", "auth-service", "postgres-db"},
            "tool_responses": {
                "query_logs": {
                    "api-gateway": (
                        "2024-03-15T10:04:12Z ERROR upstream connect error — "
                        "reset reason: connection timeout auth-service:8080\n"
                        "2024-03-15T10:04:13Z ERROR 503 Service Unavailable upstream: auth-service\n"
                        "2024-03-15T10:04:14Z ERROR circuit breaker OPEN for auth-service"
                    ),
                    "auth-service": (
                        "2024-03-15T10:04:10Z ERROR pq: sorry, too many clients already\n"
                        "2024-03-15T10:04:11Z ERROR dial tcp postgres-db:5432: "
                        "connect: connection refused — pool exhausted (500/500)\n"
                        "2024-03-15T10:04:12Z ERROR all connection pool slots occupied"
                    ),
                    "postgres-db": (
                        "2024-03-15T10:03:58Z LOG connection received: host=auth-service\n"
                        "2024-03-15T10:04:00Z FATAL remaining connection slots reserved "
                        "for non-replication superuser\n"
                        "2024-03-15T10:04:01Z LOG max_connections=500 active=500 idle=0"
                    ),
                },
                "check_metrics": {
                    "api-gateway": (
                        "HTTP 5xx rate: 78% | p99 latency: 30s (timeout) | "
                        "RPS: 1,200 | circuit_breaker: OPEN"
                    ),
                    "auth-service": (
                        "Error rate: 94% | DB connection wait: 28s | "
                        "Active connections: 0 | Request queue: 847"
                    ),
                    "postgres-db": (
                        "Connections: 500/500 (100%) | Query queue: 847 | "
                        "CPU: 98% | Memory: 89% | Active queries: 500"
                    ),
                },
                "check_dependencies": {
                    "api-gateway": "Depends on: auth-service [CRITICAL], product-service [OK]",
                    "auth-service": "Depends on: postgres-db [CRITICAL], redis-session [OK]",
                    "postgres-db": "No upstream dependencies — root level service",
                },
                "check_recent_deploys": {
                    "api-gateway": "Last deploy: 3 days ago — no recent changes",
                    "auth-service": (
                        "Last deploy: 47 min ago — PR #2341: "
                        "increased default connection pool size from 10 to 500"
                    ),
                    "postgres-db": "Last deploy: 12 days ago — no recent changes",
                },
            },
            # Graders give full credit for P1; partial credit for adjacent P2.
            "correct_severity": "P1",
            "adjacent_severities": ["P2"],
        },

        # AC-002: CDN cache invalidation storm — P2
        {
            "scenario_id": "AC-002",
            "description": (
                "CDN cache invalidation storm: a misconfigured purge cronjob wiped "
                "all 2.1M cached keys, sending 40x normal traffic to origin. "
                "Site degraded but not fully down — P2 severity."
            ),
            "incident_summary": (
                "P2 ALERT — CDN cache hit rate dropped from 94% to 3%, "
                "product-service origin traffic up 4000%, image-service CPU at 95%. "
                "Pages loading slowly (p99: 18s). Checkout still working."
            ),
            "alert": {
                "id": "ALT-20240315-002",
                "title": "HIGH: CDN cache miss storm — origin overloaded",
                "severity_fired": "P2",
                "affected_services": ["cdn-edge", "product-service", "image-service"],
                "symptoms": [
                    "CDN cache hit rate: 3% (normal: 94%)",
                    "product-service: origin RPS 48,000 (normal: 1,200)",
                    "image-service: CPU 95%, p99 latency 18s",
                    "User experience: product pages slow, some images timing out",
                    "Checkout: still functional (not affected)",
                ],
                "error_rate": 0.15,
                "duration_minutes": 8,
                "revenue_impact_per_min": 800,
            },
            "known_services": {"cdn-edge", "product-service", "image-service"},
            "tool_responses": {
                "query_logs": {
                    "cdn-edge": (
                        "2024-03-15T10:22:00Z INFO cache MISS ratio: 97% (5min window)\n"
                        "2024-03-15T10:20:11Z WARN mass cache invalidation — "
                        "2,100,000 keys purged by purge-job-prod\n"
                        "2024-03-15T10:20:10Z INFO purge pattern: /* (ALL keys)"
                    ),
                    "product-service": (
                        "2024-03-15T10:22:05Z WARN request queue depth: 12,400\n"
                        "2024-03-15T10:22:06Z ERROR timeout fetching from image-service (18s)\n"
                        "2024-03-15T10:22:07Z WARN worker pool 95% utilized"
                    ),
                    "image-service": (
                        "2024-03-15T10:22:00Z WARN CPU throttling engaged (95%)\n"
                        "2024-03-15T10:22:01Z ERROR worker pool exhausted — dropping requests\n"
                        "2024-03-15T10:22:02Z ERROR OOM risk: memory at 91%"
                    ),
                },
                "check_metrics": {
                    "cdn-edge": (
                        "Cache hit rate: 3% | Purge events (1h): 1 mass purge | "
                        "Origin RPS: 48,000 | Bandwidth: 890 Gbps"
                    ),
                    "product-service": (
                        "Origin RPS: 48,000 (normal: 1,200) | "
                        "Queue depth: 12,400 | Worker utilization: 95%"
                    ),
                    "image-service": (
                        "CPU: 95% | Memory: 91% | "
                        "Worker pool: 0 free / 200 | p99 latency: 18s"
                    ),
                },
                "check_dependencies": {
                    "cdn-edge": "Origin: product-service [OVERLOADED]",
                    "product-service": "Depends on: image-service [DEGRADED], postgres-db [OK]",
                    "image-service": "Depends on: object-storage [OK] — no upstream issues",
                },
                "check_recent_deploys": {
                    "cdn-edge": (
                        "Cronjob purge-job-prod updated 2h ago — "
                        "purge pattern changed from /images/* to /* (all keys)"
                    ),
                    "product-service": "Last deploy: 5 days ago — no recent changes",
                    "image-service": "Last deploy: 2 days ago — no recent changes",
                },
            },
            # P2 is correct; both P1 and P3 earn partial credit here.
            "correct_severity": "P2",
            "adjacent_severities": ["P1", "P3"],
        },
    ],

    # ── TASK 2: ROOT CAUSE ANALYSIS ─────────────────────────────────────────

    "root_cause_analysis": [

        # RCA-001: Analytics service OOM kills postgres-db
        {
            "scenario_id": "RCA-001",
            "description": (
                "postgres-db was OOM-killed by the Linux kernel after a runaway "
                "analytics query with no LIMIT clause consumed all available memory. "
                "All downstream services are now failing. analytics-service is the culprit."
            ),
            "incident_summary": (
                "Multiple services down: api-gateway 503, auth-service failing, "
                "order-service write failures. postgres-db restarting in a loop. "
                "Root cause is upstream — trace the failure chain."
            ),
            "alert": {
                "id": "ALT-RCA-001",
                "title": "CRITICAL: postgres-db crash loop — all dependents down",
                "severity_fired": "P1",
                "affected_services": [
                    "api-gateway", "auth-service", "order-service", "postgres-db",
                ],
                "symptoms": [
                    "postgres-db: 4 restarts in 12 minutes",
                    "auth-service: connection refused — 100% failure",
                    "order-service: all writes failing",
                    "api-gateway: 503 on all authenticated routes",
                    "analytics-service: last job failed 12 min ago",
                ],
                "error_rate": 0.95,
                "duration_minutes": 14,
            },
            # redis-session is a healthy decoy to test dependency tracing.
            "known_services": {
                "api-gateway", "auth-service", "order-service",
                "postgres-db", "analytics-service", "redis-session",
            },
            "tool_responses": {
                "query_logs": {
                    "postgres-db": (
                        "2024-03-16T02:11:00Z LOG database system shut down at 02:10:58\n"
                        "2024-03-16T02:10:58Z FATAL Out of Memory: Kill process 1847 (postgres) "
                        "score 982 or sacrifice child\n"
                        "2024-03-16T02:10:30Z LOG process 1847 query running 12min: "
                        "SELECT * FROM events JOIN user_sessions JOIN orders "
                        "JOIN products — no LIMIT clause, est 847M rows"
                    ),
                    "analytics-service": (
                        "2024-03-16T01:58:00Z INFO starting job: full_history_export\n"
                        "2024-03-16T01:58:01Z WARN query has no LIMIT — estimated 847M rows\n"
                        "2024-03-16T02:10:55Z ERROR job killed by OOM — full_history_export FAILED"
                    ),
                    "auth-service": (
                        "2024-03-16T02:11:05Z ERROR connect ECONNREFUSED postgres-db:5432\n"
                        "2024-03-16T02:11:06Z ERROR all retries exhausted — giving up"
                    ),
                    "api-gateway": (
                        "2024-03-16T02:11:10Z ERROR upstream auth-service: 503 Service Unavailable"
                    ),
                    "order-service": (
                        "2024-03-16T02:11:08Z ERROR pq: the database system is starting up"
                    ),
                    "redis-session": "No errors — operating normally at 99.2% hit rate",
                },
                "check_metrics": {
                    "postgres-db": (
                        "Memory: OOM killed (0% free at crash) | "
                        "Restarts: 4 in 12min | Status: RESTARTING"
                    ),
                    "analytics-service": (
                        "Memory at crash: 31.2GB / 32GB (97.5%) | "
                        "Job runtime: 12min 55s | Status: ERROR"
                    ),
                    "auth-service": "Connection success: 0% | DB: CRITICAL | Redis: OK",
                    "api-gateway": "503 rate: 95% | Auth dependency: DOWN",
                    "order-service": "Write success: 0% | DB: RESTARTING",
                    "redis-session": "Hit rate: 99.2% | Memory: 42% | Healthy",
                },
                "check_dependencies": {
                    "postgres-db": (
                        "Clients: auth-service, order-service, analytics-service, product-service"
                    ),
                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
                    "auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
                    "api-gateway": "Depends on: auth-service [DOWN]",
                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
                    "redis-session": "No DB dependency — standalone cache",
                },
                "check_recent_deploys": {
                    "analytics-service": (
                        "Deploy 6h ago: added full_history_export scheduled job — "
                        "runs daily at 02:00 UTC, no LIMIT on cross-table JOIN"
                    ),
                    "postgres-db": "No deploys in 3 weeks",
                    "auth-service": "No recent deploys",
                    "order-service": "No recent deploys",
                    "redis-session": "No recent deploys",
                },
                "check_service_status": {
                    "postgres-db": "RESTARTING | Uptime: 47s | Crash reason: OOM",
                    "analytics-service": "ERROR | Last job: full_history_export FAILED",
                    "auth-service": "DOWN | Waiting for postgres-db",
                    "api-gateway": "DEGRADED | 95% requests failing",
                    "order-service": "DOWN | Waiting for postgres-db",
                    "redis-session": "HEALTHY | All normal",
                },
            },
            # Grading target: the agent must name the originating service and mode.
            "correct_root_cause": {
                "service": "analytics-service",
                "failure_mode": "unbounded query OOM killing postgres-db",
            },
            # Plausible-but-wrong moves, with the explanation shown when penalized.
            "wrong_actions": {
                "restart_service:auth-service": "auth-service is a victim — DB must be fixed first",
                "restart_service:api-gateway": "api-gateway is downstream — won't help",
                "scale_service:postgres-db": "Scaling won't prevent OOM if the bad query runs again",
                "rollback_deploy:postgres-db": "postgres-db has no recent deploys",
            },
        },

        # RCA-002: BGP route withdrawal — AZ network partition
        {
            "scenario_id": "RCA-002",
            "description": (
                "A BGP route withdrawal isolated AZ-1 (where payment-service runs) "
                "from AZ-2 and AZ-3, causing 61% of checkout requests to fail. "
                "Services within AZ-1 are healthy — it is a pure network issue."
            ),
            "incident_summary": (
                "Checkout failure rate 61% — AZ-2 and AZ-3 cannot reach payment-service "
                "in AZ-1. AZ-1 users unaffected. fraud-detection-service also unreachable "
                "cross-AZ. Network infrastructure change 18 min ago."
            ),
            "alert": {
                "id": "ALT-RCA-002",
                "title": "HIGH: checkout failure 61% — cross-AZ connectivity loss",
                "severity_fired": "P2",
                "affected_services": [
                    "order-service", "payment-service", "fraud-detection-service",
                ],
                "symptoms": [
                    "checkout failure rate: 61% (AZ-2/AZ-3 only)",
                    "payment-service: unreachable from AZ-2, AZ-3",
                    "fraud-detection-service: timeout from AZ-2, AZ-3",
                    "AZ-1 users: 0% failure rate",
                    "Network: AZ-2/AZ-3 to AZ-1 routing broken",
                ],
                "error_rate": 0.61,
                "duration_minutes": 9,
            },
            # network-infra is the real culprit; DB and cache are healthy decoys.
            "known_services": {
                "order-service", "payment-service", "fraud-detection-service",
                "postgres-db", "redis-payment-cache", "network-infra",
            },
            "tool_responses": {
                "query_logs": {
                    "order-service": (
                        "2024-03-17T14:32:10Z ERROR connection timeout payment-service:8080 "
                        "(AZ-2 to AZ-1: no route to host)\n"
                        "2024-03-17T14:32:11Z ERROR fraud-detection-service: i/o timeout (30s)"
                    ),
                    "payment-service": (
                        "2024-03-17T14:31:58Z WARN health check from AZ-2 LB failing\n"
                        "2024-03-17T14:31:59Z INFO AZ-1 local traffic: all normal"
                    ),
                    "fraud-detection-service": (
                        "2024-03-17T14:32:00Z INFO AZ-1 requests: all normal\n"
                        "2024-03-17T14:32:01Z WARN cross-AZ health probes: 100% timeout"
                    ),
                    "network-infra": (
                        "2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.2.1 route withdrawal — "
                        "AZ-2 lost route to AZ-1 CIDR 10.0.1.0/24\n"
                        "2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.3.1 route withdrawal — "
                        "AZ-3 lost route to AZ-1 CIDR 10.0.1.0/24\n"
                        "2024-03-17T14:31:44Z INFO router config change applied — "
                        "BGP advertisement policy updated"
                    ),
                    "postgres-db": "Operating normally — no errors detected",
                    "redis-payment-cache": "Operating normally — AZ-1 traffic only, all healthy",
                },
                "check_metrics": {
                    "order-service": (
                        "AZ-2 checkout failure: 99% | AZ-3 checkout failure: 98% | "
                        "AZ-1 checkout failure: 0.2% (baseline)"
                    ),
                    "payment-service": (
                        "AZ-1 traffic: normal (100% success) | "
                        "AZ-2/AZ-3 inbound connections: 0 (blocked)"
                    ),
                    "fraud-detection-service": (
                        "AZ-1 processing: normal | "
                        "Cross-AZ health checks: 100% timeout"
                    ),
                    "network-infra": (
                        "BGP session AZ-2: WITHDRAWN | BGP session AZ-3: WITHDRAWN | "
                        "AZ-1 internal: all UP | Config change: 18min ago"
                    ),
                    "postgres-db": "All metrics normal — no anomalies",
                    "redis-payment-cache": "All metrics normal — AZ-1 only traffic",
                },
                "check_dependencies": {
                    "order-service": (
                        "Depends on: payment-service [PARTITIONED], "
                        "fraud-detection-service [PARTITIONED]"
                    ),
                    "payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
                    "fraud-detection-service": "Depends on: postgres-db [OK]",
                    "network-infra": "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN], AZ-1 [UP]",
                },
                "check_recent_deploys": {
                    "network-infra": (
                        "Router config change 18min ago — BGP route advertisement policy update: "
                        "inadvertently withdrew AZ-1 routes from AZ-2/AZ-3 peers"
                    ),
                    "payment-service": "No recent deploys",
                    "order-service": "No recent deploys",
                    "fraud-detection-service": "No recent deploys",
                },
                "check_service_status": {
                    "payment-service": "HEALTHY within AZ-1 | Cross-AZ: UNREACHABLE",
                    "order-service": "DEGRADED | AZ-2/AZ-3 instances failing",
                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN | AZ-1: UP",
                    "fraud-detection-service": "HEALTHY within AZ-1 | Cross-AZ: UNREACHABLE",
                    "postgres-db": "HEALTHY",
                    "redis-payment-cache": "HEALTHY",
                },
            },
            "correct_root_cause": {
                "service": "network-infra",
                "failure_mode": "BGP route withdrawal causing AZ network partition",
            },
            "wrong_actions": {
                "restart_service:payment-service": "payment-service is healthy — restarting won't fix routing",
                "restart_service:order-service": "order-service is a victim of the partition",
                "scale_service:payment-service": "Scaling won't fix a BGP routing issue",
                "clear_cache:redis-payment-cache": "Cache is healthy — not the cause",
            },
        },
    ],

    # ── TASK 3: REMEDIATION PLANNING ────────────────────────────────────────

    "remediation_planning": [

        # RP-001: Full OOM remediation — disable cron, restart cascade
        # (same incident as RCA-001, extended with mutating remediation actions)
        {
            "scenario_id": "RP-001",
            "description": (
                "Full remediation: analytics-service OOM-killed postgres-db with an "
                "unbounded query. Must disable the offending job, restart postgres, "
                "restore all downstream services, and document the resolution."
            ),
            "incident_summary": (
                "CRITICAL — postgres-db in OOM crash loop. auth-service, order-service, "
                "api-gateway all down. analytics-service caused it with unbounded query. "
                "Required actions: disable job, restart postgres, restore services, document."
            ),
            "alert": {
                "id": "ALT-RP-001",
                "title": "CRITICAL: postgres-db OOM crash loop — full stack down",
                "severity_fired": "P1",
                "affected_services": [
                    "postgres-db", "analytics-service",
                    "auth-service", "order-service", "api-gateway",
                ],
            },
            "known_services": {
                "postgres-db", "auth-service", "order-service",
                "api-gateway", "analytics-service",
            },
            "tool_responses": {
                "query_logs": {
                    "postgres-db": (
                        "FATAL: Out of Memory: Kill process (postgres) — "
                        "analytics query running 12min with no LIMIT"
                    ),
                    "analytics-service": (
                        "ERROR: full_history_export — unbounded JOIN, 847M rows, killed by OOM"
                    ),
                    "auth-service": "ERROR: connect ECONNREFUSED postgres-db:5432",
                    "order-service": "ERROR: pq: the database system is starting up",
                    "api-gateway": "ERROR: upstream auth-service 503",
                },
                "check_metrics": {
                    "postgres-db": "Memory: OOM | Restarts: 4 | Status: CRASH LOOP",
                    "analytics-service": "Memory spike: 31GB/32GB | Status: ERROR",
                    "auth-service": "Connection success: 0% | Waiting for DB",
                    "order-service": "Write success: 0% | Waiting for DB",
                    "api-gateway": "503 rate: 95% | Auth: DOWN",
                },
                "check_dependencies": {
                    "postgres-db": "Clients: auth-service, order-service, analytics-service",
                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
                    "auth-service": "Depends on: postgres-db [CRASH LOOP]",
                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
                },
                "check_recent_deploys": {
                    "analytics-service": (
                        "Deploy 6h ago: full_history_export job — "
                        "unbounded cross-table JOIN query"
                    ),
                    "postgres-db": "No recent changes",
                },
                "check_service_status": {
                    "postgres-db": "CRASH LOOP | OOM kill | Uptime: 47s",
                    "analytics-service": "ERROR | Last job failed",
                    "auth-service": "DOWN",
                    "order-service": "DOWN",
                    "api-gateway": "DEGRADED",
                },
            },
            # Canned success outputs for mutating actions, keyed action -> target.
            "remediation_data": {
                "disable_feature_flag": {
                    "full_history_export": (
                        "Cron job full_history_export DISABLED — "
                        "no more unbounded queries will run"
                    ),
                },
                "restart_service": {
                    "postgres-db": (
                        "postgres-db restarted cleanly — "
                        "accepting connections (12/500 active)"
                    ),
                    "analytics-service": (
                        "analytics-service restarted — no active queries"
                    ),
                    "auth-service": "auth-service restarted — reconnected to postgres-db OK",
                    "order-service": "order-service restarted — writes resuming normally",
                },
                "execute_runbook_step": {
                    "verify_db_health": (
                        "postgres-db: connections 12/500, CPU 12%, Memory 34% — healthy"
                    ),
                    "check_service_recovery": (
                        "auth-service OK | order-service OK | api-gateway OK"
                    ),
                },
            },
            # Expected order: stop the cause first, then restart DB, then dependents.
            "correct_remediation_sequence": [
                "disable_feature_flag:full_history_export",
                "restart_service:analytics-service",
                "restart_service:postgres-db",
                "restart_service:auth-service",
                "restart_service:order-service",
            ],
            "wrong_actions": {
                "rollback_deploy:postgres-db": (
                    "postgres-db has no recent deploy to roll back"
                ),
                "scale_service:postgres-db": (
                    "Scaling won't prevent the OOM query from running again"
                ),
                "restart_service:api-gateway": (
                    "api-gateway is downstream — fix the DB first"
                ),
            },
            # Case-insensitive keywords graders look for in the resolution text.
            "resolution_keywords": [
                "analytics", "oom", "memory", "postgres", "query",
                "full_history_export", "disabled", "restarted", "recovered",
            ],
        },

        # RP-002: Full BGP remediation — restore routes, rollback config, verify
        # (same incident as RCA-002, extended with mutating remediation actions)
        {
            "scenario_id": "RP-002",
            "description": (
                "Full remediation: BGP route withdrawal partitioned AZ-2/AZ-3 from "
                "AZ-1 where payment-service runs. Must restore BGP routes, roll back "
                "the router config change, verify checkout recovery, and document."
            ),
            "incident_summary": (
                "P2 — BGP partition isolating payment-service from 61% of users. "
                "Router config change 18min ago is the cause. "
                "Required: restore BGP routes, rollback network config, verify recovery."
            ),
            "alert": {
                "id": "ALT-RP-002",
                "title": "HIGH: checkout 61% failure — BGP AZ partition",
                "severity_fired": "P2",
                "affected_services": ["network-infra", "order-service", "payment-service"],
            },
            "known_services": {
                "network-infra", "order-service", "payment-service",
                "fraud-detection-service", "postgres-db",
            },
            "tool_responses": {
                "query_logs": {
                    "network-infra": (
                        "CRITICAL: BGP route withdrawal — "
                        "AZ-2/AZ-3 lost route to AZ-1 10.0.1.0/24\n"
                        "Router config change 18min ago: BGP policy updated"
                    ),
                    "order-service": (
                        "ERROR: connection timeout payment-service — no route to host"
                    ),
                    "payment-service": (
                        "INFO: AZ-1 traffic normal | "
                        "WARN: cross-AZ health checks failing"
                    ),
                    "fraud-detection-service": (
                        "WARN: cross-AZ health probes 100% timeout | AZ-1 traffic: normal"
                    ),
                    "postgres-db": "Operating normally",
                },
                "check_metrics": {
                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN | AZ-1: UP",
                    "order-service": "AZ-2 failure: 99% | AZ-1 failure: 0.2%",
                    "payment-service": "AZ-1: normal | Cross-AZ inbound: 0",
                    "fraud-detection-service": "AZ-1: normal | Cross-AZ: 0",
                    "postgres-db": "All normal",
                },
                "check_dependencies": {
                    "order-service": "Depends on: payment-service [PARTITIONED]",
                    "payment-service": "Depends on: postgres-db [OK]",
                    "network-infra": "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN]",
                },
                "check_recent_deploys": {
                    "network-infra": (
                        "Config change 18min ago — BGP policy update "
                        "accidentally withdrew AZ-1 routes"
                    ),
                    "payment-service": "No recent deploys",
                    "order-service": "No recent deploys",
                },
                "check_service_status": {
                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
                    "payment-service": "HEALTHY (AZ-1) | Cross-AZ: UNREACHABLE",
                    "order-service": "DEGRADED",
                },
            },
            "remediation_data": {
                "rollback_deploy": {
                    "network-infra": (
                        "Router config rolled back — "
                        "BGP advertisement policy restored to previous version"
                    ),
                },
                "execute_runbook_step": {
                    "restore_bgp_routes": (
                        "BGP routes restored — AZ-2/AZ-3 can now reach AZ-1 10.0.1.0/24"
                    ),
                    "verify_checkout_recovery": (
                        "Checkout failure rate: 0.3% — incident fully resolved"
                    ),
                },
            },
            "correct_remediation_sequence": [
                "execute_runbook_step:restore_bgp_routes",
                "rollback_deploy:network-infra",
                "execute_runbook_step:verify_checkout_recovery",
            ],
            # NOTE(review): "clear_cache" here has no ":target" suffix, unlike every
            # other wrong_actions key — confirm the grader matches bare action names.
            "wrong_actions": {
                "restart_service:payment-service": "payment-service is healthy — network is the issue",
                "scale_service:payment-service": "Scaling won't fix BGP routing",
                "restart_service:order-service": "order-service is a victim",
                "clear_cache": "Cache is unrelated to network routing",
            },
            "resolution_keywords": [
                "bgp", "network", "route", "rollback", "partition",
                "restored", "az-1", "az-2", "az-3", "checkout", "withdrawal",
            ],
        },
    ],
}
|
| 740 |
+
|
| 741 |
+
|
| 742 |
+
# ---------------------------------------------------------------------------
|
| 743 |
+
# Public API
|
| 744 |
+
# ---------------------------------------------------------------------------
|
| 745 |
+
|
| 746 |
+
def get_task(task_id: str) -> dict:
    """Return the metadata dict for *task_id* from ALL_TASKS.

    Raises:
        ValueError: if *task_id* is not a registered task id.
    """
    task = ALL_TASKS.get(task_id)
    if task is None:
        raise ValueError(
            f"Unknown task_id '{task_id}'. "
            f"Valid: {list(ALL_TASKS.keys())}"
        )
    return task
|
| 753 |
+
|
| 754 |
+
|
| 755 |
+
def get_scenario(task_id: str, index: int) -> dict:
    """Return scenario number *index* for *task_id* from SCENARIOS.

    Raises:
        ValueError: if *task_id* has no scenarios, or *index* is out of range.
    """
    try:
        available = SCENARIOS[task_id]
    except KeyError:
        raise ValueError(f"No scenarios for task_id '{task_id}'.") from None
    if not 0 <= index < len(available):
        raise ValueError(
            f"Scenario index {index} out of range for task '{task_id}' "
            f"(valid: 0–{len(available) - 1})"
        )
    return available[index]
|
| 765 |
+
|
| 766 |
+
|
| 767 |
+
def list_tasks() -> list:
    """Return every task's metadata dict, in registration order."""
    return [*ALL_TASKS.values()]
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|