Spaces:
Sleeping
Sleeping
Commit Β·
d5fc8a7
0
Parent(s):
feat: SRE incident response OpenEnv v0.1.0
Browse files- .gitignore +20 -0
- Dockerfile +27 -0
- README.md +155 -0
- graders.py +294 -0
- inference.py +204 -0
- openenv.yaml +53 -0
- requirements.txt +6 -0
- server/__init__.py +0 -0
- server/app.py +196 -0
- server/environment.py +309 -0
- server/models.py +71 -0
- tasks.py +664 -0
.gitignore
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
.Python
|
| 5 |
+
*.egg-info/
|
| 6 |
+
dist/
|
| 7 |
+
build/
|
| 8 |
+
.env
|
| 9 |
+
.venv/
|
| 10 |
+
venv/
|
| 11 |
+
env/
|
| 12 |
+
*.log
|
| 13 |
+
.DS_Store
|
| 14 |
+
.pytest_cache/
|
| 15 |
+
.mypy_cache/
|
| 16 |
+
.ruff_cache/
|
| 17 |
+
tests/
|
| 18 |
+
conftest.py
|
| 19 |
+
pyproject.toml
|
| 20 |
+
*.sqlite
|
Dockerfile
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
# Set working directory
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install Python dependencies first (maximizes Docker layer cache)
|
| 7 |
+
COPY requirements.txt .
|
| 8 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 9 |
+
|
| 10 |
+
# Copy application code
|
| 11 |
+
COPY tasks.py .
|
| 12 |
+
COPY graders.py .
|
| 13 |
+
COPY openenv.yaml .
|
| 14 |
+
COPY inference.py .
|
| 15 |
+
COPY server/ ./server/
|
| 16 |
+
|
| 17 |
+
# Ensure server is a Python package
|
| 18 |
+
RUN touch server/__init__.py
|
| 19 |
+
|
| 20 |
+
# HuggingFace Spaces requires port 7860
|
| 21 |
+
EXPOSE 7860
|
| 22 |
+
|
| 23 |
+
# Add project root to Python path
|
| 24 |
+
ENV PYTHONPATH=/app
|
| 25 |
+
|
| 26 |
+
# Start the FastAPI server
|
| 27 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: SRE Incident Response OpenEnv
|
| 3 |
+
emoji: π¨
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
tags:
|
| 9 |
+
- openenv
|
| 10 |
+
- sre
|
| 11 |
+
- devops
|
| 12 |
+
- incident-response
|
| 13 |
+
- real-world
|
| 14 |
+
- agentic
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
# SRE Incident Response β OpenEnv Environment
|
| 18 |
+
|
| 19 |
+
An OpenEnv environment for training and evaluating AI agents on Site Reliability Engineering (SRE) tasks. Agents handle real production incident scenarios: triaging alerts, identifying root causes through log/metric correlation, and executing remediation runbooks to resolve cascading failures.
|
| 20 |
+
|
| 21 |
+
## Why This Environment
|
| 22 |
+
|
| 23 |
+
Every cloud company employs SREs who respond to production incidents under time pressure. This environment simulates the exact decision loop an on-call SRE follows:
|
| 24 |
+
|
| 25 |
+
1. **Triage** β Read alert payload, assess blast radius, classify severity (P1βP4)
|
| 26 |
+
2. **Investigate** β Query logs, metrics, dependency graphs, recent deploys
|
| 27 |
+
3. **Diagnose** β Correlate signals across services to find the root cause
|
| 28 |
+
4. **Remediate** β Execute the correct runbook steps in the right order
|
| 29 |
+
5. **Document** β Submit a resolution summary for post-incident review
|
| 30 |
+
|
| 31 |
+
Scenarios include cascading DB failures, CDN cache storms, OOM kills, and BGP network partitions β all modeled from real production incident patterns.
|
| 32 |
+
|
| 33 |
+
## Tasks
|
| 34 |
+
|
| 35 |
+
| Task ID | Difficulty | Max Steps | Description |
|
| 36 |
+
|---|---|---|---|
|
| 37 |
+
| `alert_classification` | Easy | 3 | Classify alert severity (P1βP4) from metrics and symptoms |
|
| 38 |
+
| `root_cause_analysis` | Medium | 10 | Trace logs/metrics/deps to find root cause service + failure mode |
|
| 39 |
+
| `remediation_planning` | Hard | 15 | Diagnose, remediate, and document full incident resolution |
|
| 40 |
+
|
| 41 |
+
Each task has 2 scenarios:
|
| 42 |
+
|
| 43 |
+
| Scenario | Incident Type |
|
| 44 |
+
|---|---|
|
| 45 |
+
| AC-001 | Cascading DB connection pool exhaustion (postgres β auth β api-gateway) |
|
| 46 |
+
| AC-002 | CDN cache invalidation storm (misconfigured purge β 40Γ origin traffic) |
|
| 47 |
+
| RCA-001 | Postgres OOM kill by runaway analytics query |
|
| 48 |
+
| RCA-002 | BGP route withdrawal β AZ network partition β 61% checkout failures |
|
| 49 |
+
| RP-001 | Full OOM remediation (stop job β restart DB β restore services) |
|
| 50 |
+
| RP-002 | Full BGP remediation (restore routes β roll back config β verify recovery) |
|
| 51 |
+
|
| 52 |
+
## Action Space
|
| 53 |
+
|
| 54 |
+
**Diagnostic:**
|
| 55 |
+
```json
|
| 56 |
+
{"action_type": "query_logs", "parameters": {"service": "postgres-db"}}
|
| 57 |
+
{"action_type": "check_metrics", "parameters": {"service": "auth-service"}}
|
| 58 |
+
{"action_type": "check_dependencies", "parameters": {"service": "api-gateway"}}
|
| 59 |
+
{"action_type": "check_recent_deploys", "parameters": {"service": "analytics-service"}}
|
| 60 |
+
{"action_type": "check_service_status", "parameters": {"service": "payment-service"}}
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
**Remediation:**
|
| 64 |
+
```json
|
| 65 |
+
{"action_type": "restart_service", "parameters": {"service": "postgres-db"}}
|
| 66 |
+
{"action_type": "rollback_deploy", "parameters": {"service": "network-infra", "target_version": "previous"}}
|
| 67 |
+
{"action_type": "scale_service", "parameters": {"service": "image-service", "replicas": 10}}
|
| 68 |
+
{"action_type": "disable_feature_flag", "parameters": {"flag": "full_history_export"}}
|
| 69 |
+
{"action_type": "execute_runbook_step", "parameters": {"runbook_action": "restore_bgp_routes"}}
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
**Submission:**
|
| 73 |
+
```json
|
| 74 |
+
{"action_type": "submit_severity", "parameters": {"severity": "P1", "service": "postgres-db"}}
|
| 75 |
+
{"action_type": "submit_root_cause", "parameters": {"service": "analytics-service", "failure_mode": "unbounded query OOM"}}
|
| 76 |
+
{"action_type": "submit_resolution", "parameters": {"summary": "Disabled analytics cron job, restarted postgres-db..."}}
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
## Observation Space
|
| 80 |
+
|
| 81 |
+
Each step returns:
|
| 82 |
+
|
| 83 |
+
| Field | Type | Description |
|
| 84 |
+
|---|---|---|
|
| 85 |
+
| `episode_id` | string | Unique episode UUID |
|
| 86 |
+
| `task_id` | string | Active task |
|
| 87 |
+
| `scenario_id` | string | Scenario identifier (e.g. `AC-001`) |
|
| 88 |
+
| `step_count` / `max_steps` | int | Current step and budget |
|
| 89 |
+
| `incident_summary` | string | Plain-text incident description |
|
| 90 |
+
| `alert` | dict | Alert payload with severity, affected services, symptoms |
|
| 91 |
+
| `available_actions` | list | Valid action types for this task |
|
| 92 |
+
| `queried_data` | dict | All tool responses gathered so far |
|
| 93 |
+
| `cumulative_reward` | float | Running reward total |
|
| 94 |
+
| `done` | bool | Episode terminal flag |
|
| 95 |
+
| `feedback` | string | Per-step feedback string |
|
| 96 |
+
|
| 97 |
+
## Reward Function
|
| 98 |
+
|
| 99 |
+
| Event | Reward |
|
| 100 |
+
|---|---|
|
| 101 |
+
| Query known service (first time) | +0.05 |
|
| 102 |
+
| Query known service (repeat) | +0.01 |
|
| 103 |
+
| Query unknown service | -0.05 |
|
| 104 |
+
| Correct remediation action | +0.10 |
|
| 105 |
+
| Wrong remediation action | -0.10 |
|
| 106 |
+
| Step past halfway (non-submit) | -0.02 |
|
| 107 |
+
| Timeout without submission | -0.10 |
|
| 108 |
+
| Grader score (on terminal step) | 0.0β1.0 |
|
| 109 |
+
|
| 110 |
+
## API Endpoints
|
| 111 |
+
|
| 112 |
+
| Method | Path | Description |
|
| 113 |
+
|---|---|---|
|
| 114 |
+
| GET | `/health` | `{"status": "ok", "version": "0.1.0"}` |
|
| 115 |
+
| POST | `/reset?task_id=...&scenario_index=...` | Start new episode |
|
| 116 |
+
| POST | `/step` | Submit action (JSON body) |
|
| 117 |
+
| GET | `/state` | Full current episode state |
|
| 118 |
+
| GET | `/tasks` | All tasks with schemas |
|
| 119 |
+
| GET | `/grader` | Score current episode (0.0β1.0) |
|
| 120 |
+
| POST | `/baseline` | Run inference.py, return scores |
|
| 121 |
+
|
| 122 |
+
## Setup
|
| 123 |
+
|
| 124 |
+
```bash
|
| 125 |
+
# Local development
|
| 126 |
+
pip install -r requirements.txt
|
| 127 |
+
uvicorn server.app:app --host 0.0.0.0 --port 7860
|
| 128 |
+
|
| 129 |
+
# Docker
|
| 130 |
+
docker build -t sre-incident-env .
|
| 131 |
+
docker run -p 7860:7860 \
|
| 132 |
+
-e API_BASE_URL="https://api.groq.com/openai/v1" \
|
| 133 |
+
-e MODEL_NAME="llama-3.1-8b-instant" \
|
| 134 |
+
-e HF_TOKEN="your_api_key" \
|
| 135 |
+
sre-incident-env
|
| 136 |
+
|
| 137 |
+
# Run baseline inference
|
| 138 |
+
export API_BASE_URL="https://api.groq.com/openai/v1"
|
| 139 |
+
export MODEL_NAME="llama-3.1-8b-instant"
|
| 140 |
+
export HF_TOKEN="your_groq_key"
|
| 141 |
+
python inference.py
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
## Baseline Scores
|
| 145 |
+
|
| 146 |
+
Using `llama-3.1-8b-instant` via Groq:
|
| 147 |
+
|
| 148 |
+
| Task | Score |
|
| 149 |
+
|---|---|
|
| 150 |
+
| `alert_classification` | ~0.75 |
|
| 151 |
+
| `root_cause_analysis` | ~0.35 |
|
| 152 |
+
| `remediation_planning` | ~0.20 |
|
| 153 |
+
| **overall** | **~0.43** |
|
| 154 |
+
|
| 155 |
+
*Run `python inference.py` to reproduce.*
|
graders.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders.py β Deterministic graders for all 3 SRE Incident Response tasks.
|
| 3 |
+
|
| 4 |
+
Public API:
|
| 5 |
+
grade(task_id, state, scenario) -> {"total": float, "breakdown": dict, "feedback": str}
|
| 6 |
+
|
| 7 |
+
All scores are in [0.0, 1.0]. Graders are deterministic and reproducible.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def grade(task_id: str, state: dict, scenario: dict) -> dict:
    """
    Entry point. Routes to the correct task grader.

    Args:
        task_id: One of alert_classification, root_cause_analysis, remediation_planning
        state: Current episode state dict from IncidentEnvironment
        scenario: The scenario dict that was loaded for this episode

    Returns:
        {
            "total": float in [0.0, 1.0],
            "breakdown": dict of sub-scores,
            "feedback": human-readable string
        }
    """
    dispatch = {
        "alert_classification": _grade_alert_classification,
        "root_cause_analysis": _grade_root_cause_analysis,
        "remediation_planning": _grade_remediation_planning,
    }
    grader = dispatch.get(task_id)
    if grader is None:
        return {"total": 0.0, "breakdown": {}, "feedback": f"Unknown task_id: {task_id}"}
    return grader(state, scenario)
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ββ Task 1: Alert Classification ββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
|
| 41 |
+
def _grade_alert_classification(state: dict, scenario: dict) -> dict:
|
| 42 |
+
"""
|
| 43 |
+
Scoring:
|
| 44 |
+
1.0 β exact severity match
|
| 45 |
+
0.5 β adjacent severity (one level off)
|
| 46 |
+
0.25 β two levels off
|
| 47 |
+
0.0 β opposite end or no submission
|
| 48 |
+
"""
|
| 49 |
+
action_history = state.get("action_history", [])
|
| 50 |
+
correct = scenario.get("correct_severity", "P1")
|
| 51 |
+
adjacent = scenario.get("adjacent_severities", [])
|
| 52 |
+
|
| 53 |
+
submitted_severity = None
|
| 54 |
+
for action in action_history:
|
| 55 |
+
if action.get("action_type") == "submit_severity":
|
| 56 |
+
submitted_severity = (
|
| 57 |
+
action.get("parameters", {}).get("severity", "")
|
| 58 |
+
.upper()
|
| 59 |
+
.strip()
|
| 60 |
+
)
|
| 61 |
+
break
|
| 62 |
+
|
| 63 |
+
if not submitted_severity:
|
| 64 |
+
return {
|
| 65 |
+
"total": 0.0,
|
| 66 |
+
"breakdown": {"severity_match": 0.0, "submitted": False},
|
| 67 |
+
"feedback": "No severity submitted β score 0.0",
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
severity_order = ["P1", "P2", "P3", "P4"]
|
| 71 |
+
|
| 72 |
+
if submitted_severity == correct:
|
| 73 |
+
score = 1.0
|
| 74 |
+
feedback = f"Exact match: {submitted_severity} == {correct}"
|
| 75 |
+
elif submitted_severity in adjacent:
|
| 76 |
+
score = 0.5
|
| 77 |
+
feedback = f"Adjacent severity: submitted {submitted_severity}, correct {correct}"
|
| 78 |
+
else:
|
| 79 |
+
# Distance-based fallback
|
| 80 |
+
try:
|
| 81 |
+
dist = abs(severity_order.index(submitted_severity) - severity_order.index(correct))
|
| 82 |
+
except ValueError:
|
| 83 |
+
dist = 4
|
| 84 |
+
if dist == 2:
|
| 85 |
+
score = 0.25
|
| 86 |
+
else:
|
| 87 |
+
score = 0.0
|
| 88 |
+
feedback = f"Wrong severity: submitted {submitted_severity}, correct {correct} (dist={dist})"
|
| 89 |
+
|
| 90 |
+
return {
|
| 91 |
+
"total": score,
|
| 92 |
+
"breakdown": {
|
| 93 |
+
"submitted_severity": submitted_severity,
|
| 94 |
+
"correct_severity": correct,
|
| 95 |
+
"severity_match": score,
|
| 96 |
+
},
|
| 97 |
+
"feedback": feedback,
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ββ Task 2: Root Cause Analysis βββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
+
|
| 103 |
+
def _grade_root_cause_analysis(state: dict, scenario: dict) -> dict:
|
| 104 |
+
"""
|
| 105 |
+
Scoring:
|
| 106 |
+
Base score (0.0β0.6):
|
| 107 |
+
0.6 β correct service AND correct failure_mode
|
| 108 |
+
0.35 β correct service only
|
| 109 |
+
0.0 β wrong service
|
| 110 |
+
Efficiency bonus (0.0β0.4):
|
| 111 |
+
Based on how many unique relevant services were queried before submitting.
|
| 112 |
+
More targeted = higher bonus (penalises random querying).
|
| 113 |
+
"""
|
| 114 |
+
action_history = state.get("action_history", [])
|
| 115 |
+
correct_rc = scenario.get("correct_root_cause", {})
|
| 116 |
+
correct_service = correct_rc.get("service", "").lower().strip()
|
| 117 |
+
correct_mode = correct_rc.get("failure_mode", "").lower().strip()
|
| 118 |
+
known_services = {s.lower() for s in scenario.get("known_services", set())}
|
| 119 |
+
|
| 120 |
+
# Find the submit_root_cause action
|
| 121 |
+
submitted_service = ""
|
| 122 |
+
submitted_mode = ""
|
| 123 |
+
submit_step = None
|
| 124 |
+
for action in action_history:
|
| 125 |
+
if action.get("action_type") == "submit_root_cause":
|
| 126 |
+
params = action.get("parameters", {})
|
| 127 |
+
submitted_service = params.get("service", "").lower().strip()
|
| 128 |
+
submitted_mode = params.get("failure_mode", "").lower().strip()
|
| 129 |
+
submit_step = action.get("step", len(action_history))
|
| 130 |
+
break
|
| 131 |
+
|
| 132 |
+
if not submitted_service:
|
| 133 |
+
return {
|
| 134 |
+
"total": 0.0,
|
| 135 |
+
"breakdown": {"base": 0.0, "efficiency": 0.0, "submitted": False},
|
| 136 |
+
"feedback": "No root cause submitted β score 0.0",
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
# Base score
|
| 140 |
+
service_match = submitted_service == correct_service
|
| 141 |
+
mode_keywords = [w for w in correct_mode.split() if len(w) > 3]
|
| 142 |
+
mode_match = service_match and any(
|
| 143 |
+
kw in submitted_mode for kw in mode_keywords
|
| 144 |
+
) if mode_keywords else service_match
|
| 145 |
+
|
| 146 |
+
if mode_match:
|
| 147 |
+
base = 0.6
|
| 148 |
+
base_feedback = f"Correct service ({submitted_service}) + failure mode matched"
|
| 149 |
+
elif service_match:
|
| 150 |
+
base = 0.35
|
| 151 |
+
base_feedback = f"Correct service ({submitted_service}) but failure mode unclear"
|
| 152 |
+
else:
|
| 153 |
+
base = 0.0
|
| 154 |
+
base_feedback = f"Wrong service: submitted '{submitted_service}', correct '{correct_service}'"
|
| 155 |
+
|
| 156 |
+
# Efficiency bonus β only awarded if service was correct
|
| 157 |
+
efficiency = 0.0
|
| 158 |
+
if service_match and submit_step is not None:
|
| 159 |
+
diagnostic_actions = {"query_logs", "check_metrics", "check_dependencies",
|
| 160 |
+
"check_recent_deploys", "check_service_status"}
|
| 161 |
+
queried = {
|
| 162 |
+
a.get("parameters", {}).get("service", "").lower()
|
| 163 |
+
for a in action_history[:submit_step]
|
| 164 |
+
if a.get("action_type") in diagnostic_actions
|
| 165 |
+
}
|
| 166 |
+
relevant_queried = queried & known_services
|
| 167 |
+
# Reward for querying relevant services efficiently
|
| 168 |
+
# Full bonus for querying 2-3 key services; less for spraying all services
|
| 169 |
+
total_queries = sum(
|
| 170 |
+
1 for a in action_history[:submit_step]
|
| 171 |
+
if a.get("action_type") in diagnostic_actions
|
| 172 |
+
)
|
| 173 |
+
if total_queries > 0:
|
| 174 |
+
precision = len(relevant_queried) / max(total_queries, 1)
|
| 175 |
+
efficiency = round(min(0.4, precision * 0.4 + min(len(relevant_queried), 3) * 0.05), 4)
|
| 176 |
+
|
| 177 |
+
total = round(min(1.0, base + efficiency), 4)
|
| 178 |
+
|
| 179 |
+
return {
|
| 180 |
+
"total": total,
|
| 181 |
+
"breakdown": {
|
| 182 |
+
"base": base,
|
| 183 |
+
"efficiency_bonus": efficiency,
|
| 184 |
+
"service_match": service_match,
|
| 185 |
+
"mode_match": mode_match,
|
| 186 |
+
"submitted_service": submitted_service,
|
| 187 |
+
"correct_service": correct_service,
|
| 188 |
+
},
|
| 189 |
+
"feedback": f"{base_feedback} | efficiency bonus: {efficiency:.2f} | total: {total:.2f}",
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
# ββ Task 3: Remediation Planning ββββββββββββββββββββββββββββββββββββββββββββ
|
| 194 |
+
|
| 195 |
+
def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
|
| 196 |
+
"""
|
| 197 |
+
Scoring:
|
| 198 |
+
Resolution base (0.0 or 0.6):
|
| 199 |
+
0.6 β submit_resolution with non-empty summary after β₯1 investigation action
|
| 200 |
+
Efficiency bonus (0.0β0.3):
|
| 201 |
+
Fraction of correct remediation actions executed (from correct_remediation_sequence)
|
| 202 |
+
Wrong action penalty (up to -0.15):
|
| 203 |
+
-0.05 per wrong action (capped at -0.15)
|
| 204 |
+
Summary quality bonus (0.0β0.1):
|
| 205 |
+
+0.1 if summary contains β₯3 resolution keywords from scenario
|
| 206 |
+
"""
|
| 207 |
+
action_history = state.get("action_history", [])
|
| 208 |
+
correct_seq = scenario.get("correct_remediation_sequence", [])
|
| 209 |
+
wrong_actions_map = scenario.get("wrong_actions", {})
|
| 210 |
+
resolution_keywords = scenario.get("resolution_keywords", [])
|
| 211 |
+
|
| 212 |
+
diagnostic_actions = {"query_logs", "check_metrics", "check_dependencies",
|
| 213 |
+
"check_recent_deploys", "check_service_status"}
|
| 214 |
+
remediation_actions = {"restart_service", "rollback_deploy", "scale_service",
|
| 215 |
+
"disable_feature_flag", "clear_cache", "execute_runbook_step"}
|
| 216 |
+
|
| 217 |
+
# Find submit_resolution
|
| 218 |
+
submitted_summary = ""
|
| 219 |
+
for action in action_history:
|
| 220 |
+
if action.get("action_type") == "submit_resolution":
|
| 221 |
+
submitted_summary = action.get("parameters", {}).get("summary", "")
|
| 222 |
+
break
|
| 223 |
+
|
| 224 |
+
investigation_count = sum(
|
| 225 |
+
1 for a in action_history
|
| 226 |
+
if a.get("action_type") in diagnostic_actions | remediation_actions
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
if not submitted_summary or investigation_count < 1:
|
| 230 |
+
return {
|
| 231 |
+
"total": 0.0,
|
| 232 |
+
"breakdown": {"base": 0.0, "efficiency": 0.0, "penalty": 0.0, "summary": 0.0},
|
| 233 |
+
"feedback": "No resolution submitted or no investigation β score 0.0",
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
base = 0.6
|
| 237 |
+
|
| 238 |
+
# Efficiency bonus β which correct actions were executed?
|
| 239 |
+
executed_action_keys = set()
|
| 240 |
+
for a in action_history:
|
| 241 |
+
at = a.get("action_type", "")
|
| 242 |
+
svc = a.get("parameters", {}).get("service", "")
|
| 243 |
+
flag = a.get("parameters", {}).get("flag", "")
|
| 244 |
+
step_action = a.get("parameters", {}).get("runbook_action", "")
|
| 245 |
+
target = a.get("parameters", {}).get("target", "")
|
| 246 |
+
# Build key variants that match correct_remediation_sequence format
|
| 247 |
+
executed_action_keys.add(at)
|
| 248 |
+
if svc:
|
| 249 |
+
executed_action_keys.add(f"{at}:{svc}")
|
| 250 |
+
if flag:
|
| 251 |
+
executed_action_keys.add(f"{at}:{flag}")
|
| 252 |
+
if step_action:
|
| 253 |
+
executed_action_keys.add(f"execute_runbook_step:{step_action}")
|
| 254 |
+
if target:
|
| 255 |
+
executed_action_keys.add(f"execute_runbook_step:{target}")
|
| 256 |
+
|
| 257 |
+
matched = sum(1 for key in correct_seq if key in executed_action_keys)
|
| 258 |
+
efficiency = round((matched / len(correct_seq)) * 0.3, 4) if correct_seq else 0.0
|
| 259 |
+
|
| 260 |
+
# Wrong action penalty
|
| 261 |
+
wrong_count = 0
|
| 262 |
+
for a in action_history:
|
| 263 |
+
at = a.get("action_type", "")
|
| 264 |
+
svc = a.get("parameters", {}).get("service", "")
|
| 265 |
+
key1 = at
|
| 266 |
+
key2 = f"{at}:{svc}"
|
| 267 |
+
if key1 in wrong_actions_map or key2 in wrong_actions_map:
|
| 268 |
+
wrong_count += 1
|
| 269 |
+
penalty = round(min(0.15, wrong_count * 0.05), 4)
|
| 270 |
+
|
| 271 |
+
# Summary quality bonus
|
| 272 |
+
summary_lower = submitted_summary.lower()
|
| 273 |
+
keyword_hits = sum(1 for kw in resolution_keywords if kw in summary_lower)
|
| 274 |
+
summary_bonus = 0.1 if keyword_hits >= 3 else 0.05 if keyword_hits >= 1 else 0.0
|
| 275 |
+
|
| 276 |
+
total = round(max(0.0, min(1.0, base + efficiency - penalty + summary_bonus)), 4)
|
| 277 |
+
|
| 278 |
+
return {
|
| 279 |
+
"total": total,
|
| 280 |
+
"breakdown": {
|
| 281 |
+
"base": base,
|
| 282 |
+
"efficiency_bonus": efficiency,
|
| 283 |
+
"wrong_action_penalty": -penalty,
|
| 284 |
+
"summary_bonus": summary_bonus,
|
| 285 |
+
"correct_actions_matched": matched,
|
| 286 |
+
"correct_actions_total": len(correct_seq),
|
| 287 |
+
"wrong_actions_count": wrong_count,
|
| 288 |
+
"summary_keywords_hit": keyword_hits,
|
| 289 |
+
},
|
| 290 |
+
"feedback": (
|
| 291 |
+
f"base={base} | efficiency={efficiency:.2f} ({matched}/{len(correct_seq)} correct actions) "
|
| 292 |
+
f"| penalty=-{penalty:.2f} | summary_bonus={summary_bonus:.2f} | total={total:.2f}"
|
| 293 |
+
),
|
| 294 |
+
}
|
inference.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
inference.py β OpenEnv Hackathon baseline inference script.
|
| 3 |
+
|
| 4 |
+
Required env vars (set in HF Space secrets or .env):
|
| 5 |
+
API_BASE_URL OpenAI-compatible LLM endpoint
|
| 6 |
+
MODEL_NAME Model identifier
|
| 7 |
+
HF_TOKEN API key for the LLM endpoint
|
| 8 |
+
|
| 9 |
+
Runs the agent against all 3 tasks Γ 2 scenarios each.
|
| 10 |
+
Final stdout line is valid JSON β required by the hackathon validator.
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
export API_BASE_URL="https://api.groq.com/openai/v1"
|
| 14 |
+
export MODEL_NAME="llama-3.1-8b-instant"
|
| 15 |
+
export HF_TOKEN="gsk_your_key_here"
|
| 16 |
+
python inference.py
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import json
|
| 22 |
+
import os
|
| 23 |
+
import sys
|
| 24 |
+
|
| 25 |
+
import requests
|
| 26 |
+
from openai import OpenAI
|
| 27 |
+
from dotenv import load_dotenv
|
| 28 |
+
|
| 29 |
+
load_dotenv()
|
| 30 |
+
|
| 31 |
+
# ββ Config from env vars (hackathon required names) ββββββββββββββββββββββββββ
|
| 32 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1")
|
| 33 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "llama-3.1-8b-instant")
|
| 34 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 35 |
+
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:7860")
|
| 36 |
+
|
| 37 |
+
if not HF_TOKEN:
|
| 38 |
+
print("[WARN] HF_TOKEN is not set β LLM calls will fail.", file=sys.stderr)
|
| 39 |
+
|
| 40 |
+
client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
|
| 41 |
+
|
| 42 |
+
# ββ System prompt βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
+
SYSTEM_PROMPT = """You are an expert Site Reliability Engineer (SRE) responding to a live production incident.
|
| 44 |
+
|
| 45 |
+
You receive an incident observation as JSON. Respond with ONLY a single valid JSON action object β no markdown, no explanation.
|
| 46 |
+
|
| 47 |
+
Available action_types and their parameters:
|
| 48 |
+
Diagnostic (gather info):
|
| 49 |
+
{"action_type": "query_logs", "parameters": {"service": "<name>"}}
|
| 50 |
+
{"action_type": "check_metrics", "parameters": {"service": "<name>"}}
|
| 51 |
+
{"action_type": "check_dependencies", "parameters": {"service": "<name>"}}
|
| 52 |
+
{"action_type": "check_recent_deploys", "parameters": {"service": "<name>"}}
|
| 53 |
+
{"action_type": "check_service_status", "parameters": {"service": "<name>"}}
|
| 54 |
+
|
| 55 |
+
Remediation (fix the issue):
|
| 56 |
+
{"action_type": "restart_service", "parameters": {"service": "<name>"}}
|
| 57 |
+
{"action_type": "rollback_deploy", "parameters": {"service": "<name>", "target_version": "previous"}}
|
| 58 |
+
{"action_type": "scale_service", "parameters": {"service": "<name>", "replicas": 5}}
|
| 59 |
+
{"action_type": "disable_feature_flag", "parameters": {"flag": "<flag_name>"}}
|
| 60 |
+
{"action_type": "clear_cache", "parameters": {"service": "<name>"}}
|
| 61 |
+
{"action_type": "execute_runbook_step", "parameters": {"runbook_action": "<action>", "target": "<name>"}}
|
| 62 |
+
|
| 63 |
+
Submission (end the episode β choose ONE based on task):
|
| 64 |
+
{"action_type": "submit_severity", "parameters": {"severity": "P1|P2|P3|P4", "service": "<root_cause_service>"}}
|
| 65 |
+
{"action_type": "submit_root_cause", "parameters": {"service": "<root_cause>", "failure_mode": "<what_went_wrong>"}}
|
| 66 |
+
{"action_type": "submit_resolution", "parameters": {"summary": "<full description of what happened and what you did>"}}
|
| 67 |
+
|
| 68 |
+
Strategy by task:
|
| 69 |
+
alert_classification (max 3 steps): Query 1-2 services for evidence, then submit_severity.
|
| 70 |
+
root_cause_analysis (max 10 steps): Query logs/metrics/deps for multiple services, trace the failure chain, then submit_root_cause.
|
| 71 |
+
remediation_planning (max 15 steps): Investigate, execute fix actions, then submit_resolution with a detailed summary.
|
| 72 |
+
|
| 73 |
+
Output ONLY the JSON object. Nothing else."""
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _format_obs(obs: dict) -> str:
|
| 77 |
+
parts = [
|
| 78 |
+
f"TASK: {obs.get('task_id')} | Step {obs.get('step_count')}/{obs.get('max_steps')}",
|
| 79 |
+
f"INCIDENT: {obs.get('incident_summary', '')}",
|
| 80 |
+
]
|
| 81 |
+
alert = obs.get("alert", {})
|
| 82 |
+
if alert:
|
| 83 |
+
parts.append("ALERT:\n" + json.dumps(alert, indent=2))
|
| 84 |
+
if obs.get("available_actions"):
|
| 85 |
+
parts.append(f"AVAILABLE ACTIONS: {obs['available_actions']}")
|
| 86 |
+
if obs.get("queried_data"):
|
| 87 |
+
parts.append("DATA GATHERED:\n" + json.dumps(obs["queried_data"], indent=2))
|
| 88 |
+
parts.append(f"LAST REWARD: {obs.get('cumulative_reward', 0.0)}")
|
| 89 |
+
parts.append(f"FEEDBACK: {obs.get('feedback', '')}")
|
| 90 |
+
return "\n\n".join(parts)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _parse_action(text: str) -> dict:
|
| 94 |
+
text = text.strip()
|
| 95 |
+
# Strip markdown code fences if present
|
| 96 |
+
if text.startswith("```"):
|
| 97 |
+
lines = [l for l in text.splitlines() if not l.startswith("```")]
|
| 98 |
+
text = "\n".join(lines).strip()
|
| 99 |
+
try:
|
| 100 |
+
return json.loads(text)
|
| 101 |
+
except json.JSONDecodeError:
|
| 102 |
+
start, end = text.find("{"), text.rfind("}") + 1
|
| 103 |
+
if start != -1 and end > start:
|
| 104 |
+
return json.loads(text[start:end])
|
| 105 |
+
raise
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _run_episode(task_id: str, scenario_index: int) -> float:
    """Run one full episode against the environment and return the grader total.

    Resets the episode over HTTP, loops the chat model for up to max_steps,
    posts each parsed action to /step, and finally queries /grader.
    """
    # Start a fresh episode on the server.
    reset_resp = requests.post(
        f"{ENV_BASE_URL}/reset",
        params={"task_id": task_id, "scenario_index": scenario_index},
        timeout=30,
    )
    reset_resp.raise_for_status()
    obs = reset_resp.json()

    convo = [{"role": "system", "content": SYSTEM_PROMPT}]

    for step_idx in range(obs.get("max_steps", 10)):
        convo.append({"role": "user", "content": _format_obs(obs)})

        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=convo,
            temperature=0.0,
            max_tokens=256,
        )
        reply = completion.choices[0].message.content
        convo.append({"role": "assistant", "content": reply})

        try:
            action = _parse_action(reply)
        except Exception as e:
            print(f" [WARN] parse failed at step {step_idx+1}: {e}", file=sys.stderr)
            # Graceful fallback per task
            if task_id == "alert_classification":
                action = {
                    "action_type": "submit_severity",
                    "parameters": {"severity": "P2", "service": "unknown"},
                }
            elif task_id == "root_cause_analysis":
                action = {
                    "action_type": "submit_root_cause",
                    "parameters": {"service": "unknown", "failure_mode": "unknown"},
                }
            else:
                action = {
                    "action_type": "submit_resolution",
                    "parameters": {"summary": "Unable to determine root cause."},
                }

        step_resp = requests.post(
            f"{ENV_BASE_URL}/step",
            json=action,
            headers={"Content-Type": "application/json"},
            timeout=30,
        )
        step_resp.raise_for_status()
        payload = step_resp.json()
        obs = payload["observation"]

        if payload.get("done"):
            break

    # Ask the environment's grader for the final episode score.
    grade_resp = requests.get(f"{ENV_BASE_URL}/grader", timeout=30)
    grade_resp.raise_for_status()
    return grade_resp.json().get("total", 0.0)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def main():
    """Run the baseline over every (task, scenario) pair and print a summary.

    The very last stdout line is a JSON object so the /baseline endpoint can
    parse it programmatically.
    """
    episode_specs = [
        ("alert_classification", 0),
        ("alert_classification", 1),
        ("root_cause_analysis", 0),
        ("root_cause_analysis", 1),
        ("remediation_planning", 0),
        ("remediation_planning", 1),
    ]

    per_task: dict[str, list[float]] = {}

    print(f"{'Task':<30} {'Scenario':>8} {'Score':>8}")
    print("-" * 52)

    for task_id, scenario_index in episode_specs:
        try:
            score = _run_episode(task_id, scenario_index)
        except Exception as e:
            # A failed episode counts as zero rather than aborting the run.
            print(f" [ERROR] {task_id} s{scenario_index}: {e}", file=sys.stderr)
            score = 0.0

        label = f"{task_id} [s{scenario_index}]"
        print(f"{label:<30} {scenario_index:>8} {score:>8.4f}")
        per_task.setdefault(task_id, []).append(score)

    print("-" * 52)
    # Per-task mean plus an overall mean of the task means.
    summary = {task: round(sum(scores) / len(scores), 4) for task, scores in per_task.items()}
    summary["overall"] = round(sum(summary.values()) / len(summary), 4)

    print("\nBaseline Summary:")
    for k, v in summary.items():
        print(f" {k:<30}: {v:.4f}")

    # Final line must be valid JSON — parsed by /baseline endpoint
    print(json.dumps(summary))
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
# Script entry point: run the full baseline evaluation when invoked directly.
if __name__ == "__main__":
    main()
|
openenv.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: sre-incident-response
|
| 2 |
+
version: "0.1.0"
|
| 3 |
+
app_port: 7860
|
| 4 |
+
description: >
|
| 5 |
+
OpenEnv environment simulating SRE/DevOps on-call incident response.
|
| 6 |
+
An AI agent classifies alert severity, performs root cause analysis
|
| 7 |
+
through log/metric queries, and executes remediation actions to
|
| 8 |
+
resolve production incidents end-to-end.
|
| 9 |
+
author: Elliot89
|
| 10 |
+
license: MIT
|
| 11 |
+
tags:
|
| 12 |
+
- openenv
|
| 13 |
+
- sre
|
| 14 |
+
- devops
|
| 15 |
+
- incident-response
|
| 16 |
+
- real-world
|
| 17 |
+
- agentic
|
| 18 |
+
|
| 19 |
+
tasks:
|
| 20 |
+
- id: alert_classification
|
| 21 |
+
name: "Task 1: Alert Severity Classification"
|
| 22 |
+
difficulty: easy
|
| 23 |
+
max_steps: 3
|
| 24 |
+
score_range: [0.0, 1.0]
|
| 25 |
+
description: >
|
| 26 |
+
Classify incoming alert severity (P1-P4) using diagnostic tools.
|
| 27 |
+
|
| 28 |
+
- id: root_cause_analysis
|
| 29 |
+
name: "Task 2: Root Cause Analysis"
|
| 30 |
+
difficulty: medium
|
| 31 |
+
max_steps: 10
|
| 32 |
+
score_range: [0.0, 1.0]
|
| 33 |
+
description: >
|
| 34 |
+
Trace a live incident through logs/metrics/dependencies
|
| 35 |
+
to identify the exact root cause service and failure mode.
|
| 36 |
+
|
| 37 |
+
- id: remediation_planning
|
| 38 |
+
name: "Task 3: Incident Remediation"
|
| 39 |
+
difficulty: hard
|
| 40 |
+
max_steps: 15
|
| 41 |
+
score_range: [0.0, 1.0]
|
| 42 |
+
description: >
|
| 43 |
+
Fully resolve a production incident: diagnose, remediate,
|
| 44 |
+
and submit a documented resolution summary.
|
| 45 |
+
|
| 46 |
+
endpoints:
|
| 47 |
+
health: "GET /health"
|
| 48 |
+
reset: "POST /reset"
|
| 49 |
+
step: "POST /step"
|
| 50 |
+
state: "GET /state"
|
| 51 |
+
tasks: "GET /tasks"
|
| 52 |
+
grader: "GET /grader"
|
| 53 |
+
baseline: "POST /baseline"
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.0
|
| 2 |
+
uvicorn[standard]==0.30.6
|
| 3 |
+
pydantic==2.9.2
|
| 4 |
+
openai==1.51.0
|
| 5 |
+
requests==2.32.3
|
| 6 |
+
python-dotenv==1.0.1
|
server/__init__.py
ADDED
|
File without changes
|
server/app.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/app.py β FastAPI server exposing the OpenEnv HTTP interface.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
GET /health
|
| 6 |
+
GET /
|
| 7 |
+
POST /reset?task_id=...&scenario_index=...
|
| 8 |
+
POST /step
|
| 9 |
+
GET /state
|
| 10 |
+
GET /tasks
|
| 11 |
+
GET /grader
|
| 12 |
+
POST /baseline
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import subprocess
|
| 20 |
+
import sys
|
| 21 |
+
|
| 22 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 23 |
+
|
| 24 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 25 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 26 |
+
|
| 27 |
+
from server.models import Action
|
| 28 |
+
from server.environment import IncidentEnvironment
|
| 29 |
+
from tasks import list_tasks, ALL_TASKS
|
| 30 |
+
|
| 31 |
+
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 32 |
+
|
| 33 |
+
# FastAPI application exposing the OpenEnv HTTP interface (see module docstring).
app = FastAPI(
    title="SRE Incident Response β OpenEnv",
    version="0.1.0",
    description="OpenEnv environment for training AI agents on SRE incident response tasks.",
)

# Permit cross-origin browser clients; all origins/methods/headers are allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single shared environment instance — one episode at a time, module-wide.
env = IncidentEnvironment()
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ββ Health / root ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
|
| 51 |
+
@app.get("/health")
def health():
    """Health probe: report that the service is up, plus its version."""
    status_payload = {"status": "ok", "version": "0.1.0"}
    return status_payload
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@app.get("/")
def root():
    """Landing endpoint: basic service metadata and useful paths."""
    info = {
        "name": "SRE Incident Response OpenEnv",
        "version": "0.1.0",
        "docs": "/docs",
        "health": "/health",
        "tasks": "/tasks",
    }
    return info
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ββ Core OpenEnv endpoints βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 68 |
+
|
| 69 |
+
@app.post("/reset")
def reset(
    task_id: str = Query(default="alert_classification"),
    scenario_index: int = Query(default=0),
):
    """Start a new episode and return the initial observation.

    Invalid task/scenario identifiers map to HTTP 400; any other failure
    maps to HTTP 500.
    """
    try:
        initial_obs = env.reset(task_id=task_id, scenario_index=scenario_index)
        return initial_obs.model_dump()
    except ValueError as e:
        # Bad task_id / scenario_index supplied by the caller.
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@app.post("/step")
def step(action: Action):
    """Submit an action. Returns observation, reward, done, info."""
    try:
        observation, reward, done, info = env.step(action)
        response_body = {
            "observation": observation.model_dump(),
            "reward": reward.model_dump(),
            "done": done,
            "info": info,
        }
        return response_body
    except RuntimeError as e:
        # step() before reset() — caller error.
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
@app.get("/state")
def state():
    """Return the full current episode state."""
    try:
        current = env.state()
    except RuntimeError as e:
        # No active episode yet — caller must reset() first.
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return current.model_dump()
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
@app.get("/tasks")
def tasks():
    """Return all available tasks with descriptions and action schemas."""
    # Static catalogue: the task list comes from tasks.py; the action_schema
    # below mirrors the action types handled by server/environment.py.
    return {
        "tasks": list_tasks(),
        "total": len(ALL_TASKS),
        "action_schema": {
            # Read-only investigation tools.
            "diagnostic": [
                {"action_type": "query_logs", "parameters": {"service": "string"}},
                {"action_type": "check_metrics", "parameters": {"service": "string"}},
                {"action_type": "check_dependencies", "parameters": {"service": "string"}},
                {"action_type": "check_recent_deploys", "parameters": {"service": "string"}},
                {"action_type": "check_service_status", "parameters": {"service": "string"}},
            ],
            # State-changing fix actions.
            "remediation": [
                {"action_type": "restart_service", "parameters": {"service": "string"}},
                {"action_type": "rollback_deploy", "parameters": {"service": "string", "target_version": "string"}},
                {"action_type": "scale_service", "parameters": {"service": "string", "replicas": "int"}},
                {"action_type": "disable_feature_flag", "parameters": {"flag": "string"}},
                {"action_type": "clear_cache", "parameters": {"service": "string"}},
                {"action_type": "execute_runbook_step", "parameters": {"runbook_action": "string", "target": "string"}},
            ],
            # Final answers — each of these ends the episode.
            "submission": [
                {"action_type": "submit_severity", "parameters": {"severity": "P1|P2|P3|P4", "service": "string"}},
                {"action_type": "submit_root_cause", "parameters": {"service": "string", "failure_mode": "string"}},
                {"action_type": "submit_resolution", "parameters": {"summary": "string"}},
            ],
        },
    }
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
@app.get("/grader")
def grader():
    """Run the grader on the current episode. Returns score in [0.0, 1.0]."""
    try:
        episode = env.state()
        # Imported lazily so the module loads even if graders.py is absent.
        from graders import grade
        graded = grade(episode.task_id, episode.model_dump(), env._scenario)
        report = {
            "total": graded["total"],
            "breakdown": graded["breakdown"],
            "feedback": graded["feedback"],
            "task_id": episode.task_id,
            "scenario_id": episode.scenario_id,
            "steps_used": episode.step_count,
            "done": episode.done,
        }
        return report
    except RuntimeError as e:
        # No active episode — reset() has not been called.
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
@app.post("/baseline")
def baseline():
    """Run inference.py and return the JSON score summary."""
    script = os.path.join(_PROJECT_ROOT, "inference.py")
    if not os.path.exists(script):
        raise HTTPException(status_code=500, detail="inference.py not found in project root")

    try:
        # Point the child process at this server so it exercises the live API.
        child_env = {**os.environ, "ENV_BASE_URL": "http://localhost:7860"}
        proc = subprocess.run(
            [sys.executable, script],
            capture_output=True,
            text=True,
            timeout=1200,
            cwd=_PROJECT_ROOT,
            env=child_env,
        )
    except subprocess.TimeoutExpired:
        raise HTTPException(status_code=500, detail="inference.py timed out (>20 min)")

    if proc.returncode != 0:
        # Surface the tail of stderr for debugging.
        raise HTTPException(status_code=500, detail=proc.stderr[-2000:])

    # inference.py prints the JSON summary as its final stdout line.
    stdout_lines = proc.stdout.strip().splitlines()
    final_line = stdout_lines[-1] if stdout_lines else ""
    try:
        return json.loads(final_line)
    except Exception:
        # Fall back to raw output when the last line isn't valid JSON.
        return {"raw_output": proc.stdout[-3000:]}
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# Local development entry point; the Docker image starts uvicorn directly.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("server.app:app", host="0.0.0.0", port=7860, reload=False)
|
server/environment.py
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/environment.py β Core OpenEnv environment for SRE Incident Response.
|
| 3 |
+
|
| 4 |
+
Implements the full OpenEnv interface:
|
| 5 |
+
reset(task_id, scenario_index) -> Observation
|
| 6 |
+
step(action) -> (Observation, Reward, done, info)
|
| 7 |
+
state() -> EpisodeState
|
| 8 |
+
|
| 9 |
+
All state is in-memory. Thread-safe via a lock.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import uuid
|
| 15 |
+
import threading
|
| 16 |
+
import sys
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 20 |
+
|
| 21 |
+
from tasks import ALL_TASKS, get_task, get_scenario
|
| 22 |
+
from graders import grade
|
| 23 |
+
from server.models import Action, ActionParameters, Observation, Reward, EpisodeState
|
| 24 |
+
|
| 25 |
+
# ββ Action type sets βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
|
| 27 |
+
# Read-only investigation actions: rewarded for gathering evidence.
_DIAGNOSTIC = {
    "query_logs", "check_metrics", "check_dependencies",
    "check_recent_deploys", "check_service_status",
}

# State-changing fix actions: judged right/wrong per scenario's wrong_actions map.
_REMEDIATION = {
    "restart_service", "rollback_deploy", "scale_service",
    "disable_feature_flag", "clear_cache", "execute_runbook_step",
}

# Final-answer actions: each of these always terminates the episode.
_SUBMIT = {
    "submit_severity", "submit_root_cause", "submit_resolution",
}

# -- Reward constants ---------------------------------------------------------

R_QUERY_KNOWN_FIRST = +0.05   # first query of a known service
R_QUERY_KNOWN_REPEAT = +0.01  # repeat query of the same (action, service) pair
R_QUERY_UNKNOWN = -0.05       # queried a service not in the scenario
R_REMEDIATION_GOOD = +0.10    # remediation not flagged as wrong for the scenario
R_REMEDIATION_WRONG = -0.10   # remediation listed in the scenario's wrong_actions
R_STEP_PAST_HALF = -0.02      # per-step efficiency penalty past the halfway mark
R_TIMEOUT = -0.10             # episode ended without any submission
R_UNKNOWN_ACTION = -0.03      # unrecognized action_type
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class IncidentEnvironment:
    """
    OpenEnv environment for SRE Incident Response.
    One instance handles one episode at a time.
    """

    def __init__(self):
        # Serializes reset/step/state — the instance is shared by the server.
        self._lock = threading.Lock()
        self._s: dict = {}          # mutable per-episode state dict
        self._scenario: dict = {}   # current scenario definition from tasks.py
        self._task_def: dict = {}   # current task definition from tasks.py
        self._ready = False         # True once reset() has been called

    # -- Public OpenEnv API ---------------------------------------------------

    def reset(self, task_id: str, scenario_index: int = 0) -> Observation:
        """Start a fresh episode. Returns the initial Observation."""
        with self._lock:
            task_def = get_task(task_id)
            scenario = get_scenario(task_id, scenario_index)

            self._task_def = task_def
            self._scenario = scenario
            self._s = {
                "episode_id": str(uuid.uuid4()),
                "task_id": task_id,
                "scenario_id": scenario["scenario_id"],
                "step_count": 0,
                "max_steps": task_def["max_steps"],
                "action_history": [],
                "queried_data": {},
                "queried_keys": set(),  # tracks (action_type, service) for repeat detection
                "submitted": False,
                "resolved": False,
                "done": False,
                "cumulative_reward": 0.0,
                "feedback": f"Episode started. {scenario['description']}",
            }
            self._ready = True
            return self._build_obs()

    def step(self, action: Action) -> tuple[Observation, Reward, bool, dict]:
        """Process one agent action. Returns (Observation, Reward, done, info).

        Raises RuntimeError if reset() has not been called. Stepping a
        finished episode returns a zero-reward no-op with done=True.
        """
        with self._lock:
            if not self._ready:
                raise RuntimeError("Call reset() before step().")

            s = self._s
            if s["done"]:
                obs = self._build_obs()
                return obs, Reward(value=0.0, reason="episode already done",
                                   cumulative=s["cumulative_reward"]), True, {}

            s["step_count"] += 1
            step_num = s["step_count"]
            max_steps = s["max_steps"]
            at = action.action_type
            params = action.parameters

            # Record action
            s["action_history"].append({
                "action_type": at,
                "parameters": params.model_dump(exclude_none=True),
                "step": step_num,
            })

            # -- Compute step reward ------------------------------------------
            r = 0.0
            fb: list[str] = []

            # Efficiency penalty past halfway
            if step_num > max_steps // 2:
                r += R_STEP_PAST_HALF
                fb.append("efficiency penalty")

            # Dispatch to the handler for this action category.
            if at in _DIAGNOSTIC:
                r, fb = self._handle_diagnostic(at, params, r, fb)
            elif at in _REMEDIATION:
                r, fb = self._handle_remediation(at, params, r, fb)
            elif at in _SUBMIT:
                r, fb, terminal = self._handle_submit(at, params, r, fb)
                if terminal:
                    s["done"] = True
            else:
                r += R_UNKNOWN_ACTION
                fb.append(f"unknown action_type '{at}'")

            # Timeout
            if step_num >= max_steps and not s["done"]:
                r += R_TIMEOUT
                fb.append("timeout β no submission made")
                s["done"] = True

            # Run grader on terminal step; its total is folded into the
            # cumulative reward before the per-step reward below.
            if s["done"]:
                result = grade(s["task_id"], s, self._scenario)
                s["cumulative_reward"] = round(
                    s["cumulative_reward"] + result["total"], 4
                )
                fb.append(f"grader β {result['feedback']}")

            s["cumulative_reward"] = round(s["cumulative_reward"] + r, 4)
            s["feedback"] = " | ".join(fb) if fb else "ok"

            reward_obj = Reward(
                value=round(r, 4),
                reason=s["feedback"],
                cumulative=s["cumulative_reward"],
            )
            return self._build_obs(), reward_obj, s["done"], {"step": step_num, "feedback": s["feedback"]}

    def state(self) -> EpisodeState:
        """Return the full current episode state."""
        with self._lock:
            if not self._ready:
                raise RuntimeError("No active episode β call reset() first.")
            s = self._s
            return EpisodeState(
                episode_id=s["episode_id"],
                task_id=s["task_id"],
                scenario_id=s["scenario_id"],
                step_count=s["step_count"],
                max_steps=s["max_steps"],
                action_history=s["action_history"],
                queried_data=dict(s["queried_data"]),
                submitted=s["submitted"],
                resolved=s["resolved"],
                done=s["done"],
                cumulative_reward=s["cumulative_reward"],
                feedback=s["feedback"],
            )

    # -- Action handlers ------------------------------------------------------

    def _handle_diagnostic(
        self, at: str, params: ActionParameters, r: float, fb: list[str]
    ) -> tuple[float, list[str]]:
        """Score a diagnostic query and stash the tool's canned response.

        First query of a known service earns R_QUERY_KNOWN_FIRST, repeats
        earn R_QUERY_KNOWN_REPEAT, unknown services cost R_QUERY_UNKNOWN.
        """
        s = self._s
        service = (params.service or "").lower().strip()
        known = {sv.lower() for sv in self._scenario.get("known_services", set())}
        tool_data = self._scenario.get("tool_responses", {}).get(at, {})
        query_key = (at, service)

        if service and service in known:
            if query_key not in s["queried_keys"]:
                r += R_QUERY_KNOWN_FIRST
                fb.append(f"queried {service} (+{R_QUERY_KNOWN_FIRST})")
                s["queried_keys"].add(query_key)
            else:
                r += R_QUERY_KNOWN_REPEAT
                fb.append(f"re-queried {service} (+{R_QUERY_KNOWN_REPEAT})")

            # Expose the scenario's canned response in the next observation.
            result = tool_data.get(service, f"No data for '{service}'.")
            s["queried_data"].setdefault(at, {})[service] = result

        elif service:
            r += R_QUERY_UNKNOWN
            fb.append(f"unknown service '{service}' ({R_QUERY_UNKNOWN})")
        else:
            fb.append(f"{at}: no service specified")

        return r, fb

    def _handle_remediation(
        self, at: str, params: ActionParameters, r: float, fb: list[str]
    ) -> tuple[float, list[str]]:
        """Score a remediation action against the scenario's wrong_actions map.

        An action is wrong if any derived lookup key (bare action, or
        action:parameter combination) appears in wrong_actions; otherwise it
        earns R_REMEDIATION_GOOD and its canned result is recorded.
        """
        s = self._s
        service = (params.service or "").lower().strip()
        flag = (params.flag or "").lower().strip()
        runbook_action = (params.runbook_action or "").lower().strip()
        target = (params.target or "").lower().strip()

        # Build lookup keys
        keys_to_check = {at}
        if service:
            keys_to_check.add(f"{at}:{service}")
        if flag:
            keys_to_check.add(f"{at}:{flag}")
        if runbook_action:
            keys_to_check.add(f"execute_runbook_step:{runbook_action}")
        if target:
            keys_to_check.add(f"execute_runbook_step:{target}")

        wrong_map = self._scenario.get("wrong_actions", {})
        rem_data = self._scenario.get("remediation_data", {})

        is_wrong = any(k in wrong_map for k in keys_to_check)

        if is_wrong:
            r += R_REMEDIATION_WRONG
            reason = next((wrong_map[k] for k in keys_to_check if k in wrong_map), "wrong action")
            fb.append(f"wrong: {at} β {str(reason)[:80]}")
        else:
            r += R_REMEDIATION_GOOD
            fb.append(f"executed {at}" + (f" on {service}" if service else ""))
            # Store remediation result if available
            at_data = rem_data.get(at, {})
            result = (
                at_data.get(service)
                or at_data.get(flag)
                or at_data.get(runbook_action)
                or at_data.get(target)
                or "action executed"
            )
            s["queried_data"].setdefault(at, {})[service or flag or runbook_action or at] = result

        return r, fb

    def _handle_submit(
        self, at: str, params: ActionParameters, r: float, fb: list[str]
    ) -> tuple[float, list[str], bool]:
        """Record a submission; every submission terminates the episode.

        submit_resolution marks the incident resolved only when a non-empty
        summary is given AND at least one diagnostic/remediation action was
        taken beforehand. Scoring itself is left to the grader.
        """
        s = self._s
        s["submitted"] = True

        if at == "submit_severity":
            severity = (params.severity or "").upper()
            fb.append(f"submitted severity: {severity}")

        elif at == "submit_root_cause":
            svc = params.service or ""
            mode = params.failure_mode or ""
            fb.append(f"submitted root cause: service={svc}, failure_mode={mode}")

        elif at == "submit_resolution":
            summary = params.summary or ""
            diag_rem_count = sum(
                1 for a in s["action_history"]
                if a.get("action_type") in _DIAGNOSTIC | _REMEDIATION
            )
            if summary.strip() and diag_rem_count >= 1:
                s["resolved"] = True
                fb.append("resolution submitted β incident resolved")
            else:
                fb.append("resolution submitted (insufficient investigation)")

        return r, fb, True  # always terminal

    # -- Build observation ----------------------------------------------------

    def _build_obs(self) -> Observation:
        """Assemble the agent-visible Observation from current episode state."""
        s = self._s
        sc = self._scenario
        td = self._task_def
        return Observation(
            episode_id=s["episode_id"],
            task_id=s["task_id"],
            scenario_id=s["scenario_id"],
            step_count=s["step_count"],
            max_steps=s["max_steps"],
            incident_summary=sc.get("incident_summary", sc.get("description", "")),
            alert=sc.get("alert", {}),
            available_actions=td.get("available_actions", []),
            queried_data=dict(s["queried_data"]),
            cumulative_reward=s["cumulative_reward"],
            done=s["done"],
            feedback=s["feedback"],
        )
|
server/models.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/models.py β Typed Pydantic models for the OpenEnv interface.
|
| 3 |
+
|
| 4 |
+
OpenEnv requires three typed models: Action, Observation, Reward.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ActionParameters(BaseModel):
    """Flexible parameter bag — different action types use different fields.

    Only the fields relevant to a given action_type are populated; the rest
    stay None (the environment dumps them with exclude_none).
    """
    service: str | None = None
    severity: str | None = None
    failure_mode: str | None = None
    summary: str | None = None
    target_version: str | None = None
    replicas: int | None = None
    flag: str | None = None
    runbook_action: str | None = None
    target: str | None = None
    reasoning: str | None = None

    # Accept unknown keys so novel agent-supplied parameters don't 422.
    model_config = {"extra": "allow"}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class Action(BaseModel):
    """An action submitted by the agent to the environment.

    action_type selects the behavior (diagnostic / remediation / submission);
    parameters carries the type-specific arguments and defaults to empty.
    """
    action_type: str
    parameters: ActionParameters = Field(default_factory=ActionParameters)

    # Tolerate extra top-level keys from permissive agent output.
    model_config = {"extra": "allow"}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class Observation(BaseModel):
    """Observation returned after reset() or step()."""
    episode_id: str          # unique id for this episode (uuid4)
    task_id: str             # which task is being played
    scenario_id: str         # which scenario of that task
    step_count: int          # steps taken so far
    max_steps: int           # step budget for this task
    incident_summary: str    # human-readable incident description
    alert: dict              # triggering alert payload (may be empty)
    available_actions: list[str]  # action types the task allows
    queried_data: dict       # tool responses gathered so far, keyed by action type
    cumulative_reward: float # running reward total for the episode
    done: bool               # True once the episode has terminated
    feedback: str            # feedback string from the last transition
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class Reward(BaseModel):
    """Reward signal returned after each step()."""
    value: float       # reward for this step only
    reason: str        # feedback explaining how the value was earned
    cumulative: float  # episode-total reward including this step
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class EpisodeState(BaseModel):
    """Full episode state returned by GET /state."""
    episode_id: str
    task_id: str
    scenario_id: str
    step_count: int
    max_steps: int
    action_history: list[dict]  # every action taken, with parameters and step number
    queried_data: dict          # tool responses gathered so far
    submitted: bool             # a submit_* action has been made
    resolved: bool              # resolution accepted (summary + prior investigation)
    done: bool
    cumulative_reward: float
    feedback: str
|
tasks.py
ADDED
|
@@ -0,0 +1,664 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tasks.py — Task definitions and scenario data for SRE Incident Response OpenEnv.
|
| 3 |
+
|
| 4 |
+
Structure:
|
| 5 |
+
ALL_TASKS : dict[task_id -> task metadata]
|
| 6 |
+
SCENARIOS : dict[task_id -> list[scenario_dict]]
|
| 7 |
+
|
| 8 |
+
Public API:
|
| 9 |
+
get_task(task_id) -> task metadata dict
|
| 10 |
+
get_scenario(task_id, index) -> scenario dict
|
| 11 |
+
list_tasks() -> list of task metadata dicts
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
# Registry of every task this environment exposes, keyed by task_id.
# Each entry is the metadata the server reports to clients: display name,
# difficulty, step budget, score range, the action types the agent may use,
# and which action type counts as the final submission.
ALL_TASKS: dict = {
    # Easy: classify an alert's severity (P1..P4) within 3 steps.
    "alert_classification": {
        "id": "alert_classification",
        "name": "Task 1: Alert Severity Classification",
        "difficulty": "easy",
        "max_steps": 3,
        "score_range": [0.0, 1.0],
        "description": (
            "Given an incoming alert with symptoms, affected services, and error rates, "
            "classify the incident severity as P1 (CRITICAL), P2 (HIGH), P3 (MEDIUM), "
            "or P4 (LOW). Use available diagnostic tools to gather evidence before submitting."
        ),
        "available_actions": [
            "query_logs", "check_metrics", "check_dependencies",
            "check_recent_deploys", "submit_severity",
        ],
        "submission_action": "submit_severity",
    },
    # Medium: identify the root-cause service and failure mode.
    "root_cause_analysis": {
        "id": "root_cause_analysis",
        "name": "Task 2: Root Cause Analysis",
        "difficulty": "medium",
        "max_steps": 10,
        "score_range": [0.0, 1.0],
        "description": (
            "An active incident is in progress. Use diagnostic tools to query logs, "
            "metrics, dependencies, and recent deploys across services. Identify the "
            "exact root cause service and failure mode, then submit your findings."
        ),
        "available_actions": [
            "query_logs", "check_metrics", "check_dependencies",
            "check_recent_deploys", "check_service_status", "submit_root_cause",
        ],
        "submission_action": "submit_root_cause",
    },
    # Hard: diagnose AND remediate, then document the resolution.
    "remediation_planning": {
        "id": "remediation_planning",
        "name": "Task 3: Incident Remediation",
        "difficulty": "hard",
        "max_steps": 15,
        "score_range": [0.0, 1.0],
        "description": (
            "A production incident requires full resolution. Diagnose the root cause, "
            "execute the correct remediation sequence (restart, rollback, scale, drain), "
            "then submit a resolution summary. Scored on investigation quality, "
            "remediation correctness, efficiency, and documentation."
        ),
        "available_actions": [
            "query_logs", "check_metrics", "check_dependencies",
            "check_recent_deploys", "check_service_status",
            "restart_service", "rollback_deploy", "scale_service",
            "disable_feature_flag", "clear_cache", "execute_runbook_step",
            "submit_resolution",
        ],
        "submission_action": "submit_resolution",
    },
}
|
| 71 |
+
|
| 72 |
+
# ---------------------------------------------------------------------------
|
| 73 |
+
# Scenario data
|
| 74 |
+
# Each scenario has:
|
| 75 |
+
# scenario_id, description, incident_summary, alert, known_services,
|
| 76 |
+
# tool_responses, correct_severity, correct_root_cause, correct_remediation,
|
| 77 |
+
# wrong_actions
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
|
| 80 |
+
SCENARIOS: dict = {
|
| 81 |
+
|
| 82 |
+
# ββ ALERT CLASSIFICATION βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 83 |
+
|
| 84 |
+
"alert_classification": [
|
| 85 |
+
|
| 86 |
+
# Scenario 0: DB connection pool exhaustion cascading up
|
| 87 |
+
{
|
| 88 |
+
"scenario_id": "AC-001",
|
| 89 |
+
"description": (
|
| 90 |
+
"Cascading failure: postgres-db connection pool exhausted, "
|
| 91 |
+
"causing auth-service timeouts, which is blocking api-gateway requests."
|
| 92 |
+
),
|
| 93 |
+
"incident_summary": (
|
| 94 |
+
"P1 ALERT β api-gateway 5xx rate 78%, auth-service timeout rate 94%, "
|
| 95 |
+
"postgres-db connection pool at 100% (500/500). "
|
| 96 |
+
"Checkout flow completely down. Revenue impact: $12k/min."
|
| 97 |
+
),
|
| 98 |
+
"alert": {
|
| 99 |
+
"id": "ALT-20240315-001",
|
| 100 |
+
"title": "CRITICAL: api-gateway error rate 78%",
|
| 101 |
+
"severity_fired": "P1",
|
| 102 |
+
"affected_services": ["api-gateway", "auth-service", "postgres-db"],
|
| 103 |
+
"symptoms": [
|
| 104 |
+
"api-gateway: HTTP 503 rate 78% (up from baseline 0.1%)",
|
| 105 |
+
"auth-service: connection timeout 94% of requests",
|
| 106 |
+
"postgres-db: connection pool 500/500 (100% utilized)",
|
| 107 |
+
"checkout flow: completely unavailable",
|
| 108 |
+
"Active user sessions: 0 new logins succeeding",
|
| 109 |
+
],
|
| 110 |
+
"error_rate": 0.78,
|
| 111 |
+
"duration_minutes": 4,
|
| 112 |
+
"revenue_impact_per_min": 12000,
|
| 113 |
+
},
|
| 114 |
+
"known_services": {"api-gateway", "auth-service", "postgres-db"},
|
| 115 |
+
"tool_responses": {
|
| 116 |
+
"query_logs": {
|
| 117 |
+
"api-gateway": (
|
| 118 |
+
"2024-03-15T10:04:12Z ERROR upstream connect error or disconnect/reset "
|
| 119 |
+
"before headers. reset reason: connection timeout β auth-service:8080\n"
|
| 120 |
+
"2024-03-15T10:04:13Z ERROR 503 Service Unavailable β upstream: auth-service"
|
| 121 |
+
),
|
| 122 |
+
"auth-service": (
|
| 123 |
+
"2024-03-15T10:04:10Z ERROR pq: sorry, too many clients already\n"
|
| 124 |
+
"2024-03-15T10:04:11Z ERROR dial tcp postgres-db:5432: connect: "
|
| 125 |
+
"connection refused β pool exhausted"
|
| 126 |
+
),
|
| 127 |
+
"postgres-db": (
|
| 128 |
+
"2024-03-15T10:03:58Z LOG connection received: host=auth-service\n"
|
| 129 |
+
"2024-03-15T10:04:00Z FATAL remaining connection slots are reserved "
|
| 130 |
+
"for non-replication superuser connections\n"
|
| 131 |
+
"2024-03-15T10:04:01Z LOG max_connections=500 currently active=500"
|
| 132 |
+
),
|
| 133 |
+
},
|
| 134 |
+
"check_metrics": {
|
| 135 |
+
"api-gateway": "HTTP 5xx rate: 78% | Latency p99: 30s (timeout) | RPS: 1200",
|
| 136 |
+
"auth-service": "Error rate: 94% | DB connection wait: 28s | Active conns: 0",
|
| 137 |
+
"postgres-db": "Connections: 500/500 (100%) | Query queue depth: 847 | CPU: 98%",
|
| 138 |
+
},
|
| 139 |
+
"check_dependencies": {
|
| 140 |
+
"api-gateway": "Depends on: auth-service [DEGRADED], product-service [OK]",
|
| 141 |
+
"auth-service": "Depends on: postgres-db [CRITICAL], redis-session [OK]",
|
| 142 |
+
"postgres-db": "No upstream dependencies",
|
| 143 |
+
},
|
| 144 |
+
"check_recent_deploys": {
|
| 145 |
+
"api-gateway": "Last deploy: 3 days ago β no recent changes",
|
| 146 |
+
"auth-service": "Last deploy: 47 mins ago β added connection pool monitoring",
|
| 147 |
+
"postgres-db": "Last deploy: 12 days ago β no recent changes",
|
| 148 |
+
},
|
| 149 |
+
},
|
| 150 |
+
"correct_severity": "P1",
|
| 151 |
+
"adjacent_severities": ["P2"],
|
| 152 |
+
"correct_root_cause": {
|
| 153 |
+
"service": "postgres-db",
|
| 154 |
+
"failure_mode": "connection pool exhaustion",
|
| 155 |
+
},
|
| 156 |
+
"correct_remediation": [
|
| 157 |
+
"restart_service:auth-service",
|
| 158 |
+
"execute_runbook_step:increase_max_connections",
|
| 159 |
+
"scale_service:postgres-db",
|
| 160 |
+
],
|
| 161 |
+
"wrong_actions": {
|
| 162 |
+
"rollback_deploy": "Rolling back auth-service monitoring changes won't fix pool exhaustion",
|
| 163 |
+
"restart_service:api-gateway": "api-gateway is a victim, not the cause",
|
| 164 |
+
"clear_cache": "Cache is not related to DB connection pool exhaustion",
|
| 165 |
+
},
|
| 166 |
+
},
|
| 167 |
+
|
| 168 |
+
# Scenario 1: CDN cache invalidation storm
|
| 169 |
+
{
|
| 170 |
+
"scenario_id": "AC-002",
|
| 171 |
+
"description": (
|
| 172 |
+
"CDN cache invalidation storm: a misconfigured cache purge script ran "
|
| 173 |
+
"against all product images, sending 40x normal traffic to origin."
|
| 174 |
+
),
|
| 175 |
+
"incident_summary": (
|
| 176 |
+
"P2 ALERT β product-service origin traffic spike 4000%, "
|
| 177 |
+
"image-service CPU 95%, CDN cache hit rate dropped from 94% to 3%. "
|
| 178 |
+
"Site slow but partially functional. Latency p99: 18s."
|
| 179 |
+
),
|
| 180 |
+
"alert": {
|
| 181 |
+
"id": "ALT-20240315-002",
|
| 182 |
+
"title": "HIGH: product-service origin traffic anomaly",
|
| 183 |
+
"severity_fired": "P2",
|
| 184 |
+
"affected_services": ["cdn-edge", "product-service", "image-service"],
|
| 185 |
+
"symptoms": [
|
| 186 |
+
"CDN cache hit rate: 3% (normal: 94%)",
|
| 187 |
+
"product-service: origin RPS 48,000 (normal: 1,200)",
|
| 188 |
+
"image-service: CPU 95%, latency p99 18s",
|
| 189 |
+
"User-facing: product pages loading slowly, some images timing out",
|
| 190 |
+
"No complete outage β checkout still working",
|
| 191 |
+
],
|
| 192 |
+
"error_rate": 0.15,
|
| 193 |
+
"duration_minutes": 8,
|
| 194 |
+
"revenue_impact_per_min": 800,
|
| 195 |
+
},
|
| 196 |
+
"known_services": {"cdn-edge", "product-service", "image-service"},
|
| 197 |
+
"tool_responses": {
|
| 198 |
+
"query_logs": {
|
| 199 |
+
"cdn-edge": (
|
| 200 |
+
"2024-03-15T10:22:00Z INFO cache MISS ratio: 97% (last 5min)\n"
|
| 201 |
+
"2024-03-15T10:20:11Z WARN mass cache invalidation event detected "
|
| 202 |
+
"β 2.1M keys purged by purge-job-prod\n"
|
| 203 |
+
"2024-03-15T10:20:10Z INFO purge request from 10.0.1.45 β pattern: /*"
|
| 204 |
+
),
|
| 205 |
+
"product-service": (
|
| 206 |
+
"2024-03-15T10:22:05Z WARN request queue depth: 12,400\n"
|
| 207 |
+
"2024-03-15T10:22:06Z ERROR timeout fetching image from image-service"
|
| 208 |
+
),
|
| 209 |
+
"image-service": (
|
| 210 |
+
"2024-03-15T10:22:00Z WARN CPU throttling engaged\n"
|
| 211 |
+
"2024-03-15T10:22:01Z ERROR worker pool exhausted β dropping requests"
|
| 212 |
+
),
|
| 213 |
+
},
|
| 214 |
+
"check_metrics": {
|
| 215 |
+
"cdn-edge": "Cache hit rate: 3% | Purge events last hour: 1 (mass) | Origin RPS: 48k",
|
| 216 |
+
"product-service": "Origin RPS: 48,000 (normal 1,200) | Queue depth: 12,400",
|
| 217 |
+
"image-service": "CPU: 95% | Worker pool: 0 free / 200 | Latency p99: 18s",
|
| 218 |
+
},
|
| 219 |
+
"check_dependencies": {
|
| 220 |
+
"cdn-edge": "Origin: product-service [OVERLOADED]",
|
| 221 |
+
"product-service": "Depends on: image-service [DEGRADED], postgres-db [OK]",
|
| 222 |
+
"image-service": "Depends on: object-storage [OK] β no upstream issues",
|
| 223 |
+
},
|
| 224 |
+
"check_recent_deploys": {
|
| 225 |
+
"cdn-edge": "Cronjob purge-job-prod modified 2 hours ago β pattern changed from /images/* to /*",
|
| 226 |
+
"product-service": "Last deploy: 5 days ago",
|
| 227 |
+
"image-service": "Last deploy: 2 days ago",
|
| 228 |
+
},
|
| 229 |
+
},
|
| 230 |
+
"correct_severity": "P2",
|
| 231 |
+
"adjacent_severities": ["P1", "P3"],
|
| 232 |
+
"correct_root_cause": {
|
| 233 |
+
"service": "cdn-edge",
|
| 234 |
+
"failure_mode": "mass cache invalidation / misconfigured purge job",
|
| 235 |
+
},
|
| 236 |
+
"correct_remediation": [
|
| 237 |
+
"disable_feature_flag:purge-job-prod",
|
| 238 |
+
"execute_runbook_step:warm_cdn_cache",
|
| 239 |
+
"scale_service:image-service",
|
| 240 |
+
],
|
| 241 |
+
"wrong_actions": {
|
| 242 |
+
"restart_service:image-service": "Restarting won't fix the CDN cache miss storm at source",
|
| 243 |
+
"rollback_deploy:product-service": "product-service has no recent changes",
|
| 244 |
+
"restart_service:cdn-edge": "Restarting CDN edge nodes will make cache miss rate worse temporarily",
|
| 245 |
+
},
|
| 246 |
+
},
|
| 247 |
+
],
|
| 248 |
+
|
| 249 |
+
# ββ ROOT CAUSE ANALYSIS ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 250 |
+
|
| 251 |
+
"root_cause_analysis": [
|
| 252 |
+
|
| 253 |
+
# Scenario 0: Postgres OOM killed by runaway analytics query
|
| 254 |
+
{
|
| 255 |
+
"scenario_id": "RCA-001",
|
| 256 |
+
"description": (
|
| 257 |
+
"postgres-db was OOM-killed by the Linux kernel after a runaway analytics "
|
| 258 |
+
"query consumed all available memory, taking down all dependent services."
|
| 259 |
+
),
|
| 260 |
+
"incident_summary": (
|
| 261 |
+
"Multiple services down: api-gateway 503, auth-service failing, "
|
| 262 |
+
"order-service unable to write. postgres-db restarting repeatedly. "
|
| 263 |
+
"Root cause is upstream β needs investigation."
|
| 264 |
+
),
|
| 265 |
+
"alert": {
|
| 266 |
+
"id": "ALT-RCA-001",
|
| 267 |
+
"title": "CRITICAL: postgres-db repeated restarts, all dependents degraded",
|
| 268 |
+
"severity_fired": "P1",
|
| 269 |
+
"affected_services": ["api-gateway", "auth-service", "order-service", "postgres-db"],
|
| 270 |
+
"symptoms": [
|
| 271 |
+
"postgres-db: restarted 4 times in 12 minutes",
|
| 272 |
+
"auth-service: connection refused errors 100%",
|
| 273 |
+
"order-service: write failures 100%",
|
| 274 |
+
"api-gateway: 503 on all authenticated routes",
|
| 275 |
+
],
|
| 276 |
+
"error_rate": 0.95,
|
| 277 |
+
"duration_minutes": 14,
|
| 278 |
+
},
|
| 279 |
+
"known_services": {
|
| 280 |
+
"api-gateway", "auth-service", "order-service",
|
| 281 |
+
"postgres-db", "analytics-service", "redis-session",
|
| 282 |
+
},
|
| 283 |
+
"tool_responses": {
|
| 284 |
+
"query_logs": {
|
| 285 |
+
"postgres-db": (
|
| 286 |
+
"2024-03-16T02:11:00Z LOG database system was shut down at 2024-03-16 02:10:58\n"
|
| 287 |
+
"2024-03-16T02:10:58Z FATAL Out of Memory: Kill process 1847 (postgres) "
|
| 288 |
+
"score 982 or sacrifice child\n"
|
| 289 |
+
"2024-03-16T02:10:30Z LOG process 1847 still running query started "
|
| 290 |
+
"2024-03-16 01:58:00: SELECT * FROM events JOIN user_sessions JOIN orders "
|
| 291 |
+
"JOIN products β no LIMIT clause"
|
| 292 |
+
),
|
| 293 |
+
"analytics-service": (
|
| 294 |
+
"2024-03-16T01:58:00Z INFO starting scheduled report: full_history_export\n"
|
| 295 |
+
"2024-03-16T02:10:55Z ERROR query killed by OOM β report failed\n"
|
| 296 |
+
"2024-03-16T01:58:01Z WARN query has no LIMIT β estimated rows: 847M"
|
| 297 |
+
),
|
| 298 |
+
"auth-service": (
|
| 299 |
+
"2024-03-16T02:11:05Z ERROR connect ECONNREFUSED postgres-db:5432\n"
|
| 300 |
+
"2024-03-16T02:11:06Z ERROR all retries exhausted"
|
| 301 |
+
),
|
| 302 |
+
"api-gateway": (
|
| 303 |
+
"2024-03-16T02:11:10Z ERROR upstream auth-service: 503 Service Unavailable"
|
| 304 |
+
),
|
| 305 |
+
"order-service": (
|
| 306 |
+
"2024-03-16T02:11:08Z ERROR pq: the database system is starting up"
|
| 307 |
+
),
|
| 308 |
+
"redis-session": "No errors β operating normally",
|
| 309 |
+
},
|
| 310 |
+
"check_metrics": {
|
| 311 |
+
"postgres-db": "Memory: 0% free (OOM killed) | Restarts: 4 | Last crash: 2min ago",
|
| 312 |
+
"analytics-service": "Memory used: 31GB / 32GB at time of crash | Query runtime: 12min",
|
| 313 |
+
"auth-service": "Connection success rate: 0% | DB dependency: CRITICAL",
|
| 314 |
+
"api-gateway": "503 rate: 95% | Auth dependency: DOWN",
|
| 315 |
+
"order-service": "Write success rate: 0% | DB dependency: RESTARTING",
|
| 316 |
+
"redis-session": "Hit rate: 99.2% | Memory: 42% | All normal",
|
| 317 |
+
},
|
| 318 |
+
"check_dependencies": {
|
| 319 |
+
"postgres-db": "Clients: auth-service, order-service, analytics-service, product-service",
|
| 320 |
+
"analytics-service": "Depends on: postgres-db [RESTARTING]",
|
| 321 |
+
"auth-service": "Depends on: postgres-db [RESTARTING], redis-session [OK]",
|
| 322 |
+
"api-gateway": "Depends on: auth-service [DOWN]",
|
| 323 |
+
"order-service": "Depends on: postgres-db [RESTARTING]",
|
| 324 |
+
"redis-session": "No DB dependency β standalone",
|
| 325 |
+
},
|
| 326 |
+
"check_recent_deploys": {
|
| 327 |
+
"analytics-service": (
|
| 328 |
+
"Deploy 6 hours ago: added full_history_export scheduled job β "
|
| 329 |
+
"no LIMIT on cross-table JOIN query, runs at 02:00 UTC daily"
|
| 330 |
+
),
|
| 331 |
+
"postgres-db": "No recent deploys β last change 3 weeks ago",
|
| 332 |
+
"auth-service": "No recent deploys",
|
| 333 |
+
"order-service": "No recent deploys",
|
| 334 |
+
},
|
| 335 |
+
"check_service_status": {
|
| 336 |
+
"postgres-db": "Status: RESTARTING | Uptime: 47s | Previous uptime: 14min",
|
| 337 |
+
"analytics-service": "Status: ERROR | Last job: full_history_export FAILED",
|
| 338 |
+
"auth-service": "Status: DOWN | Waiting for postgres-db",
|
| 339 |
+
"api-gateway": "Status: DEGRADED | 95% requests failing",
|
| 340 |
+
},
|
| 341 |
+
},
|
| 342 |
+
"correct_root_cause": {
|
| 343 |
+
"service": "analytics-service",
|
| 344 |
+
"failure_mode": "unbounded query causing OOM kill on postgres-db",
|
| 345 |
+
},
|
| 346 |
+
"correct_remediation": [
|
| 347 |
+
"restart_service:analytics-service",
|
| 348 |
+
"disable_feature_flag:full_history_export",
|
| 349 |
+
"execute_runbook_step:restart_postgres",
|
| 350 |
+
],
|
| 351 |
+
"wrong_actions": {
|
| 352 |
+
"restart_service:auth-service": "auth-service is a victim, restarting won't fix the DB",
|
| 353 |
+
"restart_service:api-gateway": "api-gateway is a victim downstream",
|
| 354 |
+
"scale_service:postgres-db": "Scaling DB won't prevent OOM if the bad query runs again",
|
| 355 |
+
"rollback_deploy:postgres-db": "postgres-db has no recent deploys",
|
| 356 |
+
},
|
| 357 |
+
},
|
| 358 |
+
|
| 359 |
+
# Scenario 1: Network partition isolating payment service
|
| 360 |
+
{
|
| 361 |
+
"scenario_id": "RCA-002",
|
| 362 |
+
"description": (
|
| 363 |
+
"A BGP route withdrawal caused a network partition isolating the "
|
| 364 |
+
"payment-service AZ, resulting in 61% checkout failure rate."
|
| 365 |
+
),
|
| 366 |
+
"incident_summary": (
|
| 367 |
+
"Checkout failures 61%, payment-service unreachable from AZ-2 and AZ-3. "
|
| 368 |
+
"fraud-detection-service also unreachable. "
|
| 369 |
+
"Partial service β users in AZ-1 unaffected."
|
| 370 |
+
),
|
| 371 |
+
"alert": {
|
| 372 |
+
"id": "ALT-RCA-002",
|
| 373 |
+
"title": "HIGH: checkout failure rate 61%, payment-service connectivity loss",
|
| 374 |
+
"severity_fired": "P2",
|
| 375 |
+
"affected_services": ["order-service", "payment-service", "fraud-detection-service"],
|
| 376 |
+
"symptoms": [
|
| 377 |
+
"checkout failure rate: 61% (only AZ-2 and AZ-3 affected)",
|
| 378 |
+
"payment-service: unreachable from AZ-2, AZ-3",
|
| 379 |
+
"fraud-detection-service: timeout from AZ-2, AZ-3",
|
| 380 |
+
"AZ-1 users: completely unaffected",
|
| 381 |
+
"Network latency AZ-2βAZ-1: infinite (no route)",
|
| 382 |
+
],
|
| 383 |
+
"error_rate": 0.61,
|
| 384 |
+
"duration_minutes": 9,
|
| 385 |
+
},
|
| 386 |
+
"known_services": {
|
| 387 |
+
"order-service", "payment-service", "fraud-detection-service",
|
| 388 |
+
"postgres-db", "redis-payment-cache", "network-infra",
|
| 389 |
+
},
|
| 390 |
+
"tool_responses": {
|
| 391 |
+
"query_logs": {
|
| 392 |
+
"order-service": (
|
| 393 |
+
"2024-03-17T14:32:10Z ERROR connection timeout payment-service:8080 "
|
| 394 |
+
"(AZ-2 β AZ-1: no route to host)\n"
|
| 395 |
+
"2024-03-17T14:32:11Z ERROR fraud-detection-service: i/o timeout"
|
| 396 |
+
),
|
| 397 |
+
"payment-service": (
|
| 398 |
+
"2024-03-17T14:31:58Z WARN health check failing from AZ-2 load balancer\n"
|
| 399 |
+
"2024-03-17T14:31:59Z INFO all local (AZ-1) requests processing normally"
|
| 400 |
+
),
|
| 401 |
+
"fraud-detection-service": (
|
| 402 |
+
"2024-03-17T14:32:00Z INFO processing normally within AZ-1\n"
|
| 403 |
+
"2024-03-17T14:32:01Z WARN cross-AZ health checks timing out"
|
| 404 |
+
),
|
| 405 |
+
"network-infra": (
|
| 406 |
+
"2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.2.1 route withdrawal β "
|
| 407 |
+
"AZ-2 lost route to AZ-1 CIDR 10.0.1.0/24\n"
|
| 408 |
+
"2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.3.1 route withdrawal β "
|
| 409 |
+
"AZ-3 lost route to AZ-1 CIDR 10.0.1.0/24"
|
| 410 |
+
),
|
| 411 |
+
"postgres-db": "Operating normally β no errors",
|
| 412 |
+
"redis-payment-cache": "Operating normally β AZ-1 only traffic, all good",
|
| 413 |
+
},
|
| 414 |
+
"check_metrics": {
|
| 415 |
+
"order-service": "AZ-2 checkout failure: 99% | AZ-1 checkout failure: 0.2% (baseline)",
|
| 416 |
+
"payment-service": "AZ-1 traffic: normal | AZ-2/AZ-3 inbound: 0 (blocked by network)",
|
| 417 |
+
"fraud-detection-service": "AZ-1 normal | Cross-AZ: 100% timeout",
|
| 418 |
+
"network-infra": "BGP sessions AZ-2/AZ-3: DOWN | AZ-1 internal: all UP",
|
| 419 |
+
"postgres-db": "All metrics normal",
|
| 420 |
+
"redis-payment-cache": "All metrics normal",
|
| 421 |
+
},
|
| 422 |
+
"check_dependencies": {
|
| 423 |
+
"order-service": "Depends on: payment-service [PARTITIONED], fraud-detection-service [PARTITIONED]",
|
| 424 |
+
"payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
|
| 425 |
+
"fraud-detection-service": "Depends on: postgres-db [OK]",
|
| 426 |
+
"network-infra": "BGP peers: AZ-2 [DOWN], AZ-3 [DOWN], AZ-1 [UP]",
|
| 427 |
+
},
|
| 428 |
+
"check_recent_deploys": {
|
| 429 |
+
"network-infra": (
|
| 430 |
+
"Router config change 18 mins ago: updated BGP route advertisement policy β "
|
| 431 |
+
"inadvertently withdrew AZ-1 routes from AZ-2/AZ-3 peers"
|
| 432 |
+
),
|
| 433 |
+
"payment-service": "No recent deploys",
|
| 434 |
+
"order-service": "No recent deploys",
|
| 435 |
+
},
|
| 436 |
+
"check_service_status": {
|
| 437 |
+
"payment-service": "Status: HEALTHY (within AZ-1) | Cross-AZ: UNREACHABLE",
|
| 438 |
+
"order-service": "Status: DEGRADED | AZ-2/3 instances failing",
|
| 439 |
+
"network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN | AZ-1: UP",
|
| 440 |
+
"fraud-detection-service": "Status: HEALTHY (within AZ-1) | Cross-AZ: UNREACHABLE",
|
| 441 |
+
},
|
| 442 |
+
},
|
| 443 |
+
"correct_root_cause": {
|
| 444 |
+
"service": "network-infra",
|
| 445 |
+
"failure_mode": "BGP route withdrawal causing AZ network partition",
|
| 446 |
+
},
|
| 447 |
+
"correct_remediation": [
|
| 448 |
+
"execute_runbook_step:restore_bgp_routes",
|
| 449 |
+
"rollback_deploy:network-infra",
|
| 450 |
+
],
|
| 451 |
+
"wrong_actions": {
|
| 452 |
+
"restart_service:payment-service": "payment-service is healthy β network is the issue",
|
| 453 |
+
"restart_service:order-service": "order-service is a victim of the network partition",
|
| 454 |
+
"scale_service:payment-service": "Scaling won't fix a network routing problem",
|
| 455 |
+
"clear_cache:redis-payment-cache": "Cache is operating normally β not the cause",
|
| 456 |
+
},
|
| 457 |
+
},
|
| 458 |
+
],
|
| 459 |
+
|
| 460 |
+
# ββ REMEDIATION PLANNING βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 461 |
+
|
| 462 |
+
"remediation_planning": [
|
| 463 |
+
|
| 464 |
+
# Scenario 0: Postgres OOM β full remediation required
|
| 465 |
+
{
|
| 466 |
+
"scenario_id": "RP-001",
|
| 467 |
+
"description": (
|
| 468 |
+
"Full remediation required: analytics-service OOM-killed postgres-db. "
|
| 469 |
+
"Must stop the offending job, restart DB, restore services, document."
|
| 470 |
+
),
|
| 471 |
+
"incident_summary": (
|
| 472 |
+
"CRITICAL β postgres-db repeatedly OOM-killed by analytics runaway query. "
|
| 473 |
+
"auth-service, order-service, api-gateway all down. "
|
| 474 |
+
"Requires: stop analytics job, restart postgres, verify service recovery, document."
|
| 475 |
+
),
|
| 476 |
+
"alert": {
|
| 477 |
+
"id": "ALT-RP-001",
|
| 478 |
+
"title": "CRITICAL: postgres-db OOM killed β full stack down",
|
| 479 |
+
"severity_fired": "P1",
|
| 480 |
+
"affected_services": ["postgres-db", "auth-service", "order-service", "api-gateway"],
|
| 481 |
+
},
|
| 482 |
+
"known_services": {
|
| 483 |
+
"postgres-db", "auth-service", "order-service",
|
| 484 |
+
"api-gateway", "analytics-service",
|
| 485 |
+
},
|
| 486 |
+
"tool_responses": {
|
| 487 |
+
"query_logs": {
|
| 488 |
+
"postgres-db": (
|
| 489 |
+
"FATAL: Out of Memory: Kill process (postgres) β analytics query running 12min with no LIMIT"
|
| 490 |
+
),
|
| 491 |
+
"analytics-service": "ERROR: full_history_export job β unbounded JOIN query killed by OOM",
|
| 492 |
+
"auth-service": "ERROR: connect ECONNREFUSED postgres-db:5432",
|
| 493 |
+
"order-service": "ERROR: pq: the database system is starting up",
|
| 494 |
+
"api-gateway": "ERROR: upstream auth-service 503",
|
| 495 |
+
},
|
| 496 |
+
"check_metrics": {
|
| 497 |
+
"postgres-db": "Memory: OOM | Restarts: 4 | Status: RESTARTING",
|
| 498 |
+
"analytics-service": "Status: ERROR | Memory spike to 31GB before crash",
|
| 499 |
+
"auth-service": "Connection success: 0% | Waiting for DB",
|
| 500 |
+
"order-service": "Write success: 0% | Waiting for DB",
|
| 501 |
+
},
|
| 502 |
+
"check_dependencies": {
|
| 503 |
+
"postgres-db": "Clients: auth-service, order-service, analytics-service",
|
| 504 |
+
"analytics-service": "Depends on: postgres-db",
|
| 505 |
+
"auth-service": "Depends on: postgres-db [DOWN]",
|
| 506 |
+
"order-service": "Depends on: postgres-db [DOWN]",
|
| 507 |
+
},
|
| 508 |
+
"check_recent_deploys": {
|
| 509 |
+
"analytics-service": "Deploy 6h ago: added full_history_export cron job β unbounded query",
|
| 510 |
+
"postgres-db": "No recent changes",
|
| 511 |
+
},
|
| 512 |
+
"check_service_status": {
|
| 513 |
+
"postgres-db": "RESTARTING | Uptime: 47s",
|
| 514 |
+
"analytics-service": "ERROR | Last job failed",
|
| 515 |
+
"auth-service": "DOWN",
|
| 516 |
+
"order-service": "DOWN",
|
| 517 |
+
},
|
| 518 |
+
},
|
| 519 |
+
"remediation_data": {
|
| 520 |
+
"disable_feature_flag": {
|
| 521 |
+
"full_history_export": "Cron job full_history_export disabled β analytics queries halted",
|
| 522 |
+
},
|
| 523 |
+
"restart_service": {
|
| 524 |
+
"postgres-db": "postgres-db restarted cleanly β accepting connections",
|
| 525 |
+
"analytics-service": "analytics-service restarted β no active queries",
|
| 526 |
+
"auth-service": "auth-service restarted β reconnected to postgres-db successfully",
|
| 527 |
+
"order-service": "order-service restarted β write operations resuming",
|
| 528 |
+
},
|
| 529 |
+
"execute_runbook_step": {
|
| 530 |
+
"verify_db_health": "postgres-db connections: 12/500 β healthy",
|
| 531 |
+
"check_service_recovery": "auth-service OK, order-service OK, api-gateway OK",
|
| 532 |
+
},
|
| 533 |
+
},
|
| 534 |
+
"correct_severity": "P1",
|
| 535 |
+
"correct_root_cause": {
|
| 536 |
+
"service": "analytics-service",
|
| 537 |
+
"failure_mode": "unbounded query OOM killing postgres-db",
|
| 538 |
+
},
|
| 539 |
+
"correct_remediation_sequence": [
|
| 540 |
+
"disable_feature_flag:full_history_export",
|
| 541 |
+
"restart_service:analytics-service",
|
| 542 |
+
"restart_service:postgres-db",
|
| 543 |
+
"restart_service:auth-service",
|
| 544 |
+
"restart_service:order-service",
|
| 545 |
+
],
|
| 546 |
+
"wrong_actions": {
|
| 547 |
+
"rollback_deploy:postgres-db": "postgres-db has no recent deploy to roll back",
|
| 548 |
+
"scale_service:postgres-db": "Scaling won't stop the OOM query from running again",
|
| 549 |
+
"restart_service:api-gateway": "api-gateway is downstream victim β fix DB first",
|
| 550 |
+
},
|
| 551 |
+
"resolution_keywords": [
|
| 552 |
+
"analytics", "oom", "memory", "postgres", "query", "full_history_export",
|
| 553 |
+
"disabled", "restarted", "recovered",
|
| 554 |
+
],
|
| 555 |
+
},
|
| 556 |
+
|
| 557 |
+
# Scenario 1: BGP network partition β full remediation
|
| 558 |
+
{
|
| 559 |
+
"scenario_id": "RP-002",
|
| 560 |
+
"description": (
|
| 561 |
+
"Full remediation: BGP route withdrawal partitioned AZ-2/AZ-3 from AZ-1 "
|
| 562 |
+
"where payment-service runs. Must restore BGP routes, roll back network config."
|
| 563 |
+
),
|
| 564 |
+
"incident_summary": (
|
| 565 |
+
"P2 β BGP route withdrawal isolating payment-service from 61% of users. "
|
| 566 |
+
"Requires: restore BGP routes, roll back router config, verify checkout recovery."
|
| 567 |
+
),
|
| 568 |
+
"alert": {
|
| 569 |
+
"id": "ALT-RP-002",
|
| 570 |
+
"title": "HIGH: checkout 61% failure β BGP network partition AZ-2/AZ-3",
|
| 571 |
+
"severity_fired": "P2",
|
| 572 |
+
"affected_services": ["network-infra", "order-service", "payment-service"],
|
| 573 |
+
},
|
| 574 |
+
"known_services": {
|
| 575 |
+
"network-infra", "order-service", "payment-service",
|
| 576 |
+
"fraud-detection-service", "postgres-db",
|
| 577 |
+
},
|
| 578 |
+
"tool_responses": {
|
| 579 |
+
"query_logs": {
|
| 580 |
+
"network-infra": (
|
| 581 |
+
"CRITICAL: BGP route withdrawal β AZ-2/AZ-3 lost route to AZ-1 10.0.1.0/24\n"
|
| 582 |
+
"Router config change 18min ago: BGP advertisement policy update"
|
| 583 |
+
),
|
| 584 |
+
"order-service": "ERROR: connection timeout payment-service β no route to host",
|
| 585 |
+
"payment-service": "INFO: AZ-1 traffic normal | WARN: cross-AZ health checks failing",
|
| 586 |
+
},
|
| 587 |
+
"check_metrics": {
|
| 588 |
+
"network-infra": "BGP AZ-2: DOWN | BGP AZ-3: DOWN | AZ-1: UP",
|
| 589 |
+
"order-service": "AZ-2 failure: 99% | AZ-1 failure: 0.2%",
|
| 590 |
+
"payment-service": "AZ-1 normal | Cross-AZ inbound: 0",
|
| 591 |
+
},
|
| 592 |
+
"check_dependencies": {
|
| 593 |
+
"order-service": "Depends on: payment-service [PARTITIONED]",
|
| 594 |
+
"payment-service": "Depends on: postgres-db [OK]",
|
| 595 |
+
"network-infra": "BGP peers: AZ-2 [DOWN], AZ-3 [DOWN]",
|
| 596 |
+
},
|
| 597 |
+
"check_recent_deploys": {
|
| 598 |
+
"network-infra": "Config change 18min ago β BGP policy update withdrew AZ-1 routes",
|
| 599 |
+
"payment-service": "No recent deploys",
|
| 600 |
+
},
|
| 601 |
+
"check_service_status": {
|
| 602 |
+
"network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
|
| 603 |
+
"payment-service": "HEALTHY (AZ-1 only) | Cross-AZ: UNREACHABLE",
|
| 604 |
+
"order-service": "DEGRADED",
|
| 605 |
+
},
|
| 606 |
+
},
|
| 607 |
+
"remediation_data": {
|
| 608 |
+
"rollback_deploy": {
|
| 609 |
+
"network-infra": "Router config rolled back β BGP advertisement policy restored",
|
| 610 |
+
},
|
| 611 |
+
"execute_runbook_step": {
|
| 612 |
+
"restore_bgp_routes": "BGP routes restored β AZ-2/AZ-3 can reach AZ-1",
|
| 613 |
+
"verify_checkout_recovery": "Checkout failure rate: 0.3% β incident resolved",
|
| 614 |
+
},
|
| 615 |
+
},
|
| 616 |
+
"correct_severity": "P2",
|
| 617 |
+
"correct_root_cause": {
|
| 618 |
+
"service": "network-infra",
|
| 619 |
+
"failure_mode": "BGP route withdrawal network partition",
|
| 620 |
+
},
|
| 621 |
+
"correct_remediation_sequence": [
|
| 622 |
+
"execute_runbook_step:restore_bgp_routes",
|
| 623 |
+
"rollback_deploy:network-infra",
|
| 624 |
+
"execute_runbook_step:verify_checkout_recovery",
|
| 625 |
+
],
|
| 626 |
+
"wrong_actions": {
|
| 627 |
+
"restart_service:payment-service": "payment-service is healthy β network is the issue",
|
| 628 |
+
"scale_service:payment-service": "Scaling won't fix a routing problem",
|
| 629 |
+
"restart_service:order-service": "order-service is a victim",
|
| 630 |
+
"clear_cache": "Cache is unrelated to network routing",
|
| 631 |
+
},
|
| 632 |
+
"resolution_keywords": [
|
| 633 |
+
"bgp", "network", "route", "rollback", "partition", "restored",
|
| 634 |
+
"az-1", "az-2", "az-3", "checkout",
|
| 635 |
+
],
|
| 636 |
+
},
|
| 637 |
+
],
|
| 638 |
+
}
|
| 639 |
+
|
| 640 |
+
|
| 641 |
+
# ---------------------------------------------------------------------------
|
| 642 |
+
# Public API
|
| 643 |
+
# ---------------------------------------------------------------------------
|
| 644 |
+
|
| 645 |
+
def get_task(task_id: str) -> dict:
    """Return the metadata dict for *task_id*.

    Raises:
        ValueError: if *task_id* is not a registered task.
    """
    try:
        return ALL_TASKS[task_id]
    except KeyError:
        raise ValueError(
            f"Unknown task_id '{task_id}'. Valid: {list(ALL_TASKS)}"
        ) from None
|
| 649 |
+
|
| 650 |
+
|
| 651 |
+
def get_scenario(task_id: str, index: int) -> dict:
    """Return scenario number *index* for *task_id*.

    Raises:
        ValueError: if the task has no scenarios, or *index* is out of range.
    """
    if task_id not in SCENARIOS:
        raise ValueError(f"No scenarios for task_id '{task_id}'.")
    scenarios = SCENARIOS[task_id]
    # Chained comparison reads as the valid interval; rejects negatives too.
    if not 0 <= index < len(scenarios):
        raise ValueError(
            # Fixed mojibake in the user-facing message: the range separator
            # was a garbled "β" character ("0β{...}") in the original.
            f"Scenario index {index} out of range for task '{task_id}' "
            f"(has {len(scenarios)} scenarios: 0-{len(scenarios)-1})."
        )
    return scenarios[index]
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
def list_tasks() -> list:
    """Return the metadata dicts of every registered task."""
    return [meta for meta in ALL_TASKS.values()]
|