Elliot89 committed on
Commit 77eb356 · 1 Parent(s): 89fa09c

Cloud Incident Response OpenEnv - initial submission

Files changed (14):
  1. .gitignore +12 -0
  2. Dockerfile +14 -0
  3. README.md +200 -5
  4. graders.py +267 -0
  5. inference.py +546 -0
  6. openenv.yaml +59 -0
  7. pyproject.toml +16 -0
  8. requirements.txt +7 -0
  9. server/__init__.py +0 -0
  10. server/app.py +230 -0
  11. server/environment.py +324 -0
  12. server/models.py +75 -0
  13. tasks.py +768 -0
  14. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,12 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ .env
+ .env.*
+ *.egg-info/
+ dist/
+ build/
+ .pytest_cache/
+ .mypy_cache/
+ .ruff_cache/
+ *.log
Dockerfile ADDED
@@ -0,0 +1,14 @@
+ FROM python:3.11-slim
+
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+
+ WORKDIR /app
+
+ COPY pyproject.toml uv.lock ./
+ RUN uv sync --frozen --no-dev
+
+ COPY . .
+
+ EXPOSE 7860
+
+ CMD ["uv", "run", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,205 @@
  ---
- title: Cloud Incident Response
- emoji: 🐒
- colorFrom: indigo
- colorTo: blue
+ title: Cloud Incident Response OpenEnv
+ emoji: 🚨
+ colorFrom: red
+ colorTo: yellow
  sdk: docker
+ app_port: 7860
  pinned: false
+ tags:
+ - openenv
+ - sre
+ - cloud
+ - incident-response
+ - devops
+ - real-world
+ - agentic
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Cloud Incident Response — OpenEnv Environment
+
+ An OpenEnv environment for training and evaluating AI agents on **cloud SRE incident response** — the real-world on-call workflow that engineers at every cloud company perform daily.
+
+ Distinct from Kubernetes operations environments: this focuses on **cross-service cascading failures** in distributed microservice architectures — connection pool exhaustion, CDN cache storms, OOM kills, and BGP network partitions.
+
+ ## Why This Environment
+
+ Every cloud company employs SREs who respond to production incidents under time pressure with incomplete information. This environment simulates the exact decision loop:
+
+ 1. **Triage** — Read the alert, assess blast radius, classify severity (P1–P4)
+ 2. **Investigate** — Query logs, metrics, dependencies, recent deploys
+ 3. **Diagnose** — Correlate signals across services to find the root cause
+ 4. **Remediate** — Execute the correct runbook steps in the right sequence
+ 5. **Document** — Submit a resolution summary for post-incident review
+
+ Agents trained here learn the same skills a human SRE uses: service dependency traversal, log correlation, cascading failure analysis, and targeted remediation.
+
+ ## Tasks
+
+ | Task ID | Difficulty | Max Steps | What the Agent Does |
+ |---|---|---|---|
+ | `alert_classification` | Easy | 3 | Classify alert severity (P1–P4) from metrics and symptoms |
+ | `root_cause_analysis` | Medium | 10 | Trace logs/metrics/deps to find root cause service and failure mode |
+ | `remediation_planning` | Hard | 15 | Diagnose, remediate, and document full incident resolution |
+
+ ### Scenarios
+
+ | ID | Incident Type | Root Cause | Failure Pattern |
+ |---|---|---|---|
+ | AC-001 | DB connection pool exhaustion | postgres-db / auth-service deploy | api-gateway → auth-service → postgres-db cascade |
+ | AC-002 | CDN cache invalidation storm | cdn-edge purge cronjob misconfigured | 40× origin traffic spike |
+ | RCA-001 | Postgres OOM kill | analytics-service unbounded query | Kernel OOM → DB crash loop → all dependents down |
+ | RCA-002 | BGP network partition | network-infra config change | Route withdrawal → AZ isolation → 61% checkout failures |
+ | RP-001 | Full OOM remediation | analytics-service | Disable job → restart DB → restore services → document |
+ | RP-002 | Full BGP remediation | network-infra | Restore routes → rollback config → verify recovery → document |
+
+ ## Action Space
+
+ **Diagnostic actions** (gather evidence):
+ ```json
+ {"action_type": "query_logs", "parameters": {"service": "postgres-db"}}
+ {"action_type": "check_metrics", "parameters": {"service": "auth-service"}}
+ {"action_type": "check_dependencies", "parameters": {"service": "api-gateway"}}
+ {"action_type": "check_recent_deploys", "parameters": {"service": "analytics-service"}}
+ {"action_type": "check_service_status", "parameters": {"service": "payment-service"}}
+ ```
+
+ **Remediation actions** (fix the incident):
+ ```json
+ {"action_type": "restart_service", "parameters": {"service": "postgres-db"}}
+ {"action_type": "rollback_deploy", "parameters": {"service": "network-infra", "target_version": "previous"}}
+ {"action_type": "scale_service", "parameters": {"service": "image-service", "replicas": 10}}
+ {"action_type": "disable_feature_flag", "parameters": {"flag": "full_history_export"}}
+ {"action_type": "execute_runbook_step", "parameters": {"runbook_action": "restore_bgp_routes"}}
+ ```
+
+ **Submission actions** (end the episode):
+ ```json
+ {"action_type": "submit_severity", "parameters": {"severity": "P1", "service": "postgres-db"}}
+ {"action_type": "submit_root_cause", "parameters": {"service": "analytics-service", "failure_mode": "unbounded query OOM killing postgres-db"}}
+ {"action_type": "submit_resolution", "parameters": {"summary": "Disabled analytics job, restarted postgres-db..."}}
+ ```
+
+ ## Observation Space
+
+ | Field | Type | Description |
+ |---|---|---|
+ | `episode_id` | string | Unique episode UUID |
+ | `task_id` | string | Active task |
+ | `scenario_id` | string | Scenario (e.g. `AC-001`) |
+ | `step_count` / `max_steps` | int | Current step and budget |
+ | `incident_summary` | string | Plain-text incident description |
+ | `alert` | dict | Alert payload with severity, symptoms, affected services |
+ | `available_actions` | list[str] | Valid action types for this task |
+ | `queried_data` | dict | All tool responses gathered so far |
+ | `known_services` | list[str] | Exact service names to use in actions |
+ | `cumulative_reward` | float | Running reward total |
+ | `done` | bool | Episode terminal flag |
+ | `feedback` | string | Per-step feedback string |
+
+ ## Reward Function
+
+ Dense reward shaping throughout the trajectory:
+
+ | Event | Reward |
+ |---|---|
+ | Query known service (first time) | +0.05 |
+ | Query known service (repeat) | +0.01 |
+ | Query unknown service | −0.05 |
+ | Correct remediation action | +0.10 |
+ | Wrong remediation action | −0.10 |
+ | Step past halfway (non-submit) | −0.02 |
+ | Timeout without submission | −0.10 |
+ | Grader score (terminal step) | 0.0–1.0 |
+
+ **Grader scoring** (deterministic, via `GET /grader`):
+
+ | Task | Scoring Logic |
+ |---|---|
+ | `alert_classification` | 1.0 exact · 0.5 adjacent · 0.25 two-off · 0.0 wrong/none |
+ | `root_cause_analysis` | 0.6 base (svc+mode) + up to 0.4 efficiency bonus |
+ | `remediation_planning` | 0.6 base + 0.3 efficiency − 0.15 wrong penalty + 0.1 summary |
+
+ ## API Endpoints
+
+ | Method | Path | Description |
+ |---|---|---|
+ | GET | `/` | `{"status":"running",...}` — HF Space health |
+ | GET | `/health` | `{"status":"ok","version":"0.1.0"}` |
+ | POST | `/reset?task_id=...&scenario_index=...` | Start new episode |
+ | POST | `/step` | Submit action (JSON body) |
+ | GET | `/state` | Full current episode state |
+ | GET | `/tasks` | All tasks with action schemas |
+ | GET | `/grader` | Score current episode (0.0–1.0) |
+ | POST | `/baseline` | Run inference.py, return scores |
+
+ ## Setup & Usage
+
+ ### Local development
+ ```bash
+ pip install -r requirements.txt
+ uvicorn server.app:app --host 0.0.0.0 --port 7860
+ ```
+
+ ### Docker
+ ```bash
+ docker build -t cloud-incident-env .
+ docker run -p 7860:7860 \
+   -e API_BASE_URL="https://api-inference.huggingface.co/v1" \
+   -e MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" \
+   -e HF_TOKEN="hf_your_token" \
+   cloud-incident-env
+ ```
+
+ ### Run inference script
+ ```bash
+ export API_BASE_URL="https://api-inference.huggingface.co/v1"
+ export MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
+ export HF_TOKEN="hf_your_token"
+ python inference.py
+ ```
+
+ ### Quick API test
+ ```bash
+ # Start new episode
+ curl -X POST "http://localhost:7860/reset?task_id=alert_classification&scenario_index=0"
+
+ # Submit an action
+ curl -X POST http://localhost:7860/step \
+   -H "Content-Type: application/json" \
+   -d '{"action_type":"query_logs","parameters":{"service":"api-gateway"}}'
+
+ # Check score
+ curl http://localhost:7860/grader
+ ```
+
+ ## Baseline Scores
+
+ Using `meta-llama/Llama-3.1-8B-Instruct` via the HF Inference API:
+
+ | Task | Scenario 0 | Scenario 1 | Average |
+ |---|---|---|---|
+ | `alert_classification` | ~1.00 | ~0.50 | ~0.75 |
+ | `root_cause_analysis` | ~0.45 | ~0.35 | ~0.40 |
+ | `remediation_planning` | ~0.25 | ~0.20 | ~0.23 |
+ | **overall** | | | **~0.46** |
+
+ *Run `python inference.py` to reproduce.*
+
+ ## Project Structure
+
+ ```
+ .
+ ├── Dockerfile
+ ├── README.md
+ ├── requirements.txt
+ ├── openenv.yaml
+ ├── tasks.py          # Scenario definitions (6 scenarios across 3 tasks)
+ ├── graders.py        # Deterministic graders for all tasks
+ ├── inference.py      # Baseline agent + smart fallback logic
+ └── server/
+     ├── __init__.py
+     ├── app.py          # FastAPI endpoints
+     ├── environment.py  # Core OpenEnv step/reset/state logic
+     └── models.py       # Typed Pydantic models (Action, Observation, Reward)
+ ```
graders.py ADDED
@@ -0,0 +1,267 @@
+ """
+ graders.py — Deterministic graders for all 3 Cloud Incident Response tasks.
+
+ Public API:
+     grade(task_id, state, scenario) -> {"total": float, "breakdown": dict, "feedback": str}
+
+ All scores are in [0.0, 1.0].
+ """
+
+ from __future__ import annotations
+
+
+ def _normalise(s: str) -> str:
+     """Lowercase, strip whitespace, collapse hyphens/underscores."""
+     return s.lower().strip().replace("_", "-").replace(" ", "-")
+
+
+ def _svc_match(submitted: str, correct: str) -> bool:
+     s = _normalise(submitted)
+     c = _normalise(correct)
+     if s == c:
+         return True
+     if s in c or c in s:
+         return True
+     aliases = {
+         "network": "network-infra",
+         "network-infrastructure": "network-infra",
+         "cdn": "cdn-edge",
+         "postgres": "postgres-db",
+         "postgresql": "postgres-db",
+         "analytics": "analytics-service",
+         "payment": "payment-service",
+         "auth": "auth-service",
+         "api": "api-gateway",
+         "api-gw": "api-gateway",
+     }
+     return aliases.get(s, s) == c or s == aliases.get(c, c)
+
+
+ def grade(task_id: str, state: dict, scenario: dict) -> dict:
+     _graders = {
+         "alert_classification": _grade_alert_classification,
+         "root_cause_analysis": _grade_root_cause_analysis,
+         "remediation_planning": _grade_remediation_planning,
+     }
+     fn = _graders.get(task_id)
+     if fn is None:
+         return {"total": 0.0, "breakdown": {}, "feedback": f"Unknown task_id '{task_id}'"}
+     return fn(state, scenario)
+
+
+ # ── Task 1: Alert Classification ─────────────────────────────────────────────
+ def _grade_alert_classification(state: dict, scenario: dict) -> dict:
+     history = state.get("action_history", [])
+     correct = scenario.get("correct_severity", "P1")
+     adjacent = scenario.get("adjacent_severities", [])
+     order = ["P1", "P2", "P3", "P4"]
+
+     submitted = None
+     for a in history:
+         if a.get("action_type") == "submit_severity":
+             submitted = a.get("parameters", {}).get("severity", "").upper().strip()
+             break
+
+     if not submitted:
+         return {
+             "total": 0.0,
+             "breakdown": {"submitted": False, "severity_match": 0.0},
+             "feedback": "No severity submitted — score 0.0",
+         }
+
+     if submitted == correct:
+         score, msg = 1.0, f"Exact match: {submitted}"
+     elif submitted in adjacent:
+         score, msg = 0.5, f"Adjacent: submitted {submitted}, correct {correct}"
+     else:
+         try:
+             dist = abs(order.index(submitted) - order.index(correct))
+         except ValueError:
+             dist = 4
+         score = 0.25 if dist == 2 else 0.0
+         msg = f"Wrong: submitted {submitted}, correct {correct} (distance={dist})"
+
+     return {
+         "total": score,
+         "breakdown": {
+             "submitted_severity": submitted,
+             "correct_severity": correct,
+             "severity_match": score,
+         },
+         "feedback": msg,
+     }
+
+
+ # ── Task 2: Root Cause Analysis ──────────────────────────────────────────────
+ def _grade_root_cause_analysis(state: dict, scenario: dict) -> dict:
+     history = state.get("action_history", [])
+     correct_rc = scenario.get("correct_root_cause", {})
+     correct_svc = correct_rc.get("service", "").lower().strip()
+     correct_mode = correct_rc.get("failure_mode", "").lower().strip()
+     known = {s.lower() for s in scenario.get("known_services", set())}
+
+     diag_types = {
+         "query_logs", "check_metrics", "check_dependencies",
+         "check_recent_deploys", "check_service_status",
+     }
+
+     sub_svc, sub_mode, sub_step = "", "", len(history)
+     for a in history:
+         if a.get("action_type") == "submit_root_cause":
+             p = a.get("parameters", {})
+             sub_svc = p.get("service", "").lower().strip()
+             sub_mode = p.get("failure_mode", "").lower().strip()
+             sub_step = a.get("step", len(history))
+             break
+
+     if not sub_svc:
+         return {
+             "total": 0.0,
+             "breakdown": {"base": 0.0, "efficiency": 0.0, "submitted": False},
+             "feedback": "No root cause submitted — score 0.0",
+         }
+
+     svc_match = _svc_match(sub_svc, correct_svc)
+     mode_kws = [w for w in correct_mode.split() if len(w) > 3]
+     mode_match = svc_match and (
+         any(kw in sub_mode for kw in mode_kws) if mode_kws else True
+     )
+
+     if mode_match:
+         base, base_fb = 0.6, "Correct service + failure mode"
+     elif svc_match:
+         base, base_fb = 0.35, "Correct service only — failure mode unclear"
+     else:
+         base, base_fb = 0.10, (
+             f"Wrong service: '{sub_svc}' (correct: '{correct_svc}') — partial credit"
+         )
+
+     efficiency = 0.0
+     if svc_match:
+         pre_submit = [
+             a for a in history[:sub_step]
+             if a.get("action_type") in diag_types
+         ]
+         queried_svcs = {
+             a.get("parameters", {}).get("service", "").lower()
+             for a in pre_submit
+         }
+         relevant = queried_svcs & known
+         total_q = len(pre_submit)
+         if total_q > 0:
+             precision = len(relevant) / max(total_q, 1)
+             efficiency = round(
+                 min(0.4, precision * 0.4 + min(len(relevant), 3) * 0.05), 4
+             )
+
+     total = round(min(1.0, base + efficiency), 4)
+     return {
+         "total": total,
+         "breakdown": {
+             "base": base,
+             "efficiency_bonus": efficiency,
+             "service_match": svc_match,
+             "mode_match": mode_match,
+             "submitted_service": sub_svc,
+             "correct_service": correct_svc,
+         },
+         "feedback": f"{base_fb} | efficiency={efficiency:.2f} | total={total:.2f}",
+     }
+
+
+ # ── Task 3: Remediation Planning ─────────────────────────────────────────────
+ def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
+     history = state.get("action_history", [])
+     correct_seq = scenario.get("correct_remediation_sequence", [])
+     wrong_map = scenario.get("wrong_actions", {})
+     keywords = scenario.get("resolution_keywords", [])
+
+     diag_rem = {
+         "query_logs", "check_metrics", "check_dependencies",
+         "check_recent_deploys", "check_service_status",
+         "restart_service", "rollback_deploy", "scale_service",
+         "disable_feature_flag", "clear_cache", "execute_runbook_step",
+     }
+
+     summary = ""
+     for a in history:
+         if a.get("action_type") == "submit_resolution":
+             summary = a.get("parameters", {}).get("summary", "")
+             break
+
+     inv_count = sum(1 for a in history if a.get("action_type") in diag_rem)
+
+     if not summary or inv_count < 1:
+         return {
+             "total": 0.0,
+             "breakdown": {
+                 "base": 0.0, "efficiency": 0.0,
+                 "penalty": 0.0, "summary_bonus": 0.0,
+             },
+             "feedback": "No resolution submitted or no investigation — score 0.0",
+         }
+
+     base = 0.6
+
+     executed = set()
+     for a in history:
+         at = a.get("action_type", "")
+         p = a.get("parameters", {})
+         svc = p.get("service", "")
+         flag = p.get("flag", "")
+         runbook = p.get("runbook_action", "")
+         target = p.get("target", "")
+         executed.add(at)
+         if svc:
+             executed.add(f"{at}:{svc}")
+         if flag:
+             executed.add(f"{at}:{flag}")
+         if runbook:
+             executed.add(f"execute_runbook_step:{runbook}")
+         if target:
+             executed.add(f"execute_runbook_step:{target}")
+
+     def _seq_key_matches(seq_key: str) -> bool:
+         if seq_key in executed:
+             return True
+         if ":" in seq_key:
+             action, target = seq_key.split(":", 1)
+             for ex_key in executed:
+                 if ":" in ex_key:
+                     ex_action, ex_target = ex_key.split(":", 1)
+                     if ex_action == action and _svc_match(ex_target, target):
+                         return True
+         return False
+
+     matched = sum(1 for k in correct_seq if _seq_key_matches(k))
+     efficiency = round((matched / len(correct_seq)) * 0.3, 4) if correct_seq else 0.0
+
+     wrong_count = sum(
+         1 for a in history
+         if (a.get("action_type") in wrong_map or
+             f"{a.get('action_type')}:{a.get('parameters', {}).get('service', '')}"
+             in wrong_map)
+     )
+     penalty = round(min(0.15, wrong_count * 0.05), 4)
+
+     sl = summary.lower()
+     hits = sum(1 for kw in keywords if kw in sl)
+     summary_bonus = 0.10 if hits >= 3 else (0.05 if hits >= 1 else 0.0)
+
+     total = round(max(0.0, min(1.0, base + efficiency - penalty + summary_bonus)), 4)
+
+     return {
+         "total": total,
+         "breakdown": {
+             "base": base,
+             "efficiency_bonus": efficiency,
+             "wrong_action_penalty": -penalty,
+             "summary_bonus": summary_bonus,
+             "correct_actions_matched": matched,
+             "correct_actions_total": len(correct_seq),
+             "wrong_actions_count": wrong_count,
+             "summary_keywords_hit": hits,
+         },
+         "feedback": (
+             f"base={base} | efficiency={efficiency:.2f} "
+             f"({matched}/{len(correct_seq)} correct) | "
+             f"penalty=-{penalty:.2f} | summary={summary_bonus:.2f} | "
+             f"total={total:.2f}"
+         ),
+     }
inference.py ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ inference.py β€” Cloud Incident Response OpenEnv baseline inference script.
3
+
4
+ The LLM reasons from evidence. Fallback is a dumb safety net that scores low.
5
+ Override only blocks clearly invalid actions (wrong task submission, bad params).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+ import sys
13
+
14
+ import requests
15
+
16
+ try:
17
+ from dotenv import load_dotenv
18
+ load_dotenv()
19
+ except ImportError:
20
+ pass
21
+
22
+ # ── Config ──────────────────────────────────────────────────────────────────
23
+ API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1")
24
+ MODEL_NAME = os.environ.get("MODEL_NAME", "llama-3.1-8b-instant")
25
+ HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY") or ""
26
+ ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:7860")
27
+
28
+ if not HF_TOKEN:
29
+ print("[WARN] No API key set β€” LLM calls will fail.", file=sys.stderr)
30
+
31
+ _session = requests.Session()
32
+
33
+ # Lazy-init OpenAI client to avoid import-time httpx errors
34
+ _client = None
35
+
36
+
37
+ def _get_client():
38
+ global _client
39
+ if _client is None:
40
+ from openai import OpenAI
41
+ _client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
42
+ return _client
43
+
44
+
45
+ # ── Which submission action belongs to which task ───────────────────────────
46
+ _TASK_SUBMIT = {
47
+ "alert_classification": "submit_severity",
48
+ "root_cause_analysis": "submit_root_cause",
49
+ "remediation_planning": "submit_resolution",
50
+ }
51
+
52
+ _DIAG_TYPES = frozenset({
53
+ "query_logs", "check_metrics", "check_dependencies",
54
+ "check_recent_deploys", "check_service_status",
55
+ })
56
+
57
+ _SUBMIT_TYPES = frozenset({
58
+ "submit_severity", "submit_root_cause", "submit_resolution",
59
+ })
60
+
61
+ _REM_TYPES = frozenset({
62
+ "restart_service", "rollback_deploy", "scale_service",
63
+ "disable_feature_flag", "clear_cache", "execute_runbook_step",
64
+ })
65
+
66
+ _ALL_VALID = _DIAG_TYPES | _SUBMIT_TYPES | _REM_TYPES
67
+
68
+
69
+ # ── System prompt β€” general SRE strategy, NO scenario answers ───────────────
70
+ SYSTEM_PROMPT = """\
71
+ You are an expert Site Reliability Engineer responding to a production incident.
72
+ Reply with exactly ONE JSON action object. No markdown, no explanation, no extra text.
73
+
74
+ VALID ACTIONS:
75
+ {"action_type":"query_logs","parameters":{"service":"<name>"}}
76
+ {"action_type":"check_metrics","parameters":{"service":"<name>"}}
77
+ {"action_type":"check_dependencies","parameters":{"service":"<name>"}}
78
+ {"action_type":"check_recent_deploys","parameters":{"service":"<name>"}}
79
+ {"action_type":"check_service_status","parameters":{"service":"<name>"}}
80
+ {"action_type":"restart_service","parameters":{"service":"<name>"}}
81
+ {"action_type":"rollback_deploy","parameters":{"service":"<name>","target_version":"previous"}}
82
+ {"action_type":"disable_feature_flag","parameters":{"flag":"<flag_name>"}}
83
+ {"action_type":"execute_runbook_step","parameters":{"runbook_action":"<action>"}}
84
+ {"action_type":"submit_severity","parameters":{"severity":"P1|P2|P3|P4","service":"<name>"}}
85
+ {"action_type":"submit_root_cause","parameters":{"service":"<name>","failure_mode":"<description>"}}
86
+ {"action_type":"submit_resolution","parameters":{"summary":"<3+ sentence summary>"}}
87
+
88
+ RULES:
89
+ - Service names MUST exactly match the KNOWN_SERVICES list in the observation.
90
+ - P1 = complete outage OR revenue > $1,000/min. P2 = major degradation.
91
+ P3 = minor issue. P4 = informational.
92
+ - Root cause = the upstream service that TRIGGERED the cascade. This is often
93
+ NOT listed in the alert's affected_services. Investigate services not in the
94
+ alert first.
95
+ - submit_resolution summary must be 3+ sentences: (1) what failed and why,
96
+ (2) actions you took to fix it, (3) current recovery status.
97
+ - Submit as soon as evidence is clear β€” do NOT waste steps querying more.
98
+
99
+ TASK-SPECIFIC STRATEGY:
100
+
101
+ alert_classification (max 3 steps):
102
+ Query 1-2 affected services for evidence, then submit_severity.
103
+
104
+ root_cause_analysis (max 10 steps):
105
+ Investigate services NOT in the alert first (check logs + recent deploys).
106
+ Look for: OOM kills, BGP withdrawals, config changes, unbounded queries.
107
+ Submit submit_root_cause with the triggering service and failure mode.
108
+
109
+ remediation_planning (max 15 steps):
110
+ 1. Query logs to confirm root cause.
111
+ 2. Execute fixes: disable bad jobs, restart crashed services, rollback configs,
112
+ run runbook steps.
113
+ 3. Submit submit_resolution with a detailed 3-sentence summary.
114
+
115
+ CRITICAL: Each task has ONE correct submission action:
116
+ alert_classification -> submit_severity
117
+ root_cause_analysis -> submit_root_cause
118
+ remediation_planning -> submit_resolution
119
+ Do NOT use the wrong submission type for the task."""
120
+
121
+
122
+ # ── Helpers ────────────────────────────────────────────────��────────────────
123
+ def _queried_svcs(queried_data: dict) -> set[str]:
124
+ return {
125
+ svc
126
+ for at, svcs in queried_data.items()
127
+ if at in _DIAG_TYPES and isinstance(svcs, dict)
128
+ for svc in svcs
129
+ }
130
+
131
+
132
+ def _extract_signals(queried_data: dict) -> list[str]:
133
+ """Surface key patterns from queried data β€” shown to LLM."""
134
+ seen: set[str] = set()
135
+ signals: list[str] = []
136
+
137
+ def _add(msg: str) -> None:
138
+ if msg not in seen:
139
+ seen.add(msg)
140
+ signals.append(msg)
141
+
142
+ for action_type, services in queried_data.items():
143
+ if not isinstance(services, dict):
144
+ continue
145
+ for svc, data in services.items():
146
+ t = str(data).lower()
147
+ if "out of memory" in t or "oom" in t:
148
+ _add(f"OOM detected in {svc}")
149
+ if "bgp" in t and ("withdrawal" in t or "withdrawn" in t):
150
+ _add(f"BGP route issue in {svc}")
151
+ if "pool" in t and ("exhaust" in t or "too many clients" in t):
152
+ _add(f"Connection pool issue in {svc}")
153
+ if "cache" in t and ("purge" in t or "invalidat" in t):
154
+ _add(f"Cache purge in {svc}")
155
+ if "unbounded" in t or "no limit" in t:
156
+ _add(f"Unbounded query in {svc}")
157
+ if action_type == "check_recent_deploys" and any(
158
+ x in t for x in ("ago", "change", "update", "added")
159
+ ):
160
+ snippet = str(data)[:120].replace("\n", " ")
161
+ _add(f"Recent change in {svc}: {snippet}")
162
+ return signals
163
+
164
+
165
+ # ── Message builders ────────────────────────────────────────────────────────
166
+ def _first_obs_msg(obs: dict) -> str:
167
+ alert = obs.get("alert", {})
168
+ known = obs.get("known_services", [])
169
+ affected = alert.get("affected_services", [])
170
+ task_id = obs.get("task_id", "")
171
+ non_aff = [s for s in known if s not in affected]
172
+
173
+ lines = [
174
+ "=== NEW INCIDENT ===",
175
+ f"Task: {task_id} | Max steps: {obs.get('max_steps')}",
176
+ f"Scenario: {obs.get('scenario_id', '')}",
177
+ f"INCIDENT: {obs.get('incident_summary', '')}",
178
+ ]
179
+
180
+ if alert:
181
+ lines.append("ALERT DETAILS:")
182
+ if alert.get("title"):
183
+ lines.append(f" Title: {alert['title']}")
184
+ if affected:
185
+ lines.append(f" Directly affected services: {', '.join(affected)}")
186
+ for s in alert.get("symptoms", []):
187
+ lines.append(f" - {s}")
188
+ for k in ("error_rate", "duration_minutes", "revenue_impact_per_min"):
189
+ if alert.get(k) is not None:
190
+ lines.append(f" {k}: {alert[k]}")
191
+
192
+ lines.append(f"KNOWN_SERVICES (use these EXACT names): {json.dumps(known)}")
193
+
194
+ if non_aff and task_id in ("root_cause_analysis", "remediation_planning"):
195
+ lines.append(
196
+ f" *** These services are NOT in the alert β€” investigate them "
197
+ f"for possible root cause: {json.dumps(non_aff)} ***"
198
+ )
199
+
200
+ lines.append(f"AVAILABLE ACTIONS: {obs.get('available_actions', [])}")
201
+ lines.append(f"REQUIRED SUBMISSION: {_TASK_SUBMIT.get(task_id, 'unknown')}")
202
+ lines.append("")
203
+ lines.append("Respond with your first action (JSON only, no markdown):")
204
+ return "\n".join(lines)
205
+
206
+
207
+ def _step_msg(obs: dict, prev_queried: dict) -> str:
208
+ step = obs.get("step_count", 0)
209
+ max_steps = obs.get("max_steps", 10)
210
+ left = max_steps - step
211
+ queried = obs.get("queried_data", {})
212
+ task_id = obs.get("task_id", "")
213
+
214
+ lines = [
215
+ f"Step {step}/{max_steps} ({left} remaining) | "
216
+ f"reward={obs.get('cumulative_reward', 0.0):.3f} | "
217
+ f"feedback: {obs.get('feedback', '')}",
218
+ ]
219
+
220
+ # Show new data received
221
+ new_data = []
222
+ for action_type, services in queried.items():
223
+ prev = prev_queried.get(action_type, {})
224
+ if isinstance(services, dict):
225
+ for svc, data in services.items():
226
+ if svc not in prev:
227
+ d = str(data)
228
+ if len(d) > 500:
229
+ d = d[:500] + "..."
230
+ new_data.append(f" [{action_type}][{svc}]: {d}")
231
+ if new_data:
232
+ lines.append("NEW DATA RECEIVED:")
233
+ lines.extend(new_data)
234
+
235
+ # Show extracted signals
236
+ signals = _extract_signals(queried)
237
+ if signals:
238
+ lines.append("KEY SIGNALS DETECTED:")
239
+ for sig in signals:
240
+ lines.append(f" *** {sig} ***")
241
+
242
+ # Urgency reminders
243
+ if left <= 3:
244
+ lines.append(
245
+ f"*** {left} steps remaining β€” submit "
246
+ f"{_TASK_SUBMIT.get(task_id, 'your answer')} soon ***"
247
+ )
248
+ if left <= 1:
249
+ lines.append(
250
+ f"!!! LAST STEP β€” YOU MUST {_TASK_SUBMIT.get(task_id, 'SUBMIT')} NOW !!!"
251
+ )
252
+
253
+ lines.append("Next action (JSON only, no markdown):")
254
+ return "\n".join(lines)
255
+
256
+
257
+ # ── Parse LLM output ───────────────────────────────────────────────────────
258
+ def _parse(text: str) -> dict:
259
+ text = text.strip()
260
+ # Strip markdown code fences
261
+ if text.startswith("`"):
262
+ text = "\n".join(
263
+ ln for ln in text.splitlines() if not ln.startswith("`")
264
+ ).strip()
265
+ try:
266
+ return json.loads(text)
267
+ except json.JSONDecodeError:
268
+ s = text.find("{")
269
+ e = text.rfind("}") + 1
270
+ if s != -1 and e > s:
271
+ return json.loads(text[s:e])
272
+ raise
273
+
274
+
275
+ # ── Fallback β€” generic, no scenario knowledge ──────────────────────────────
276
+ def _fallback_submit(task_id: str, obs: dict) -> dict:
277
+ """Minimal correct-type submission. Will score low but won't crash."""
278
+ alert = obs.get("alert", {})
279
+ known = obs.get("known_services", [])
280
+
281
+ if task_id == "alert_classification":
282
+ rev = alert.get("revenue_impact_per_min", 0) or 0
283
+ err = alert.get("error_rate", 0) or 0
284
+ sev = "P1" if (rev > 1000 or err > 0.9) else (
285
+ "P2" if (rev > 100 or err > 0.3) else "P3")
286
+ svc = (alert.get("affected_services") or known or ["unknown"])[0]
287
+ return {
288
+ "action_type": "submit_severity",
289
+ "parameters": {"severity": sev, "service": svc},
290
+ }
291
+
292
+ if task_id == "root_cause_analysis":
293
+ svc = known[0] if known else "unknown"
294
+ return {
295
+ "action_type": "submit_root_cause",
296
+ "parameters": {
297
+ "service": svc,
298
+ "failure_mode": "service failure causing downstream cascade",
299
+ },
300
+ }
301
+
302
+ # remediation_planning
303
+ return {
304
+ "action_type": "submit_resolution",
305
+ "parameters": {
306
+ "summary": (
307
+ "The incident was investigated through log and metric analysis "
308
+ "across affected services. Remediation actions were applied to "
309
+ "restore service health. Systems are being monitored for full "
310
+ "recovery confirmation."
311
+ ),
312
+ },
313
+ }
314
+
315
+
316
+ def _smart_fallback(
317
+ task_id: str, obs: dict, step: int, max_steps: int
318
+ ) -> dict:
319
+ """Generic fallback β€” queries unvisited services, then submits."""
320
+ known = obs.get("known_services", [])
321
+ queried = obs.get("queried_data", {})
322
+ left = max_steps - step
323
+ q_svcs = _queried_svcs(queried)
324
+
325
+ # Must submit on final step
326
+ if left <= 1:
327
+ return _fallback_submit(task_id, obs)
328
+
329
+ # Alert classification — submit after any query
330
+ if task_id == "alert_classification" and q_svcs:
331
+ return _fallback_submit(task_id, obs)
332
+
333
+ # Query next un-queried service
334
+ for svc in known:
335
+ if svc not in q_svcs:
336
+ return {
337
+ "action_type": "query_logs",
338
+ "parameters": {"service": svc},
339
+ }
340
+
341
+ # Try check_recent_deploys for unvisited services
342
+ if task_id in ("root_cause_analysis", "remediation_planning"):
343
+ deploy_queried = set(queried.get("check_recent_deploys", {}).keys())
344
+ for svc in known:
345
+ if svc not in deploy_queried:
346
+ return {
347
+ "action_type": "check_recent_deploys",
348
+ "parameters": {"service": svc},
349
+ }
350
+
351
+ # Everything queried — submit
352
+ return _fallback_submit(task_id, obs)
353
+
354
+
355
+ # ── Override — ONLY blocks clearly invalid actions ──────────────────────────
356
+ def _should_override(
357
+ task_id: str, action: dict, obs: dict, step: int, max_steps: int
358
+ ) -> bool:
359
+ at = action.get("action_type", "")
360
+ params = action.get("parameters", {})
361
+ left = max_steps - step
362
+ known = obs.get("known_services", [])
363
+
364
+ # 1. Unknown action type
365
+ if at not in _ALL_VALID:
366
+ return True
367
+
368
+ # 2. Must submit on last step
369
+ if left <= 0 and at not in _SUBMIT_TYPES:
370
+ return True
371
+
372
+ # 3. WRONG submission type for the task
373
+ # e.g. submit_severity during remediation_planning
374
+ correct_submit = _TASK_SUBMIT.get(task_id)
375
+ if at in _SUBMIT_TYPES and at != correct_submit:
376
+ return True
377
+
378
+ # 4. Service not in known_services (for service-targeted actions)
379
+ svc = (params.get("service") or "").strip()
380
+ if (svc and known
381
+ and at not in ("disable_feature_flag", "execute_runbook_step")
382
+ and svc not in known):
383
+ return True
384
+
385
+ # 5. Invalid severity value
386
+ if at == "submit_severity":
387
+ sev = (params.get("severity") or "").upper().strip()
388
+ if sev not in ("P1", "P2", "P3", "P4"):
389
+ return True
390
+
391
+ # 6. Empty required fields
392
+ if at == "submit_root_cause":
393
+ svc = (params.get("service") or "").strip()
394
+ mode = (params.get("failure_mode") or "").strip()
395
+ if not svc or len(mode) < 5:
396
+ return True
397
+
398
+ if at == "submit_resolution":
399
+ summary = (params.get("summary") or "").strip()
400
+ if len(summary) < 30:
401
+ return True
402
+
403
+ # 7. Remediation action used in alert_classification task
404
+ if task_id == "alert_classification" and at in _REM_TYPES:
405
+ return True
406
+
407
+ return False
408
+
409
+
410
+ # ── Episode runner ──────────────────────────────────────────────────────────
411
+ def _run_episode(task_id: str, scenario_index: int) -> float:
412
+ r = _session.post(
413
+ f"{ENV_BASE_URL}/reset",
414
+ params={"task_id": task_id, "scenario_index": scenario_index},
415
+ timeout=30,
416
+ )
417
+ r.raise_for_status()
418
+ obs = r.json()
419
+
420
+ messages = [
421
+ {"role": "system", "content": SYSTEM_PROMPT},
422
+ {"role": "user", "content": _first_obs_msg(obs)},
423
+ ]
424
+
425
+ prev_queried: dict = {}
426
+ max_steps = obs.get("max_steps", 10)
427
+
428
+ for step_i in range(max_steps):
429
+ current_step = step_i + 1
430
+
431
+ # ── Call LLM ─────────────────────────────────────────────────────
432
+ try:
433
+ resp = _get_client().chat.completions.create(
434
+ model=MODEL_NAME,
435
+ messages=messages,
436
+ temperature=0.0,
437
+ max_tokens=300,
438
+ stream=False,
439
+ )
440
+ raw = resp.choices[0].message.content or ""
441
+ except Exception as e:
442
+ print(f" [WARN] LLM call failed step {current_step}: {e}",
443
+ file=sys.stderr)
444
+ raw = ""
445
+
446
+ messages.append({"role": "assistant", "content": raw or "{}"})
447
+
448
+ # ── Parse ────────────────────────────────────────────────────────
449
+ action = None
450
+ try:
451
+ if raw.strip():
452
+ action = _parse(raw)
453
+ except Exception:
454
+ pass
455
+
456
+ # ── Fallback / override ──────────────────────────────────────────
457
+ if action is None:
458
+ action = _smart_fallback(task_id, obs, current_step, max_steps)
459
+ print(f" [FALLBACK] step {current_step}: "
460
+ f"{action.get('action_type')}", file=sys.stderr)
461
+ elif _should_override(task_id, action, obs, current_step, max_steps):
462
+ old_at = action.get("action_type")
463
+ action = _smart_fallback(task_id, obs, current_step, max_steps)
464
+ print(f" [OVERRIDE] step {current_step}: "
465
+ f"{old_at} -> {action.get('action_type')}",
466
+ file=sys.stderr)
467
+
468
+ # ── Step ─────────────────────────────────────────────────────────
469
+ sr = _session.post(
470
+ f"{ENV_BASE_URL}/step", json=action, timeout=30,
471
+ )
472
+ sr.raise_for_status()
473
+ result = sr.json()
474
+ new_obs = result["observation"]
475
+
476
+ print(
477
+ f" step {current_step:>2}: {action.get('action_type'):<28} "
478
+ f"reward={result['reward']['value']:+.3f} "
479
+ f"done={result['done']}",
480
+ file=sys.stderr,
481
+ )
482
+
483
+ if result.get("done"):
484
+ break
485
+
486
+ step_msg = _step_msg(new_obs, prev_queried)
487
+ messages.append({"role": "user", "content": step_msg})
488
+ prev_queried = {
489
+ k: dict(v)
490
+ for k, v in new_obs.get("queried_data", {}).items()
491
+ if isinstance(v, dict)
492
+ }
493
+ obs = new_obs
494
+
495
+ # Keep conversation window manageable
496
+ if len(messages) > 20:
497
+ messages = messages[:2] + messages[-16:]
498
+
499
+ g = _session.get(f"{ENV_BASE_URL}/grader", timeout=30)
500
+ g.raise_for_status()
501
+ return g.json().get("total", 0.0)
502
+
503
+
504
+ # ── Entry point ─────────────────────────────────────────────────────────────
505
+ def main():
506
+ runs = [
507
+ ("alert_classification", 0),
508
+ ("alert_classification", 1),
509
+ ("root_cause_analysis", 0),
510
+ ("root_cause_analysis", 1),
511
+ ("remediation_planning", 0),
512
+ ("remediation_planning", 1),
513
+ ]
514
+
515
+ results: dict[str, list[float]] = {}
516
+
517
+ print(f"{'Task':<36} {'S':>2} {'Score':>7}")
518
+ print("-" * 50)
519
+
520
+ for task_id, scenario_index in runs:
521
+ try:
522
+ score = _run_episode(task_id, scenario_index)
523
+ except Exception as e:
524
+ print(f" [ERROR] {task_id} s{scenario_index}: {e}",
525
+ file=sys.stderr)
526
+ score = 0.0
527
+
528
+ label = f"{task_id} [s{scenario_index}]"
529
+ print(f"{label:<36} {scenario_index:>2} {score:>7.4f}")
530
+ results.setdefault(task_id, []).append(score)
531
+
532
+ print("-" * 50)
533
+ summary = {
534
+ t: round(sum(v) / len(v), 4) for t, v in results.items()
535
+ }
536
+ summary["overall"] = round(sum(summary.values()) / len(summary), 4)
537
+
538
+ print("\nScore Summary:")
539
+ for k, v in summary.items():
540
+ print(f" {k:<36}: {v:.4f}")
541
+
542
+ print(json.dumps(summary))
543
+
544
+
545
+ if __name__ == "__main__":
546
+ main()
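The fence-stripping parse used above tolerates models that wrap their JSON in markdown. A standalone sketch of the same recovery strategy (a hypothetical copy for illustration, not imported from inference.py):

```python
import json

def parse_action(text: str) -> dict:
    """Parse an LLM reply into an action dict, tolerating markdown fences."""
    text = text.strip()
    if text.startswith("`"):
        # Drop fence lines such as ```json and ```
        text = "\n".join(
            ln for ln in text.splitlines() if not ln.startswith("`")
        ).strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span in the reply
        s, e = text.find("{"), text.rfind("}") + 1
        if s != -1 and e > s:
            return json.loads(text[s:e])
        raise

raw = '```json\n{"action_type": "query_logs", "parameters": {"service": "api-gateway"}}\n```'
print(parse_action(raw)["action_type"])  # query_logs
```

The outermost-brace fallback also rescues replies where the model adds prose around the JSON object.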
openenv.yaml ADDED
@@ -0,0 +1,59 @@
1
+ name: cloud-incident-response
2
+ version: "0.1.0"
3
+ app_port: 7860
4
+ description: >
5
+ OpenEnv environment simulating real-world cloud SRE on-call incident response.
6
+ Distinct from Kubernetes ops — focuses on cross-service cascading failures,
7
+ network partitions, OOM kills, and CDN storms across distributed systems.
8
+ An AI agent classifies alert severity, performs root cause analysis through
9
+ log/metric/dependency queries, and executes remediation sequences to resolve
10
+ production incidents end-to-end.
11
+ author: Elliot89
12
+ license: MIT
13
+ tags:
14
+ - openenv
15
+ - sre
16
+ - cloud
17
+ - incident-response
18
+ - devops
19
+ - real-world
20
+ - agentic
21
+
22
+ tasks:
23
+ - id: alert_classification
24
+ name: "Task 1: Alert Severity Classification"
25
+ difficulty: easy
26
+ max_steps: 3
27
+ score_range: [0.0, 1.0]
28
+ description: >
29
+ Classify incoming alert severity (P1-P4) by querying
30
+ logs and metrics across affected cloud services.
31
+
32
+ - id: root_cause_analysis
33
+ name: "Task 2: Root Cause Analysis"
34
+ difficulty: medium
35
+ max_steps: 10
36
+ score_range: [0.0, 1.0]
37
+ description: >
38
+ Trace a live incident through logs, metrics, dependencies,
39
+ and recent deploys to identify the exact root cause service
40
+ and failure mode across a distributed system.
41
+
42
+ - id: remediation_planning
43
+ name: "Task 3: Incident Remediation"
44
+ difficulty: hard
45
+ max_steps: 15
46
+ score_range: [0.0, 1.0]
47
+ description: >
48
+ Fully resolve a production incident end-to-end: diagnose
49
+ the root cause, execute the correct remediation sequence,
50
+ and submit a documented resolution summary.
51
+
52
+ endpoints:
53
+ health: "GET /health"
54
+ reset: "POST /reset"
55
+ step: "POST /step"
56
+ state: "GET /state"
57
+ tasks: "GET /tasks"
58
+ grader: "GET /grader"
59
+ baseline: "POST /baseline"
pyproject.toml ADDED
@@ -0,0 +1,16 @@
1
+ [project]
2
+ name = "cloud-incident-response-openenv"
3
+ version = "0.1.0"
4
+ description = "OpenEnv environment for cloud SRE incident response"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+
8
+ dependencies = [
9
+ "fastapi>=0.104.0",
10
+ "uvicorn[standard]>=0.24.0",
11
+ "pydantic>=2.0.0",
12
+ "requests>=2.31.0",
13
+ "openai>=1.58.0",
14
+ "httpx>=0.27.0,<0.29.0",
15
+ "python-dotenv>=1.0.0",
16
+ ]
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ fastapi>=0.104.0
2
+ uvicorn[standard]>=0.24.0
3
+ pydantic>=2.0.0
4
+ requests>=2.31.0
5
+ openai>=1.58.0
6
+ httpx>=0.27.0,<0.29.0
7
+ python-dotenv>=1.0.0
server/__init__.py ADDED
File without changes
server/app.py ADDED
@@ -0,0 +1,230 @@
1
+ """
2
+ server/app.py — FastAPI server for Cloud Incident Response OpenEnv.
3
+
4
+ Endpoints:
5
+ GET / JSON health/status (triggers HF Space "Running" badge)
6
+ GET /health Lightweight health check
7
+ POST /reset Start new episode
8
+ POST /step Submit action
9
+ GET /state Current episode state
10
+ GET /tasks All tasks with action schemas
11
+ GET /grader Score current episode
12
+ POST /baseline Run inference.py end-to-end, return score summary
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import os
19
+ import subprocess
20
+ import sys
21
+
22
+ # Ensure project root is on sys.path regardless of working directory
23
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
24
+
25
+ from contextlib import asynccontextmanager
26
+ from fastapi import FastAPI, HTTPException, Query
27
+ from fastapi.middleware.cors import CORSMiddleware
28
+
29
+ from server.models import Action
30
+ from server.environment import IncidentEnvironment
31
+ from tasks import list_tasks, ALL_TASKS
32
+
33
+ _ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
34
+
35
+ # ── Global env instance ──────────────────────────────────────────────────────
36
+ _env: IncidentEnvironment | None = None
37
+
38
+
39
+ @asynccontextmanager
40
+ async def lifespan(app: FastAPI):
41
+ """Initialise heavy objects after the server is already accepting requests."""
42
+ global _env
43
+ _env = IncidentEnvironment()
44
+ yield
45
+
46
+
47
+ def _get_env() -> IncidentEnvironment:
48
+ if _env is None:
49
+ raise HTTPException(
50
+ status_code=503,
51
+ detail="Environment initialising β€” retry in a moment",
52
+ )
53
+ return _env
54
+
55
+
56
+ app = FastAPI(
57
+ title="Cloud Incident Response β€” OpenEnv",
58
+ version="0.1.0",
59
+ description=(
60
+ "OpenEnv environment for training AI agents on cloud SRE incident response. "
61
+ "Covers cascading failures, OOM kills, CDN storms, and network partitions."
62
+ ),
63
+ lifespan=lifespan,
64
+ )
65
+
66
+ app.add_middleware(
67
+ CORSMiddleware,
68
+ allow_origins=["*"],
69
+ allow_methods=["*"],
70
+ allow_headers=["*"],
71
+ )
72
+
73
+
74
+ # ── Root — plain JSON so HF Space flips badge to Running ─────────────────────
75
+
76
+ @app.get("/")
77
+ def root():
78
+ return {
79
+ "status": "running",
80
+ "name": "cloud-incident-response",
81
+ "version": "0.1.0",
82
+ "description": "OpenEnv environment for cloud SRE incident response",
83
+ "tasks": ["alert_classification", "root_cause_analysis", "remediation_planning"],
84
+ "docs": "/docs",
85
+ "health": "/health",
86
+ }
87
+
88
+
89
+ # ── Core endpoints ────────────────────────────────────────────────────────────
90
+
91
+ @app.get("/health")
92
+ def health():
93
+ return {"status": "ok", "version": "0.1.0"}
94
+
95
+
96
+ @app.post("/reset")
97
+ def reset(
98
+ task_id: str = Query(default="alert_classification"),
99
+ scenario_index: int = Query(default=0),
100
+ ):
101
+ """Start a new episode. Returns the initial observation."""
102
+ env = _get_env()
103
+ try:
104
+ obs = env.reset(task_id=task_id, scenario_index=scenario_index)
105
+ return obs.model_dump()
106
+ except ValueError as e:
107
+ raise HTTPException(status_code=400, detail=str(e))
108
+ except Exception as e:
109
+ raise HTTPException(status_code=500, detail=str(e))
110
+
111
+
112
+ @app.post("/step")
113
+ def step(action: Action):
114
+ """Submit one action. Returns observation, reward, done, info."""
115
+ env = _get_env()
116
+ try:
117
+ obs, reward, done, info = env.step(action)
118
+ return {
119
+ "observation": obs.model_dump(),
120
+ "reward": reward.model_dump(),
121
+ "done": done,
122
+ "info": info,
123
+ }
124
+ except RuntimeError as e:
125
+ raise HTTPException(status_code=400, detail=str(e))
126
+ except Exception as e:
127
+ raise HTTPException(status_code=500, detail=str(e))
128
+
129
+
130
+ @app.get("/state")
131
+ def state():
132
+ """Return the full current episode state."""
133
+ env = _get_env()
134
+ try:
135
+ return env.state().model_dump()
136
+ except RuntimeError as e:
137
+ raise HTTPException(status_code=400, detail=str(e))
138
+ except Exception as e:
139
+ raise HTTPException(status_code=500, detail=str(e))
140
+
141
+
142
+ @app.get("/tasks")
143
+ def tasks():
144
+ """Return all tasks with descriptions and action schemas."""
145
+ return {
146
+ "tasks": list_tasks(),
147
+ "total": len(ALL_TASKS),
148
+ "action_schema": {
149
+ "diagnostic": [
150
+ {"action_type": "query_logs", "parameters": {"service": "string"}},
151
+ {"action_type": "check_metrics", "parameters": {"service": "string"}},
152
+ {"action_type": "check_dependencies", "parameters": {"service": "string"}},
153
+ {"action_type": "check_recent_deploys", "parameters": {"service": "string"}},
154
+ {"action_type": "check_service_status", "parameters": {"service": "string"}},
155
+ ],
156
+ "remediation": [
157
+ {"action_type": "restart_service", "parameters": {"service": "string"}},
158
+ {"action_type": "rollback_deploy", "parameters": {"service": "string", "target_version": "string"}},
159
+ {"action_type": "scale_service", "parameters": {"service": "string", "replicas": "int"}},
160
+ {"action_type": "disable_feature_flag", "parameters": {"flag": "string"}},
161
+ {"action_type": "clear_cache", "parameters": {"service": "string"}},
162
+ {"action_type": "execute_runbook_step", "parameters": {"runbook_action": "string", "target": "string"}},
163
+ ],
164
+ "submission": [
165
+ {"action_type": "submit_severity", "parameters": {"severity": "P1|P2|P3|P4", "service": "string"}},
166
+ {"action_type": "submit_root_cause", "parameters": {"service": "string", "failure_mode": "string"}},
167
+ {"action_type": "submit_resolution", "parameters": {"summary": "string"}},
168
+ ],
169
+ },
170
+ }
171
+
172
+
173
+ @app.get("/grader")
174
+ def grader():
175
+ """Score the current episode. Returns total in [0.0, 1.0]."""
176
+ env = _get_env()
177
+ try:
178
+ s = env.state()
179
+ from graders import grade
180
+ result = grade(s.task_id, s.model_dump(), env._scenario)
181
+ return {
182
+ "total": result["total"],
183
+ "breakdown": result["breakdown"],
184
+ "feedback": result["feedback"],
185
+ "task_id": s.task_id,
186
+ "scenario_id": s.scenario_id,
187
+ "steps_used": s.step_count,
188
+ "done": s.done,
189
+ }
190
+ except RuntimeError as e:
191
+ raise HTTPException(status_code=400, detail=str(e))
192
+ except Exception as e:
193
+ raise HTTPException(status_code=500, detail=str(e))
194
+
195
+
196
+ @app.post("/baseline")
197
+ def baseline():
198
+ """Run inference.py and return the JSON score summary."""
199
+ script = os.path.join(_ROOT, "inference.py")
200
+ if not os.path.exists(script):
201
+ raise HTTPException(
202
+ status_code=500,
203
+ detail="inference.py not found in project root",
204
+ )
205
+ try:
206
+ result = subprocess.run(
207
+ [sys.executable, script],
208
+ capture_output=True,
209
+ text=True,
210
+ timeout=1200,
211
+ cwd=_ROOT,
212
+ env={**os.environ, "ENV_BASE_URL": "http://localhost:7860"},
213
+ )
214
+ except subprocess.TimeoutExpired:
215
+ raise HTTPException(status_code=500, detail="inference.py timed out (>20 min)")
216
+
217
+ if result.returncode != 0:
218
+ raise HTTPException(status_code=500, detail=result.stderr[-2000:])
219
+
220
+ lines = result.stdout.strip().splitlines()
221
+ last = lines[-1] if lines else ""
222
+ try:
223
+ return json.loads(last)
224
+ except Exception:
225
+ return {"raw_output": result.stdout[-3000:]}
226
+
227
+
228
+ if __name__ == "__main__":
229
+ import uvicorn
230
+ uvicorn.run("server.app:app", host="0.0.0.0", port=7860, reload=False)
server/environment.py ADDED
@@ -0,0 +1,324 @@
1
+ """
2
+ server/environment.py — Core OpenEnv environment for Cloud Incident Response.
3
+
4
+ Implements the full OpenEnv interface:
5
+ reset(task_id, scenario_index) -> Observation
6
+ step(action) -> (Observation, Reward, done, info)
7
+ state() -> EpisodeState
8
+
9
+ All state is in-memory. Thread-safe via a lock.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import uuid
15
+ import threading
16
+ import sys
17
+ import os
18
+
19
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
20
+
21
+ from tasks import get_task, get_scenario
22
+ from graders import grade, _svc_match
23
+ from server.models import Action, ActionParameters, Observation, Reward, EpisodeState
24
+
25
+ # ── Action type classification ────────────────────────────────────────────────
26
+
27
+ _DIAGNOSTIC = frozenset({
28
+ "query_logs", "check_metrics", "check_dependencies",
29
+ "check_recent_deploys", "check_service_status",
30
+ })
31
+
32
+ _REMEDIATION = frozenset({
33
+ "restart_service", "rollback_deploy", "scale_service",
34
+ "disable_feature_flag", "clear_cache", "execute_runbook_step",
35
+ })
36
+
37
+ _SUBMIT = frozenset({
38
+ "submit_severity", "submit_root_cause", "submit_resolution",
39
+ })
40
+
41
+ # ── Reward constants ──────────────────────────────────────────────────────────
42
+
43
+ R_QUERY_FIRST = +0.05
44
+ R_QUERY_REPEAT = +0.01
45
+ R_QUERY_UNKNOWN = -0.05
46
+ R_REM_GOOD = +0.10
47
+ R_REM_WRONG = -0.10
48
+ R_PAST_HALF = -0.02
49
+ R_TIMEOUT = -0.10
50
+ R_BAD_ACTION = -0.03
51
+
52
+
53
+ class IncidentEnvironment:
54
+ """
55
+ OpenEnv environment for Cloud Incident Response.
56
+ One instance handles one episode at a time (thread-safe).
57
+ """
58
+
59
+ def __init__(self):
60
+ self._lock = threading.Lock()
61
+ self._s: dict = {}
62
+ self._scenario: dict = {}
63
+ self._task_def: dict = {}
64
+ self._ready = False
65
+
66
+ # ── Public OpenEnv API ───────────────────────────────────────────────────
67
+
68
+ def reset(self, task_id: str, scenario_index: int = 0) -> Observation:
69
+ with self._lock:
70
+ task_def = get_task(task_id)
71
+ scenario = get_scenario(task_id, scenario_index)
72
+
73
+ self._task_def = task_def
74
+ self._scenario = scenario
75
+ self._s = {
76
+ "episode_id": str(uuid.uuid4()),
77
+ "task_id": task_id,
78
+ "scenario_id": scenario["scenario_id"],
79
+ "step_count": 0,
80
+ "max_steps": task_def["max_steps"],
81
+ "action_history": [],
82
+ "queried_data": {},
83
+ "queried_keys": set(),
84
+ "submitted": False,
85
+ "resolved": False,
86
+ "done": False,
87
+ "cumulative_reward": 0.0,
88
+ "feedback": f"Episode started. {scenario['description']}",
89
+ }
90
+ self._ready = True
91
+ return self._build_obs()
92
+
93
+ def step(self, action: Action) -> tuple[Observation, Reward, bool, dict]:
94
+ with self._lock:
95
+ if not self._ready:
96
+ raise RuntimeError("Call reset() before step().")
97
+
98
+ s = self._s
99
+ if s["done"]:
100
+ return (
101
+ self._build_obs(),
102
+ Reward(value=0.0, reason="episode already done",
103
+ cumulative=s["cumulative_reward"]),
104
+ True,
105
+ {},
106
+ )
107
+
108
+ s["step_count"] += 1
109
+ step_num = s["step_count"]
110
+ at = action.action_type
111
+ params = action.parameters
112
+
113
+ s["action_history"].append({
114
+ "action_type": at,
115
+ "parameters": params.model_dump(exclude_none=True),
116
+ "step": step_num,
117
+ })
118
+
119
+ r = 0.0
120
+ fb: list[str] = []
121
+
122
+ # Efficiency penalty after halfway point
123
+ if step_num > s["max_steps"] // 2:
124
+ r += R_PAST_HALF
125
+ fb.append("efficiency penalty")
126
+
127
+ if at in _DIAGNOSTIC:
128
+ r, fb = self._handle_diagnostic(at, params, r, fb)
129
+ elif at in _REMEDIATION:
130
+ r, fb = self._handle_remediation(at, params, r, fb)
131
+ elif at in _SUBMIT:
132
+ r, fb, terminal = self._handle_submit(at, params, r, fb)
133
+ if terminal:
134
+ s["done"] = True
135
+ else:
136
+ r += R_BAD_ACTION
137
+ fb.append(f"unknown action_type '{at}'")
138
+
139
+ # Timeout if max steps reached without submission
140
+ if step_num >= s["max_steps"] and not s["done"]:
141
+ r += R_TIMEOUT
142
+ fb.append("timeout β€” no submission made")
143
+ s["done"] = True
144
+
145
+ # Apply grader score on terminal step
146
+ if s["done"]:
147
+ result = grade(s["task_id"], s, self._scenario)
148
+ s["cumulative_reward"] = round(
149
+ s["cumulative_reward"] + r + result["total"], 4
150
+ )
151
+ fb.append(f"grader={result['feedback']}")
152
+ else:
153
+ s["cumulative_reward"] = round(s["cumulative_reward"] + r, 4)
154
+
155
+ s["feedback"] = " | ".join(fb) if fb else "ok"
156
+
157
+ return (
158
+ self._build_obs(),
159
+ Reward(
160
+ value=round(r, 4),
161
+ reason=s["feedback"],
162
+ cumulative=s["cumulative_reward"],
163
+ ),
164
+ s["done"],
165
+ {"step": step_num, "feedback": s["feedback"]},
166
+ )
167
+
168
+ def state(self) -> EpisodeState:
169
+ with self._lock:
170
+ if not self._ready:
171
+ raise RuntimeError("No active episode β€” call reset() first.")
172
+ s = self._s
173
+ return EpisodeState(
174
+ episode_id=s["episode_id"],
175
+ task_id=s["task_id"],
176
+ scenario_id=s["scenario_id"],
177
+ step_count=s["step_count"],
178
+ max_steps=s["max_steps"],
179
+ action_history=list(s["action_history"]),
180
+ queried_data=dict(s["queried_data"]),
181
+ submitted=s["submitted"],
182
+ resolved=s["resolved"],
183
+ done=s["done"],
184
+ cumulative_reward=s["cumulative_reward"],
185
+ feedback=s["feedback"],
186
+ )
187
+
188
+ # ── Action handlers ──────────────────────────────────────────────────────
189
+
190
+ def _handle_diagnostic(
191
+ self, at: str, params: ActionParameters, r: float, fb: list[str]
192
+ ) -> tuple[float, list[str]]:
193
+ s = self._s
194
+ service = (params.service or "").lower().strip()
195
+ known = {sv.lower() for sv in self._scenario.get("known_services", set())}
196
+ tool_data = self._scenario.get("tool_responses", {}).get(at, {})
197
+ key = (at, service)
198
+
199
+ if service and service in known:
200
+ if key not in s["queried_keys"]:
201
+ r += R_QUERY_FIRST
202
+ fb.append(f"queried {service} (+{R_QUERY_FIRST})")
203
+ s["queried_keys"].add(key)
204
+ else:
205
+ r += R_QUERY_REPEAT
206
+ fb.append(f"re-queried {service} (+{R_QUERY_REPEAT})")
207
+ result = tool_data.get(service, f"No data available for '{service}'.")
208
+ s["queried_data"].setdefault(at, {})[service] = result
209
+
210
+ elif service:
211
+ r += R_QUERY_UNKNOWN
212
+ fb.append(f"unknown service '{service}' ({R_QUERY_UNKNOWN})")
213
+ else:
214
+ fb.append(f"{at}: no service specified")
215
+
216
+ return r, fb
217
+
218
+ def _handle_remediation(
219
+ self, at: str, params: ActionParameters, r: float, fb: list[str]
220
+ ) -> tuple[float, list[str]]:
221
+ s = self._s
222
+ service = (params.service or "").lower().strip()
223
+ flag = (params.flag or "").lower().strip()
224
+ runbook = (params.runbook_action or "").lower().strip()
225
+ target = (params.target or "").lower().strip()
226
+
227
+ # Build candidate keys for wrong-action matching
228
+ keys: set[str] = {at}
229
+ if service: keys.add(f"{at}:{service}")
230
+ if flag: keys.add(f"{at}:{flag}")
231
+ if runbook: keys.add(f"execute_runbook_step:{runbook}")
232
+ if target: keys.add(f"execute_runbook_step:{target}")
233
+
234
+ wrong_map = self._scenario.get("wrong_actions", {})
235
+ rem_data = self._scenario.get("remediation_data", {})
236
+
237
+ # Check for wrong actions — also use fuzzy service matching for `at:svc` keys
238
+ is_wrong = any(k in wrong_map for k in keys)
239
+ if not is_wrong and service:
240
+ # Try _svc_match against wrong action keys of the form `at:svc`
241
+ for wk in wrong_map:
242
+ if ":" in wk:
243
+ w_at, w_svc = wk.split(":", 1)
244
+ if w_at == at and _svc_match(service, w_svc):
245
+ is_wrong = True
246
+ break
247
+
248
+ if is_wrong:
249
+ r += R_REM_WRONG
250
+ reason = next(
251
+ (wrong_map[k] for k in keys if k in wrong_map),
252
+ "wrong action for this incident"
253
+ )
254
+ fb.append(f"wrong action '{at}': {str(reason)[:80]}")
255
+ else:
256
+ r += R_REM_GOOD
257
+ fb.append(f"executed {at}" + (f" on '{service}'" if service else ""))
258
+ at_data = rem_data.get(at, {})
259
+ result = (
260
+ at_data.get(service) or at_data.get(flag)
261
+ or at_data.get(runbook) or at_data.get(target)
262
+ or "action executed successfully"
263
+ )
264
+ s["queried_data"].setdefault(at, {})[
265
+ service or flag or runbook or target or at
266
+ ] = result
267
+
268
+ return r, fb
269
+
270
+ def _handle_submit(
271
+ self, at: str, params: ActionParameters, r: float, fb: list[str]
272
+ ) -> tuple[float, list[str], bool]:
273
+ s = self._s
274
+ s["submitted"] = True
275
+
276
+ if at == "submit_severity":
277
+ fb.append(f"submitted severity: {(params.severity or '').upper()}")
278
+
279
+ elif at == "submit_root_cause":
280
+ fb.append(
281
+ f"submitted root cause: "
282
+ f"service={params.service or ''}, "
283
+ f"failure_mode={params.failure_mode or ''}"
284
+ )
285
+
286
+ elif at == "submit_resolution":
287
+ summary = params.summary or ""
288
+ inv_count = sum(
289
+ 1 for a in s["action_history"]
290
+ if a.get("action_type") in _DIAGNOSTIC | _REMEDIATION
291
+ )
292
+ if summary.strip() and inv_count >= 1:
293
+ s["resolved"] = True
294
+ fb.append("resolution submitted β€” incident resolved")
295
+ else:
296
+ fb.append("resolution submitted β€” insufficient investigation")
297
+
298
+ return r, fb, True
299
+
300
+ # ── Build observation ────────────────────────────────────────────────────
301
+
302
+ def _build_obs(self) -> Observation:
303
+ s = self._s
304
+ sc = self._scenario
305
+ td = self._task_def
306
+
307
+ # Return sorted list of known service names (exact strings agents must use)
308
+ known = sorted(sc.get("known_services", set()))
309
+
310
+ return Observation(
311
+ episode_id=s["episode_id"],
312
+ task_id=s["task_id"],
313
+ scenario_id=s["scenario_id"],
314
+ step_count=s["step_count"],
315
+ max_steps=s["max_steps"],
316
+ incident_summary=sc.get("incident_summary", sc.get("description", "")),
317
+ alert=sc.get("alert", {}),
318
+ available_actions=td.get("available_actions", []),
319
+ queried_data=dict(s["queried_data"]),
320
+ cumulative_reward=s["cumulative_reward"],
321
+ done=s["done"],
322
+ feedback=s["feedback"],
323
+ known_services=known,
324
+ )
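The per-step shaping in environment.py composes additively: a first-time query of a known service earns R_QUERY_FIRST, a repeat earns R_QUERY_REPEAT, an unknown service costs R_QUERY_UNKNOWN, and every step past the halfway mark also takes the R_PAST_HALF efficiency penalty. A toy re-derivation of the diagnostic branch (constants copied from the file; the function is a simplified stand-in, not the real handler):

```python
# Constants as defined above in server/environment.py
R_QUERY_FIRST, R_QUERY_REPEAT, R_QUERY_UNKNOWN, R_PAST_HALF = 0.05, 0.01, -0.05, -0.02

def diagnostic_reward(service: str, known: set[str], seen: set[str],
                      step_num: int, max_steps: int) -> float:
    """Reward for one diagnostic query, mirroring the shaping rules above."""
    # Efficiency penalty applies past the halfway point
    r = R_PAST_HALF if step_num > max_steps // 2 else 0.0
    if service not in known:
        return round(r + R_QUERY_UNKNOWN, 4)
    r += R_QUERY_REPEAT if service in seen else R_QUERY_FIRST
    seen.add(service)
    return round(r, 4)

seen: set[str] = set()
print(diagnostic_reward("api", {"api"}, seen, 1, 10))  # 0.05  (first query, early)
print(diagnostic_reward("api", {"api"}, seen, 7, 10))  # -0.01 (repeat, past halfway)
```

Note the shaping stays small relative to the grader score added on the terminal step, so exploration cannot outweigh a correct submission.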
server/models.py ADDED
@@ -0,0 +1,75 @@
1
+ """
2
+ server/models.py — Typed Pydantic models for the OpenEnv interface.
3
+
4
+ OpenEnv requires three typed models: Action, Observation, Reward.
5
+ All models use Pydantic v2.
6
+ """
7
+
8
+ from __future__ import annotations
9
+ from pydantic import BaseModel, Field
10
+
11
+
12
+ class ActionParameters(BaseModel):
13
+ """Flexible parameter bag β€” different action types use different fields."""
14
+ service: str | None = None
15
+ severity: str | None = None
16
+ failure_mode: str | None = None
17
+ summary: str | None = None
18
+ target_version: str | None = None
19
+ replicas: int | None = None
20
+ flag: str | None = None
21
+ runbook_action: str | None = None
22
+ target: str | None = None
23
+ reasoning: str | None = None
24
+
25
+ model_config = {"extra": "allow"}
26
+
27
+
28
+ class Action(BaseModel):
29
+ """An action submitted by the agent to the environment."""
30
+ action_type: str
31
+ parameters: ActionParameters = Field(default_factory=ActionParameters)
32
+
33
+ model_config = {"extra": "allow"}
34
+
35
+
36
+ class Observation(BaseModel):
37
+ """Observation returned after reset() or step()."""
38
+ episode_id: str
39
+ task_id: str
40
+ scenario_id: str
41
+ step_count: int
42
+ max_steps: int
43
+ incident_summary: str
44
+ alert: dict
45
+ available_actions: list[str]
46
+ queried_data: dict
47
+ cumulative_reward: float
48
+ done: bool
49
+ feedback: str
50
+ # Explicit list of all valid service names for this scenario.
51
+ # Agents must use these exact strings in action parameters.
52
+ known_services: list[str] = Field(default_factory=list)
53
+
54
+
55
+ class Reward(BaseModel):
56
+ """Reward signal returned after each step()."""
57
+ value: float
58
+ reason: str
59
+ cumulative: float
60
+
61
+
62
+ class EpisodeState(BaseModel):
63
+ """Full episode state returned by GET /state."""
64
+ episode_id: str
65
+ task_id: str
66
+ scenario_id: str
67
+ step_count: int
68
+ max_steps: int
69
+ action_history: list[dict]
70
+ queried_data: dict
71
+ submitted: bool
72
+ resolved: bool
73
+ done: bool
74
+ cumulative_reward: float
75
+ feedback: str
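The models above define the wire format an agent exchanges with the server. As a minimal sketch, here is the kind of JSON payload an agent might send for a step; the field names mirror `Action` / `ActionParameters`, while the specific values are illustrative only.

```python
import json

# Hypothetical agent payload β€” field names follow the Action schema above,
# values are taken from scenario-style data for illustration.
payload = {
    "action_type": "submit_root_cause",
    "parameters": {
        "service": "analytics-service",
        "failure_mode": "unbounded query OOM killing postgres-db",
        "reasoning": "postgres-db logs show an OOM kill during a long-running query",
    },
}

body = json.dumps(payload)        # serialized request body
decoded = json.loads(body)        # what the server's Pydantic layer would parse
print(decoded["parameters"]["service"])  # analytics-service
```

Because `ActionParameters` sets `model_config = {"extra": "allow"}`, additional keys in `parameters` would be accepted rather than rejected.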
tasks.py ADDED
@@ -0,0 +1,768 @@
+"""
+tasks.py β€” Task and scenario definitions for Cloud Incident Response OpenEnv.
+
+Covers cross-service cascading failures in distributed cloud systems:
+- DB connection pool exhaustion cascading through service mesh
+- CDN cache invalidation storms causing origin overload
+- OOM kills from runaway analytics queries
+- BGP network partitions isolating availability zones
+
+Distinct from Kubernetes ops environments β€” focuses on application-layer
+incident response: log correlation, dependency tracing, and remediation
+across microservice architectures.
+
+Public API:
+    get_task(task_id) -> task metadata dict
+    get_scenario(task_id, index) -> scenario dict
+    list_tasks() -> list of task dicts
+    ALL_TASKS -> dict[task_id -> metadata]
+"""
+
+from __future__ import annotations
+
+ALL_TASKS: dict = {
+    "alert_classification": {
+        "id": "alert_classification",
+        "name": "Task 1: Alert Severity Classification",
+        "difficulty": "easy",
+        "max_steps": 3,
+        "score_range": [0.0, 1.0],
+        "description": (
+            "An alert has fired. Query logs and metrics across affected services, "
+            "then classify the incident severity: P1 (CRITICAL β€” revenue/user impact, "
+            "immediate action), P2 (HIGH β€” degraded service), P3 (MEDIUM β€” minor issue), "
+            "P4 (LOW β€” informational). Submit severity with submit_severity."
+        ),
+        "available_actions": [
+            "query_logs",
+            "check_metrics",
+            "check_dependencies",
+            "check_recent_deploys",
+            "submit_severity",
+        ],
+        "submission_action": "submit_severity",
+        "scenarios": 2,
+    },
+    "root_cause_analysis": {
+        "id": "root_cause_analysis",
+        "name": "Task 2: Root Cause Analysis",
+        "difficulty": "medium",
+        "max_steps": 10,
+        "score_range": [0.0, 1.0],
+        "description": (
+            "A production incident is active. Use diagnostic tools to trace the failure "
+            "chain across services. Query logs, metrics, dependency graphs, and recent "
+            "deploys to identify which service is the root cause and what failure mode "
+            "triggered the cascade. Submit findings with submit_root_cause."
+        ),
+        "available_actions": [
+            "query_logs",
+            "check_metrics",
+            "check_dependencies",
+            "check_recent_deploys",
+            "check_service_status",
+            "submit_root_cause",
+        ],
+        "submission_action": "submit_root_cause",
+        "scenarios": 2,
+    },
+    "remediation_planning": {
+        "id": "remediation_planning",
+        "name": "Task 3: Incident Remediation",
+        "difficulty": "hard",
+        "max_steps": 15,
+        "score_range": [0.0, 1.0],
+        "description": (
+            "A critical production incident requires full end-to-end resolution. "
+            "Diagnose the root cause, execute the correct remediation sequence "
+            "(disable feature flags, restart services, rollback deploys, run runbook steps), "
+            "then submit a resolution summary. Scored on investigation quality, "
+            "remediation correctness, efficiency, and documentation."
+        ),
+        "available_actions": [
+            "query_logs",
+            "check_metrics",
+            "check_dependencies",
+            "check_recent_deploys",
+            "check_service_status",
+            "restart_service",
+            "rollback_deploy",
+            "scale_service",
+            "disable_feature_flag",
+            "clear_cache",
+            "execute_runbook_step",
+            "submit_resolution",
+        ],
+        "submission_action": "submit_resolution",
+        "scenarios": 2,
+    },
+}
+
+# ---------------------------------------------------------------------------
+# Scenario data β€” 3 tasks Γ— 2 scenarios = 6 total episodes
+# ---------------------------------------------------------------------------
+
+SCENARIOS: dict = {
+
+    # ── TASK 1: ALERT CLASSIFICATION ────────────────────────────────────────
+
+    "alert_classification": [
+
+        # AC-001: Cascading DB connection pool exhaustion β†’ P1
+        {
+            "scenario_id": "AC-001",
+            "description": (
+                "Cascading failure: postgres-db connection pool exhausted, "
+                "causing auth-service timeouts, blocking api-gateway requests. "
+                "Revenue impact is severe and growing."
+            ),
+            "incident_summary": (
+                "P1 ALERT β€” api-gateway 5xx rate 78%, auth-service timeout rate 94%, "
+                "postgres-db connection pool at 100% (500/500). "
+                "Checkout completely down. Revenue impact: $12,000/min."
+            ),
+            "alert": {
+                "id": "ALT-20240315-001",
+                "title": "CRITICAL: api-gateway error rate spike 78%",
+                "severity_fired": "P1",
+                "affected_services": ["api-gateway", "auth-service", "postgres-db"],
+                "symptoms": [
+                    "api-gateway: HTTP 503 rate 78% (baseline: 0.1%)",
+                    "auth-service: connection timeout 94% of requests",
+                    "postgres-db: connection pool 500/500 β€” 100% utilized",
+                    "checkout flow: completely unavailable",
+                    "new user logins: 0% success rate",
+                ],
+                "error_rate": 0.78,
+                "duration_minutes": 4,
+                "revenue_impact_per_min": 12000,
+            },
+            "known_services": {"api-gateway", "auth-service", "postgres-db"},
+            "tool_responses": {
+                "query_logs": {
+                    "api-gateway": (
+                        "2024-03-15T10:04:12Z ERROR upstream connect error β€” "
+                        "reset reason: connection timeout auth-service:8080\n"
+                        "2024-03-15T10:04:13Z ERROR 503 Service Unavailable upstream: auth-service\n"
+                        "2024-03-15T10:04:14Z ERROR circuit breaker OPEN for auth-service"
+                    ),
+                    "auth-service": (
+                        "2024-03-15T10:04:10Z ERROR pq: sorry, too many clients already\n"
+                        "2024-03-15T10:04:11Z ERROR dial tcp postgres-db:5432: "
+                        "connect: connection refused β€” pool exhausted (500/500)\n"
+                        "2024-03-15T10:04:12Z ERROR all connection pool slots occupied"
+                    ),
+                    "postgres-db": (
+                        "2024-03-15T10:03:58Z LOG connection received: host=auth-service\n"
+                        "2024-03-15T10:04:00Z FATAL remaining connection slots reserved "
+                        "for non-replication superuser\n"
+                        "2024-03-15T10:04:01Z LOG max_connections=500 active=500 idle=0"
+                    ),
+                },
+                "check_metrics": {
+                    "api-gateway": (
+                        "HTTP 5xx rate: 78% | p99 latency: 30s (timeout) | "
+                        "RPS: 1,200 | circuit_breaker: OPEN"
+                    ),
+                    "auth-service": (
+                        "Error rate: 94% | DB connection wait: 28s | "
+                        "Active connections: 0 | Request queue: 847"
+                    ),
+                    "postgres-db": (
+                        "Connections: 500/500 (100%) | Query queue: 847 | "
+                        "CPU: 98% | Memory: 89% | Active queries: 500"
+                    ),
+                },
+                "check_dependencies": {
+                    "api-gateway": "Depends on: auth-service [CRITICAL], product-service [OK]",
+                    "auth-service": "Depends on: postgres-db [CRITICAL], redis-session [OK]",
+                    "postgres-db": "No upstream dependencies β€” root level service",
+                },
+                "check_recent_deploys": {
+                    "api-gateway": "Last deploy: 3 days ago β€” no recent changes",
+                    "auth-service": (
+                        "Last deploy: 47 min ago β€” PR #2341: "
+                        "increased default connection pool size from 10 to 500"
+                    ),
+                    "postgres-db": "Last deploy: 12 days ago β€” no recent changes",
+                },
+            },
+            "correct_severity": "P1",
+            "adjacent_severities": ["P2"],
+        },
+
+        # AC-002: CDN cache invalidation storm β†’ P2
+        {
+            "scenario_id": "AC-002",
+            "description": (
+                "CDN cache invalidation storm: a misconfigured purge cronjob wiped "
+                "all 2.1M cached keys, sending 40Γ— normal traffic to origin. "
+                "Site degraded but not fully down β€” P2 severity."
+            ),
+            "incident_summary": (
+                "P2 ALERT β€” CDN cache hit rate dropped from 94% to 3%, "
+                "product-service origin traffic up 4000%, image-service CPU at 95%. "
+                "Pages loading slowly (p99: 18s). Checkout still working."
+            ),
+            "alert": {
+                "id": "ALT-20240315-002",
+                "title": "HIGH: CDN cache miss storm β€” origin overloaded",
+                "severity_fired": "P2",
+                "affected_services": ["cdn-edge", "product-service", "image-service"],
+                "symptoms": [
+                    "CDN cache hit rate: 3% (normal: 94%)",
+                    "product-service: origin RPS 48,000 (normal: 1,200)",
+                    "image-service: CPU 95%, p99 latency 18s",
+                    "User experience: product pages slow, some images timing out",
+                    "Checkout: still functional (not affected)",
+                ],
+                "error_rate": 0.15,
+                "duration_minutes": 8,
+                "revenue_impact_per_min": 800,
+            },
+            "known_services": {"cdn-edge", "product-service", "image-service"},
+            "tool_responses": {
+                "query_logs": {
+                    "cdn-edge": (
+                        "2024-03-15T10:22:00Z INFO cache MISS ratio: 97% (5min window)\n"
+                        "2024-03-15T10:20:11Z WARN mass cache invalidation β€” "
+                        "2,100,000 keys purged by purge-job-prod\n"
+                        "2024-03-15T10:20:10Z INFO purge pattern: /* (ALL keys)"
+                    ),
+                    "product-service": (
+                        "2024-03-15T10:22:05Z WARN request queue depth: 12,400\n"
+                        "2024-03-15T10:22:06Z ERROR timeout fetching from image-service (18s)\n"
+                        "2024-03-15T10:22:07Z WARN worker pool 95% utilized"
+                    ),
+                    "image-service": (
+                        "2024-03-15T10:22:00Z WARN CPU throttling engaged (95%)\n"
+                        "2024-03-15T10:22:01Z ERROR worker pool exhausted β€” dropping requests\n"
+                        "2024-03-15T10:22:02Z ERROR OOM risk: memory at 91%"
+                    ),
+                },
+                "check_metrics": {
+                    "cdn-edge": (
+                        "Cache hit rate: 3% | Purge events (1h): 1 mass purge | "
+                        "Origin RPS: 48,000 | Bandwidth: 890 Gbps"
+                    ),
+                    "product-service": (
+                        "Origin RPS: 48,000 (normal: 1,200) | "
+                        "Queue depth: 12,400 | Worker utilization: 95%"
+                    ),
+                    "image-service": (
+                        "CPU: 95% | Memory: 91% | "
+                        "Worker pool: 0 free / 200 | p99 latency: 18s"
+                    ),
+                },
+                "check_dependencies": {
+                    "cdn-edge": "Origin: product-service [OVERLOADED]",
+                    "product-service": "Depends on: image-service [DEGRADED], postgres-db [OK]",
+                    "image-service": "Depends on: object-storage [OK] β€” no upstream issues",
+                },
+                "check_recent_deploys": {
+                    "cdn-edge": (
+                        "Cronjob purge-job-prod updated 2h ago β€” "
+                        "purge pattern changed from /images/* to /* (all keys)"
+                    ),
+                    "product-service": "Last deploy: 5 days ago β€” no recent changes",
+                    "image-service": "Last deploy: 2 days ago β€” no recent changes",
+                },
+            },
+            "correct_severity": "P2",
+            "adjacent_severities": ["P1", "P3"],
+        },
+    ],
+
+    # ── TASK 2: ROOT CAUSE ANALYSIS ─────────────────────────────────────────
+
+    "root_cause_analysis": [
+
+        # RCA-001: Analytics service OOM kills postgres-db
+        {
+            "scenario_id": "RCA-001",
+            "description": (
+                "postgres-db was OOM-killed by the Linux kernel after a runaway "
+                "analytics query with no LIMIT clause consumed all available memory. "
+                "All downstream services are now failing. analytics-service is the culprit."
+            ),
+            "incident_summary": (
+                "Multiple services down: api-gateway 503, auth-service failing, "
+                "order-service write failures. postgres-db restarting in a loop. "
+                "Root cause is upstream β€” trace the failure chain."
+            ),
+            "alert": {
+                "id": "ALT-RCA-001",
+                "title": "CRITICAL: postgres-db crash loop β€” all dependents down",
+                "severity_fired": "P1",
+                "affected_services": [
+                    "api-gateway", "auth-service", "order-service", "postgres-db",
+                ],
+                "symptoms": [
+                    "postgres-db: 4 restarts in 12 minutes",
+                    "auth-service: connection refused β€” 100% failure",
+                    "order-service: all writes failing",
+                    "api-gateway: 503 on all authenticated routes",
+                    "analytics-service: last job failed 12 min ago",
+                ],
+                "error_rate": 0.95,
+                "duration_minutes": 14,
+            },
+            "known_services": {
+                "api-gateway", "auth-service", "order-service",
+                "postgres-db", "analytics-service", "redis-session",
+            },
+            "tool_responses": {
+                "query_logs": {
+                    "postgres-db": (
+                        "2024-03-16T02:11:00Z LOG database system shut down at 02:10:58\n"
+                        "2024-03-16T02:10:58Z FATAL Out of Memory: Kill process 1847 (postgres) "
+                        "score 982 or sacrifice child\n"
+                        "2024-03-16T02:10:30Z LOG process 1847 query running 12min: "
+                        "SELECT * FROM events JOIN user_sessions JOIN orders "
+                        "JOIN products β€” no LIMIT clause, est 847M rows"
+                    ),
+                    "analytics-service": (
+                        "2024-03-16T01:58:00Z INFO starting job: full_history_export\n"
+                        "2024-03-16T01:58:01Z WARN query has no LIMIT β€” estimated 847M rows\n"
+                        "2024-03-16T02:10:55Z ERROR job killed by OOM β€” full_history_export FAILED"
+                    ),
+                    "auth-service": (
+                        "2024-03-16T02:11:05Z ERROR connect ECONNREFUSED postgres-db:5432\n"
+                        "2024-03-16T02:11:06Z ERROR all retries exhausted β€” giving up"
+                    ),
+                    "api-gateway": (
+                        "2024-03-16T02:11:10Z ERROR upstream auth-service: 503 Service Unavailable"
+                    ),
+                    "order-service": (
+                        "2024-03-16T02:11:08Z ERROR pq: the database system is starting up"
+                    ),
+                    "redis-session": "No errors β€” operating normally at 99.2% hit rate",
+                },
+                "check_metrics": {
+                    "postgres-db": (
+                        "Memory: OOM killed (0% free at crash) | "
+                        "Restarts: 4 in 12min | Status: RESTARTING"
+                    ),
+                    "analytics-service": (
+                        "Memory at crash: 31.2GB / 32GB (97.5%) | "
+                        "Job runtime: 12min 55s | Status: ERROR"
+                    ),
+                    "auth-service": "Connection success: 0% | DB: CRITICAL | Redis: OK",
+                    "api-gateway": "503 rate: 95% | Auth dependency: DOWN",
+                    "order-service": "Write success: 0% | DB: RESTARTING",
+                    "redis-session": "Hit rate: 99.2% | Memory: 42% | Healthy",
+                },
+                "check_dependencies": {
+                    "postgres-db": (
+                        "Clients: auth-service, order-service, analytics-service, product-service"
+                    ),
+                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
+                    "auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
+                    "api-gateway": "Depends on: auth-service [DOWN]",
+                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
+                    "redis-session": "No DB dependency β€” standalone cache",
+                },
+                "check_recent_deploys": {
+                    "analytics-service": (
+                        "Deploy 6h ago: added full_history_export scheduled job β€” "
+                        "runs daily at 02:00 UTC, no LIMIT on cross-table JOIN"
+                    ),
+                    "postgres-db": "No deploys in 3 weeks",
+                    "auth-service": "No recent deploys",
+                    "order-service": "No recent deploys",
+                    "redis-session": "No recent deploys",
+                },
+                "check_service_status": {
+                    "postgres-db": "RESTARTING | Uptime: 47s | Crash reason: OOM",
+                    "analytics-service": "ERROR | Last job: full_history_export FAILED",
+                    "auth-service": "DOWN | Waiting for postgres-db",
+                    "api-gateway": "DEGRADED | 95% requests failing",
+                    "order-service": "DOWN | Waiting for postgres-db",
+                    "redis-session": "HEALTHY | All normal",
+                },
+            },
+            "correct_root_cause": {
+                "service": "analytics-service",
+                "failure_mode": "unbounded query OOM killing postgres-db",
+            },
+            "wrong_actions": {
+                "restart_service:auth-service": "auth-service is a victim β€” DB must be fixed first",
+                "restart_service:api-gateway": "api-gateway is downstream β€” won't help",
+                "scale_service:postgres-db": "Scaling won't prevent OOM if the bad query runs again",
+                "rollback_deploy:postgres-db": "postgres-db has no recent deploys",
+            },
+        },
+
+        # RCA-002: BGP route withdrawal β€” AZ network partition
+        {
+            "scenario_id": "RCA-002",
+            "description": (
+                "A BGP route withdrawal isolated AZ-1 (where payment-service runs) "
+                "from AZ-2 and AZ-3, causing 61% of checkout requests to fail. "
+                "Services within AZ-1 are healthy β€” it is a pure network issue."
+            ),
+            "incident_summary": (
+                "Checkout failure rate 61% β€” AZ-2 and AZ-3 cannot reach payment-service "
+                "in AZ-1. AZ-1 users unaffected. fraud-detection-service also unreachable "
+                "cross-AZ. Network infrastructure change 18 min ago."
+            ),
+            "alert": {
+                "id": "ALT-RCA-002",
+                "title": "HIGH: checkout failure 61% β€” cross-AZ connectivity loss",
+                "severity_fired": "P2",
+                "affected_services": [
+                    "order-service", "payment-service", "fraud-detection-service",
+                ],
+                "symptoms": [
+                    "checkout failure rate: 61% (AZ-2/AZ-3 only)",
+                    "payment-service: unreachable from AZ-2, AZ-3",
+                    "fraud-detection-service: timeout from AZ-2, AZ-3",
+                    "AZ-1 users: 0% failure rate",
+                    "Network: AZ-2/AZ-3 β†’ AZ-1 routing broken",
+                ],
+                "error_rate": 0.61,
+                "duration_minutes": 9,
+            },
+            "known_services": {
+                "order-service", "payment-service", "fraud-detection-service",
+                "postgres-db", "redis-payment-cache", "network-infra",
+            },
+            "tool_responses": {
+                "query_logs": {
+                    "order-service": (
+                        "2024-03-17T14:32:10Z ERROR connection timeout payment-service:8080 "
+                        "(AZ-2 to AZ-1: no route to host)\n"
+                        "2024-03-17T14:32:11Z ERROR fraud-detection-service: i/o timeout (30s)"
+                    ),
+                    "payment-service": (
+                        "2024-03-17T14:31:58Z WARN health check from AZ-2 LB failing\n"
+                        "2024-03-17T14:31:59Z INFO AZ-1 local traffic: all normal"
+                    ),
+                    "fraud-detection-service": (
+                        "2024-03-17T14:32:00Z INFO AZ-1 requests: all normal\n"
+                        "2024-03-17T14:32:01Z WARN cross-AZ health probes: 100% timeout"
+                    ),
+                    "network-infra": (
+                        "2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.2.1 route withdrawal β€” "
+                        "AZ-2 lost route to AZ-1 CIDR 10.0.1.0/24\n"
+                        "2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.3.1 route withdrawal β€” "
+                        "AZ-3 lost route to AZ-1 CIDR 10.0.1.0/24\n"
+                        "2024-03-17T14:31:44Z INFO router config change applied β€” "
+                        "BGP advertisement policy updated"
+                    ),
+                    "postgres-db": "Operating normally β€” no errors detected",
+                    "redis-payment-cache": "Operating normally β€” AZ-1 traffic only, all healthy",
+                },
+                "check_metrics": {
+                    "order-service": (
+                        "AZ-2 checkout failure: 99% | AZ-3 checkout failure: 98% | "
+                        "AZ-1 checkout failure: 0.2% (baseline)"
+                    ),
+                    "payment-service": (
+                        "AZ-1 traffic: normal (100% success) | "
+                        "AZ-2/AZ-3 inbound connections: 0 (blocked)"
+                    ),
+                    "fraud-detection-service": (
+                        "AZ-1 processing: normal | "
+                        "Cross-AZ health checks: 100% timeout"
+                    ),
+                    "network-infra": (
+                        "BGP session AZ-2: WITHDRAWN | BGP session AZ-3: WITHDRAWN | "
+                        "AZ-1 internal: all UP | Config change: 18min ago"
+                    ),
+                    "postgres-db": "All metrics normal β€” no anomalies",
+                    "redis-payment-cache": "All metrics normal β€” AZ-1 only traffic",
+                },
+                "check_dependencies": {
+                    "order-service": (
+                        "Depends on: payment-service [PARTITIONED], "
+                        "fraud-detection-service [PARTITIONED]"
+                    ),
+                    "payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
+                    "fraud-detection-service": "Depends on: postgres-db [OK]",
+                    "network-infra": "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN], AZ-1 [UP]",
+                },
+                "check_recent_deploys": {
+                    "network-infra": (
+                        "Router config change 18min ago β€” BGP route advertisement policy update: "
+                        "inadvertently withdrew AZ-1 routes from AZ-2/AZ-3 peers"
+                    ),
+                    "payment-service": "No recent deploys",
+                    "order-service": "No recent deploys",
+                    "fraud-detection-service": "No recent deploys",
+                },
+                "check_service_status": {
+                    "payment-service": "HEALTHY within AZ-1 | Cross-AZ: UNREACHABLE",
+                    "order-service": "DEGRADED | AZ-2/AZ-3 instances failing",
+                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN | AZ-1: UP",
+                    "fraud-detection-service": "HEALTHY within AZ-1 | Cross-AZ: UNREACHABLE",
+                    "postgres-db": "HEALTHY",
+                    "redis-payment-cache": "HEALTHY",
+                },
+            },
+            "correct_root_cause": {
+                "service": "network-infra",
+                "failure_mode": "BGP route withdrawal causing AZ network partition",
+            },
+            "wrong_actions": {
+                "restart_service:payment-service": "payment-service is healthy β€” restarting won't fix routing",
+                "restart_service:order-service": "order-service is a victim of the partition",
+                "scale_service:payment-service": "Scaling won't fix a BGP routing issue",
+                "clear_cache:redis-payment-cache": "Cache is healthy β€” not the cause",
+            },
+        },
+    ],
+
+    # ── TASK 3: REMEDIATION PLANNING ────────────────────────────────────────
+
+    "remediation_planning": [
+
+        # RP-001: Full OOM remediation β€” disable cron, restart cascade
+        {
+            "scenario_id": "RP-001",
+            "description": (
+                "Full remediation: analytics-service OOM-killed postgres-db with an "
+                "unbounded query. Must disable the offending job, restart postgres, "
+                "restore all downstream services, and document the resolution."
+            ),
+            "incident_summary": (
+                "CRITICAL β€” postgres-db in OOM crash loop. auth-service, order-service, "
+                "api-gateway all down. analytics-service caused it with unbounded query. "
+                "Required actions: disable job, restart postgres, restore services, document."
+            ),
+            "alert": {
+                "id": "ALT-RP-001",
+                "title": "CRITICAL: postgres-db OOM crash loop β€” full stack down",
+                "severity_fired": "P1",
+                "affected_services": [
+                    "postgres-db", "analytics-service",
+                    "auth-service", "order-service", "api-gateway",
+                ],
+            },
+            "known_services": {
+                "postgres-db", "auth-service", "order-service",
+                "api-gateway", "analytics-service",
+            },
+            "tool_responses": {
+                "query_logs": {
+                    "postgres-db": (
+                        "FATAL: Out of Memory: Kill process (postgres) β€” "
+                        "analytics query running 12min with no LIMIT"
+                    ),
+                    "analytics-service": (
+                        "ERROR: full_history_export β€” unbounded JOIN, 847M rows, killed by OOM"
+                    ),
+                    "auth-service": "ERROR: connect ECONNREFUSED postgres-db:5432",
+                    "order-service": "ERROR: pq: the database system is starting up",
+                    "api-gateway": "ERROR: upstream auth-service 503",
+                },
+                "check_metrics": {
+                    "postgres-db": "Memory: OOM | Restarts: 4 | Status: CRASH LOOP",
+                    "analytics-service": "Memory spike: 31GB/32GB | Status: ERROR",
+                    "auth-service": "Connection success: 0% | Waiting for DB",
+                    "order-service": "Write success: 0% | Waiting for DB",
+                    "api-gateway": "503 rate: 95% | Auth: DOWN",
+                },
+                "check_dependencies": {
+                    "postgres-db": "Clients: auth-service, order-service, analytics-service",
+                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
+                    "auth-service": "Depends on: postgres-db [CRASH LOOP]",
+                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
+                },
+                "check_recent_deploys": {
+                    "analytics-service": (
+                        "Deploy 6h ago: full_history_export job β€” "
+                        "unbounded cross-table JOIN query"
+                    ),
+                    "postgres-db": "No recent changes",
+                },
+                "check_service_status": {
+                    "postgres-db": "CRASH LOOP | OOM kill | Uptime: 47s",
+                    "analytics-service": "ERROR | Last job failed",
+                    "auth-service": "DOWN",
+                    "order-service": "DOWN",
+                    "api-gateway": "DEGRADED",
+                },
+            },
+            "remediation_data": {
+                "disable_feature_flag": {
+                    "full_history_export": (
+                        "Cron job full_history_export DISABLED β€” "
+                        "no more unbounded queries will run"
+                    ),
+                },
+                "restart_service": {
+                    "postgres-db": (
+                        "postgres-db restarted cleanly β€” "
+                        "accepting connections (12/500 active)"
+                    ),
+                    "analytics-service": (
+                        "analytics-service restarted β€” no active queries"
+                    ),
+                    "auth-service": "auth-service restarted β€” reconnected to postgres-db OK",
+                    "order-service": "order-service restarted β€” writes resuming normally",
+                },
+                "execute_runbook_step": {
+                    "verify_db_health": (
+                        "postgres-db: connections 12/500, CPU 12%, Memory 34% β€” healthy"
+                    ),
+                    "check_service_recovery": (
+                        "auth-service OK | order-service OK | api-gateway OK"
+                    ),
+                },
+            },
+            "correct_remediation_sequence": [
+                "disable_feature_flag:full_history_export",
+                "restart_service:analytics-service",
+                "restart_service:postgres-db",
+                "restart_service:auth-service",
+                "restart_service:order-service",
+            ],
+            "wrong_actions": {
+                "rollback_deploy:postgres-db": (
+                    "postgres-db has no recent deploy to roll back"
+                ),
+                "scale_service:postgres-db": (
+                    "Scaling won't prevent the OOM query from running again"
+                ),
+                "restart_service:api-gateway": (
+                    "api-gateway is downstream β€” fix the DB first"
+                ),
+            },
+            "resolution_keywords": [
+                "analytics", "oom", "memory", "postgres", "query",
+                "full_history_export", "disabled", "restarted", "recovered",
+            ],
+        },
+
+        # RP-002: Full BGP remediation β€” restore routes, rollback config, verify
+        {
+            "scenario_id": "RP-002",
+            "description": (
+                "Full remediation: BGP route withdrawal partitioned AZ-2/AZ-3 from "
+                "AZ-1 where payment-service runs. Must restore BGP routes, roll back "
+                "the router config change, verify checkout recovery, and document."
+            ),
+            "incident_summary": (
+                "P2 β€” BGP partition isolating payment-service from 61% of users. "
+                "Router config change 18min ago is the cause. "
+                "Required: restore BGP routes, rollback network config, verify recovery."
+            ),
+            "alert": {
+                "id": "ALT-RP-002",
+                "title": "HIGH: checkout 61% failure β€” BGP AZ partition",
+                "severity_fired": "P2",
+                "affected_services": ["network-infra", "order-service", "payment-service"],
+            },
+            "known_services": {
+                "network-infra", "order-service", "payment-service",
+                "fraud-detection-service", "postgres-db",
+            },
+            "tool_responses": {
+                "query_logs": {
+                    "network-infra": (
+                        "CRITICAL: BGP route withdrawal β€” "
+                        "AZ-2/AZ-3 lost route to AZ-1 10.0.1.0/24\n"
+                        "Router config change 18min ago: BGP policy updated"
+                    ),
+                    "order-service": (
+                        "ERROR: connection timeout payment-service β€” no route to host"
+                    ),
+                    "payment-service": (
+                        "INFO: AZ-1 traffic normal | "
+                        "WARN: cross-AZ health checks failing"
+                    ),
+                    "fraud-detection-service": (
+                        "WARN: cross-AZ health probes 100% timeout | AZ-1 traffic: normal"
+                    ),
+                    "postgres-db": "Operating normally",
+                },
+                "check_metrics": {
+                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN | AZ-1: UP",
+                    "order-service": "AZ-2 failure: 99% | AZ-1 failure: 0.2%",
+                    "payment-service": "AZ-1: normal | Cross-AZ inbound: 0",
+                    "fraud-detection-service": "AZ-1: normal | Cross-AZ: 0",
+                    "postgres-db": "All normal",
+                },
+                "check_dependencies": {
+                    "order-service": "Depends on: payment-service [PARTITIONED]",
+                    "payment-service": "Depends on: postgres-db [OK]",
+                    "network-infra": "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN]",
+                },
+                "check_recent_deploys": {
+                    "network-infra": (
+                        "Config change 18min ago β€” BGP policy update "
+                        "accidentally withdrew AZ-1 routes"
+                    ),
+                    "payment-service": "No recent deploys",
+                    "order-service": "No recent deploys",
+                },
+                "check_service_status": {
+                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
+                    "payment-service": "HEALTHY (AZ-1) | Cross-AZ: UNREACHABLE",
+                    "order-service": "DEGRADED",
+                },
+            },
+            "remediation_data": {
+                "rollback_deploy": {
+                    "network-infra": (
+                        "Router config rolled back β€” "
+                        "BGP advertisement policy restored to previous version"
+                    ),
+                },
+                "execute_runbook_step": {
+                    "restore_bgp_routes": (
+                        "BGP routes restored β€” AZ-2/AZ-3 can now reach AZ-1 10.0.1.0/24"
+                    ),
+                    "verify_checkout_recovery": (
+                        "Checkout failure rate: 0.3% β€” incident fully resolved"
+                    ),
+                },
+            },
+            "correct_remediation_sequence": [
+                "execute_runbook_step:restore_bgp_routes",
+                "rollback_deploy:network-infra",
+                "execute_runbook_step:verify_checkout_recovery",
+            ],
+            "wrong_actions": {
+                "restart_service:payment-service": "payment-service is healthy β€” network is the issue",
+                "scale_service:payment-service": "Scaling won't fix BGP routing",
+                "restart_service:order-service": "order-service is a victim",
+                "clear_cache": "Cache is unrelated to network routing",
+            },
+            "resolution_keywords": [
+                "bgp", "network", "route", "rollback", "partition",
+                "restored", "az-1", "az-2", "az-3", "checkout", "withdrawal",
+            ],
+        },
+    ],
+}
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def get_task(task_id: str) -> dict:
+    if task_id not in ALL_TASKS:
+        raise ValueError(
+            f"Unknown task_id '{task_id}'. "
+            f"Valid: {list(ALL_TASKS.keys())}"
+        )
+    return ALL_TASKS[task_id]
+
+
+def get_scenario(task_id: str, index: int) -> dict:
+    if task_id not in SCENARIOS:
+        raise ValueError(f"No scenarios for task_id '{task_id}'.")
+    scenarios = SCENARIOS[task_id]
+    if index < 0 or index >= len(scenarios):
+        raise ValueError(
+            f"Scenario index {index} out of range for task '{task_id}' "
+            f"(valid: 0–{len(scenarios) - 1})"
+        )
+    return scenarios[index]
+
+
+def list_tasks() -> list:
+    return list(ALL_TASKS.values())
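The public API above is a pair of guarded dictionary lookups. A minimal, self-contained sketch of that pattern, using a stub registry in place of the full `ALL_TASKS` / `SCENARIOS` data (the stub entries are illustrative, not the real task data):

```python
# Stub registries standing in for the full ALL_TASKS / SCENARIOS dicts.
ALL_TASKS = {"alert_classification": {"id": "alert_classification", "max_steps": 3}}
SCENARIOS = {"alert_classification": [{"scenario_id": "AC-001"}, {"scenario_id": "AC-002"}]}


def get_task(task_id: str) -> dict:
    # Unknown ids fail loudly, listing the valid keys.
    if task_id not in ALL_TASKS:
        raise ValueError(f"Unknown task_id '{task_id}'. Valid: {list(ALL_TASKS)}")
    return ALL_TASKS[task_id]


def get_scenario(task_id: str, index: int) -> dict:
    # Bounds-checked scenario lookup within a task's scenario list.
    scenarios = SCENARIOS[task_id]
    if index < 0 or index >= len(scenarios):
        raise ValueError(f"Scenario index {index} out of range for task '{task_id}'")
    return scenarios[index]


print(get_task("alert_classification")["max_steps"])          # 3
print(get_scenario("alert_classification", 1)["scenario_id"])  # AC-002
```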
uv.lock ADDED
The diff for this file is too large to render. See raw diff