cloud-incident / tasks.py
Elliot89's picture
feat: cloud incident response OpenEnv v0.1.0
37204eb
"""
tasks.py — Task and scenario definitions for Cloud Incident Response OpenEnv.
Covers cross-service cascading failures in distributed cloud systems:
- DB connection pool exhaustion cascading through service mesh
- CDN cache invalidation storms
- OOM kills from runaway analytics queries
- BGP network partitions isolating availability zones
Distinct from Kubernetes ops environments — focuses on application-layer
incident response: log correlation, dependency tracing, and remediation
across microservice architectures.
Public API:
get_task(task_id) -> task metadata dict
get_scenario(task_id, index) -> scenario dict
list_tasks() -> list of task dicts
ALL_TASKS -> dict[task_id -> metadata]
"""
from __future__ import annotations
# Task registry: maps each task_id to its metadata dict. Every entry repeats
# its own key under "id", lists the actions available for that task in
# "available_actions", and names the terminal action in "submission_action"
# (always one of the available actions). "scenarios" is a hand-maintained
# count of scenarios for the task.
# Em-dashes in the description text are real U+2014 characters (the earlier
# text carried mis-decoded byte sequences in their place).
ALL_TASKS: dict = {
    "alert_classification": {
        "id": "alert_classification",
        "name": "Task 1: Alert Severity Classification",
        "difficulty": "easy",
        "max_steps": 3,
        "score_range": [0.0, 1.0],
        "description": (
            "An alert has fired. Query logs and metrics across affected services, "
            "then classify the incident severity: P1 (CRITICAL — revenue/user impact, "
            "immediate action), P2 (HIGH — degraded service), P3 (MEDIUM — minor issue), "
            "P4 (LOW — informational). Submit severity with submit_severity."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "submit_severity",
        ],
        "submission_action": "submit_severity",
        "scenarios": 2,
    },
    "root_cause_analysis": {
        "id": "root_cause_analysis",
        "name": "Task 2: Root Cause Analysis",
        "difficulty": "medium",
        "max_steps": 10,
        "score_range": [0.0, 1.0],
        "description": (
            "A production incident is active. Use diagnostic tools to trace the failure "
            "chain across services. Query logs, metrics, dependency graphs, and recent "
            "deploys to identify which service is the root cause and what failure mode "
            "triggered the cascade. Submit findings with submit_root_cause."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "check_service_status",
            "submit_root_cause",
        ],
        "submission_action": "submit_root_cause",
        "scenarios": 2,
    },
    "remediation_planning": {
        "id": "remediation_planning",
        "name": "Task 3: Incident Remediation",
        "difficulty": "hard",
        "max_steps": 15,
        "score_range": [0.0, 1.0],
        "description": (
            "A critical production incident requires full end-to-end resolution. "
            "Diagnose the root cause, execute the correct remediation sequence "
            "(disable feature flags, restart services, rollback deploys, run runbook steps), "
            "then submit a resolution summary. Scored on investigation quality, "
            "remediation correctness, efficiency, and documentation."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "check_service_status",
            "restart_service",
            "rollback_deploy",
            "scale_service",
            "disable_feature_flag",
            "clear_cache",
            "execute_runbook_step",
            "submit_resolution",
        ],
        "submission_action": "submit_resolution",
        "scenarios": 2,
    },
}
# ---------------------------------------------------------------------------
# Scenario data — 3 tasks x 2 scenarios = 6 total episodes
# ---------------------------------------------------------------------------
# Scenario registry: maps each task_id to a list of scenario dicts.
#
# Keys present in every scenario below:
#   scenario_id, description, incident_summary, alert, known_services (a set),
#   tool_responses (action name -> service name -> canned response text),
#   wrong_actions (action or "action:target" -> explanation text).
# Keys present only in some scenarios:
#   correct_severity / adjacent_severities      (alert_classification)
#   correct_root_cause / correct_remediation    (alert_classification, root_cause_analysis)
#   remediation_data / correct_remediation_sequence / resolution_keywords
#                                               (remediation_planning)
#
# Dashes and arrows inside the text are real Unicode characters (U+2014 em-dash,
# U+2192 right arrow); the earlier text carried mis-decoded byte sequences in
# their place, which would have been shown verbatim to whoever reads the
# alert, log, and feedback strings.
# NOTE(review): some wrong_actions keys carry a ":target" suffix and some do
# not (e.g. "clear_cache") — confirm how the consumer matches these keys.
SCENARIOS: dict = {
    # ── TASK 1: ALERT CLASSIFICATION ────────────────────────────────────────
    "alert_classification": [
        # AC-001: Cascading DB connection pool exhaustion
        {
            "scenario_id": "AC-001",
            "description": (
                "Cascading failure: postgres-db connection pool exhausted, "
                "causing auth-service timeouts, blocking api-gateway requests. "
                "Revenue impact is severe and growing."
            ),
            "incident_summary": (
                "P1 ALERT — api-gateway 5xx rate 78%, auth-service timeout rate 94%, "
                "postgres-db connection pool at 100% (500/500). "
                "Checkout completely down. Revenue impact: $12,000/min."
            ),
            "alert": {
                "id": "ALT-20240315-001",
                "title": "CRITICAL: api-gateway error rate spike 78%",
                "severity_fired": "P1",
                "affected_services": ["api-gateway", "auth-service", "postgres-db"],
                "symptoms": [
                    "api-gateway: HTTP 503 rate 78% (baseline: 0.1%)",
                    "auth-service: connection timeout 94% of requests",
                    "postgres-db: connection pool 500/500 — 100% utilized",
                    "checkout flow: completely unavailable",
                    "new user logins: 0% success rate",
                ],
                "error_rate": 0.78,
                "duration_minutes": 4,
                "revenue_impact_per_min": 12000,
            },
            "known_services": {"api-gateway", "auth-service", "postgres-db"},
            "tool_responses": {
                "query_logs": {
                    "api-gateway": (
                        "2024-03-15T10:04:12Z ERROR upstream connect error — "
                        "reset reason: connection timeout auth-service:8080\n"
                        "2024-03-15T10:04:13Z ERROR 503 Service Unavailable upstream: auth-service\n"
                        "2024-03-15T10:04:14Z ERROR circuit breaker OPEN for auth-service"
                    ),
                    "auth-service": (
                        "2024-03-15T10:04:10Z ERROR pq: sorry, too many clients already\n"
                        "2024-03-15T10:04:11Z ERROR dial tcp postgres-db:5432: "
                        "connect: connection refused — pool exhausted (500/500)\n"
                        "2024-03-15T10:04:12Z ERROR all connection pool slots occupied"
                    ),
                    "postgres-db": (
                        "2024-03-15T10:03:58Z LOG connection received: host=auth-service\n"
                        "2024-03-15T10:04:00Z FATAL remaining connection slots reserved "
                        "for non-replication superuser\n"
                        "2024-03-15T10:04:01Z LOG max_connections=500 active=500 idle=0"
                    ),
                },
                "check_metrics": {
                    "api-gateway": (
                        "HTTP 5xx rate: 78% | p99 latency: 30s (timeout) | "
                        "RPS: 1,200 | circuit_breaker: OPEN"
                    ),
                    "auth-service": (
                        "Error rate: 94% | DB connection wait: 28s | "
                        "Active connections: 0 | Request queue: 847"
                    ),
                    "postgres-db": (
                        "Connections: 500/500 (100%) | Query queue: 847 | "
                        "CPU: 98% | Memory: 89% | Active queries: 500"
                    ),
                },
                "check_dependencies": {
                    "api-gateway": "Depends on: auth-service [CRITICAL], product-service [OK]",
                    "auth-service": "Depends on: postgres-db [CRITICAL], redis-session [OK]",
                    "postgres-db": "No upstream dependencies — root level service",
                },
                "check_recent_deploys": {
                    "api-gateway": "Last deploy: 3 days ago — no recent changes",
                    "auth-service": (
                        "Last deploy: 47 min ago — PR #2341: "
                        "increased default connection pool size from 10 to 500"
                    ),
                    "postgres-db": "Last deploy: 12 days ago — no recent changes",
                },
            },
            "correct_severity": "P1",
            "adjacent_severities": ["P2"],
            "correct_root_cause": {
                "service": "postgres-db",
                "failure_mode": "connection pool exhaustion",
            },
            "correct_remediation": [
                "restart_service:auth-service",
                "execute_runbook_step:increase_max_connections",
                "scale_service:postgres-db",
            ],
            "wrong_actions": {
                "rollback_deploy": "Rolling back auth-service pool size won't fix 500 stuck connections",
                "restart_service:api-gateway": "api-gateway is a victim — fixing it won't help",
                "clear_cache": "Cache is unrelated to DB connection pool exhaustion",
            },
        },
        # AC-002: CDN cache invalidation storm
        {
            "scenario_id": "AC-002",
            "description": (
                "CDN cache invalidation storm: a misconfigured purge cronjob wiped "
                "all 2.1M cached keys, sending 40x normal traffic to origin. "
                "Site degraded but not fully down — P2 severity."
            ),
            "incident_summary": (
                "P2 ALERT — CDN cache hit rate dropped from 94% to 3%, "
                "product-service origin traffic up 4000%, image-service CPU at 95%. "
                "Pages loading slowly (p99: 18s). Checkout still working."
            ),
            "alert": {
                "id": "ALT-20240315-002",
                "title": "HIGH: CDN cache miss storm — origin overloaded",
                "severity_fired": "P2",
                "affected_services": ["cdn-edge", "product-service", "image-service"],
                "symptoms": [
                    "CDN cache hit rate: 3% (normal: 94%)",
                    "product-service: origin RPS 48,000 (normal: 1,200)",
                    "image-service: CPU 95%, p99 latency 18s",
                    "User experience: product pages slow, some images timing out",
                    "Checkout: still functional (not affected)",
                ],
                "error_rate": 0.15,
                "duration_minutes": 8,
                "revenue_impact_per_min": 800,
            },
            "known_services": {"cdn-edge", "product-service", "image-service"},
            "tool_responses": {
                "query_logs": {
                    "cdn-edge": (
                        "2024-03-15T10:22:00Z INFO cache MISS ratio: 97% (5min window)\n"
                        "2024-03-15T10:20:11Z WARN mass cache invalidation — "
                        "2,100,000 keys purged by purge-job-prod\n"
                        "2024-03-15T10:20:10Z INFO purge pattern: /* (ALL keys)"
                    ),
                    "product-service": (
                        "2024-03-15T10:22:05Z WARN request queue depth: 12,400\n"
                        "2024-03-15T10:22:06Z ERROR timeout fetching from image-service (18s)\n"
                        "2024-03-15T10:22:07Z WARN worker pool 95% utilized"
                    ),
                    "image-service": (
                        "2024-03-15T10:22:00Z WARN CPU throttling engaged (95%)\n"
                        "2024-03-15T10:22:01Z ERROR worker pool exhausted — dropping requests\n"
                        "2024-03-15T10:22:02Z ERROR OOM risk: memory at 91%"
                    ),
                },
                "check_metrics": {
                    "cdn-edge": (
                        "Cache hit rate: 3% | Purge events (1h): 1 mass purge | "
                        "Origin RPS: 48,000 | Bandwidth: 890 Gbps"
                    ),
                    "product-service": (
                        "Origin RPS: 48,000 (normal: 1,200) | "
                        "Queue depth: 12,400 | Worker utilization: 95%"
                    ),
                    "image-service": (
                        "CPU: 95% | Memory: 91% | "
                        "Worker pool: 0 free / 200 | p99 latency: 18s"
                    ),
                },
                "check_dependencies": {
                    "cdn-edge": "Origin: product-service [OVERLOADED]",
                    "product-service": "Depends on: image-service [DEGRADED], postgres-db [OK]",
                    "image-service": "Depends on: object-storage [OK] — no upstream issues",
                },
                "check_recent_deploys": {
                    "cdn-edge": (
                        "Cronjob purge-job-prod updated 2h ago — "
                        "purge pattern changed from /images/* to /* (all keys)"
                    ),
                    "product-service": "Last deploy: 5 days ago — no recent changes",
                    "image-service": "Last deploy: 2 days ago — no recent changes",
                },
            },
            "correct_severity": "P2",
            "adjacent_severities": ["P1", "P3"],
            "correct_root_cause": {
                "service": "cdn-edge",
                "failure_mode": "misconfigured purge job invalidated all cache keys",
            },
            "correct_remediation": [
                "disable_feature_flag:purge-job-prod",
                "execute_runbook_step:warm_cdn_cache",
                "scale_service:image-service",
            ],
            "wrong_actions": {
                "restart_service:image-service": (
                    "Restarting won't fix the CDN miss storm — source is the purge job"
                ),
                "rollback_deploy:product-service": "product-service has no recent deploys",
                "restart_service:cdn-edge": (
                    "Restarting CDN edge nodes will make cache miss rate worse temporarily"
                ),
            },
        },
    ],
    # ── TASK 2: ROOT CAUSE ANALYSIS ─────────────────────────────────────────
    "root_cause_analysis": [
        # RCA-001: Analytics service OOM kills postgres-db
        {
            "scenario_id": "RCA-001",
            "description": (
                "postgres-db was OOM-killed by the Linux kernel after a runaway "
                "analytics query with no LIMIT clause consumed all available memory. "
                "All downstream services are now failing."
            ),
            "incident_summary": (
                "Multiple services down: api-gateway 503, auth-service failing, "
                "order-service write failures. postgres-db restarting in a loop. "
                "Root cause upstream — trace the failure chain."
            ),
            "alert": {
                "id": "ALT-RCA-001",
                "title": "CRITICAL: postgres-db crash loop — all dependents down",
                "severity_fired": "P1",
                "affected_services": [
                    "api-gateway", "auth-service", "order-service", "postgres-db"
                ],
                "symptoms": [
                    "postgres-db: 4 restarts in 12 minutes",
                    "auth-service: connection refused — 100% failure",
                    "order-service: all writes failing",
                    "api-gateway: 503 on all authenticated routes",
                    "analytics-service: last job failed 12 min ago",
                ],
                "error_rate": 0.95,
                "duration_minutes": 14,
            },
            "known_services": {
                "api-gateway", "auth-service", "order-service",
                "postgres-db", "analytics-service", "redis-session",
            },
            "tool_responses": {
                "query_logs": {
                    "postgres-db": (
                        "2024-03-16T02:11:00Z LOG database system shut down at 02:10:58\n"
                        "2024-03-16T02:10:58Z FATAL Out of Memory: Kill process 1847 (postgres) "
                        "score 982 or sacrifice child\n"
                        "2024-03-16T02:10:30Z LOG process 1847 query running 12min: "
                        "SELECT * FROM events JOIN user_sessions JOIN orders "
                        "JOIN products — no LIMIT clause, est 847M rows"
                    ),
                    "analytics-service": (
                        "2024-03-16T01:58:00Z INFO starting job: full_history_export\n"
                        "2024-03-16T01:58:01Z WARN query has no LIMIT — estimated 847M rows\n"
                        "2024-03-16T02:10:55Z ERROR job killed by OOM — full_history_export FAILED"
                    ),
                    "auth-service": (
                        "2024-03-16T02:11:05Z ERROR connect ECONNREFUSED postgres-db:5432\n"
                        "2024-03-16T02:11:06Z ERROR all retries exhausted — giving up"
                    ),
                    "api-gateway": (
                        "2024-03-16T02:11:10Z ERROR upstream auth-service: 503 Service Unavailable"
                    ),
                    "order-service": (
                        "2024-03-16T02:11:08Z ERROR pq: the database system is starting up"
                    ),
                    "redis-session": "No errors — operating normally at 99.2% hit rate",
                },
                "check_metrics": {
                    "postgres-db": (
                        "Memory: OOM killed (0% free at crash) | "
                        "Restarts: 4 in 12min | Status: RESTARTING"
                    ),
                    "analytics-service": (
                        "Memory at crash: 31.2GB / 32GB (97.5%) | "
                        "Job runtime: 12min 55s | Status: ERROR"
                    ),
                    "auth-service": "Connection success: 0% | DB: CRITICAL | Redis: OK",
                    "api-gateway": "503 rate: 95% | Auth dependency: DOWN",
                    "order-service": "Write success: 0% | DB: RESTARTING",
                    "redis-session": "Hit rate: 99.2% | Memory: 42% | Healthy",
                },
                "check_dependencies": {
                    "postgres-db": (
                        "Clients: auth-service, order-service, analytics-service, product-service"
                    ),
                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
                    "auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
                    "api-gateway": "Depends on: auth-service [DOWN]",
                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
                    "redis-session": "No DB dependency — standalone cache",
                },
                "check_recent_deploys": {
                    "analytics-service": (
                        "Deploy 6h ago: added full_history_export scheduled job — "
                        "runs daily at 02:00 UTC, no LIMIT on cross-table JOIN"
                    ),
                    "postgres-db": "No deploys in 3 weeks",
                    "auth-service": "No recent deploys",
                    "order-service": "No recent deploys",
                    "redis-session": "No recent deploys",
                },
                "check_service_status": {
                    "postgres-db": "RESTARTING | Uptime: 47s | Crash reason: OOM",
                    "analytics-service": "ERROR | Last job: full_history_export FAILED",
                    "auth-service": "DOWN | Waiting for postgres-db",
                    "api-gateway": "DEGRADED | 95% requests failing",
                    "order-service": "DOWN | Waiting for postgres-db",
                    "redis-session": "HEALTHY | All normal",
                },
            },
            "correct_root_cause": {
                "service": "analytics-service",
                "failure_mode": "unbounded query OOM killing postgres-db",
            },
            "correct_remediation": [
                "disable_feature_flag:full_history_export",
                "restart_service:analytics-service",
                "restart_service:postgres-db",
            ],
            "wrong_actions": {
                "restart_service:auth-service": "auth-service is a victim — DB must be fixed first",
                "restart_service:api-gateway": "api-gateway is downstream — won't help",
                "scale_service:postgres-db": "Scaling won't prevent OOM if the bad query runs again",
                "rollback_deploy:postgres-db": "postgres-db has no recent deploys",
            },
        },
        # RCA-002: BGP route withdrawal — AZ network partition
        {
            "scenario_id": "RCA-002",
            "description": (
                "A BGP route withdrawal isolated AZ-1 (where payment-service runs) "
                "from AZ-2 and AZ-3, causing 61% of checkout requests to fail. "
                "Services within AZ-1 are healthy — it's a pure network issue."
            ),
            "incident_summary": (
                "Checkout failure rate 61% — AZ-2 and AZ-3 cannot reach payment-service "
                "in AZ-1. AZ-1 users unaffected. fraud-detection-service also unreachable "
                "cross-AZ. Network infrastructure change 18 min ago."
            ),
            "alert": {
                "id": "ALT-RCA-002",
                "title": "HIGH: checkout failure 61% — cross-AZ connectivity loss",
                "severity_fired": "P2",
                "affected_services": [
                    "order-service", "payment-service", "fraud-detection-service"
                ],
                "symptoms": [
                    "checkout failure rate: 61% (AZ-2/AZ-3 only)",
                    "payment-service: unreachable from AZ-2, AZ-3",
                    "fraud-detection-service: timeout from AZ-2, AZ-3",
                    "AZ-1 users: 0% failure rate",
                    "Network: AZ-2/AZ-3 → AZ-1 routing broken",
                ],
                "error_rate": 0.61,
                "duration_minutes": 9,
            },
            "known_services": {
                "order-service", "payment-service", "fraud-detection-service",
                "postgres-db", "redis-payment-cache", "network-infra",
            },
            "tool_responses": {
                "query_logs": {
                    "order-service": (
                        "2024-03-17T14:32:10Z ERROR connection timeout payment-service:8080 "
                        "(AZ-2 to AZ-1: no route to host)\n"
                        "2024-03-17T14:32:11Z ERROR fraud-detection-service: i/o timeout (30s)"
                    ),
                    "payment-service": (
                        "2024-03-17T14:31:58Z WARN health check from AZ-2 LB failing\n"
                        "2024-03-17T14:31:59Z INFO AZ-1 local traffic: all normal"
                    ),
                    "fraud-detection-service": (
                        "2024-03-17T14:32:00Z INFO AZ-1 requests: all normal\n"
                        "2024-03-17T14:32:01Z WARN cross-AZ health probes: 100% timeout"
                    ),
                    "network-infra": (
                        "2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.2.1 route withdrawal — "
                        "AZ-2 lost route to AZ-1 CIDR 10.0.1.0/24\n"
                        "2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.3.1 route withdrawal — "
                        "AZ-3 lost route to AZ-1 CIDR 10.0.1.0/24\n"
                        "2024-03-17T14:31:44Z INFO router config change applied — "
                        "BGP advertisement policy updated"
                    ),
                    "postgres-db": "Operating normally — no errors detected",
                    "redis-payment-cache": "Operating normally — AZ-1 traffic only, all healthy",
                },
                "check_metrics": {
                    "order-service": (
                        "AZ-2 checkout failure: 99% | AZ-3 checkout failure: 98% | "
                        "AZ-1 checkout failure: 0.2% (baseline)"
                    ),
                    "payment-service": (
                        "AZ-1 traffic: normal (100% success) | "
                        "AZ-2/AZ-3 inbound connections: 0 (blocked)"
                    ),
                    "fraud-detection-service": (
                        "AZ-1 processing: normal | "
                        "Cross-AZ health checks: 100% timeout"
                    ),
                    "network-infra": (
                        "BGP session AZ-2: WITHDRAWN | BGP session AZ-3: WITHDRAWN | "
                        "AZ-1 internal: all UP | Config change: 18min ago"
                    ),
                    "postgres-db": "All metrics normal — no anomalies",
                    "redis-payment-cache": "All metrics normal — AZ-1 only traffic",
                },
                "check_dependencies": {
                    "order-service": (
                        "Depends on: payment-service [PARTITIONED], "
                        "fraud-detection-service [PARTITIONED]"
                    ),
                    "payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
                    "fraud-detection-service": "Depends on: postgres-db [OK]",
                    "network-infra": "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN], AZ-1 [UP]",
                },
                "check_recent_deploys": {
                    "network-infra": (
                        "Router config change 18min ago — BGP route advertisement policy update: "
                        "inadvertently withdrew AZ-1 routes from AZ-2/AZ-3 peers"
                    ),
                    "payment-service": "No recent deploys",
                    "order-service": "No recent deploys",
                    "fraud-detection-service": "No recent deploys",
                },
                "check_service_status": {
                    "payment-service": "HEALTHY within AZ-1 | Cross-AZ: UNREACHABLE",
                    "order-service": "DEGRADED | AZ-2/AZ-3 instances failing",
                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN | AZ-1: UP",
                    "fraud-detection-service": "HEALTHY within AZ-1 | Cross-AZ: UNREACHABLE",
                    "postgres-db": "HEALTHY",
                    "redis-payment-cache": "HEALTHY",
                },
            },
            "correct_root_cause": {
                "service": "network-infra",
                "failure_mode": "BGP route withdrawal causing AZ network partition",
            },
            "correct_remediation": [
                "execute_runbook_step:restore_bgp_routes",
                "rollback_deploy:network-infra",
            ],
            "wrong_actions": {
                "restart_service:payment-service": (
                    "payment-service is healthy — restarting won't fix routing"
                ),
                "restart_service:order-service": "order-service is a victim of the partition",
                "scale_service:payment-service": "Scaling won't fix a BGP routing issue",
                "clear_cache:redis-payment-cache": "Cache is healthy — not the cause",
            },
        },
    ],
    # ── TASK 3: REMEDIATION PLANNING ────────────────────────────────────────
    "remediation_planning": [
        # RP-001: Full OOM remediation
        {
            "scenario_id": "RP-001",
            "description": (
                "Full remediation: analytics-service OOM-killed postgres-db with an "
                "unbounded query. Must disable the offending job, restart postgres, "
                "restore all downstream services, and document the resolution."
            ),
            "incident_summary": (
                "CRITICAL — postgres-db in OOM crash loop. auth-service, order-service, "
                "api-gateway all down. analytics-service caused it with unbounded query. "
                "Required actions: disable job, restart postgres, restore services, document."
            ),
            "alert": {
                "id": "ALT-RP-001",
                "title": "CRITICAL: postgres-db OOM crash loop — full stack down",
                "severity_fired": "P1",
                "affected_services": [
                    "postgres-db", "analytics-service",
                    "auth-service", "order-service", "api-gateway"
                ],
            },
            "known_services": {
                "postgres-db", "auth-service", "order-service",
                "api-gateway", "analytics-service",
            },
            "tool_responses": {
                "query_logs": {
                    "postgres-db": (
                        "FATAL: Out of Memory: Kill process (postgres) — "
                        "analytics query running 12min with no LIMIT"
                    ),
                    "analytics-service": (
                        "ERROR: full_history_export — unbounded JOIN, 847M rows, killed by OOM"
                    ),
                    "auth-service": "ERROR: connect ECONNREFUSED postgres-db:5432",
                    "order-service": "ERROR: pq: the database system is starting up",
                    "api-gateway": "ERROR: upstream auth-service 503",
                },
                "check_metrics": {
                    "postgres-db": "Memory: OOM | Restarts: 4 | Status: CRASH LOOP",
                    "analytics-service": "Memory spike: 31GB/32GB | Status: ERROR",
                    "auth-service": "Connection success: 0% | Waiting for DB",
                    "order-service": "Write success: 0% | Waiting for DB",
                    "api-gateway": "503 rate: 95% | Auth: DOWN",
                },
                "check_dependencies": {
                    "postgres-db": "Clients: auth-service, order-service, analytics-service",
                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
                    "auth-service": "Depends on: postgres-db [CRASH LOOP]",
                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
                },
                "check_recent_deploys": {
                    "analytics-service": (
                        "Deploy 6h ago: full_history_export job — "
                        "unbounded cross-table JOIN query"
                    ),
                    "postgres-db": "No recent changes",
                },
                "check_service_status": {
                    "postgres-db": "CRASH LOOP | OOM kill | Uptime: 47s",
                    "analytics-service": "ERROR | Last job failed",
                    "auth-service": "DOWN",
                    "order-service": "DOWN",
                    "api-gateway": "DEGRADED",
                },
            },
            "remediation_data": {
                "disable_feature_flag": {
                    "full_history_export": (
                        "Cron job full_history_export DISABLED — "
                        "no more unbounded queries will run"
                    ),
                },
                "restart_service": {
                    "postgres-db": (
                        "postgres-db restarted cleanly — "
                        "accepting connections (12/500 active)"
                    ),
                    "analytics-service": (
                        "analytics-service restarted — no active queries"
                    ),
                    "auth-service": (
                        "auth-service restarted — reconnected to postgres-db OK"
                    ),
                    "order-service": (
                        "order-service restarted — writes resuming normally"
                    ),
                },
                "execute_runbook_step": {
                    "verify_db_health": (
                        "postgres-db: connections 12/500, CPU 12%, Memory 34% — healthy"
                    ),
                    "check_service_recovery": (
                        "auth-service OK | order-service OK | api-gateway OK"
                    ),
                },
            },
            "correct_remediation_sequence": [
                "disable_feature_flag:full_history_export",
                "restart_service:analytics-service",
                "restart_service:postgres-db",
                "restart_service:auth-service",
                "restart_service:order-service",
            ],
            "wrong_actions": {
                "rollback_deploy:postgres-db": (
                    "postgres-db has no recent deploy to roll back"
                ),
                "scale_service:postgres-db": (
                    "Scaling won't prevent the OOM query from running again"
                ),
                "restart_service:api-gateway": (
                    "api-gateway is downstream — fix the DB first"
                ),
            },
            "resolution_keywords": [
                "analytics", "oom", "memory", "postgres", "query",
                "full_history_export", "disabled", "restarted", "recovered",
            ],
        },
        # RP-002: Full BGP remediation
        {
            "scenario_id": "RP-002",
            "description": (
                "Full remediation: BGP route withdrawal partitioned AZ-2/AZ-3 from "
                "AZ-1 where payment-service runs. Must restore BGP routes, roll back "
                "the router config change, verify checkout recovery, and document."
            ),
            "incident_summary": (
                "P2 — BGP partition isolating payment-service from 61% of users. "
                "Router config change 18min ago is the cause. "
                "Required: restore BGP routes, rollback network config, verify recovery."
            ),
            "alert": {
                "id": "ALT-RP-002",
                "title": "HIGH: checkout 61% failure — BGP AZ partition",
                "severity_fired": "P2",
                "affected_services": ["network-infra", "order-service", "payment-service"],
            },
            "known_services": {
                "network-infra", "order-service", "payment-service",
                "fraud-detection-service", "postgres-db",
            },
            "tool_responses": {
                "query_logs": {
                    "network-infra": (
                        "CRITICAL: BGP route withdrawal — "
                        "AZ-2/AZ-3 lost route to AZ-1 10.0.1.0/24\n"
                        "Router config change 18min ago: BGP policy updated"
                    ),
                    "order-service": (
                        "ERROR: connection timeout payment-service — no route to host"
                    ),
                    "payment-service": (
                        "INFO: AZ-1 traffic normal | "
                        "WARN: cross-AZ health checks failing"
                    ),
                },
                "check_metrics": {
                    "network-infra": (
                        "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN | AZ-1: UP"
                    ),
                    "order-service": "AZ-2 failure: 99% | AZ-1 failure: 0.2%",
                    "payment-service": "AZ-1: normal | Cross-AZ inbound: 0",
                },
                "check_dependencies": {
                    "order-service": "Depends on: payment-service [PARTITIONED]",
                    "payment-service": "Depends on: postgres-db [OK]",
                    "network-infra": "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN]",
                },
                "check_recent_deploys": {
                    "network-infra": (
                        "Config change 18min ago — BGP policy update "
                        "accidentally withdrew AZ-1 routes"
                    ),
                    "payment-service": "No recent deploys",
                    "order-service": "No recent deploys",
                },
                "check_service_status": {
                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
                    "payment-service": "HEALTHY (AZ-1) | Cross-AZ: UNREACHABLE",
                    "order-service": "DEGRADED",
                },
            },
            "remediation_data": {
                "rollback_deploy": {
                    "network-infra": (
                        "Router config rolled back — "
                        "BGP advertisement policy restored to previous version"
                    ),
                },
                "execute_runbook_step": {
                    "restore_bgp_routes": (
                        "BGP routes restored — AZ-2/AZ-3 can now reach AZ-1 10.0.1.0/24"
                    ),
                    "verify_checkout_recovery": (
                        "Checkout failure rate: 0.3% — incident fully resolved"
                    ),
                },
            },
            "correct_remediation_sequence": [
                "execute_runbook_step:restore_bgp_routes",
                "rollback_deploy:network-infra",
                "execute_runbook_step:verify_checkout_recovery",
            ],
            "wrong_actions": {
                "restart_service:payment-service": (
                    "payment-service is healthy — network is the issue"
                ),
                "scale_service:payment-service": "Scaling won't fix BGP routing",
                "restart_service:order-service": "order-service is a victim",
                "clear_cache": "Cache is unrelated to network routing",
            },
            "resolution_keywords": [
                "bgp", "network", "route", "rollback", "partition",
                "restored", "az-1", "az-2", "az-3", "checkout", "withdrawal",
            ],
        },
    ],
}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def get_task(task_id: str) -> dict:
    """Look up the metadata dict registered for ``task_id``.

    Args:
        task_id: Key into ``ALL_TASKS``.

    Returns:
        The metadata dict stored in ``ALL_TASKS`` (the stored object itself,
        not a copy).

    Raises:
        ValueError: If ``task_id`` is not a key of ``ALL_TASKS``; the message
            lists the valid task IDs.
    """
    # Success path first; everything below only runs for an unknown ID.
    if task_id in ALL_TASKS:
        return ALL_TASKS[task_id]

    valid_ids = list(ALL_TASKS.keys())
    raise ValueError(f"Unknown task_id '{task_id}'. Valid task IDs: {valid_ids}")
def get_scenario(task_id: str, index: int) -> dict:
    """Fetch one scenario dict for a task by position.

    Args:
        task_id: Key into ``SCENARIOS``.
        index: Zero-based position within that task's scenario list.
            Negative indices are rejected rather than wrapping around.

    Returns:
        The scenario dict stored in ``SCENARIOS`` (the stored object itself,
        not a copy).

    Raises:
        ValueError: If ``task_id`` has no entry in ``SCENARIOS``, or if
            ``index`` falls outside ``0 .. len - 1`` for that task.
    """
    if task_id in SCENARIOS:
        candidates = SCENARIOS[task_id]
    else:
        raise ValueError(f"No scenarios for task_id '{task_id}'.")

    count = len(candidates)
    if index < 0 or index >= count:
        last_index = count - 1
        raise ValueError(
            f"Scenario index {index} out of range for task '{task_id}' "
            f"(valid: 0–{last_index})"
        )

    return candidates[index]
def list_tasks() -> list:
    """Return a new list of every task metadata dict in ``ALL_TASKS``.

    The list is freshly built on each call and follows the dict's insertion
    order; the metadata dicts inside it are the stored objects, not copies.
    """
    return [*ALL_TASKS.values()]