""" tasks.py — Task and scenario definitions for Cloud Incident Response OpenEnv. Covers cross-service cascading failures in distributed cloud systems: - DB connection pool exhaustion cascading through service mesh - CDN cache invalidation storms - OOM kills from runaway analytics queries - BGP network partitions isolating availability zones Distinct from Kubernetes ops environments — focuses on application-layer incident response: log correlation, dependency tracing, and remediation across microservice architectures. Public API: get_task(task_id) -> task metadata dict get_scenario(task_id, index) -> scenario dict list_tasks() -> list of task dicts ALL_TASKS -> dict[task_id -> metadata] """ from __future__ import annotations ALL_TASKS: dict = { "alert_classification": { "id": "alert_classification", "name": "Task 1: Alert Severity Classification", "difficulty": "easy", "max_steps": 3, "score_range": [0.0, 1.0], "description": ( "An alert has fired. Query logs and metrics across affected services, " "then classify the incident severity: P1 (CRITICAL — revenue/user impact, " "immediate action), P2 (HIGH — degraded service), P3 (MEDIUM — minor issue), " "P4 (LOW — informational). Submit severity with submit_severity." ), "available_actions": [ "query_logs", "check_metrics", "check_dependencies", "check_recent_deploys", "submit_severity", ], "submission_action": "submit_severity", "scenarios": 2, }, "root_cause_analysis": { "id": "root_cause_analysis", "name": "Task 2: Root Cause Analysis", "difficulty": "medium", "max_steps": 10, "score_range": [0.0, 1.0], "description": ( "A production incident is active. Use diagnostic tools to trace the failure " "chain across services. Query logs, metrics, dependency graphs, and recent " "deploys to identify which service is the root cause and what failure mode " "triggered the cascade. Submit findings with submit_root_cause." ), "available_actions": [ "query_logs", "check_metrics", "check_dependencies", "check_recent_deploys", "check_service_status", "submit_root_cause", ], "submission_action": "submit_root_cause", "scenarios": 2, }, "remediation_planning": { "id": "remediation_planning", "name": "Task 3: Incident Remediation", "difficulty": "hard", "max_steps": 15, "score_range": [0.0, 1.0], "description": ( "A critical production incident requires full end-to-end resolution. " "Diagnose the root cause, execute the correct remediation sequence " "(disable feature flags, restart services, rollback deploys, run runbook steps), " "then submit a resolution summary. Scored on investigation quality, " "remediation correctness, efficiency, and documentation." ), "available_actions": [ "query_logs", "check_metrics", "check_dependencies", "check_recent_deploys", "check_service_status", "restart_service", "rollback_deploy", "scale_service", "disable_feature_flag", "clear_cache", "execute_runbook_step", "submit_resolution", ], "submission_action": "submit_resolution", "scenarios": 2, }, } # --------------------------------------------------------------------------- # Scenario data — 3 tasks x 2 scenarios = 6 total episodes # --------------------------------------------------------------------------- SCENARIOS: dict = { # ── TASK 1: ALERT CLASSIFICATION ──────────────────────────────────────── "alert_classification": [ # AC-001: Cascading DB connection pool exhaustion { "scenario_id": "AC-001", "description": ( "Cascading failure: postgres-db connection pool exhausted, " "causing auth-service timeouts, blocking api-gateway requests. " "Revenue impact is severe and growing." ), "incident_summary": ( "P1 ALERT — api-gateway 5xx rate 78%, auth-service timeout rate 94%, " "postgres-db connection pool at 100% (500/500). " "Checkout completely down. Revenue impact: $12,000/min." ), "alert": { "id": "ALT-20240315-001", "title": "CRITICAL: api-gateway error rate spike 78%", "severity_fired": "P1", "affected_services": ["api-gateway", "auth-service", "postgres-db"], "symptoms": [ "api-gateway: HTTP 503 rate 78% (baseline: 0.1%)", "auth-service: connection timeout 94% of requests", "postgres-db: connection pool 500/500 — 100% utilized", "checkout flow: completely unavailable", "new user logins: 0% success rate", ], "error_rate": 0.78, "duration_minutes": 4, "revenue_impact_per_min": 12000, }, "known_services": {"api-gateway", "auth-service", "postgres-db"}, "tool_responses": { "query_logs": { "api-gateway": ( "2024-03-15T10:04:12Z ERROR upstream connect error — " "reset reason: connection timeout auth-service:8080\n" "2024-03-15T10:04:13Z ERROR 503 Service Unavailable upstream: auth-service\n" "2024-03-15T10:04:14Z ERROR circuit breaker OPEN for auth-service" ), "auth-service": ( "2024-03-15T10:04:10Z ERROR pq: sorry, too many clients already\n" "2024-03-15T10:04:11Z ERROR dial tcp postgres-db:5432: " "connect: connection refused — pool exhausted (500/500)\n" "2024-03-15T10:04:12Z ERROR all connection pool slots occupied" ), "postgres-db": ( "2024-03-15T10:03:58Z LOG connection received: host=auth-service\n" "2024-03-15T10:04:00Z FATAL remaining connection slots reserved " "for non-replication superuser\n" "2024-03-15T10:04:01Z LOG max_connections=500 active=500 idle=0" ), }, "check_metrics": { "api-gateway": ( "HTTP 5xx rate: 78% | p99 latency: 30s (timeout) | " "RPS: 1,200 | circuit_breaker: OPEN" ), "auth-service": ( "Error rate: 94% | DB connection wait: 28s | " "Active connections: 0 | Request queue: 847" ), "postgres-db": ( "Connections: 500/500 (100%) | Query queue: 847 | " "CPU: 98% | Memory: 89% | Active queries: 500" ), }, "check_dependencies": { "api-gateway": "Depends on: auth-service [CRITICAL], product-service [OK]", "auth-service": "Depends on: postgres-db [CRITICAL], redis-session [OK]", "postgres-db": "No upstream dependencies — root level service", }, "check_recent_deploys": { "api-gateway": "Last deploy: 3 days ago — no recent changes", "auth-service": ( "Last deploy: 47 min ago — PR #2341: " "increased default connection pool size from 10 to 500" ), "postgres-db": "Last deploy: 12 days ago — no recent changes", }, }, "correct_severity": "P1", "adjacent_severities": ["P2"], "correct_root_cause": { "service": "postgres-db", "failure_mode": "connection pool exhaustion", }, "correct_remediation": [ "restart_service:auth-service", "execute_runbook_step:increase_max_connections", "scale_service:postgres-db", ], "wrong_actions": { "rollback_deploy": "Rolling back auth-service pool size won't fix 500 stuck connections", "restart_service:api-gateway": "api-gateway is a victim — fixing it won't help", "clear_cache": "Cache is unrelated to DB connection pool exhaustion", }, }, # AC-002: CDN cache invalidation storm { "scenario_id": "AC-002", "description": ( "CDN cache invalidation storm: a misconfigured purge cronjob wiped " "all 2.1M cached keys, sending 40x normal traffic to origin. " "Site degraded but not fully down — P2 severity." ), "incident_summary": ( "P2 ALERT — CDN cache hit rate dropped from 94% to 3%, " "product-service origin traffic up 4000%, image-service CPU at 95%. " "Pages loading slowly (p99: 18s). Checkout still working." ), "alert": { "id": "ALT-20240315-002", "title": "HIGH: CDN cache miss storm — origin overloaded", "severity_fired": "P2", "affected_services": ["cdn-edge", "product-service", "image-service"], "symptoms": [ "CDN cache hit rate: 3% (normal: 94%)", "product-service: origin RPS 48,000 (normal: 1,200)", "image-service: CPU 95%, p99 latency 18s", "User experience: product pages slow, some images timing out", "Checkout: still functional (not affected)", ], "error_rate": 0.15, "duration_minutes": 8, "revenue_impact_per_min": 800, }, "known_services": {"cdn-edge", "product-service", "image-service"}, "tool_responses": { "query_logs": { "cdn-edge": ( "2024-03-15T10:22:00Z INFO cache MISS ratio: 97% (5min window)\n" "2024-03-15T10:20:11Z WARN mass cache invalidation — " "2,100,000 keys purged by purge-job-prod\n" "2024-03-15T10:20:10Z INFO purge pattern: /* (ALL keys)" ), "product-service": ( "2024-03-15T10:22:05Z WARN request queue depth: 12,400\n" "2024-03-15T10:22:06Z ERROR timeout fetching from image-service (18s)\n" "2024-03-15T10:22:07Z WARN worker pool 95% utilized" ), "image-service": ( "2024-03-15T10:22:00Z WARN CPU throttling engaged (95%)\n" "2024-03-15T10:22:01Z ERROR worker pool exhausted — dropping requests\n" "2024-03-15T10:22:02Z ERROR OOM risk: memory at 91%" ), }, "check_metrics": { "cdn-edge": ( "Cache hit rate: 3% | Purge events (1h): 1 mass purge | " "Origin RPS: 48,000 | Bandwidth: 890 Gbps" ), "product-service": ( "Origin RPS: 48,000 (normal: 1,200) | " "Queue depth: 12,400 | Worker utilization: 95%" ), "image-service": ( "CPU: 95% | Memory: 91% | " "Worker pool: 0 free / 200 | p99 latency: 18s" ), }, "check_dependencies": { "cdn-edge": "Origin: product-service [OVERLOADED]", "product-service": "Depends on: image-service [DEGRADED], postgres-db [OK]", "image-service": "Depends on: object-storage [OK] — no upstream issues", }, "check_recent_deploys": { "cdn-edge": ( "Cronjob purge-job-prod updated 2h ago — " "purge pattern changed from /images/* to /* (all keys)" ), "product-service": "Last deploy: 5 days ago — no recent changes", "image-service": "Last deploy: 2 days ago — no recent changes", }, }, "correct_severity": "P2", "adjacent_severities": ["P1", "P3"], "correct_root_cause": { "service": "cdn-edge", "failure_mode": "misconfigured purge job invalidated all cache keys", }, "correct_remediation": [ "disable_feature_flag:purge-job-prod", "execute_runbook_step:warm_cdn_cache", "scale_service:image-service", ], "wrong_actions": { "restart_service:image-service": ( "Restarting won't fix the CDN miss storm — source is the purge job" ), "rollback_deploy:product-service": "product-service has no recent deploys", "restart_service:cdn-edge": ( "Restarting CDN edge nodes will make cache miss rate worse temporarily" ), }, }, ], # ── TASK 2: ROOT CAUSE ANALYSIS ───────────────────────────────────────── "root_cause_analysis": [ # RCA-001: Analytics service OOM kills postgres-db { "scenario_id": "RCA-001", "description": ( "postgres-db was OOM-killed by the Linux kernel after a runaway " "analytics query with no LIMIT clause consumed all available memory. " "All downstream services are now failing." ), "incident_summary": ( "Multiple services down: api-gateway 503, auth-service failing, " "order-service write failures. postgres-db restarting in a loop. " "Root cause upstream — trace the failure chain." ), "alert": { "id": "ALT-RCA-001", "title": "CRITICAL: postgres-db crash loop — all dependents down", "severity_fired": "P1", "affected_services": [ "api-gateway", "auth-service", "order-service", "postgres-db" ], "symptoms": [ "postgres-db: 4 restarts in 12 minutes", "auth-service: connection refused — 100% failure", "order-service: all writes failing", "api-gateway: 503 on all authenticated routes", "analytics-service: last job failed 12 min ago", ], "error_rate": 0.95, "duration_minutes": 14, }, "known_services": { "api-gateway", "auth-service", "order-service", "postgres-db", "analytics-service", "redis-session", }, "tool_responses": { "query_logs": { "postgres-db": ( "2024-03-16T02:11:00Z LOG database system shut down at 02:10:58\n" "2024-03-16T02:10:58Z FATAL Out of Memory: Kill process 1847 (postgres) " "score 982 or sacrifice child\n" "2024-03-16T02:10:30Z LOG process 1847 query running 12min: " "SELECT * FROM events JOIN user_sessions JOIN orders " "JOIN products — no LIMIT clause, est 847M rows" ), "analytics-service": ( "2024-03-16T01:58:00Z INFO starting job: full_history_export\n" "2024-03-16T01:58:01Z WARN query has no LIMIT — estimated 847M rows\n" "2024-03-16T02:10:55Z ERROR job killed by OOM — full_history_export FAILED" ), "auth-service": ( "2024-03-16T02:11:05Z ERROR connect ECONNREFUSED postgres-db:5432\n" "2024-03-16T02:11:06Z ERROR all retries exhausted — giving up" ), "api-gateway": ( "2024-03-16T02:11:10Z ERROR upstream auth-service: 503 Service Unavailable" ), "order-service": ( "2024-03-16T02:11:08Z ERROR pq: the database system is starting up" ), "redis-session": "No errors — operating normally at 99.2% hit rate", }, "check_metrics": { "postgres-db": ( "Memory: OOM killed (0% free at crash) | " "Restarts: 4 in 12min | Status: RESTARTING" ), "analytics-service": ( "Memory at crash: 31.2GB / 32GB (97.5%) | " "Job runtime: 12min 55s | Status: ERROR" ), "auth-service": "Connection success: 0% | DB: CRITICAL | Redis: OK", "api-gateway": "503 rate: 95% | Auth dependency: DOWN", "order-service": "Write success: 0% | DB: RESTARTING", "redis-session": "Hit rate: 99.2% | Memory: 42% | Healthy", }, "check_dependencies": { "postgres-db": ( "Clients: auth-service, order-service, analytics-service, product-service" ), "analytics-service": "Depends on: postgres-db [CRASH LOOP]", "auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]", "api-gateway": "Depends on: auth-service [DOWN]", "order-service": "Depends on: postgres-db [CRASH LOOP]", "redis-session": "No DB dependency — standalone cache", }, "check_recent_deploys": { "analytics-service": ( "Deploy 6h ago: added full_history_export scheduled job — " "runs daily at 02:00 UTC, no LIMIT on cross-table JOIN" ), "postgres-db": "No deploys in 3 weeks", "auth-service": "No recent deploys", "order-service": "No recent deploys", "redis-session": "No recent deploys", }, "check_service_status": { "postgres-db": "RESTARTING | Uptime: 47s | Crash reason: OOM", "analytics-service": "ERROR | Last job: full_history_export FAILED", "auth-service": "DOWN | Waiting for postgres-db", "api-gateway": "DEGRADED | 95% requests failing", "order-service": "DOWN | Waiting for postgres-db", "redis-session": "HEALTHY | All normal", }, }, "correct_root_cause": { "service": "analytics-service", "failure_mode": "unbounded query OOM killing postgres-db", }, "correct_remediation": [ "disable_feature_flag:full_history_export", "restart_service:analytics-service", "restart_service:postgres-db", ], "wrong_actions": { "restart_service:auth-service": "auth-service is a victim — DB must be fixed first", "restart_service:api-gateway": "api-gateway is downstream — won't help", "scale_service:postgres-db": "Scaling won't prevent OOM if the bad query runs again", "rollback_deploy:postgres-db": "postgres-db has no recent deploys", }, }, # RCA-002: BGP route withdrawal — AZ network partition { "scenario_id": "RCA-002", "description": ( "A BGP route withdrawal isolated AZ-1 (where payment-service runs) " "from AZ-2 and AZ-3, causing 61% of checkout requests to fail. " "Services within AZ-1 are healthy — it's a pure network issue." ), "incident_summary": ( "Checkout failure rate 61% — AZ-2 and AZ-3 cannot reach payment-service " "in AZ-1. AZ-1 users unaffected. fraud-detection-service also unreachable " "cross-AZ. Network infrastructure change 18 min ago." ), "alert": { "id": "ALT-RCA-002", "title": "HIGH: checkout failure 61% — cross-AZ connectivity loss", "severity_fired": "P2", "affected_services": [ "order-service", "payment-service", "fraud-detection-service" ], "symptoms": [ "checkout failure rate: 61% (AZ-2/AZ-3 only)", "payment-service: unreachable from AZ-2, AZ-3", "fraud-detection-service: timeout from AZ-2, AZ-3", "AZ-1 users: 0% failure rate", "Network: AZ-2/AZ-3 → AZ-1 routing broken", ], "error_rate": 0.61, "duration_minutes": 9, }, "known_services": { "order-service", "payment-service", "fraud-detection-service", "postgres-db", "redis-payment-cache", "network-infra", }, "tool_responses": { "query_logs": { "order-service": ( "2024-03-17T14:32:10Z ERROR connection timeout payment-service:8080 " "(AZ-2 to AZ-1: no route to host)\n" "2024-03-17T14:32:11Z ERROR fraud-detection-service: i/o timeout (30s)" ), "payment-service": ( "2024-03-17T14:31:58Z WARN health check from AZ-2 LB failing\n" "2024-03-17T14:31:59Z INFO AZ-1 local traffic: all normal" ), "fraud-detection-service": ( "2024-03-17T14:32:00Z INFO AZ-1 requests: all normal\n" "2024-03-17T14:32:01Z WARN cross-AZ health probes: 100% timeout" ), "network-infra": ( "2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.2.1 route withdrawal — " "AZ-2 lost route to AZ-1 CIDR 10.0.1.0/24\n" "2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.3.1 route withdrawal — " "AZ-3 lost route to AZ-1 CIDR 10.0.1.0/24\n" "2024-03-17T14:31:44Z INFO router config change applied — " "BGP advertisement policy updated" ), "postgres-db": "Operating normally — no errors detected", "redis-payment-cache": "Operating normally — AZ-1 traffic only, all healthy", }, "check_metrics": { "order-service": ( "AZ-2 checkout failure: 99% | AZ-3 checkout failure: 98% | " "AZ-1 checkout failure: 0.2% (baseline)" ), "payment-service": ( "AZ-1 traffic: normal (100% success) | " "AZ-2/AZ-3 inbound connections: 0 (blocked)" ), "fraud-detection-service": ( "AZ-1 processing: normal | " "Cross-AZ health checks: 100% timeout" ), "network-infra": ( "BGP session AZ-2: WITHDRAWN | BGP session AZ-3: WITHDRAWN | " "AZ-1 internal: all UP | Config change: 18min ago" ), "postgres-db": "All metrics normal — no anomalies", "redis-payment-cache": "All metrics normal — AZ-1 only traffic", }, "check_dependencies": { "order-service": ( "Depends on: payment-service [PARTITIONED], " "fraud-detection-service [PARTITIONED]" ), "payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]", "fraud-detection-service": "Depends on: postgres-db [OK]", "network-infra": "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN], AZ-1 [UP]", }, "check_recent_deploys": { "network-infra": ( "Router config change 18min ago — BGP route advertisement policy update: " "inadvertently withdrew AZ-1 routes from AZ-2/AZ-3 peers" ), "payment-service": "No recent deploys", "order-service": "No recent deploys", "fraud-detection-service": "No recent deploys", }, "check_service_status": { "payment-service": "HEALTHY within AZ-1 | Cross-AZ: UNREACHABLE", "order-service": "DEGRADED | AZ-2/AZ-3 instances failing", "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN | AZ-1: UP", "fraud-detection-service": "HEALTHY within AZ-1 | Cross-AZ: UNREACHABLE", "postgres-db": "HEALTHY", "redis-payment-cache": "HEALTHY", }, }, "correct_root_cause": { "service": "network-infra", "failure_mode": "BGP route withdrawal causing AZ network partition", }, "correct_remediation": [ "execute_runbook_step:restore_bgp_routes", "rollback_deploy:network-infra", ], "wrong_actions": { "restart_service:payment-service": ( "payment-service is healthy — restarting won't fix routing" ), "restart_service:order-service": "order-service is a victim of the partition", "scale_service:payment-service": "Scaling won't fix a BGP routing issue", "clear_cache:redis-payment-cache": "Cache is healthy — not the cause", }, }, ], # ── TASK 3: REMEDIATION PLANNING ──────────────────────────────────────── "remediation_planning": [ # RP-001: Full OOM remediation { "scenario_id": "RP-001", "description": ( "Full remediation: analytics-service OOM-killed postgres-db with an " "unbounded query. Must disable the offending job, restart postgres, " "restore all downstream services, and document the resolution." ), "incident_summary": ( "CRITICAL — postgres-db in OOM crash loop. auth-service, order-service, " "api-gateway all down. analytics-service caused it with unbounded query. " "Required actions: disable job, restart postgres, restore services, document." ), "alert": { "id": "ALT-RP-001", "title": "CRITICAL: postgres-db OOM crash loop — full stack down", "severity_fired": "P1", "affected_services": [ "postgres-db", "analytics-service", "auth-service", "order-service", "api-gateway" ], }, "known_services": { "postgres-db", "auth-service", "order-service", "api-gateway", "analytics-service", }, "tool_responses": { "query_logs": { "postgres-db": ( "FATAL: Out of Memory: Kill process (postgres) — " "analytics query running 12min with no LIMIT" ), "analytics-service": ( "ERROR: full_history_export — unbounded JOIN, 847M rows, killed by OOM" ), "auth-service": "ERROR: connect ECONNREFUSED postgres-db:5432", "order-service": "ERROR: pq: the database system is starting up", "api-gateway": "ERROR: upstream auth-service 503", }, "check_metrics": { "postgres-db": "Memory: OOM | Restarts: 4 | Status: CRASH LOOP", "analytics-service": "Memory spike: 31GB/32GB | Status: ERROR", "auth-service": "Connection success: 0% | Waiting for DB", "order-service": "Write success: 0% | Waiting for DB", "api-gateway": "503 rate: 95% | Auth: DOWN", }, "check_dependencies": { "postgres-db": "Clients: auth-service, order-service, analytics-service", "analytics-service": "Depends on: postgres-db [CRASH LOOP]", "auth-service": "Depends on: postgres-db [CRASH LOOP]", "order-service": "Depends on: postgres-db [CRASH LOOP]", }, "check_recent_deploys": { "analytics-service": ( "Deploy 6h ago: full_history_export job — " "unbounded cross-table JOIN query" ), "postgres-db": "No recent changes", }, "check_service_status": { "postgres-db": "CRASH LOOP | OOM kill | Uptime: 47s", "analytics-service": "ERROR | Last job failed", "auth-service": "DOWN", "order-service": "DOWN", "api-gateway": "DEGRADED", }, }, "remediation_data": { "disable_feature_flag": { "full_history_export": ( "Cron job full_history_export DISABLED — " "no more unbounded queries will run" ), }, "restart_service": { "postgres-db": ( "postgres-db restarted cleanly — " "accepting connections (12/500 active)" ), "analytics-service": ( "analytics-service restarted — no active queries" ), "auth-service": ( "auth-service restarted — reconnected to postgres-db OK" ), "order-service": ( "order-service restarted — writes resuming normally" ), }, "execute_runbook_step": { "verify_db_health": ( "postgres-db: connections 12/500, CPU 12%, Memory 34% — healthy" ), "check_service_recovery": ( "auth-service OK | order-service OK | api-gateway OK" ), }, }, "correct_remediation_sequence": [ "disable_feature_flag:full_history_export", "restart_service:analytics-service", "restart_service:postgres-db", "restart_service:auth-service", "restart_service:order-service", ], "wrong_actions": { "rollback_deploy:postgres-db": ( "postgres-db has no recent deploy to roll back" ), "scale_service:postgres-db": ( "Scaling won't prevent the OOM query from running again" ), "restart_service:api-gateway": ( "api-gateway is downstream — fix the DB first" ), }, "resolution_keywords": [ "analytics", "oom", "memory", "postgres", "query", "full_history_export", "disabled", "restarted", "recovered", ], }, # RP-002: Full BGP remediation { "scenario_id": "RP-002", "description": ( "Full remediation: BGP route withdrawal partitioned AZ-2/AZ-3 from " "AZ-1 where payment-service runs. Must restore BGP routes, roll back " "the router config change, verify checkout recovery, and document." ), "incident_summary": ( "P2 — BGP partition isolating payment-service from 61% of users. " "Router config change 18min ago is the cause. " "Required: restore BGP routes, rollback network config, verify recovery." ), "alert": { "id": "ALT-RP-002", "title": "HIGH: checkout 61% failure — BGP AZ partition", "severity_fired": "P2", "affected_services": ["network-infra", "order-service", "payment-service"], }, "known_services": { "network-infra", "order-service", "payment-service", "fraud-detection-service", "postgres-db", }, "tool_responses": { "query_logs": { "network-infra": ( "CRITICAL: BGP route withdrawal — " "AZ-2/AZ-3 lost route to AZ-1 10.0.1.0/24\n" "Router config change 18min ago: BGP policy updated" ), "order-service": ( "ERROR: connection timeout payment-service — no route to host" ), "payment-service": ( "INFO: AZ-1 traffic normal | " "WARN: cross-AZ health checks failing" ), }, "check_metrics": { "network-infra": ( "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN | AZ-1: UP" ), "order-service": "AZ-2 failure: 99% | AZ-1 failure: 0.2%", "payment-service": "AZ-1: normal | Cross-AZ inbound: 0", }, "check_dependencies": { "order-service": "Depends on: payment-service [PARTITIONED]", "payment-service": "Depends on: postgres-db [OK]", "network-infra": "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN]", }, "check_recent_deploys": { "network-infra": ( "Config change 18min ago — BGP policy update " "accidentally withdrew AZ-1 routes" ), "payment-service": "No recent deploys", "order-service": "No recent deploys", }, "check_service_status": { "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN", "payment-service": "HEALTHY (AZ-1) | Cross-AZ: UNREACHABLE", "order-service": "DEGRADED", }, }, "remediation_data": { "rollback_deploy": { "network-infra": ( "Router config rolled back — " "BGP advertisement policy restored to previous version" ), }, "execute_runbook_step": { "restore_bgp_routes": ( "BGP routes restored — AZ-2/AZ-3 can now reach AZ-1 10.0.1.0/24" ), "verify_checkout_recovery": ( "Checkout failure rate: 0.3% — incident fully resolved" ), }, }, "correct_remediation_sequence": [ "execute_runbook_step:restore_bgp_routes", "rollback_deploy:network-infra", "execute_runbook_step:verify_checkout_recovery", ], "wrong_actions": { "restart_service:payment-service": ( "payment-service is healthy — network is the issue" ), "scale_service:payment-service": "Scaling won't fix BGP routing", "restart_service:order-service": "order-service is a victim", "clear_cache": "Cache is unrelated to network routing", }, "resolution_keywords": [ "bgp", "network", "route", "rollback", "partition", "restored", "az-1", "az-2", "az-3", "checkout", "withdrawal", ], }, ], } # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def get_task(task_id: str) -> dict: if task_id not in ALL_TASKS: raise ValueError( f"Unknown task_id '{task_id}'. " f"Valid task IDs: {list(ALL_TASKS.keys())}" ) return ALL_TASKS[task_id] def get_scenario(task_id: str, index: int) -> dict: if task_id not in SCENARIOS: raise ValueError(f"No scenarios for task_id '{task_id}'.") scenarios = SCENARIOS[task_id] if index < 0 or index >= len(scenarios): raise ValueError( f"Scenario index {index} out of range for task '{task_id}' " f"(valid: 0–{len(scenarios) - 1})" ) return scenarios[index] def list_tasks() -> list: return list(ALL_TASKS.values())