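# NOTE(review): the next three lines are residue from the repository web viewer
# (avatar caption, commit message, commit hash), kept as comments:
# Elliot89's picture
# Prepare project for push: update files
# 8151d99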
"""
tasks.py — Task and scenario definitions for Cloud Incident Response OpenEnv.
Difficulty calibration targets:
EASY → 8B: 0.75-1.0, 70B: 0.85-1.0
MEDIUM → 8B: 0.30-0.50, 70B: 0.45-0.65
HARD → 8B: 0.15-0.35, 70B: 0.30-0.50
Design principles for genuine difficulty:
EASY: Alert metrics are clear. Only trick is P2-vs-P3 ambiguity.
MEDIUM: Root cause buried. 8-10 known services. Multiple red herrings.
incident_summary does NOT hint at root cause. Must investigate 4+ services.
HARD: Same diagnosis challenge + 5-7 step remediation sequence +
10+ known services (many wrong choices) + quality summary required.
Public API:
get_task(task_id) -> task metadata dict
get_scenario(task_id, index) -> scenario dict
list_tasks() -> list of task dicts
ALL_TASKS -> dict[task_id -> metadata]
"""
from __future__ import annotations
# Registry of task metadata, keyed by task id.
#
# Every entry carries the same fields:
#   id                 - repeats the dictionary key
#   name               - human-readable task title
#   difficulty         - "easy" | "medium" | "hard"
#   max_steps          - step budget for one episode
#   score_range        - [min, max] score bounds
#   description        - instructions shown for the task
#   available_actions  - action names permitted in this task; the last one
#                        is the submission action
#   submission_action  - action that submits the final answer
#   scenarios          - number of scenarios for the task
#                        (presumably must match len(SCENARIOS[task_id]) - verify)
#
# NOTE: the description strings previously contained mis-decoded UTF-8
# ("β€”"); they now use a real em dash.
ALL_TASKS: dict = {
    "alert_classification": {
        "id": "alert_classification",
        "name": "Task 1: Alert Severity Classification",
        "difficulty": "easy",
        "max_steps": 3,
        "score_range": [0.0, 1.0],
        "description": (
            "An alert has fired. Query logs and metrics across affected services, "
            "then classify the incident severity: P1 (CRITICAL — complete outage or "
            "revenue >$1,000/min), P2 (HIGH — major degradation affecting most users), "
            "P3 (MEDIUM — partial/minor issue with graceful fallback), "
            "P4 (LOW — informational). Submit with submit_severity."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "submit_severity",
        ],
        "submission_action": "submit_severity",
        "scenarios": 3,
    },
    "root_cause_analysis": {
        "id": "root_cause_analysis",
        "name": "Task 2: Root Cause Analysis",
        "difficulty": "medium",
        "max_steps": 10,
        "score_range": [0.0, 1.0],
        "description": (
            "A production incident is active with multiple services showing errors. "
            "Use diagnostic tools to trace the failure chain. The root cause may be "
            "any service in the system — not necessarily one showing errors. "
            "Query logs, metrics, dependencies, and recent deploys across ALL "
            "available services to find the true trigger. Submit with submit_root_cause."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "check_service_status",
            "submit_root_cause",
        ],
        "submission_action": "submit_root_cause",
        "scenarios": 3,
    },
    "remediation_planning": {
        "id": "remediation_planning",
        "name": "Task 3: Incident Remediation",
        "difficulty": "hard",
        "max_steps": 15,
        "score_range": [0.0, 1.0],
        "description": (
            "A critical production incident requires full end-to-end resolution. "
            "Diagnose the root cause among many services, execute the correct "
            "remediation sequence (order matters — wrong actions are penalized), "
            "then submit a detailed resolution summary. Scored on diagnosis quality, "
            "remediation correctness, action efficiency, and documentation."
        ),
        "available_actions": [
            # Diagnostic actions
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "check_service_status",
            # Remediation actions
            "restart_service",
            "rollback_deploy",
            "scale_service",
            "disable_feature_flag",
            "clear_cache",
            "execute_runbook_step",
            # Submission
            "submit_resolution",
        ],
        "submission_action": "submit_resolution",
        "scenarios": 3,
    },
}
# ---------------------------------------------------------------------------
# Scenario data — 3 tasks × 3 scenarios = 9 total episodes
# ---------------------------------------------------------------------------
SCENARIOS: dict = {
# ══════════════════════════════════════════════════════════════════════
# TASK 1: ALERT CLASSIFICATION (EASY)
# Target: 8B→0.75-1.0, 70B→0.85-1.0
#
# AC-001: Clear P1 (78% errors, $12k/min) — should be trivial
# AC-002: Ambiguous P2 (degraded but working, $800/min)
# AC-003: Trap P3 (45% errors but zero revenue impact, graceful fallback)
# ══════════════════════════════════════════════════════════════════════
"alert_classification": [
# AC-001: Clear P1
{
"scenario_id": "AC-001",
"description": (
"Cascading failure across multiple services. "
"Assess severity based on user and revenue impact."
),
"incident_summary": (
"Alert fired: api-gateway reporting elevated error rates. "
"Multiple downstream services affected. "
"Assess the severity of this incident."
),
"alert": {
"id": "ALT-20240315-001",
"title": "api-gateway error rate elevated",
"severity_fired": "UNCLASSIFIED",
"affected_services": ["api-gateway", "auth-service", "postgres-db"],
"symptoms": [
"api-gateway: HTTP 503 rate 78% (baseline: 0.1%)",
"auth-service: connection timeout 94% of requests",
"postgres-db: connection pool 500/500 utilized",
"checkout flow: unavailable",
"user logins: failing",
],
"error_rate": 0.78,
"duration_minutes": 4,
"revenue_impact_per_min": 12000,
},
"known_services": {"api-gateway", "auth-service", "postgres-db"},
"tool_responses": {
"query_logs": {
"api-gateway": (
"2024-03-15T10:04:12Z ERROR upstream timeout auth-service:8080\n"
"2024-03-15T10:04:13Z ERROR 503 Service Unavailable\n"
"2024-03-15T10:04:14Z ERROR circuit breaker OPEN"
),
"auth-service": (
"2024-03-15T10:04:10Z ERROR too many clients already\n"
"2024-03-15T10:04:11Z ERROR connection pool exhausted (500/500)"
),
"postgres-db": (
"2024-03-15T10:04:00Z FATAL remaining slots reserved for superuser\n"
"2024-03-15T10:04:01Z LOG max_connections=500 active=500"
),
},
"check_metrics": {
"api-gateway": "5xx rate: 78% | p99: 30s | circuit_breaker: OPEN",
"auth-service": "Error rate: 94% | DB wait: 28s | Queue: 847",
"postgres-db": "Connections: 500/500 (100%) | CPU: 98% | Memory: 89%",
},
"check_dependencies": {
"api-gateway": "Depends on: auth-service [CRITICAL]",
"auth-service": "Depends on: postgres-db [CRITICAL]",
"postgres-db": "No upstream dependencies",
},
"check_recent_deploys": {
"api-gateway": "No recent changes",
"auth-service": "Deploy 47 min ago β€” connection pool size change",
"postgres-db": "No recent changes",
},
},
"correct_severity": "P1",
"adjacent_severities": ["P2"],
},
# AC-002: Ambiguous P2 — degraded but not down
{
"scenario_id": "AC-002",
"description": (
"Service degradation affecting page load times. "
"Core transaction flows still operational. "
"Assess severity carefully."
),
"incident_summary": (
"Alert fired: CDN cache performance degraded. "
"Origin servers under increased load. "
"Assess the severity of this incident."
),
"alert": {
"id": "ALT-20240315-002",
"title": "CDN cache performance anomaly detected",
"severity_fired": "UNCLASSIFIED",
"affected_services": ["cdn-edge", "product-service", "image-service"],
"symptoms": [
"CDN cache hit rate: 3% (normal: 94%)",
"product-service: elevated origin traffic",
"image-service: CPU 95%, p99 latency 18s",
"Product pages: loading slowly",
"Checkout: still functional",
],
"error_rate": 0.15,
"duration_minutes": 8,
"revenue_impact_per_min": 800,
},
"known_services": {"cdn-edge", "product-service", "image-service"},
"tool_responses": {
"query_logs": {
"cdn-edge": (
"2024-03-15T10:22:00Z INFO cache MISS ratio: 97%\n"
"2024-03-15T10:20:11Z WARN mass cache invalidation β€” 2.1M keys purged\n"
"2024-03-15T10:20:10Z INFO purge pattern: /* (ALL keys)"
),
"product-service": (
"2024-03-15T10:22:05Z WARN request queue depth: 12,400\n"
"2024-03-15T10:22:06Z ERROR timeout from image-service\n"
"2024-03-15T10:22:07Z WARN worker pool 95%"
),
"image-service": (
"2024-03-15T10:22:00Z WARN CPU throttling 95%\n"
"2024-03-15T10:22:01Z ERROR worker pool exhausted\n"
"2024-03-15T10:22:02Z WARN memory at 91%"
),
},
"check_metrics": {
"cdn-edge": "Cache hit: 3% | Origin RPS: 48,000 | Bandwidth: 890 Gbps",
"product-service": "Origin RPS: 48k (norm: 1.2k) | Queue: 12,400",
"image-service": "CPU: 95% | Memory: 91% | p99: 18s",
},
"check_dependencies": {
"cdn-edge": "Origin: product-service [OVERLOADED]",
"product-service": "Depends on: image-service [DEGRADED]",
"image-service": "Depends on: object-storage [OK]",
},
"check_recent_deploys": {
"cdn-edge": "Cronjob updated 2h ago β€” purge pattern changed",
"product-service": "No recent changes",
"image-service": "No recent changes",
},
},
"correct_severity": "P2",
"adjacent_severities": ["P1", "P3"],
},
# AC-003: P3 trap — high error rate but zero impact
{
"scenario_id": "AC-003",
"description": (
"Internal service reporting elevated errors. "
"Determine actual user and business impact. "
"Not all high error rates are critical."
),
"incident_summary": (
"Alert fired: recommendation-service error rate elevated to 45%. "
"Assess the severity based on actual user and business impact."
),
"alert": {
"id": "ALT-20240315-003",
"title": "recommendation-service error rate 45%",
"severity_fired": "UNCLASSIFIED",
"affected_services": ["recommendation-service", "product-service"],
"symptoms": [
"recommendation-service: error rate 45% (baseline: 2%)",
"product-service: using fallback recommendation logic",
"User experience: default recommendations shown",
"Checkout: fully functional",
"Revenue: no measurable change",
],
"error_rate": 0.45,
"duration_minutes": 22,
"revenue_impact_per_min": 0,
},
"known_services": {"recommendation-service", "product-service", "redis-reco-cache"},
"tool_responses": {
"query_logs": {
"recommendation-service": (
"2024-03-15T09:48:00Z ERROR model inference timeout (>5s)\n"
"2024-03-15T09:48:01Z WARN ML model server overloaded\n"
"2024-03-15T09:48:02Z INFO fallback: returning default recommendations"
),
"product-service": (
"2024-03-15T09:48:05Z INFO recommendation-service returned defaults\n"
"2024-03-15T09:48:06Z INFO serving page with default recs β€” no user impact"
),
"redis-reco-cache": "Operating normally β€” cache hit rate 88%",
},
"check_metrics": {
"recommendation-service": (
"Error rate: 45% | Fallback rate: 45% | "
"Model server: OVERLOADED | User impact: NONE (graceful)"
),
"product-service": (
"Error rate: 0.1% (normal) | Checkout: 100% | Revenue: unchanged"
),
"redis-reco-cache": "Hit rate: 88% | Memory: 34% | HEALTHY",
},
"check_dependencies": {
"recommendation-service": "Depends on: ML model server [SLOW]",
"product-service": "Depends on: recommendation-service [DEGRADED β€” has fallback]",
"redis-reco-cache": "No dependencies",
},
"check_recent_deploys": {
"recommendation-service": "Model update 3h ago β€” new model v2.4",
"product-service": "No recent changes",
"redis-reco-cache": "No recent changes",
},
},
"correct_severity": "P3",
"adjacent_severities": ["P2", "P4"],
},
],
# ══════════════════════════════════════════════════════════════════════
# TASK 2: ROOT CAUSE ANALYSIS (MEDIUM)
# Target: 8B→0.30-0.50, 70B→0.45-0.65
#
# KEY DESIGN RULES:
# 1. Root cause service NEVER in affected_services
# 2. incident_summary describes SYMPTOMS only, no hints
# 3. 8-10 known_services (many to investigate)
# 4. Red herring deploys on non-root-cause services
# 5. Root cause only findable via check_recent_deploys + query_logs
# on the specific service — not from looking at victims
# ══════════════════════════════════════════════════════════════════════
"root_cause_analysis": [
# RCA-001: analytics-service OOM kills postgres-db
# 8 known services. Root cause: analytics-service.
# Red herrings: auth-service deploy (cosmetic), redis healthy
{
"scenario_id": "RCA-001",
"description": (
"Multiple services reporting failures. Database appears to be "
"the epicenter but the true trigger may be elsewhere."
),
"incident_summary": (
"Multiple services are failing. postgres-db is in a crash loop. "
"auth-service, order-service, and api-gateway are all reporting errors. "
"Investigate all available services to find what triggered this cascade."
),
"alert": {
"id": "ALT-RCA-001",
"title": "Multiple service failures β€” database crash loop",
"severity_fired": "P1",
"affected_services": [
"api-gateway", "auth-service", "order-service", "postgres-db",
],
"symptoms": [
"postgres-db: crash loop β€” 4 restarts in 12 minutes",
"auth-service: 100% connection failures",
"order-service: all writes failing",
"api-gateway: 503 on authenticated routes",
],
"error_rate": 0.95,
"duration_minutes": 14,
},
"known_services": {
"api-gateway", "auth-service", "order-service",
"postgres-db", "analytics-service", "redis-session",
"product-service", "notification-service",
},
"tool_responses": {
# In RCA-001, replace the query_logs section:
"query_logs": {
"postgres-db": (
"2024-03-16T02:11:00Z LOG database system shut down\n"
"2024-03-16T02:10:58Z FATAL terminated by kernel OOM killer\n"
"2024-03-16T02:10:30Z LOG long-running analytics export query "
"consuming 31.8GB/32GB β€” sequential scan on events table "
"with cross-join, running 12 minutes, no LIMIT clause. "
"Investigate analytics-service scheduled jobs"
),
"analytics-service": (
"2024-03-16T01:58:00Z INFO starting scheduled job: full_history_export\n"
"2024-03-16T01:58:01Z DEBUG executing: SELECT * FROM events "
"JOIN user_sessions ON ... JOIN orders ON ... β€” no LIMIT\n"
"2024-03-16T01:58:02Z WARN query plan estimates 847M row scan\n"
"2024-03-16T02:10:55Z ERROR job terminated β€” connection to database lost"
),
"auth-service": (
"2024-03-16T02:11:05Z ERROR connect ECONNREFUSED postgres-db:5432\n"
"2024-03-16T02:11:06Z ERROR all retries exhausted"
),
"api-gateway": (
"2024-03-16T02:11:10Z ERROR upstream auth-service: 503"
),
"order-service": (
"2024-03-16T02:11:08Z ERROR pq: database system is starting up"
),
"redis-session": "No errors β€” operating normally",
"product-service": (
"2024-03-16T02:11:12Z WARN DB queries failing β€” serving cached data"
),
"notification-service": (
"2024-03-16T02:11:15Z ERROR cannot send β€” user lookup failed"
),
},
"check_metrics": {
"postgres-db": (
"Memory: peaked at 31.8GB/32GB before kill | "
"Restarts: 4 in 12min | Status: RESTARTING | "
"Heaviest client: 10.0.5.47"
),
"analytics-service": (
"Last job: FAILED | Memory during job: 28GB | "
"IP: 10.0.5.47 | CPU: idle (job terminated)"
),
"auth-service": "Connections: 0% success | Queued requests: 1,200",
"api-gateway": "503 rate: 95% | Auth: DOWN",
"order-service": "Write success: 0% | DB: RESTARTING",
"redis-session": "Hit rate: 99.2% | Memory: 42% | HEALTHY",
"product-service": "Serving cached data | DB queries: 100% failing",
"notification-service": "Queue backlog: 8,400 | DB: DOWN",
},
"check_dependencies": {
"postgres-db": (
"Clients: auth-service, order-service, analytics-service, "
"product-service, notification-service"
),
"analytics-service": "Depends on: postgres-db [CRASH LOOP]",
"auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
"api-gateway": "Depends on: auth-service [DOWN], product-service [DEGRADED]",
"order-service": "Depends on: postgres-db [CRASH LOOP]",
"redis-session": "Standalone cache β€” no DB dependency",
"product-service": "Depends on: postgres-db [CRASH LOOP β€” using cache]",
"notification-service": "Depends on: postgres-db [CRASH LOOP]",
},
"check_recent_deploys": {
"analytics-service": (
"Deploy 6h ago: added scheduled data export job β€” "
"runs daily at 02:00 UTC. Change includes cross-table "
"JOIN query without LIMIT clause"
),
"postgres-db": "No deploys in 3 weeks",
"auth-service": (
"Deploy 2h ago: updated structured logging format. "
"No functional changes, no query changes, no connection changes."
),
"order-service": "No recent deploys",
"redis-session": "No recent deploys",
"api-gateway": "No recent deploys",
"product-service": (
"Deploy 3 days ago: added product image lazy loading. "
"No DB changes."
),
"notification-service": "No recent deploys",
},
"check_service_status": {
"postgres-db": "RESTARTING | Uptime: 47s | Last crash: OOM",
"analytics-service": "ERROR | Last job: FAILED 12min ago",
"auth-service": "DOWN | Blocked on postgres-db",
"api-gateway": "DEGRADED | 95% errors",
"order-service": "DOWN | Blocked on postgres-db",
"redis-session": "HEALTHY | 99.2% hit rate",
"product-service": "DEGRADED | Cache fallback active",
"notification-service": "DEGRADED | Queue backlog 8,400",
},
},
"correct_root_cause": {
"service": "analytics-service",
"failure_mode": "unbounded query OOM killing postgres-db",
},
"wrong_actions": {
"restart_service:auth-service": "victim β€” DB must be fixed first",
"restart_service:api-gateway": "downstream β€” won't help",
"restart_service:order-service": "victim β€” won't help",
"scale_service:postgres-db": "won't prevent OOM from bad query",
"rollback_deploy:postgres-db": "no recent deploys",
"rollback_deploy:auth-service": "auth deploy was cosmetic only",
"rollback_deploy:product-service": "product deploy unrelated",
"restart_service:redis-session": "redis is healthy",
"restart_service:notification-service": "victim β€” won't help",
},
},
# RCA-002: network-infra BGP withdrawal
# 8 known services. Root cause: network-infra.
# Red herrings: payment-service looks down, postgres-db exists
{
"scenario_id": "RCA-002",
"description": (
"Checkout failures concentrated in specific availability zones. "
"Some services appear unreachable while others work fine."
),
"incident_summary": (
"Checkout failure rate has spiked to 61%. payment-service and "
"fraud-detection-service are unreachable from some parts of the "
"infrastructure but appear healthy from others. Multiple services "
"to investigate. Find the root cause."
),
"alert": {
"id": "ALT-RCA-002",
"title": "Checkout failures β€” partial service unreachability",
"severity_fired": "P2",
"affected_services": [
"order-service", "payment-service", "fraud-detection-service",
],
"symptoms": [
"checkout failure rate: 61%",
"payment-service: intermittently unreachable",
"fraud-detection-service: intermittently unreachable",
"failures appear zone-specific",
],
"error_rate": 0.61,
"duration_minutes": 9,
},
"known_services": {
"order-service", "payment-service", "fraud-detection-service",
"postgres-db", "redis-payment-cache", "network-infra",
"cdn-edge", "api-gateway",
},
"tool_responses": {
# In RCA-002, replace query_logs:
"query_logs": {
"order-service": (
"2024-03-17T14:32:10Z ERROR connection timeout "
"payment-service:8080 β€” no route to host\n"
"2024-03-17T14:32:11Z ERROR fraud-detection-service: i/o timeout\n"
"2024-03-17T14:32:12Z WARN failures only from AZ-2/AZ-3, "
"AZ-1 traffic normal β€” possible network-infra issue"
),
"payment-service": (
"2024-03-17T14:31:58Z WARN health check from external LB failing\n"
"2024-03-17T14:31:59Z INFO local AZ-1 traffic: all normal\n"
"2024-03-17T14:32:00Z INFO processing requests normally (local only)"
),
"fraud-detection-service": (
"2024-03-17T14:32:00Z INFO local requests: processing normally\n"
"2024-03-17T14:32:01Z WARN external health probes: 100% timeout"
),
"network-infra": (
"2024-03-17T14:31:45Z CRITICAL BGP session 10.0.2.1 DOWN β€” "
"routes to 10.0.1.0/24 withdrawn from peer\n"
"2024-03-17T14:31:45Z CRITICAL BGP session 10.0.3.1 DOWN β€” "
"routes to 10.0.1.0/24 withdrawn from peer\n"
"2024-03-17T14:31:44Z INFO configuration change applied β€” "
"export filter policy updated"
),
"postgres-db": "Operating normally β€” no errors",
"redis-payment-cache": "Operating normally β€” all healthy",
"cdn-edge": "Operating normally β€” cache serving fine",
"api-gateway": (
"2024-03-17T14:32:15Z ERROR some backend routes timing out\n"
"2024-03-17T14:32:16Z INFO AZ-1 backends: responding normally"
),
},
"check_metrics": {
"order-service": (
"Failure rate varies by source AZ: "
"AZ-1: 0.2% | AZ-2: 99% | AZ-3: 98%"
),
"payment-service": (
"Internal processing: 100% success | "
"Inbound from AZ-2: 0 connections | Inbound from AZ-3: 0 connections | "
"Inbound from AZ-1: normal"
),
"fraud-detection-service": (
"Internal: normal | External probes: 100% timeout"
),
"network-infra": (
"BGP sessions: AZ-1 internal UP | "
"AZ-2β†’AZ-1: WITHDRAWN | AZ-3β†’AZ-1: WITHDRAWN | "
"Last change: 18min ago"
),
"postgres-db": "All metrics normal",
"redis-payment-cache": "All metrics normal",
"cdn-edge": "Cache hit: 91% | Normal operation",
"api-gateway": "Mixed β€” AZ-1 OK, AZ-2/AZ-3 partial failures",
},
"check_dependencies": {
"order-service": (
"Depends on: payment-service [PARTIAL], "
"fraud-detection-service [PARTIAL]"
),
"payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
"fraud-detection-service": "Depends on: postgres-db [OK]",
"network-infra": (
"BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN], AZ-1 [UP]"
),
"postgres-db": "All connections healthy",
"redis-payment-cache": "All connections healthy",
"cdn-edge": "No issues",
"api-gateway": "Depends on: multiple backends [MIXED]",
},
"check_recent_deploys": {
"network-infra": (
"Router configuration change 18min ago β€” modified BGP "
"export filter policy. Change accidentally removed AZ-1 "
"prefix 10.0.1.0/24 from advertisements to AZ-2 and AZ-3 peers."
),
"payment-service": "No recent deploys",
"order-service": "No recent deploys",
"fraud-detection-service": "No recent deploys",
"postgres-db": (
"Minor config change 5 days ago β€” increased shared_buffers. "
"No issues since."
),
"redis-payment-cache": "No recent deploys",
"cdn-edge": "No recent deploys",
"api-gateway": (
"Deploy 1 day ago β€” added request tracing headers. "
"No routing changes."
),
},
"check_service_status": {
"payment-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
"order-service": "DEGRADED | Partial failures",
"network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
"fraud-detection-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
"postgres-db": "HEALTHY",
"redis-payment-cache": "HEALTHY",
"cdn-edge": "HEALTHY",
"api-gateway": "DEGRADED | Mixed backend status",
},
},
"correct_root_cause": {
"service": "network-infra",
"failure_mode": "BGP route withdrawal causing AZ network partition",
},
"wrong_actions": {
"restart_service:payment-service": "healthy β€” network issue",
"restart_service:order-service": "victim",
"scale_service:payment-service": "won't fix routing",
"clear_cache:redis-payment-cache": "cache is healthy",
"restart_service:api-gateway": "victim of routing issue",
"rollback_deploy:api-gateway": "deploy was unrelated tracing headers",
"rollback_deploy:postgres-db": "config change was 5 days ago, unrelated",
"restart_service:cdn-edge": "CDN is healthy",
},
},
# RCA-003: config-service credential rotation bug
# 8 known services. Root cause: config-service.
# Red herrings: user-service had a recent deploy, postgres-db stressed
{
"scenario_id": "RCA-003",
"description": (
"Multiple services experiencing database authentication failures. "
"The database itself may not be the problem."
),
"incident_summary": (
"Several services are reporting database authentication failures. "
"postgres-db connection pool is saturated. user-service and "
"notification-service are down. api-gateway error rate elevated. "
"Investigate all services to find what triggered this."
),
"alert": {
"id": "ALT-RCA-003",
"title": "Multiple services β€” database authentication failures",
"severity_fired": "P2",
"affected_services": [
"api-gateway", "user-service", "notification-service", "postgres-db",
],
"symptoms": [
"user-service: FATAL password authentication failed",
"notification-service: FATAL password authentication failed",
"api-gateway: 503 rate 62%",
"postgres-db: connection pool 490/500",
],
"error_rate": 0.62,
"duration_minutes": 7,
},
"known_services": {
"api-gateway", "user-service", "notification-service",
"postgres-db", "config-service", "redis-session",
"order-service", "product-service",
},
"tool_responses": {
# In RCA-003, replace query_logs:
"query_logs": {
"user-service": (
"2024-03-18T08:14:00Z FATAL password authentication failed "
"for user 'app_user'\n"
"2024-03-18T08:14:01Z ERROR DB credentials rejected β€” "
"credentials were last pushed by config-service secrets "
"rotation at 08:12:00Z\n"
"2024-03-18T08:14:02Z WARN credential hash mismatch β€” "
"check config-service rotation job for issues"
),
"notification-service": (
"2024-03-18T08:14:05Z FATAL password authentication failed "
"for user 'app_user'\n"
"2024-03-18T08:14:06Z WARN credentials from config-service "
"rotation at 08:12:00Z appear invalid"
),
"api-gateway": (
"2024-03-18T08:14:10Z ERROR upstream user-service: 503\n"
"2024-03-18T08:14:11Z ERROR upstream notification-service: 503"
),
"postgres-db": (
"2024-03-18T08:14:00Z LOG auth failure from 10.0.3.x\n"
"2024-03-18T08:14:00Z LOG auth failure from 10.0.4.x\n"
"2024-03-18T08:14:01Z LOG 490/500 slots used by failed auth retries"
),
"config-service": (
"2024-03-18T08:12:00Z INFO secrets rotation job executed\n"
"2024-03-18T08:12:01Z WARN rotation referenced PREVIOUS "
"credential set instead of generating new β€” template bug "
"in version v3.2.1\n"
"2024-03-18T08:12:02Z INFO pushed credentials to: "
"user-service, notification-service, order-service"
),
"redis-session": "Operating normally",
"order-service": (
"2024-03-18T08:14:20Z WARN received credential push from "
"config-service but have not restarted β€” still using old valid creds"
),
"product-service": "Operating normally β€” using original credentials",
},
"check_metrics": {
"user-service": "DB auth: 100% failure | HTTP 503: 100%",
"notification-service": "DB auth: 100% failure | HTTP 503: 100%",
"api-gateway": "503 rate: 62% | Some upstreams DOWN",
"postgres-db": (
"Connections: 490/500 | Auth failures/s: 80 | "
"Valid connections: 10 | DB itself: HEALTHY"
),
"config-service": (
"Status: HEALTHY | Last push: 7min ago | "
"Type: secrets_rotation | Result: COMPLETED"
),
"redis-session": "All normal",
"order-service": "Using old credentials β€” still working",
"product-service": "All normal β€” unaffected",
},
"check_dependencies": {
"user-service": (
"Depends on: postgres-db [AUTH FAIL], "
"config-service [credential source]"
),
"notification-service": (
"Depends on: postgres-db [AUTH FAIL], "
"config-service [credential source]"
),
"api-gateway": "Depends on: user-service [DOWN], notification-service [DOWN]",
"postgres-db": "No upstream dependencies β€” DB is healthy",
"config-service": (
"Provides: credentials to user-service, "
"notification-service, order-service"
),
"redis-session": "Standalone",
"order-service": (
"Depends on: postgres-db [OK β€” old creds], "
"config-service [pending push]"
),
"product-service": "Depends on: postgres-db [OK β€” original creds]",
},
"check_recent_deploys": {
"config-service": (
"Deploy 2h ago: version v3.2.1 β€” updated secrets rotation "
"job template. Bug: rotation references previous credential "
"set instead of generating new credentials."
),
"user-service": (
"Deploy 4h ago: added new profile API endpoint. "
"No database or credential changes."
),
"notification-service": "No recent deploys",
"postgres-db": "No recent deploys",
"api-gateway": "No recent deploys",
"redis-session": "No recent deploys",
"order-service": (
"Deploy 1 day ago: updated order confirmation email template. "
"No DB changes."
),
"product-service": "No recent deploys",
},
"check_service_status": {
"user-service": "DOWN | DB auth failures",
"notification-service": "DOWN | DB auth failures",
"api-gateway": "DEGRADED | 62% error rate",
"postgres-db": "STRESSED but HEALTHY | 490/500 connections (failed auths)",
"config-service": "HEALTHY | Last rotation: 7min ago (completed)",
"redis-session": "HEALTHY",
"order-service": "HEALTHY | Old credentials still valid",
"product-service": "HEALTHY",
},
},
"correct_root_cause": {
"service": "config-service",
"failure_mode": "secrets rotation pushed stale credentials to downstream services",
},
"wrong_actions": {
"restart_service:user-service": "will retry with same bad credentials",
"restart_service:notification-service": "same bad credentials",
"restart_service:postgres-db": "DB is healthy β€” client creds are bad",
"scale_service:postgres-db": "connections are failed auths",
"rollback_deploy:user-service": "user-service deploy was unrelated",
"rollback_deploy:order-service": "order-service deploy was unrelated",
"restart_service:api-gateway": "downstream β€” fix upstream first",
},
},
],
# ══════════════════════════════════════════════════════════════════════
# TASK 3: REMEDIATION PLANNING (HARD)
# Target: 8B→0.15-0.35, 70B→0.30-0.50
#
# KEY DESIGN RULES:
# 1. Same diagnostic challenge as medium
# 2. 5-7 step remediation sequence required
# 3. 8-10 known services = many wrong choices
# 4. Wrong actions carry -0.05 penalty each (up to -0.15)
# 5. Summary must hit 3+ keywords for bonus
# 6. incident_summary does NOT reveal root cause
# ══════════════════════════════════════════════════════════════════════
"remediation_planning": [
# RP-001: OOM remediation — 6-step sequence, 8 services
{
"scenario_id": "RP-001",
"description": (
"Full incident remediation required. Multiple services down. "
"Diagnose the root cause, execute fixes in the correct order, "
"and document your resolution."
),
"incident_summary": (
"CRITICAL β€” postgres-db is crash-looping. auth-service, order-service, "
"and api-gateway are all down. notification-service queue backing up. "
"Diagnose the root cause, fix it, restore all services, and document."
),
"alert": {
"id": "ALT-RP-001",
"title": "CRITICAL: database crash loop β€” multiple services down",
"severity_fired": "P1",
"affected_services": [
"postgres-db", "auth-service", "order-service", "api-gateway",
],
},
"known_services": {
"postgres-db", "auth-service", "order-service",
"api-gateway", "analytics-service", "redis-session",
"product-service", "notification-service",
},
"tool_responses": {
"query_logs": {
# RP-001 query_logs → postgres-db — REPLACE WITH:
"postgres-db": (
"FATAL: terminated by kernel OOM killer β€” "
"query from client 10.0.5.47 running 12min consuming "
"31.8GB of 32GB available memory"
),
"analytics-service": (
"INFO: starting job full_history_export\n"
"WARN: query plan: 847M rows, cross-table JOIN, no LIMIT\n"
"ERROR: job terminated β€” database connection lost"
),
"auth-service": "ERROR: connect ECONNREFUSED postgres-db:5432",
"order-service": "ERROR: pq: database system is starting up",
"api-gateway": "ERROR: upstream auth-service 503",
"redis-session": "Operating normally",
"product-service": "WARN: DB failing β€” serving cached data",
"notification-service": "ERROR: user lookup failed β€” queuing",
},
"check_metrics": {
"postgres-db": "OOM killed | Restarts: 4 | Heaviest client: 10.0.5.47",
"analytics-service": "Job FAILED | Memory peak: 31GB/32GB | IP: 10.0.5.47",
"auth-service": "0% DB success | Queue: 1,200",
"order-service": "0% write success",
"api-gateway": "503 rate: 95%",
"redis-session": "HEALTHY | 99.2% hit rate",
"product-service": "Cache fallback active",
"notification-service": "Queue: 8,400 messages backed up",
},
"check_dependencies": {
"postgres-db": (
"Clients: auth-service, order-service, analytics-service, "
"product-service, notification-service"
),
"analytics-service": "Depends on: postgres-db [CRASH LOOP]",
"auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
"api-gateway": "Depends on: auth-service [DOWN]",
"order-service": "Depends on: postgres-db [CRASH LOOP]",
"redis-session": "Standalone",
"product-service": "Depends on: postgres-db [CRASH LOOP β€” cache fallback]",
"notification-service": "Depends on: postgres-db [CRASH LOOP]",
},
"check_recent_deploys": {
"analytics-service": (
"Deploy 6h ago: added scheduled export job β€” "
"cross-table JOIN without LIMIT clause"
),
"postgres-db": "No deploys in 3 weeks",
"auth-service": "Deploy 2h ago: logging format only β€” no functional changes",
"order-service": "No recent deploys",
"product-service": "Deploy 3 days ago: image lazy loading β€” no DB changes",
"notification-service": "No recent deploys",
},
"check_service_status": {
"postgres-db": "CRASH LOOP | OOM | Uptime: 47s",
"analytics-service": "ERROR | Job FAILED",
"auth-service": "DOWN",
"order-service": "DOWN",
"api-gateway": "DEGRADED | 95% errors",
"redis-session": "HEALTHY",
"product-service": "DEGRADED | Cache fallback",
"notification-service": "DEGRADED | Queue backlog",
},
},
"remediation_data": {
"disable_feature_flag": {
"full_history_export": (
"Cron job full_history_export DISABLED β€” "
"unbounded query will not execute again"
),
},
"restart_service": {
"postgres-db": "postgres-db restarted β€” accepting connections (12/500)",
"analytics-service": "analytics-service restarted β€” idle",
"auth-service": "auth-service restarted β€” connected to postgres-db OK",
"order-service": "order-service restarted β€” writes resuming",
"api-gateway": "api-gateway restarted β€” routing recovered",
"product-service": "product-service β€” switched from cache to live DB",
"notification-service": "notification-service β€” draining queue",
},
"execute_runbook_step": {
"verify_db_health": "postgres-db: 12/500 connections, CPU 12%, Memory 34% β€” healthy",
"check_service_recovery": (
"auth OK | order OK | api-gateway OK | product OK | notification DRAINING"
),
},
},
"correct_remediation_sequence": [
"disable_feature_flag:full_history_export",
"restart_service:analytics-service",
"restart_service:postgres-db",
"restart_service:auth-service",
"restart_service:order-service",
"execute_runbook_step:verify_db_health",
],
"wrong_actions": {
"rollback_deploy:postgres-db": "no recent deploy",
"scale_service:postgres-db": "won't prevent OOM",
"restart_service:api-gateway": "downstream β€” fix DB stack first",
"rollback_deploy:auth-service": "cosmetic deploy only",
"clear_cache:redis-session": "healthy β€” not related",
"restart_service:redis-session": "healthy β€” not related",
"rollback_deploy:product-service": "unrelated deploy",
"restart_service:notification-service": "will recover once DB is up",
},
"resolution_keywords": [
"analytics", "oom", "memory", "postgres", "query",
"full_history_export", "disabled", "restarted",
"recovered", "unbounded", "crash", "kill",
],
},
# RP-002: BGP remediation — 4-step sequence, 8 services
{
"scenario_id": "RP-002",
"description": (
"Full incident remediation required. Checkout failures affecting "
"most users. Diagnose, fix, verify, and document."
),
"incident_summary": (
"Checkout failure rate 61%. payment-service unreachable from most "
"of the infrastructure. Some services report no issues. "
"Diagnose the root cause, execute remediation, verify recovery, "
"and document the resolution."
),
"alert": {
"id": "ALT-RP-002",
"title": "Checkout failures β€” partial service unreachability",
"severity_fired": "P2",
"affected_services": ["order-service", "payment-service"],
},
"known_services": {
"network-infra", "order-service", "payment-service",
"fraud-detection-service", "postgres-db",
"redis-payment-cache", "cdn-edge", "api-gateway",
},
"tool_responses": {
"query_logs": {
"network-infra": (
"CRITICAL: BGP peer 10.0.2.1 route withdrawal β€” "
"routes to 10.0.1.0/24 removed\n"
"CRITICAL: BGP peer 10.0.3.1 route withdrawal β€” "
"routes to 10.0.1.0/24 removed\n"
"INFO: configuration change applied β€” export filter updated"
),
"order-service": "ERROR: timeout payment-service β€” no route to host",
"payment-service": "INFO: local traffic normal | WARN: external health failing",
"fraud-detection-service": "WARN: cross-AZ probes timeout | Local: OK",
"postgres-db": "Operating normally",
"redis-payment-cache": "Operating normally",
"cdn-edge": "Operating normally",
"api-gateway": "ERROR: some backend routes timing out",
},
"check_metrics": {
"network-infra": (
"BGP AZ-2β†’AZ-1: WITHDRAWN | AZ-3β†’AZ-1: WITHDRAWN | "
"AZ-1 internal: UP | Last change: 18min ago"
),
"order-service": "AZ-1: 0.2% fail | AZ-2: 99% fail | AZ-3: 98% fail",
"payment-service": "Internal: 100% success | External: 0 inbound from AZ-2/3",
"fraud-detection-service": "Local: normal | External: timeout",
"postgres-db": "All normal",
"redis-payment-cache": "All normal",
"cdn-edge": "Cache: 91% hit | Normal",
"api-gateway": "Mixed β€” AZ-1 OK, AZ-2/3 partial failures",
},
"check_dependencies": {
"order-service": "Depends on: payment-service [PARTIAL], fraud-detection [PARTIAL]",
"payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
"network-infra": "BGP: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN]",
"fraud-detection-service": "Depends on: postgres-db [OK]",
"postgres-db": "All healthy",
"redis-payment-cache": "All healthy",
"cdn-edge": "No issues",
"api-gateway": "Mixed backends",
},
"check_recent_deploys": {
"network-infra": (
"Config change 18min ago β€” BGP export filter modified, "
"accidentally removed AZ-1 prefix from AZ-2/AZ-3 ads"
),
"payment-service": "No recent deploys",
"order-service": "No recent deploys",
"fraud-detection-service": "No recent deploys",
"postgres-db": "Minor change 5 days ago β€” increased shared_buffers",
"redis-payment-cache": "No recent deploys",
"cdn-edge": "No recent deploys",
"api-gateway": "Deploy 1 day ago β€” tracing headers, no routing changes",
},
"check_service_status": {
"network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
"payment-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
"order-service": "DEGRADED",
"fraud-detection-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
"postgres-db": "HEALTHY",
"redis-payment-cache": "HEALTHY",
"cdn-edge": "HEALTHY",
"api-gateway": "DEGRADED",
},
},
"remediation_data": {
"rollback_deploy": {
"network-infra": "Router config rolled back β€” BGP policy restored",
},
"execute_runbook_step": {
"restore_bgp_routes": "BGP routes restored β€” AZ-2/3 can reach AZ-1",
"verify_checkout_recovery": "Checkout failure: 0.3% β€” resolved",
"verify_cross_az_connectivity": "AZ-2β†’AZ-1: OK | AZ-3β†’AZ-1: OK",
},
},
"correct_remediation_sequence": [
"execute_runbook_step:restore_bgp_routes",
"rollback_deploy:network-infra",
"execute_runbook_step:verify_cross_az_connectivity",
"execute_runbook_step:verify_checkout_recovery",
],
"wrong_actions": {
"restart_service:payment-service": "healthy β€” network issue",
"scale_service:payment-service": "won't fix routing",
"restart_service:order-service": "victim",
"clear_cache:redis-payment-cache": "unrelated",
"restart_service:cdn-edge": "healthy",
"restart_service:fraud-detection-service": "healthy locally",
"restart_service:api-gateway": "victim of routing",
"rollback_deploy:api-gateway": "deploy was unrelated",
"rollback_deploy:postgres-db": "change was 5 days ago",
},
"resolution_keywords": [
"bgp", "network", "route", "rollback", "partition",
"restored", "az-1", "az-2", "az-3", "checkout",
"withdrawal", "config", "advertisement", "export",
],
},
# RP-003: Credential rotation remediation — 7-step sequence, 8 services
{
"scenario_id": "RP-003",
"description": (
"Full incident remediation required. Multiple services failing "
"database authentication. Diagnose, fix, verify, and document."
),
"incident_summary": (
"Multiple services reporting database authentication failures. "
"postgres-db connection pool near capacity with failed auth attempts. "
"user-service and notification-service are down. api-gateway degraded. "
"Diagnose the root cause, execute remediation, and document."
),
"alert": {
"id": "ALT-RP-003",
"title": "Multiple services β€” DB authentication failures",
"severity_fired": "P2",
"affected_services": [
"user-service", "notification-service", "api-gateway",
],
},
"known_services": {
"api-gateway", "user-service", "notification-service",
"postgres-db", "config-service", "redis-session",
"order-service", "product-service",
},
"tool_responses": {
"query_logs": {
"user-service": (
"FATAL: password authentication failed for user 'app_user'\n"
"ERROR: DB credentials rejected\n"
"WARN: credentials last refreshed at 08:12:00Z"
),
"notification-service": (
"FATAL: password authentication failed\n"
"WARN: credentials last refreshed at 08:12:00Z β€” "
"authentication rejected by postgres-db"
),
"api-gateway": (
"ERROR: upstream user-service 503\n"
"ERROR: upstream notification-service 503"
),
"postgres-db": (
"LOG: auth failure from 10.0.3.x (user-service)\n"
"LOG: auth failure from 10.0.4.x (notification-service)\n"
"LOG: 490/500 slots used by failed auth retries"
),
"config-service": (
"INFO: secrets rotation executed at 08:12:00Z\n"
"WARN: rotation used PREVIOUS credential set β€” "
"template bug in v3.2.1\n"
"INFO: pushed to: user-service, notification-service, order-service"
),
"redis-session": "Operating normally",
"order-service": (
"WARN: received credential push at 08:12:00Z β€” "
"not applied yet, still using old valid credentials"
),
"product-service": "Operating normally β€” using original credentials",
},
"check_metrics": {
"user-service": "DB auth: 100% failure | HTTP 503: 100%",
"notification-service": "DB auth: 100% failure | HTTP 503: 100%",
"api-gateway": "503 rate: 62%",
"postgres-db": "Connections: 490/500 | Auth failures/s: 80 | DB: HEALTHY",
"config-service": "HEALTHY | Last push: 7min ago | Type: secrets_rotation",
"redis-session": "All normal",
"order-service": "HEALTHY | Using old (valid) credentials",
"product-service": "HEALTHY | Unaffected",
},
"check_dependencies": {
"user-service": "Depends on: postgres-db [AUTH FAIL], config-service [creds]",
"notification-service": "Depends on: postgres-db [AUTH FAIL], config-service [creds]",
"api-gateway": "Depends on: user-service [DOWN], notification-service [DOWN]",
"postgres-db": "No upstream β€” DB itself is healthy",
"config-service": "Provides credentials to: user-svc, notification-svc, order-svc",
"redis-session": "Standalone",
"order-service": "Depends on: postgres-db [OK β€” old creds]",
"product-service": "Depends on: postgres-db [OK β€” original creds]",
},
"check_recent_deploys": {
"config-service": (
"Deploy 2h ago: v3.2.1 β€” updated secrets rotation template. "
"Bug: references previous credential set instead of generating new."
),
"user-service": "Deploy 4h ago: profile endpoint β€” no DB changes",
"notification-service": "No recent deploys",
"postgres-db": "No recent deploys",
"api-gateway": "No recent deploys",
"redis-session": "No recent deploys",
"order-service": "Deploy 1 day ago: email template β€” no DB changes",
"product-service": "No recent deploys",
},
"check_service_status": {
"user-service": "DOWN | DB auth failures",
"notification-service": "DOWN | DB auth failures",
"api-gateway": "DEGRADED | 62%",
"postgres-db": "STRESSED | 490/500 connections (failed auths)",
"config-service": "HEALTHY | Rotation completed",
"redis-session": "HEALTHY",
"order-service": "HEALTHY | Old creds valid",
"product-service": "HEALTHY",
},
},
"remediation_data": {
"rollback_deploy": {
"config-service": "config-service rolled back to v3.2.0 β€” bug removed",
},
"execute_runbook_step": {
"trigger_credential_rotation": (
"Correct credentials generated and pushed to "
"user-service, notification-service, order-service"
),
"verify_db_connectivity": (
"user-service: DB OK | notification-service: DB OK | "
"order-service: DB OK | postgres-db: 45/500 connections"
),
"verify_api_recovery": "api-gateway 503 rate: 0.1% β€” recovered",
},
"restart_service": {
"user-service": "user-service restarted β€” DB auth OK with correct creds",
"notification-service": "notification-service restarted β€” DB auth OK",
"order-service": "order-service restarted β€” using correct credentials",
},
},
"correct_remediation_sequence": [
"rollback_deploy:config-service",
"execute_runbook_step:trigger_credential_rotation",
"restart_service:user-service",
"restart_service:notification-service",
"restart_service:order-service",
"execute_runbook_step:verify_db_connectivity",
"execute_runbook_step:verify_api_recovery",
],
"wrong_actions": {
"restart_service:postgres-db": "DB is healthy β€” problem is credentials",
"scale_service:postgres-db": "connections are failed auths",
"restart_service:api-gateway": "downstream β€” fix auth first",
"rollback_deploy:user-service": "deploy was unrelated",
"rollback_deploy:order-service": "deploy was unrelated",
"clear_cache:redis-session": "healthy",
"restart_service:product-service": "healthy",
"restart_service:redis-session": "healthy",
},
"resolution_keywords": [
"config", "credential", "rotation", "stale", "password",
"authentication", "rollback", "config-service", "v3.2.1",
"restarted", "recovered", "push", "secrets", "template",
],
},
],
}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def get_task(task_id: str) -> dict:
    """Look up the metadata entry stored in ``ALL_TASKS`` for *task_id*.

    The stored dict itself is returned (no copy is made).

    Raises:
        ValueError: if *task_id* is not a key of ``ALL_TASKS``; the message
            lists every valid id.
    """
    # Happy path first: a registered id maps straight to its metadata.
    if task_id in ALL_TASKS:
        return ALL_TASKS[task_id]

    valid_ids = list(ALL_TASKS)
    raise ValueError(f"Unknown task_id '{task_id}'. Valid: {valid_ids}")
def get_scenario(task_id: str, index: int) -> dict:
    """Fetch one scenario of a task by zero-based position.

    Args:
        task_id: key into ``SCENARIOS``.
        index: position within that task's scenario sequence; negative
            values are rejected rather than wrapping around.

    Returns:
        The scenario stored at ``SCENARIOS[task_id][index]``.

    Raises:
        ValueError: if *task_id* has no entry in ``SCENARIOS``, or if
            *index* falls outside ``0 .. len(scenarios) - 1``.
    """
    if task_id not in SCENARIOS:
        raise ValueError(f"No scenarios for task_id '{task_id}'.")

    task_scenarios = SCENARIOS[task_id]
    count = len(task_scenarios)
    last_valid = count - 1

    # Explicit bounds check so callers get a descriptive ValueError instead
    # of an IndexError (or a silently wrapped negative index).
    out_of_range = index < 0 or index >= count
    if out_of_range:
        raise ValueError(
            f"Scenario index {index} out of range for task '{task_id}' "
            f"(valid: 0–{last_valid})"
        )

    return task_scenarios[index]
def list_tasks() -> list:
    """Return a new list holding every metadata value of ``ALL_TASKS``.

    The list is freshly built on each call, in the dict's iteration order;
    its elements are the stored metadata objects themselves, not copies.
    """
    return [*ALL_TASKS.values()]