"""Deterministic incident scenarios for the IRT environment. Each scenario is a self-contained data definition: - Initial alerts visible to the agent - Hidden logs and metrics per service (revealed on INVESTIGATE) - Ground truth for grading (severity, root cause, valid remediations) Scenarios are keyed by task_id for 1-to-1 task↔scenario mapping. """ from __future__ import annotations from dataclasses import dataclass, field from typing import Any, Dict, List from src.models import ( Alert, AlertSeverity, IncidentSeverity, LogEntry, ServiceMetrics, ) @dataclass(frozen=True) class Scenario: scenario_id: str task_id: str incident_id: str description: str # Initial state initial_alerts: List[Alert] available_services: List[str] # Hidden – revealed on INVESTIGATE service_logs: Dict[str, List[LogEntry]] service_metrics: Dict[str, ServiceMetrics] # Ground truth correct_severity: IncidentSeverity correct_root_cause_service: str correct_root_cause_keywords: List[str] # any of these in diagnosis → credit valid_remediation_actions: List[Dict[str, Any]] expected_escalation_teams: List[str] # Params max_steps: int degradation_per_step: float = 0.0 # additional penalty per idle step relevant_services: List[str] = field(default_factory=list) # Blast radius: maps service → metric key → (rate_per_step, cap) that # worsens dynamically as the agent delays. Applied before metrics are # revealed so the agent observes a live, worsening picture. # Format: {"service": {"metric_key": (delta_per_step, max_value)}} blast_radius: Dict[str, Dict[str, tuple]] = field(default_factory=dict) def apply_blast_radius(scenario: Scenario, step: int) -> Dict[str, ServiceMetrics]: """Return a copy of service_metrics with blast-radius degradation applied. Each entry in scenario.blast_radius defines a (delta_per_step, cap) tuple per metric key. The returned dict can be used as revealed metrics so each INVESTIGATE at a higher step number sees a more degraded system. """ if not scenario.blast_radius: return dict(scenario.service_metrics) result: Dict[str, ServiceMetrics] = {} for svc, base_metrics in scenario.service_metrics.items(): blast = scenario.blast_radius.get(svc) if blast is None: result[svc] = base_metrics continue # Build an updated custom dict d = base_metrics.model_dump() custom: Dict[str, float] = dict(d.get("custom") or {}) # Core fields we also allow to degrade degradable_core = { "error_rate", "latency_p50_ms", "latency_p99_ms", "cpu_percent", "memory_percent", "request_rate", } for metric_key, (delta, cap) in blast.items(): if metric_key in degradable_core: old_val = d.get(metric_key, 0.0) new_val = min(cap, old_val + delta * step) if delta > 0 else max(cap, old_val + delta * step) d[metric_key] = round(new_val, 3) else: # Custom metric field old_val = custom.get(metric_key, 0.0) new_val = min(cap, old_val + delta * step) if delta > 0 else max(cap, old_val + delta * step) custom[metric_key] = round(new_val, 3) d["custom"] = custom result[svc] = ServiceMetrics(**d) return result # ---- helpers ---------------------------------------------------------------- def _alert(aid: str, svc: str, sev: AlertSeverity, msg: str, ts: str, **meta: Any) -> Alert: return Alert(alert_id=aid, service=svc, severity=sev, message=msg, timestamp=ts, metadata=meta) def _log(ts: str, svc: str, lvl: str, msg: str, tid: str | None = None) -> LogEntry: return LogEntry(timestamp=ts, service=svc, level=lvl, message=msg, trace_id=tid) def _metrics(svc: str, cpu: float, mem: float, rr: float, er: float, p50: float, p99: float, **custom: float) -> ServiceMetrics: return ServiceMetrics(service=svc, cpu_percent=cpu, memory_percent=mem, request_rate=rr, error_rate=er, latency_p50_ms=p50, latency_p99_ms=p99, custom=custom) # ========================================================================== # SCENARIO 1 – Easy: Database Connection Pool Exhaustion # ========================================================================== _SCENARIO_EASY = Scenario( scenario_id="db-conn-pool-001", task_id="severity_classification", incident_id="INC-20260327-001", description=( "The user-service API is experiencing elevated latency and errors. " "Alerts indicate the PostgreSQL primary database connection pool is " "nearly saturated. Classify the incident severity." ), initial_alerts=[ _alert("ALT-001", "user-service", AlertSeverity.WARNING, "p99 latency exceeded 3000 ms threshold (current: 4200 ms)", "2026-03-27T02:15:00Z"), _alert("ALT-002", "postgres-primary", AlertSeverity.CRITICAL, "Connection pool utilization at 98% (max 200 connections)", "2026-03-27T02:14:30Z"), _alert("ALT-003", "user-service", AlertSeverity.WARNING, "Error rate at 12% over the last 5 minutes", "2026-03-27T02:15:30Z"), ], available_services=["user-service", "postgres-primary", "redis-cache", "api-gateway"], service_logs={ "user-service": [ _log("2026-03-27T02:10:00Z", "user-service", "INFO", "Deployment v2.3.1 completed successfully"), _log("2026-03-27T02:12:00Z", "user-service", "WARN", "DB query took 2800 ms for /api/users/profile"), _log("2026-03-27T02:13:00Z", "user-service", "ERROR", "Connection acquisition timeout after 5000 ms", "trace-a1b2"), _log("2026-03-27T02:13:30Z", "user-service", "ERROR", "Connection acquisition timeout after 5000 ms", "trace-c3d4"), _log("2026-03-27T02:14:00Z", "user-service", "ERROR", "Failed to acquire connection from pool: pool exhausted", "trace-e5f6"), _log("2026-03-27T02:14:30Z", "user-service", "WARN", "Retry #3 for DB connection – backing off 500 ms"), _log("2026-03-27T02:15:00Z", "user-service", "ERROR", "HTTP 503 returned for GET /api/users/profile", "trace-g7h8"), ], "postgres-primary": [ _log("2026-03-27T02:08:00Z", "postgres-primary", "INFO", "Active connections: 120/200"), _log("2026-03-27T02:10:30Z", "postgres-primary", "WARN", "Active connections: 175/200"), _log("2026-03-27T02:12:00Z", "postgres-primary", "WARN", "Active connections: 190/200 – approaching limit"), _log("2026-03-27T02:13:00Z", "postgres-primary", "ERROR", "Active connections: 196/200 – new connection rejected"), _log("2026-03-27T02:14:00Z", "postgres-primary", "ERROR", "Connection count 198/200. Longest idle: 1800 s. Possible connection leak detected."), _log("2026-03-27T02:15:00Z", "postgres-primary", "ERROR", "Active connections: 200/200 – pool fully saturated"), ], "redis-cache": [ _log("2026-03-27T02:15:00Z", "redis-cache", "INFO", "Memory usage: 45%. Operations normal."), _log("2026-03-27T02:15:00Z", "redis-cache", "INFO", "Hit rate: 94%. No evictions."), ], "api-gateway": [ _log("2026-03-27T02:14:00Z", "api-gateway", "WARN", "Upstream user-service returning 503 for 8% of requests"), _log("2026-03-27T02:15:00Z", "api-gateway", "INFO", "All other upstream services healthy"), ], }, service_metrics={ "user-service": _metrics("user-service", 65.0, 58.0, 450.0, 0.12, 320.0, 4200.0), "postgres-primary": _metrics("postgres-primary", 78.0, 72.0, 450.0, 0.05, 45.0, 890.0, connection_pool_pct=98.0), "redis-cache": _metrics("redis-cache", 15.0, 45.0, 1200.0, 0.001, 1.2, 3.5), "api-gateway": _metrics("api-gateway", 22.0, 30.0, 2200.0, 0.08, 85.0, 4500.0), }, correct_severity=IncidentSeverity.P2, correct_root_cause_service="postgres-primary", correct_root_cause_keywords=["connection pool", "connection leak", "pool exhaustion", "pool saturated", "connection exhaustion"], valid_remediation_actions=[ {"action": "restart", "service": "user-service"}, {"action": "config_change", "service": "postgres-primary", "detail": "increase pool size"}, ], expected_escalation_teams=["database-team"], max_steps=10, degradation_per_step=0.005, relevant_services=["user-service", "postgres-primary"], # Blast radius: connection pool fully saturates, user-service error rate climbs blast_radius={ "postgres-primary": { "connection_pool_pct": (0.5, 100.0), # +0.5%/step → caps at 100% }, "user-service": { "error_rate": (0.02, 0.60), # +2pp/step → caps at 60% "latency_p99_ms": (200.0, 10000.0), # +200ms/step → caps at 10s }, }, ) # ========================================================================== # SCENARIO 2 – Medium: Payment Processing Failure # ========================================================================== _SCENARIO_MEDIUM = Scenario( scenario_id="payment-failure-001", task_id="root_cause_analysis", incident_id="INC-20260327-002", description=( "Payment success rate has dropped sharply. Multiple services show " "degradation. Investigate the services, identify the root cause, " "classify severity, and apply the correct remediation." ), initial_alerts=[ _alert("ALT-010", "payment-gateway", AlertSeverity.CRITICAL, "Payment success rate dropped to 45% (threshold: 95%)", "2026-03-27T09:30:00Z"), _alert("ALT-011", "payment-processor", AlertSeverity.WARNING, "Timeout errors increased 10x in last 10 minutes", "2026-03-27T09:30:30Z"), _alert("ALT-012", "redis-session", AlertSeverity.WARNING, "Key eviction rate spike: 1500 evictions/min (normal: <10)", "2026-03-27T09:29:00Z"), _alert("ALT-013", "order-service", AlertSeverity.WARNING, "Error rate elevated to 8%", "2026-03-27T09:31:00Z"), ], available_services=["payment-gateway", "payment-processor", "redis-session", "order-service", "user-service", "postgres-primary"], service_logs={ "payment-gateway": [ _log("2026-03-27T09:25:00Z", "payment-gateway", "INFO", "Processing 320 payments/min"), _log("2026-03-27T09:28:00Z", "payment-gateway", "WARN", "Payment token validation failed: token not found in session store", "trace-pay-01"), _log("2026-03-27T09:28:30Z", "payment-gateway", "ERROR", "Payment failed: session token expired or missing for txn TXN-8842", "trace-pay-02"), _log("2026-03-27T09:29:00Z", "payment-gateway", "ERROR", "Batch failure: 55% of payment attempts failing with SESSION_TOKEN_MISSING"), _log("2026-03-27T09:30:00Z", "payment-gateway", "ERROR", "Success rate critical: 45%. All failures correlate with session token lookup errors."), ], "payment-processor": [ _log("2026-03-27T09:28:00Z", "payment-processor", "WARN", "Upstream payment-gateway sending incomplete requests"), _log("2026-03-27T09:29:00Z", "payment-processor", "ERROR", "Timeout waiting for payment-gateway response: 12 s", "trace-pp-01"), _log("2026-03-27T09:30:00Z", "payment-processor", "WARN", "Retry queue depth: 450 (normal: <20)"), ], "redis-session": [ _log("2026-03-27T09:20:00Z", "redis-session", "INFO", "Memory usage: 95%. Approaching maxmemory limit (4 GB)."), _log("2026-03-27T09:22:00Z", "redis-session", "WARN", "maxmemory reached. Eviction policy: allkeys-lru. Beginning evictions."), _log("2026-03-27T09:25:00Z", "redis-session", "WARN", "Evicted 800 keys in last 3 minutes. Active sessions being evicted."), _log("2026-03-27T09:28:00Z", "redis-session", "ERROR", "Eviction rate critical: 1500 keys/min. Payment session tokens are being evicted before use."), _log("2026-03-27T09:30:00Z", "redis-session", "ERROR", "Memory at 100%. Continuous eviction. Session TTL effectively reduced from 30 min to ~45 s."), ], "order-service": [ _log("2026-03-27T09:30:00Z", "order-service", "WARN", "Downstream payment-gateway returning errors for order confirmations"), _log("2026-03-27T09:31:00Z", "order-service", "ERROR", "8% of orders failing at payment step – propagated from payment-gateway"), ], "user-service": [ _log("2026-03-27T09:30:00Z", "user-service", "INFO", "All endpoints healthy. Latency normal."), ], "postgres-primary": [ _log("2026-03-27T09:30:00Z", "postgres-primary", "INFO", "Active connections: 85/200. Operations normal."), ], }, service_metrics={ "payment-gateway": _metrics("payment-gateway", 45.0, 52.0, 320.0, 0.55, 250.0, 12000.0, payment_success_rate=0.45), "payment-processor": _metrics("payment-processor", 35.0, 40.0, 150.0, 0.30, 180.0, 8000.0), "redis-session": _metrics("redis-session", 30.0, 99.5, 5000.0, 0.02, 0.8, 2.5, memory_used_gb=3.98, evictions_per_min=1500.0), "order-service": _metrics("order-service", 28.0, 35.0, 200.0, 0.08, 120.0, 950.0), "user-service": _metrics("user-service", 20.0, 32.0, 400.0, 0.002, 45.0, 120.0), "postgres-primary": _metrics("postgres-primary", 40.0, 55.0, 300.0, 0.001, 12.0, 45.0), }, correct_severity=IncidentSeverity.P1, correct_root_cause_service="redis-session", correct_root_cause_keywords=[ "redis", "memory", "eviction", "session token", "maxmemory", "session eviction", "cache eviction", "redis memory", ], valid_remediation_actions=[ {"action": "scale", "service": "redis-session"}, {"action": "config_change", "service": "redis-session", "detail": "increase maxmemory"}, {"action": "restart", "service": "redis-session"}, ], expected_escalation_teams=["payments-team", "platform-team"], max_steps=15, degradation_per_step=0.01, relevant_services=["payment-gateway", "redis-session", "payment-processor"], # Blast radius: Redis keeps evicting, payment success rate collapses blast_radius={ "redis-session": { "evictions_per_min": (150.0, 5000.0), # +150 evictions/min/step "memory_used_gb": (0.005, 4.0), # creeps toward hard limit }, "payment-gateway": { "payment_success_rate": (-0.04, 0.05), # drops 4pp/step → 5% floor "error_rate": (0.03, 0.90), }, "order-service": { "error_rate": (0.02, 0.50), }, }, ) # ========================================================================== # SCENARIO 3 – Hard: Cascading Multi-Service Outage # ========================================================================== _SCENARIO_HARD = Scenario( scenario_id="cascading-outage-001", task_id="full_incident_management", incident_id="INC-20260327-003", description=( "Multiple services are degraded simultaneously. The API gateway is " "returning 503s, the auth service has extreme latency, and downstream " "services are failing. This is a cascading outage. You must triage, " "investigate, identify the root cause, remediate, escalate, and " "communicate status updates." ), initial_alerts=[ _alert("ALT-100", "api-gateway", AlertSeverity.CRITICAL, "503 error rate at 35% across all endpoints", "2026-03-27T14:00:00Z"), _alert("ALT-101", "auth-service", AlertSeverity.CRITICAL, "p99 latency > 5000 ms (threshold: 200 ms)", "2026-03-27T14:00:30Z"), _alert("ALT-102", "order-service", AlertSeverity.WARNING, "Message queue depth growing: 15000 (normal: <500)", "2026-03-27T14:01:00Z"), _alert("ALT-103", "notification-service", AlertSeverity.WARNING, "Connection timeout to auth-service: 100% failure rate", "2026-03-27T14:01:30Z"), _alert("ALT-104", "cdn-static", AlertSeverity.INFO, "Cache miss rate elevated to 15% (normal: 2%)", "2026-03-27T14:02:00Z"), _alert("ALT-105", "user-service", AlertSeverity.WARNING, "Intermittent HTTP 401 responses (token validation failing)", "2026-03-27T14:01:00Z"), _alert("ALT-106", "deployment-tracker", AlertSeverity.CRITICAL, "auth-service v3.1.0 deployed at 13:47 — memory climb started immediately. Escalate to auth-team and platform-team.", "2026-03-27T14:02:00Z"), ], available_services=[ "api-gateway", "auth-service", "user-service", "order-service", "notification-service", "cdn-static", "postgres-primary", "redis-auth-cache", ], service_logs={ "api-gateway": [ _log("2026-03-27T13:58:00Z", "api-gateway", "INFO", "All upstreams healthy. Traffic: 5500 req/s."), _log("2026-03-27T14:00:00Z", "api-gateway", "ERROR", "Upstream auth-service: 503 for 35% of auth checks"), _log("2026-03-27T14:00:30Z", "api-gateway", "ERROR", "Circuit breaker OPEN for auth-service after 50 consecutive failures"), _log("2026-03-27T14:01:00Z", "api-gateway", "ERROR", "Cascading: requests requiring auth are failing. Public endpoints OK."), ], "auth-service": [ _log("2026-03-27T13:45:00Z", "auth-service", "INFO", "Deployment v3.1.0 started (canary 10%)"), _log("2026-03-27T13:47:00Z", "auth-service", "INFO", "Deployment v3.1.0 promoted to 100%"), _log("2026-03-27T13:50:00Z", "auth-service", "WARN", "Memory usage climbing: 72% (was 45% before deploy)"), _log("2026-03-27T13:55:00Z", "auth-service", "WARN", "Memory usage: 88%. GC pauses increasing: avg 350 ms"), _log("2026-03-27T13:58:00Z", "auth-service", "ERROR", "Memory usage: 95%. GC pause: 2100 ms. Requests timing out."), _log("2026-03-27T14:00:00Z", "auth-service", "ERROR", "OOMKill risk. Memory: 97%. Token validation taking 4800 ms avg."), _log("2026-03-27T14:00:30Z", "auth-service", "ERROR", "v3.1.0 changelog: 'Refactored token cache to in-memory store' – possible unbounded cache growth"), _log("2026-03-27T14:01:00Z", "auth-service", "ERROR", "Pod restarts: 3 in last 5 min due to OOMKill. Service effectively down."), ], "user-service": [ _log("2026-03-27T14:00:00Z", "user-service", "WARN", "Auth token validation calls timing out"), _log("2026-03-27T14:01:00Z", "user-service", "ERROR", "Returning 401 for 40% of requests – cannot validate tokens with auth-service"), ], "order-service": [ _log("2026-03-27T14:00:00Z", "order-service", "WARN", "Order processing slowing – auth dependency failing"), _log("2026-03-27T14:01:00Z", "order-service", "ERROR", "Queue depth: 15000. Orders stuck awaiting auth validation."), _log("2026-03-27T14:02:00Z", "order-service", "ERROR", "Queue depth: 25000. Risk of message broker disk overflow."), ], "notification-service": [ _log("2026-03-27T14:01:00Z", "notification-service", "ERROR", "Cannot reach auth-service. All notification deliveries paused."), _log("2026-03-27T14:02:00Z", "notification-service", "WARN", "Buffered 8000 pending notifications."), ], "cdn-static": [ _log("2026-03-27T14:00:00Z", "cdn-static", "INFO", "Cache miss rate elevated. Likely due to increased full page reloads from client-side auth failures."), _log("2026-03-27T14:02:00Z", "cdn-static", "INFO", "No CDN-side issues detected. Origin healthy."), ], "postgres-primary": [ _log("2026-03-27T14:00:00Z", "postgres-primary", "INFO", "Connections: 90/200. Query performance normal."), ], "redis-auth-cache": [ _log("2026-03-27T14:00:00Z", "redis-auth-cache", "INFO", "Memory: 30%. Operations normal."), _log("2026-03-27T14:00:30Z", "redis-auth-cache", "WARN", "Cache hit rate dropped from 92% to 15%. auth-service v3.1.0 appears to bypass cache."), ], }, service_metrics={ "api-gateway": _metrics("api-gateway", 55.0, 40.0, 5500.0, 0.35, 150.0, 8500.0), "auth-service": _metrics("auth-service", 95.0, 97.0, 800.0, 0.65, 2500.0, 5200.0, gc_pause_ms=2100.0, pod_restarts=3.0), "user-service": _metrics("user-service", 30.0, 35.0, 400.0, 0.40, 80.0, 4800.0), "order-service": _metrics("order-service", 40.0, 45.0, 200.0, 0.25, 300.0, 3500.0, queue_depth=15000.0), "notification-service": _metrics("notification-service", 10.0, 20.0, 0.0, 1.0, 0.0, 0.0), "cdn-static": _metrics("cdn-static", 12.0, 18.0, 8000.0, 0.001, 8.0, 25.0, cache_miss_rate=0.15), "postgres-primary": _metrics("postgres-primary", 38.0, 52.0, 250.0, 0.001, 10.0, 40.0), "redis-auth-cache": _metrics("redis-auth-cache", 12.0, 30.0, 2000.0, 0.005, 0.5, 1.8, cache_hit_rate=0.15), }, correct_severity=IncidentSeverity.P1, correct_root_cause_service="auth-service", correct_root_cause_keywords=[ "memory leak", "v3.1.0", "deployment", "oom", "unbounded cache", "in-memory", "bad deployment", "auth-service deployment", "token cache", "gc pause", "out of memory", ], valid_remediation_actions=[ {"action": "rollback", "service": "auth-service"}, {"action": "restart", "service": "auth-service"}, {"action": "scale", "service": "order-service"}, {"action": "restart", "service": "order-service"}, ], expected_escalation_teams=["platform-team", "auth-team"], max_steps=20, degradation_per_step=0.015, relevant_services=["auth-service", "api-gateway", "redis-auth-cache", "order-service"], # Blast radius: auth-service OOMKills more often, order queue grows unbounded blast_radius={ "auth-service": { "memory_percent": (0.5, 100.0), # +0.5%/step → OOM at 100% "error_rate": (0.02, 0.95), # cascades toward full outage "latency_p99_ms": (100.0, 15000.0), "pod_restarts": (0.3, 15.0), # accumulating restarts }, "order-service": { "queue_depth": (1500.0, 100000.0), # queue grows 1500/step "error_rate": (0.02, 0.80), }, "api-gateway": { "error_rate": (0.015, 0.70), # more requests fail over time }, "user-service": { "error_rate": (0.02, 0.80), }, }, ) # ========================================================================== # SCENARIO 1-B – Easy variant: Disk space exhaustion on log volume # ========================================================================== _SCENARIO_EASY_B = Scenario( scenario_id="disk-full-001", task_id="severity_classification", incident_id="INC-20260327-101", description=( "The search-service and its underlying Elasticsearch cluster are " "experiencing errors. Alerts indicate disk usage is critically high. " "Classify the incident severity." ), initial_alerts=[ _alert("ALT-201", "elasticsearch", AlertSeverity.CRITICAL, "Disk usage at 95% on data node es-node-01", "2026-03-27T06:10:00Z"), _alert("ALT-202", "search-service", AlertSeverity.WARNING, "Bulk indexing failures: 400% increase", "2026-03-27T06:10:30Z"), _alert("ALT-203", "elasticsearch", AlertSeverity.WARNING, "write.low_watermark crossed – shard allocation blocked", "2026-03-27T06:09:00Z"), ], available_services=["search-service", "elasticsearch", "kibana", "log-aggregator"], service_logs={ "search-service": [ _log("2026-03-27T06:08:00Z", "search-service", "WARN", "Indexing queue backing up: 12000 documents pending"), _log("2026-03-27T06:09:00Z", "search-service", "ERROR", "BulkIndexException: ClusterBlockException[blocked: FORBIDDEN/12/index]"), _log("2026-03-27T06:10:00Z", "search-service", "ERROR", "Search degraded – last index refresh 8 min ago. Serving stale results."), ], "elasticsearch": [ _log("2026-03-27T06:05:00Z", "elasticsearch", "WARN", "Disk usage: 90% on es-node-01. Threshold: 85%."), _log("2026-03-27T06:07:00Z", "elasticsearch", "WARN", "Disk: 93%. flood_stage watermark approaching."), _log("2026-03-27T06:09:00Z", "elasticsearch", "ERROR", "Disk: 95%. flood_stage reached. All indices set to read-only."), _log("2026-03-27T06:10:00Z", "elasticsearch", "ERROR", "Shard allocation disabled. Cluster status: YELLOW. Write ops blocked."), ], "kibana": [ _log("2026-03-27T06:10:00Z", "kibana", "INFO", "Dashboard loading normally. Read-only ops unaffected."), ], "log-aggregator": [ _log("2026-03-27T06:09:00Z", "log-aggregator", "WARN", "Log shipping to elasticsearch failing. Retrying. Buffer: 50000 lines."), ], }, service_metrics={ "search-service": _metrics("search-service", 42.0, 50.0, 200.0, 0.35, 180.0, 2200.0), "elasticsearch": _metrics("elasticsearch", 60.0, 80.0, 50.0, 0.40, 200.0, 5000.0, disk_pct=95.0), "kibana": _metrics("kibana", 15.0, 25.0, 30.0, 0.0, 90.0, 350.0), "log-aggregator": _metrics("log-aggregator", 25.0, 35.0, 300.0, 0.15, 50.0, 400.0), }, correct_severity=IncidentSeverity.P2, correct_root_cause_service="elasticsearch", correct_root_cause_keywords=["disk", "disk full", "disk space", "flood_stage", "watermark", "read-only", "disk usage"], valid_remediation_actions=[ {"action": "config_change", "service": "elasticsearch", "detail": "clear read-only flag"}, {"action": "scale", "service": "elasticsearch"}, ], expected_escalation_teams=["infrastructure-team"], max_steps=10, degradation_per_step=0.005, relevant_services=["search-service", "elasticsearch"], ) # ========================================================================== # SCENARIO 2-B – Medium variant: Slow memory leak in background worker # ========================================================================== _SCENARIO_MEDIUM_B = Scenario( scenario_id="worker-memleak-001", task_id="root_cause_analysis", incident_id="INC-20260327-102", description=( "The report-generation service is timing out and users cannot export " "data. Multiple related services show elevated errors. Find the true " "root cause, classify severity, diagnose, and remediate." ), initial_alerts=[ _alert("ALT-210", "report-service", AlertSeverity.CRITICAL, "Request timeout rate 60% for /api/export", "2026-03-27T11:20:00Z"), _alert("ALT-211", "worker-pool", AlertSeverity.WARNING, "Worker memory usage: 94% (4 of 5 workers OOMKilling)", "2026-03-27T11:19:00Z"), _alert("ALT-212", "s3-upload", AlertSeverity.WARNING, "Upload failures – 503s from report-service", "2026-03-27T11:20:30Z"), _alert("ALT-213", "postgres-reports", AlertSeverity.INFO, "Long-running queries detected: 5 queries > 10 s", "2026-03-27T11:18:00Z"), _alert("ALT-214", "health-monitor", AlertSeverity.INFO, "Core services healthy: payment, auth, user-api all nominal. Issue isolated to report-export subsystem.", "2026-03-27T11:20:00Z"), ], available_services=["report-service", "worker-pool", "s3-upload", "postgres-reports", "redis-cache", "api-gateway"], service_logs={ "report-service": [ _log("2026-03-27T11:15:00Z", "report-service", "INFO", "Report job queued: RPT-9981, format: xlsx, rows: 1M"), _log("2026-03-27T11:16:00Z", "report-service", "WARN", "Worker RPT-9981 memory: 2.1 GB (limit 2 GB). Nearing OOM."), _log("2026-03-27T11:18:00Z", "report-service", "ERROR", "Worker OOMKilled during xlsx serialization. Job failed."), _log("2026-03-27T11:19:00Z", "report-service", "ERROR", "3 concurrent OOMKills. Export endpoint returning 503."), ], "worker-pool": [ _log("2026-03-27T11:10:00Z", "worker-pool", "INFO", "Workers: 5 active, 0 idle. Load: nominal."), _log("2026-03-27T11:14:00Z", "worker-pool", "WARN", "Worker memory climbing. Suspected unbounded row accumulation in xlsx writer."), _log("2026-03-27T11:17:00Z", "worker-pool", "ERROR", "Worker #3 OOMKilled. Memory at 100%."), _log("2026-03-27T11:19:00Z", "worker-pool", "ERROR", "4/5 workers OOMKilled. Effective worker capacity: 1. Queue depth: 45."), _log("2026-03-27T11:19:30Z", "worker-pool", "ERROR", "Root cause: xlsx writer buffers all rows in memory before flushing. No streaming."), ], "s3-upload": [ _log("2026-03-27T11:20:00Z", "s3-upload", "WARN", "Upstream report-service returning 503. S3 uploads queued."), ], "postgres-reports": [ _log("2026-03-27T11:17:00Z", "postgres-reports", "INFO", "Large sequential scan for 1M row export. Query time: 12 s. This is normal for large exports."), ], "redis-cache": [_log("2026-03-27T11:20:00Z", "redis-cache", "INFO", "Operations normal.")], "api-gateway": [_log("2026-03-27T11:20:00Z", "api-gateway", "WARN", "report-service upstream: 60% 503 errors.")], }, service_metrics={ "report-service": _metrics("report-service", 55.0, 75.0, 10.0, 0.60, 8000.0, 30000.0), "worker-pool": _metrics("worker-pool", 90.0, 94.0, 5.0, 0.80, 15000.0, 60000.0, oom_kills=4.0), "s3-upload": _metrics("s3-upload", 10.0, 15.0, 2.0, 0.60, 500.0, 3000.0), "postgres-reports": _metrics("postgres-reports", 55.0, 60.0, 15.0, 0.0, 200.0, 12000.0), "redis-cache": _metrics("redis-cache", 12.0, 30.0, 500.0, 0.0, 1.0, 3.0), "api-gateway": _metrics("api-gateway", 20.0, 28.0, 800.0, 0.08, 80.0, 2000.0), }, correct_severity=IncidentSeverity.P2, correct_root_cause_service="worker-pool", correct_root_cause_keywords=["memory", "oom", "out of memory", "xlsx", "buffering", "unbounded", "memory leak", "worker memory", "worker", "oomkill", "streaming", "row accumulation"], # Note: P2 not P1 — only the report-export subsystem is affected, core services healthy. valid_remediation_actions=[ {"action": "restart", "service": "worker-pool"}, {"action": "scale", "service": "worker-pool"}, {"action": "config_change", "service": "worker-pool", "detail": "enable streaming"}, ], expected_escalation_teams=["backend-team", "platform-team"], max_steps=15, degradation_per_step=0.008, relevant_services=["report-service", "worker-pool"], ) # ========================================================================== # SCENARIO 3-B – Hard variant: Kubernetes node pressure / pod eviction cascade # ========================================================================== _SCENARIO_HARD_B = Scenario( scenario_id="k8s-node-pressure-001", task_id="full_incident_management", incident_id="INC-20260327-004", description=( "Multiple pods are being evicted across the cluster. The checkout " "service is returning 502s, node-exporter reports memory pressure on " "three nodes, and the HPA has been scaling aggressively. This is a " "node-level resource exhaustion event triggered by an HPA/resource-limit " "misconfiguration. Full incident management required." ), initial_alerts=[ _alert("ALT-200", "checkout-service", AlertSeverity.CRITICAL, "502 error rate 28% across checkout endpoints", "2026-03-27T16:00:00Z"), _alert("ALT-201", "k8s-node-01", AlertSeverity.CRITICAL, "MemoryPressure=True — 3/8 pods evicted in last 5 min", "2026-03-27T16:00:30Z"), _alert("ALT-202", "k8s-node-02", AlertSeverity.WARNING, "MemoryPressure=True — node at 92% memory", "2026-03-27T16:01:00Z"), _alert("ALT-203", "hpa-controller", AlertSeverity.WARNING, "HPA for recommendation-service scaled to maxReplicas=20 (was 4)", "2026-03-27T15:55:00Z"), _alert("ALT-204", "cart-service", AlertSeverity.WARNING, "Downstream checkout-service returning 502s for 35% of cart completions", "2026-03-27T16:01:30Z"), _alert("ALT-205", "cdn-static", AlertSeverity.INFO, "Slight latency increase: p99 68ms (normal: 20ms)", "2026-03-27T16:02:00Z"), ], available_services=[ "checkout-service", "k8s-node-01", "k8s-node-02", "recommendation-service", "cart-service", "hpa-controller", "cdn-static", "postgres-checkout", ], service_logs={ "checkout-service": [ _log("2026-03-27T15:58:00Z", "checkout-service", "INFO", "Processing normally. 180 req/s."), _log("2026-03-27T15:59:30Z", "checkout-service", "WARN", "3 pods restarting. Connections dropped."), _log("2026-03-27T16:00:00Z", "checkout-service", "ERROR", "502 Bad Gateway — upstream recommendation-service pods unavailable"), _log("2026-03-27T16:01:00Z", "checkout-service", "ERROR", "Circuit breaker half-open. 28% of requests failing."), ], "k8s-node-01": [ _log("2026-03-27T15:50:00Z", "k8s-node-01", "INFO", "Memory: 78%."), _log("2026-03-27T15:53:00Z", "k8s-node-01", "WARN", "Memory: 88%. kubelet setting eviction threshold."), _log("2026-03-27T15:56:00Z", "k8s-node-01", "ERROR", "Memory: 95%. OOM eviction beginning. Evicting low-priority pods."), _log("2026-03-27T15:58:00Z", "k8s-node-01", "ERROR", "Evicted: recommendation-service-7d8f (2 GB). Memory: 91%."), _log("2026-03-27T16:00:00Z", "k8s-node-01", "ERROR", "Memory back to 95%. HPA-spawned recommendation-service pods consuming all available memory."), ], "k8s-node-02": [ _log("2026-03-27T15:58:00Z", "k8s-node-02", "WARN", "Memory: 90%. recommendation-service HPA placed 6 new pods here."), _log("2026-03-27T16:00:30Z", "k8s-node-02", "ERROR", "Memory: 92%. Approaching eviction threshold."), ], "recommendation-service": [ _log("2026-03-27T15:45:00Z", "recommendation-service", "INFO", "Memory usage tracking: v2.4.0 deployed. ML model loaded."), _log("2026-03-27T15:50:00Z", "recommendation-service", "WARN", "Each pod consuming 2.1 GB (limit: 2.0 GB) — requests.memory too low."), _log("2026-03-27T15:53:00Z", "recommendation-service", "WARN", "HPA triggered: latency spike caused scale-out. 8→12 pods"), _log("2026-03-27T15:57:00Z", "recommendation-service", "ERROR", "HPA at maxReplicas=20. 20 pods × 2.1 GB = 42 GB on nodes with 32 GB capacity."), _log("2026-03-27T16:00:00Z", "recommendation-service", "ERROR", "Pod eviction loop: evicted pods restart, consume memory, trigger eviction again."), ], "hpa-controller": [ _log("2026-03-27T15:52:00Z", "hpa-controller", "INFO", "recommendation-service: scaling 4→8 due to latency"), _log("2026-03-27T15:55:00Z", "hpa-controller", "WARN", "recommendation-service: scaling 8→20 (maxReplicas). Memory requests underspecified."), _log("2026-03-27T16:00:00Z", "hpa-controller", "ERROR", "Eviction loop detected. Scaling is worsening node pressure."), ], "cart-service": [ _log("2026-03-27T16:01:00Z", "cart-service", "WARN", "Checkout dependency failing. 35% cart completions blocked."), ], "cdn-static": [ _log("2026-03-27T16:02:00Z", "cdn-static", "INFO", "Slight latency increase correlates with client retries. No CDN-side issue."), ], "postgres-checkout": [ _log("2026-03-27T16:00:00Z", "postgres-checkout", "INFO", "All queries normal. Connections: 45/200."), ], }, service_metrics={ "checkout-service": _metrics("checkout-service", 55.0, 60.0, 180.0, 0.28, 200.0, 5500.0), "k8s-node-01": _metrics("k8s-node-01", 70.0, 95.0, 0.0, 0.0, 0.0, 0.0, evicted_pods=3.0), "k8s-node-02": _metrics("k8s-node-02", 65.0, 92.0, 0.0, 0.0, 0.0, 0.0), "recommendation-service": _metrics("recommendation-service", 85.0, 105.0, 80.0, 0.60, 800.0, 12000.0, memory_per_pod_gb=2.1, pod_count=20.0), "cart-service": _metrics("cart-service", 30.0, 35.0, 250.0, 0.15, 90.0, 2200.0), "hpa-controller": _metrics("hpa-controller", 10.0, 15.0, 0.0, 0.0, 0.0, 0.0, current_replicas=20.0), "cdn-static": _metrics("cdn-static", 10.0, 12.0, 9000.0, 0.001, 12.0, 68.0), "postgres-checkout": _metrics("postgres-checkout", 35.0, 48.0, 200.0, 0.001, 12.0, 38.0), }, correct_severity=IncidentSeverity.P1, correct_root_cause_service="recommendation-service", correct_root_cause_keywords=[ "memory request", "resource limit", "hpa", "eviction loop", "pod eviction", "memory limit", "recommendation-service memory", "node pressure", "oom eviction", "hpa scale", "memory requests underspecified", ], valid_remediation_actions=[ {"action": "config_change", "service": "recommendation-service"}, {"action": "scale", "service": "recommendation-service"}, {"action": "restart", "service": "recommendation-service"}, {"action": "config_change", "service": "hpa-controller"}, ], expected_escalation_teams=["platform-team", "sre-team"], max_steps=20, degradation_per_step=0.015, relevant_services=["recommendation-service", "k8s-node-01", "hpa-controller", "checkout-service"], blast_radius={ "recommendation-service": { "error_rate": (0.03, 0.95), "pod_count": (0.5, 20.0), }, "k8s-node-01": { "memory_percent": (0.4, 100.0), "evicted_pods": (0.4, 20.0), }, "k8s-node-02": { "memory_percent": (0.5, 100.0), }, "checkout-service": { "error_rate": (0.025, 0.85), }, }, ) # ========================================================================== # SCENARIO 3-C – Hard variant: Database failover split-brain # ========================================================================== _SCENARIO_HARD_C = Scenario( scenario_id="db-failover-race-001", task_id="full_incident_management", incident_id="INC-20260327-005", description=( "The primary PostgreSQL instance failed over to the replica 18 minutes " "ago but several services still route writes to the old primary (now " "read-only) because pgbouncer's connection string was never updated. " "A split-brain scenario is actively corrupting order state. Full " "incident commander workflow required: triage, diagnose, remediate, " "escalate, communicate." ), initial_alerts=[ _alert("ALT-300", "order-service", AlertSeverity.CRITICAL, "Write failures: 65% of order commits failing with ReadOnlyError", "2026-03-27T18:10:00Z"), _alert("ALT-301", "postgres-primary-old", AlertSeverity.CRITICAL, "Instance is READ-ONLY (promoted replica took writes 18 min ago)", "2026-03-27T18:10:30Z"), _alert("ALT-302", "postgres-replica-new", AlertSeverity.WARNING, "Becoming primary: only 30% of expected write traffic received", "2026-03-27T18:11:00Z"), _alert("ALT-303", "payment-service", AlertSeverity.WARNING, "Double-charge risk: orders appearing in both DB instances for 8% of txns", "2026-03-27T18:11:30Z"), _alert("ALT-304", "inventory-service", AlertSeverity.WARNING, "Stock deduction failing silently: items over-sold", "2026-03-27T18:12:00Z"), _alert("ALT-305", "monitoring-dashboard", AlertSeverity.INFO, "DB failover event recorded at 2026-03-27T17:52:00Z", "2026-03-27T18:12:30Z"), _alert("ALT-306", "pgbouncer", AlertSeverity.CRITICAL, "pgbouncer still routing ALL writes to postgres-primary-old (read-only). Connection string not updated after failover.", "2026-03-27T18:13:00Z"), ], available_services=[ "order-service", "postgres-primary-old", "postgres-replica-new", "payment-service", "inventory-service", "config-service", "monitoring-dashboard", "pgbouncer", ], service_logs={ "order-service": [ _log("2026-03-27T17:52:00Z", "order-service", "WARN", "DB failover detected. Using cached connection string."), _log("2026-03-27T17:55:00Z", "order-service", "ERROR", "INSERT failed: ERROR: cannot execute INSERT in a read-only transaction"), _log("2026-03-27T18:00:00Z", "order-service", "ERROR", "65% of order writes failing. Service still pointing to old primary."), _log("2026-03-27T18:10:00Z", "order-service", "ERROR", "Connection pool: all connections to postgres-primary-old. Failover not propagated."), ], "postgres-primary-old": [ _log("2026-03-27T17:52:00Z", "postgres-primary-old", "WARN", "Promotion event: replica assumed primary role. This instance now read-only."), _log("2026-03-27T18:05:00Z", "postgres-primary-old", "ERROR", "Receiving 1800 write attempts/min from services — all rejected (read-only)."), _log("2026-03-27T18:10:00Z", "postgres-primary-old", "ERROR", "Active connections: 198/200. Service retry loops filling pool."), ], "postgres-replica-new": [ _log("2026-03-27T17:52:00Z", "postgres-replica-new", "INFO", "Promoted to primary. Accepting writes."), _log("2026-03-27T18:05:00Z", "postgres-replica-new", "WARN", "Only 30% of expected write traffic received. Split-brain suspected."), _log("2026-03-27T18:10:00Z", "postgres-replica-new", "WARN", "Diverging from old primary: 1240 transactions only in new primary."), ], "payment-service": [ _log("2026-03-27T18:05:00Z", "payment-service", "ERROR", "Idempotency check failing: order state inconsistent between DB instances"), _log("2026-03-27T18:10:00Z", "payment-service", "ERROR", "8% txn double-charge risk. Halting charge processing for affected orders."), ], "inventory-service": [ _log("2026-03-27T18:05:00Z", "inventory-service", "ERROR", "Stock deduction writes going to old primary (read-only) — silently lost."), _log("2026-03-27T18:10:00Z", "inventory-service", "ERROR", "Oversold items: 340 SKUs with negative virtual stock. Revenue impact growing."), ], "config-service": [ _log("2026-03-27T17:52:00Z", "config-service", "INFO", "DB failover event received. Updated DB_PRIMARY_HOST in config store."), _log("2026-03-27T17:52:30Z", "config-service", "WARN", "Config propagation: order-service and payment-service did NOT acknowledge new config."), _log("2026-03-27T17:55:00Z", "config-service", "ERROR", "Config ack missing for 4/8 services. Manual pgbouncer reload required."), ], "pgbouncer": [ _log("2026-03-27T17:52:00Z", "pgbouncer", "WARN", "Failover detected. pgbouncer config NOT auto-updated (static connection string)."), _log("2026-03-27T18:10:00Z", "pgbouncer", "ERROR", "Routing 100% of writes to postgres-primary-old (read-only). Update target_db required immediately."), ], "monitoring-dashboard": [ _log("2026-03-27T17:52:00Z", "monitoring-dashboard", "INFO", "Auto-failover triggered at 17:52:00Z by health check failure on primary."), _log("2026-03-27T18:12:00Z", "monitoring-dashboard", "INFO", "Split-brain duration: 18 min. Financial impact estimate: $42,000 in at-risk transactions."), ], }, service_metrics={ "order-service": _metrics("order-service", 55.0, 60.0, 800.0, 0.65, 300.0, 8000.0, write_failure_rate=0.65), "postgres-primary-old": _metrics("postgres-primary-old", 80.0, 70.0, 1800.0, 1.0, 5.0, 50.0, is_read_only=1.0, connection_pct=99.0), "postgres-replica-new": _metrics("postgres-replica-new", 30.0, 45.0, 600.0, 0.0, 8.0, 30.0, write_pct_expected=0.30), "payment-service": _metrics("payment-service", 40.0, 45.0, 200.0, 0.25, 180.0, 3500.0, double_charge_risk_pct=0.08), "inventory-service": _metrics("inventory-service", 35.0, 40.0, 300.0, 0.30, 120.0, 2500.0, oversold_skus=340.0), "config-service": _metrics("config-service", 15.0, 20.0, 50.0, 0.10, 30.0, 200.0), "monitoring-dashboard": _metrics("monitoring-dashboard", 10.0, 15.0, 100.0, 0.0, 50.0, 150.0), "pgbouncer": _metrics("pgbouncer", 25.0, 30.0, 2000.0, 0.65, 2.0, 8.0, routing_to_old_primary=1.0), }, correct_severity=IncidentSeverity.P1, correct_root_cause_service="pgbouncer", correct_root_cause_keywords=[ "pgbouncer", "connection string", "split-brain", "failover", "read-only", "config not propagated", "stale connection", "db routing", "pgbouncer config", "connection pool routing", "failover not propagated", ], valid_remediation_actions=[ {"action": "config_change", "service": "pgbouncer"}, {"action": "restart", "service": "order-service"}, {"action": "config_change", "service": "order-service"}, {"action": "restart", "service": "payment-service"}, ], expected_escalation_teams=["database-team", "platform-team"], max_steps=20, degradation_per_step=0.02, relevant_services=["pgbouncer", "postgres-primary-old", "postgres-replica-new", "order-service"], blast_radius={ "order-service": { "write_failure_rate": (0.02, 1.0), "error_rate": (0.02, 0.95), }, "inventory-service": { "oversold_skus": (25.0, 5000.0), "error_rate": (0.02, 0.80), }, "payment-service": { "double_charge_risk_pct": (0.005, 0.30), "error_rate": (0.02, 0.60), }, "postgres-primary-old": { "connection_pct": (0.2, 100.0), }, }, ) # ========================================================================== # SCENARIO 1-C – Easy variant: DNS resolution failure # ========================================================================== _SCENARIO_EASY_C = Scenario( scenario_id="dns-fail-001", task_id="severity_classification", incident_id="INC-20260327-201", description=( "Multiple microservices are reporting connection timeouts to downstream " "dependencies. Alerts indicate DNS resolution failures across the " "internal service mesh. Classify the incident severity." ), initial_alerts=[ _alert("ALT-301", "api-gateway", AlertSeverity.CRITICAL, "Upstream connection timeout rate 40% to backend services", "2026-03-27T14:00:00Z"), _alert("ALT-302", "coredns", AlertSeverity.CRITICAL, "DNS query failure rate 65% — SERVFAIL responses", "2026-03-27T13:58:00Z"), _alert("ALT-303", "notification-service", AlertSeverity.WARNING, "Failed to resolve smtp-relay.internal: NXDOMAIN", "2026-03-27T14:01:00Z"), ], available_services=["api-gateway", "coredns", "notification-service", "istio-proxy"], service_logs={ "api-gateway": [ _log("2026-03-27T13:58:00Z", "api-gateway", "ERROR", "upstream connect error: dns_resolution_failure for user-service.default.svc.cluster.local"), _log("2026-03-27T13:59:00Z", "api-gateway", "ERROR", "circuit breaker tripped: 5/10 upstream failures in 30s. Returning 503."), _log("2026-03-27T14:00:00Z", "api-gateway", "WARN", "Retry budget exhausted for payment-service. DNS not resolving."), ], "coredns": [ _log("2026-03-27T13:55:00Z", "coredns", "WARN", "Cache miss rate increasing: 80%. Upstream forwarder slow."), _log("2026-03-27T13:57:00Z", "coredns", "ERROR", "OOMKilled: coredns-7d8f9b pod restarted. Memory limit 128Mi exceeded."), _log("2026-03-27T13:58:00Z", "coredns", "ERROR", "SERVFAIL for *.default.svc.cluster.local — upstream timeout after 5s"), _log("2026-03-27T14:00:00Z", "coredns", "ERROR", "Pod restart count: 4 in last 10 minutes. CrashLoopBackOff."), ], "notification-service": [ _log("2026-03-27T14:00:00Z", "notification-service", "WARN", "Email delivery failing: cannot resolve smtp-relay.internal"), ], "istio-proxy": [ _log("2026-03-27T14:00:00Z", "istio-proxy", "INFO", "Sidecar healthy. mTLS handshake OK. Issue is upstream DNS, not mesh."), ], }, service_metrics={ "api-gateway": _metrics("api-gateway", 25.0, 40.0, 1200.0, 0.40, 800.0, 5000.0), "coredns": _metrics("coredns", 95.0, 98.0, 5000.0, 0.65, 50.0, 5000.0, restart_count=4.0, cache_miss_pct=80.0), "notification-service": _metrics("notification-service", 10.0, 20.0, 50.0, 0.80, 200.0, 3000.0), "istio-proxy": _metrics("istio-proxy", 5.0, 10.0, 1200.0, 0.01, 2.0, 10.0), }, correct_severity=IncidentSeverity.P1, correct_root_cause_service="coredns", correct_root_cause_keywords=["dns", "coredns", "OOM", "memory", "resolution", "SERVFAIL", "CrashLoop"], valid_remediation_actions=[ {"action": "restart", "service": "coredns"}, {"action": "scale", "service": "coredns"}, {"action": "config_change", "service": "coredns", "detail": "increase memory limit"}, ], expected_escalation_teams=["platform-team"], max_steps=10, degradation_per_step=0.008, relevant_services=["api-gateway", "coredns"], blast_radius={ "coredns": { "error_rate": (0.03, 0.95), "restart_count": (1.0, 15.0), }, "api-gateway": { "error_rate": (0.03, 0.80), "latency_p99_ms": (500.0, 15000.0), }, }, ) # ========================================================================== # SCENARIO 2-C – Medium variant: TLS certificate expiry # ========================================================================== _SCENARIO_MEDIUM_C = Scenario( scenario_id="tls-expiry-001", task_id="root_cause_analysis", incident_id="INC-20260327-301", description=( "The checkout-service is returning 502 errors for all HTTPS calls to " "the payment provider API. Internal health checks pass but external " "payment calls fail. Diagnose the root cause and remediate." ), initial_alerts=[ _alert("ALT-401", "checkout-service", AlertSeverity.CRITICAL, "Payment API calls failing: 502 rate 95%", "2026-03-27T09:00:00Z"), _alert("ALT-402", "cert-manager", AlertSeverity.WARNING, "Certificate renewal failed for payments.example.com — ACME challenge timeout", "2026-03-27T08:00:00Z"), _alert("ALT-403", "nginx-ingress", AlertSeverity.WARNING, "TLS handshake failures: 200/min on payments upstream", "2026-03-27T09:01:00Z"), ], available_services=["checkout-service", "cert-manager", "nginx-ingress", "payment-provider-stub"], service_logs={ "checkout-service": [ _log("2026-03-27T08:55:00Z", "checkout-service", "ERROR", "PaymentGatewayError: SSL certificate has expired (payments.example.com)"), _log("2026-03-27T08:58:00Z", "checkout-service", "ERROR", "javax.net.ssl.SSLHandshakeException: PKIX path validation failed: certificate expired at 2026-03-27T00:00:00Z"), _log("2026-03-27T09:00:00Z", "checkout-service", "ERROR", "Circuit breaker OPEN for payment-provider. 48/50 calls failed in 60s."), ], "cert-manager": [ _log("2026-03-27T02:00:00Z", "cert-manager", "INFO", "Certificate renewal triggered for payments.example.com (expires in 24h)"), _log("2026-03-27T02:01:00Z", "cert-manager", "ERROR", "ACME HTTP-01 challenge failed: upstream DNS not resolving challenge token"), _log("2026-03-27T02:05:00Z", "cert-manager", "ERROR", "Retry 3/3 failed. Certificate NOT renewed. Expiry: 2026-03-27T00:00:00Z"), _log("2026-03-27T08:00:00Z", "cert-manager", "CRITICAL", "Certificate EXPIRED: payments.example.com. Last valid: 2026-03-26T23:59:59Z"), ], "nginx-ingress": [ _log("2026-03-27T09:00:00Z", "nginx-ingress", "ERROR", "SSL_do_handshake() failed: certificate verify failed (expired)"), _log("2026-03-27T09:01:00Z", "nginx-ingress", "WARN", "Upstream payments backend: 200 TLS errors/min. Peer certificate expired."), ], "payment-provider-stub": [ _log("2026-03-27T09:00:00Z", "payment-provider-stub", "INFO", "Healthy. Accepting connections on port 443 with valid certificate."), ], }, service_metrics={ "checkout-service": _metrics("checkout-service", 15.0, 30.0, 300.0, 0.95, 50.0, 200.0, payment_success_pct=5.0, revenue_loss_per_min=8500.0), "cert-manager": _metrics("cert-manager", 5.0, 10.0, 1.0, 0.0, 10.0, 50.0, certs_expired=1.0, renewal_failures=3.0), "nginx-ingress": _metrics("nginx-ingress", 10.0, 20.0, 500.0, 0.40, 5.0, 30.0, tls_handshake_failures_per_min=200.0), "payment-provider-stub": _metrics("payment-provider-stub", 5.0, 15.0, 50.0, 0.0, 20.0, 80.0), }, correct_severity=IncidentSeverity.P1, correct_root_cause_service="cert-manager", correct_root_cause_keywords=["certificate", "TLS", "SSL", "expired", "cert-manager", "renewal", "ACME", "expiry"], valid_remediation_actions=[ {"action": "restart", "service": "cert-manager"}, {"action": "config_change", "service": "cert-manager", "detail": "force renewal"}, {"action": "config_change", "service": "nginx-ingress", "detail": "update certificate"}, ], expected_escalation_teams=["security-team", "platform-team"], max_steps=15, degradation_per_step=0.010, relevant_services=["checkout-service", "cert-manager", "nginx-ingress"], blast_radius={ "checkout-service": { "error_rate": (0.005, 1.0), "payment_success_pct": (-0.5, 0.0), "revenue_loss_per_min": (500.0, 50000.0), }, "nginx-ingress": { "tls_handshake_failures_per_min": (20.0, 1000.0), }, }, ) # ---- registry --------------------------------------------------------------- # Multiple variants per task — environment randomly selects one per reset() SCENARIO_VARIANTS: Dict[str, List[Scenario]] = { "severity_classification": [_SCENARIO_EASY, _SCENARIO_EASY_B, _SCENARIO_EASY_C], "root_cause_analysis": [_SCENARIO_MEDIUM, _SCENARIO_MEDIUM_B, _SCENARIO_MEDIUM_C], "full_incident_management": [_SCENARIO_HARD, _SCENARIO_HARD_B, _SCENARIO_HARD_C], } # Always maps task_id → primary (deterministic) scenario for testing/baseline SCENARIOS: Dict[str, Scenario] = { "severity_classification": _SCENARIO_EASY, "root_cause_analysis": _SCENARIO_MEDIUM, "full_incident_management": _SCENARIO_HARD, } def get_scenario(task_id: str, variant_seed: int = 0) -> Scenario: """Return a scenario for the given task_id. Args: task_id: One of the three registered task IDs. variant_seed: Index into SCENARIO_VARIANTS[task_id]. Wraps around. Pass 0 for the primary/deterministic scenario. """ if task_id not in SCENARIO_VARIANTS: raise ValueError(f"Unknown task_id '{task_id}'. Valid: {list(SCENARIO_VARIANTS.keys())}") variants = SCENARIO_VARIANTS[task_id] return variants[variant_seed % len(variants)]