# openenv / src /scenarios.py
# sentinel-space-publisher
# space: publish latest Sentinel app snapshot
# c452421
"""Deterministic incident scenarios for the IRT environment.
Each scenario is a self-contained data definition:
- Initial alerts visible to the agent
- Hidden logs and metrics per service (revealed on INVESTIGATE)
- Ground truth for grading (severity, root cause, valid remediations)
Scenarios are keyed by task_id for 1-to-1 task↔scenario mapping.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, List
from src.models import (
Alert,
AlertSeverity,
IncidentSeverity,
LogEntry,
ServiceMetrics,
)
@dataclass(frozen=True)
class Scenario:
    """Immutable definition of a single incident scenario.

    Groups four kinds of data: the state the agent sees up front (alerts and
    the list of services), the per-service logs and metrics that stay hidden
    until investigated, the ground-truth answers used for grading, and the
    tuning parameters for the episode.

    ``frozen=True`` only blocks re-binding attributes; the list and dict
    values held by an instance are still ordinary mutable containers.
    """

    # Identity and human-readable summary
    scenario_id: str
    task_id: str
    incident_id: str
    description: str
    # Initial state
    initial_alerts: List[Alert]
    available_services: List[str]
    # Hidden – revealed on INVESTIGATE (both dicts are keyed by service name)
    service_logs: Dict[str, List[LogEntry]]
    service_metrics: Dict[str, ServiceMetrics]
    # Ground truth
    correct_severity: IncidentSeverity
    correct_root_cause_service: str
    correct_root_cause_keywords: List[str]  # any of these in diagnosis → credit
    valid_remediation_actions: List[Dict[str, Any]]
    expected_escalation_teams: List[str]
    # Params
    max_steps: int
    degradation_per_step: float = 0.0  # additional penalty per idle step
    relevant_services: List[str] = field(default_factory=list)
    # Blast radius: maps service → metric key → (rate_per_step, cap) that
    # worsens dynamically as the agent delays. Applied before metrics are
    # revealed so the agent observes a live, worsening picture.
    # Format: {"service": {"metric_key": (delta_per_step, max_value)}}
    # A negative delta makes the second tuple element a floor instead of a
    # ceiling (see apply_blast_radius). Defaults to empty = static metrics.
    blast_radius: Dict[str, Dict[str, tuple]] = field(default_factory=dict)
# Built-in ServiceMetrics fields that a blast-radius entry may target directly.
# Any other metric key is read from / written to the ``custom`` mapping.
_DEGRADABLE_CORE_FIELDS = frozenset({
    "error_rate", "latency_p50_ms", "latency_p99_ms",
    "cpu_percent", "memory_percent", "request_rate",
})


def _degrade_value(base: float, delta: float, cap: float, step: int) -> float:
    """Return *base* shifted by ``delta * step``, clamped at *cap*, rounded to 3 dp.

    A positive *delta* treats *cap* as a ceiling and a negative *delta* treats
    it as a floor. A *delta* of exactly zero means "no drift": the value is
    returned as-is (rounded) rather than being pulled towards *cap*.
    """
    if delta == 0:
        # Without this guard a zero delta would take the floor branch below
        # and max(cap, base) would jump the metric straight up to its cap.
        return round(base, 3)
    projected = base + delta * step
    bounded = min(cap, projected) if delta > 0 else max(cap, projected)
    return round(bounded, 3)


def apply_blast_radius(scenario: Scenario, step: int) -> Dict[str, ServiceMetrics]:
    """Return a copy of service_metrics with blast-radius degradation applied.

    Each entry in ``scenario.blast_radius`` defines a ``(delta_per_step, cap)``
    tuple per metric key. The returned dict can be used as revealed metrics so
    each INVESTIGATE at a higher step number sees a more degraded system.

    Args:
        scenario: Scenario whose ``service_metrics`` are the step-0 baseline.
        step: Number of steps of degradation to apply.

    Returns:
        A new dict keyed by service name. Services with a blast-radius entry
        get a freshly built ``ServiceMetrics``; all others map to the original
        (shared, not copied) metrics object. The scenario is never mutated.
    """
    if not scenario.blast_radius:
        return dict(scenario.service_metrics)

    result: Dict[str, ServiceMetrics] = {}
    for svc, base_metrics in scenario.service_metrics.items():
        blast = scenario.blast_radius.get(svc)
        if blast is None:
            result[svc] = base_metrics
            continue

        d = base_metrics.model_dump()
        # Copy so the baseline's custom mapping is never written to.
        custom: Dict[str, float] = dict(d.get("custom") or {})

        for metric_key, (delta, cap) in blast.items():
            # Core fields live at the top level of the dump; everything else
            # is a custom metric. Missing values start from 0.0.
            target = d if metric_key in _DEGRADABLE_CORE_FIELDS else custom
            target[metric_key] = _degrade_value(
                target.get(metric_key, 0.0), delta, cap, step
            )

        d["custom"] = custom
        result[svc] = ServiceMetrics(**d)
    return result
# ---- helpers ----------------------------------------------------------------
def _alert(aid: str, svc: str, sev: AlertSeverity, msg: str, ts: str, **meta: Any) -> Alert:
    """Shorthand constructor for an Alert.

    Positional arguments map to the id, service, severity, message and
    timestamp fields; any extra keyword arguments are passed through as the
    alert's ``metadata`` dict.
    """
    alert_fields = {
        "alert_id": aid,
        "service": svc,
        "severity": sev,
        "message": msg,
        "timestamp": ts,
        "metadata": meta,
    }
    return Alert(**alert_fields)
def _log(ts: str, svc: str, lvl: str, msg: str, tid: str | None = None) -> LogEntry:
    """Shorthand constructor for a LogEntry.

    Takes timestamp, service, level and message positionally; ``tid`` is the
    optional trace id and is forwarded as ``None`` when not given.
    """
    entry_fields = {
        "timestamp": ts,
        "service": svc,
        "level": lvl,
        "message": msg,
        "trace_id": tid,
    }
    return LogEntry(**entry_fields)
def _metrics(svc: str, cpu: float, mem: float, rr: float, er: float, p50: float, p99: float, **custom: float) -> ServiceMetrics:
    """Shorthand constructor for a ServiceMetrics snapshot.

    Positional order after the service name is: cpu_percent, memory_percent,
    request_rate, error_rate, latency_p50_ms, latency_p99_ms. Any additional
    keyword arguments are collected into the ``custom`` mapping.
    """
    core_fields = {
        "service": svc,
        "cpu_percent": cpu,
        "memory_percent": mem,
        "request_rate": rr,
        "error_rate": er,
        "latency_p50_ms": p50,
        "latency_p99_ms": p99,
    }
    return ServiceMetrics(custom=custom, **core_fields)
# ==========================================================================
# SCENARIO 1 – Easy: Database Connection Pool Exhaustion
# ==========================================================================
# Easy scenario for the "severity_classification" task: user-service errors
# and latency traced to a saturated postgres-primary connection pool.
_SCENARIO_EASY = Scenario(
    scenario_id="db-conn-pool-001",
    task_id="severity_classification",
    incident_id="INC-20260327-001",
    description=(
        "The user-service API is experiencing elevated latency and errors. "
        "Alerts indicate the PostgreSQL primary database connection pool is "
        "nearly saturated. Classify the incident severity."
    ),
    # Alerts shown to the agent from the start.
    initial_alerts=[
        _alert("ALT-001", "user-service", AlertSeverity.WARNING,
               "p99 latency exceeded 3000 ms threshold (current: 4200 ms)", "2026-03-27T02:15:00Z"),
        _alert("ALT-002", "postgres-primary", AlertSeverity.CRITICAL,
               "Connection pool utilization at 98% (max 200 connections)", "2026-03-27T02:14:30Z"),
        _alert("ALT-003", "user-service", AlertSeverity.WARNING,
               "Error rate at 12% over the last 5 minutes", "2026-03-27T02:15:30Z"),
    ],
    available_services=["user-service", "postgres-primary", "redis-cache", "api-gateway"],
    # Hidden logs, keyed by service. The user-service and postgres-primary
    # entries carry the connection-pool evidence; redis-cache and api-gateway
    # are not listed in relevant_services below.
    service_logs={
        "user-service": [
            _log("2026-03-27T02:10:00Z", "user-service", "INFO", "Deployment v2.3.1 completed successfully"),
            _log("2026-03-27T02:12:00Z", "user-service", "WARN", "DB query took 2800 ms for /api/users/profile"),
            _log("2026-03-27T02:13:00Z", "user-service", "ERROR", "Connection acquisition timeout after 5000 ms", "trace-a1b2"),
            _log("2026-03-27T02:13:30Z", "user-service", "ERROR", "Connection acquisition timeout after 5000 ms", "trace-c3d4"),
            _log("2026-03-27T02:14:00Z", "user-service", "ERROR", "Failed to acquire connection from pool: pool exhausted", "trace-e5f6"),
            _log("2026-03-27T02:14:30Z", "user-service", "WARN", "Retry #3 for DB connection – backing off 500 ms"),
            _log("2026-03-27T02:15:00Z", "user-service", "ERROR", "HTTP 503 returned for GET /api/users/profile", "trace-g7h8"),
        ],
        "postgres-primary": [
            _log("2026-03-27T02:08:00Z", "postgres-primary", "INFO", "Active connections: 120/200"),
            _log("2026-03-27T02:10:30Z", "postgres-primary", "WARN", "Active connections: 175/200"),
            _log("2026-03-27T02:12:00Z", "postgres-primary", "WARN", "Active connections: 190/200 – approaching limit"),
            _log("2026-03-27T02:13:00Z", "postgres-primary", "ERROR", "Active connections: 196/200 – new connection rejected"),
            _log("2026-03-27T02:14:00Z", "postgres-primary", "ERROR", "Connection count 198/200. Longest idle: 1800 s. Possible connection leak detected."),
            _log("2026-03-27T02:15:00Z", "postgres-primary", "ERROR", "Active connections: 200/200 – pool fully saturated"),
        ],
        "redis-cache": [
            _log("2026-03-27T02:15:00Z", "redis-cache", "INFO", "Memory usage: 45%. Operations normal."),
            _log("2026-03-27T02:15:00Z", "redis-cache", "INFO", "Hit rate: 94%. No evictions."),
        ],
        "api-gateway": [
            _log("2026-03-27T02:14:00Z", "api-gateway", "WARN", "Upstream user-service returning 503 for 8% of requests"),
            _log("2026-03-27T02:15:00Z", "api-gateway", "INFO", "All other upstream services healthy"),
        ],
    },
    # Step-0 metrics. _metrics positional order: cpu_percent, memory_percent,
    # request_rate, error_rate, latency_p50_ms, latency_p99_ms; keyword
    # arguments become custom metrics. error_rate is a fraction (0.12 here
    # lines up with the "12%" alert above).
    service_metrics={
        "user-service": _metrics("user-service", 65.0, 58.0, 450.0, 0.12, 320.0, 4200.0),
        "postgres-primary": _metrics("postgres-primary", 78.0, 72.0, 450.0, 0.05, 45.0, 890.0, connection_pool_pct=98.0),
        "redis-cache": _metrics("redis-cache", 15.0, 45.0, 1200.0, 0.001, 1.2, 3.5),
        "api-gateway": _metrics("api-gateway", 22.0, 30.0, 2200.0, 0.08, 85.0, 4500.0),
    },
    # Ground truth
    correct_severity=IncidentSeverity.P2,
    correct_root_cause_service="postgres-primary",
    correct_root_cause_keywords=["connection pool", "connection leak", "pool exhaustion", "pool saturated", "connection exhaustion"],
    valid_remediation_actions=[
        {"action": "restart", "service": "user-service"},
        {"action": "config_change", "service": "postgres-primary", "detail": "increase pool size"},
    ],
    expected_escalation_teams=["database-team"],
    # Params
    max_steps=10,
    degradation_per_step=0.005,
    relevant_services=["user-service", "postgres-primary"],
    # Blast radius: connection pool fully saturates, user-service error rate climbs
    # Each tuple is (delta_per_step, cap), consumed by apply_blast_radius.
    blast_radius={
        "postgres-primary": {
            "connection_pool_pct": (0.5, 100.0),  # +0.5%/step → caps at 100%
        },
        "user-service": {
            "error_rate": (0.02, 0.60),  # +2pp/step → caps at 60%
            "latency_p99_ms": (200.0, 10000.0),  # +200ms/step → caps at 10s
        },
    },
)
# ==========================================================================
# SCENARIO 2 – Medium: Payment Processing Failure
# ==========================================================================
# Medium scenario for the "root_cause_analysis" task: payment failures whose
# logs point back to session-token evictions in redis-session.
_SCENARIO_MEDIUM = Scenario(
    scenario_id="payment-failure-001",
    task_id="root_cause_analysis",
    incident_id="INC-20260327-002",
    description=(
        "Payment success rate has dropped sharply. Multiple services show "
        "degradation. Investigate the services, identify the root cause, "
        "classify severity, and apply the correct remediation."
    ),
    # Alerts shown to the agent from the start. The most severe alert is on
    # payment-gateway, while the ground-truth root cause below is
    # redis-session (only a WARNING here).
    initial_alerts=[
        _alert("ALT-010", "payment-gateway", AlertSeverity.CRITICAL,
               "Payment success rate dropped to 45% (threshold: 95%)", "2026-03-27T09:30:00Z"),
        _alert("ALT-011", "payment-processor", AlertSeverity.WARNING,
               "Timeout errors increased 10x in last 10 minutes", "2026-03-27T09:30:30Z"),
        _alert("ALT-012", "redis-session", AlertSeverity.WARNING,
               "Key eviction rate spike: 1500 evictions/min (normal: <10)", "2026-03-27T09:29:00Z"),
        _alert("ALT-013", "order-service", AlertSeverity.WARNING,
               "Error rate elevated to 8%", "2026-03-27T09:31:00Z"),
    ],
    available_services=["payment-gateway", "payment-processor", "redis-session", "order-service", "user-service", "postgres-primary"],
    # Hidden logs, keyed by service. user-service and postgres-primary report
    # normal operation and are not in relevant_services.
    service_logs={
        "payment-gateway": [
            _log("2026-03-27T09:25:00Z", "payment-gateway", "INFO", "Processing 320 payments/min"),
            _log("2026-03-27T09:28:00Z", "payment-gateway", "WARN", "Payment token validation failed: token not found in session store", "trace-pay-01"),
            _log("2026-03-27T09:28:30Z", "payment-gateway", "ERROR", "Payment failed: session token expired or missing for txn TXN-8842", "trace-pay-02"),
            _log("2026-03-27T09:29:00Z", "payment-gateway", "ERROR", "Batch failure: 55% of payment attempts failing with SESSION_TOKEN_MISSING"),
            _log("2026-03-27T09:30:00Z", "payment-gateway", "ERROR", "Success rate critical: 45%. All failures correlate with session token lookup errors."),
        ],
        "payment-processor": [
            _log("2026-03-27T09:28:00Z", "payment-processor", "WARN", "Upstream payment-gateway sending incomplete requests"),
            _log("2026-03-27T09:29:00Z", "payment-processor", "ERROR", "Timeout waiting for payment-gateway response: 12 s", "trace-pp-01"),
            _log("2026-03-27T09:30:00Z", "payment-processor", "WARN", "Retry queue depth: 450 (normal: <20)"),
        ],
        "redis-session": [
            _log("2026-03-27T09:20:00Z", "redis-session", "INFO", "Memory usage: 95%. Approaching maxmemory limit (4 GB)."),
            _log("2026-03-27T09:22:00Z", "redis-session", "WARN", "maxmemory reached. Eviction policy: allkeys-lru. Beginning evictions."),
            _log("2026-03-27T09:25:00Z", "redis-session", "WARN", "Evicted 800 keys in last 3 minutes. Active sessions being evicted."),
            _log("2026-03-27T09:28:00Z", "redis-session", "ERROR", "Eviction rate critical: 1500 keys/min. Payment session tokens are being evicted before use."),
            _log("2026-03-27T09:30:00Z", "redis-session", "ERROR", "Memory at 100%. Continuous eviction. Session TTL effectively reduced from 30 min to ~45 s."),
        ],
        "order-service": [
            _log("2026-03-27T09:30:00Z", "order-service", "WARN", "Downstream payment-gateway returning errors for order confirmations"),
            _log("2026-03-27T09:31:00Z", "order-service", "ERROR", "8% of orders failing at payment step – propagated from payment-gateway"),
        ],
        "user-service": [
            _log("2026-03-27T09:30:00Z", "user-service", "INFO", "All endpoints healthy. Latency normal."),
        ],
        "postgres-primary": [
            _log("2026-03-27T09:30:00Z", "postgres-primary", "INFO", "Active connections: 85/200. Operations normal."),
        ],
    },
    # Step-0 metrics. _metrics positional order: cpu_percent, memory_percent,
    # request_rate, error_rate, latency_p50_ms, latency_p99_ms; keyword
    # arguments become custom metrics.
    service_metrics={
        "payment-gateway": _metrics("payment-gateway", 45.0, 52.0, 320.0, 0.55, 250.0, 12000.0, payment_success_rate=0.45),
        "payment-processor": _metrics("payment-processor", 35.0, 40.0, 150.0, 0.30, 180.0, 8000.0),
        "redis-session": _metrics("redis-session", 30.0, 99.5, 5000.0, 0.02, 0.8, 2.5, memory_used_gb=3.98, evictions_per_min=1500.0),
        "order-service": _metrics("order-service", 28.0, 35.0, 200.0, 0.08, 120.0, 950.0),
        "user-service": _metrics("user-service", 20.0, 32.0, 400.0, 0.002, 45.0, 120.0),
        "postgres-primary": _metrics("postgres-primary", 40.0, 55.0, 300.0, 0.001, 12.0, 45.0),
    },
    # Ground truth
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="redis-session",
    correct_root_cause_keywords=[
        "redis", "memory", "eviction", "session token", "maxmemory",
        "session eviction", "cache eviction", "redis memory",
    ],
    valid_remediation_actions=[
        {"action": "scale", "service": "redis-session"},
        {"action": "config_change", "service": "redis-session", "detail": "increase maxmemory"},
        {"action": "restart", "service": "redis-session"},
    ],
    expected_escalation_teams=["payments-team", "platform-team"],
    # Params
    max_steps=15,
    degradation_per_step=0.01,
    relevant_services=["payment-gateway", "redis-session", "payment-processor"],
    # Blast radius: Redis keeps evicting, payment success rate collapses
    # Each tuple is (delta_per_step, cap); with a negative delta the second
    # element acts as a floor (see apply_blast_radius).
    blast_radius={
        "redis-session": {
            "evictions_per_min": (150.0, 5000.0),  # +150 evictions/min/step
            "memory_used_gb": (0.005, 4.0),  # creeps toward hard limit
        },
        "payment-gateway": {
            "payment_success_rate": (-0.04, 0.05),  # drops 4pp/step → 5% floor
            "error_rate": (0.03, 0.90),
        },
        "order-service": {
            "error_rate": (0.02, 0.50),
        },
    },
)
# ==========================================================================
# SCENARIO 3 – Hard: Cascading Multi-Service Outage
# ==========================================================================
# Hard scenario for the "full_incident_management" task: a cascading outage
# whose logs trace back to the auth-service v3.1.0 deployment.
_SCENARIO_HARD = Scenario(
    scenario_id="cascading-outage-001",
    task_id="full_incident_management",
    incident_id="INC-20260327-003",
    description=(
        "Multiple services are degraded simultaneously. The API gateway is "
        "returning 503s, the auth service has extreme latency, and downstream "
        "services are failing. This is a cascading outage. You must triage, "
        "investigate, identify the root cause, remediate, escalate, and "
        "communicate status updates."
    ),
    # Alerts shown to the agent from the start. ALT-106 comes from
    # "deployment-tracker", which is not in available_services, and its
    # message names the same teams as expected_escalation_teams below.
    initial_alerts=[
        _alert("ALT-100", "api-gateway", AlertSeverity.CRITICAL,
               "503 error rate at 35% across all endpoints", "2026-03-27T14:00:00Z"),
        _alert("ALT-101", "auth-service", AlertSeverity.CRITICAL,
               "p99 latency > 5000 ms (threshold: 200 ms)", "2026-03-27T14:00:30Z"),
        _alert("ALT-102", "order-service", AlertSeverity.WARNING,
               "Message queue depth growing: 15000 (normal: <500)", "2026-03-27T14:01:00Z"),
        _alert("ALT-103", "notification-service", AlertSeverity.WARNING,
               "Connection timeout to auth-service: 100% failure rate", "2026-03-27T14:01:30Z"),
        _alert("ALT-104", "cdn-static", AlertSeverity.INFO,
               "Cache miss rate elevated to 15% (normal: 2%)", "2026-03-27T14:02:00Z"),
        _alert("ALT-105", "user-service", AlertSeverity.WARNING,
               "Intermittent HTTP 401 responses (token validation failing)", "2026-03-27T14:01:00Z"),
        _alert("ALT-106", "deployment-tracker", AlertSeverity.CRITICAL,
               "auth-service v3.1.0 deployed at 13:47 — memory climb started immediately. Escalate to auth-team and platform-team.", "2026-03-27T14:02:00Z"),
    ],
    available_services=[
        "api-gateway", "auth-service", "user-service",
        "order-service", "notification-service", "cdn-static",
        "postgres-primary", "redis-auth-cache",
    ],
    # Hidden logs, keyed by service. The auth-service log holds the deployment
    # timeline and memory climb; the other services mostly report symptoms of
    # auth-service being unreachable.
    service_logs={
        "api-gateway": [
            _log("2026-03-27T13:58:00Z", "api-gateway", "INFO", "All upstreams healthy. Traffic: 5500 req/s."),
            _log("2026-03-27T14:00:00Z", "api-gateway", "ERROR", "Upstream auth-service: 503 for 35% of auth checks"),
            _log("2026-03-27T14:00:30Z", "api-gateway", "ERROR", "Circuit breaker OPEN for auth-service after 50 consecutive failures"),
            _log("2026-03-27T14:01:00Z", "api-gateway", "ERROR", "Cascading: requests requiring auth are failing. Public endpoints OK."),
        ],
        "auth-service": [
            _log("2026-03-27T13:45:00Z", "auth-service", "INFO", "Deployment v3.1.0 started (canary 10%)"),
            _log("2026-03-27T13:47:00Z", "auth-service", "INFO", "Deployment v3.1.0 promoted to 100%"),
            _log("2026-03-27T13:50:00Z", "auth-service", "WARN", "Memory usage climbing: 72% (was 45% before deploy)"),
            _log("2026-03-27T13:55:00Z", "auth-service", "WARN", "Memory usage: 88%. GC pauses increasing: avg 350 ms"),
            _log("2026-03-27T13:58:00Z", "auth-service", "ERROR", "Memory usage: 95%. GC pause: 2100 ms. Requests timing out."),
            _log("2026-03-27T14:00:00Z", "auth-service", "ERROR", "OOMKill risk. Memory: 97%. Token validation taking 4800 ms avg."),
            _log("2026-03-27T14:00:30Z", "auth-service", "ERROR", "v3.1.0 changelog: 'Refactored token cache to in-memory store' – possible unbounded cache growth"),
            _log("2026-03-27T14:01:00Z", "auth-service", "ERROR", "Pod restarts: 3 in last 5 min due to OOMKill. Service effectively down."),
        ],
        "user-service": [
            _log("2026-03-27T14:00:00Z", "user-service", "WARN", "Auth token validation calls timing out"),
            _log("2026-03-27T14:01:00Z", "user-service", "ERROR", "Returning 401 for 40% of requests – cannot validate tokens with auth-service"),
        ],
        "order-service": [
            _log("2026-03-27T14:00:00Z", "order-service", "WARN", "Order processing slowing – auth dependency failing"),
            _log("2026-03-27T14:01:00Z", "order-service", "ERROR", "Queue depth: 15000. Orders stuck awaiting auth validation."),
            _log("2026-03-27T14:02:00Z", "order-service", "ERROR", "Queue depth: 25000. Risk of message broker disk overflow."),
        ],
        "notification-service": [
            _log("2026-03-27T14:01:00Z", "notification-service", "ERROR", "Cannot reach auth-service. All notification deliveries paused."),
            _log("2026-03-27T14:02:00Z", "notification-service", "WARN", "Buffered 8000 pending notifications."),
        ],
        "cdn-static": [
            _log("2026-03-27T14:00:00Z", "cdn-static", "INFO", "Cache miss rate elevated. Likely due to increased full page reloads from client-side auth failures."),
            _log("2026-03-27T14:02:00Z", "cdn-static", "INFO", "No CDN-side issues detected. Origin healthy."),
        ],
        "postgres-primary": [
            _log("2026-03-27T14:00:00Z", "postgres-primary", "INFO", "Connections: 90/200. Query performance normal."),
        ],
        "redis-auth-cache": [
            _log("2026-03-27T14:00:00Z", "redis-auth-cache", "INFO", "Memory: 30%. Operations normal."),
            _log("2026-03-27T14:00:30Z", "redis-auth-cache", "WARN", "Cache hit rate dropped from 92% to 15%. auth-service v3.1.0 appears to bypass cache."),
        ],
    },
    # Step-0 metrics. _metrics positional order: cpu_percent, memory_percent,
    # request_rate, error_rate, latency_p50_ms, latency_p99_ms; keyword
    # arguments become custom metrics.
    service_metrics={
        "api-gateway": _metrics("api-gateway", 55.0, 40.0, 5500.0, 0.35, 150.0, 8500.0),
        "auth-service": _metrics("auth-service", 95.0, 97.0, 800.0, 0.65, 2500.0, 5200.0, gc_pause_ms=2100.0, pod_restarts=3.0),
        "user-service": _metrics("user-service", 30.0, 35.0, 400.0, 0.40, 80.0, 4800.0),
        "order-service": _metrics("order-service", 40.0, 45.0, 200.0, 0.25, 300.0, 3500.0, queue_depth=15000.0),
        "notification-service": _metrics("notification-service", 10.0, 20.0, 0.0, 1.0, 0.0, 0.0),
        "cdn-static": _metrics("cdn-static", 12.0, 18.0, 8000.0, 0.001, 8.0, 25.0, cache_miss_rate=0.15),
        "postgres-primary": _metrics("postgres-primary", 38.0, 52.0, 250.0, 0.001, 10.0, 40.0),
        "redis-auth-cache": _metrics("redis-auth-cache", 12.0, 30.0, 2000.0, 0.005, 0.5, 1.8, cache_hit_rate=0.15),
    },
    # Ground truth
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="auth-service",
    correct_root_cause_keywords=[
        "memory leak", "v3.1.0", "deployment", "oom", "unbounded cache",
        "in-memory", "bad deployment", "auth-service deployment",
        "token cache", "gc pause", "out of memory",
    ],
    # Remediations on order-service (the backed-up queue) are accepted as
    # well as those on the root-cause service.
    valid_remediation_actions=[
        {"action": "rollback", "service": "auth-service"},
        {"action": "restart", "service": "auth-service"},
        {"action": "scale", "service": "order-service"},
        {"action": "restart", "service": "order-service"},
    ],
    expected_escalation_teams=["platform-team", "auth-team"],
    # Params
    max_steps=20,
    degradation_per_step=0.015,
    relevant_services=["auth-service", "api-gateway", "redis-auth-cache", "order-service"],
    # Blast radius: auth-service OOMKills more often, order queue grows unbounded
    # Each tuple is (delta_per_step, cap), consumed by apply_blast_radius.
    blast_radius={
        "auth-service": {
            "memory_percent": (0.5, 100.0),  # +0.5%/step → OOM at 100%
            "error_rate": (0.02, 0.95),  # cascades toward full outage
            "latency_p99_ms": (100.0, 15000.0),
            "pod_restarts": (0.3, 15.0),  # accumulating restarts
        },
        "order-service": {
            "queue_depth": (1500.0, 100000.0),  # queue grows 1500/step
            "error_rate": (0.02, 0.80),
        },
        "api-gateway": {
            "error_rate": (0.015, 0.70),  # more requests fail over time
        },
        "user-service": {
            "error_rate": (0.02, 0.80),
        },
    },
)
# ==========================================================================
# SCENARIO 1-B – Easy variant: Disk space exhaustion on log volume
# ==========================================================================
# Easy variant for the "severity_classification" task (same task_id as
# _SCENARIO_EASY): Elasticsearch data node runs out of disk and goes read-only.
_SCENARIO_EASY_B = Scenario(
    scenario_id="disk-full-001",
    task_id="severity_classification",
    incident_id="INC-20260327-101",
    description=(
        "The search-service and its underlying Elasticsearch cluster are "
        "experiencing errors. Alerts indicate disk usage is critically high. "
        "Classify the incident severity."
    ),
    # Alerts shown to the agent from the start.
    initial_alerts=[
        _alert("ALT-201", "elasticsearch", AlertSeverity.CRITICAL,
               "Disk usage at 95% on data node es-node-01", "2026-03-27T06:10:00Z"),
        _alert("ALT-202", "search-service", AlertSeverity.WARNING,
               "Bulk indexing failures: 400% increase", "2026-03-27T06:10:30Z"),
        _alert("ALT-203", "elasticsearch", AlertSeverity.WARNING,
               "write.low_watermark crossed – shard allocation blocked", "2026-03-27T06:09:00Z"),
    ],
    available_services=["search-service", "elasticsearch", "kibana", "log-aggregator"],
    # Hidden logs, keyed by service. kibana and log-aggregator are not listed
    # in relevant_services below.
    service_logs={
        "search-service": [
            _log("2026-03-27T06:08:00Z", "search-service", "WARN", "Indexing queue backing up: 12000 documents pending"),
            _log("2026-03-27T06:09:00Z", "search-service", "ERROR", "BulkIndexException: ClusterBlockException[blocked: FORBIDDEN/12/index]"),
            _log("2026-03-27T06:10:00Z", "search-service", "ERROR", "Search degraded – last index refresh 8 min ago. Serving stale results."),
        ],
        "elasticsearch": [
            _log("2026-03-27T06:05:00Z", "elasticsearch", "WARN", "Disk usage: 90% on es-node-01. Threshold: 85%."),
            _log("2026-03-27T06:07:00Z", "elasticsearch", "WARN", "Disk: 93%. flood_stage watermark approaching."),
            _log("2026-03-27T06:09:00Z", "elasticsearch", "ERROR", "Disk: 95%. flood_stage reached. All indices set to read-only."),
            _log("2026-03-27T06:10:00Z", "elasticsearch", "ERROR", "Shard allocation disabled. Cluster status: YELLOW. Write ops blocked."),
        ],
        "kibana": [
            _log("2026-03-27T06:10:00Z", "kibana", "INFO", "Dashboard loading normally. Read-only ops unaffected."),
        ],
        "log-aggregator": [
            _log("2026-03-27T06:09:00Z", "log-aggregator", "WARN", "Log shipping to elasticsearch failing. Retrying. Buffer: 50000 lines."),
        ],
    },
    # Step-0 metrics. _metrics positional order: cpu_percent, memory_percent,
    # request_rate, error_rate, latency_p50_ms, latency_p99_ms; keyword
    # arguments become custom metrics.
    service_metrics={
        "search-service": _metrics("search-service", 42.0, 50.0, 200.0, 0.35, 180.0, 2200.0),
        "elasticsearch": _metrics("elasticsearch", 60.0, 80.0, 50.0, 0.40, 200.0, 5000.0, disk_pct=95.0),
        "kibana": _metrics("kibana", 15.0, 25.0, 30.0, 0.0, 90.0, 350.0),
        "log-aggregator": _metrics("log-aggregator", 25.0, 35.0, 300.0, 0.15, 50.0, 400.0),
    },
    # Ground truth
    correct_severity=IncidentSeverity.P2,
    correct_root_cause_service="elasticsearch",
    correct_root_cause_keywords=["disk", "disk full", "disk space", "flood_stage", "watermark", "read-only", "disk usage"],
    valid_remediation_actions=[
        {"action": "config_change", "service": "elasticsearch", "detail": "clear read-only flag"},
        {"action": "scale", "service": "elasticsearch"},
    ],
    expected_escalation_teams=["infrastructure-team"],
    # Params. blast_radius is omitted, so it defaults to an empty dict and
    # apply_blast_radius returns these metrics unchanged at every step.
    max_steps=10,
    degradation_per_step=0.005,
    relevant_services=["search-service", "elasticsearch"],
)
# ==========================================================================
# SCENARIO 2-B – Medium variant: Slow memory leak in background worker
# ==========================================================================
# Medium variant for the "root_cause_analysis" task (same task_id as
# _SCENARIO_MEDIUM): report exports fail because worker-pool workers are
# OOMKilled while building large xlsx files.
_SCENARIO_MEDIUM_B = Scenario(
    scenario_id="worker-memleak-001",
    task_id="root_cause_analysis",
    incident_id="INC-20260327-102",
    description=(
        "The report-generation service is timing out and users cannot export "
        "data. Multiple related services show elevated errors. Find the true "
        "root cause, classify severity, diagnose, and remediate."
    ),
    # Alerts shown to the agent from the start. ALT-214 comes from
    # "health-monitor", which is not in available_services; its message is
    # the evidence that the impact is limited to the export subsystem.
    initial_alerts=[
        _alert("ALT-210", "report-service", AlertSeverity.CRITICAL,
               "Request timeout rate 60% for /api/export", "2026-03-27T11:20:00Z"),
        _alert("ALT-211", "worker-pool", AlertSeverity.WARNING,
               "Worker memory usage: 94% (4 of 5 workers OOMKilling)", "2026-03-27T11:19:00Z"),
        _alert("ALT-212", "s3-upload", AlertSeverity.WARNING,
               "Upload failures – 503s from report-service", "2026-03-27T11:20:30Z"),
        _alert("ALT-213", "postgres-reports", AlertSeverity.INFO,
               "Long-running queries detected: 5 queries > 10 s", "2026-03-27T11:18:00Z"),
        _alert("ALT-214", "health-monitor", AlertSeverity.INFO,
               "Core services healthy: payment, auth, user-api all nominal. Issue isolated to report-export subsystem.", "2026-03-27T11:20:00Z"),
    ],
    available_services=["report-service", "worker-pool", "s3-upload", "postgres-reports", "redis-cache", "api-gateway"],
    # Hidden logs, keyed by service. The postgres-reports entry explicitly
    # marks its slow query as normal for large exports.
    service_logs={
        "report-service": [
            _log("2026-03-27T11:15:00Z", "report-service", "INFO", "Report job queued: RPT-9981, format: xlsx, rows: 1M"),
            _log("2026-03-27T11:16:00Z", "report-service", "WARN", "Worker RPT-9981 memory: 2.1 GB (limit 2 GB). Nearing OOM."),
            _log("2026-03-27T11:18:00Z", "report-service", "ERROR", "Worker OOMKilled during xlsx serialization. Job failed."),
            _log("2026-03-27T11:19:00Z", "report-service", "ERROR", "3 concurrent OOMKills. Export endpoint returning 503."),
        ],
        "worker-pool": [
            _log("2026-03-27T11:10:00Z", "worker-pool", "INFO", "Workers: 5 active, 0 idle. Load: nominal."),
            _log("2026-03-27T11:14:00Z", "worker-pool", "WARN", "Worker memory climbing. Suspected unbounded row accumulation in xlsx writer."),
            _log("2026-03-27T11:17:00Z", "worker-pool", "ERROR", "Worker #3 OOMKilled. Memory at 100%."),
            _log("2026-03-27T11:19:00Z", "worker-pool", "ERROR", "4/5 workers OOMKilled. Effective worker capacity: 1. Queue depth: 45."),
            _log("2026-03-27T11:19:30Z", "worker-pool", "ERROR", "Root cause: xlsx writer buffers all rows in memory before flushing. No streaming."),
        ],
        "s3-upload": [
            _log("2026-03-27T11:20:00Z", "s3-upload", "WARN", "Upstream report-service returning 503. S3 uploads queued."),
        ],
        "postgres-reports": [
            _log("2026-03-27T11:17:00Z", "postgres-reports", "INFO", "Large sequential scan for 1M row export. Query time: 12 s. This is normal for large exports."),
        ],
        "redis-cache": [_log("2026-03-27T11:20:00Z", "redis-cache", "INFO", "Operations normal.")],
        "api-gateway": [_log("2026-03-27T11:20:00Z", "api-gateway", "WARN", "report-service upstream: 60% 503 errors.")],
    },
    # Step-0 metrics. _metrics positional order: cpu_percent, memory_percent,
    # request_rate, error_rate, latency_p50_ms, latency_p99_ms; keyword
    # arguments become custom metrics.
    service_metrics={
        "report-service": _metrics("report-service", 55.0, 75.0, 10.0, 0.60, 8000.0, 30000.0),
        "worker-pool": _metrics("worker-pool", 90.0, 94.0, 5.0, 0.80, 15000.0, 60000.0, oom_kills=4.0),
        "s3-upload": _metrics("s3-upload", 10.0, 15.0, 2.0, 0.60, 500.0, 3000.0),
        "postgres-reports": _metrics("postgres-reports", 55.0, 60.0, 15.0, 0.0, 200.0, 12000.0),
        "redis-cache": _metrics("redis-cache", 12.0, 30.0, 500.0, 0.0, 1.0, 3.0),
        "api-gateway": _metrics("api-gateway", 20.0, 28.0, 800.0, 0.08, 80.0, 2000.0),
    },
    # Ground truth
    # Note: P2 not P1 — only the report-export subsystem is affected, core services healthy.
    correct_severity=IncidentSeverity.P2,
    correct_root_cause_service="worker-pool",
    correct_root_cause_keywords=["memory", "oom", "out of memory", "xlsx", "buffering", "unbounded", "memory leak", "worker memory", "worker", "oomkill", "streaming", "row accumulation"],
    valid_remediation_actions=[
        {"action": "restart", "service": "worker-pool"},
        {"action": "scale", "service": "worker-pool"},
        {"action": "config_change", "service": "worker-pool", "detail": "enable streaming"},
    ],
    expected_escalation_teams=["backend-team", "platform-team"],
    # Params. blast_radius is omitted, so it defaults to an empty dict and
    # apply_blast_radius returns these metrics unchanged at every step.
    max_steps=15,
    degradation_per_step=0.008,
    relevant_services=["report-service", "worker-pool"],
)
# ==========================================================================
# SCENARIO 3-B – Hard variant: Kubernetes node pressure / pod eviction cascade
# ==========================================================================
# Hard variant for the "full_incident_management" task (same task_id as
# _SCENARIO_HARD): HPA scale-out of recommendation-service exhausts node
# memory and triggers a pod-eviction loop.
# NOTE(review): incident_id ends in -004 while the other "-B" variants use
# -101 / -102, and alert ids ALT-201..ALT-203 are also used by
# _SCENARIO_EASY_B — confirm nothing relies on these being globally unique.
_SCENARIO_HARD_B = Scenario(
    scenario_id="k8s-node-pressure-001",
    task_id="full_incident_management",
    incident_id="INC-20260327-004",
    description=(
        "Multiple pods are being evicted across the cluster. The checkout "
        "service is returning 502s, node-exporter reports memory pressure on "
        "three nodes, and the HPA has been scaling aggressively. This is a "
        "node-level resource exhaustion event triggered by an HPA/resource-limit "
        "misconfiguration. Full incident management required."
    ),
    # Alerts shown to the agent from the start.
    initial_alerts=[
        _alert("ALT-200", "checkout-service", AlertSeverity.CRITICAL,
               "502 error rate 28% across checkout endpoints", "2026-03-27T16:00:00Z"),
        _alert("ALT-201", "k8s-node-01", AlertSeverity.CRITICAL,
               "MemoryPressure=True — 3/8 pods evicted in last 5 min", "2026-03-27T16:00:30Z"),
        _alert("ALT-202", "k8s-node-02", AlertSeverity.WARNING,
               "MemoryPressure=True — node at 92% memory", "2026-03-27T16:01:00Z"),
        _alert("ALT-203", "hpa-controller", AlertSeverity.WARNING,
               "HPA for recommendation-service scaled to maxReplicas=20 (was 4)", "2026-03-27T15:55:00Z"),
        _alert("ALT-204", "cart-service", AlertSeverity.WARNING,
               "Downstream checkout-service returning 502s for 35% of cart completions", "2026-03-27T16:01:30Z"),
        _alert("ALT-205", "cdn-static", AlertSeverity.INFO,
               "Slight latency increase: p99 68ms (normal: 20ms)", "2026-03-27T16:02:00Z"),
    ],
    available_services=[
        "checkout-service", "k8s-node-01", "k8s-node-02",
        "recommendation-service", "cart-service", "hpa-controller",
        "cdn-static", "postgres-checkout",
    ],
    # Hidden logs, keyed by service. The recommendation-service and
    # hpa-controller logs describe the undersized memory requests and the
    # scale-out; the node logs show the resulting evictions.
    service_logs={
        "checkout-service": [
            _log("2026-03-27T15:58:00Z", "checkout-service", "INFO", "Processing normally. 180 req/s."),
            _log("2026-03-27T15:59:30Z", "checkout-service", "WARN", "3 pods restarting. Connections dropped."),
            _log("2026-03-27T16:00:00Z", "checkout-service", "ERROR", "502 Bad Gateway — upstream recommendation-service pods unavailable"),
            _log("2026-03-27T16:01:00Z", "checkout-service", "ERROR", "Circuit breaker half-open. 28% of requests failing."),
        ],
        "k8s-node-01": [
            _log("2026-03-27T15:50:00Z", "k8s-node-01", "INFO", "Memory: 78%."),
            _log("2026-03-27T15:53:00Z", "k8s-node-01", "WARN", "Memory: 88%. kubelet setting eviction threshold."),
            _log("2026-03-27T15:56:00Z", "k8s-node-01", "ERROR", "Memory: 95%. OOM eviction beginning. Evicting low-priority pods."),
            _log("2026-03-27T15:58:00Z", "k8s-node-01", "ERROR", "Evicted: recommendation-service-7d8f (2 GB). Memory: 91%."),
            _log("2026-03-27T16:00:00Z", "k8s-node-01", "ERROR", "Memory back to 95%. HPA-spawned recommendation-service pods consuming all available memory."),
        ],
        "k8s-node-02": [
            _log("2026-03-27T15:58:00Z", "k8s-node-02", "WARN", "Memory: 90%. recommendation-service HPA placed 6 new pods here."),
            _log("2026-03-27T16:00:30Z", "k8s-node-02", "ERROR", "Memory: 92%. Approaching eviction threshold."),
        ],
        "recommendation-service": [
            _log("2026-03-27T15:45:00Z", "recommendation-service", "INFO", "Memory usage tracking: v2.4.0 deployed. ML model loaded."),
            _log("2026-03-27T15:50:00Z", "recommendation-service", "WARN", "Each pod consuming 2.1 GB (limit: 2.0 GB) — requests.memory too low."),
            _log("2026-03-27T15:53:00Z", "recommendation-service", "WARN", "HPA triggered: latency spike caused scale-out. 8→12 pods"),
            _log("2026-03-27T15:57:00Z", "recommendation-service", "ERROR", "HPA at maxReplicas=20. 20 pods × 2.1 GB = 42 GB on nodes with 32 GB capacity."),
            _log("2026-03-27T16:00:00Z", "recommendation-service", "ERROR", "Pod eviction loop: evicted pods restart, consume memory, trigger eviction again."),
        ],
        "hpa-controller": [
            _log("2026-03-27T15:52:00Z", "hpa-controller", "INFO", "recommendation-service: scaling 4→8 due to latency"),
            _log("2026-03-27T15:55:00Z", "hpa-controller", "WARN", "recommendation-service: scaling 8→20 (maxReplicas). Memory requests underspecified."),
            _log("2026-03-27T16:00:00Z", "hpa-controller", "ERROR", "Eviction loop detected. Scaling is worsening node pressure."),
        ],
        "cart-service": [
            _log("2026-03-27T16:01:00Z", "cart-service", "WARN", "Checkout dependency failing. 35% cart completions blocked."),
        ],
        "cdn-static": [
            _log("2026-03-27T16:02:00Z", "cdn-static", "INFO", "Slight latency increase correlates with client retries. No CDN-side issue."),
        ],
        "postgres-checkout": [
            _log("2026-03-27T16:00:00Z", "postgres-checkout", "INFO", "All queries normal. Connections: 45/200."),
        ],
    },
    # Step-0 metrics. _metrics positional order: cpu_percent, memory_percent,
    # request_rate, error_rate, latency_p50_ms, latency_p99_ms; keyword
    # arguments become custom metrics.
    # NOTE(review): recommendation-service memory_percent is 105.0 — presumably
    # deliberate to mirror the "2.1 GB (limit: 2.0 GB)" log line; confirm
    # ServiceMetrics accepts values above 100.
    service_metrics={
        "checkout-service": _metrics("checkout-service", 55.0, 60.0, 180.0, 0.28, 200.0, 5500.0),
        "k8s-node-01": _metrics("k8s-node-01", 70.0, 95.0, 0.0, 0.0, 0.0, 0.0, evicted_pods=3.0),
        "k8s-node-02": _metrics("k8s-node-02", 65.0, 92.0, 0.0, 0.0, 0.0, 0.0),
        "recommendation-service": _metrics("recommendation-service", 85.0, 105.0, 80.0, 0.60, 800.0, 12000.0, memory_per_pod_gb=2.1, pod_count=20.0),
        "cart-service": _metrics("cart-service", 30.0, 35.0, 250.0, 0.15, 90.0, 2200.0),
        "hpa-controller": _metrics("hpa-controller", 10.0, 15.0, 0.0, 0.0, 0.0, 0.0, current_replicas=20.0),
        "cdn-static": _metrics("cdn-static", 10.0, 12.0, 9000.0, 0.001, 12.0, 68.0),
        "postgres-checkout": _metrics("postgres-checkout", 35.0, 48.0, 200.0, 0.001, 12.0, 38.0),
    },
    # Ground truth
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="recommendation-service",
    correct_root_cause_keywords=[
        "memory request", "resource limit", "hpa", "eviction loop", "pod eviction",
        "memory limit", "recommendation-service memory", "node pressure",
        "oom eviction", "hpa scale", "memory requests underspecified",
    ],
    # The config_change entries carry no "detail" key, unlike those in some
    # other scenarios.
    valid_remediation_actions=[
        {"action": "config_change", "service": "recommendation-service"},
        {"action": "scale", "service": "recommendation-service"},
        {"action": "restart", "service": "recommendation-service"},
        {"action": "config_change", "service": "hpa-controller"},
    ],
    expected_escalation_teams=["platform-team", "sre-team"],
    # Params
    max_steps=20,
    degradation_per_step=0.015,
    relevant_services=["recommendation-service", "k8s-node-01", "hpa-controller", "checkout-service"],
    # Blast radius: each tuple is (delta_per_step, cap), consumed by
    # apply_blast_radius.
    blast_radius={
        "recommendation-service": {
            "error_rate": (0.03, 0.95),
            # Base pod_count (20.0) already equals the cap, so for
            # non-negative steps this entry leaves the value at 20.0.
            "pod_count": (0.5, 20.0),
        },
        "k8s-node-01": {
            "memory_percent": (0.4, 100.0),
            "evicted_pods": (0.4, 20.0),
        },
        "k8s-node-02": {
            "memory_percent": (0.5, 100.0),
        },
        "checkout-service": {
            "error_rate": (0.025, 0.85),
        },
    },
)
# ==========================================================================
# SCENARIO 3-C – Hard variant: Database failover split-brain
# ==========================================================================
_SCENARIO_HARD_C = Scenario(
    # ---- identity ----------------------------------------------------------
    scenario_id="db-failover-race-001",
    task_id="full_incident_management",
    incident_id="INC-20260327-005",
    description=(
        "The primary PostgreSQL instance failed over to the replica 18 minutes "
        "ago but several services still route writes to the old primary (now "
        "read-only) because pgbouncer's connection string was never updated. "
        "A split-brain scenario is actively corrupting order state. Full "
        "incident commander workflow required: triage, diagnose, remediate, "
        "escalate, communicate."
    ),
    # ---- initial state -----------------------------------------------------
    # Alerts are stamped 18:10Z-18:13Z; the failover they refer to is recorded
    # at 17:52Z (ALT-305), i.e. 18 minutes before the first alert.
    initial_alerts=[
        _alert(
            "ALT-300", "order-service", AlertSeverity.CRITICAL,
            "Write failures: 65% of order commits failing with ReadOnlyError",
            "2026-03-27T18:10:00Z",
        ),
        _alert(
            "ALT-301", "postgres-primary-old", AlertSeverity.CRITICAL,
            "Instance is READ-ONLY (promoted replica took writes 18 min ago)",
            "2026-03-27T18:10:30Z",
        ),
        _alert(
            "ALT-302", "postgres-replica-new", AlertSeverity.WARNING,
            "Becoming primary: only 30% of expected write traffic received",
            "2026-03-27T18:11:00Z",
        ),
        _alert(
            "ALT-303", "payment-service", AlertSeverity.WARNING,
            "Double-charge risk: orders appearing in both DB instances for 8% of txns",
            "2026-03-27T18:11:30Z",
        ),
        _alert(
            "ALT-304", "inventory-service", AlertSeverity.WARNING,
            "Stock deduction failing silently: items over-sold",
            "2026-03-27T18:12:00Z",
        ),
        _alert(
            "ALT-305", "monitoring-dashboard", AlertSeverity.INFO,
            "DB failover event recorded at 2026-03-27T17:52:00Z",
            "2026-03-27T18:12:30Z",
        ),
        _alert(
            "ALT-306", "pgbouncer", AlertSeverity.CRITICAL,
            "pgbouncer still routing ALL writes to postgres-primary-old (read-only). Connection string not updated after failover.",
            "2026-03-27T18:13:00Z",
        ),
    ],
    available_services=[
        "order-service",
        "postgres-primary-old",
        "postgres-replica-new",
        "payment-service",
        "inventory-service",
        "config-service",
        "monitoring-dashboard",
        "pgbouncer",
    ],
    # ---- hidden data (revealed on INVESTIGATE) ------------------------------
    service_logs={
        "order-service": [
            _log("2026-03-27T17:52:00Z", "order-service", "WARN", "DB failover detected. Using cached connection string."),
            _log("2026-03-27T17:55:00Z", "order-service", "ERROR", "INSERT failed: ERROR: cannot execute INSERT in a read-only transaction"),
            _log("2026-03-27T18:00:00Z", "order-service", "ERROR", "65% of order writes failing. Service still pointing to old primary."),
            _log("2026-03-27T18:10:00Z", "order-service", "ERROR", "Connection pool: all connections to postgres-primary-old. Failover not propagated."),
        ],
        "postgres-primary-old": [
            _log("2026-03-27T17:52:00Z", "postgres-primary-old", "WARN", "Promotion event: replica assumed primary role. This instance now read-only."),
            _log("2026-03-27T18:05:00Z", "postgres-primary-old", "ERROR", "Receiving 1800 write attempts/min from services — all rejected (read-only)."),
            _log("2026-03-27T18:10:00Z", "postgres-primary-old", "ERROR", "Active connections: 198/200. Service retry loops filling pool."),
        ],
        "postgres-replica-new": [
            _log("2026-03-27T17:52:00Z", "postgres-replica-new", "INFO", "Promoted to primary. Accepting writes."),
            _log("2026-03-27T18:05:00Z", "postgres-replica-new", "WARN", "Only 30% of expected write traffic received. Split-brain suspected."),
            _log("2026-03-27T18:10:00Z", "postgres-replica-new", "WARN", "Diverging from old primary: 1240 transactions only in new primary."),
        ],
        "payment-service": [
            _log("2026-03-27T18:05:00Z", "payment-service", "ERROR", "Idempotency check failing: order state inconsistent between DB instances"),
            _log("2026-03-27T18:10:00Z", "payment-service", "ERROR", "8% txn double-charge risk. Halting charge processing for affected orders."),
        ],
        "inventory-service": [
            _log("2026-03-27T18:05:00Z", "inventory-service", "ERROR", "Stock deduction writes going to old primary (read-only) — silently lost."),
            _log("2026-03-27T18:10:00Z", "inventory-service", "ERROR", "Oversold items: 340 SKUs with negative virtual stock. Revenue impact growing."),
        ],
        "config-service": [
            _log("2026-03-27T17:52:00Z", "config-service", "INFO", "DB failover event received. Updated DB_PRIMARY_HOST in config store."),
            _log("2026-03-27T17:52:30Z", "config-service", "WARN", "Config propagation: order-service and payment-service did NOT acknowledge new config."),
            _log("2026-03-27T17:55:00Z", "config-service", "ERROR", "Config ack missing for 4/8 services. Manual pgbouncer reload required."),
        ],
        "pgbouncer": [
            _log("2026-03-27T17:52:00Z", "pgbouncer", "WARN", "Failover detected. pgbouncer config NOT auto-updated (static connection string)."),
            _log("2026-03-27T18:10:00Z", "pgbouncer", "ERROR", "Routing 100% of writes to postgres-primary-old (read-only). Update target_db required immediately."),
        ],
        "monitoring-dashboard": [
            _log("2026-03-27T17:52:00Z", "monitoring-dashboard", "INFO", "Auto-failover triggered at 17:52:00Z by health check failure on primary."),
            _log("2026-03-27T18:12:00Z", "monitoring-dashboard", "INFO", "Split-brain duration: 18 min. Financial impact estimate: $42,000 in at-risk transactions."),
        ],
    },
    # One baseline snapshot per service; the keyword arguments are the
    # scenario-specific extra metrics that blast_radius below refers to.
    service_metrics={
        "order-service": _metrics("order-service", 55.0, 60.0, 800.0, 0.65, 300.0, 8000.0, write_failure_rate=0.65),
        "postgres-primary-old": _metrics("postgres-primary-old", 80.0, 70.0, 1800.0, 1.0, 5.0, 50.0, is_read_only=1.0, connection_pct=99.0),
        "postgres-replica-new": _metrics("postgres-replica-new", 30.0, 45.0, 600.0, 0.0, 8.0, 30.0, write_pct_expected=0.30),
        "payment-service": _metrics("payment-service", 40.0, 45.0, 200.0, 0.25, 180.0, 3500.0, double_charge_risk_pct=0.08),
        "inventory-service": _metrics("inventory-service", 35.0, 40.0, 300.0, 0.30, 120.0, 2500.0, oversold_skus=340.0),
        "config-service": _metrics("config-service", 15.0, 20.0, 50.0, 0.10, 30.0, 200.0),
        "monitoring-dashboard": _metrics("monitoring-dashboard", 10.0, 15.0, 100.0, 0.0, 50.0, 150.0),
        "pgbouncer": _metrics("pgbouncer", 25.0, 30.0, 2000.0, 0.65, 2.0, 8.0, routing_to_old_primary=1.0),
    },
    # ---- ground truth for grading -------------------------------------------
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="pgbouncer",
    # Any one of these appearing in the diagnosis earns root-cause credit.
    correct_root_cause_keywords=[
        "pgbouncer",
        "connection string",
        "split-brain",
        "failover",
        "read-only",
        "config not propagated",
        "stale connection",
        "db routing",
        "pgbouncer config",
        "connection pool routing",
        "failover not propagated",
    ],
    valid_remediation_actions=[
        {"action": "config_change", "service": "pgbouncer"},
        {"action": "restart", "service": "order-service"},
        {"action": "config_change", "service": "order-service"},
        {"action": "restart", "service": "payment-service"},
    ],
    expected_escalation_teams=["database-team", "platform-team"],
    # ---- episode parameters -------------------------------------------------
    max_steps=20,
    degradation_per_step=0.02,
    relevant_services=[
        "pgbouncer",
        "postgres-primary-old",
        "postgres-replica-new",
        "order-service",
    ],
    # service -> metric key -> (delta_per_step, cap)
    blast_radius={
        "order-service": {
            "write_failure_rate": (0.02, 1.0),
            "error_rate": (0.02, 0.95),
        },
        "inventory-service": {
            "oversold_skus": (25.0, 5000.0),
            "error_rate": (0.02, 0.80),
        },
        "payment-service": {
            "double_charge_risk_pct": (0.005, 0.30),
            "error_rate": (0.02, 0.60),
        },
        "postgres-primary-old": {
            "connection_pct": (0.2, 100.0),
        },
    },
)
# ==========================================================================
# SCENARIO 1-C – Easy variant: DNS resolution failure
# ==========================================================================
_SCENARIO_EASY_C = Scenario(
    # ---- identity ----------------------------------------------------------
    scenario_id="dns-fail-001",
    task_id="severity_classification",
    incident_id="INC-20260327-201",
    description=(
        "Multiple microservices are reporting connection timeouts to downstream "
        "dependencies. Alerts indicate DNS resolution failures across the "
        "internal service mesh. Classify the incident severity."
    ),
    # ---- initial state -----------------------------------------------------
    # NOTE(review): ALT-301..ALT-303 are also used by the db-failover scenario
    # in this file — fine only if alert IDs need not be unique across
    # scenarios; confirm.
    initial_alerts=[
        _alert(
            "ALT-301", "api-gateway", AlertSeverity.CRITICAL,
            "Upstream connection timeout rate 40% to backend services",
            "2026-03-27T14:00:00Z",
        ),
        _alert(
            "ALT-302", "coredns", AlertSeverity.CRITICAL,
            "DNS query failure rate 65% — SERVFAIL responses",
            "2026-03-27T13:58:00Z",
        ),
        _alert(
            "ALT-303", "notification-service", AlertSeverity.WARNING,
            "Failed to resolve smtp-relay.internal: NXDOMAIN",
            "2026-03-27T14:01:00Z",
        ),
    ],
    available_services=[
        "api-gateway",
        "coredns",
        "notification-service",
        "istio-proxy",
    ],
    # ---- hidden data (revealed on INVESTIGATE) ------------------------------
    service_logs={
        "api-gateway": [
            _log("2026-03-27T13:58:00Z", "api-gateway", "ERROR", "upstream connect error: dns_resolution_failure for user-service.default.svc.cluster.local"),
            _log("2026-03-27T13:59:00Z", "api-gateway", "ERROR", "circuit breaker tripped: 5/10 upstream failures in 30s. Returning 503."),
            _log("2026-03-27T14:00:00Z", "api-gateway", "WARN", "Retry budget exhausted for payment-service. DNS not resolving."),
        ],
        "coredns": [
            _log("2026-03-27T13:55:00Z", "coredns", "WARN", "Cache miss rate increasing: 80%. Upstream forwarder slow."),
            _log("2026-03-27T13:57:00Z", "coredns", "ERROR", "OOMKilled: coredns-7d8f9b pod restarted. Memory limit 128Mi exceeded."),
            _log("2026-03-27T13:58:00Z", "coredns", "ERROR", "SERVFAIL for *.default.svc.cluster.local — upstream timeout after 5s"),
            _log("2026-03-27T14:00:00Z", "coredns", "ERROR", "Pod restart count: 4 in last 10 minutes. CrashLoopBackOff."),
        ],
        "notification-service": [
            _log("2026-03-27T14:00:00Z", "notification-service", "WARN", "Email delivery failing: cannot resolve smtp-relay.internal"),
        ],
        "istio-proxy": [
            _log("2026-03-27T14:00:00Z", "istio-proxy", "INFO", "Sidecar healthy. mTLS handshake OK. Issue is upstream DNS, not mesh."),
        ],
    },
    service_metrics={
        "api-gateway": _metrics("api-gateway", 25.0, 40.0, 1200.0, 0.40, 800.0, 5000.0),
        "coredns": _metrics("coredns", 95.0, 98.0, 5000.0, 0.65, 50.0, 5000.0, restart_count=4.0, cache_miss_pct=80.0),
        "notification-service": _metrics("notification-service", 10.0, 20.0, 50.0, 0.80, 200.0, 3000.0),
        "istio-proxy": _metrics("istio-proxy", 5.0, 10.0, 1200.0, 0.01, 2.0, 10.0),
    },
    # ---- ground truth for grading -------------------------------------------
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="coredns",
    # NOTE(review): mixed-case entries, whereas the other keyword lists visible
    # in this file are all lowercase — confirm the grader matches
    # case-insensitively.
    correct_root_cause_keywords=[
        "dns",
        "coredns",
        "OOM",
        "memory",
        "resolution",
        "SERVFAIL",
        "CrashLoop",
    ],
    valid_remediation_actions=[
        {"action": "restart", "service": "coredns"},
        {"action": "scale", "service": "coredns"},
        {"action": "config_change", "service": "coredns", "detail": "increase memory limit"},
    ],
    expected_escalation_teams=["platform-team"],
    # ---- episode parameters -------------------------------------------------
    max_steps=10,
    degradation_per_step=0.008,
    relevant_services=["api-gateway", "coredns"],
    # service -> metric key -> (delta_per_step, cap)
    blast_radius={
        "coredns": {
            "error_rate": (0.03, 0.95),
            "restart_count": (1.0, 15.0),
        },
        "api-gateway": {
            "error_rate": (0.03, 0.80),
            "latency_p99_ms": (500.0, 15000.0),
        },
    },
)
# ==========================================================================
# SCENARIO 2-C – Medium variant: TLS certificate expiry
# ==========================================================================
_SCENARIO_MEDIUM_C = Scenario(
    # ---- identity ----------------------------------------------------------
    scenario_id="tls-expiry-001",
    task_id="root_cause_analysis",
    incident_id="INC-20260327-301",
    description=(
        "The checkout-service is returning 502 errors for all HTTPS calls to "
        "the payment provider API. Internal health checks pass but external "
        "payment calls fail. Diagnose the root cause and remediate."
    ),
    # ---- initial state -----------------------------------------------------
    initial_alerts=[
        _alert(
            "ALT-401", "checkout-service", AlertSeverity.CRITICAL,
            "Payment API calls failing: 502 rate 95%",
            "2026-03-27T09:00:00Z",
        ),
        _alert(
            "ALT-402", "cert-manager", AlertSeverity.WARNING,
            "Certificate renewal failed for payments.example.com — ACME challenge timeout",
            "2026-03-27T08:00:00Z",
        ),
        _alert(
            "ALT-403", "nginx-ingress", AlertSeverity.WARNING,
            "TLS handshake failures: 200/min on payments upstream",
            "2026-03-27T09:01:00Z",
        ),
    ],
    available_services=[
        "checkout-service",
        "cert-manager",
        "nginx-ingress",
        "payment-provider-stub",
    ],
    # ---- hidden data (revealed on INVESTIGATE) ------------------------------
    service_logs={
        "checkout-service": [
            _log("2026-03-27T08:55:00Z", "checkout-service", "ERROR", "PaymentGatewayError: SSL certificate has expired (payments.example.com)"),
            _log("2026-03-27T08:58:00Z", "checkout-service", "ERROR", "javax.net.ssl.SSLHandshakeException: PKIX path validation failed: certificate expired at 2026-03-27T00:00:00Z"),
            _log("2026-03-27T09:00:00Z", "checkout-service", "ERROR", "Circuit breaker OPEN for payment-provider. 48/50 calls failed in 60s."),
        ],
        "cert-manager": [
            # The renewal attempts are dated exactly 24h before the
            # 2026-03-27T00:00:00Z expiry quoted throughout this scenario, so
            # "expires in 24h" is consistent with the rest of the timeline
            # (they were previously stamped two hours *after* the expiry).
            _log("2026-03-26T00:00:00Z", "cert-manager", "INFO", "Certificate renewal triggered for payments.example.com (expires in 24h)"),
            _log("2026-03-26T00:01:00Z", "cert-manager", "ERROR", "ACME HTTP-01 challenge failed: upstream DNS not resolving challenge token"),
            _log("2026-03-26T00:05:00Z", "cert-manager", "ERROR", "Retry 3/3 failed. Certificate NOT renewed. Expiry: 2026-03-27T00:00:00Z"),
            _log("2026-03-27T08:00:00Z", "cert-manager", "CRITICAL", "Certificate EXPIRED: payments.example.com. Last valid: 2026-03-26T23:59:59Z"),
        ],
        "nginx-ingress": [
            _log("2026-03-27T09:00:00Z", "nginx-ingress", "ERROR", "SSL_do_handshake() failed: certificate verify failed (expired)"),
            _log("2026-03-27T09:01:00Z", "nginx-ingress", "WARN", "Upstream payments backend: 200 TLS errors/min. Peer certificate expired."),
        ],
        "payment-provider-stub": [
            _log("2026-03-27T09:00:00Z", "payment-provider-stub", "INFO", "Healthy. Accepting connections on port 443 with valid certificate."),
        ],
    },
    # One baseline snapshot per service; the keyword arguments are the
    # scenario-specific extra metrics that blast_radius below refers to.
    service_metrics={
        "checkout-service": _metrics("checkout-service", 15.0, 30.0, 300.0, 0.95, 50.0, 200.0, payment_success_pct=5.0, revenue_loss_per_min=8500.0),
        "cert-manager": _metrics("cert-manager", 5.0, 10.0, 1.0, 0.0, 10.0, 50.0, certs_expired=1.0, renewal_failures=3.0),
        "nginx-ingress": _metrics("nginx-ingress", 10.0, 20.0, 500.0, 0.40, 5.0, 30.0, tls_handshake_failures_per_min=200.0),
        "payment-provider-stub": _metrics("payment-provider-stub", 5.0, 15.0, 50.0, 0.0, 20.0, 80.0),
    },
    # ---- ground truth for grading -------------------------------------------
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="cert-manager",
    # NOTE(review): mixed-case entries, whereas the other keyword lists visible
    # in this file are all lowercase — confirm the grader matches
    # case-insensitively.
    correct_root_cause_keywords=[
        "certificate",
        "TLS",
        "SSL",
        "expired",
        "cert-manager",
        "renewal",
        "ACME",
        "expiry",
    ],
    valid_remediation_actions=[
        {"action": "restart", "service": "cert-manager"},
        {"action": "config_change", "service": "cert-manager", "detail": "force renewal"},
        {"action": "config_change", "service": "nginx-ingress", "detail": "update certificate"},
    ],
    expected_escalation_teams=["security-team", "platform-team"],
    # ---- episode parameters -------------------------------------------------
    max_steps=15,
    degradation_per_step=0.010,
    relevant_services=["checkout-service", "cert-manager", "nginx-ingress"],
    # service -> metric key -> (delta_per_step, cap)
    blast_radius={
        "checkout-service": {
            "error_rate": (0.005, 1.0),
            # NOTE(review): the only negative delta in this part of the file,
            # with 0.0 as its bound. This is only correct if
            # apply_blast_radius treats the bound as a floor for negative
            # deltas rather than as a maximum — confirm.
            "payment_success_pct": (-0.5, 0.0),
            "revenue_loss_per_min": (500.0, 50000.0),
        },
        "nginx-ingress": {
            "tls_handshake_failures_per_min": (20.0, 1000.0),
        },
    },
)
# ---- registry ---------------------------------------------------------------
# Every registered variant for each task_id. The environment randomly selects
# one of them per reset(); get_scenario() indexes into these lists.
SCENARIO_VARIANTS: Dict[str, List[Scenario]] = {
    "severity_classification": [
        _SCENARIO_EASY,
        _SCENARIO_EASY_B,
        _SCENARIO_EASY_C,
    ],
    "root_cause_analysis": [
        _SCENARIO_MEDIUM,
        _SCENARIO_MEDIUM_B,
        _SCENARIO_MEDIUM_C,
    ],
    "full_incident_management": [
        _SCENARIO_HARD,
        _SCENARIO_HARD_B,
        _SCENARIO_HARD_C,
    ],
}
# task_id → primary (deterministic) scenario, used for testing/baseline runs.
# The primary scenario is the first variant registered for each task, so the
# mapping is derived from SCENARIO_VARIANTS rather than listed a second time.
SCENARIOS: Dict[str, Scenario] = {
    registered_task: task_variants[0]
    for registered_task, task_variants in SCENARIO_VARIANTS.items()
}
def get_scenario(task_id: str, variant_seed: int = 0) -> Scenario:
    """Look up one scenario variant for a task.

    Args:
        task_id: Key into SCENARIO_VARIANTS (one of the registered task IDs).
        variant_seed: Position in that task's variant list. It is reduced
            modulo the number of variants, so values past the end wrap
            around. 0 selects the first (primary/deterministic) variant.

    Returns:
        The selected Scenario.

    Raises:
        ValueError: If task_id is not a key of SCENARIO_VARIANTS.
    """
    variants = SCENARIO_VARIANTS.get(task_id)
    if variants is None:
        raise ValueError(f"Unknown task_id '{task_id}'. Valid: {list(SCENARIO_VARIANTS.keys())}")

    # Modulo keeps any integer seed inside the list bounds.
    index = variant_seed % len(variants)
    return variants[index]