Spaces:

srikrishna2005
/

openenv

Running

File size: 56,554 Bytes

c452421

"""Deterministic incident scenarios for the IRT environment.

Each scenario is a self-contained data definition:
  - Initial alerts visible to the agent
  - Hidden logs and metrics per service (revealed on INVESTIGATE)
  - Ground truth for grading (severity, root cause, valid remediations)

Scenarios are keyed by task_id for 1-to-1 task↔scenario mapping.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Dict, List

from src.models import (
    Alert,
    AlertSeverity,
    IncidentSeverity,
    LogEntry,
    ServiceMetrics,
)


@dataclass(frozen=True)
class Scenario:
    scenario_id: str
    task_id: str
    incident_id: str
    description: str
    # Initial state
    initial_alerts: List[Alert]
    available_services: List[str]
    # Hidden – revealed on INVESTIGATE
    service_logs: Dict[str, List[LogEntry]]
    service_metrics: Dict[str, ServiceMetrics]
    # Ground truth
    correct_severity: IncidentSeverity
    correct_root_cause_service: str
    correct_root_cause_keywords: List[str]  # any of these in diagnosis → credit
    valid_remediation_actions: List[Dict[str, Any]]
    expected_escalation_teams: List[str]
    # Params
    max_steps: int
    degradation_per_step: float = 0.0  # additional penalty per idle step
    relevant_services: List[str] = field(default_factory=list)
    # Blast radius: maps service → metric key → (rate_per_step, cap) that
    # worsens dynamically as the agent delays.  Applied before metrics are
    # revealed so the agent observes a live, worsening picture.
    # Format: {"service": {"metric_key": (delta_per_step, max_value)}}
    blast_radius: Dict[str, Dict[str, tuple]] = field(default_factory=dict)


def apply_blast_radius(scenario: Scenario, step: int) -> Dict[str, ServiceMetrics]:
    """Return a copy of service_metrics with blast-radius degradation applied.

    Each entry in scenario.blast_radius defines a (delta_per_step, cap) tuple
    per metric key.  The returned dict can be used as revealed metrics so each
    INVESTIGATE at a higher step number sees a more degraded system.
    """
    if not scenario.blast_radius:
        return dict(scenario.service_metrics)

    result: Dict[str, ServiceMetrics] = {}
    for svc, base_metrics in scenario.service_metrics.items():
        blast = scenario.blast_radius.get(svc)
        if blast is None:
            result[svc] = base_metrics
            continue
        # Build an updated custom dict
        d = base_metrics.model_dump()
        custom: Dict[str, float] = dict(d.get("custom") or {})
        # Core fields we also allow to degrade
        degradable_core = {
            "error_rate", "latency_p50_ms", "latency_p99_ms",
            "cpu_percent", "memory_percent", "request_rate",
        }
        for metric_key, (delta, cap) in blast.items():
            if metric_key in degradable_core:
                old_val = d.get(metric_key, 0.0)
                new_val = min(cap, old_val + delta * step) if delta > 0 else max(cap, old_val + delta * step)
                d[metric_key] = round(new_val, 3)
            else:
                # Custom metric field
                old_val = custom.get(metric_key, 0.0)
                new_val = min(cap, old_val + delta * step) if delta > 0 else max(cap, old_val + delta * step)
                custom[metric_key] = round(new_val, 3)
        d["custom"] = custom
        result[svc] = ServiceMetrics(**d)
    return result


# ---- helpers ----------------------------------------------------------------

def _alert(aid: str, svc: str, sev: AlertSeverity, msg: str, ts: str, **meta: Any) -> Alert:
    return Alert(alert_id=aid, service=svc, severity=sev, message=msg, timestamp=ts, metadata=meta)


def _log(ts: str, svc: str, lvl: str, msg: str, tid: str | None = None) -> LogEntry:
    return LogEntry(timestamp=ts, service=svc, level=lvl, message=msg, trace_id=tid)


def _metrics(svc: str, cpu: float, mem: float, rr: float, er: float, p50: float, p99: float, **custom: float) -> ServiceMetrics:
    return ServiceMetrics(service=svc, cpu_percent=cpu, memory_percent=mem, request_rate=rr, error_rate=er, latency_p50_ms=p50, latency_p99_ms=p99, custom=custom)


# ==========================================================================
# SCENARIO 1 – Easy: Database Connection Pool Exhaustion
# ==========================================================================

_SCENARIO_EASY = Scenario(
    scenario_id="db-conn-pool-001",
    task_id="severity_classification",
    incident_id="INC-20260327-001",
    description=(
        "The user-service API is experiencing elevated latency and errors. "
        "Alerts indicate the PostgreSQL primary database connection pool is "
        "nearly saturated. Classify the incident severity."
    ),
    initial_alerts=[
        _alert("ALT-001", "user-service", AlertSeverity.WARNING,
               "p99 latency exceeded 3000 ms threshold (current: 4200 ms)", "2026-03-27T02:15:00Z"),
        _alert("ALT-002", "postgres-primary", AlertSeverity.CRITICAL,
               "Connection pool utilization at 98% (max 200 connections)", "2026-03-27T02:14:30Z"),
        _alert("ALT-003", "user-service", AlertSeverity.WARNING,
               "Error rate at 12% over the last 5 minutes", "2026-03-27T02:15:30Z"),
    ],
    available_services=["user-service", "postgres-primary", "redis-cache", "api-gateway"],
    service_logs={
        "user-service": [
            _log("2026-03-27T02:10:00Z", "user-service", "INFO", "Deployment v2.3.1 completed successfully"),
            _log("2026-03-27T02:12:00Z", "user-service", "WARN", "DB query took 2800 ms for /api/users/profile"),
            _log("2026-03-27T02:13:00Z", "user-service", "ERROR", "Connection acquisition timeout after 5000 ms", "trace-a1b2"),
            _log("2026-03-27T02:13:30Z", "user-service", "ERROR", "Connection acquisition timeout after 5000 ms", "trace-c3d4"),
            _log("2026-03-27T02:14:00Z", "user-service", "ERROR", "Failed to acquire connection from pool: pool exhausted", "trace-e5f6"),
            _log("2026-03-27T02:14:30Z", "user-service", "WARN", "Retry #3 for DB connection – backing off 500 ms"),
            _log("2026-03-27T02:15:00Z", "user-service", "ERROR", "HTTP 503 returned for GET /api/users/profile", "trace-g7h8"),
        ],
        "postgres-primary": [
            _log("2026-03-27T02:08:00Z", "postgres-primary", "INFO", "Active connections: 120/200"),
            _log("2026-03-27T02:10:30Z", "postgres-primary", "WARN", "Active connections: 175/200"),
            _log("2026-03-27T02:12:00Z", "postgres-primary", "WARN", "Active connections: 190/200 – approaching limit"),
            _log("2026-03-27T02:13:00Z", "postgres-primary", "ERROR", "Active connections: 196/200 – new connection rejected"),
            _log("2026-03-27T02:14:00Z", "postgres-primary", "ERROR", "Connection count 198/200. Longest idle: 1800 s. Possible connection leak detected."),
            _log("2026-03-27T02:15:00Z", "postgres-primary", "ERROR", "Active connections: 200/200 – pool fully saturated"),
        ],
        "redis-cache": [
            _log("2026-03-27T02:15:00Z", "redis-cache", "INFO", "Memory usage: 45%. Operations normal."),
            _log("2026-03-27T02:15:00Z", "redis-cache", "INFO", "Hit rate: 94%. No evictions."),
        ],
        "api-gateway": [
            _log("2026-03-27T02:14:00Z", "api-gateway", "WARN", "Upstream user-service returning 503 for 8% of requests"),
            _log("2026-03-27T02:15:00Z", "api-gateway", "INFO", "All other upstream services healthy"),
        ],
    },
    service_metrics={
        "user-service": _metrics("user-service", 65.0, 58.0, 450.0, 0.12, 320.0, 4200.0),
        "postgres-primary": _metrics("postgres-primary", 78.0, 72.0, 450.0, 0.05, 45.0, 890.0, connection_pool_pct=98.0),
        "redis-cache": _metrics("redis-cache", 15.0, 45.0, 1200.0, 0.001, 1.2, 3.5),
        "api-gateway": _metrics("api-gateway", 22.0, 30.0, 2200.0, 0.08, 85.0, 4500.0),
    },
    correct_severity=IncidentSeverity.P2,
    correct_root_cause_service="postgres-primary",
    correct_root_cause_keywords=["connection pool", "connection leak", "pool exhaustion", "pool saturated", "connection exhaustion"],
    valid_remediation_actions=[
        {"action": "restart", "service": "user-service"},
        {"action": "config_change", "service": "postgres-primary", "detail": "increase pool size"},
    ],
    expected_escalation_teams=["database-team"],
    max_steps=10,
    degradation_per_step=0.005,
    relevant_services=["user-service", "postgres-primary"],
    # Blast radius: connection pool fully saturates, user-service error rate climbs
    blast_radius={
        "postgres-primary": {
            "connection_pool_pct": (0.5, 100.0),   # +0.5%/step → caps at 100%
        },
        "user-service": {
            "error_rate": (0.02, 0.60),             # +2pp/step → caps at 60%
            "latency_p99_ms": (200.0, 10000.0),     # +200ms/step → caps at 10s
        },
    },
)


# ==========================================================================
# SCENARIO 2 – Medium: Payment Processing Failure
# ==========================================================================

_SCENARIO_MEDIUM = Scenario(
    scenario_id="payment-failure-001",
    task_id="root_cause_analysis",
    incident_id="INC-20260327-002",
    description=(
        "Payment success rate has dropped sharply. Multiple services show "
        "degradation. Investigate the services, identify the root cause, "
        "classify severity, and apply the correct remediation."
    ),
    initial_alerts=[
        _alert("ALT-010", "payment-gateway", AlertSeverity.CRITICAL,
               "Payment success rate dropped to 45% (threshold: 95%)", "2026-03-27T09:30:00Z"),
        _alert("ALT-011", "payment-processor", AlertSeverity.WARNING,
               "Timeout errors increased 10x in last 10 minutes", "2026-03-27T09:30:30Z"),
        _alert("ALT-012", "redis-session", AlertSeverity.WARNING,
               "Key eviction rate spike: 1500 evictions/min (normal: <10)", "2026-03-27T09:29:00Z"),
        _alert("ALT-013", "order-service", AlertSeverity.WARNING,
               "Error rate elevated to 8%", "2026-03-27T09:31:00Z"),
    ],
    available_services=["payment-gateway", "payment-processor", "redis-session", "order-service", "user-service", "postgres-primary"],
    service_logs={
        "payment-gateway": [
            _log("2026-03-27T09:25:00Z", "payment-gateway", "INFO", "Processing 320 payments/min"),
            _log("2026-03-27T09:28:00Z", "payment-gateway", "WARN", "Payment token validation failed: token not found in session store", "trace-pay-01"),
            _log("2026-03-27T09:28:30Z", "payment-gateway", "ERROR", "Payment failed: session token expired or missing for txn TXN-8842", "trace-pay-02"),
            _log("2026-03-27T09:29:00Z", "payment-gateway", "ERROR", "Batch failure: 55% of payment attempts failing with SESSION_TOKEN_MISSING"),
            _log("2026-03-27T09:30:00Z", "payment-gateway", "ERROR", "Success rate critical: 45%. All failures correlate with session token lookup errors."),
        ],
        "payment-processor": [
            _log("2026-03-27T09:28:00Z", "payment-processor", "WARN", "Upstream payment-gateway sending incomplete requests"),
            _log("2026-03-27T09:29:00Z", "payment-processor", "ERROR", "Timeout waiting for payment-gateway response: 12 s", "trace-pp-01"),
            _log("2026-03-27T09:30:00Z", "payment-processor", "WARN", "Retry queue depth: 450 (normal: <20)"),
        ],
        "redis-session": [
            _log("2026-03-27T09:20:00Z", "redis-session", "INFO", "Memory usage: 95%. Approaching maxmemory limit (4 GB)."),
            _log("2026-03-27T09:22:00Z", "redis-session", "WARN", "maxmemory reached. Eviction policy: allkeys-lru. Beginning evictions."),
            _log("2026-03-27T09:25:00Z", "redis-session", "WARN", "Evicted 800 keys in last 3 minutes. Active sessions being evicted."),
            _log("2026-03-27T09:28:00Z", "redis-session", "ERROR", "Eviction rate critical: 1500 keys/min. Payment session tokens are being evicted before use."),
            _log("2026-03-27T09:30:00Z", "redis-session", "ERROR", "Memory at 100%. Continuous eviction. Session TTL effectively reduced from 30 min to ~45 s."),
        ],
        "order-service": [
            _log("2026-03-27T09:30:00Z", "order-service", "WARN", "Downstream payment-gateway returning errors for order confirmations"),
            _log("2026-03-27T09:31:00Z", "order-service", "ERROR", "8% of orders failing at payment step – propagated from payment-gateway"),
        ],
        "user-service": [
            _log("2026-03-27T09:30:00Z", "user-service", "INFO", "All endpoints healthy. Latency normal."),
        ],
        "postgres-primary": [
            _log("2026-03-27T09:30:00Z", "postgres-primary", "INFO", "Active connections: 85/200. Operations normal."),
        ],
    },
    service_metrics={
        "payment-gateway": _metrics("payment-gateway", 45.0, 52.0, 320.0, 0.55, 250.0, 12000.0, payment_success_rate=0.45),
        "payment-processor": _metrics("payment-processor", 35.0, 40.0, 150.0, 0.30, 180.0, 8000.0),
        "redis-session": _metrics("redis-session", 30.0, 99.5, 5000.0, 0.02, 0.8, 2.5, memory_used_gb=3.98, evictions_per_min=1500.0),
        "order-service": _metrics("order-service", 28.0, 35.0, 200.0, 0.08, 120.0, 950.0),
        "user-service": _metrics("user-service", 20.0, 32.0, 400.0, 0.002, 45.0, 120.0),
        "postgres-primary": _metrics("postgres-primary", 40.0, 55.0, 300.0, 0.001, 12.0, 45.0),
    },
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="redis-session",
    correct_root_cause_keywords=[
        "redis", "memory", "eviction", "session token", "maxmemory",
        "session eviction", "cache eviction", "redis memory",
    ],
    valid_remediation_actions=[
        {"action": "scale", "service": "redis-session"},
        {"action": "config_change", "service": "redis-session", "detail": "increase maxmemory"},
        {"action": "restart", "service": "redis-session"},
    ],
    expected_escalation_teams=["payments-team", "platform-team"],
    max_steps=15,
    degradation_per_step=0.01,
    relevant_services=["payment-gateway", "redis-session", "payment-processor"],
    # Blast radius: Redis keeps evicting, payment success rate collapses
    blast_radius={
        "redis-session": {
            "evictions_per_min": (150.0, 5000.0),  # +150 evictions/min/step
            "memory_used_gb": (0.005, 4.0),         # creeps toward hard limit
        },
        "payment-gateway": {
            "payment_success_rate": (-0.04, 0.05),  # drops 4pp/step → 5% floor
            "error_rate": (0.03, 0.90),
        },
        "order-service": {
            "error_rate": (0.02, 0.50),
        },
    },
)


# ==========================================================================
# SCENARIO 3 – Hard: Cascading Multi-Service Outage
# ==========================================================================

_SCENARIO_HARD = Scenario(
    scenario_id="cascading-outage-001",
    task_id="full_incident_management",
    incident_id="INC-20260327-003",
    description=(
        "Multiple services are degraded simultaneously. The API gateway is "
        "returning 503s, the auth service has extreme latency, and downstream "
        "services are failing. This is a cascading outage. You must triage, "
        "investigate, identify the root cause, remediate, escalate, and "
        "communicate status updates."
    ),
    initial_alerts=[
        _alert("ALT-100", "api-gateway", AlertSeverity.CRITICAL,
               "503 error rate at 35% across all endpoints", "2026-03-27T14:00:00Z"),
        _alert("ALT-101", "auth-service", AlertSeverity.CRITICAL,
               "p99 latency > 5000 ms (threshold: 200 ms)", "2026-03-27T14:00:30Z"),
        _alert("ALT-102", "order-service", AlertSeverity.WARNING,
               "Message queue depth growing: 15000 (normal: <500)", "2026-03-27T14:01:00Z"),
        _alert("ALT-103", "notification-service", AlertSeverity.WARNING,
               "Connection timeout to auth-service: 100% failure rate", "2026-03-27T14:01:30Z"),
        _alert("ALT-104", "cdn-static", AlertSeverity.INFO,
               "Cache miss rate elevated to 15% (normal: 2%)", "2026-03-27T14:02:00Z"),
        _alert("ALT-105", "user-service", AlertSeverity.WARNING,
               "Intermittent HTTP 401 responses (token validation failing)", "2026-03-27T14:01:00Z"),
        _alert("ALT-106", "deployment-tracker", AlertSeverity.CRITICAL,
               "auth-service v3.1.0 deployed at 13:47 — memory climb started immediately. Escalate to auth-team and platform-team.", "2026-03-27T14:02:00Z"),
    ],
    available_services=[
        "api-gateway", "auth-service", "user-service",
        "order-service", "notification-service", "cdn-static",
        "postgres-primary", "redis-auth-cache",
    ],
    service_logs={
        "api-gateway": [
            _log("2026-03-27T13:58:00Z", "api-gateway", "INFO", "All upstreams healthy. Traffic: 5500 req/s."),
            _log("2026-03-27T14:00:00Z", "api-gateway", "ERROR", "Upstream auth-service: 503 for 35% of auth checks"),
            _log("2026-03-27T14:00:30Z", "api-gateway", "ERROR", "Circuit breaker OPEN for auth-service after 50 consecutive failures"),
            _log("2026-03-27T14:01:00Z", "api-gateway", "ERROR", "Cascading: requests requiring auth are failing. Public endpoints OK."),
        ],
        "auth-service": [
            _log("2026-03-27T13:45:00Z", "auth-service", "INFO", "Deployment v3.1.0 started (canary 10%)"),
            _log("2026-03-27T13:47:00Z", "auth-service", "INFO", "Deployment v3.1.0 promoted to 100%"),
            _log("2026-03-27T13:50:00Z", "auth-service", "WARN", "Memory usage climbing: 72% (was 45% before deploy)"),
            _log("2026-03-27T13:55:00Z", "auth-service", "WARN", "Memory usage: 88%. GC pauses increasing: avg 350 ms"),
            _log("2026-03-27T13:58:00Z", "auth-service", "ERROR", "Memory usage: 95%. GC pause: 2100 ms. Requests timing out."),
            _log("2026-03-27T14:00:00Z", "auth-service", "ERROR", "OOMKill risk. Memory: 97%. Token validation taking 4800 ms avg."),
            _log("2026-03-27T14:00:30Z", "auth-service", "ERROR", "v3.1.0 changelog: 'Refactored token cache to in-memory store' – possible unbounded cache growth"),
            _log("2026-03-27T14:01:00Z", "auth-service", "ERROR", "Pod restarts: 3 in last 5 min due to OOMKill. Service effectively down."),
        ],
        "user-service": [
            _log("2026-03-27T14:00:00Z", "user-service", "WARN", "Auth token validation calls timing out"),
            _log("2026-03-27T14:01:00Z", "user-service", "ERROR", "Returning 401 for 40% of requests – cannot validate tokens with auth-service"),
        ],
        "order-service": [
            _log("2026-03-27T14:00:00Z", "order-service", "WARN", "Order processing slowing – auth dependency failing"),
            _log("2026-03-27T14:01:00Z", "order-service", "ERROR", "Queue depth: 15000. Orders stuck awaiting auth validation."),
            _log("2026-03-27T14:02:00Z", "order-service", "ERROR", "Queue depth: 25000. Risk of message broker disk overflow."),
        ],
        "notification-service": [
            _log("2026-03-27T14:01:00Z", "notification-service", "ERROR", "Cannot reach auth-service. All notification deliveries paused."),
            _log("2026-03-27T14:02:00Z", "notification-service", "WARN", "Buffered 8000 pending notifications."),
        ],
        "cdn-static": [
            _log("2026-03-27T14:00:00Z", "cdn-static", "INFO", "Cache miss rate elevated. Likely due to increased full page reloads from client-side auth failures."),
            _log("2026-03-27T14:02:00Z", "cdn-static", "INFO", "No CDN-side issues detected. Origin healthy."),
        ],
        "postgres-primary": [
            _log("2026-03-27T14:00:00Z", "postgres-primary", "INFO", "Connections: 90/200. Query performance normal."),
        ],
        "redis-auth-cache": [
            _log("2026-03-27T14:00:00Z", "redis-auth-cache", "INFO", "Memory: 30%. Operations normal."),
            _log("2026-03-27T14:00:30Z", "redis-auth-cache", "WARN", "Cache hit rate dropped from 92% to 15%. auth-service v3.1.0 appears to bypass cache."),
        ],
    },
    service_metrics={
        "api-gateway": _metrics("api-gateway", 55.0, 40.0, 5500.0, 0.35, 150.0, 8500.0),
        "auth-service": _metrics("auth-service", 95.0, 97.0, 800.0, 0.65, 2500.0, 5200.0, gc_pause_ms=2100.0, pod_restarts=3.0),
        "user-service": _metrics("user-service", 30.0, 35.0, 400.0, 0.40, 80.0, 4800.0),
        "order-service": _metrics("order-service", 40.0, 45.0, 200.0, 0.25, 300.0, 3500.0, queue_depth=15000.0),
        "notification-service": _metrics("notification-service", 10.0, 20.0, 0.0, 1.0, 0.0, 0.0),
        "cdn-static": _metrics("cdn-static", 12.0, 18.0, 8000.0, 0.001, 8.0, 25.0, cache_miss_rate=0.15),
        "postgres-primary": _metrics("postgres-primary", 38.0, 52.0, 250.0, 0.001, 10.0, 40.0),
        "redis-auth-cache": _metrics("redis-auth-cache", 12.0, 30.0, 2000.0, 0.005, 0.5, 1.8, cache_hit_rate=0.15),
    },
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="auth-service",
    correct_root_cause_keywords=[
        "memory leak", "v3.1.0", "deployment", "oom", "unbounded cache",
        "in-memory", "bad deployment", "auth-service deployment",
        "token cache", "gc pause", "out of memory",
    ],
    valid_remediation_actions=[
        {"action": "rollback", "service": "auth-service"},
        {"action": "restart", "service": "auth-service"},
        {"action": "scale", "service": "order-service"},
        {"action": "restart", "service": "order-service"},
    ],
    expected_escalation_teams=["platform-team", "auth-team"],
    max_steps=20,
    degradation_per_step=0.015,
    relevant_services=["auth-service", "api-gateway", "redis-auth-cache", "order-service"],
    # Blast radius: auth-service OOMKills more often, order queue grows unbounded
    blast_radius={
        "auth-service": {
            "memory_percent": (0.5, 100.0),        # +0.5%/step → OOM at 100%
            "error_rate": (0.02, 0.95),             # cascades toward full outage
            "latency_p99_ms": (100.0, 15000.0),
            "pod_restarts": (0.3, 15.0),            # accumulating restarts
        },
        "order-service": {
            "queue_depth": (1500.0, 100000.0),      # queue grows 1500/step
            "error_rate": (0.02, 0.80),
        },
        "api-gateway": {
            "error_rate": (0.015, 0.70),            # more requests fail over time
        },
        "user-service": {
            "error_rate": (0.02, 0.80),
        },
    },
)


# ==========================================================================
# SCENARIO 1-B – Easy variant: Disk space exhaustion on log volume
# ==========================================================================

_SCENARIO_EASY_B = Scenario(
    scenario_id="disk-full-001",
    task_id="severity_classification",
    incident_id="INC-20260327-101",
    description=(
        "The search-service and its underlying Elasticsearch cluster are "
        "experiencing errors. Alerts indicate disk usage is critically high. "
        "Classify the incident severity."
    ),
    initial_alerts=[
        _alert("ALT-201", "elasticsearch", AlertSeverity.CRITICAL,
               "Disk usage at 95% on data node es-node-01", "2026-03-27T06:10:00Z"),
        _alert("ALT-202", "search-service", AlertSeverity.WARNING,
               "Bulk indexing failures: 400% increase", "2026-03-27T06:10:30Z"),
        _alert("ALT-203", "elasticsearch", AlertSeverity.WARNING,
               "write.low_watermark crossed – shard allocation blocked", "2026-03-27T06:09:00Z"),
    ],
    available_services=["search-service", "elasticsearch", "kibana", "log-aggregator"],
    service_logs={
        "search-service": [
            _log("2026-03-27T06:08:00Z", "search-service", "WARN", "Indexing queue backing up: 12000 documents pending"),
            _log("2026-03-27T06:09:00Z", "search-service", "ERROR", "BulkIndexException: ClusterBlockException[blocked: FORBIDDEN/12/index]"),
            _log("2026-03-27T06:10:00Z", "search-service", "ERROR", "Search degraded – last index refresh 8 min ago. Serving stale results."),
        ],
        "elasticsearch": [
            _log("2026-03-27T06:05:00Z", "elasticsearch", "WARN", "Disk usage: 90% on es-node-01. Threshold: 85%."),
            _log("2026-03-27T06:07:00Z", "elasticsearch", "WARN", "Disk: 93%. flood_stage watermark approaching."),
            _log("2026-03-27T06:09:00Z", "elasticsearch", "ERROR", "Disk: 95%. flood_stage reached. All indices set to read-only."),
            _log("2026-03-27T06:10:00Z", "elasticsearch", "ERROR", "Shard allocation disabled. Cluster status: YELLOW. Write ops blocked."),
        ],
        "kibana": [
            _log("2026-03-27T06:10:00Z", "kibana", "INFO", "Dashboard loading normally. Read-only ops unaffected."),
        ],
        "log-aggregator": [
            _log("2026-03-27T06:09:00Z", "log-aggregator", "WARN", "Log shipping to elasticsearch failing. Retrying. Buffer: 50000 lines."),
        ],
    },
    service_metrics={
        "search-service": _metrics("search-service", 42.0, 50.0, 200.0, 0.35, 180.0, 2200.0),
        "elasticsearch": _metrics("elasticsearch", 60.0, 80.0, 50.0, 0.40, 200.0, 5000.0, disk_pct=95.0),
        "kibana": _metrics("kibana", 15.0, 25.0, 30.0, 0.0, 90.0, 350.0),
        "log-aggregator": _metrics("log-aggregator", 25.0, 35.0, 300.0, 0.15, 50.0, 400.0),
    },
    correct_severity=IncidentSeverity.P2,
    correct_root_cause_service="elasticsearch",
    correct_root_cause_keywords=["disk", "disk full", "disk space", "flood_stage", "watermark", "read-only", "disk usage"],
    valid_remediation_actions=[
        {"action": "config_change", "service": "elasticsearch", "detail": "clear read-only flag"},
        {"action": "scale", "service": "elasticsearch"},
    ],
    expected_escalation_teams=["infrastructure-team"],
    max_steps=10,
    degradation_per_step=0.005,
    relevant_services=["search-service", "elasticsearch"],
)


# ==========================================================================
# SCENARIO 2-B – Medium variant: Slow memory leak in background worker
# ==========================================================================

_SCENARIO_MEDIUM_B = Scenario(
    scenario_id="worker-memleak-001",
    task_id="root_cause_analysis",
    incident_id="INC-20260327-102",
    description=(
        "The report-generation service is timing out and users cannot export "
        "data. Multiple related services show elevated errors. Find the true "
        "root cause, classify severity, diagnose, and remediate."
    ),
    initial_alerts=[
        _alert("ALT-210", "report-service", AlertSeverity.CRITICAL,
               "Request timeout rate 60% for /api/export", "2026-03-27T11:20:00Z"),
        _alert("ALT-211", "worker-pool", AlertSeverity.WARNING,
               "Worker memory usage: 94% (4 of 5 workers OOMKilling)", "2026-03-27T11:19:00Z"),
        _alert("ALT-212", "s3-upload", AlertSeverity.WARNING,
               "Upload failures – 503s from report-service", "2026-03-27T11:20:30Z"),
        _alert("ALT-213", "postgres-reports", AlertSeverity.INFO,
               "Long-running queries detected: 5 queries > 10 s", "2026-03-27T11:18:00Z"),
        _alert("ALT-214", "health-monitor", AlertSeverity.INFO,
               "Core services healthy: payment, auth, user-api all nominal. Issue isolated to report-export subsystem.", "2026-03-27T11:20:00Z"),
    ],
    available_services=["report-service", "worker-pool", "s3-upload", "postgres-reports", "redis-cache", "api-gateway"],
    service_logs={
        "report-service": [
            _log("2026-03-27T11:15:00Z", "report-service", "INFO", "Report job queued: RPT-9981, format: xlsx, rows: 1M"),
            _log("2026-03-27T11:16:00Z", "report-service", "WARN", "Worker RPT-9981 memory: 2.1 GB (limit 2 GB). Nearing OOM."),
            _log("2026-03-27T11:18:00Z", "report-service", "ERROR", "Worker OOMKilled during xlsx serialization. Job failed."),
            _log("2026-03-27T11:19:00Z", "report-service", "ERROR", "3 concurrent OOMKills. Export endpoint returning 503."),
        ],
        "worker-pool": [
            _log("2026-03-27T11:10:00Z", "worker-pool", "INFO", "Workers: 5 active, 0 idle. Load: nominal."),
            _log("2026-03-27T11:14:00Z", "worker-pool", "WARN", "Worker memory climbing. Suspected unbounded row accumulation in xlsx writer."),
            _log("2026-03-27T11:17:00Z", "worker-pool", "ERROR", "Worker #3 OOMKilled. Memory at 100%."),
            _log("2026-03-27T11:19:00Z", "worker-pool", "ERROR", "4/5 workers OOMKilled. Effective worker capacity: 1. Queue depth: 45."),
            _log("2026-03-27T11:19:30Z", "worker-pool", "ERROR", "Root cause: xlsx writer buffers all rows in memory before flushing. No streaming."),
        ],
        "s3-upload": [
            _log("2026-03-27T11:20:00Z", "s3-upload", "WARN", "Upstream report-service returning 503. S3 uploads queued."),
        ],
        "postgres-reports": [
            _log("2026-03-27T11:17:00Z", "postgres-reports", "INFO", "Large sequential scan for 1M row export. Query time: 12 s. This is normal for large exports."),
        ],
        "redis-cache": [_log("2026-03-27T11:20:00Z", "redis-cache", "INFO", "Operations normal.")],
        "api-gateway": [_log("2026-03-27T11:20:00Z", "api-gateway", "WARN", "report-service upstream: 60% 503 errors.")],
    },
    service_metrics={
        "report-service": _metrics("report-service", 55.0, 75.0, 10.0, 0.60, 8000.0, 30000.0),
        "worker-pool": _metrics("worker-pool", 90.0, 94.0, 5.0, 0.80, 15000.0, 60000.0, oom_kills=4.0),
        "s3-upload": _metrics("s3-upload", 10.0, 15.0, 2.0, 0.60, 500.0, 3000.0),
        "postgres-reports": _metrics("postgres-reports", 55.0, 60.0, 15.0, 0.0, 200.0, 12000.0),
        "redis-cache": _metrics("redis-cache", 12.0, 30.0, 500.0, 0.0, 1.0, 3.0),
        "api-gateway": _metrics("api-gateway", 20.0, 28.0, 800.0, 0.08, 80.0, 2000.0),
    },
    correct_severity=IncidentSeverity.P2,
    correct_root_cause_service="worker-pool",
    correct_root_cause_keywords=["memory", "oom", "out of memory", "xlsx", "buffering", "unbounded", "memory leak", "worker memory", "worker", "oomkill", "streaming", "row accumulation"],
    # Note: P2 not P1 — only the report-export subsystem is affected, core services healthy.
    valid_remediation_actions=[
        {"action": "restart", "service": "worker-pool"},
        {"action": "scale", "service": "worker-pool"},
        {"action": "config_change", "service": "worker-pool", "detail": "enable streaming"},
    ],
    expected_escalation_teams=["backend-team", "platform-team"],
    max_steps=15,
    degradation_per_step=0.008,
    relevant_services=["report-service", "worker-pool"],
)


# ==========================================================================
# SCENARIO 3-B – Hard variant: Kubernetes node pressure / pod eviction cascade
# ==========================================================================

_SCENARIO_HARD_B = Scenario(
    scenario_id="k8s-node-pressure-001",
    task_id="full_incident_management",
    incident_id="INC-20260327-004",
    description=(
        "Multiple pods are being evicted across the cluster. The checkout "
        "service is returning 502s, node-exporter reports memory pressure on "
        "three nodes, and the HPA has been scaling aggressively. This is a "
        "node-level resource exhaustion event triggered by an HPA/resource-limit "
        "misconfiguration. Full incident management required."
    ),
    initial_alerts=[
        _alert("ALT-200", "checkout-service", AlertSeverity.CRITICAL,
               "502 error rate 28% across checkout endpoints", "2026-03-27T16:00:00Z"),
        _alert("ALT-201", "k8s-node-01", AlertSeverity.CRITICAL,
               "MemoryPressure=True — 3/8 pods evicted in last 5 min", "2026-03-27T16:00:30Z"),
        _alert("ALT-202", "k8s-node-02", AlertSeverity.WARNING,
               "MemoryPressure=True — node at 92% memory", "2026-03-27T16:01:00Z"),
        _alert("ALT-203", "hpa-controller", AlertSeverity.WARNING,
               "HPA for recommendation-service scaled to maxReplicas=20 (was 4)", "2026-03-27T15:55:00Z"),
        _alert("ALT-204", "cart-service", AlertSeverity.WARNING,
               "Downstream checkout-service returning 502s for 35% of cart completions", "2026-03-27T16:01:30Z"),
        _alert("ALT-205", "cdn-static", AlertSeverity.INFO,
               "Slight latency increase: p99 68ms (normal: 20ms)", "2026-03-27T16:02:00Z"),
    ],
    available_services=[
        "checkout-service", "k8s-node-01", "k8s-node-02",
        "recommendation-service", "cart-service", "hpa-controller",
        "cdn-static", "postgres-checkout",
    ],
    service_logs={
        "checkout-service": [
            _log("2026-03-27T15:58:00Z", "checkout-service", "INFO", "Processing normally. 180 req/s."),
            _log("2026-03-27T15:59:30Z", "checkout-service", "WARN", "3 pods restarting. Connections dropped."),
            _log("2026-03-27T16:00:00Z", "checkout-service", "ERROR", "502 Bad Gateway — upstream recommendation-service pods unavailable"),
            _log("2026-03-27T16:01:00Z", "checkout-service", "ERROR", "Circuit breaker half-open. 28% of requests failing."),
        ],
        "k8s-node-01": [
            _log("2026-03-27T15:50:00Z", "k8s-node-01", "INFO", "Memory: 78%."),
            _log("2026-03-27T15:53:00Z", "k8s-node-01", "WARN", "Memory: 88%. kubelet setting eviction threshold."),
            _log("2026-03-27T15:56:00Z", "k8s-node-01", "ERROR", "Memory: 95%. OOM eviction beginning. Evicting low-priority pods."),
            _log("2026-03-27T15:58:00Z", "k8s-node-01", "ERROR", "Evicted: recommendation-service-7d8f (2 GB). Memory: 91%."),
            _log("2026-03-27T16:00:00Z", "k8s-node-01", "ERROR", "Memory back to 95%. HPA-spawned recommendation-service pods consuming all available memory."),
        ],
        "k8s-node-02": [
            _log("2026-03-27T15:58:00Z", "k8s-node-02", "WARN", "Memory: 90%. recommendation-service HPA placed 6 new pods here."),
            _log("2026-03-27T16:00:30Z", "k8s-node-02", "ERROR", "Memory: 92%. Approaching eviction threshold."),
        ],
        "recommendation-service": [
            _log("2026-03-27T15:45:00Z", "recommendation-service", "INFO", "Memory usage tracking: v2.4.0 deployed. ML model loaded."),
            _log("2026-03-27T15:50:00Z", "recommendation-service", "WARN", "Each pod consuming 2.1 GB (limit: 2.0 GB) — requests.memory too low."),
            _log("2026-03-27T15:53:00Z", "recommendation-service", "WARN", "HPA triggered: latency spike caused scale-out. 8→12 pods"),
            _log("2026-03-27T15:57:00Z", "recommendation-service", "ERROR", "HPA at maxReplicas=20. 20 pods × 2.1 GB = 42 GB on nodes with 32 GB capacity."),
            _log("2026-03-27T16:00:00Z", "recommendation-service", "ERROR", "Pod eviction loop: evicted pods restart, consume memory, trigger eviction again."),
        ],
        "hpa-controller": [
            _log("2026-03-27T15:52:00Z", "hpa-controller", "INFO", "recommendation-service: scaling 4→8 due to latency"),
            _log("2026-03-27T15:55:00Z", "hpa-controller", "WARN", "recommendation-service: scaling 8→20 (maxReplicas). Memory requests underspecified."),
            _log("2026-03-27T16:00:00Z", "hpa-controller", "ERROR", "Eviction loop detected. Scaling is worsening node pressure."),
        ],
        "cart-service": [
            _log("2026-03-27T16:01:00Z", "cart-service", "WARN", "Checkout dependency failing. 35% cart completions blocked."),
        ],
        "cdn-static": [
            _log("2026-03-27T16:02:00Z", "cdn-static", "INFO", "Slight latency increase correlates with client retries. No CDN-side issue."),
        ],
        "postgres-checkout": [
            _log("2026-03-27T16:00:00Z", "postgres-checkout", "INFO", "All queries normal. Connections: 45/200."),
        ],
    },
    service_metrics={
        "checkout-service": _metrics("checkout-service", 55.0, 60.0, 180.0, 0.28, 200.0, 5500.0),
        "k8s-node-01":       _metrics("k8s-node-01",       70.0, 95.0,   0.0, 0.0,   0.0,    0.0, evicted_pods=3.0),
        "k8s-node-02":       _metrics("k8s-node-02",       65.0, 92.0,   0.0, 0.0,   0.0,    0.0),
        "recommendation-service": _metrics("recommendation-service", 85.0, 105.0, 80.0, 0.60, 800.0, 12000.0, memory_per_pod_gb=2.1, pod_count=20.0),
        "cart-service":      _metrics("cart-service",      30.0, 35.0, 250.0, 0.15,  90.0, 2200.0),
        "hpa-controller":    _metrics("hpa-controller",    10.0, 15.0,   0.0, 0.0,   0.0,    0.0, current_replicas=20.0),
        "cdn-static":        _metrics("cdn-static",        10.0, 12.0, 9000.0, 0.001, 12.0,  68.0),
        "postgres-checkout": _metrics("postgres-checkout", 35.0, 48.0, 200.0, 0.001, 12.0,   38.0),
    },
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="recommendation-service",
    correct_root_cause_keywords=[
        "memory request", "resource limit", "hpa", "eviction loop", "pod eviction",
        "memory limit", "recommendation-service memory", "node pressure",
        "oom eviction", "hpa scale", "memory requests underspecified",
    ],
    valid_remediation_actions=[
        {"action": "config_change", "service": "recommendation-service"},
        {"action": "scale",         "service": "recommendation-service"},
        {"action": "restart",       "service": "recommendation-service"},
        {"action": "config_change", "service": "hpa-controller"},
    ],
    expected_escalation_teams=["platform-team", "sre-team"],
    max_steps=20,
    degradation_per_step=0.015,
    relevant_services=["recommendation-service", "k8s-node-01", "hpa-controller", "checkout-service"],
    blast_radius={
        "recommendation-service": {
            "error_rate": (0.03, 0.95),
            "pod_count":  (0.5,  20.0),
        },
        "k8s-node-01": {
            "memory_percent": (0.4, 100.0),
            "evicted_pods":   (0.4,  20.0),
        },
        "k8s-node-02": {
            "memory_percent": (0.5, 100.0),
        },
        "checkout-service": {
            "error_rate": (0.025, 0.85),
        },
    },
)


# ==========================================================================
# SCENARIO 3-C – Hard variant: Database failover split-brain
# ==========================================================================

_SCENARIO_HARD_C = Scenario(
    scenario_id="db-failover-race-001",
    task_id="full_incident_management",
    incident_id="INC-20260327-005",
    description=(
        "The primary PostgreSQL instance failed over to the replica 18 minutes "
        "ago but several services still route writes to the old primary (now "
        "read-only) because pgbouncer's connection string was never updated. "
        "A split-brain scenario is actively corrupting order state. Full "
        "incident commander workflow required: triage, diagnose, remediate, "
        "escalate, communicate."
    ),
    initial_alerts=[
        _alert("ALT-300", "order-service", AlertSeverity.CRITICAL,
               "Write failures: 65% of order commits failing with ReadOnlyError", "2026-03-27T18:10:00Z"),
        _alert("ALT-301", "postgres-primary-old", AlertSeverity.CRITICAL,
               "Instance is READ-ONLY (promoted replica took writes 18 min ago)", "2026-03-27T18:10:30Z"),
        _alert("ALT-302", "postgres-replica-new", AlertSeverity.WARNING,
               "Becoming primary: only 30% of expected write traffic received", "2026-03-27T18:11:00Z"),
        _alert("ALT-303", "payment-service", AlertSeverity.WARNING,
               "Double-charge risk: orders appearing in both DB instances for 8% of txns", "2026-03-27T18:11:30Z"),
        _alert("ALT-304", "inventory-service", AlertSeverity.WARNING,
               "Stock deduction failing silently: items over-sold", "2026-03-27T18:12:00Z"),
        _alert("ALT-305", "monitoring-dashboard", AlertSeverity.INFO,
               "DB failover event recorded at 2026-03-27T17:52:00Z", "2026-03-27T18:12:30Z"),
        _alert("ALT-306", "pgbouncer", AlertSeverity.CRITICAL,
               "pgbouncer still routing ALL writes to postgres-primary-old (read-only). Connection string not updated after failover.", "2026-03-27T18:13:00Z"),
    ],
    available_services=[
        "order-service", "postgres-primary-old", "postgres-replica-new",
        "payment-service", "inventory-service", "config-service",
        "monitoring-dashboard", "pgbouncer",
    ],
    service_logs={
        "order-service": [
            _log("2026-03-27T17:52:00Z", "order-service", "WARN", "DB failover detected. Using cached connection string."),
            _log("2026-03-27T17:55:00Z", "order-service", "ERROR", "INSERT failed: ERROR: cannot execute INSERT in a read-only transaction"),
            _log("2026-03-27T18:00:00Z", "order-service", "ERROR", "65% of order writes failing. Service still pointing to old primary."),
            _log("2026-03-27T18:10:00Z", "order-service", "ERROR", "Connection pool: all connections to postgres-primary-old. Failover not propagated."),
        ],
        "postgres-primary-old": [
            _log("2026-03-27T17:52:00Z", "postgres-primary-old", "WARN", "Promotion event: replica assumed primary role. This instance now read-only."),
            _log("2026-03-27T18:05:00Z", "postgres-primary-old", "ERROR", "Receiving 1800 write attempts/min from services — all rejected (read-only)."),
            _log("2026-03-27T18:10:00Z", "postgres-primary-old", "ERROR", "Active connections: 198/200. Service retry loops filling pool."),
        ],
        "postgres-replica-new": [
            _log("2026-03-27T17:52:00Z", "postgres-replica-new", "INFO", "Promoted to primary. Accepting writes."),
            _log("2026-03-27T18:05:00Z", "postgres-replica-new", "WARN", "Only 30% of expected write traffic received. Split-brain suspected."),
            _log("2026-03-27T18:10:00Z", "postgres-replica-new", "WARN", "Diverging from old primary: 1240 transactions only in new primary."),
        ],
        "payment-service": [
            _log("2026-03-27T18:05:00Z", "payment-service", "ERROR", "Idempotency check failing: order state inconsistent between DB instances"),
            _log("2026-03-27T18:10:00Z", "payment-service", "ERROR", "8% txn double-charge risk. Halting charge processing for affected orders."),
        ],
        "inventory-service": [
            _log("2026-03-27T18:05:00Z", "inventory-service", "ERROR", "Stock deduction writes going to old primary (read-only) — silently lost."),
            _log("2026-03-27T18:10:00Z", "inventory-service", "ERROR", "Oversold items: 340 SKUs with negative virtual stock. Revenue impact growing."),
        ],
        "config-service": [
            _log("2026-03-27T17:52:00Z", "config-service", "INFO", "DB failover event received. Updated DB_PRIMARY_HOST in config store."),
            _log("2026-03-27T17:52:30Z", "config-service", "WARN", "Config propagation: order-service and payment-service did NOT acknowledge new config."),
            _log("2026-03-27T17:55:00Z", "config-service", "ERROR", "Config ack missing for 4/8 services. Manual pgbouncer reload required."),
        ],
        "pgbouncer": [
            _log("2026-03-27T17:52:00Z", "pgbouncer", "WARN", "Failover detected. pgbouncer config NOT auto-updated (static connection string)."),
            _log("2026-03-27T18:10:00Z", "pgbouncer", "ERROR", "Routing 100% of writes to postgres-primary-old (read-only). Update target_db required immediately."),
        ],
        "monitoring-dashboard": [
            _log("2026-03-27T17:52:00Z", "monitoring-dashboard", "INFO", "Auto-failover triggered at 17:52:00Z by health check failure on primary."),
            _log("2026-03-27T18:12:00Z", "monitoring-dashboard", "INFO", "Split-brain duration: 18 min. Financial impact estimate: $42,000 in at-risk transactions."),
        ],
    },
    service_metrics={
        "order-service":         _metrics("order-service",         55.0, 60.0,  800.0, 0.65, 300.0,  8000.0, write_failure_rate=0.65),
        "postgres-primary-old":  _metrics("postgres-primary-old",  80.0, 70.0, 1800.0, 1.0,    5.0,    50.0, is_read_only=1.0, connection_pct=99.0),
        "postgres-replica-new":  _metrics("postgres-replica-new",  30.0, 45.0,  600.0, 0.0,    8.0,    30.0, write_pct_expected=0.30),
        "payment-service":       _metrics("payment-service",       40.0, 45.0,  200.0, 0.25, 180.0,  3500.0, double_charge_risk_pct=0.08),
        "inventory-service":     _metrics("inventory-service",     35.0, 40.0,  300.0, 0.30, 120.0,  2500.0, oversold_skus=340.0),
        "config-service":        _metrics("config-service",        15.0, 20.0,   50.0, 0.10,  30.0,   200.0),
        "monitoring-dashboard":  _metrics("monitoring-dashboard",  10.0, 15.0,  100.0, 0.0,   50.0,   150.0),
        "pgbouncer":             _metrics("pgbouncer",             25.0, 30.0, 2000.0, 0.65,   2.0,     8.0, routing_to_old_primary=1.0),
    },
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="pgbouncer",
    correct_root_cause_keywords=[
        "pgbouncer", "connection string", "split-brain", "failover", "read-only",
        "config not propagated", "stale connection", "db routing", "pgbouncer config",
        "connection pool routing", "failover not propagated",
    ],
    valid_remediation_actions=[
        {"action": "config_change", "service": "pgbouncer"},
        {"action": "restart",       "service": "order-service"},
        {"action": "config_change", "service": "order-service"},
        {"action": "restart",       "service": "payment-service"},
    ],
    expected_escalation_teams=["database-team", "platform-team"],
    max_steps=20,
    degradation_per_step=0.02,
    relevant_services=["pgbouncer", "postgres-primary-old", "postgres-replica-new", "order-service"],
    blast_radius={
        "order-service": {
            "write_failure_rate": (0.02, 1.0),
            "error_rate":         (0.02, 0.95),
        },
        "inventory-service": {
            "oversold_skus":      (25.0, 5000.0),
            "error_rate":         (0.02, 0.80),
        },
        "payment-service": {
            "double_charge_risk_pct": (0.005, 0.30),
            "error_rate":             (0.02,  0.60),
        },
        "postgres-primary-old": {
            "connection_pct": (0.2, 100.0),
        },
    },
)


# ==========================================================================
# SCENARIO 1-C – Easy variant: DNS resolution failure
# ==========================================================================

_SCENARIO_EASY_C = Scenario(
    scenario_id="dns-fail-001",
    task_id="severity_classification",
    incident_id="INC-20260327-201",
    description=(
        "Multiple microservices are reporting connection timeouts to downstream "
        "dependencies. Alerts indicate DNS resolution failures across the "
        "internal service mesh. Classify the incident severity."
    ),
    initial_alerts=[
        _alert("ALT-301", "api-gateway", AlertSeverity.CRITICAL,
               "Upstream connection timeout rate 40% to backend services", "2026-03-27T14:00:00Z"),
        _alert("ALT-302", "coredns", AlertSeverity.CRITICAL,
               "DNS query failure rate 65% — SERVFAIL responses", "2026-03-27T13:58:00Z"),
        _alert("ALT-303", "notification-service", AlertSeverity.WARNING,
               "Failed to resolve smtp-relay.internal: NXDOMAIN", "2026-03-27T14:01:00Z"),
    ],
    available_services=["api-gateway", "coredns", "notification-service", "istio-proxy"],
    service_logs={
        "api-gateway": [
            _log("2026-03-27T13:58:00Z", "api-gateway", "ERROR", "upstream connect error: dns_resolution_failure for user-service.default.svc.cluster.local"),
            _log("2026-03-27T13:59:00Z", "api-gateway", "ERROR", "circuit breaker tripped: 5/10 upstream failures in 30s. Returning 503."),
            _log("2026-03-27T14:00:00Z", "api-gateway", "WARN", "Retry budget exhausted for payment-service. DNS not resolving."),
        ],
        "coredns": [
            _log("2026-03-27T13:55:00Z", "coredns", "WARN", "Cache miss rate increasing: 80%. Upstream forwarder slow."),
            _log("2026-03-27T13:57:00Z", "coredns", "ERROR", "OOMKilled: coredns-7d8f9b pod restarted. Memory limit 128Mi exceeded."),
            _log("2026-03-27T13:58:00Z", "coredns", "ERROR", "SERVFAIL for *.default.svc.cluster.local — upstream timeout after 5s"),
            _log("2026-03-27T14:00:00Z", "coredns", "ERROR", "Pod restart count: 4 in last 10 minutes. CrashLoopBackOff."),
        ],
        "notification-service": [
            _log("2026-03-27T14:00:00Z", "notification-service", "WARN", "Email delivery failing: cannot resolve smtp-relay.internal"),
        ],
        "istio-proxy": [
            _log("2026-03-27T14:00:00Z", "istio-proxy", "INFO", "Sidecar healthy. mTLS handshake OK. Issue is upstream DNS, not mesh."),
        ],
    },
    service_metrics={
        "api-gateway": _metrics("api-gateway", 25.0, 40.0, 1200.0, 0.40, 800.0, 5000.0),
        "coredns": _metrics("coredns", 95.0, 98.0, 5000.0, 0.65, 50.0, 5000.0, restart_count=4.0, cache_miss_pct=80.0),
        "notification-service": _metrics("notification-service", 10.0, 20.0, 50.0, 0.80, 200.0, 3000.0),
        "istio-proxy": _metrics("istio-proxy", 5.0, 10.0, 1200.0, 0.01, 2.0, 10.0),
    },
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="coredns",
    correct_root_cause_keywords=["dns", "coredns", "OOM", "memory", "resolution", "SERVFAIL", "CrashLoop"],
    valid_remediation_actions=[
        {"action": "restart", "service": "coredns"},
        {"action": "scale", "service": "coredns"},
        {"action": "config_change", "service": "coredns", "detail": "increase memory limit"},
    ],
    expected_escalation_teams=["platform-team"],
    max_steps=10,
    degradation_per_step=0.008,
    relevant_services=["api-gateway", "coredns"],
    blast_radius={
        "coredns": {
            "error_rate": (0.03, 0.95),
            "restart_count": (1.0, 15.0),
        },
        "api-gateway": {
            "error_rate": (0.03, 0.80),
            "latency_p99_ms": (500.0, 15000.0),
        },
    },
)


# ==========================================================================
# SCENARIO 2-C – Medium variant: TLS certificate expiry
# ==========================================================================

_SCENARIO_MEDIUM_C = Scenario(
    scenario_id="tls-expiry-001",
    task_id="root_cause_analysis",
    incident_id="INC-20260327-301",
    description=(
        "The checkout-service is returning 502 errors for all HTTPS calls to "
        "the payment provider API. Internal health checks pass but external "
        "payment calls fail. Diagnose the root cause and remediate."
    ),
    initial_alerts=[
        _alert("ALT-401", "checkout-service", AlertSeverity.CRITICAL,
               "Payment API calls failing: 502 rate 95%", "2026-03-27T09:00:00Z"),
        _alert("ALT-402", "cert-manager", AlertSeverity.WARNING,
               "Certificate renewal failed for payments.example.com — ACME challenge timeout", "2026-03-27T08:00:00Z"),
        _alert("ALT-403", "nginx-ingress", AlertSeverity.WARNING,
               "TLS handshake failures: 200/min on payments upstream", "2026-03-27T09:01:00Z"),
    ],
    available_services=["checkout-service", "cert-manager", "nginx-ingress", "payment-provider-stub"],
    service_logs={
        "checkout-service": [
            _log("2026-03-27T08:55:00Z", "checkout-service", "ERROR", "PaymentGatewayError: SSL certificate has expired (payments.example.com)"),
            _log("2026-03-27T08:58:00Z", "checkout-service", "ERROR", "javax.net.ssl.SSLHandshakeException: PKIX path validation failed: certificate expired at 2026-03-27T00:00:00Z"),
            _log("2026-03-27T09:00:00Z", "checkout-service", "ERROR", "Circuit breaker OPEN for payment-provider. 48/50 calls failed in 60s."),
        ],
        "cert-manager": [
            _log("2026-03-27T02:00:00Z", "cert-manager", "INFO", "Certificate renewal triggered for payments.example.com (expires in 24h)"),
            _log("2026-03-27T02:01:00Z", "cert-manager", "ERROR", "ACME HTTP-01 challenge failed: upstream DNS not resolving challenge token"),
            _log("2026-03-27T02:05:00Z", "cert-manager", "ERROR", "Retry 3/3 failed. Certificate NOT renewed. Expiry: 2026-03-27T00:00:00Z"),
            _log("2026-03-27T08:00:00Z", "cert-manager", "CRITICAL", "Certificate EXPIRED: payments.example.com. Last valid: 2026-03-26T23:59:59Z"),
        ],
        "nginx-ingress": [
            _log("2026-03-27T09:00:00Z", "nginx-ingress", "ERROR", "SSL_do_handshake() failed: certificate verify failed (expired)"),
            _log("2026-03-27T09:01:00Z", "nginx-ingress", "WARN", "Upstream payments backend: 200 TLS errors/min. Peer certificate expired."),
        ],
        "payment-provider-stub": [
            _log("2026-03-27T09:00:00Z", "payment-provider-stub", "INFO", "Healthy. Accepting connections on port 443 with valid certificate."),
        ],
    },
    service_metrics={
        "checkout-service": _metrics("checkout-service", 15.0, 30.0, 300.0, 0.95, 50.0, 200.0, payment_success_pct=5.0, revenue_loss_per_min=8500.0),
        "cert-manager": _metrics("cert-manager", 5.0, 10.0, 1.0, 0.0, 10.0, 50.0, certs_expired=1.0, renewal_failures=3.0),
        "nginx-ingress": _metrics("nginx-ingress", 10.0, 20.0, 500.0, 0.40, 5.0, 30.0, tls_handshake_failures_per_min=200.0),
        "payment-provider-stub": _metrics("payment-provider-stub", 5.0, 15.0, 50.0, 0.0, 20.0, 80.0),
    },
    correct_severity=IncidentSeverity.P1,
    correct_root_cause_service="cert-manager",
    correct_root_cause_keywords=["certificate", "TLS", "SSL", "expired", "cert-manager", "renewal", "ACME", "expiry"],
    valid_remediation_actions=[
        {"action": "restart", "service": "cert-manager"},
        {"action": "config_change", "service": "cert-manager", "detail": "force renewal"},
        {"action": "config_change", "service": "nginx-ingress", "detail": "update certificate"},
    ],
    expected_escalation_teams=["security-team", "platform-team"],
    max_steps=15,
    degradation_per_step=0.010,
    relevant_services=["checkout-service", "cert-manager", "nginx-ingress"],
    blast_radius={
        "checkout-service": {
            "error_rate": (0.005, 1.0),
            "payment_success_pct": (-0.5, 0.0),
            "revenue_loss_per_min": (500.0, 50000.0),
        },
        "nginx-ingress": {
            "tls_handshake_failures_per_min": (20.0, 1000.0),
        },
    },
)


# ---- registry ---------------------------------------------------------------

# Multiple variants per task — environment randomly selects one per reset()
SCENARIO_VARIANTS: Dict[str, List[Scenario]] = {
    "severity_classification": [_SCENARIO_EASY, _SCENARIO_EASY_B, _SCENARIO_EASY_C],
    "root_cause_analysis": [_SCENARIO_MEDIUM, _SCENARIO_MEDIUM_B, _SCENARIO_MEDIUM_C],
    "full_incident_management": [_SCENARIO_HARD, _SCENARIO_HARD_B, _SCENARIO_HARD_C],
}

# Always maps task_id → primary (deterministic) scenario for testing/baseline
SCENARIOS: Dict[str, Scenario] = {
    "severity_classification": _SCENARIO_EASY,
    "root_cause_analysis": _SCENARIO_MEDIUM,
    "full_incident_management": _SCENARIO_HARD,
}


def get_scenario(task_id: str, variant_seed: int = 0) -> Scenario:
    """Return a scenario for the given task_id.

    Args:
        task_id:      One of the three registered task IDs.
        variant_seed: Index into SCENARIO_VARIANTS[task_id]. Wraps around.
                      Pass 0 for the primary/deterministic scenario.
    """
    if task_id not in SCENARIO_VARIANTS:
        raise ValueError(f"Unknown task_id '{task_id}'. Valid: {list(SCENARIO_VARIANTS.keys())}")
    variants = SCENARIO_VARIANTS[task_id]
    return variants[variant_seed % len(variants)]