"""
tasks.py — Task and scenario definitions for Cloud Incident Response OpenEnv.

Difficulty calibration targets:
  EASY   → 8B: 0.75-1.0,  70B: 0.85-1.0
  MEDIUM → 8B: 0.30-0.50,  70B: 0.45-0.65
  HARD   → 8B: 0.15-0.35,  70B: 0.30-0.50

Design principles for genuine difficulty:
  EASY: Alert metrics are clear. Only trick is P2-vs-P3 ambiguity.
  MEDIUM: Root cause buried. 8-10 known services. Multiple red herrings.
    incident_summary does NOT hint at root cause. Must investigate 4+ services.
  HARD: Same diagnosis challenge + 5-7 step remediation sequence +
    10+ known services (many wrong choices) + quality summary required.

Public API:
    get_task(task_id)            -> task metadata dict
    get_scenario(task_id, index) -> scenario dict
    list_tasks()                 -> list of task dicts
    ALL_TASKS                    -> dict[task_id -> metadata]
"""

from __future__ import annotations

# Diagnostic actions that every task lists first, in this order.
_SHARED_DIAGNOSTICS: tuple = (
    "query_logs",
    "check_metrics",
    "check_dependencies",
    "check_recent_deploys",
)


def _build_task(
    task_id: str,
    name: str,
    difficulty: str,
    max_steps: int,
    description: str,
    extra_actions: tuple,
    submission_action: str,
) -> dict:
    """Assemble the metadata record for a single task.

    The record's ``available_actions`` list is the shared diagnostics,
    followed by ``extra_actions``, with ``submission_action`` last.
    Every record carries the same ``score_range`` ([0.0, 1.0]) and the
    same ``scenarios`` count (3). A fresh list is created on each call so
    records never share mutable state.
    """
    return {
        "id": task_id,
        "name": name,
        "difficulty": difficulty,
        "max_steps": max_steps,
        "score_range": [0.0, 1.0],
        "description": description,
        "available_actions": [
            *_SHARED_DIAGNOSTICS,
            *extra_actions,
            submission_action,
        ],
        "submission_action": submission_action,
        "scenarios": 3,
    }


# Task metadata keyed by task id; insertion order is easy -> medium -> hard.
ALL_TASKS: dict = {
    record["id"]: record
    for record in (
        _build_task(
            task_id="alert_classification",
            name="Task 1: Alert Severity Classification",
            difficulty="easy",
            max_steps=3,
            description=(
                "An alert has fired. Query logs and metrics across affected services, "
                "then classify the incident severity: P1 (CRITICAL — complete outage or "
                "revenue >$1,000/min), P2 (HIGH — major degradation affecting most users), "
                "P3 (MEDIUM — partial/minor issue with graceful fallback), "
                "P4 (LOW — informational). Submit with submit_severity."
            ),
            extra_actions=(),
            submission_action="submit_severity",
        ),
        _build_task(
            task_id="root_cause_analysis",
            name="Task 2: Root Cause Analysis",
            difficulty="medium",
            max_steps=10,
            description=(
                "A production incident is active with multiple services showing errors. "
                "Use diagnostic tools to trace the failure chain. The root cause may be "
                "any service in the system — not necessarily one showing errors. "
                "Query logs, metrics, dependencies, and recent deploys across ALL "
                "available services to find the true trigger. Submit with submit_root_cause."
            ),
            extra_actions=("check_service_status",),
            submission_action="submit_root_cause",
        ),
        _build_task(
            task_id="remediation_planning",
            name="Task 3: Incident Remediation",
            difficulty="hard",
            max_steps=15,
            description=(
                "A critical production incident requires full end-to-end resolution. "
                "Diagnose the root cause among many services, execute the correct "
                "remediation sequence (order matters — wrong actions are penalized), "
                "then submit a detailed resolution summary. Scored on diagnosis quality, "
                "remediation correctness, action efficiency, and documentation."
            ),
            extra_actions=(
                "check_service_status",
                "restart_service",
                "rollback_deploy",
                "scale_service",
                "disable_feature_flag",
                "clear_cache",
                "execute_runbook_step",
            ),
            submission_action="submit_resolution",
        ),
    )
}

# ---------------------------------------------------------------------------
# Scenario data — 3 tasks × 3 scenarios = 9 total episodes
# ---------------------------------------------------------------------------

SCENARIOS: dict = {

    # ══════════════════════════════════════════════════════════════════════
    # TASK 1: ALERT CLASSIFICATION (EASY)
    # Target: 8B→0.75-1.0, 70B→0.85-1.0
    #
    # AC-001: Clear P1 (78% errors, $12k/min) — should be trivial
    # AC-002: Ambiguous P2 (degraded but working, $800/min)
    # AC-003: Trap P3 (45% errors but zero revenue impact, graceful fallback)
    # ══════════════════════════════════════════════════════════════════════

    "alert_classification": [
        # AC-001: Clear P1
        {
            "scenario_id": "AC-001",
            "description": (
                "Cascading failure across multiple services. "
                "Assess severity based on user and revenue impact."
            ),
            "incident_summary": (
                "Alert fired: api-gateway reporting elevated error rates. "
                "Multiple downstream services affected. "
                "Assess the severity of this incident."
            ),
            "alert": {
                "id": "ALT-20240315-001",
                "title": "api-gateway error rate elevated",
                "severity_fired": "UNCLASSIFIED",
                "affected_services": ["api-gateway", "auth-service", "postgres-db"],
                "symptoms": [
                    "api-gateway: HTTP 503 rate 78% (baseline: 0.1%)",
                    "auth-service: connection timeout 94% of requests",
                    "postgres-db: connection pool 500/500 utilized",
                    "checkout flow: unavailable",
                    "user logins: failing",
                ],
                "error_rate": 0.78,
                "duration_minutes": 4,
                "revenue_impact_per_min": 12000,
            },
            "known_services": {"api-gateway", "auth-service", "postgres-db"},
            "tool_responses": {
                "query_logs": {
                    "api-gateway": (
                        "2024-03-15T10:04:12Z ERROR upstream timeout auth-service:8080\n"
                        "2024-03-15T10:04:13Z ERROR 503 Service Unavailable\n"
                        "2024-03-15T10:04:14Z ERROR circuit breaker OPEN"
                    ),
                    "auth-service": (
                        "2024-03-15T10:04:10Z ERROR too many clients already\n"
                        "2024-03-15T10:04:11Z ERROR connection pool exhausted (500/500)"
                    ),
                    "postgres-db": (
                        "2024-03-15T10:04:00Z FATAL remaining slots reserved for superuser\n"
                        "2024-03-15T10:04:01Z LOG max_connections=500 active=500"
                    ),
                },
                "check_metrics": {
                    "api-gateway": "5xx rate: 78% | p99: 30s | circuit_breaker: OPEN",
                    "auth-service": "Error rate: 94% | DB wait: 28s | Queue: 847",
                    "postgres-db": "Connections: 500/500 (100%) | CPU: 98% | Memory: 89%",
                },
                "check_dependencies": {
                    "api-gateway": "Depends on: auth-service [CRITICAL]",
                    "auth-service": "Depends on: postgres-db [CRITICAL]",
                    "postgres-db": "No upstream dependencies",
                },
                "check_recent_deploys": {
                    "api-gateway": "No recent changes",
                    "auth-service": "Deploy 47 min ago — connection pool size change",
                    "postgres-db": "No recent changes",
                },
            },
            "correct_severity": "P1",
            "adjacent_severities": ["P2"],
        },

        # AC-002: Ambiguous P2 — degraded but not down
        {
            "scenario_id": "AC-002",
            "description": (
                "Service degradation affecting page load times. "
                "Core transaction flows still operational. "
                "Assess severity carefully."
            ),
            "incident_summary": (
                "Alert fired: CDN cache performance degraded. "
                "Origin servers under increased load. "
                "Assess the severity of this incident."
            ),
            "alert": {
                "id": "ALT-20240315-002",
                "title": "CDN cache performance anomaly detected",
                "severity_fired": "UNCLASSIFIED",
                "affected_services": ["cdn-edge", "product-service", "image-service"],
                "symptoms": [
                    "CDN cache hit rate: 3% (normal: 94%)",
                    "product-service: elevated origin traffic",
                    "image-service: CPU 95%, p99 latency 18s",
                    "Product pages: loading slowly",
                    "Checkout: still functional",
                ],
                "error_rate": 0.15,
                "duration_minutes": 8,
                "revenue_impact_per_min": 800,
            },
            "known_services": {"cdn-edge", "product-service", "image-service"},
            "tool_responses": {
                "query_logs": {
                    "cdn-edge": (
                        "2024-03-15T10:22:00Z INFO cache MISS ratio: 97%\n"
                        "2024-03-15T10:20:11Z WARN mass cache invalidation — 2.1M keys purged\n"
                        "2024-03-15T10:20:10Z INFO purge pattern: /* (ALL keys)"
                    ),
                    "product-service": (
                        "2024-03-15T10:22:05Z WARN request queue depth: 12,400\n"
                        "2024-03-15T10:22:06Z ERROR timeout from image-service\n"
                        "2024-03-15T10:22:07Z WARN worker pool 95%"
                    ),
                    "image-service": (
                        "2024-03-15T10:22:00Z WARN CPU throttling 95%\n"
                        "2024-03-15T10:22:01Z ERROR worker pool exhausted\n"
                        "2024-03-15T10:22:02Z WARN memory at 91%"
                    ),
                },
                "check_metrics": {
                    "cdn-edge": "Cache hit: 3% | Origin RPS: 48,000 | Bandwidth: 890 Gbps",
                    "product-service": "Origin RPS: 48k (norm: 1.2k) | Queue: 12,400",
                    "image-service": "CPU: 95% | Memory: 91% | p99: 18s",
                },
                "check_dependencies": {
                    "cdn-edge": "Origin: product-service [OVERLOADED]",
                    "product-service": "Depends on: image-service [DEGRADED]",
                    "image-service": "Depends on: object-storage [OK]",
                },
                "check_recent_deploys": {
                    "cdn-edge": "Cronjob updated 2h ago — purge pattern changed",
                    "product-service": "No recent changes",
                    "image-service": "No recent changes",
                },
            },
            "correct_severity": "P2",
            "adjacent_severities": ["P1", "P3"],
        },

        # AC-003: P3 trap — high error rate but zero impact
        {
            "scenario_id": "AC-003",
            "description": (
                "Internal service reporting elevated errors. "
                "Determine actual user and business impact. "
                "Not all high error rates are critical."
            ),
            "incident_summary": (
                "Alert fired: recommendation-service error rate elevated to 45%. "
                "Assess the severity based on actual user and business impact."
            ),
            "alert": {
                "id": "ALT-20240315-003",
                "title": "recommendation-service error rate 45%",
                "severity_fired": "UNCLASSIFIED",
                "affected_services": ["recommendation-service", "product-service"],
                "symptoms": [
                    "recommendation-service: error rate 45% (baseline: 2%)",
                    "product-service: using fallback recommendation logic",
                    "User experience: default recommendations shown",
                    "Checkout: fully functional",
                    "Revenue: no measurable change",
                ],
                "error_rate": 0.45,
                "duration_minutes": 22,
                "revenue_impact_per_min": 0,
            },
            "known_services": {"recommendation-service", "product-service", "redis-reco-cache"},
            "tool_responses": {
                "query_logs": {
                    "recommendation-service": (
                        "2024-03-15T09:48:00Z ERROR model inference timeout (>5s)\n"
                        "2024-03-15T09:48:01Z WARN ML model server overloaded\n"
                        "2024-03-15T09:48:02Z INFO fallback: returning default recommendations"
                    ),
                    "product-service": (
                        "2024-03-15T09:48:05Z INFO recommendation-service returned defaults\n"
                        "2024-03-15T09:48:06Z INFO serving page with default recs — no user impact"
                    ),
                    "redis-reco-cache": "Operating normally — cache hit rate 88%",
                },
                "check_metrics": {
                    "recommendation-service": (
                        "Error rate: 45% | Fallback rate: 45% | "
                        "Model server: OVERLOADED | User impact: NONE (graceful)"
                    ),
                    "product-service": (
                        "Error rate: 0.1% (normal) | Checkout: 100% | Revenue: unchanged"
                    ),
                    "redis-reco-cache": "Hit rate: 88% | Memory: 34% | HEALTHY",
                },
                "check_dependencies": {
                    "recommendation-service": "Depends on: ML model server [SLOW]",
                    "product-service": "Depends on: recommendation-service [DEGRADED — has fallback]",
                    "redis-reco-cache": "No dependencies",
                },
                "check_recent_deploys": {
                    "recommendation-service": "Model update 3h ago — new model v2.4",
                    "product-service": "No recent changes",
                    "redis-reco-cache": "No recent changes",
                },
            },
            "correct_severity": "P3",
            "adjacent_severities": ["P2", "P4"],
        },
    ],

    # ══════════════════════════════════════════════════════════════════════
    # TASK 2: ROOT CAUSE ANALYSIS (MEDIUM)
    # Target: 8B→0.30-0.50, 70B→0.45-0.65
    #
    # KEY DESIGN RULES:
    # 1. Root cause service NEVER in affected_services
    # 2. incident_summary describes SYMPTOMS only, no hints
    # 3. 8-10 known_services (many to investigate)
    # 4. Red herring deploys on non-root-cause services
    # 5. Root cause only findable via check_recent_deploys + query_logs
    #    on the specific service — not from looking at victims
    # ══════════════════════════════════════════════════════════════════════

    "root_cause_analysis": [

        # RCA-001: analytics-service OOM kills postgres-db
        # 8 known services. Root cause: analytics-service.
        # Red herrings: auth-service deploy (cosmetic), redis healthy
        {
            "scenario_id": "RCA-001",
            "description": (
                "Multiple services reporting failures. Database appears to be "
                "the epicenter but the true trigger may be elsewhere."
            ),
            "incident_summary": (
                "Multiple services are failing. postgres-db is in a crash loop. "
                "auth-service, order-service, and api-gateway are all reporting errors. "
                "Investigate all available services to find what triggered this cascade."
            ),
            "alert": {
                "id": "ALT-RCA-001",
                "title": "Multiple service failures — database crash loop",
                "severity_fired": "P1",
                "affected_services": [
                    "api-gateway", "auth-service", "order-service", "postgres-db",
                ],
                "symptoms": [
                    "postgres-db: crash loop — 4 restarts in 12 minutes",
                    "auth-service: 100% connection failures",
                    "order-service: all writes failing",
                    "api-gateway: 503 on authenticated routes",
                ],
                "error_rate": 0.95,
                "duration_minutes": 14,
            },
            "known_services": {
                "api-gateway", "auth-service", "order-service",
                "postgres-db", "analytics-service", "redis-session",
                "product-service", "notification-service",
            },
            "tool_responses": {
                        # RCA-001: per-service log excerpts returned by query_logs.
        "query_logs": {
            "postgres-db": (
    "2024-03-16T02:11:00Z LOG database system shut down\n"
    "2024-03-16T02:10:58Z FATAL terminated by kernel OOM killer\n"
    "2024-03-16T02:10:30Z LOG long-running analytics export query "
    "consuming 31.8GB/32GB — sequential scan on events table "
    "with cross-join, running 12 minutes, no LIMIT clause. "
    "Investigate analytics-service scheduled jobs"
),
            "analytics-service": (
                "2024-03-16T01:58:00Z INFO starting scheduled job: full_history_export\n"
                "2024-03-16T01:58:01Z DEBUG executing: SELECT * FROM events "
                "JOIN user_sessions ON ... JOIN orders ON ... — no LIMIT\n"
                "2024-03-16T01:58:02Z WARN query plan estimates 847M row scan\n"
                "2024-03-16T02:10:55Z ERROR job terminated — connection to database lost"
            ),
            "auth-service": (
                "2024-03-16T02:11:05Z ERROR connect ECONNREFUSED postgres-db:5432\n"
                "2024-03-16T02:11:06Z ERROR all retries exhausted"
            ),
            "api-gateway": (
                "2024-03-16T02:11:10Z ERROR upstream auth-service: 503"
            ),
            "order-service": (
                "2024-03-16T02:11:08Z ERROR pq: database system is starting up"
            ),
            "redis-session": "No errors — operating normally",
            "product-service": (
                "2024-03-16T02:11:12Z WARN DB queries failing — serving cached data"
            ),
            "notification-service": (
                "2024-03-16T02:11:15Z ERROR cannot send — user lookup failed"
            ),
        },
                "check_metrics": {
                    "postgres-db": (
                        "Memory: peaked at 31.8GB/32GB before kill | "
                        "Restarts: 4 in 12min | Status: RESTARTING | "
                        "Heaviest client: 10.0.5.47"
                    ),
                    "analytics-service": (
                        "Last job: FAILED | Memory during job: 28GB | "
                        "IP: 10.0.5.47 | CPU: idle (job terminated)"
                    ),
                    "auth-service": "Connections: 0% success | Queued requests: 1,200",
                    "api-gateway": "503 rate: 95% | Auth: DOWN",
                    "order-service": "Write success: 0% | DB: RESTARTING",
                    "redis-session": "Hit rate: 99.2% | Memory: 42% | HEALTHY",
                    "product-service": "Serving cached data | DB queries: 100% failing",
                    "notification-service": "Queue backlog: 8,400 | DB: DOWN",
                },
                "check_dependencies": {
                    "postgres-db": (
                        "Clients: auth-service, order-service, analytics-service, "
                        "product-service, notification-service"
                    ),
                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
                    "auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
                    "api-gateway": "Depends on: auth-service [DOWN], product-service [DEGRADED]",
                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
                    "redis-session": "Standalone cache — no DB dependency",
                    "product-service": "Depends on: postgres-db [CRASH LOOP — using cache]",
                    "notification-service": "Depends on: postgres-db [CRASH LOOP]",
                },
                "check_recent_deploys": {
                    "analytics-service": (
                        "Deploy 6h ago: added scheduled data export job — "
                        "runs daily at 02:00 UTC. Change includes cross-table "
                        "JOIN query without LIMIT clause"
                    ),
                    "postgres-db": "No deploys in 3 weeks",
                    "auth-service": (
                        "Deploy 2h ago: updated structured logging format. "
                        "No functional changes, no query changes, no connection changes."
                    ),
                    "order-service": "No recent deploys",
                    "redis-session": "No recent deploys",
                    "api-gateway": "No recent deploys",
                    "product-service": (
                        "Deploy 3 days ago: added product image lazy loading. "
                        "No DB changes."
                    ),
                    "notification-service": "No recent deploys",
                },
                "check_service_status": {
                    "postgres-db": "RESTARTING | Uptime: 47s | Last crash: OOM",
                    "analytics-service": "ERROR | Last job: FAILED 12min ago",
                    "auth-service": "DOWN | Blocked on postgres-db",
                    "api-gateway": "DEGRADED | 95% errors",
                    "order-service": "DOWN | Blocked on postgres-db",
                    "redis-session": "HEALTHY | 99.2% hit rate",
                    "product-service": "DEGRADED | Cache fallback active",
                    "notification-service": "DEGRADED | Queue backlog 8,400",
                },
            },
            "correct_root_cause": {
                "service": "analytics-service",
                "failure_mode": "unbounded query OOM killing postgres-db",
            },
            "wrong_actions": {
                "restart_service:auth-service": "victim — DB must be fixed first",
                "restart_service:api-gateway": "downstream — won't help",
                "restart_service:order-service": "victim — won't help",
                "scale_service:postgres-db": "won't prevent OOM from bad query",
                "rollback_deploy:postgres-db": "no recent deploys",
                "rollback_deploy:auth-service": "auth deploy was cosmetic only",
                "rollback_deploy:product-service": "product deploy unrelated",
                "restart_service:redis-session": "redis is healthy",
                "restart_service:notification-service": "victim — won't help",
            },
        },

        # RCA-002: network-infra BGP withdrawal
        # 8 known services. Root cause: network-infra.
        # Red herrings: payment-service looks down, postgres-db exists
        {
            "scenario_id": "RCA-002",
            "description": (
                "Checkout failures concentrated in specific availability zones. "
                "Some services appear unreachable while others work fine."
            ),
            "incident_summary": (
                "Checkout failure rate has spiked to 61%. payment-service and "
                "fraud-detection-service are unreachable from some parts of the "
                "infrastructure but appear healthy from others. Multiple services "
                "to investigate. Find the root cause."
            ),
            "alert": {
                "id": "ALT-RCA-002",
                "title": "Checkout failures — partial service unreachability",
                "severity_fired": "P2",
                "affected_services": [
                    "order-service", "payment-service", "fraud-detection-service",
                ],
                "symptoms": [
                    "checkout failure rate: 61%",
                    "payment-service: intermittently unreachable",
                    "fraud-detection-service: intermittently unreachable",
                    "failures appear zone-specific",
                ],
                "error_rate": 0.61,
                "duration_minutes": 9,
            },
            "known_services": {
                "order-service", "payment-service", "fraud-detection-service",
                "postgres-db", "redis-payment-cache", "network-infra",
                "cdn-edge", "api-gateway",
            },
            "tool_responses": {
                        # RCA-002: per-service log excerpts returned by query_logs.
        "query_logs": {
            "order-service": (
                "2024-03-17T14:32:10Z ERROR connection timeout "
                "payment-service:8080 — no route to host\n"
                "2024-03-17T14:32:11Z ERROR fraud-detection-service: i/o timeout\n"
                "2024-03-17T14:32:12Z WARN failures only from AZ-2/AZ-3, "
                "AZ-1 traffic normal — possible network-infra issue"
            ),
            "payment-service": (
                "2024-03-17T14:31:58Z WARN health check from external LB failing\n"
                "2024-03-17T14:31:59Z INFO local AZ-1 traffic: all normal\n"
                "2024-03-17T14:32:00Z INFO processing requests normally (local only)"
            ),
            "fraud-detection-service": (
                "2024-03-17T14:32:00Z INFO local requests: processing normally\n"
                "2024-03-17T14:32:01Z WARN external health probes: 100% timeout"
            ),
            "network-infra": (
                "2024-03-17T14:31:45Z CRITICAL BGP session 10.0.2.1 DOWN — "
                "routes to 10.0.1.0/24 withdrawn from peer\n"
                "2024-03-17T14:31:45Z CRITICAL BGP session 10.0.3.1 DOWN — "
                "routes to 10.0.1.0/24 withdrawn from peer\n"
                "2024-03-17T14:31:44Z INFO configuration change applied — "
                "export filter policy updated"
            ),
            "postgres-db": "Operating normally — no errors",
            "redis-payment-cache": "Operating normally — all healthy",
            "cdn-edge": "Operating normally — cache serving fine",
            "api-gateway": (
                "2024-03-17T14:32:15Z ERROR some backend routes timing out\n"
                "2024-03-17T14:32:16Z INFO AZ-1 backends: responding normally"
            ),
        },
                "check_metrics": {
                    "order-service": (
                        "Failure rate varies by source AZ: "
                        "AZ-1: 0.2% | AZ-2: 99% | AZ-3: 98%"
                    ),
                    "payment-service": (
                        "Internal processing: 100% success | "
                        "Inbound from AZ-2: 0 connections | Inbound from AZ-3: 0 connections | "
                        "Inbound from AZ-1: normal"
                    ),
                    "fraud-detection-service": (
                        "Internal: normal | External probes: 100% timeout"
                    ),
                    "network-infra": (
                        "BGP sessions: AZ-1 internal UP | "
                        "AZ-2→AZ-1: WITHDRAWN | AZ-3→AZ-1: WITHDRAWN | "
                        "Last change: 18min ago"
                    ),
                    "postgres-db": "All metrics normal",
                    "redis-payment-cache": "All metrics normal",
                    "cdn-edge": "Cache hit: 91% | Normal operation",
                    "api-gateway": "Mixed — AZ-1 OK, AZ-2/AZ-3 partial failures",
                },
                "check_dependencies": {
                    "order-service": (
                        "Depends on: payment-service [PARTIAL], "
                        "fraud-detection-service [PARTIAL]"
                    ),
                    "payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
                    "fraud-detection-service": "Depends on: postgres-db [OK]",
                    "network-infra": (
                        "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN], AZ-1 [UP]"
                    ),
                    "postgres-db": "All connections healthy",
                    "redis-payment-cache": "All connections healthy",
                    "cdn-edge": "No issues",
                    "api-gateway": "Depends on: multiple backends [MIXED]",
                },
                "check_recent_deploys": {
                    "network-infra": (
                        "Router configuration change 18min ago — modified BGP "
                        "export filter policy. Change accidentally removed AZ-1 "
                        "prefix 10.0.1.0/24 from advertisements to AZ-2 and AZ-3 peers."
                    ),
                    "payment-service": "No recent deploys",
                    "order-service": "No recent deploys",
                    "fraud-detection-service": "No recent deploys",
                    "postgres-db": (
                        "Minor config change 5 days ago — increased shared_buffers. "
                        "No issues since."
                    ),
                    "redis-payment-cache": "No recent deploys",
                    "cdn-edge": "No recent deploys",
                    "api-gateway": (
                        "Deploy 1 day ago — added request tracing headers. "
                        "No routing changes."
                    ),
                },
                "check_service_status": {
                    "payment-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
                    "order-service": "DEGRADED | Partial failures",
                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
                    "fraud-detection-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
                    "postgres-db": "HEALTHY",
                    "redis-payment-cache": "HEALTHY",
                    "cdn-edge": "HEALTHY",
                    "api-gateway": "DEGRADED | Mixed backend status",
                },
            },
            "correct_root_cause": {
                "service": "network-infra",
                "failure_mode": "BGP route withdrawal causing AZ network partition",
            },
            "wrong_actions": {
                "restart_service:payment-service": "healthy — network issue",
                "restart_service:order-service": "victim",
                "scale_service:payment-service": "won't fix routing",
                "clear_cache:redis-payment-cache": "cache is healthy",
                "restart_service:api-gateway": "victim of routing issue",
                "rollback_deploy:api-gateway": "deploy was unrelated tracing headers",
                "rollback_deploy:postgres-db": "config change was 5 days ago, unrelated",
                "restart_service:cdn-edge": "CDN is healthy",
            },
        },

        # RCA-003: config-service credential rotation bug
        # 8 known services. Root cause: config-service.
        # Red herrings: user-service had a recent deploy, postgres-db stressed
        {
            "scenario_id": "RCA-003",
            "description": (
                "Multiple services experiencing database authentication failures. "
                "The database itself may not be the problem."
            ),
            "incident_summary": (
                "Several services are reporting database authentication failures. "
                "postgres-db connection pool is saturated. user-service and "
                "notification-service are down. api-gateway error rate elevated. "
                "Investigate all services to find what triggered this."
            ),
            "alert": {
                "id": "ALT-RCA-003",
                "title": "Multiple services — database authentication failures",
                "severity_fired": "P2",
                "affected_services": [
                    "api-gateway", "user-service", "notification-service", "postgres-db",
                ],
                "symptoms": [
                    "user-service: FATAL password authentication failed",
                    "notification-service: FATAL password authentication failed",
                    "api-gateway: 503 rate 62%",
                    "postgres-db: connection pool 490/500",
                ],
                "error_rate": 0.62,
                "duration_minutes": 7,
            },
            "known_services": {
                "api-gateway", "user-service", "notification-service",
                "postgres-db", "config-service", "redis-session",
                "order-service", "product-service",
            },
            "tool_responses": {
        # RCA-003 log excerpts, keyed by service name.
        "query_logs": {
            "user-service": (
    "2024-03-18T08:14:00Z FATAL password authentication failed "
    "for user 'app_user'\n"
    "2024-03-18T08:14:01Z ERROR DB credentials rejected — "
    "credentials were last pushed by config-service secrets "
    "rotation at 08:12:00Z\n"
    "2024-03-18T08:14:02Z WARN credential hash mismatch — "
    "check config-service rotation job for issues"
),
            "notification-service": (
    "2024-03-18T08:14:05Z FATAL password authentication failed "
    "for user 'app_user'\n"
    "2024-03-18T08:14:06Z WARN credentials from config-service "
    "rotation at 08:12:00Z appear invalid"
),
            "api-gateway": (
                "2024-03-18T08:14:10Z ERROR upstream user-service: 503\n"
                "2024-03-18T08:14:11Z ERROR upstream notification-service: 503"
            ),
            "postgres-db": (
                "2024-03-18T08:14:00Z LOG auth failure from 10.0.3.x\n"
                "2024-03-18T08:14:00Z LOG auth failure from 10.0.4.x\n"
                "2024-03-18T08:14:01Z LOG 490/500 slots used by failed auth retries"
            ),
            "config-service": (
                "2024-03-18T08:12:00Z INFO secrets rotation job executed\n"
                "2024-03-18T08:12:01Z WARN rotation referenced PREVIOUS "
                "credential set instead of generating new — template bug "
                "in version v3.2.1\n"
                "2024-03-18T08:12:02Z INFO pushed credentials to: "
                "user-service, notification-service, order-service"
            ),
            "redis-session": "Operating normally",
            "order-service": (
                "2024-03-18T08:14:20Z WARN received credential push from "
                "config-service but have not restarted — still using old valid creds"
            ),
            "product-service": "Operating normally — using original credentials",
        },
                "check_metrics": {
                    "user-service": "DB auth: 100% failure | HTTP 503: 100%",
                    "notification-service": "DB auth: 100% failure | HTTP 503: 100%",
                    "api-gateway": "503 rate: 62% | Some upstreams DOWN",
                    "postgres-db": (
                        "Connections: 490/500 | Auth failures/s: 80 | "
                        "Valid connections: 10 | DB itself: HEALTHY"
                    ),
                    "config-service": (
                        "Status: HEALTHY | Last push: 7min ago | "
                        "Type: secrets_rotation | Result: COMPLETED"
                    ),
                    "redis-session": "All normal",
                    "order-service": "Using old credentials — still working",
                    "product-service": "All normal — unaffected",
                },
                "check_dependencies": {
                    "user-service": (
                        "Depends on: postgres-db [AUTH FAIL], "
                        "config-service [credential source]"
                    ),
                    "notification-service": (
                        "Depends on: postgres-db [AUTH FAIL], "
                        "config-service [credential source]"
                    ),
                    "api-gateway": "Depends on: user-service [DOWN], notification-service [DOWN]",
                    "postgres-db": "No upstream dependencies — DB is healthy",
                    "config-service": (
                        "Provides: credentials to user-service, "
                        "notification-service, order-service"
                    ),
                    "redis-session": "Standalone",
                    "order-service": (
                        "Depends on: postgres-db [OK — old creds], "
                        "config-service [pending push]"
                    ),
                    "product-service": "Depends on: postgres-db [OK — original creds]",
                },
                "check_recent_deploys": {
                    "config-service": (
                        "Deploy 2h ago: version v3.2.1 — updated secrets rotation "
                        "job template. Bug: rotation references previous credential "
                        "set instead of generating new credentials."
                    ),
                    "user-service": (
                        "Deploy 4h ago: added new profile API endpoint. "
                        "No database or credential changes."
                    ),
                    "notification-service": "No recent deploys",
                    "postgres-db": "No recent deploys",
                    "api-gateway": "No recent deploys",
                    "redis-session": "No recent deploys",
                    "order-service": (
                        "Deploy 1 day ago: updated order confirmation email template. "
                        "No DB changes."
                    ),
                    "product-service": "No recent deploys",
                },
                "check_service_status": {
                    "user-service": "DOWN | DB auth failures",
                    "notification-service": "DOWN | DB auth failures",
                    "api-gateway": "DEGRADED | 62% error rate",
                    "postgres-db": "STRESSED but HEALTHY | 490/500 connections (failed auths)",
                    "config-service": "HEALTHY | Last rotation: 7min ago (completed)",
                    "redis-session": "HEALTHY",
                    "order-service": "HEALTHY | Old credentials still valid",
                    "product-service": "HEALTHY",
                },
            },
            "correct_root_cause": {
                "service": "config-service",
                "failure_mode": "secrets rotation pushed stale credentials to downstream services",
            },
            "wrong_actions": {
                "restart_service:user-service": "will retry with same bad credentials",
                "restart_service:notification-service": "same bad credentials",
                "restart_service:postgres-db": "DB is healthy — client creds are bad",
                "scale_service:postgres-db": "connections are failed auths",
                "rollback_deploy:user-service": "user-service deploy was unrelated",
                "rollback_deploy:order-service": "order-service deploy was unrelated",
                "restart_service:api-gateway": "downstream — fix upstream first",
            },
        },
    ],

    # ══════════════════════════════════════════════════════════════════════
    # TASK 3: REMEDIATION PLANNING (HARD)
    # Target: 8B→0.15-0.35, 70B→0.30-0.50
    #
    # KEY DESIGN RULES:
    # 1. Same diagnostic challenge as medium
    # 2. 5-7 step remediation sequence required
    # 3. 8-10 known services = many wrong choices
    # 4. Wrong actions carry -0.05 penalty each (up to -0.15)
    # 5. Summary must hit 3+ keywords for bonus
    # 6. incident_summary does NOT reveal root cause
    # ══════════════════════════════════════════════════════════════════════

    "remediation_planning": [

        # RP-001: OOM remediation — 6-step sequence, 8 services
        {
            "scenario_id": "RP-001",
            "description": (
                "Full incident remediation required. Multiple services down. "
                "Diagnose the root cause, execute fixes in the correct order, "
                "and document your resolution."
            ),
            "incident_summary": (
                "CRITICAL — postgres-db is crash-looping. auth-service, order-service, "
                "and api-gateway are all down. notification-service queue backing up. "
                "Diagnose the root cause, fix it, restore all services, and document."
            ),
            "alert": {
                "id": "ALT-RP-001",
                "title": "CRITICAL: database crash loop — multiple services down",
                "severity_fired": "P1",
                "affected_services": [
                    "postgres-db", "auth-service", "order-service", "api-gateway",
                ],
            },
            "known_services": {
                "postgres-db", "auth-service", "order-service",
                "api-gateway", "analytics-service", "redis-session",
                "product-service", "notification-service",
            },
            "tool_responses": {
                "query_logs": {
                    # postgres-db log: OOM kill attributed to a query from client 10.0.5.47.
"postgres-db": (
    "FATAL: terminated by kernel OOM killer — "
    "query from client 10.0.5.47 running 12min consuming "
    "31.8GB of 32GB available memory"
),
                    "analytics-service": (
                        "INFO: starting job full_history_export\n"
                        "WARN: query plan: 847M rows, cross-table JOIN, no LIMIT\n"
                        "ERROR: job terminated — database connection lost"
                    ),
                    "auth-service": "ERROR: connect ECONNREFUSED postgres-db:5432",
                    "order-service": "ERROR: pq: database system is starting up",
                    "api-gateway": "ERROR: upstream auth-service 503",
                    "redis-session": "Operating normally",
                    "product-service": "WARN: DB failing — serving cached data",
                    "notification-service": "ERROR: user lookup failed — queuing",
                },
                "check_metrics": {
                    "postgres-db": "OOM killed | Restarts: 4 | Heaviest client: 10.0.5.47",
                    "analytics-service": "Job FAILED | Memory peak: 31GB/32GB | IP: 10.0.5.47",
                    "auth-service": "0% DB success | Queue: 1,200",
                    "order-service": "0% write success",
                    "api-gateway": "503 rate: 95%",
                    "redis-session": "HEALTHY | 99.2% hit rate",
                    "product-service": "Cache fallback active",
                    "notification-service": "Queue: 8,400 messages backed up",
                },
                "check_dependencies": {
                    "postgres-db": (
                        "Clients: auth-service, order-service, analytics-service, "
                        "product-service, notification-service"
                    ),
                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
                    "auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
                    "api-gateway": "Depends on: auth-service [DOWN]",
                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
                    "redis-session": "Standalone",
                    "product-service": "Depends on: postgres-db [CRASH LOOP — cache fallback]",
                    "notification-service": "Depends on: postgres-db [CRASH LOOP]",
                },
                "check_recent_deploys": {
                    "analytics-service": (
                        "Deploy 6h ago: added scheduled export job — "
                        "cross-table JOIN without LIMIT clause"
                    ),
                    "postgres-db": "No deploys in 3 weeks",
                    "auth-service": "Deploy 2h ago: logging format only — no functional changes",
                    "order-service": "No recent deploys",
                    "product-service": "Deploy 3 days ago: image lazy loading — no DB changes",
                    "notification-service": "No recent deploys",
                },
                "check_service_status": {
                    "postgres-db": "CRASH LOOP | OOM | Uptime: 47s",
                    "analytics-service": "ERROR | Job FAILED",
                    "auth-service": "DOWN",
                    "order-service": "DOWN",
                    "api-gateway": "DEGRADED | 95% errors",
                    "redis-session": "HEALTHY",
                    "product-service": "DEGRADED | Cache fallback",
                    "notification-service": "DEGRADED | Queue backlog",
                },
            },
            "remediation_data": {
                "disable_feature_flag": {
                    "full_history_export": (
                        "Cron job full_history_export DISABLED — "
                        "unbounded query will not execute again"
                    ),
                },
                "restart_service": {
                    "postgres-db": "postgres-db restarted — accepting connections (12/500)",
                    "analytics-service": "analytics-service restarted — idle",
                    "auth-service": "auth-service restarted — connected to postgres-db OK",
                    "order-service": "order-service restarted — writes resuming",
                    "api-gateway": "api-gateway restarted — routing recovered",
                    "product-service": "product-service — switched from cache to live DB",
                    "notification-service": "notification-service — draining queue",
                },
                "execute_runbook_step": {
                    "verify_db_health": "postgres-db: 12/500 connections, CPU 12%, Memory 34% — healthy",
                    "check_service_recovery": (
                        "auth OK | order OK | api-gateway OK | product OK | notification DRAINING"
                    ),
                },
            },
            "correct_remediation_sequence": [
                "disable_feature_flag:full_history_export",
                "restart_service:analytics-service",
                "restart_service:postgres-db",
                "restart_service:auth-service",
                "restart_service:order-service",
                "execute_runbook_step:verify_db_health",
            ],
            "wrong_actions": {
                "rollback_deploy:postgres-db": "no recent deploy",
                "scale_service:postgres-db": "won't prevent OOM",
                "restart_service:api-gateway": "downstream — fix DB stack first",
                "rollback_deploy:auth-service": "cosmetic deploy only",
                "clear_cache:redis-session": "healthy — not related",
                "restart_service:redis-session": "healthy — not related",
                "rollback_deploy:product-service": "unrelated deploy",
                "restart_service:notification-service": "will recover once DB is up",
            },
            "resolution_keywords": [
                "analytics", "oom", "memory", "postgres", "query",
                "full_history_export", "disabled", "restarted",
                "recovered", "unbounded", "crash", "kill",
            ],
        },

        # RP-002: BGP remediation — 4-step sequence, 8 services
        {
            "scenario_id": "RP-002",
            "description": (
                "Full incident remediation required. Checkout failures affecting "
                "most users. Diagnose, fix, verify, and document."
            ),
            "incident_summary": (
                "Checkout failure rate 61%. payment-service unreachable from most "
                "of the infrastructure. Some services report no issues. "
                "Diagnose the root cause, execute remediation, verify recovery, "
                "and document the resolution."
            ),
            "alert": {
                "id": "ALT-RP-002",
                "title": "Checkout failures — partial service unreachability",
                "severity_fired": "P2",
                "affected_services": ["order-service", "payment-service"],
            },
            "known_services": {
                "network-infra", "order-service", "payment-service",
                "fraud-detection-service", "postgres-db",
                "redis-payment-cache", "cdn-edge", "api-gateway",
            },
            "tool_responses": {
                "query_logs": {
                    "network-infra": (
                        "CRITICAL: BGP peer 10.0.2.1 route withdrawal — "
                        "routes to 10.0.1.0/24 removed\n"
                        "CRITICAL: BGP peer 10.0.3.1 route withdrawal — "
                        "routes to 10.0.1.0/24 removed\n"
                        "INFO: configuration change applied — export filter updated"
                    ),
                    "order-service": "ERROR: timeout payment-service — no route to host",
                    "payment-service": "INFO: local traffic normal | WARN: external health failing",
                    "fraud-detection-service": "WARN: cross-AZ probes timeout | Local: OK",
                    "postgres-db": "Operating normally",
                    "redis-payment-cache": "Operating normally",
                    "cdn-edge": "Operating normally",
                    "api-gateway": "ERROR: some backend routes timing out",
                },
                "check_metrics": {
                    "network-infra": (
                        "BGP AZ-2→AZ-1: WITHDRAWN | AZ-3→AZ-1: WITHDRAWN | "
                        "AZ-1 internal: UP | Last change: 18min ago"
                    ),
                    "order-service": "AZ-1: 0.2% fail | AZ-2: 99% fail | AZ-3: 98% fail",
                    "payment-service": "Internal: 100% success | External: 0 inbound from AZ-2/3",
                    "fraud-detection-service": "Local: normal | External: timeout",
                    "postgres-db": "All normal",
                    "redis-payment-cache": "All normal",
                    "cdn-edge": "Cache: 91% hit | Normal",
                    "api-gateway": "Mixed — AZ-1 OK, AZ-2/3 partial failures",
                },
                "check_dependencies": {
                    "order-service": "Depends on: payment-service [PARTIAL], fraud-detection [PARTIAL]",
                    "payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
                    "network-infra": "BGP: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN]",
                    "fraud-detection-service": "Depends on: postgres-db [OK]",
                    "postgres-db": "All healthy",
                    "redis-payment-cache": "All healthy",
                    "cdn-edge": "No issues",
                    "api-gateway": "Mixed backends",
                },
                "check_recent_deploys": {
                    "network-infra": (
                        "Config change 18min ago — BGP export filter modified, "
                        "accidentally removed AZ-1 prefix from AZ-2/AZ-3 ads"
                    ),
                    "payment-service": "No recent deploys",
                    "order-service": "No recent deploys",
                    "fraud-detection-service": "No recent deploys",
                    "postgres-db": "Minor change 5 days ago — increased shared_buffers",
                    "redis-payment-cache": "No recent deploys",
                    "cdn-edge": "No recent deploys",
                    "api-gateway": "Deploy 1 day ago — tracing headers, no routing changes",
                },
                "check_service_status": {
                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
                    "payment-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
                    "order-service": "DEGRADED",
                    "fraud-detection-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
                    "postgres-db": "HEALTHY",
                    "redis-payment-cache": "HEALTHY",
                    "cdn-edge": "HEALTHY",
                    "api-gateway": "DEGRADED",
                },
            },
            "remediation_data": {
                "rollback_deploy": {
                    "network-infra": "Router config rolled back — BGP policy restored",
                },
                "execute_runbook_step": {
                    "restore_bgp_routes": "BGP routes restored — AZ-2/3 can reach AZ-1",
                    "verify_checkout_recovery": "Checkout failure: 0.3% — resolved",
                    "verify_cross_az_connectivity": "AZ-2→AZ-1: OK | AZ-3→AZ-1: OK",
                },
            },
            "correct_remediation_sequence": [
                "execute_runbook_step:restore_bgp_routes",
                "rollback_deploy:network-infra",
                "execute_runbook_step:verify_cross_az_connectivity",
                "execute_runbook_step:verify_checkout_recovery",
            ],
            "wrong_actions": {
                "restart_service:payment-service": "healthy — network issue",
                "scale_service:payment-service": "won't fix routing",
                "restart_service:order-service": "victim",
                "clear_cache:redis-payment-cache": "unrelated",
                "restart_service:cdn-edge": "healthy",
                "restart_service:fraud-detection-service": "healthy locally",
                "restart_service:api-gateway": "victim of routing",
                "rollback_deploy:api-gateway": "deploy was unrelated",
                "rollback_deploy:postgres-db": "change was 5 days ago",
            },
            "resolution_keywords": [
                "bgp", "network", "route", "rollback", "partition",
                "restored", "az-1", "az-2", "az-3", "checkout",
                "withdrawal", "config", "advertisement", "export",
            ],
        },

        # RP-003: Credential rotation remediation — 7-step sequence, 8 services
        {
            "scenario_id": "RP-003",
            "description": (
                "Full incident remediation required. Multiple services failing "
                "database authentication. Diagnose, fix, verify, and document."
            ),
            "incident_summary": (
                "Multiple services reporting database authentication failures. "
                "postgres-db connection pool near capacity with failed auth attempts. "
                "user-service and notification-service are down. api-gateway degraded. "
                "Diagnose the root cause, execute remediation, and document."
            ),
            "alert": {
                "id": "ALT-RP-003",
                "title": "Multiple services — DB authentication failures",
                "severity_fired": "P2",
                "affected_services": [
                    "user-service", "notification-service", "api-gateway",
                ],
            },
            "known_services": {
                "api-gateway", "user-service", "notification-service",
                "postgres-db", "config-service", "redis-session",
                "order-service", "product-service",
            },
            "tool_responses": {
                "query_logs": {
                    "user-service": (
    "FATAL: password authentication failed for user 'app_user'\n"
    "ERROR: DB credentials rejected\n"
    "WARN: credentials last refreshed at 08:12:00Z"
),

"notification-service": (
    "FATAL: password authentication failed\n"
    "WARN: credentials last refreshed at 08:12:00Z — "
    "authentication rejected by postgres-db"
),
                    "api-gateway": (
                        "ERROR: upstream user-service 503\n"
                        "ERROR: upstream notification-service 503"
                    ),
                    "postgres-db": (
                        "LOG: auth failure from 10.0.3.x (user-service)\n"
                        "LOG: auth failure from 10.0.4.x (notification-service)\n"
                        "LOG: 490/500 slots used by failed auth retries"
                    ),
                    "config-service": (
                        "INFO: secrets rotation executed at 08:12:00Z\n"
                        "WARN: rotation used PREVIOUS credential set — "
                        "template bug in v3.2.1\n"
                        "INFO: pushed to: user-service, notification-service, order-service"
                    ),
                    "redis-session": "Operating normally",
                    "order-service": (
                        "WARN: received credential push at 08:12:00Z — "
                        "not applied yet, still using old valid credentials"
                    ),
                    "product-service": "Operating normally — using original credentials",
                },
                "check_metrics": {
                    "user-service": "DB auth: 100% failure | HTTP 503: 100%",
                    "notification-service": "DB auth: 100% failure | HTTP 503: 100%",
                    "api-gateway": "503 rate: 62%",
                    "postgres-db": "Connections: 490/500 | Auth failures/s: 80 | DB: HEALTHY",
                    "config-service": "HEALTHY | Last push: 7min ago | Type: secrets_rotation",
                    "redis-session": "All normal",
                    "order-service": "HEALTHY | Using old (valid) credentials",
                    "product-service": "HEALTHY | Unaffected",
                },
                "check_dependencies": {
                    "user-service": "Depends on: postgres-db [AUTH FAIL], config-service [creds]",
                    "notification-service": "Depends on: postgres-db [AUTH FAIL], config-service [creds]",
                    "api-gateway": "Depends on: user-service [DOWN], notification-service [DOWN]",
                    "postgres-db": "No upstream — DB itself is healthy",
                    "config-service": "Provides credentials to: user-svc, notification-svc, order-svc",
                    "redis-session": "Standalone",
                    "order-service": "Depends on: postgres-db [OK — old creds]",
                    "product-service": "Depends on: postgres-db [OK — original creds]",
                },
                "check_recent_deploys": {
                    "config-service": (
                        "Deploy 2h ago: v3.2.1 — updated secrets rotation template. "
                        "Bug: references previous credential set instead of generating new."
                    ),
                    "user-service": "Deploy 4h ago: profile endpoint — no DB changes",
                    "notification-service": "No recent deploys",
                    "postgres-db": "No recent deploys",
                    "api-gateway": "No recent deploys",
                    "redis-session": "No recent deploys",
                    "order-service": "Deploy 1 day ago: email template — no DB changes",
                    "product-service": "No recent deploys",
                },
                "check_service_status": {
                    "user-service": "DOWN | DB auth failures",
                    "notification-service": "DOWN | DB auth failures",
                    "api-gateway": "DEGRADED | 62%",
                    "postgres-db": "STRESSED | 490/500 connections (failed auths)",
                    "config-service": "HEALTHY | Rotation completed",
                    "redis-session": "HEALTHY",
                    "order-service": "HEALTHY | Old creds valid",
                    "product-service": "HEALTHY",
                },
            },
            "remediation_data": {
                "rollback_deploy": {
                    "config-service": "config-service rolled back to v3.2.0 — bug removed",
                },
                "execute_runbook_step": {
                    "trigger_credential_rotation": (
                        "Correct credentials generated and pushed to "
                        "user-service, notification-service, order-service"
                    ),
                    "verify_db_connectivity": (
                        "user-service: DB OK | notification-service: DB OK | "
                        "order-service: DB OK | postgres-db: 45/500 connections"
                    ),
                    "verify_api_recovery": "api-gateway 503 rate: 0.1% — recovered",
                },
                "restart_service": {
                    "user-service": "user-service restarted — DB auth OK with correct creds",
                    "notification-service": "notification-service restarted — DB auth OK",
                    "order-service": "order-service restarted — using correct credentials",
                },
            },
            "correct_remediation_sequence": [
                "rollback_deploy:config-service",
                "execute_runbook_step:trigger_credential_rotation",
                "restart_service:user-service",
                "restart_service:notification-service",
                "restart_service:order-service",
                "execute_runbook_step:verify_db_connectivity",
                "execute_runbook_step:verify_api_recovery",
            ],
            "wrong_actions": {
                "restart_service:postgres-db": "DB is healthy — problem is credentials",
                "scale_service:postgres-db": "connections are failed auths",
                "restart_service:api-gateway": "downstream — fix auth first",
                "rollback_deploy:user-service": "deploy was unrelated",
                "rollback_deploy:order-service": "deploy was unrelated",
                "clear_cache:redis-session": "healthy",
                "restart_service:product-service": "healthy",
                "restart_service:redis-session": "healthy",
            },
            "resolution_keywords": [
                "config", "credential", "rotation", "stale", "password",
                "authentication", "rollback", "config-service", "v3.2.1",
                "restarted", "recovered", "push", "secrets", "template",
            ],
        },
    ],
}


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def get_task(task_id: str) -> dict:
    """Look up the metadata dict registered under *task_id*.

    The returned value is the entry stored in ``ALL_TASKS`` itself,
    not a copy.

    Raises:
        ValueError: if *task_id* is not a key of ``ALL_TASKS``. The
            message lists the valid task ids.
    """
    if task_id in ALL_TASKS:
        return ALL_TASKS[task_id]

    valid_ids = list(ALL_TASKS.keys())
    raise ValueError(f"Unknown task_id '{task_id}'. Valid: {valid_ids}")


def get_scenario(task_id: str, index: int) -> dict:
    """Fetch the scenario at position *index* for the task *task_id*.

    Negative indices are rejected rather than wrapping around to the
    end of the scenario list.

    Raises:
        ValueError: if *task_id* has no entry in ``SCENARIOS``, or if
            *index* falls outside ``0 .. len(scenarios) - 1``.
    """
    if task_id not in SCENARIOS:
        raise ValueError(f"No scenarios for task_id '{task_id}'.")

    task_scenarios = SCENARIOS[task_id]
    count = len(task_scenarios)
    out_of_range = index < 0 or index >= count
    if out_of_range:
        last_valid = count - 1
        raise ValueError(
            f"Scenario index {index} out of range for task '{task_id}' "
            f"(valid: 0–{last_valid})"
        )
    return task_scenarios[index]


def list_tasks() -> list:
    """Return a new list holding every metadata dict in ``ALL_TASKS``.

    The list is freshly built on each call; its elements are the same
    dict objects stored in ``ALL_TASKS``.
    """
    return [*ALL_TASKS.values()]