File size: 39,211 Bytes
d5fc8a7
37204eb
d5fc8a7
37204eb
 
 
 
 
 
 
 
 
d5fc8a7
 
37204eb
 
 
 
d5fc8a7
 
37204eb
 
d5fc8a7
 
 
 
 
 
 
 
37204eb
 
 
 
d5fc8a7
 
37204eb
 
 
 
 
d5fc8a7
 
37204eb
d5fc8a7
 
 
 
 
 
 
 
37204eb
 
 
 
d5fc8a7
 
37204eb
 
 
 
 
 
d5fc8a7
 
37204eb
d5fc8a7
 
 
 
 
 
 
 
37204eb
 
 
d5fc8a7
 
 
 
37204eb
 
 
 
 
 
 
 
 
 
 
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
 
37204eb
d5fc8a7
 
 
 
37204eb
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
 
37204eb
 
d5fc8a7
 
 
 
37204eb
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
37204eb
d5fc8a7
37204eb
d5fc8a7
37204eb
d5fc8a7
 
 
 
 
 
 
 
 
37204eb
 
 
 
d5fc8a7
 
 
37204eb
 
 
d5fc8a7
 
 
37204eb
 
 
d5fc8a7
 
 
37204eb
 
 
 
 
 
 
 
 
 
 
 
d5fc8a7
 
37204eb
d5fc8a7
37204eb
d5fc8a7
 
 
37204eb
 
 
 
d5fc8a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37204eb
 
 
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
37204eb
 
 
d5fc8a7
 
37204eb
 
 
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
 
 
37204eb
 
 
d5fc8a7
 
 
 
 
 
 
 
 
37204eb
 
 
 
d5fc8a7
 
 
37204eb
 
d5fc8a7
 
37204eb
 
 
d5fc8a7
 
 
37204eb
 
 
 
 
 
 
 
 
 
 
 
d5fc8a7
 
 
 
 
 
 
37204eb
 
 
 
 
 
d5fc8a7
 
 
 
 
 
37204eb
d5fc8a7
 
 
 
 
 
 
37204eb
 
 
 
 
 
 
d5fc8a7
 
 
 
37204eb
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
37204eb
 
 
d5fc8a7
 
 
37204eb
 
d5fc8a7
 
 
37204eb
d5fc8a7
37204eb
 
 
d5fc8a7
37204eb
 
 
d5fc8a7
37204eb
d5fc8a7
 
 
 
 
 
 
 
 
 
 
37204eb
d5fc8a7
 
37204eb
 
 
d5fc8a7
 
37204eb
 
 
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
 
 
 
 
37204eb
d5fc8a7
 
37204eb
 
 
 
 
 
 
 
 
d5fc8a7
37204eb
 
d5fc8a7
 
37204eb
 
 
 
 
d5fc8a7
37204eb
 
d5fc8a7
 
 
37204eb
 
d5fc8a7
37204eb
d5fc8a7
 
37204eb
d5fc8a7
 
37204eb
 
 
 
 
 
d5fc8a7
 
 
 
37204eb
d5fc8a7
 
 
37204eb
 
d5fc8a7
 
37204eb
 
 
d5fc8a7
 
 
 
37204eb
d5fc8a7
 
 
37204eb
 
 
d5fc8a7
 
37204eb
 
 
d5fc8a7
 
 
37204eb
d5fc8a7
37204eb
 
 
d5fc8a7
37204eb
d5fc8a7
 
37204eb
 
d5fc8a7
 
 
 
 
 
 
 
 
 
 
 
37204eb
 
d5fc8a7
 
37204eb
 
d5fc8a7
 
37204eb
 
d5fc8a7
 
 
 
 
37204eb
 
 
d5fc8a7
37204eb
 
d5fc8a7
 
37204eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5fc8a7
 
37204eb
 
 
 
d5fc8a7
 
37204eb
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
 
37204eb
d5fc8a7
 
37204eb
 
d5fc8a7
37204eb
 
 
d5fc8a7
 
 
 
 
 
 
 
 
 
 
37204eb
 
 
 
 
 
d5fc8a7
 
 
 
37204eb
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
37204eb
 
 
d5fc8a7
 
37204eb
 
 
d5fc8a7
 
 
37204eb
d5fc8a7
37204eb
 
 
 
d5fc8a7
 
 
 
 
 
 
 
37204eb
 
 
 
 
d5fc8a7
 
 
 
 
 
37204eb
 
d5fc8a7
 
37204eb
d5fc8a7
 
 
37204eb
 
 
d5fc8a7
 
37204eb
 
 
 
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
 
37204eb
 
 
 
d5fc8a7
 
37204eb
 
 
 
 
 
 
 
 
 
 
 
 
d5fc8a7
 
37204eb
 
 
 
 
 
d5fc8a7
 
 
 
 
 
 
 
 
 
37204eb
 
 
 
 
 
 
 
 
d5fc8a7
 
37204eb
 
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
37204eb
 
 
d5fc8a7
 
37204eb
 
 
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
 
 
 
 
 
 
 
37204eb
 
 
 
 
 
 
 
 
 
d5fc8a7
 
 
37204eb
 
 
d5fc8a7
37204eb
d5fc8a7
 
 
 
37204eb
d5fc8a7
 
37204eb
 
 
 
d5fc8a7
37204eb
d5fc8a7
 
 
37204eb
d5fc8a7
 
 
 
 
37204eb
 
 
 
d5fc8a7
 
37204eb
 
 
 
 
 
d5fc8a7
 
 
 
 
 
 
 
37204eb
 
 
 
d5fc8a7
 
 
 
37204eb
 
d5fc8a7
 
 
 
 
 
 
 
 
 
 
 
37204eb
 
 
 
d5fc8a7
 
 
 
 
 
 
 
 
 
37204eb
d5fc8a7
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
"""
tasks.py — Task and scenario definitions for Cloud Incident Response OpenEnv.

Covers cross-service cascading failures in distributed cloud systems:
  - DB connection pool exhaustion cascading through service mesh
  - CDN cache invalidation storms
  - OOM kills from runaway analytics queries
  - BGP network partitions isolating availability zones

Distinct from Kubernetes ops environments — focuses on application-layer
incident response: log correlation, dependency tracing, and remediation
across microservice architectures.

Public API:
    get_task(task_id)            -> task metadata dict
    get_scenario(task_id, index) -> scenario dict
    list_tasks()                 -> list of task dicts
    ALL_TASKS                    -> dict[task_id -> metadata]
"""

from __future__ import annotations

# Task registry: maps each task id to its metadata record.
#
# Every record carries the same keys:
#   id                - the task id, identical to the record's key in this dict
#   name              - human-readable title
#   difficulty        - "easy" | "medium" | "hard"
#   max_steps         - presumably the per-episode step budget; confirm against
#                       the environment loop that consumes it
#   score_range       - [low, high] bounds; [0.0, 1.0] for every task here
#   description       - the task brief (user-facing text)
#   available_actions - action names listed for the task; the submission
#                       action is always the last entry
#   submission_action - the action named in the description for submitting
#                       the answer
#   scenarios         - scenario count declared for the task
ALL_TASKS: dict = {
    "alert_classification": {
        "id": "alert_classification",
        "name": "Task 1: Alert Severity Classification",
        "difficulty": "easy",
        "max_steps": 3,
        "score_range": [0.0, 1.0],
        # The dashes below are real em dashes (U+2014); the text had been
        # corrupted by a UTF-8 -> legacy-codepage round trip.
        "description": (
            "An alert has fired. Query logs and metrics across affected services, "
            "then classify the incident severity: P1 (CRITICAL — revenue/user impact, "
            "immediate action), P2 (HIGH — degraded service), P3 (MEDIUM — minor issue), "
            "P4 (LOW — informational). Submit severity with submit_severity."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "submit_severity",
        ],
        "submission_action": "submit_severity",
        "scenarios": 2,
    },
    "root_cause_analysis": {
        "id": "root_cause_analysis",
        "name": "Task 2: Root Cause Analysis",
        "difficulty": "medium",
        "max_steps": 10,
        "score_range": [0.0, 1.0],
        "description": (
            "A production incident is active. Use diagnostic tools to trace the failure "
            "chain across services. Query logs, metrics, dependency graphs, and recent "
            "deploys to identify which service is the root cause and what failure mode "
            "triggered the cascade. Submit findings with submit_root_cause."
        ),
        # Adds check_service_status to the diagnostic set used by Task 1.
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "check_service_status",
            "submit_root_cause",
        ],
        "submission_action": "submit_root_cause",
        "scenarios": 2,
    },
    "remediation_planning": {
        "id": "remediation_planning",
        "name": "Task 3: Incident Remediation",
        "difficulty": "hard",
        "max_steps": 15,
        "score_range": [0.0, 1.0],
        "description": (
            "A critical production incident requires full end-to-end resolution. "
            "Diagnose the root cause, execute the correct remediation sequence "
            "(disable feature flags, restart services, rollback deploys, run runbook steps), "
            "then submit a resolution summary. Scored on investigation quality, "
            "remediation correctness, efficiency, and documentation."
        ),
        # Diagnostic actions first, then the remediation actions, then the
        # submission action.
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "check_service_status",
            "restart_service",
            "rollback_deploy",
            "scale_service",
            "disable_feature_flag",
            "clear_cache",
            "execute_runbook_step",
            "submit_resolution",
        ],
        "submission_action": "submit_resolution",
        "scenarios": 2,
    },
}

# ---------------------------------------------------------------------------
# Scenario data — 3 tasks x 2 scenarios = 6 total episodes
# ---------------------------------------------------------------------------

SCENARIOS: dict = {

    # ── TASK 1: ALERT CLASSIFICATION ────────────────────────────────────────

    "alert_classification": [

        # AC-001: Cascading DB connection pool exhaustion
        {
            "scenario_id": "AC-001",
            "description": (
                "Cascading failure: postgres-db connection pool exhausted, "
                "causing auth-service timeouts, blocking api-gateway requests. "
                "Revenue impact is severe and growing."
            ),
            "incident_summary": (
                "P1 ALERT β€” api-gateway 5xx rate 78%, auth-service timeout rate 94%, "
                "postgres-db connection pool at 100% (500/500). "
                "Checkout completely down. Revenue impact: $12,000/min."
            ),
            "alert": {
                "id": "ALT-20240315-001",
                "title": "CRITICAL: api-gateway error rate spike 78%",
                "severity_fired": "P1",
                "affected_services": ["api-gateway", "auth-service", "postgres-db"],
                "symptoms": [
                    "api-gateway: HTTP 503 rate 78% (baseline: 0.1%)",
                    "auth-service: connection timeout 94% of requests",
                    "postgres-db: connection pool 500/500 β€” 100% utilized",
                    "checkout flow: completely unavailable",
                    "new user logins: 0% success rate",
                ],
                "error_rate": 0.78,
                "duration_minutes": 4,
                "revenue_impact_per_min": 12000,
            },
            "known_services": {"api-gateway", "auth-service", "postgres-db"},
            "tool_responses": {
                "query_logs": {
                    "api-gateway": (
                        "2024-03-15T10:04:12Z ERROR upstream connect error β€” "
                        "reset reason: connection timeout auth-service:8080\n"
                        "2024-03-15T10:04:13Z ERROR 503 Service Unavailable upstream: auth-service\n"
                        "2024-03-15T10:04:14Z ERROR circuit breaker OPEN for auth-service"
                    ),
                    "auth-service": (
                        "2024-03-15T10:04:10Z ERROR pq: sorry, too many clients already\n"
                        "2024-03-15T10:04:11Z ERROR dial tcp postgres-db:5432: "
                        "connect: connection refused β€” pool exhausted (500/500)\n"
                        "2024-03-15T10:04:12Z ERROR all connection pool slots occupied"
                    ),
                    "postgres-db": (
                        "2024-03-15T10:03:58Z LOG connection received: host=auth-service\n"
                        "2024-03-15T10:04:00Z FATAL remaining connection slots reserved "
                        "for non-replication superuser\n"
                        "2024-03-15T10:04:01Z LOG max_connections=500 active=500 idle=0"
                    ),
                },
                "check_metrics": {
                    "api-gateway": (
                        "HTTP 5xx rate: 78% | p99 latency: 30s (timeout) | "
                        "RPS: 1,200 | circuit_breaker: OPEN"
                    ),
                    "auth-service": (
                        "Error rate: 94% | DB connection wait: 28s | "
                        "Active connections: 0 | Request queue: 847"
                    ),
                    "postgres-db": (
                        "Connections: 500/500 (100%) | Query queue: 847 | "
                        "CPU: 98% | Memory: 89% | Active queries: 500"
                    ),
                },
                "check_dependencies": {
                    "api-gateway": "Depends on: auth-service [CRITICAL], product-service [OK]",
                    "auth-service": "Depends on: postgres-db [CRITICAL], redis-session [OK]",
                    "postgres-db": "No upstream dependencies β€” root level service",
                },
                "check_recent_deploys": {
                    "api-gateway": "Last deploy: 3 days ago β€” no recent changes",
                    "auth-service": (
                        "Last deploy: 47 min ago β€” PR #2341: "
                        "increased default connection pool size from 10 to 500"
                    ),
                    "postgres-db": "Last deploy: 12 days ago β€” no recent changes",
                },
            },
            "correct_severity": "P1",
            "adjacent_severities": ["P2"],
            "correct_root_cause": {
                "service": "postgres-db",
                "failure_mode": "connection pool exhaustion",
            },
            "correct_remediation": [
                "restart_service:auth-service",
                "execute_runbook_step:increase_max_connections",
                "scale_service:postgres-db",
            ],
            "wrong_actions": {
                "rollback_deploy": "Rolling back auth-service pool size won't fix 500 stuck connections",
                "restart_service:api-gateway": "api-gateway is a victim β€” fixing it won't help",
                "clear_cache": "Cache is unrelated to DB connection pool exhaustion",
            },
        },

        # AC-002: CDN cache invalidation storm
        {
            "scenario_id": "AC-002",
            "description": (
                "CDN cache invalidation storm: a misconfigured purge cronjob wiped "
                "all 2.1M cached keys, sending 40x normal traffic to origin. "
                "Site degraded but not fully down β€” P2 severity."
            ),
            "incident_summary": (
                "P2 ALERT β€” CDN cache hit rate dropped from 94% to 3%, "
                "product-service origin traffic up 4000%, image-service CPU at 95%. "
                "Pages loading slowly (p99: 18s). Checkout still working."
            ),
            "alert": {
                "id": "ALT-20240315-002",
                "title": "HIGH: CDN cache miss storm β€” origin overloaded",
                "severity_fired": "P2",
                "affected_services": ["cdn-edge", "product-service", "image-service"],
                "symptoms": [
                    "CDN cache hit rate: 3% (normal: 94%)",
                    "product-service: origin RPS 48,000 (normal: 1,200)",
                    "image-service: CPU 95%, p99 latency 18s",
                    "User experience: product pages slow, some images timing out",
                    "Checkout: still functional (not affected)",
                ],
                "error_rate": 0.15,
                "duration_minutes": 8,
                "revenue_impact_per_min": 800,
            },
            "known_services": {"cdn-edge", "product-service", "image-service"},
            "tool_responses": {
                "query_logs": {
                    "cdn-edge": (
                        "2024-03-15T10:22:00Z INFO cache MISS ratio: 97% (5min window)\n"
                        "2024-03-15T10:20:11Z WARN mass cache invalidation β€” "
                        "2,100,000 keys purged by purge-job-prod\n"
                        "2024-03-15T10:20:10Z INFO purge pattern: /* (ALL keys)"
                    ),
                    "product-service": (
                        "2024-03-15T10:22:05Z WARN request queue depth: 12,400\n"
                        "2024-03-15T10:22:06Z ERROR timeout fetching from image-service (18s)\n"
                        "2024-03-15T10:22:07Z WARN worker pool 95% utilized"
                    ),
                    "image-service": (
                        "2024-03-15T10:22:00Z WARN CPU throttling engaged (95%)\n"
                        "2024-03-15T10:22:01Z ERROR worker pool exhausted β€” dropping requests\n"
                        "2024-03-15T10:22:02Z ERROR OOM risk: memory at 91%"
                    ),
                },
                "check_metrics": {
                    "cdn-edge": (
                        "Cache hit rate: 3% | Purge events (1h): 1 mass purge | "
                        "Origin RPS: 48,000 | Bandwidth: 890 Gbps"
                    ),
                    "product-service": (
                        "Origin RPS: 48,000 (normal: 1,200) | "
                        "Queue depth: 12,400 | Worker utilization: 95%"
                    ),
                    "image-service": (
                        "CPU: 95% | Memory: 91% | "
                        "Worker pool: 0 free / 200 | p99 latency: 18s"
                    ),
                },
                "check_dependencies": {
                    "cdn-edge": "Origin: product-service [OVERLOADED]",
                    "product-service": "Depends on: image-service [DEGRADED], postgres-db [OK]",
                    "image-service": "Depends on: object-storage [OK] β€” no upstream issues",
                },
                "check_recent_deploys": {
                    "cdn-edge": (
                        "Cronjob purge-job-prod updated 2h ago β€” "
                        "purge pattern changed from /images/* to /* (all keys)"
                    ),
                    "product-service": "Last deploy: 5 days ago β€” no recent changes",
                    "image-service": "Last deploy: 2 days ago β€” no recent changes",
                },
            },
            "correct_severity": "P2",
            "adjacent_severities": ["P1", "P3"],
            "correct_root_cause": {
                "service": "cdn-edge",
                "failure_mode": "misconfigured purge job invalidated all cache keys",
            },
            "correct_remediation": [
                "disable_feature_flag:purge-job-prod",
                "execute_runbook_step:warm_cdn_cache",
                "scale_service:image-service",
            ],
            "wrong_actions": {
                "restart_service:image-service": (
                    "Restarting won't fix the CDN miss storm β€” source is the purge job"
                ),
                "rollback_deploy:product-service": "product-service has no recent deploys",
                "restart_service:cdn-edge": (
                    "Restarting CDN edge nodes will make cache miss rate worse temporarily"
                ),
            },
        },
    ],

    # ── TASK 2: ROOT CAUSE ANALYSIS ─────────────────────────────────────────

    "root_cause_analysis": [

        # RCA-001: Analytics service OOM kills postgres-db
        {
            "scenario_id": "RCA-001",
            "description": (
                "postgres-db was OOM-killed by the Linux kernel after a runaway "
                "analytics query with no LIMIT clause consumed all available memory. "
                "All downstream services are now failing."
            ),
            "incident_summary": (
                "Multiple services down: api-gateway 503, auth-service failing, "
                "order-service write failures. postgres-db restarting in a loop. "
                "Root cause upstream β€” trace the failure chain."
            ),
            "alert": {
                "id": "ALT-RCA-001",
                "title": "CRITICAL: postgres-db crash loop β€” all dependents down",
                "severity_fired": "P1",
                "affected_services": [
                    "api-gateway", "auth-service", "order-service", "postgres-db"
                ],
                "symptoms": [
                    "postgres-db: 4 restarts in 12 minutes",
                    "auth-service: connection refused β€” 100% failure",
                    "order-service: all writes failing",
                    "api-gateway: 503 on all authenticated routes",
                    "analytics-service: last job failed 12 min ago",
                ],
                "error_rate": 0.95,
                "duration_minutes": 14,
            },
            "known_services": {
                "api-gateway", "auth-service", "order-service",
                "postgres-db", "analytics-service", "redis-session",
            },
            "tool_responses": {
                "query_logs": {
                    "postgres-db": (
                        "2024-03-16T02:11:00Z LOG database system shut down at 02:10:58\n"
                        "2024-03-16T02:10:58Z FATAL Out of Memory: Kill process 1847 (postgres) "
                        "score 982 or sacrifice child\n"
                        "2024-03-16T02:10:30Z LOG process 1847 query running 12min: "
                        "SELECT * FROM events JOIN user_sessions JOIN orders "
                        "JOIN products β€” no LIMIT clause, est 847M rows"
                    ),
                    "analytics-service": (
                        "2024-03-16T01:58:00Z INFO starting job: full_history_export\n"
                        "2024-03-16T01:58:01Z WARN query has no LIMIT β€” estimated 847M rows\n"
                        "2024-03-16T02:10:55Z ERROR job killed by OOM β€” full_history_export FAILED"
                    ),
                    "auth-service": (
                        "2024-03-16T02:11:05Z ERROR connect ECONNREFUSED postgres-db:5432\n"
                        "2024-03-16T02:11:06Z ERROR all retries exhausted β€” giving up"
                    ),
                    "api-gateway": (
                        "2024-03-16T02:11:10Z ERROR upstream auth-service: 503 Service Unavailable"
                    ),
                    "order-service": (
                        "2024-03-16T02:11:08Z ERROR pq: the database system is starting up"
                    ),
                    "redis-session": "No errors β€” operating normally at 99.2% hit rate",
                },
                "check_metrics": {
                    "postgres-db": (
                        "Memory: OOM killed (0% free at crash) | "
                        "Restarts: 4 in 12min | Status: RESTARTING"
                    ),
                    "analytics-service": (
                        "Memory at crash: 31.2GB / 32GB (97.5%) | "
                        "Job runtime: 12min 55s | Status: ERROR"
                    ),
                    "auth-service": "Connection success: 0% | DB: CRITICAL | Redis: OK",
                    "api-gateway": "503 rate: 95% | Auth dependency: DOWN",
                    "order-service": "Write success: 0% | DB: RESTARTING",
                    "redis-session": "Hit rate: 99.2% | Memory: 42% | Healthy",
                },
                "check_dependencies": {
                    "postgres-db": (
                        "Clients: auth-service, order-service, analytics-service, product-service"
                    ),
                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
                    "auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
                    "api-gateway": "Depends on: auth-service [DOWN]",
                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
                    "redis-session": "No DB dependency β€” standalone cache",
                },
                "check_recent_deploys": {
                    "analytics-service": (
                        "Deploy 6h ago: added full_history_export scheduled job β€” "
                        "runs daily at 02:00 UTC, no LIMIT on cross-table JOIN"
                    ),
                    "postgres-db": "No deploys in 3 weeks",
                    "auth-service": "No recent deploys",
                    "order-service": "No recent deploys",
                    "redis-session": "No recent deploys",
                },
                "check_service_status": {
                    "postgres-db": "RESTARTING | Uptime: 47s | Crash reason: OOM",
                    "analytics-service": "ERROR | Last job: full_history_export FAILED",
                    "auth-service": "DOWN | Waiting for postgres-db",
                    "api-gateway": "DEGRADED | 95% requests failing",
                    "order-service": "DOWN | Waiting for postgres-db",
                    "redis-session": "HEALTHY | All normal",
                },
            },
            "correct_root_cause": {
                "service": "analytics-service",
                "failure_mode": "unbounded query OOM killing postgres-db",
            },
            "correct_remediation": [
                "disable_feature_flag:full_history_export",
                "restart_service:analytics-service",
                "restart_service:postgres-db",
            ],
            "wrong_actions": {
                "restart_service:auth-service": "auth-service is a victim β€” DB must be fixed first",
                "restart_service:api-gateway": "api-gateway is downstream β€” won't help",
                "scale_service:postgres-db": "Scaling won't prevent OOM if the bad query runs again",
                "rollback_deploy:postgres-db": "postgres-db has no recent deploys",
            },
        },

        # RCA-002: BGP route withdrawal β€” AZ network partition
        {
            "scenario_id": "RCA-002",
            "description": (
                "A BGP route withdrawal isolated AZ-1 (where payment-service runs) "
                "from AZ-2 and AZ-3, causing 61% of checkout requests to fail. "
                "Services within AZ-1 are healthy β€” it's a pure network issue."
            ),
            "incident_summary": (
                "Checkout failure rate 61% β€” AZ-2 and AZ-3 cannot reach payment-service "
                "in AZ-1. AZ-1 users unaffected. fraud-detection-service also unreachable "
                "cross-AZ. Network infrastructure change 18 min ago."
            ),
            "alert": {
                "id": "ALT-RCA-002",
                "title": "HIGH: checkout failure 61% β€” cross-AZ connectivity loss",
                "severity_fired": "P2",
                "affected_services": [
                    "order-service", "payment-service", "fraud-detection-service"
                ],
                "symptoms": [
                    "checkout failure rate: 61% (AZ-2/AZ-3 only)",
                    "payment-service: unreachable from AZ-2, AZ-3",
                    "fraud-detection-service: timeout from AZ-2, AZ-3",
                    "AZ-1 users: 0% failure rate",
                    "Network: AZ-2/AZ-3 β†’ AZ-1 routing broken",
                ],
                "error_rate": 0.61,
                "duration_minutes": 9,
            },
            "known_services": {
                "order-service", "payment-service", "fraud-detection-service",
                "postgres-db", "redis-payment-cache", "network-infra",
            },
            "tool_responses": {
                "query_logs": {
                    "order-service": (
                        "2024-03-17T14:32:10Z ERROR connection timeout payment-service:8080 "
                        "(AZ-2 to AZ-1: no route to host)\n"
                        "2024-03-17T14:32:11Z ERROR fraud-detection-service: i/o timeout (30s)"
                    ),
                    "payment-service": (
                        "2024-03-17T14:31:58Z WARN health check from AZ-2 LB failing\n"
                        "2024-03-17T14:31:59Z INFO AZ-1 local traffic: all normal"
                    ),
                    "fraud-detection-service": (
                        "2024-03-17T14:32:00Z INFO AZ-1 requests: all normal\n"
                        "2024-03-17T14:32:01Z WARN cross-AZ health probes: 100% timeout"
                    ),
                    "network-infra": (
                        "2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.2.1 route withdrawal β€” "
                        "AZ-2 lost route to AZ-1 CIDR 10.0.1.0/24\n"
                        "2024-03-17T14:31:45Z CRITICAL BGP peer 10.0.3.1 route withdrawal β€” "
                        "AZ-3 lost route to AZ-1 CIDR 10.0.1.0/24\n"
                        "2024-03-17T14:31:44Z INFO router config change applied β€” "
                        "BGP advertisement policy updated"
                    ),
                    "postgres-db": "Operating normally β€” no errors detected",
                    "redis-payment-cache": "Operating normally β€” AZ-1 traffic only, all healthy",
                },
                "check_metrics": {
                    "order-service": (
                        "AZ-2 checkout failure: 99% | AZ-3 checkout failure: 98% | "
                        "AZ-1 checkout failure: 0.2% (baseline)"
                    ),
                    "payment-service": (
                        "AZ-1 traffic: normal (100% success) | "
                        "AZ-2/AZ-3 inbound connections: 0 (blocked)"
                    ),
                    "fraud-detection-service": (
                        "AZ-1 processing: normal | "
                        "Cross-AZ health checks: 100% timeout"
                    ),
                    "network-infra": (
                        "BGP session AZ-2: WITHDRAWN | BGP session AZ-3: WITHDRAWN | "
                        "AZ-1 internal: all UP | Config change: 18min ago"
                    ),
                    "postgres-db": "All metrics normal β€” no anomalies",
                    "redis-payment-cache": "All metrics normal β€” AZ-1 only traffic",
                },
                "check_dependencies": {
                    "order-service": (
                        "Depends on: payment-service [PARTITIONED], "
                        "fraud-detection-service [PARTITIONED]"
                    ),
                    "payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
                    "fraud-detection-service": "Depends on: postgres-db [OK]",
                    "network-infra": "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN], AZ-1 [UP]",
                },
                "check_recent_deploys": {
                    "network-infra": (
                        "Router config change 18min ago β€” BGP route advertisement policy update: "
                        "inadvertently withdrew AZ-1 routes from AZ-2/AZ-3 peers"
                    ),
                    "payment-service": "No recent deploys",
                    "order-service": "No recent deploys",
                    "fraud-detection-service": "No recent deploys",
                },
                "check_service_status": {
                    "payment-service": "HEALTHY within AZ-1 | Cross-AZ: UNREACHABLE",
                    "order-service": "DEGRADED | AZ-2/AZ-3 instances failing",
                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN | AZ-1: UP",
                    "fraud-detection-service": "HEALTHY within AZ-1 | Cross-AZ: UNREACHABLE",
                    "postgres-db": "HEALTHY",
                    "redis-payment-cache": "HEALTHY",
                },
            },
            "correct_root_cause": {
                "service": "network-infra",
                "failure_mode": "BGP route withdrawal causing AZ network partition",
            },
            "correct_remediation": [
                "execute_runbook_step:restore_bgp_routes",
                "rollback_deploy:network-infra",
            ],
            "wrong_actions": {
                "restart_service:payment-service": (
                    "payment-service is healthy β€” restarting won't fix routing"
                ),
                "restart_service:order-service": "order-service is a victim of the partition",
                "scale_service:payment-service": "Scaling won't fix a BGP routing issue",
                "clear_cache:redis-payment-cache": "Cache is healthy β€” not the cause",
            },
        },
    ],

    # ── TASK 3: REMEDIATION PLANNING ────────────────────────────────────────

    "remediation_planning": [

        # RP-001: Full OOM remediation
        {
            "scenario_id": "RP-001",
            "description": (
                "Full remediation: analytics-service OOM-killed postgres-db with an "
                "unbounded query. Must disable the offending job, restart postgres, "
                "restore all downstream services, and document the resolution."
            ),
            "incident_summary": (
                "CRITICAL β€” postgres-db in OOM crash loop. auth-service, order-service, "
                "api-gateway all down. analytics-service caused it with unbounded query. "
                "Required actions: disable job, restart postgres, restore services, document."
            ),
            "alert": {
                "id": "ALT-RP-001",
                "title": "CRITICAL: postgres-db OOM crash loop β€” full stack down",
                "severity_fired": "P1",
                "affected_services": [
                    "postgres-db", "analytics-service",
                    "auth-service", "order-service", "api-gateway"
                ],
            },
            "known_services": {
                "postgres-db", "auth-service", "order-service",
                "api-gateway", "analytics-service",
            },
            "tool_responses": {
                "query_logs": {
                    "postgres-db": (
                        "FATAL: Out of Memory: Kill process (postgres) β€” "
                        "analytics query running 12min with no LIMIT"
                    ),
                    "analytics-service": (
                        "ERROR: full_history_export β€” unbounded JOIN, 847M rows, killed by OOM"
                    ),
                    "auth-service": "ERROR: connect ECONNREFUSED postgres-db:5432",
                    "order-service": "ERROR: pq: the database system is starting up",
                    "api-gateway": "ERROR: upstream auth-service 503",
                },
                "check_metrics": {
                    "postgres-db": "Memory: OOM | Restarts: 4 | Status: CRASH LOOP",
                    "analytics-service": "Memory spike: 31GB/32GB | Status: ERROR",
                    "auth-service": "Connection success: 0% | Waiting for DB",
                    "order-service": "Write success: 0% | Waiting for DB",
                    "api-gateway": "503 rate: 95% | Auth: DOWN",
                },
                "check_dependencies": {
                    "postgres-db": "Clients: auth-service, order-service, analytics-service",
                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
                    "auth-service": "Depends on: postgres-db [CRASH LOOP]",
                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
                },
                "check_recent_deploys": {
                    "analytics-service": (
                        "Deploy 6h ago: full_history_export job β€” "
                        "unbounded cross-table JOIN query"
                    ),
                    "postgres-db": "No recent changes",
                },
                "check_service_status": {
                    "postgres-db": "CRASH LOOP | OOM kill | Uptime: 47s",
                    "analytics-service": "ERROR | Last job failed",
                    "auth-service": "DOWN",
                    "order-service": "DOWN",
                    "api-gateway": "DEGRADED",
                },
            },
            "remediation_data": {
                "disable_feature_flag": {
                    "full_history_export": (
                        "Cron job full_history_export DISABLED β€” "
                        "no more unbounded queries will run"
                    ),
                },
                "restart_service": {
                    "postgres-db": (
                        "postgres-db restarted cleanly β€” "
                        "accepting connections (12/500 active)"
                    ),
                    "analytics-service": (
                        "analytics-service restarted β€” no active queries"
                    ),
                    "auth-service": (
                        "auth-service restarted β€” reconnected to postgres-db OK"
                    ),
                    "order-service": (
                        "order-service restarted β€” writes resuming normally"
                    ),
                },
                "execute_runbook_step": {
                    "verify_db_health": (
                        "postgres-db: connections 12/500, CPU 12%, Memory 34% β€” healthy"
                    ),
                    "check_service_recovery": (
                        "auth-service OK | order-service OK | api-gateway OK"
                    ),
                },
            },
            "correct_remediation_sequence": [
                "disable_feature_flag:full_history_export",
                "restart_service:analytics-service",
                "restart_service:postgres-db",
                "restart_service:auth-service",
                "restart_service:order-service",
            ],
            "wrong_actions": {
                "rollback_deploy:postgres-db": (
                    "postgres-db has no recent deploy to roll back"
                ),
                "scale_service:postgres-db": (
                    "Scaling won't prevent the OOM query from running again"
                ),
                "restart_service:api-gateway": (
                    "api-gateway is downstream β€” fix the DB first"
                ),
            },
            "resolution_keywords": [
                "analytics", "oom", "memory", "postgres", "query",
                "full_history_export", "disabled", "restarted", "recovered",
            ],
        },

        # RP-002: Full BGP remediation
        {
            "scenario_id": "RP-002",
            "description": (
                "Full remediation: BGP route withdrawal partitioned AZ-2/AZ-3 from "
                "AZ-1 where payment-service runs. Must restore BGP routes, roll back "
                "the router config change, verify checkout recovery, and document."
            ),
            "incident_summary": (
                "P2 β€” BGP partition isolating payment-service from 61% of users. "
                "Router config change 18min ago is the cause. "
                "Required: restore BGP routes, rollback network config, verify recovery."
            ),
            "alert": {
                "id": "ALT-RP-002",
                "title": "HIGH: checkout 61% failure β€” BGP AZ partition",
                "severity_fired": "P2",
                "affected_services": ["network-infra", "order-service", "payment-service"],
            },
            "known_services": {
                "network-infra", "order-service", "payment-service",
                "fraud-detection-service", "postgres-db",
            },
            "tool_responses": {
                "query_logs": {
                    "network-infra": (
                        "CRITICAL: BGP route withdrawal β€” "
                        "AZ-2/AZ-3 lost route to AZ-1 10.0.1.0/24\n"
                        "Router config change 18min ago: BGP policy updated"
                    ),
                    "order-service": (
                        "ERROR: connection timeout payment-service β€” no route to host"
                    ),
                    "payment-service": (
                        "INFO: AZ-1 traffic normal | "
                        "WARN: cross-AZ health checks failing"
                    ),
                },
                "check_metrics": {
                    "network-infra": (
                        "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN | AZ-1: UP"
                    ),
                    "order-service": "AZ-2 failure: 99% | AZ-1 failure: 0.2%",
                    "payment-service": "AZ-1: normal | Cross-AZ inbound: 0",
                },
                "check_dependencies": {
                    "order-service": "Depends on: payment-service [PARTITIONED]",
                    "payment-service": "Depends on: postgres-db [OK]",
                    "network-infra": "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN]",
                },
                "check_recent_deploys": {
                    "network-infra": (
                        "Config change 18min ago β€” BGP policy update "
                        "accidentally withdrew AZ-1 routes"
                    ),
                    "payment-service": "No recent deploys",
                    "order-service": "No recent deploys",
                },
                "check_service_status": {
                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
                    "payment-service": "HEALTHY (AZ-1) | Cross-AZ: UNREACHABLE",
                    "order-service": "DEGRADED",
                },
            },
            "remediation_data": {
                "rollback_deploy": {
                    "network-infra": (
                        "Router config rolled back β€” "
                        "BGP advertisement policy restored to previous version"
                    ),
                },
                "execute_runbook_step": {
                    "restore_bgp_routes": (
                        "BGP routes restored β€” AZ-2/AZ-3 can now reach AZ-1 10.0.1.0/24"
                    ),
                    "verify_checkout_recovery": (
                        "Checkout failure rate: 0.3% β€” incident fully resolved"
                    ),
                },
            },
            "correct_remediation_sequence": [
                "execute_runbook_step:restore_bgp_routes",
                "rollback_deploy:network-infra",
                "execute_runbook_step:verify_checkout_recovery",
            ],
            "wrong_actions": {
                "restart_service:payment-service": (
                    "payment-service is healthy β€” network is the issue"
                ),
                "scale_service:payment-service": "Scaling won't fix BGP routing",
                "restart_service:order-service": "order-service is a victim",
                "clear_cache": "Cache is unrelated to network routing",
            },
            "resolution_keywords": [
                "bgp", "network", "route", "rollback", "partition",
                "restored", "az-1", "az-2", "az-3", "checkout", "withdrawal",
            ],
        },
    ],
}


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def get_task(task_id: str) -> dict:
    """Look up a task definition by its identifier.

    Args:
        task_id: Key into the module-level ``ALL_TASKS`` mapping.

    Returns:
        The value stored in ``ALL_TASKS`` under ``task_id``.

    Raises:
        ValueError: If ``task_id`` is not a key of ``ALL_TASKS``; the message
            lists every valid task ID.
    """
    if task_id in ALL_TASKS:
        return ALL_TASKS[task_id]

    # Unknown ID: surface the full set of valid IDs to help the caller recover.
    valid_ids = list(ALL_TASKS.keys())
    raise ValueError(
        f"Unknown task_id '{task_id}'. Valid task IDs: {valid_ids}"
    )


def get_scenario(task_id: str, index: int) -> dict:
    """Return the scenario at position ``index`` for the given task.

    Args:
        task_id: Key into the module-level ``SCENARIOS`` mapping.
        index: Zero-based position within that task's scenario list.

    Returns:
        The entry stored at ``SCENARIOS[task_id][index]``.

    Raises:
        ValueError: If ``task_id`` has no entry in ``SCENARIOS``, if the
            task's scenario list is empty, or if ``index`` falls outside
            ``0 .. len(scenarios) - 1``.
    """
    if task_id not in SCENARIOS:
        raise ValueError(f"No scenarios for task_id '{task_id}'.")
    scenarios = SCENARIOS[task_id]

    # An empty list has no valid index at all; say so instead of printing
    # a nonsensical "0–-1" range.
    if not scenarios:
        raise ValueError(
            f"Scenario index {index} out of range for task '{task_id}' "
            f"(no scenarios defined)"
        )

    # Negative indices are rejected explicitly rather than wrapping around.
    if not 0 <= index < len(scenarios):
        raise ValueError(
            f"Scenario index {index} out of range for task '{task_id}' "
            f"(valid: 0–{len(scenarios) - 1})"
        )
    return scenarios[index]


def list_tasks() -> list:
    """Return a new list holding every value of ``ALL_TASKS``.

    The list is built fresh on each call, so callers may reorder or
    truncate it without affecting ``ALL_TASKS`` itself (the contained
    task objects are not copied).
    """
    return [*ALL_TASKS.values()]