File size: 64,496 Bytes
77eb356
 
 
a50dd28
 
 
 
77eb356
a50dd28
 
 
 
 
 
77eb356
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a50dd28
 
 
 
77eb356
 
 
 
 
 
 
 
 
a50dd28
77eb356
 
 
 
 
 
 
 
a50dd28
 
 
 
 
77eb356
 
 
 
 
 
 
 
 
 
a50dd28
77eb356
 
 
 
 
 
 
 
 
a50dd28
 
 
 
77eb356
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a50dd28
77eb356
 
 
 
a50dd28
77eb356
 
 
 
a50dd28
 
 
 
 
 
 
 
77eb356
 
a50dd28
77eb356
 
 
a50dd28
 
77eb356
 
a50dd28
 
 
77eb356
 
a50dd28
 
 
77eb356
 
 
 
a50dd28
 
 
77eb356
a50dd28
 
 
77eb356
 
 
 
 
a50dd28
 
 
77eb356
 
a50dd28
 
77eb356
 
a50dd28
 
77eb356
 
 
a50dd28
 
 
77eb356
 
a50dd28
 
 
77eb356
 
a50dd28
 
 
77eb356
 
a50dd28
77eb356
 
 
a50dd28
77eb356
 
 
a50dd28
 
 
77eb356
 
a50dd28
 
 
77eb356
 
a50dd28
 
 
77eb356
 
 
a50dd28
77eb356
a50dd28
 
77eb356
a50dd28
 
 
77eb356
 
 
 
 
a50dd28
 
77eb356
 
 
 
a50dd28
 
77eb356
 
a50dd28
 
 
77eb356
 
 
a50dd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77eb356
 
a50dd28
 
77eb356
a50dd28
 
 
 
 
 
77eb356
a50dd28
 
 
 
77eb356
 
a50dd28
 
 
77eb356
 
a50dd28
 
 
77eb356
 
a50dd28
 
77eb356
 
 
a50dd28
 
 
 
 
 
 
 
 
 
 
 
77eb356
 
 
a50dd28
 
 
77eb356
 
 
a50dd28
 
77eb356
 
a50dd28
 
 
77eb356
 
a50dd28
 
 
77eb356
 
 
 
a50dd28
 
77eb356
a50dd28
77eb356
a50dd28
77eb356
 
 
 
 
a50dd28
77eb356
 
a50dd28
 
 
8151d99
 
 
 
 
 
 
a50dd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77eb356
 
a50dd28
 
 
77eb356
 
a50dd28
 
77eb356
a50dd28
 
77eb356
a50dd28
 
 
77eb356
 
 
a50dd28
 
77eb356
 
a50dd28
 
 
 
 
 
77eb356
 
 
a50dd28
 
 
 
 
 
 
 
77eb356
 
 
a50dd28
 
 
 
 
 
77eb356
 
a50dd28
 
 
 
 
 
 
 
77eb356
 
 
a50dd28
77eb356
 
 
a50dd28
 
 
 
 
 
 
 
 
77eb356
 
 
a50dd28
 
 
77eb356
 
 
a50dd28
 
77eb356
 
a50dd28
 
 
 
77eb356
 
a50dd28
 
 
77eb356
 
 
 
a50dd28
 
 
 
77eb356
a50dd28
77eb356
 
 
 
 
a50dd28
77eb356
 
a50dd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77eb356
a50dd28
 
77eb356
 
a50dd28
 
 
77eb356
 
a50dd28
77eb356
 
a50dd28
 
 
77eb356
a50dd28
 
 
 
77eb356
a50dd28
77eb356
a50dd28
 
77eb356
a50dd28
 
 
 
77eb356
a50dd28
 
 
 
 
 
77eb356
a50dd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77eb356
a50dd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8151d99
 
 
 
 
 
 
 
a50dd28
8151d99
 
 
 
 
a50dd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77eb356
 
a50dd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77eb356
a50dd28
 
77eb356
a50dd28
77eb356
 
a50dd28
 
 
 
77eb356
a50dd28
 
 
 
 
 
 
 
 
 
 
 
 
77eb356
 
a50dd28
 
 
 
 
 
 
 
77eb356
 
 
a50dd28
 
77eb356
 
a50dd28
 
 
 
 
 
 
77eb356
 
 
 
a50dd28
 
 
 
 
 
 
 
 
 
 
 
77eb356
 
 
a50dd28
77eb356
 
 
a50dd28
 
 
77eb356
 
a50dd28
 
 
77eb356
 
a50dd28
 
 
77eb356
a50dd28
77eb356
 
 
 
a50dd28
 
77eb356
 
 
8151d99
 
 
 
 
 
77eb356
a50dd28
 
 
77eb356
a50dd28
 
 
 
 
 
77eb356
 
a50dd28
 
 
 
 
 
 
 
77eb356
 
a50dd28
 
 
 
77eb356
a50dd28
 
 
 
 
 
77eb356
 
 
a50dd28
 
77eb356
a50dd28
 
 
 
 
77eb356
 
a50dd28
 
 
 
 
 
 
 
77eb356
 
 
 
 
 
a50dd28
77eb356
 
 
a50dd28
 
 
 
 
 
 
77eb356
 
a50dd28
77eb356
a50dd28
77eb356
 
 
 
 
 
 
 
 
a50dd28
77eb356
 
a50dd28
 
 
 
 
 
 
 
77eb356
 
 
a50dd28
 
77eb356
 
 
a50dd28
77eb356
 
 
a50dd28
 
77eb356
 
a50dd28
 
 
 
77eb356
 
a50dd28
 
 
 
77eb356
 
 
 
a50dd28
77eb356
 
 
 
a50dd28
 
 
 
 
77eb356
a50dd28
 
 
77eb356
a50dd28
 
 
77eb356
 
a50dd28
 
 
 
 
 
 
 
 
 
 
77eb356
 
a50dd28
 
 
 
 
 
 
 
77eb356
 
 
a50dd28
 
77eb356
 
a50dd28
 
 
 
 
 
77eb356
 
a50dd28
 
 
 
 
 
 
 
77eb356
 
 
 
a50dd28
77eb356
 
a50dd28
 
 
77eb356
 
 
 
 
a50dd28
77eb356
 
 
a50dd28
 
 
 
 
 
 
 
 
77eb356
 
 
a50dd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8151d99
 
 
 
 
 
 
 
 
 
a50dd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77eb356
 
 
 
 
 
 
 
 
 
 
 
 
a50dd28
77eb356
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
"""
tasks.py — Task and scenario definitions for Cloud Incident Response OpenEnv.

Difficulty calibration targets:
  EASY   → 8B: 0.75-1.0,  70B: 0.85-1.0
  MEDIUM → 8B: 0.30-0.50,  70B: 0.45-0.65
  HARD   → 8B: 0.15-0.35,  70B: 0.30-0.50

Design principles for genuine difficulty:
  EASY: Alert metrics are clear. Only trick is P2-vs-P3 ambiguity.
  MEDIUM: Root cause buried. 8-10 known services. Multiple red herrings.
    incident_summary does NOT hint at root cause. Must investigate 4+ services.
  HARD: Same diagnosis challenge + 5-7 step remediation sequence +
    10+ known services (many wrong choices) + quality summary required.

Public API:
    get_task(task_id)            -> task metadata dict
    get_scenario(task_id, index) -> scenario dict
    list_tasks()                 -> list of task dicts
    ALL_TASKS                    -> dict[task_id -> metadata]
"""

from __future__ import annotations

# Task metadata keyed by task id (the module docstring describes this as
# ``dict[task_id -> metadata]``).  Every entry carries the same keys:
#   id                - repeats the dictionary key
#   name              - human-readable task title
#   difficulty        - "easy", "medium" or "hard"
#   max_steps         - presumably the per-episode step budget (confirm
#                       against the environment that consumes it)
#   score_range       - [lowest, highest] score
#   description       - instruction text for the task
#   available_actions - action names for the task; the submission action is
#                       listed last
#   submission_action - the action named in the description as the way to
#                       submit the final answer
#   scenarios         - number of scenarios for the task; presumably must
#                       equal len(SCENARIOS[task_id]) (confirm)
#
# NOTE: the descriptions use a real em dash (U+2014).  Earlier revisions held
# the mis-decoded three-character sequence instead, which leaked into the
# text presented as task instructions.
ALL_TASKS: dict[str, dict] = {
    "alert_classification": {
        "id": "alert_classification",
        "name": "Task 1: Alert Severity Classification",
        "difficulty": "easy",
        "max_steps": 3,
        "score_range": [0.0, 1.0],
        "description": (
            "An alert has fired. Query logs and metrics across affected services, "
            "then classify the incident severity: P1 (CRITICAL — complete outage or "
            "revenue >$1,000/min), P2 (HIGH — major degradation affecting most users), "
            "P3 (MEDIUM — partial/minor issue with graceful fallback), "
            "P4 (LOW — informational). Submit with submit_severity."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "submit_severity",
        ],
        "submission_action": "submit_severity",
        "scenarios": 3,
    },
    "root_cause_analysis": {
        "id": "root_cause_analysis",
        "name": "Task 2: Root Cause Analysis",
        "difficulty": "medium",
        "max_steps": 10,
        "score_range": [0.0, 1.0],
        "description": (
            "A production incident is active with multiple services showing errors. "
            "Use diagnostic tools to trace the failure chain. The root cause may be "
            "any service in the system — not necessarily one showing errors. "
            "Query logs, metrics, dependencies, and recent deploys across ALL "
            "available services to find the true trigger. Submit with submit_root_cause."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "check_service_status",
            "submit_root_cause",
        ],
        "submission_action": "submit_root_cause",
        "scenarios": 3,
    },
    "remediation_planning": {
        "id": "remediation_planning",
        "name": "Task 3: Incident Remediation",
        "difficulty": "hard",
        "max_steps": 15,
        "score_range": [0.0, 1.0],
        "description": (
            "A critical production incident requires full end-to-end resolution. "
            "Diagnose the root cause among many services, execute the correct "
            "remediation sequence (order matters — wrong actions are penalized), "
            "then submit a detailed resolution summary. Scored on diagnosis quality, "
            "remediation correctness, action efficiency, and documentation."
        ),
        "available_actions": [
            "query_logs",
            "check_metrics",
            "check_dependencies",
            "check_recent_deploys",
            "check_service_status",
            "restart_service",
            "rollback_deploy",
            "scale_service",
            "disable_feature_flag",
            "clear_cache",
            "execute_runbook_step",
            "submit_resolution",
        ],
        "submission_action": "submit_resolution",
        "scenarios": 3,
    },
}

# ---------------------------------------------------------------------------
# Scenario data — 3 tasks × 3 scenarios = 9 total episodes
# ---------------------------------------------------------------------------

SCENARIOS: dict = {

    # ══════════════════════════════════════════════════════════════════════
    # TASK 1: ALERT CLASSIFICATION (EASY)
    # Target: 8B→0.75-1.0, 70B→0.85-1.0
    #
    # AC-001: Clear P1 (78% errors, $12k/min) — should be trivial
    # AC-002: Ambiguous P2 (degraded but working, $800/min)
    # AC-003: Trap P3 (45% errors but zero revenue impact, graceful fallback)
    # ══════════════════════════════════════════════════════════════════════

    "alert_classification": [
        # AC-001: Clear P1
        {
            "scenario_id": "AC-001",
            "description": (
                "Cascading failure across multiple services. "
                "Assess severity based on user and revenue impact."
            ),
            "incident_summary": (
                "Alert fired: api-gateway reporting elevated error rates. "
                "Multiple downstream services affected. "
                "Assess the severity of this incident."
            ),
            "alert": {
                "id": "ALT-20240315-001",
                "title": "api-gateway error rate elevated",
                "severity_fired": "UNCLASSIFIED",
                "affected_services": ["api-gateway", "auth-service", "postgres-db"],
                "symptoms": [
                    "api-gateway: HTTP 503 rate 78% (baseline: 0.1%)",
                    "auth-service: connection timeout 94% of requests",
                    "postgres-db: connection pool 500/500 utilized",
                    "checkout flow: unavailable",
                    "user logins: failing",
                ],
                "error_rate": 0.78,
                "duration_minutes": 4,
                "revenue_impact_per_min": 12000,
            },
            "known_services": {"api-gateway", "auth-service", "postgres-db"},
            "tool_responses": {
                "query_logs": {
                    "api-gateway": (
                        "2024-03-15T10:04:12Z ERROR upstream timeout auth-service:8080\n"
                        "2024-03-15T10:04:13Z ERROR 503 Service Unavailable\n"
                        "2024-03-15T10:04:14Z ERROR circuit breaker OPEN"
                    ),
                    "auth-service": (
                        "2024-03-15T10:04:10Z ERROR too many clients already\n"
                        "2024-03-15T10:04:11Z ERROR connection pool exhausted (500/500)"
                    ),
                    "postgres-db": (
                        "2024-03-15T10:04:00Z FATAL remaining slots reserved for superuser\n"
                        "2024-03-15T10:04:01Z LOG max_connections=500 active=500"
                    ),
                },
                "check_metrics": {
                    "api-gateway": "5xx rate: 78% | p99: 30s | circuit_breaker: OPEN",
                    "auth-service": "Error rate: 94% | DB wait: 28s | Queue: 847",
                    "postgres-db": "Connections: 500/500 (100%) | CPU: 98% | Memory: 89%",
                },
                "check_dependencies": {
                    "api-gateway": "Depends on: auth-service [CRITICAL]",
                    "auth-service": "Depends on: postgres-db [CRITICAL]",
                    "postgres-db": "No upstream dependencies",
                },
                "check_recent_deploys": {
                    "api-gateway": "No recent changes",
                    "auth-service": "Deploy 47 min ago β€” connection pool size change",
                    "postgres-db": "No recent changes",
                },
            },
            "correct_severity": "P1",
            "adjacent_severities": ["P2"],
        },

        # AC-002: Ambiguous P2 — degraded but not down
        {
            "scenario_id": "AC-002",
            "description": (
                "Service degradation affecting page load times. "
                "Core transaction flows still operational. "
                "Assess severity carefully."
            ),
            "incident_summary": (
                "Alert fired: CDN cache performance degraded. "
                "Origin servers under increased load. "
                "Assess the severity of this incident."
            ),
            "alert": {
                "id": "ALT-20240315-002",
                "title": "CDN cache performance anomaly detected",
                "severity_fired": "UNCLASSIFIED",
                "affected_services": ["cdn-edge", "product-service", "image-service"],
                "symptoms": [
                    "CDN cache hit rate: 3% (normal: 94%)",
                    "product-service: elevated origin traffic",
                    "image-service: CPU 95%, p99 latency 18s",
                    "Product pages: loading slowly",
                    "Checkout: still functional",
                ],
                "error_rate": 0.15,
                "duration_minutes": 8,
                "revenue_impact_per_min": 800,
            },
            "known_services": {"cdn-edge", "product-service", "image-service"},
            "tool_responses": {
                "query_logs": {
                    "cdn-edge": (
                        "2024-03-15T10:22:00Z INFO cache MISS ratio: 97%\n"
                        "2024-03-15T10:20:11Z WARN mass cache invalidation β€” 2.1M keys purged\n"
                        "2024-03-15T10:20:10Z INFO purge pattern: /* (ALL keys)"
                    ),
                    "product-service": (
                        "2024-03-15T10:22:05Z WARN request queue depth: 12,400\n"
                        "2024-03-15T10:22:06Z ERROR timeout from image-service\n"
                        "2024-03-15T10:22:07Z WARN worker pool 95%"
                    ),
                    "image-service": (
                        "2024-03-15T10:22:00Z WARN CPU throttling 95%\n"
                        "2024-03-15T10:22:01Z ERROR worker pool exhausted\n"
                        "2024-03-15T10:22:02Z WARN memory at 91%"
                    ),
                },
                "check_metrics": {
                    "cdn-edge": "Cache hit: 3% | Origin RPS: 48,000 | Bandwidth: 890 Gbps",
                    "product-service": "Origin RPS: 48k (norm: 1.2k) | Queue: 12,400",
                    "image-service": "CPU: 95% | Memory: 91% | p99: 18s",
                },
                "check_dependencies": {
                    "cdn-edge": "Origin: product-service [OVERLOADED]",
                    "product-service": "Depends on: image-service [DEGRADED]",
                    "image-service": "Depends on: object-storage [OK]",
                },
                "check_recent_deploys": {
                    "cdn-edge": "Cronjob updated 2h ago β€” purge pattern changed",
                    "product-service": "No recent changes",
                    "image-service": "No recent changes",
                },
            },
            "correct_severity": "P2",
            "adjacent_severities": ["P1", "P3"],
        },

        # AC-003: P3 trap — high error rate but zero impact
        {
            "scenario_id": "AC-003",
            "description": (
                "Internal service reporting elevated errors. "
                "Determine actual user and business impact. "
                "Not all high error rates are critical."
            ),
            "incident_summary": (
                "Alert fired: recommendation-service error rate elevated to 45%. "
                "Assess the severity based on actual user and business impact."
            ),
            "alert": {
                "id": "ALT-20240315-003",
                "title": "recommendation-service error rate 45%",
                "severity_fired": "UNCLASSIFIED",
                "affected_services": ["recommendation-service", "product-service"],
                "symptoms": [
                    "recommendation-service: error rate 45% (baseline: 2%)",
                    "product-service: using fallback recommendation logic",
                    "User experience: default recommendations shown",
                    "Checkout: fully functional",
                    "Revenue: no measurable change",
                ],
                "error_rate": 0.45,
                "duration_minutes": 22,
                "revenue_impact_per_min": 0,
            },
            "known_services": {"recommendation-service", "product-service", "redis-reco-cache"},
            "tool_responses": {
                "query_logs": {
                    "recommendation-service": (
                        "2024-03-15T09:48:00Z ERROR model inference timeout (>5s)\n"
                        "2024-03-15T09:48:01Z WARN ML model server overloaded\n"
                        "2024-03-15T09:48:02Z INFO fallback: returning default recommendations"
                    ),
                    "product-service": (
                        "2024-03-15T09:48:05Z INFO recommendation-service returned defaults\n"
                        "2024-03-15T09:48:06Z INFO serving page with default recs β€” no user impact"
                    ),
                    "redis-reco-cache": "Operating normally β€” cache hit rate 88%",
                },
                "check_metrics": {
                    "recommendation-service": (
                        "Error rate: 45% | Fallback rate: 45% | "
                        "Model server: OVERLOADED | User impact: NONE (graceful)"
                    ),
                    "product-service": (
                        "Error rate: 0.1% (normal) | Checkout: 100% | Revenue: unchanged"
                    ),
                    "redis-reco-cache": "Hit rate: 88% | Memory: 34% | HEALTHY",
                },
                "check_dependencies": {
                    "recommendation-service": "Depends on: ML model server [SLOW]",
                    "product-service": "Depends on: recommendation-service [DEGRADED β€” has fallback]",
                    "redis-reco-cache": "No dependencies",
                },
                "check_recent_deploys": {
                    "recommendation-service": "Model update 3h ago β€” new model v2.4",
                    "product-service": "No recent changes",
                    "redis-reco-cache": "No recent changes",
                },
            },
            "correct_severity": "P3",
            "adjacent_severities": ["P2", "P4"],
        },
    ],

    # ══════════════════════════════════════════════════════════════════════
    # TASK 2: ROOT CAUSE ANALYSIS (MEDIUM)
    # Target: 8B→0.30-0.50, 70B→0.45-0.65
    #
    # KEY DESIGN RULES:
    # 1. Root cause service NEVER in affected_services
    # 2. incident_summary describes SYMPTOMS only, no hints
    # 3. 8-10 known_services (many to investigate)
    # 4. Red herring deploys on non-root-cause services
    # 5. Root cause only findable via check_recent_deploys + query_logs
    #    on the specific service β€” not from looking at victims
    # ══════════════════════════════════════════════════════════════════════

    "root_cause_analysis": [

        # RCA-001: analytics-service OOM kills postgres-db
        # 8 known services. Root cause: analytics-service.
        # Red herrings: auth-service deploy (cosmetic), redis healthy
        {
            "scenario_id": "RCA-001",
            "description": (
                "Multiple services reporting failures. Database appears to be "
                "the epicenter but the true trigger may be elsewhere."
            ),
            "incident_summary": (
                "Multiple services are failing. postgres-db is in a crash loop. "
                "auth-service, order-service, and api-gateway are all reporting errors. "
                "Investigate all available services to find what triggered this cascade."
            ),
            "alert": {
                "id": "ALT-RCA-001",
                "title": "Multiple service failures β€” database crash loop",
                "severity_fired": "P1",
                "affected_services": [
                    "api-gateway", "auth-service", "order-service", "postgres-db",
                ],
                "symptoms": [
                    "postgres-db: crash loop β€” 4 restarts in 12 minutes",
                    "auth-service: 100% connection failures",
                    "order-service: all writes failing",
                    "api-gateway: 503 on authenticated routes",
                ],
                "error_rate": 0.95,
                "duration_minutes": 14,
            },
            "known_services": {
                "api-gateway", "auth-service", "order-service",
                "postgres-db", "analytics-service", "redis-session",
                "product-service", "notification-service",
            },
            "tool_responses": {
                        # RCA-001 log excerpts, keyed by service name.
        "query_logs": {
            "postgres-db": (
    "2024-03-16T02:11:00Z LOG database system shut down\n"
    "2024-03-16T02:10:58Z FATAL terminated by kernel OOM killer\n"
    "2024-03-16T02:10:30Z LOG long-running analytics export query "
    "consuming 31.8GB/32GB β€” sequential scan on events table "
    "with cross-join, running 12 minutes, no LIMIT clause. "
    "Investigate analytics-service scheduled jobs"
),
            "analytics-service": (
                "2024-03-16T01:58:00Z INFO starting scheduled job: full_history_export\n"
                "2024-03-16T01:58:01Z DEBUG executing: SELECT * FROM events "
                "JOIN user_sessions ON ... JOIN orders ON ... β€” no LIMIT\n"
                "2024-03-16T01:58:02Z WARN query plan estimates 847M row scan\n"
                "2024-03-16T02:10:55Z ERROR job terminated β€” connection to database lost"
            ),
            "auth-service": (
                "2024-03-16T02:11:05Z ERROR connect ECONNREFUSED postgres-db:5432\n"
                "2024-03-16T02:11:06Z ERROR all retries exhausted"
            ),
            "api-gateway": (
                "2024-03-16T02:11:10Z ERROR upstream auth-service: 503"
            ),
            "order-service": (
                "2024-03-16T02:11:08Z ERROR pq: database system is starting up"
            ),
            "redis-session": "No errors β€” operating normally",
            "product-service": (
                "2024-03-16T02:11:12Z WARN DB queries failing β€” serving cached data"
            ),
            "notification-service": (
                "2024-03-16T02:11:15Z ERROR cannot send β€” user lookup failed"
            ),
        },
                "check_metrics": {
                    "postgres-db": (
                        "Memory: peaked at 31.8GB/32GB before kill | "
                        "Restarts: 4 in 12min | Status: RESTARTING | "
                        "Heaviest client: 10.0.5.47"
                    ),
                    "analytics-service": (
                        "Last job: FAILED | Memory during job: 28GB | "
                        "IP: 10.0.5.47 | CPU: idle (job terminated)"
                    ),
                    "auth-service": "Connections: 0% success | Queued requests: 1,200",
                    "api-gateway": "503 rate: 95% | Auth: DOWN",
                    "order-service": "Write success: 0% | DB: RESTARTING",
                    "redis-session": "Hit rate: 99.2% | Memory: 42% | HEALTHY",
                    "product-service": "Serving cached data | DB queries: 100% failing",
                    "notification-service": "Queue backlog: 8,400 | DB: DOWN",
                },
                "check_dependencies": {
                    "postgres-db": (
                        "Clients: auth-service, order-service, analytics-service, "
                        "product-service, notification-service"
                    ),
                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
                    "auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
                    "api-gateway": "Depends on: auth-service [DOWN], product-service [DEGRADED]",
                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
                    "redis-session": "Standalone cache β€” no DB dependency",
                    "product-service": "Depends on: postgres-db [CRASH LOOP β€” using cache]",
                    "notification-service": "Depends on: postgres-db [CRASH LOOP]",
                },
                "check_recent_deploys": {
                    "analytics-service": (
                        "Deploy 6h ago: added scheduled data export job β€” "
                        "runs daily at 02:00 UTC. Change includes cross-table "
                        "JOIN query without LIMIT clause"
                    ),
                    "postgres-db": "No deploys in 3 weeks",
                    "auth-service": (
                        "Deploy 2h ago: updated structured logging format. "
                        "No functional changes, no query changes, no connection changes."
                    ),
                    "order-service": "No recent deploys",
                    "redis-session": "No recent deploys",
                    "api-gateway": "No recent deploys",
                    "product-service": (
                        "Deploy 3 days ago: added product image lazy loading. "
                        "No DB changes."
                    ),
                    "notification-service": "No recent deploys",
                },
                "check_service_status": {
                    "postgres-db": "RESTARTING | Uptime: 47s | Last crash: OOM",
                    "analytics-service": "ERROR | Last job: FAILED 12min ago",
                    "auth-service": "DOWN | Blocked on postgres-db",
                    "api-gateway": "DEGRADED | 95% errors",
                    "order-service": "DOWN | Blocked on postgres-db",
                    "redis-session": "HEALTHY | 99.2% hit rate",
                    "product-service": "DEGRADED | Cache fallback active",
                    "notification-service": "DEGRADED | Queue backlog 8,400",
                },
            },
            "correct_root_cause": {
                "service": "analytics-service",
                "failure_mode": "unbounded query OOM killing postgres-db",
            },
            "wrong_actions": {
                "restart_service:auth-service": "victim β€” DB must be fixed first",
                "restart_service:api-gateway": "downstream β€” won't help",
                "restart_service:order-service": "victim β€” won't help",
                "scale_service:postgres-db": "won't prevent OOM from bad query",
                "rollback_deploy:postgres-db": "no recent deploys",
                "rollback_deploy:auth-service": "auth deploy was cosmetic only",
                "rollback_deploy:product-service": "product deploy unrelated",
                "restart_service:redis-session": "redis is healthy",
                "restart_service:notification-service": "victim β€” won't help",
            },
        },

        # RCA-002: network-infra BGP withdrawal
        # 8 known services. Root cause: network-infra.
        # Red herrings: payment-service looks down (it is healthy inside AZ-1);
        # postgres-db and api-gateway both have unrelated recent changes
        {
            "scenario_id": "RCA-002",
            "description": (
                "Checkout failures concentrated in specific availability zones. "
                "Some services appear unreachable while others work fine."
            ),
            "incident_summary": (
                "Checkout failure rate has spiked to 61%. payment-service and "
                "fraud-detection-service are unreachable from some parts of the "
                "infrastructure but appear healthy from others. Multiple services "
                "to investigate. Find the root cause."
            ),
            "alert": {
                "id": "ALT-RCA-002",
                "title": "Checkout failures β€” partial service unreachability",
                "severity_fired": "P2",
                "affected_services": [
                    "order-service", "payment-service", "fraud-detection-service",
                ],
                "symptoms": [
                    "checkout failure rate: 61%",
                    "payment-service: intermittently unreachable",
                    "fraud-detection-service: intermittently unreachable",
                    "failures appear zone-specific",
                ],
                "error_rate": 0.61,
                "duration_minutes": 9,
            },
            "known_services": {
                "order-service", "payment-service", "fraud-detection-service",
                "postgres-db", "redis-payment-cache", "network-infra",
                "cdn-edge", "api-gateway",
            },
            "tool_responses": {
                # RCA-002 query_logs: per-service log excerpts
        "query_logs": {
            "order-service": (
                "2024-03-17T14:32:10Z ERROR connection timeout "
                "payment-service:8080 β€” no route to host\n"
                "2024-03-17T14:32:11Z ERROR fraud-detection-service: i/o timeout\n"
                "2024-03-17T14:32:12Z WARN failures only from AZ-2/AZ-3, "
                "AZ-1 traffic normal β€” possible network-infra issue"
            ),
            "payment-service": (
                "2024-03-17T14:31:58Z WARN health check from external LB failing\n"
                "2024-03-17T14:31:59Z INFO local AZ-1 traffic: all normal\n"
                "2024-03-17T14:32:00Z INFO processing requests normally (local only)"
            ),
            "fraud-detection-service": (
                "2024-03-17T14:32:00Z INFO local requests: processing normally\n"
                "2024-03-17T14:32:01Z WARN external health probes: 100% timeout"
            ),
            "network-infra": (
                "2024-03-17T14:31:45Z CRITICAL BGP session 10.0.2.1 DOWN β€” "
                "routes to 10.0.1.0/24 withdrawn from peer\n"
                "2024-03-17T14:31:45Z CRITICAL BGP session 10.0.3.1 DOWN β€” "
                "routes to 10.0.1.0/24 withdrawn from peer\n"
                "2024-03-17T14:31:44Z INFO configuration change applied β€” "
                "export filter policy updated"
            ),
            "postgres-db": "Operating normally β€” no errors",
            "redis-payment-cache": "Operating normally β€” all healthy",
            "cdn-edge": "Operating normally β€” cache serving fine",
            "api-gateway": (
                "2024-03-17T14:32:15Z ERROR some backend routes timing out\n"
                "2024-03-17T14:32:16Z INFO AZ-1 backends: responding normally"
            ),
        },
                "check_metrics": {
                    "order-service": (
                        "Failure rate varies by source AZ: "
                        "AZ-1: 0.2% | AZ-2: 99% | AZ-3: 98%"
                    ),
                    "payment-service": (
                        "Internal processing: 100% success | "
                        "Inbound from AZ-2: 0 connections | Inbound from AZ-3: 0 connections | "
                        "Inbound from AZ-1: normal"
                    ),
                    "fraud-detection-service": (
                        "Internal: normal | External probes: 100% timeout"
                    ),
                    "network-infra": (
                        "BGP sessions: AZ-1 internal UP | "
                        "AZ-2β†’AZ-1: WITHDRAWN | AZ-3β†’AZ-1: WITHDRAWN | "
                        "Last change: 18min ago"
                    ),
                    "postgres-db": "All metrics normal",
                    "redis-payment-cache": "All metrics normal",
                    "cdn-edge": "Cache hit: 91% | Normal operation",
                    "api-gateway": "Mixed β€” AZ-1 OK, AZ-2/AZ-3 partial failures",
                },
                "check_dependencies": {
                    "order-service": (
                        "Depends on: payment-service [PARTIAL], "
                        "fraud-detection-service [PARTIAL]"
                    ),
                    "payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
                    "fraud-detection-service": "Depends on: postgres-db [OK]",
                    "network-infra": (
                        "BGP peers: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN], AZ-1 [UP]"
                    ),
                    "postgres-db": "All connections healthy",
                    "redis-payment-cache": "All connections healthy",
                    "cdn-edge": "No issues",
                    "api-gateway": "Depends on: multiple backends [MIXED]",
                },
                "check_recent_deploys": {
                    "network-infra": (
                        "Router configuration change 18min ago β€” modified BGP "
                        "export filter policy. Change accidentally removed AZ-1 "
                        "prefix 10.0.1.0/24 from advertisements to AZ-2 and AZ-3 peers."
                    ),
                    "payment-service": "No recent deploys",
                    "order-service": "No recent deploys",
                    "fraud-detection-service": "No recent deploys",
                    "postgres-db": (
                        "Minor config change 5 days ago β€” increased shared_buffers. "
                        "No issues since."
                    ),
                    "redis-payment-cache": "No recent deploys",
                    "cdn-edge": "No recent deploys",
                    "api-gateway": (
                        "Deploy 1 day ago β€” added request tracing headers. "
                        "No routing changes."
                    ),
                },
                "check_service_status": {
                    "payment-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
                    "order-service": "DEGRADED | Partial failures",
                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
                    "fraud-detection-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
                    "postgres-db": "HEALTHY",
                    "redis-payment-cache": "HEALTHY",
                    "cdn-edge": "HEALTHY",
                    "api-gateway": "DEGRADED | Mixed backend status",
                },
            },
            "correct_root_cause": {
                "service": "network-infra",
                "failure_mode": "BGP route withdrawal causing AZ network partition",
            },
            "wrong_actions": {
                "restart_service:payment-service": "healthy β€” network issue",
                "restart_service:order-service": "victim",
                "scale_service:payment-service": "won't fix routing",
                "clear_cache:redis-payment-cache": "cache is healthy",
                "restart_service:api-gateway": "victim of routing issue",
                "rollback_deploy:api-gateway": "deploy was unrelated tracing headers",
                "rollback_deploy:postgres-db": "config change was 5 days ago, unrelated",
                "restart_service:cdn-edge": "CDN is healthy",
            },
        },

        # RCA-003: config-service credential rotation bug
        # 8 known services. Root cause: config-service.
        # Red herrings: user-service had a recent deploy, postgres-db stressed
        {
            "scenario_id": "RCA-003",
            "description": (
                "Multiple services experiencing database authentication failures. "
                "The database itself may not be the problem."
            ),
            "incident_summary": (
                "Several services are reporting database authentication failures. "
                "postgres-db connection pool is saturated. user-service and "
                "notification-service are down. api-gateway error rate elevated. "
                "Investigate all services to find what triggered this."
            ),
            "alert": {
                "id": "ALT-RCA-003",
                "title": "Multiple services β€” database authentication failures",
                "severity_fired": "P2",
                "affected_services": [
                    "api-gateway", "user-service", "notification-service", "postgres-db",
                ],
                "symptoms": [
                    "user-service: FATAL password authentication failed",
                    "notification-service: FATAL password authentication failed",
                    "api-gateway: 503 rate 62%",
                    "postgres-db: connection pool 490/500",
                ],
                "error_rate": 0.62,
                "duration_minutes": 7,
            },
            "known_services": {
                "api-gateway", "user-service", "notification-service",
                "postgres-db", "config-service", "redis-session",
                "order-service", "product-service",
            },
            "tool_responses": {
                # RCA-003 query_logs: per-service log excerpts
        "query_logs": {
            "user-service": (
    "2024-03-18T08:14:00Z FATAL password authentication failed "
    "for user 'app_user'\n"
    "2024-03-18T08:14:01Z ERROR DB credentials rejected β€” "
    "credentials were last pushed by config-service secrets "
    "rotation at 08:12:00Z\n"
    "2024-03-18T08:14:02Z WARN credential hash mismatch β€” "
    "check config-service rotation job for issues"
),
            "notification-service": (
    "2024-03-18T08:14:05Z FATAL password authentication failed "
    "for user 'app_user'\n"
    "2024-03-18T08:14:06Z WARN credentials from config-service "
    "rotation at 08:12:00Z appear invalid"
),
            "api-gateway": (
                "2024-03-18T08:14:10Z ERROR upstream user-service: 503\n"
                "2024-03-18T08:14:11Z ERROR upstream notification-service: 503"
            ),
            "postgres-db": (
                "2024-03-18T08:14:00Z LOG auth failure from 10.0.3.x\n"
                "2024-03-18T08:14:00Z LOG auth failure from 10.0.4.x\n"
                "2024-03-18T08:14:01Z LOG 490/500 slots used by failed auth retries"
            ),
            "config-service": (
                "2024-03-18T08:12:00Z INFO secrets rotation job executed\n"
                "2024-03-18T08:12:01Z WARN rotation referenced PREVIOUS "
                "credential set instead of generating new β€” template bug "
                "in version v3.2.1\n"
                "2024-03-18T08:12:02Z INFO pushed credentials to: "
                "user-service, notification-service, order-service"
            ),
            "redis-session": "Operating normally",
            "order-service": (
                "2024-03-18T08:14:20Z WARN received credential push from "
                "config-service but have not restarted β€” still using old valid creds"
            ),
            "product-service": "Operating normally β€” using original credentials",
        },
                "check_metrics": {
                    "user-service": "DB auth: 100% failure | HTTP 503: 100%",
                    "notification-service": "DB auth: 100% failure | HTTP 503: 100%",
                    "api-gateway": "503 rate: 62% | Some upstreams DOWN",
                    "postgres-db": (
                        "Connections: 490/500 | Auth failures/s: 80 | "
                        "Valid connections: 10 | DB itself: HEALTHY"
                    ),
                    "config-service": (
                        "Status: HEALTHY | Last push: 7min ago | "
                        "Type: secrets_rotation | Result: COMPLETED"
                    ),
                    "redis-session": "All normal",
                    "order-service": "Using old credentials β€” still working",
                    "product-service": "All normal β€” unaffected",
                },
                "check_dependencies": {
                    "user-service": (
                        "Depends on: postgres-db [AUTH FAIL], "
                        "config-service [credential source]"
                    ),
                    "notification-service": (
                        "Depends on: postgres-db [AUTH FAIL], "
                        "config-service [credential source]"
                    ),
                    "api-gateway": "Depends on: user-service [DOWN], notification-service [DOWN]",
                    "postgres-db": "No upstream dependencies β€” DB is healthy",
                    "config-service": (
                        "Provides: credentials to user-service, "
                        "notification-service, order-service"
                    ),
                    "redis-session": "Standalone",
                    "order-service": (
                        "Depends on: postgres-db [OK β€” old creds], "
                        "config-service [pending push]"
                    ),
                    "product-service": "Depends on: postgres-db [OK β€” original creds]",
                },
                "check_recent_deploys": {
                    "config-service": (
                        "Deploy 2h ago: version v3.2.1 β€” updated secrets rotation "
                        "job template. Bug: rotation references previous credential "
                        "set instead of generating new credentials."
                    ),
                    "user-service": (
                        "Deploy 4h ago: added new profile API endpoint. "
                        "No database or credential changes."
                    ),
                    "notification-service": "No recent deploys",
                    "postgres-db": "No recent deploys",
                    "api-gateway": "No recent deploys",
                    "redis-session": "No recent deploys",
                    "order-service": (
                        "Deploy 1 day ago: updated order confirmation email template. "
                        "No DB changes."
                    ),
                    "product-service": "No recent deploys",
                },
                "check_service_status": {
                    "user-service": "DOWN | DB auth failures",
                    "notification-service": "DOWN | DB auth failures",
                    "api-gateway": "DEGRADED | 62% error rate",
                    "postgres-db": "STRESSED but HEALTHY | 490/500 connections (failed auths)",
                    "config-service": "HEALTHY | Last rotation: 7min ago (completed)",
                    "redis-session": "HEALTHY",
                    "order-service": "HEALTHY | Old credentials still valid",
                    "product-service": "HEALTHY",
                },
            },
            "correct_root_cause": {
                "service": "config-service",
                "failure_mode": "secrets rotation pushed stale credentials to downstream services",
            },
            "wrong_actions": {
                "restart_service:user-service": "will retry with same bad credentials",
                "restart_service:notification-service": "same bad credentials",
                "restart_service:postgres-db": "DB is healthy β€” client creds are bad",
                "scale_service:postgres-db": "connections are failed auths",
                "rollback_deploy:user-service": "user-service deploy was unrelated",
                "rollback_deploy:order-service": "order-service deploy was unrelated",
                "restart_service:api-gateway": "downstream β€” fix upstream first",
            },
        },
    ],

    # ══════════════════════════════════════════════════════════════════════
    # TASK 3: REMEDIATION PLANNING (HARD)
    # Target: 8B→0.15-0.35, 70B→0.30-0.50
    #
    # KEY DESIGN RULES:
    # 1. Same diagnostic challenge as medium
    # 2. 4-7 step remediation sequence required (RP-002 is the 4-step case)
    # 3. 8-10 known services = many wrong choices
    # 4. Wrong actions carry -0.05 penalty each (up to -0.15)
    # 5. Summary must hit 3+ keywords for bonus
    # 6. incident_summary does NOT reveal root cause
    # ══════════════════════════════════════════════════════════════════════

    "remediation_planning": [

        # RP-001: OOM remediation — 6-step sequence, 8 services
        {
            "scenario_id": "RP-001",
            "description": (
                "Full incident remediation required. Multiple services down. "
                "Diagnose the root cause, execute fixes in the correct order, "
                "and document your resolution."
            ),
            "incident_summary": (
                "CRITICAL β€” postgres-db is crash-looping. auth-service, order-service, "
                "and api-gateway are all down. notification-service queue backing up. "
                "Diagnose the root cause, fix it, restore all services, and document."
            ),
            "alert": {
                "id": "ALT-RP-001",
                "title": "CRITICAL: database crash loop β€” multiple services down",
                "severity_fired": "P1",
                "affected_services": [
                    "postgres-db", "auth-service", "order-service", "api-gateway",
                ],
            },
            "known_services": {
                "postgres-db", "auth-service", "order-service",
                "api-gateway", "analytics-service", "redis-session",
                "product-service", "notification-service",
            },
            "tool_responses": {
                "query_logs": {
                    # postgres-db log: OOM kill attributed to client 10.0.5.47
                    # (the same IP that analytics-service reports in check_metrics)
"postgres-db": (
    "FATAL: terminated by kernel OOM killer β€” "
    "query from client 10.0.5.47 running 12min consuming "
    "31.8GB of 32GB available memory"
),
                    "analytics-service": (
                        "INFO: starting job full_history_export\n"
                        "WARN: query plan: 847M rows, cross-table JOIN, no LIMIT\n"
                        "ERROR: job terminated β€” database connection lost"
                    ),
                    "auth-service": "ERROR: connect ECONNREFUSED postgres-db:5432",
                    "order-service": "ERROR: pq: database system is starting up",
                    "api-gateway": "ERROR: upstream auth-service 503",
                    "redis-session": "Operating normally",
                    "product-service": "WARN: DB failing β€” serving cached data",
                    "notification-service": "ERROR: user lookup failed β€” queuing",
                },
                "check_metrics": {
                    "postgres-db": "OOM killed | Restarts: 4 | Heaviest client: 10.0.5.47",
                    "analytics-service": "Job FAILED | Memory peak: 31GB/32GB | IP: 10.0.5.47",
                    "auth-service": "0% DB success | Queue: 1,200",
                    "order-service": "0% write success",
                    "api-gateway": "503 rate: 95%",
                    "redis-session": "HEALTHY | 99.2% hit rate",
                    "product-service": "Cache fallback active",
                    "notification-service": "Queue: 8,400 messages backed up",
                },
                "check_dependencies": {
                    "postgres-db": (
                        "Clients: auth-service, order-service, analytics-service, "
                        "product-service, notification-service"
                    ),
                    "analytics-service": "Depends on: postgres-db [CRASH LOOP]",
                    "auth-service": "Depends on: postgres-db [CRASH LOOP], redis-session [OK]",
                    "api-gateway": "Depends on: auth-service [DOWN]",
                    "order-service": "Depends on: postgres-db [CRASH LOOP]",
                    "redis-session": "Standalone",
                    "product-service": "Depends on: postgres-db [CRASH LOOP β€” cache fallback]",
                    "notification-service": "Depends on: postgres-db [CRASH LOOP]",
                },
                "check_recent_deploys": {
                    "analytics-service": (
                        "Deploy 6h ago: added scheduled export job β€” "
                        "cross-table JOIN without LIMIT clause"
                    ),
                    "postgres-db": "No deploys in 3 weeks",
                    "auth-service": "Deploy 2h ago: logging format only β€” no functional changes",
                    "order-service": "No recent deploys",
                    "product-service": "Deploy 3 days ago: image lazy loading β€” no DB changes",
                    "notification-service": "No recent deploys",
                },
                "check_service_status": {
                    "postgres-db": "CRASH LOOP | OOM | Uptime: 47s",
                    "analytics-service": "ERROR | Job FAILED",
                    "auth-service": "DOWN",
                    "order-service": "DOWN",
                    "api-gateway": "DEGRADED | 95% errors",
                    "redis-session": "HEALTHY",
                    "product-service": "DEGRADED | Cache fallback",
                    "notification-service": "DEGRADED | Queue backlog",
                },
            },
            "remediation_data": {
                "disable_feature_flag": {
                    "full_history_export": (
                        "Cron job full_history_export DISABLED β€” "
                        "unbounded query will not execute again"
                    ),
                },
                "restart_service": {
                    "postgres-db": "postgres-db restarted β€” accepting connections (12/500)",
                    "analytics-service": "analytics-service restarted β€” idle",
                    "auth-service": "auth-service restarted β€” connected to postgres-db OK",
                    "order-service": "order-service restarted β€” writes resuming",
                    "api-gateway": "api-gateway restarted β€” routing recovered",
                    "product-service": "product-service β€” switched from cache to live DB",
                    "notification-service": "notification-service β€” draining queue",
                },
                "execute_runbook_step": {
                    "verify_db_health": "postgres-db: 12/500 connections, CPU 12%, Memory 34% β€” healthy",
                    "check_service_recovery": (
                        "auth OK | order OK | api-gateway OK | product OK | notification DRAINING"
                    ),
                },
            },
            "correct_remediation_sequence": [
                "disable_feature_flag:full_history_export",
                "restart_service:analytics-service",
                "restart_service:postgres-db",
                "restart_service:auth-service",
                "restart_service:order-service",
                "execute_runbook_step:verify_db_health",
            ],
            "wrong_actions": {
                "rollback_deploy:postgres-db": "no recent deploy",
                "scale_service:postgres-db": "won't prevent OOM",
                "restart_service:api-gateway": "downstream β€” fix DB stack first",
                "rollback_deploy:auth-service": "cosmetic deploy only",
                "clear_cache:redis-session": "healthy β€” not related",
                "restart_service:redis-session": "healthy β€” not related",
                "rollback_deploy:product-service": "unrelated deploy",
                "restart_service:notification-service": "will recover once DB is up",
            },
            "resolution_keywords": [
                "analytics", "oom", "memory", "postgres", "query",
                "full_history_export", "disabled", "restarted",
                "recovered", "unbounded", "crash", "kill",
            ],
        },

        # RP-002: BGP remediation — 4-step sequence, 8 services
        {
            "scenario_id": "RP-002",
            "description": (
                "Full incident remediation required. Checkout failures affecting "
                "most users. Diagnose, fix, verify, and document."
            ),
            "incident_summary": (
                "Checkout failure rate 61%. payment-service unreachable from most "
                "of the infrastructure. Some services report no issues. "
                "Diagnose the root cause, execute remediation, verify recovery, "
                "and document the resolution."
            ),
            "alert": {
                "id": "ALT-RP-002",
                "title": "Checkout failures β€” partial service unreachability",
                "severity_fired": "P2",
                "affected_services": ["order-service", "payment-service"],
            },
            "known_services": {
                "network-infra", "order-service", "payment-service",
                "fraud-detection-service", "postgres-db",
                "redis-payment-cache", "cdn-edge", "api-gateway",
            },
            "tool_responses": {
                "query_logs": {
                    "network-infra": (
                        "CRITICAL: BGP peer 10.0.2.1 route withdrawal β€” "
                        "routes to 10.0.1.0/24 removed\n"
                        "CRITICAL: BGP peer 10.0.3.1 route withdrawal β€” "
                        "routes to 10.0.1.0/24 removed\n"
                        "INFO: configuration change applied β€” export filter updated"
                    ),
                    "order-service": "ERROR: timeout payment-service β€” no route to host",
                    "payment-service": "INFO: local traffic normal | WARN: external health failing",
                    "fraud-detection-service": "WARN: cross-AZ probes timeout | Local: OK",
                    "postgres-db": "Operating normally",
                    "redis-payment-cache": "Operating normally",
                    "cdn-edge": "Operating normally",
                    "api-gateway": "ERROR: some backend routes timing out",
                },
                "check_metrics": {
                    "network-infra": (
                        "BGP AZ-2β†’AZ-1: WITHDRAWN | AZ-3β†’AZ-1: WITHDRAWN | "
                        "AZ-1 internal: UP | Last change: 18min ago"
                    ),
                    "order-service": "AZ-1: 0.2% fail | AZ-2: 99% fail | AZ-3: 98% fail",
                    "payment-service": "Internal: 100% success | External: 0 inbound from AZ-2/3",
                    "fraud-detection-service": "Local: normal | External: timeout",
                    "postgres-db": "All normal",
                    "redis-payment-cache": "All normal",
                    "cdn-edge": "Cache: 91% hit | Normal",
                    "api-gateway": "Mixed β€” AZ-1 OK, AZ-2/3 partial failures",
                },
                "check_dependencies": {
                    "order-service": "Depends on: payment-service [PARTIAL], fraud-detection [PARTIAL]",
                    "payment-service": "Depends on: postgres-db [OK], redis-payment-cache [OK]",
                    "network-infra": "BGP: AZ-2 [WITHDRAWN], AZ-3 [WITHDRAWN]",
                    "fraud-detection-service": "Depends on: postgres-db [OK]",
                    "postgres-db": "All healthy",
                    "redis-payment-cache": "All healthy",
                    "cdn-edge": "No issues",
                    "api-gateway": "Mixed backends",
                },
                "check_recent_deploys": {
                    "network-infra": (
                        "Config change 18min ago β€” BGP export filter modified, "
                        "accidentally removed AZ-1 prefix from AZ-2/AZ-3 ads"
                    ),
                    "payment-service": "No recent deploys",
                    "order-service": "No recent deploys",
                    "fraud-detection-service": "No recent deploys",
                    "postgres-db": "Minor change 5 days ago β€” increased shared_buffers",
                    "redis-payment-cache": "No recent deploys",
                    "cdn-edge": "No recent deploys",
                    "api-gateway": "Deploy 1 day ago β€” tracing headers, no routing changes",
                },
                "check_service_status": {
                    "network-infra": "BGP AZ-2: WITHDRAWN | BGP AZ-3: WITHDRAWN",
                    "payment-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
                    "order-service": "DEGRADED",
                    "fraud-detection-service": "HEALTHY (local) | Cross-AZ: UNREACHABLE",
                    "postgres-db": "HEALTHY",
                    "redis-payment-cache": "HEALTHY",
                    "cdn-edge": "HEALTHY",
                    "api-gateway": "DEGRADED",
                },
            },
            "remediation_data": {
                "rollback_deploy": {
                    "network-infra": "Router config rolled back β€” BGP policy restored",
                },
                "execute_runbook_step": {
                    "restore_bgp_routes": "BGP routes restored β€” AZ-2/3 can reach AZ-1",
                    "verify_checkout_recovery": "Checkout failure: 0.3% β€” resolved",
                    "verify_cross_az_connectivity": "AZ-2β†’AZ-1: OK | AZ-3β†’AZ-1: OK",
                },
            },
            "correct_remediation_sequence": [
                "execute_runbook_step:restore_bgp_routes",
                "rollback_deploy:network-infra",
                "execute_runbook_step:verify_cross_az_connectivity",
                "execute_runbook_step:verify_checkout_recovery",
            ],
            "wrong_actions": {
                "restart_service:payment-service": "healthy β€” network issue",
                "scale_service:payment-service": "won't fix routing",
                "restart_service:order-service": "victim",
                "clear_cache:redis-payment-cache": "unrelated",
                "restart_service:cdn-edge": "healthy",
                "restart_service:fraud-detection-service": "healthy locally",
                "restart_service:api-gateway": "victim of routing",
                "rollback_deploy:api-gateway": "deploy was unrelated",
                "rollback_deploy:postgres-db": "change was 5 days ago",
            },
            "resolution_keywords": [
                "bgp", "network", "route", "rollback", "partition",
                "restored", "az-1", "az-2", "az-3", "checkout",
                "withdrawal", "config", "advertisement", "export",
            ],
        },

        # RP-003: Credential rotation remediation — 7-step sequence, 8 services
        {
            "scenario_id": "RP-003",
            "description": (
                "Full incident remediation required. Multiple services failing "
                "database authentication. Diagnose, fix, verify, and document."
            ),
            "incident_summary": (
                "Multiple services reporting database authentication failures. "
                "postgres-db connection pool near capacity with failed auth attempts. "
                "user-service and notification-service are down. api-gateway degraded. "
                "Diagnose the root cause, execute remediation, and document."
            ),
            "alert": {
                "id": "ALT-RP-003",
                "title": "Multiple services β€” DB authentication failures",
                "severity_fired": "P2",
                "affected_services": [
                    "user-service", "notification-service", "api-gateway",
                ],
            },
            "known_services": {
                "api-gateway", "user-service", "notification-service",
                "postgres-db", "config-service", "redis-session",
                "order-service", "product-service",
            },
            "tool_responses": {
                "query_logs": {
                    "user-service": (
    "FATAL: password authentication failed for user 'app_user'\n"
    "ERROR: DB credentials rejected\n"
    "WARN: credentials last refreshed at 08:12:00Z"
),

"notification-service": (
    "FATAL: password authentication failed\n"
    "WARN: credentials last refreshed at 08:12:00Z β€” "
    "authentication rejected by postgres-db"
),
                    "api-gateway": (
                        "ERROR: upstream user-service 503\n"
                        "ERROR: upstream notification-service 503"
                    ),
                    "postgres-db": (
                        "LOG: auth failure from 10.0.3.x (user-service)\n"
                        "LOG: auth failure from 10.0.4.x (notification-service)\n"
                        "LOG: 490/500 slots used by failed auth retries"
                    ),
                    "config-service": (
                        "INFO: secrets rotation executed at 08:12:00Z\n"
                        "WARN: rotation used PREVIOUS credential set β€” "
                        "template bug in v3.2.1\n"
                        "INFO: pushed to: user-service, notification-service, order-service"
                    ),
                    "redis-session": "Operating normally",
                    "order-service": (
                        "WARN: received credential push at 08:12:00Z β€” "
                        "not applied yet, still using old valid credentials"
                    ),
                    "product-service": "Operating normally β€” using original credentials",
                },
                "check_metrics": {
                    "user-service": "DB auth: 100% failure | HTTP 503: 100%",
                    "notification-service": "DB auth: 100% failure | HTTP 503: 100%",
                    "api-gateway": "503 rate: 62%",
                    "postgres-db": "Connections: 490/500 | Auth failures/s: 80 | DB: HEALTHY",
                    "config-service": "HEALTHY | Last push: 7min ago | Type: secrets_rotation",
                    "redis-session": "All normal",
                    "order-service": "HEALTHY | Using old (valid) credentials",
                    "product-service": "HEALTHY | Unaffected",
                },
                "check_dependencies": {
                    "user-service": "Depends on: postgres-db [AUTH FAIL], config-service [creds]",
                    "notification-service": "Depends on: postgres-db [AUTH FAIL], config-service [creds]",
                    "api-gateway": "Depends on: user-service [DOWN], notification-service [DOWN]",
                    "postgres-db": "No upstream β€” DB itself is healthy",
                    "config-service": "Provides credentials to: user-svc, notification-svc, order-svc",
                    "redis-session": "Standalone",
                    "order-service": "Depends on: postgres-db [OK β€” old creds]",
                    "product-service": "Depends on: postgres-db [OK β€” original creds]",
                },
                "check_recent_deploys": {
                    "config-service": (
                        "Deploy 2h ago: v3.2.1 β€” updated secrets rotation template. "
                        "Bug: references previous credential set instead of generating new."
                    ),
                    "user-service": "Deploy 4h ago: profile endpoint β€” no DB changes",
                    "notification-service": "No recent deploys",
                    "postgres-db": "No recent deploys",
                    "api-gateway": "No recent deploys",
                    "redis-session": "No recent deploys",
                    "order-service": "Deploy 1 day ago: email template β€” no DB changes",
                    "product-service": "No recent deploys",
                },
                "check_service_status": {
                    "user-service": "DOWN | DB auth failures",
                    "notification-service": "DOWN | DB auth failures",
                    "api-gateway": "DEGRADED | 62%",
                    "postgres-db": "STRESSED | 490/500 connections (failed auths)",
                    "config-service": "HEALTHY | Rotation completed",
                    "redis-session": "HEALTHY",
                    "order-service": "HEALTHY | Old creds valid",
                    "product-service": "HEALTHY",
                },
            },
            "remediation_data": {
                "rollback_deploy": {
                    "config-service": "config-service rolled back to v3.2.0 β€” bug removed",
                },
                "execute_runbook_step": {
                    "trigger_credential_rotation": (
                        "Correct credentials generated and pushed to "
                        "user-service, notification-service, order-service"
                    ),
                    "verify_db_connectivity": (
                        "user-service: DB OK | notification-service: DB OK | "
                        "order-service: DB OK | postgres-db: 45/500 connections"
                    ),
                    "verify_api_recovery": "api-gateway 503 rate: 0.1% β€” recovered",
                },
                "restart_service": {
                    "user-service": "user-service restarted β€” DB auth OK with correct creds",
                    "notification-service": "notification-service restarted β€” DB auth OK",
                    "order-service": "order-service restarted β€” using correct credentials",
                },
            },
            "correct_remediation_sequence": [
                "rollback_deploy:config-service",
                "execute_runbook_step:trigger_credential_rotation",
                "restart_service:user-service",
                "restart_service:notification-service",
                "restart_service:order-service",
                "execute_runbook_step:verify_db_connectivity",
                "execute_runbook_step:verify_api_recovery",
            ],
            "wrong_actions": {
                "restart_service:postgres-db": "DB is healthy β€” problem is credentials",
                "scale_service:postgres-db": "connections are failed auths",
                "restart_service:api-gateway": "downstream β€” fix auth first",
                "rollback_deploy:user-service": "deploy was unrelated",
                "rollback_deploy:order-service": "deploy was unrelated",
                "clear_cache:redis-session": "healthy",
                "restart_service:product-service": "healthy",
                "restart_service:redis-session": "healthy",
            },
            "resolution_keywords": [
                "config", "credential", "rotation", "stale", "password",
                "authentication", "rollback", "config-service", "v3.2.1",
                "restarted", "recovered", "push", "secrets", "template",
            ],
        },
    ],
}


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def get_task(task_id: str) -> dict:
    """Look up a task definition by its identifier.

    Args:
        task_id: Key expected to be present in the module-level
            ``ALL_TASKS`` mapping.

    Returns:
        The object stored under ``task_id`` in ``ALL_TASKS`` (the stored
        object itself, not a copy).

    Raises:
        ValueError: If ``task_id`` is not a key of ``ALL_TASKS``; the
            message lists every valid identifier.
    """
    if task_id in ALL_TASKS:
        return ALL_TASKS[task_id]

    # Unknown id: surface the accepted ids so the caller can correct it.
    valid_ids = list(ALL_TASKS.keys())
    raise ValueError(f"Unknown task_id '{task_id}'. Valid: {valid_ids}")


def get_scenario(task_id: str, index: int) -> dict:
    """Return the scenario at position ``index`` for the given task.

    Args:
        task_id: Key into the module-level ``SCENARIOS`` mapping.
        index: Zero-based position within that task's scenario sequence.
            Negative values are rejected instead of wrapping around the
            way plain sequence indexing would.

    Returns:
        The scenario stored at ``SCENARIOS[task_id][index]`` (the stored
        object itself, not a copy).

    Raises:
        ValueError: If ``task_id`` has no entry in ``SCENARIOS``, or if
            ``index`` is outside ``0 .. len(scenarios) - 1``.
    """
    if task_id not in SCENARIOS:
        raise ValueError(f"No scenarios for task_id '{task_id}'.")

    scenarios = SCENARIOS[task_id]
    count = len(scenarios)
    if not 0 <= index < count:
        # Plain ASCII keeps the message readable regardless of the encoding
        # of the terminal or log sink; an empty sequence has no valid index,
        # so say that explicitly instead of printing a "0--1" range.
        valid = f"0-{count - 1}" if count else "none - no scenarios defined"
        raise ValueError(
            f"Scenario index {index} out of range for task '{task_id}' "
            f"(valid: {valid})"
        )
    return scenarios[index]


def list_tasks() -> list:
    """Return a new list holding every value of the ``ALL_TASKS`` mapping.

    The list is built fresh on each call, in the mapping's iteration
    order; the elements are the stored objects themselves, not copies.
    """
    return [*ALL_TASKS.values()]