aamrinder committed
Commit e1fcd2c · verified · 1 Parent(s): 99ef18d

Upload server/scenarios.py with huggingface_hub

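The commit message above is the one huggingface_hub generates for a single-file upload. A minimal sketch of the kind of call that produces a commit like this; the repo_id and repo_type are placeholders, since the page does not show them:

# Hypothetical sketch of the upload behind this commit; repo_id and
# repo_type are placeholders (the actual repo is not shown on this page).
from huggingface_hub import HfApi

api = HfApi()  # picks up the token saved by `huggingface-cli login`
api.upload_file(
    path_or_fileobj="server/scenarios.py",
    path_in_repo="server/scenarios.py",
    repo_id="aamrinder/<repo>",  # placeholder
    repo_type="space",           # assumption; could equally be "model" or "dataset"
)
# With no explicit commit_message, the library defaults to
# "Upload server/scenarios.py with huggingface_hub", the message seen above.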
Files changed (1)
  1. server/scenarios.py +584 -0
server/scenarios.py ADDED
@@ -0,0 +1,584 @@
+"""Three task scenarios: easy, medium, hard."""
+
+SCENARIOS = {
+    # ═══════════════════════════════════════════════════════════════
+    # EASY — Memory Leak in API Gateway
+    # Clear logs, single root cause, no red herrings.
+    # An LLM should score 0.7-1.0 here.
+    # ═══════════════════════════════════════════════════════════════
+    "easy": {
+        "task_id": "easy",
+        "difficulty": "easy",
+        "title": "Memory Leak in API Gateway",
+        "description": (
+            "ALERT: api-gateway is experiencing high memory usage and intermittent OOM kills. "
+            "Users are reporting 502 errors. Investigate and resolve the issue."
+        ),
+        "max_steps": 15,
+        "optimal_steps": 5,
+        "degradation_rate": 0.03,
+        "root_cause": {
+            "service": "api-gateway",
+            "issue": "memory_leak",
+            "keywords": ["memory", "leak", "oom", "api-gateway"],
+        },
+        "dependency_graph": {
+            "frontend": ["api-gateway"],
+            "api-gateway": ["user-service", "order-service"],
+            "user-service": ["postgres-primary"],
+            "order-service": ["postgres-primary"],
+        },
+        "services": [
+            {
+                "name": "frontend",
+                "status": "degraded",
+                "cpu_percent": 25.0,
+                "memory_percent": 35.0,
+                "latency_ms": 800.0,
+                "error_rate": 15.0,
+                "logs": [
+                    "[ERROR] Upstream api-gateway returning 502 Bad Gateway",
+                    "[WARN] Response time exceeded 500ms threshold",
+                    "[ERROR] 15% of requests failing — user-facing errors",
+                    "[INFO] Health check: frontend process healthy, upstream degraded",
+                ],
+                "processes": [
+                    {"pid": "1001", "name": "nginx", "cpu_percent": 10.0, "memory_mb": 128.0},
+                    {"pid": "1002", "name": "node-frontend", "cpu_percent": 15.0, "memory_mb": 256.0},
+                ],
+            },
+            {
+                "name": "api-gateway",
+                "status": "degraded",
+                "cpu_percent": 45.0,
+                "memory_percent": 88.0,
+                "latency_ms": 1200.0,
+                "error_rate": 25.0,
+                "disk_usage_percent": 40.0,
+                "logs": [
+                    "[WARN] Memory usage at 85% — GC pressure increasing",
+                    "[ERROR] OOM kill detected on worker pid 2003 at 14:23:01",
+                    "[WARN] Memory usage at 88% — approaching critical threshold",
+                    "[ERROR] Request queue backing up — 250 pending requests",
+                    "[WARN] Memory leak detected in connection pool — objects not being freed",
+                    "[ERROR] OOM kill detected on worker pid 2004 at 14:25:33",
+                    "[INFO] Auto-restart triggered for worker processes",
+                    "[WARN] Memory usage climbing again after restart — leak persists",
+                ],
+                "processes": [
+                    {"pid": "2001", "name": "api-gateway-main", "cpu_percent": 20.0, "memory_mb": 1800.0},
+                    {"pid": "2002", "name": "api-gateway-worker-1", "cpu_percent": 15.0, "memory_mb": 950.0},
+                    {"pid": "2003", "name": "api-gateway-worker-2", "cpu_percent": 10.0, "memory_mb": 920.0, "status": "killed"},
+                ],
+                "config": {"max_connections": "500", "worker_memory_limit": "2048MB"},
+            },
+            {
+                "name": "user-service",
+                "status": "healthy",
+                "cpu_percent": 20.0,
+                "memory_percent": 40.0,
+                "latency_ms": 80.0,
+                "error_rate": 2.0,
+                "logs": [
+                    "[INFO] Service running normally",
+                    "[WARN] Increased timeout errors from upstream clients",
+                    "[INFO] Database connection pool: 12/50 active",
+                ],
+                "processes": [
+                    {"pid": "3001", "name": "user-service", "cpu_percent": 20.0, "memory_mb": 512.0},
+                ],
+            },
+            {
+                "name": "order-service",
+                "status": "healthy",
+                "cpu_percent": 18.0,
+                "memory_percent": 35.0,
+                "latency_ms": 60.0,
+                "error_rate": 1.0,
+                "logs": [
+                    "[INFO] Service running normally",
+                    "[INFO] Processing 45 orders/min",
+                ],
+                "processes": [
+                    {"pid": "4001", "name": "order-service", "cpu_percent": 18.0, "memory_mb": 480.0},
+                ],
+            },
+            {
+                "name": "postgres-primary",
+                "status": "healthy",
+                "cpu_percent": 30.0,
+                "memory_percent": 50.0,
+                "latency_ms": 5.0,
+                "error_rate": 0.0,
+                "logs": [
+                    "[INFO] Database healthy — 120 active connections",
+                    "[INFO] Replication lag: 0ms",
+                ],
+                "processes": [
+                    {"pid": "5001", "name": "postgres", "cpu_percent": 30.0, "memory_mb": 2048.0},
+                ],
+            },
+        ],
+        "alerts": [
+            {
+                "severity": "critical",
+                "service": "api-gateway",
+                "message": "Memory usage at 88% — OOM kills detected",
+                "timestamp": "2024-01-15T14:25:00Z",
+            },
+            {
+                "severity": "warning",
+                "service": "frontend",
+                "message": "Error rate above 10% threshold",
+                "timestamp": "2024-01-15T14:24:00Z",
+            },
+        ],
+    },
+
+    # ═══════════════════════════════════════════════════════════════
+    # MEDIUM — Cascading Database Failure
+    # Root cause is NOT obvious from description or surface alerts.
+    # Multiple services are screaming — agent must trace deps to find
+    # that postgres is the upstream cause, not payment-service.
+    # 4 red herring alerts to distract. Faster degradation.
+    # An LLM should score 0.3-0.6 here.
+    # ═══════════════════════════════════════════════════════════════
+    "medium": {
+        "task_id": "medium",
+        "difficulty": "medium",
+        "title": "Cascading Database Failure",
+        "description": (
+            "ALERT: payment-service is DOWN and multiple services are degraded. "
+            "Customers cannot complete purchases. Several alerts firing across the stack. "
+            "Investigate and restore all services."
+        ),
+        "max_steps": 20,
+        "optimal_steps": 10,
+        "degradation_rate": 0.04,
+        "root_cause": {
+            "service": "postgres-primary",
+            "issue": "connection_pool_exhaustion",
+            "keywords": ["postgres", "connection", "pool", "exhaustion", "database"],
+        },
+        "dependency_graph": {
+            "frontend": ["api-gateway"],
+            "api-gateway": ["user-service", "order-service", "payment-service"],
+            "user-service": ["postgres-primary", "cache-service"],
+            "order-service": ["postgres-primary", "cache-service"],
+            "payment-service": ["postgres-primary"],
+            "cache-service": [],
+        },
+        "services": [
+            {
+                "name": "frontend",
+                "status": "degraded",
+                "cpu_percent": 20.0,
+                "memory_percent": 30.0,
+                "latency_ms": 2000.0,
+                "error_rate": 30.0,
+                "logs": [
+                    "[ERROR] Multiple 503 Service Unavailable responses",
+                    "[ERROR] User checkout flow failing — timeouts",
+                    "[WARN] Session timeouts increasing across all endpoints",
+                    "[ERROR] Static assets loading but API calls failing",
+                ],
+                "processes": [{"pid": "1001", "name": "nginx", "cpu_percent": 10.0, "memory_mb": 128.0}],
+            },
+            {
+                "name": "api-gateway",
+                "status": "degraded",
+                "cpu_percent": 35.0,
+                "memory_percent": 50.0,
+                "latency_ms": 1500.0,
+                "error_rate": 25.0,
+                "logs": [
+                    "[ERROR] Timeout from downstream services (5000ms exceeded)",
+                    "[ERROR] payment-service returning 500 Internal Server Error",
+                    "[WARN] Circuit breaker tripped for payment-service",
+                    "[ERROR] Request backlog growing — 300 queued requests",
+                    "[WARN] Thread pool saturation at 95%",
+                ],
+                "processes": [{"pid": "2001", "name": "api-gateway", "cpu_percent": 35.0, "memory_mb": 800.0}],
+            },
+            {
+                "name": "user-service",
+                "status": "degraded",
+                "cpu_percent": 70.0,
+                "memory_percent": 60.0,
+                "latency_ms": 4500.0,
+                "error_rate": 40.0,
+                "logs": [
+                    "[ERROR] Connection acquisition timeout after 30s",
+                    "[ERROR] java.sql.SQLTransientConnectionException: HikariPool-1 - Connection is not available",
+                    "[WARN] Thread pool exhausted — 85 threads blocked on I/O",
+                    "[ERROR] Query timeout: SELECT * FROM users WHERE id = $1",
+                    "[INFO] Falling back to cache for read operations where possible",
+                    "[WARN] Cache fallback only covers 30% of queries — rest failing",
+                ],
+                "processes": [{"pid": "3001", "name": "user-service", "cpu_percent": 70.0, "memory_mb": 1200.0}],
+            },
+            {
+                "name": "order-service",
+                "status": "degraded",
+                "cpu_percent": 65.0,
+                "memory_percent": 55.0,
+                "latency_ms": 4200.0,
+                "error_rate": 35.0,
+                "logs": [
+                    "[ERROR] Connection acquisition timeout after 30s",
+                    "[ERROR] Failed to insert order record — I/O timeout",
+                    "[WARN] Order processing queue growing: 450 pending orders",
+                    "[ERROR] Deadlock detected — retrying transaction",
+                    "[WARN] Retry budget exhausted for 12 pending transactions",
+                ],
+                "processes": [{"pid": "4001", "name": "order-service", "cpu_percent": 65.0, "memory_mb": 1100.0}],
+            },
+            {
+                "name": "payment-service",
+                "status": "down",
+                "cpu_percent": 90.0,
+                "memory_percent": 80.0,
+                "latency_ms": 10000.0,
+                "error_rate": 85.0,
+                "logs": [
+                    "[CRITICAL] All downstream connections timed out",
+                    "[ERROR] Cannot process payments — backend connectivity lost",
+                    "[ERROR] Transaction rollback failed — connection dropped mid-commit",
+                    "[CRITICAL] Health check failing for 5 consecutive cycles",
+                    "[ERROR] Stripe webhook delivery failing — cannot acknowledge",
+                    "[WARN] PCI audit log write failing — compliance risk",
+                ],
+                "processes": [{"pid": "5001", "name": "payment-service", "cpu_percent": 90.0, "memory_mb": 1500.0}],
+            },
+            {
+                "name": "postgres-primary",
+                "status": "degraded",
+                "cpu_percent": 95.0,
+                "memory_percent": 85.0,
+                "latency_ms": 8000.0,
+                "error_rate": 60.0,
+                "logs": [
+                    "[WARN] High number of active connections: 495/500",
+                    "[WARN] Long-running queries detected: 23 queries running > 30s",
+                    "[ERROR] Lock contention on table 'orders' — 15 blocked transactions",
+                    "[WARN] Checkpoint taking too long — I/O bottleneck detected",
+                    "[WARN] Idle connections detected: 340 of 500 slots held by idle sessions",
+                    "[INFO] Connection sources: payment-svc (180), order-svc (150), user-svc (120), unknown (50)",
+                ],
+                "processes": [{"pid": "6001", "name": "postgres", "cpu_percent": 95.0, "memory_mb": 4096.0}],
+                "config": {"max_connections": "500", "idle_timeout": "0", "connection_limit_per_user": "200"},
+            },
+            {
+                "name": "cache-service",
+                "status": "healthy",
+                "cpu_percent": 55.0,
+                "memory_percent": 70.0,
+                "latency_ms": 2.0,
+                "error_rate": 0.0,
+                "logs": [
+                    "[INFO] Cache hit rate: 45% (normally 92%)",
+                    "[WARN] Eviction rate increased — many cache misses",
+                    "[INFO] Memory usage normal — 4.2GB / 8GB",
+                ],
+                "processes": [{"pid": "7001", "name": "redis", "cpu_percent": 55.0, "memory_mb": 4200.0}],
+            },
+        ],
+        "alerts": [
+            # The loudest alert is payment-service — but it's a SYMPTOM, not the cause
+            {
+                "severity": "critical",
+                "service": "payment-service",
+                "message": "Service DOWN — health checks failing for 5 minutes",
+                "timestamp": "2024-01-15T14:20:00Z",
+            },
+            {
+                "severity": "critical",
+                "service": "order-service",
+                "message": "Error rate at 35% — order processing stalled",
+                "timestamp": "2024-01-15T14:21:00Z",
+            },
+            {
+                "severity": "warning",
+                "service": "user-service",
+                "message": "Error rate above 30% — login failures increasing",
+                "timestamp": "2024-01-15T14:21:30Z",
+            },
+            # Red herrings — look important but are unrelated
+            {
+                "severity": "critical",
+                "service": "cache-service",
+                "message": "Memory usage at 70% — approaching eviction threshold",
+                "timestamp": "2024-01-15T14:19:00Z",
+                "is_noise": True,
+                "ttl": 3,
+            },
+            {
+                "severity": "warning",
+                "service": "cache-service",
+                "message": "Cache hit rate dropped to 45% — possible cache poisoning",
+                "timestamp": "2024-01-15T14:19:30Z",
+                "is_noise": True,
+                "ttl": 4,
+            },
+            {
+                "severity": "warning",
+                "service": "frontend",
+                "message": "SSL certificate renewal failed — expires in 12 days",
+                "timestamp": "2024-01-15T14:00:00Z",
+                "is_noise": True,
+                "ttl": 10,
+            },
+            {
+                "severity": "info",
+                "service": "api-gateway",
+                "message": "New deployment v3.2.1 rolled out 6 hours ago — monitoring",
+                "timestamp": "2024-01-15T08:00:00Z",
+                "is_noise": True,
+                "ttl": 15,
+            },
+        ],
+    },
+
+    # ═══════════════════════════════════════════════════════════════
+    # HARD — Crypto-Mining Attack + Disk Full
+    # Two CONCURRENT unrelated issues. Logs are ambiguous — no log
+    # explicitly says "crypto mining". Agent must correlate: unknown
+    # high-CPU process + suspicious outbound connection + recent
+    # unauthorized deployment. Plus a separate disk issue on another
+    # service. 5 red herring alerts. Fast degradation.
+    # An LLM should score 0.1-0.3 here.
+    # ═══════════════════════════════════════════════════════════════
+    "hard": {
+        "task_id": "hard",
+        "difficulty": "hard",
+        "title": "Crypto-Mining Attack + Disk Full",
+        "description": (
+            "ALERT: Multiple critical alerts firing. worker-service CPU at 98%. "
+            "log-aggregator disk full. order-service errors spiking. cache-service "
+            "memory warning. Several services degrading rapidly. Investigate all issues."
+        ),
+        "max_steps": 25,
+        "optimal_steps": 15,
+        "degradation_rate": 0.05,
+        "root_cause": {
+            "service": "worker-service",
+            "issue": "crypto_mining_attack",
+            "keywords": ["crypto", "mining", "malicious", "compromised", "attack", "xmrig", "unauthorized"],
+        },
+        "dependency_graph": {
+            "frontend": ["api-gateway"],
+            "api-gateway": ["user-service", "order-service", "worker-service"],
+            "user-service": ["postgres-primary"],
+            "order-service": ["postgres-primary", "worker-service"],
+            "worker-service": ["postgres-primary", "cache-service"],
+            "log-aggregator": [],
+        },
+        "services": [
+            {
+                "name": "frontend",
+                "status": "degraded",
+                "cpu_percent": 20.0,
+                "memory_percent": 30.0,
+                "latency_ms": 1500.0,
+                "error_rate": 20.0,
+                "logs": [
+                    "[ERROR] Slow responses from backend",
+                    "[WARN] Multiple user complaints filed in last 30 minutes",
+                    "[ERROR] Checkout page timing out for 20% of users",
+                ],
+                "processes": [{"pid": "1001", "name": "nginx", "cpu_percent": 10.0, "memory_mb": 128.0}],
+            },
+            {
+                "name": "api-gateway",
+                "status": "degraded",
+                "cpu_percent": 40.0,
+                "memory_percent": 45.0,
+                "latency_ms": 1200.0,
+                "error_rate": 18.0,
+                "logs": [
+                    "[ERROR] Downstream service timeouts increasing",
+                    "[WARN] order-service p99 latency at 4500ms (SLA: 500ms)",
+                    "[ERROR] worker-service health check failing intermittently",
+                    "[WARN] Request retry rate at 35%",
+                ],
+                "processes": [{"pid": "2001", "name": "api-gateway", "cpu_percent": 40.0, "memory_mb": 900.0}],
+            },
+            {
+                "name": "user-service",
+                "status": "healthy",
+                "cpu_percent": 25.0,
+                "memory_percent": 40.0,
+                "latency_ms": 100.0,
+                "error_rate": 2.0,
+                "logs": [
+                    "[INFO] Service operating within normal parameters",
+                    "[WARN] Slight latency increase on auth endpoints",
+                ],
+                "processes": [{"pid": "3001", "name": "user-service", "cpu_percent": 25.0, "memory_mb": 512.0}],
+            },
+            {
+                "name": "order-service",
+                "status": "degraded",
+                "cpu_percent": 50.0,
+                "memory_percent": 55.0,
+                "latency_ms": 2000.0,
+                "error_rate": 25.0,
+                "logs": [
+                    "[ERROR] Background job dispatch failing — worker pool unresponsive",
+                    "[WARN] Order processing queue depth: 200 (threshold: 50)",
+                    "[ERROR] Timeout processing order #45892 — worker callback never received",
+                    "[WARN] Falling back to synchronous processing — degraded throughput",
+                    "[ERROR] 12 orders stuck in PROCESSING state for > 10 minutes",
+                ],
+                "processes": [{"pid": "4001", "name": "order-service", "cpu_percent": 50.0, "memory_mb": 800.0}],
+            },
+            {
+                "name": "worker-service",
+                "status": "degraded",
+                "cpu_percent": 98.0,
+                "memory_percent": 75.0,
+                "latency_ms": 15000.0,
+                "error_rate": 70.0,
+                "disk_usage_percent": 45.0,
+                "deployment_version": "v2.1.0",
+                "previous_version": "v2.0.0",
+                "logs": [
+                    "[WARN] CPU usage sustained at 98% for 45 minutes",
+                    "[ERROR] Task processing completely stalled — 0 tasks/min (normal: 500/min)",
+                    "[WARN] Unrecognized process PID 5002 consuming 85% CPU — not in service manifest",
+                    "[ERROR] Outbound connection to unknown host 45.33.32.156:3333 — not in allowlist",
+                    "[WARN] Deployment v2.1.0 was pushed 2 hours ago via CI service account",
+                    "[INFO] No code changes in v2.1.0 diff — only Dockerfile layer modified",
+                    "[ERROR] Service account token last rotated 90 days ago (policy: 30 days)",
+                ],
+                "processes": [
+                    {"pid": "5001", "name": "worker-main", "cpu_percent": 8.0, "memory_mb": 512.0},
+                    {"pid": "5002", "name": "kworker/u8:2", "cpu_percent": 85.0, "memory_mb": 2048.0, "malicious": True},
+                    {"pid": "5003", "name": "worker-scheduler", "cpu_percent": 5.0, "memory_mb": 256.0},
+                ],
+                "network_connections": [
+                    {"src": "worker-service:48892", "dst": "45.33.32.156:3333", "state": "ESTABLISHED"},
+                    {"src": "worker-service:8080", "dst": "api-gateway:443", "state": "ESTABLISHED"},
+                    {"src": "worker-service:54210", "dst": "postgres-primary:5432", "state": "ESTABLISHED"},
+                    {"src": "worker-service:39001", "dst": "cache-service:6379", "state": "ESTABLISHED"},
+                ],
+                "config": {"task_workers": "8", "max_retries": "3"},
+            },
+            {
+                "name": "postgres-primary",
+                "status": "healthy",
+                "cpu_percent": 35.0,
+                "memory_percent": 55.0,
+                "latency_ms": 10.0,
+                "error_rate": 1.0,
+                "logs": [
+                    "[INFO] Database healthy — 80 active connections",
+                    "[WARN] Unusual query pattern from worker-service — bulk SELECT on credentials table",
+                    "[INFO] Replication lag: 2ms (within threshold)",
+                ],
+                "processes": [{"pid": "6001", "name": "postgres", "cpu_percent": 35.0, "memory_mb": 2048.0}],
+            },
+            {
+                "name": "cache-service",
+                "status": "healthy",
+                "cpu_percent": 30.0,
+                "memory_percent": 50.0,
+                "latency_ms": 3.0,
+                "error_rate": 0.0,
+                "logs": [
+                    "[INFO] Cache operating normally",
+                    "[WARN] Slight increase in connections from worker-service",
+                ],
+                "processes": [{"pid": "7001", "name": "redis", "cpu_percent": 30.0, "memory_mb": 3000.0}],
+            },
+            {
+                "name": "log-aggregator",
+                "status": "degraded",
+                "cpu_percent": 40.0,
+                "memory_percent": 50.0,
+                "latency_ms": 500.0,
+                "error_rate": 30.0,
+                "disk_usage_percent": 97.0,
+                "logs": [
+                    "[CRITICAL] Disk usage at 97% on /var/log",
+                    "[ERROR] Cannot write new log entries — ENOSPC",
+                    "[WARN] Log rotation failed — insufficient space for rotation",
+                    "[ERROR] Dropping log streams from worker-service, order-service, api-gateway",
+                    "[WARN] Last successful rotation: 3 days ago",
+                ],
+                "processes": [
+                    {"pid": "8001", "name": "fluentd", "cpu_percent": 30.0, "memory_mb": 512.0},
+                    {"pid": "8002", "name": "elasticsearch", "cpu_percent": 10.0, "memory_mb": 2048.0},
+                ],
+            },
+        ],
+        "alerts": [
+            # Real alerts — but the ROOT CAUSE isn't obvious
+            {
+                "severity": "critical",
+                "service": "worker-service",
+                "message": "CPU at 98% sustained for 45 minutes",
+                "timestamp": "2024-01-15T14:10:00Z",
+            },
+            {
+                "severity": "critical",
+                "service": "log-aggregator",
+                "message": "Disk full — /var/log at 97%",
+                "timestamp": "2024-01-15T14:12:00Z",
+            },
+            {
+                "severity": "warning",
+                "service": "order-service",
+                "message": "Error rate at 25% — SLA breach imminent",
+                "timestamp": "2024-01-15T14:15:00Z",
+            },
+            # Red herrings — look like they could be the cause
+            {
+                "severity": "critical",
+                "service": "cache-service",
+                "message": "Connection spike detected — possible cache stampede",
+                "timestamp": "2024-01-15T14:11:00Z",
+                "is_noise": True,
+                "ttl": 3,
+            },
+            {
+                "severity": "warning",
+                "service": "postgres-primary",
+                "message": "Unusual query patterns detected — possible SQL injection",
+                "timestamp": "2024-01-15T14:13:00Z",
+                "is_noise": True,
+                "ttl": 4,
+            },
+            {
+                "severity": "warning",
+                "service": "frontend",
+                "message": "CDN cache invalidation failed — stale assets being served",
+                "timestamp": "2024-01-15T14:14:00Z",
+                "is_noise": True,
+                "ttl": 5,
+            },
+            {
+                "severity": "warning",
+                "service": "api-gateway",
+                "message": "Rate limiter triggered for IP range 203.0.113.0/24 — possible DDoS",
+                "timestamp": "2024-01-15T14:16:00Z",
+                "is_noise": True,
+                "ttl": 6,
+            },
+            {
+                "severity": "info",
+                "service": "user-service",
+                "message": "Scheduled maintenance window in 2 hours — auto-scaling disabled",
+                "timestamp": "2024-01-15T14:00:00Z",
+                "is_noise": True,
+                "ttl": 20,
+            },
+        ],
+    },
+}
+
+
+def get_scenario(task_id: str) -> dict:
+    if task_id not in SCENARIOS:
+        raise ValueError(f"Unknown task_id: {task_id}. Available: {list(SCENARIOS.keys())}")
+    return SCENARIOS[task_id]
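
A minimal usage sketch, not part of the commit: it assumes the repo layout implied by the file path (so that `server.scenarios` is importable) and shows how a harness could call get_scenario and follow dependency_graph edges toward an unhealthy leaf, which is exactly the tracing the medium scenario is built to reward. The traversal helper is illustrative only.

# Illustrative sketch: get_scenario comes from the scenarios.py shown above;
# the traversal helper below is a hypothetical harness, not part of the commit.
from server.scenarios import get_scenario  # assumes server/ is on the import path


def trace_root_candidates(scenario: dict, start: str) -> list[str]:
    """Follow dependency_graph edges from `start` and return unhealthy leaves.

    A service with no further dependencies that is not "healthy" is a likely
    root cause; the medium scenario is constructed so this walk lands on
    postgres-primary rather than the loudly alerting payment-service.
    """
    graph = scenario["dependency_graph"]
    status = {svc["name"]: svc["status"] for svc in scenario["services"]}
    seen: set[str] = set()
    stack = [start]
    candidates = []
    while stack:
        name = stack.pop()
        if name in seen:
            continue
        seen.add(name)
        deps = graph.get(name, [])  # services absent from the graph are leaves
        if not deps and status.get(name) != "healthy":
            candidates.append(name)
        stack.extend(deps)
    return candidates


scenario = get_scenario("medium")
print(trace_root_candidates(scenario, "frontend"))  # ['postgres-primary']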