OGrohit Claude Opus 4.6 committed on
Commit
5884d9c
Β·
1 Parent(s): e270f30

Day 2: environment.py, log_generator.py, single_crash scenario, real endpoints

Browse files

- LogTriageEnvironment with real reset()/step()/state()
- Reward function with partial credit + penalties
- log_generator.py — realistic log synthesis with signal/noise mixing
- single_crash.py — Task 1 scenario with 8-step signal progression
- /reset, /step, /state endpoints now return real observations
- Full Task 1 episode playable end-to-end
- Grader returns varying scores (correct agent: 1.0, wrong agent: 0.0)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

.claude/settings.local.json CHANGED
@@ -3,7 +3,8 @@
3
  "allow": [
4
  "Bash(cd:*)",
5
  "Bash(pip install:*)",
6
- "Bash(curl -s http://localhost:7860/health)"
 
7
  ]
8
  }
9
  }
 
3
  "allow": [
4
  "Bash(cd:*)",
5
  "Bash(pip install:*)",
6
+ "Bash(curl -s http://localhost:7860/health)",
7
+ "Bash(python:*)"
8
  ]
9
  }
10
  }
DAY2.md ADDED
@@ -0,0 +1,963 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Day 2 β€” Execution Plan
2
+ **LogTriageEnv | Meta × PyTorch Hackathon**
3
+ **Date: March 27, 2026 | Deadline: April 7, 11:59 PM IST**
4
+
5
+ ---
6
+
7
+ ## Goal for Today
8
+ By end of Day 2 you must have:
9
+ - [ ] `server/log_generator.py` β€” synthetic log generation engine working
10
+ - [ ] `server/scenarios/single_crash.py` β€” Task 1 scenario fully defined
11
+ - [ ] `server/environment.py` β€” `LogTriageEnvironment` class with real `reset()` and `step()` logic
12
+ - [ ] `/reset` and `/step` endpoints returning **real observations** (not placeholders)
13
+ - [ ] `/state` endpoint returning real episode state
14
+ - [ ] Full Task 1 episode playable end-to-end via curl
15
+ - [ ] Git push with all Day 2 work
16
+
17
+ ---
18
+
19
+ ## What Day 2 Builds
20
+
21
+ Day 1 gave you the skeleton. Day 2 gives it a brain.
22
+
23
+ ```
24
+ server/
25
+ ├── log_generator.py ← BUILD THIS FIRST (foundation)
26
+ ├── scenarios/
27
+ │   └── single_crash.py ← BUILD THIS SECOND (Task 1 data)
28
+ └── environment.py ← BUILD THIS LAST (wires everything together)
30
+
31
+ Build in this exact order. `log_generator` feeds `single_crash`, which feeds `environment`.
32
+
33
+ ---
34
+
35
+ ## Step 1 β€” Write `server/log_generator.py`
36
+
37
+ This is the engine that generates realistic log lines for any scenario.
38
+ Open `server/log_generator.py` and paste:
39
+
40
+ ```python
41
+ """
42
+ Log generator for LogTriageEnv.
43
+ Produces realistic-looking log lines for the simulated microservice cluster.
44
+ """
45
+ from __future__ import annotations
46
+ import random
47
+ from datetime import datetime, timedelta
48
+ from server.models import LogLine, ServiceStatus
49
+
50
+ # ─── SERVICES ─────────────────────────────────────────────────────────────────
51
+
52
+ SERVICES = [
53
+ "api-gateway",
54
+ "auth-service",
55
+ "user-db",
56
+ "payment-service",
57
+ "payment-db",
58
+ "notification-service",
59
+ "email-queue",
60
+ ]
61
+
62
+ # ─── LOG TEMPLATES ────────────────────────────────────────────────────────────
63
+
64
+ # Noise logs β€” realistic but irrelevant to the incident
65
+ NOISE_TEMPLATES = {
66
+ "api-gateway": [
67
+ ("INFO", "health check passed β€” all upstream services reachable"),
68
+ ("INFO", "request completed: GET /api/v1/users/profile [200] 45ms"),
69
+ ("INFO", "rate limiter: 1240/5000 requests this minute"),
70
+ ("DEBUG", "connection pool: 12/100 active connections"),
71
+ ("INFO", "TLS certificate valid for 87 more days"),
72
+ ],
73
+ "auth-service": [
74
+ ("INFO", "JWT token issued for user_id=88142 [expires: 3600s]"),
75
+ ("INFO", "OAuth2 flow completed successfully"),
76
+ ("DEBUG", "session cache hit ratio: 94.2%"),
77
+ ("INFO", "password reset email queued for user_id=23019"),
78
+ ],
79
+ "user-db": [
80
+ ("INFO", "daily vacuum completed: 0 dead tuples removed"),
81
+ ("INFO", "checkpoint complete: wrote 142 buffers"),
82
+ ("DEBUG", "autovacuum: processing table 'sessions'"),
83
+ ("INFO", "replication lag: 12ms (within threshold)"),
84
+ ],
85
+ "payment-service": [
86
+ ("INFO", "payment processed: txn_id=TXN-8812 amount=299.00 INR [success]"),
87
+ ("INFO", "webhook delivered: stripe event=payment.succeeded"),
88
+ ("DEBUG", "idempotency key cache: 2341 keys active"),
89
+ ],
90
+ "payment-db": [
91
+ ("INFO", "connection pool: 8/50 active"),
92
+ ("DEBUG", "query plan cache: 88% hit ratio"),
93
+ ("INFO", "index usage: 99.1% queries using indexed scans"),
94
+ ],
95
+ "notification-service": [
96
+ ("INFO", "email dispatched: template=welcome_email to=user@example.com"),
97
+ ("INFO", "SMS delivered: +91XXXXXXXXXX [provider=twilio]"),
98
+ ("WARN", "email bounce rate: 1.2% (threshold: 5%)"),
99
+ ("INFO", "push notification sent: device_tokens=1240"),
100
+ ],
101
+ "email-queue": [
102
+ ("INFO", "queue depth: 42 messages pending"),
103
+ ("INFO", "consumer lag: 0.3s (healthy)"),
104
+ ("DEBUG", "partition rebalance completed in 120ms"),
105
+ ],
106
+ }
107
+
108
+ # Signal logs β€” actual incident indicators
109
+ SIGNAL_TEMPLATES = {
110
+ # Single service crash signals (Task 1 β€” payment-service crash)
111
+ "single_crash_payment": [
112
+ ("ERROR", "NullPointerException: Cannot invoke method processPayment() on null object β€” PaymentProcessor.java:142"),
113
+ ("ERROR", "HTTP 500 Internal Server Error: payment gateway returned null response"),
114
+ ("ERROR", "NullPointerException in PaymentService.execute() β€” retrying (attempt 1/3)"),
115
+ ("ERROR", "NullPointerException in PaymentService.execute() β€” retrying (attempt 2/3)"),
116
+ ("FATAL", "NullPointerException in PaymentService.execute() β€” all retries exhausted, request failed"),
117
+ ("ERROR", "health check FAILED: payment-service returned 500 (was 200)"),
118
+ ("ERROR", "circuit breaker OPEN: payment-service error rate 98.2% (threshold: 10%)"),
119
+ ],
120
+ # Cascading failure signals (Task 2 β€” user-db β†’ auth-service β†’ api-gateway)
121
+ "cascading_userdb": [
122
+ ("WARN", "slow query detected: SELECT * FROM sessions WHERE user_id=? [latency: 2847ms, threshold: 200ms]"),
123
+ ("ERROR", "slow query detected: SELECT * FROM sessions WHERE user_id=? [latency: 4120ms]"),
124
+ ("ERROR", "query timeout: SELECT * FROM active_sessions [timeout after 5000ms]"),
125
+ ],
126
+ "cascading_auth": [
127
+ ("WARN", "db connection pool: 42/50 active connections (84% utilization)"),
128
+ ("ERROR", "db connection pool exhausted: 50/50 connections in use β€” requests queuing"),
129
+ ("ERROR", "authentication request timed out waiting for db connection [5200ms]"),
130
+ ],
131
+ "cascading_gateway": [
132
+ ("ERROR", "upstream timeout: auth-service failed to respond within 5000ms [req-id: {req_id}]"),
133
+ ("ERROR", "upstream timeout: auth-service [req-id: {req_id}] β€” returning 504 to client"),
134
+ ("WARN", "error rate spike: 34.2% of requests failing (threshold: 5%)"),
135
+ ],
136
+ # Silent degradation signals (Task 3 β€” payment-db slow)
137
+ "silent_paymentdb": [
138
+ ("WARN", "query latency elevated: avg=450ms (normal: 80ms) β€” monitoring"),
139
+ ("WARN", "query latency elevated: avg=620ms β€” possible memory pressure"),
140
+ ("WARN", "query latency elevated: avg=890ms β€” recommend investigation"),
141
+ ("WARN", "query latency elevated: avg=1200ms β€” approaching timeout threshold"),
142
+ ("WARN", "buffer cache hit ratio degraded: 87% (normal: 98%) β€” possible memory issue"),
143
+ ],
144
+ }
145
+
146
+
147
def _make_timestamp(base_time: datetime, offset_seconds: int = 0) -> str:
    """Format base_time + offset_seconds as an ISO-8601-style UTC string ("...Z")."""
    stamp = base_time + timedelta(seconds=offset_seconds)
    return f"{stamp:%Y-%m-%dT%H:%M:%S}Z"
150
+
151
+
152
def _noise_log(
    service: str,
    base_time: datetime,
    offset: int,
    rng: random.Random | None = None,
) -> LogLine:
    """Build one noise (non-incident) log line for *service*.

    Args:
        service: Service name; unknown services fall back to a generic template.
        base_time: Episode start time used as the timestamp origin.
        offset: Seconds after base_time for this line's timestamp.
        rng: Optional seeded Random for reproducible template selection.
            Defaults to the module-level RNG (the previous behavior).

    Fix: template choice previously always used the unseeded module-level
    `random`, which silently broke reproducibility for seeded episodes.
    """
    templates = NOISE_TEMPLATES.get(service, [("INFO", "routine operation completed")])
    chooser = rng if rng is not None else random
    level, message = chooser.choice(templates)
    return LogLine(
        timestamp=_make_timestamp(base_time, offset),
        level=level,
        service=service,
        request_id=None,
        message=message,
        latency_ms=None,
    )
163
+
164
+
165
def generate_log_batch(
    scenario_signals: list[tuple[str, str, str]],  # [(service, level, message), ...]
    step: int,
    base_time: datetime,
    noise_ratio: float = 0.3,
    batch_size: int = 8,
    rng: random.Random | None = None,
) -> list[LogLine]:
    """
    Generate a mixed batch of signal + noise log lines.

    Args:
        scenario_signals: (service, level, message) tuples — the incident
            signals to inject at this step.
        step: Current step number; each step advances timestamps by 30s.
        base_time: Episode start time (timestamp origin).
        noise_ratio: NOTE(review): currently unused — the batch is simply
            filled with noise up to batch_size regardless of this value.
            Retained for API stability; TODO honor it or remove.
        batch_size: Total number of log lines to return.
        rng: Optional seeded Random for reproducibility.

    Returns:
        List of LogLine objects, shuffled so signals are mixed into the noise.
        If len(scenario_signals) > batch_size, the post-shuffle truncation
        drops a random subset — keep signal counts at or below batch_size.

    Fix: noise templates are now drawn from the seeded `rng` instead of the
    unseeded module-level RNG, so batches are reproducible for a given seed.
    Also fixed the `rng` annotation (was `random.Random = None`).
    """
    if rng is None:
        rng = random.Random()

    logs: list[LogLine] = []
    base_offset = step * 30  # 30 simulated seconds per step

    # Signal logs: errors/warnings get a synthetic request id; messages that
    # mention timeouts/latency get a plausible latency figure.
    for i, (service, level, message) in enumerate(scenario_signals):
        req_id = f"req-{rng.randint(1000, 9999)}" if level in ("ERROR", "WARN") else None
        lowered = message.lower()
        logs.append(LogLine(
            timestamp=_make_timestamp(base_time, base_offset + i),
            level=level,
            service=service,
            request_id=req_id,
            latency_ms=rng.randint(200, 5000) if "timeout" in lowered or "latency" in lowered else None,
            message=message,
        ))

    # Fill remaining slots with noise, drawn entirely from the seeded rng.
    noise_count = max(0, batch_size - len(logs))
    for i, svc in enumerate(rng.choices(SERVICES, k=noise_count)):
        templates = NOISE_TEMPLATES.get(svc, [("INFO", "routine operation completed")])
        level, message = rng.choice(templates)
        logs.append(LogLine(
            timestamp=_make_timestamp(base_time, base_offset + len(scenario_signals) + i),
            level=level,
            service=svc,
            request_id=None,
            message=message,
            latency_ms=None,
        ))

    # Shuffle — signal should not always be first.
    rng.shuffle(logs)
    return logs[:batch_size]
214
+
215
+
216
def generate_healthy_system_state(base_time: datetime) -> dict[str, ServiceStatus]:
    """Snapshot of the cluster with every service reporting healthy ("up")."""
    stamp = _make_timestamp(base_time)
    snapshot: dict[str, ServiceStatus] = {}
    for svc in SERVICES:
        snapshot[svc] = ServiceStatus(
            name=svc,
            status="up",
            error_rate=round(random.uniform(0.001, 0.01), 4),
            latency_p99_ms=random.randint(20, 80),
            last_updated=stamp,
        )
    return snapshot
229
+ ```
230
+
231
+ ---
232
+
233
+ ## Step 2 β€” Write `server/scenarios/single_crash.py`
234
+
235
+ This defines Task 1: the payment-service crash scenario.
236
+ Open `server/scenarios/single_crash.py` and paste:
237
+
238
+ ```python
239
+ """
240
+ Task 1 β€” Single Service Crash (Easy)
241
+
242
+ Scenario: payment-service crashes with NullPointerException on every request.
243
+ All other services are healthy. Logs are mostly unambiguous.
244
+ Noise ratio: ~20%.
245
+
246
+ Ground truth:
247
+ - severity: P1
248
+ - root_cause: payment-service
249
+ - remediation: restart:payment-service
250
+ - correct_team: backend-team
251
+ """
252
+ from __future__ import annotations
253
+ import random
254
+ from datetime import datetime
255
+ from server.models import LogLine, ServiceStatus
256
+ from server.log_generator import (
257
+ generate_log_batch,
258
+ generate_healthy_system_state,
259
+ SIGNAL_TEMPLATES,
260
+ _make_timestamp,
261
+ )
262
+
263
+ # ─── GROUND TRUTH ─────────────────────────────────────────────────────────────
264
+
265
+ GROUND_TRUTH = {
266
+ "severity": "P1",
267
+ "root_cause": "payment-service",
268
+ "remediation_prefixes": {"restart"}, # restart:payment-service is correct
269
+ "remediation_service": "payment-service",
270
+ "correct_teams": {"backend-team", "sre-team"},
271
+ "max_steps": 8,
272
+ "noise_ratio": 0.20,
273
+ }
274
+
275
+ # ─── STEP-BY-STEP SIGNAL PLAN ─────────────────────────────────────────────────
276
+ # Each list = signals injected at that step index.
277
+ # Step 0 = after reset (first observation), Step 7 = last possible step.
278
+
279
+ STEP_SIGNALS = [
280
+ # Step 0: first signs β€” circuit breaker opens, error rate spike
281
+ [
282
+ ("payment-service", "ERROR", "NullPointerException: Cannot invoke processPayment() on null β€” PaymentProcessor.java:142"),
283
+ ("api-gateway", "WARN", "error rate spike: 28.4% of /payment requests failing"),
284
+ ],
285
+ # Step 1: escalating β€” more errors, health check fails
286
+ [
287
+ ("payment-service", "FATAL", "NullPointerException in PaymentService.execute() β€” all retries (3/3) exhausted"),
288
+ ("payment-service", "ERROR", "health check FAILED: payment-service returned HTTP 500"),
289
+ ],
290
+ # Step 2: circuit breaker fully open
291
+ [
292
+ ("api-gateway", "ERROR", "circuit breaker OPEN: payment-service error rate 98.2% (threshold: 10%)"),
293
+ ("payment-service", "ERROR", "NullPointerException: Cannot invoke processPayment() on null β€” PaymentProcessor.java:142"),
294
+ ],
295
+ # Step 3+: same signals repeat β€” incident ongoing until agent acts
296
+ [
297
+ ("payment-service", "ERROR", "NullPointerException in PaymentService.execute() β€” retrying (1/3)"),
298
+ ("api-gateway", "ERROR", "upstream failure: payment-service unavailable [circuit breaker: OPEN]"),
299
+ ],
300
+ [
301
+ ("payment-service", "FATAL", "payment-service health check FAILED for 90s β€” marking as DOWN"),
302
+ ("api-gateway", "WARN", "payment endpoint degraded β€” all requests returning 503"),
303
+ ],
304
+ [
305
+ ("payment-service", "ERROR", "NullPointerException: Cannot invoke processPayment() on null β€” PaymentProcessor.java:142"),
306
+ ("api-gateway", "ERROR", "error rate: 99.1% on /payment/* routes"),
307
+ ],
308
+ [
309
+ ("payment-service", "FATAL", "NullPointerException β€” service unresponsive for 180s"),
310
+ ("api-gateway", "ERROR", "SLA breach: payment service uptime < 99.9%"),
311
+ ],
312
+ [
313
+ ("payment-service", "FATAL", "CRITICAL: payment-service has been DOWN for 210s β€” immediate action required"),
314
+ ("api-gateway", "ERROR", "all payment transactions failing β€” revenue impact ongoing"),
315
+ ],
316
+ ]
317
+
318
+
319
def get_system_state(step: int, base_time: datetime) -> dict[str, ServiceStatus]:
    """System snapshot for this step: payment-service is down, the rest healthy."""
    state = generate_healthy_system_state(base_time)
    # Overwrite the crashed service; its timestamp advances 30 simulated
    # seconds per step, mirroring the log generator's clock.
    state["payment-service"] = ServiceStatus(
        name="payment-service",
        status="down",
        error_rate=0.982,
        latency_p99_ms=5000,
        last_updated=_make_timestamp(base_time, step * 30),
    )
    return state
333
+
334
+
335
def get_step_data(step: int, base_time: datetime, rng: random.Random) -> tuple[list[LogLine], dict[str, ServiceStatus]]:
    """
    Return (logs, system_state) for the given step.

    The signal plan is clamped at its last entry, so once the scripted
    progression runs out the loudest signals keep repeating until the
    agent acts.
    """
    signals = STEP_SIGNALS[min(step, len(STEP_SIGNALS) - 1)]
    batch = generate_log_batch(
        scenario_signals=signals,
        step=step,
        base_time=base_time,
        noise_ratio=GROUND_TRUTH["noise_ratio"],
        batch_size=8,
        rng=rng,
    )
    return batch, get_system_state(step, base_time)
353
+
354
+
355
def get_active_alerts(step: int) -> list[str]:
    """Alert banners visible at this step; extra alerts fire as the incident drags on."""
    alerts = [
        "payment-service: circuit breaker OPEN",
        "payment-service: health check FAILING",
    ]
    escalation_schedule = (
        (2, "SLA_BREACH: payment availability < 99.9%"),
        (5, "CRITICAL: payment-service DOWN > 150s"),
    )
    for threshold, alert in escalation_schedule:
        if step >= threshold:
            alerts.append(alert)
    return alerts
363
+ ```
364
+
365
+ ---
366
+
367
+ ## Step 3 β€” Write `server/environment.py`
368
+
369
+ This is the core class. It wires log_generator + scenarios into a proper OpenEnv environment.
370
+ Open `server/environment.py` and paste:
371
+
372
+ ```python
373
+ """
374
+ Core LogTriageEnvironment class.
375
+ Implements OpenEnv interface: reset(), step(), state property.
376
+ """
377
+ from __future__ import annotations
378
+ import random
379
+ from datetime import datetime
380
+ from uuid import uuid4
381
+
382
+ from server.models import (
383
+ TriageAction,
384
+ TriageObservation,
385
+ EpisodeState,
386
+ LogLine,
387
+ ServiceStatus,
388
+ )
389
+ from server.scenarios import single_crash
390
+ from server.log_generator import generate_healthy_system_state, _make_timestamp
391
+
392
+ # ─── TASK REGISTRY ─────────────────────────────────────────────────────────────
393
+
394
+ TASK_MAX_STEPS = {
395
+ "single_crash": 8,
396
+ "cascading_failure": 12,
397
+ "silent_degradation": 15,
398
+ }
399
+
400
+ # ─── REWARD CONSTANTS ──────────────────────────────────────────────────────────
401
+
402
+ R_CORRECT_SEVERITY = 0.30
403
+ R_CORRECT_ROOT_CAUSE = 0.35
404
+ R_CORRECT_REMEDIATION = 0.25
405
+ R_CORRECT_ESCALATION = 0.10
406
+ R_SPEED_BONUS = 0.10
407
+ R_PARTIAL_SERVICE_FAM = 0.10
408
+ R_PARTIAL_SEVERITY_ADJ = 0.10
409
+
410
+ P_WRONG_ESCALATION = -0.10
411
+ P_IGNORE_P1 = -0.50
412
+ P_REDUNDANT_ACTION = -0.05
413
+ P_EXCEEDED_BUDGET = -0.20
414
+ P_OVERESCALATE_P3_P1 = -0.15
415
+
416
+
417
class LogTriageEnvironment:
    """
    OpenEnv-compatible environment for SRE incident triage.

    Usage:
        env = LogTriageEnvironment()
        obs = env.reset(task_id="single_crash", seed=42)
        while not obs.done:
            action = agent.act(obs)
            obs = env.step(action)
        score = env.get_grader_score()
    """

    def __init__(self):
        # Episode bookkeeping; None until reset() is called.
        self._state: EpisodeState | None = None
        # Re-seeded in reset() so log batches are reproducible per seed.
        self._rng: random.Random = random.Random()
        # Simulated incident start time; observation timestamps offset from it.
        self._base_time: datetime = datetime.utcnow()
        self._task_id: str = "single_crash"
        # Grading key for the active scenario (severity, root cause, ...).
        self._ground_truth: dict = {}
        self._current_obs: TriageObservation | None = None

    # ─── OPENENV INTERFACE ─────────────────────────────────────────────────

    def reset(self, task_id: str = "single_crash", seed: int | None = None) -> TriageObservation:
        """Start a fresh episode. Returns the initial observation.

        Args:
            task_id: One of the keys of TASK_MAX_STEPS.
            seed: Optional RNG seed for reproducible log generation.

        Raises:
            ValueError: If `task_id` is not a known task.
        """
        if task_id not in TASK_MAX_STEPS:
            raise ValueError(f"Unknown task_id '{task_id}'. Valid: {list(TASK_MAX_STEPS.keys())}")

        self._task_id = task_id
        self._rng = random.Random(seed)
        self._base_time = datetime.utcnow()

        # Load ground truth for this task.
        if task_id == "single_crash":
            self._ground_truth = single_crash.GROUND_TRUTH
        else:
            # Tasks 2 & 3 will be wired in Day 3.
            self._ground_truth = {}

        # Initialize episode state.
        self._state = EpisodeState(
            episode_id=str(uuid4()),
            task_id=task_id,
            step_count=0,
            max_steps=TASK_MAX_STEPS[task_id],
            done=False,
            cumulative_score=0.0,
            actions_taken=[],
            correct_severity=None,
            correct_root_cause=None,
            correct_remediation=False,
        )

        # Initial observation (step 0).
        logs, system_state = self._get_step_data(0)
        alerts = self._get_alerts(0)

        obs = TriageObservation(
            logs=logs,
            system_state=system_state,
            incident_id=self._state.episode_id,
            task_id=task_id,
            step_count=0,
            time_elapsed_seconds=0,
            active_alerts=alerts,
            reward=0.0,
            cumulative_score=0.0,
            done=False,
            last_action_feedback="Incident detected. Analyze the logs and take action.",
            invalid_action_error=None,
        )
        self._current_obs = obs
        return obs

    def step(self, action: TriageAction) -> TriageObservation:
        """Take one action. Returns the next observation + reward.

        Raises:
            RuntimeError: If called before reset() or after the episode is done.
        """
        if self._state is None:
            raise RuntimeError("Call reset() before step()")
        if self._state.done:
            raise RuntimeError("Episode is done. Call reset() to start a new episode.")

        # Invalid actions earn no reward and do not consume a step.
        valid, err = action.is_valid()
        if not valid:
            return self._make_obs(
                reward=0.0,
                feedback=f"Invalid action: {err}",
                invalid_action_error=err,
                advance_step=False,
            )

        # Calculate reward for this action.
        reward, feedback = self._evaluate_action(action)

        # Update state.
        self._state.cumulative_score = round(
            self._state.cumulative_score + reward, 4
        )
        self._state.actions_taken.append(action.action_type)
        self._state.step_count += 1

        # Check if episode should end.
        done = self._check_done(action)
        self._state.done = done

        # BUGFIX: the previous guard (`step_count >= max_steps and not done`)
        # could never be true, because _check_done() already returns True once
        # the budget is exhausted — so P_EXCEEDED_BUDGET was dead code. Apply
        # the penalty when the episode ended *because* the budget ran out,
        # not through the agent's own resolve / catastrophic ignore.
        budget_exhausted = self._state.step_count >= self._state.max_steps
        ended_by_action = action.action_type == "resolve" or (
            action.action_type == "ignore"
            and self._ground_truth.get("severity") == "P1"
        )
        if done and budget_exhausted and not ended_by_action:
            self._state.cumulative_score = round(
                self._state.cumulative_score + P_EXCEEDED_BUDGET, 4
            )
            feedback += f" Step budget exceeded ({self._state.max_steps} steps). Penalty applied."

        return self._make_obs(reward=reward, feedback=feedback, advance_step=True)

    @property
    def state(self) -> EpisodeState:
        """Return current episode state.

        Raises:
            RuntimeError: If reset() has not been called yet.
        """
        if self._state is None:
            raise RuntimeError("Call reset() first.")
        return self._state

    def get_grader_score(self) -> float:
        """
        Return final grader score for the completed episode.
        Score is normalized (clamped) to [0.0, 1.0].
        """
        if self._state is None:
            return 0.0
        raw = self._state.cumulative_score
        return round(max(0.0, min(1.0, raw)), 4)

    # ─── INTERNAL HELPERS ──────────────────────────────────────────────────

    def _evaluate_action(self, action: TriageAction) -> tuple[float, str]:
        """
        Evaluate the action against ground truth.
        Returns (reward: float, feedback: str).
        """
        gt = self._ground_truth
        reward = 0.0
        feedback_parts = []

        # Repeating an action type is mildly penalized.
        if action.action_type in self._state.actions_taken:
            reward += P_REDUNDANT_ACTION
            feedback_parts.append("Redundant action — you've already done this.")

        # ── classify_severity ──────────────────────────────────────────────
        if action.action_type == "classify_severity":
            correct_sev = gt.get("severity", "")
            if action.value == correct_sev:
                if self._state.correct_severity is None:  # only reward first time
                    reward += R_CORRECT_SEVERITY
                    feedback_parts.append(f"Correct severity: {action.value}. +{R_CORRECT_SEVERITY}")
                    self._state.correct_severity = action.value
            else:
                # Partial credit: P1 vs P2 is close, P1 vs P3 is not.
                if correct_sev == "P1" and action.value == "P3":
                    reward += P_OVERESCALATE_P3_P1  # wrong direction
                    feedback_parts.append(f"Incorrect severity: {action.value}. P1 expected. This is a customer-impacting incident.")
                elif correct_sev == "P1" and action.value == "P2":
                    reward += R_PARTIAL_SEVERITY_ADJ
                    feedback_parts.append(f"Close — {action.value} given, P1 expected. Partial credit.")
                else:
                    feedback_parts.append(f"Incorrect severity: {action.value}. Reassess.")

        # ── identify_root_cause ────────────────────────────────────────────
        elif action.action_type == "identify_root_cause":
            correct_rc = gt.get("root_cause", "")
            if action.value == correct_rc:
                if self._state.correct_root_cause is None:
                    reward += R_CORRECT_ROOT_CAUSE
                    feedback_parts.append(f"Correct root cause: {action.value}. +{R_CORRECT_ROOT_CAUSE}")
                    self._state.correct_root_cause = action.value
            else:
                # Partial credit: same service family (e.g. payment-db vs payment-service).
                if correct_rc.split("-")[0] == action.value.split("-")[0]:
                    reward += R_PARTIAL_SERVICE_FAM
                    feedback_parts.append(f"Close — {action.value} is in the right service family. Check more carefully.")
                else:
                    feedback_parts.append(f"Incorrect root cause: {action.value}. Look at which service is actually failing.")

        # ── escalate ───────────────────────────────────────────────────────
        elif action.action_type == "escalate":
            correct_teams = gt.get("correct_teams", set())
            if action.value in correct_teams:
                reward += R_CORRECT_ESCALATION
                feedback_parts.append(f"Correct escalation to {action.value}. +{R_CORRECT_ESCALATION}")
            else:
                reward += P_WRONG_ESCALATION
                feedback_parts.append(f"Wrong team escalated: {action.value}. Penalty applied.")

        # ── remediate ──────────────────────────────────────────────────────
        elif action.action_type == "remediate":
            prefix = action.value.split(":")[0]
            service = action.value.split(":")[1] if ":" in action.value else ""
            correct_prefixes = gt.get("remediation_prefixes", set())
            correct_service = gt.get("remediation_service", "")

            if prefix in correct_prefixes and service == correct_service:
                if not self._state.correct_remediation:
                    reward += R_CORRECT_REMEDIATION
                    feedback_parts.append(f"Correct remediation: {action.value}. +{R_CORRECT_REMEDIATION}")
                    self._state.correct_remediation = True
            elif service == correct_service and prefix not in correct_prefixes:
                reward += 0.05  # right service, wrong action
                feedback_parts.append(f"Right service, but '{prefix}' may not fix this. Try another remediation type.")
            else:
                feedback_parts.append(f"Incorrect remediation: {action.value}. Reconsider which service needs fixing.")

        # ── ignore ─────────────────────────────────────────────────────────
        elif action.action_type == "ignore":
            correct_sev = gt.get("severity", "")
            if correct_sev == "P1":
                reward += P_IGNORE_P1
                feedback_parts.append("CRITICAL ERROR: Ignored a P1 incident! Major penalty applied.")
            else:
                feedback_parts.append("Marked as noise.")

        # ── request_more_logs ──────────────────────────────────────────────
        elif action.action_type == "request_more_logs":
            feedback_parts.append(f"Fetching more logs for {action.value}...")

        # ── resolve ────────────────────────────────────────────────────────
        elif action.action_type == "resolve":
            # Speed bonus if resolved within 60% of the step budget.
            step_budget = self._state.max_steps
            if self._state.step_count <= int(step_budget * 0.6):
                reward += R_SPEED_BONUS
                feedback_parts.append(f"Incident resolved efficiently. Speed bonus: +{R_SPEED_BONUS}")
            else:
                feedback_parts.append("Incident resolved.")

        return round(reward, 4), " | ".join(feedback_parts) or "Action processed."

    def _check_done(self, action: TriageAction) -> bool:
        """Episode ends on resolve, on ignoring a P1, or when the step budget is exhausted."""
        if action.action_type == "resolve":
            return True
        if action.action_type == "ignore" and self._ground_truth.get("severity") == "P1":
            return True  # catastrophic — episode ends immediately
        if self._state.step_count >= self._state.max_steps:
            return True
        return False

    def _get_step_data(self, step: int):
        """Logs and system state for `step` of the active scenario."""
        if self._task_id == "single_crash":
            return single_crash.get_step_data(step, self._base_time, self._rng)
        # Tasks 2 & 3 wired in Day 3.
        return [], generate_healthy_system_state(self._base_time)

    def _get_alerts(self, step: int) -> list[str]:
        """Active alerts for `step` of the active scenario."""
        if self._task_id == "single_crash":
            return single_crash.get_active_alerts(step)
        return []

    def _make_obs(
        self,
        reward: float,
        feedback: str,
        invalid_action_error: str | None = None,
        advance_step: bool = True,
    ) -> TriageObservation:
        """Build a TriageObservation for the current state.

        NOTE: `advance_step` is informational only — step advancement happens
        in step() before this is called; invalid actions pass False and the
        unchanged step_count is simply re-observed.
        """
        step = self._state.step_count
        logs, system_state = self._get_step_data(step)
        alerts = self._get_alerts(step)

        return TriageObservation(
            logs=logs,
            system_state=system_state,
            incident_id=self._state.episode_id,
            task_id=self._state.task_id,
            step_count=step,
            time_elapsed_seconds=step * 30,
            active_alerts=alerts,
            reward=reward,
            cumulative_score=self._state.cumulative_score,
            done=self._state.done,
            last_action_feedback=feedback,
            invalid_action_error=invalid_action_error,
        )
703
+ ```
704
+
705
+ ---
706
+
707
+ ## Step 4 β€” Wire `app.py` Endpoints
708
+
709
+ Now replace the placeholder `/reset`, `/step`, and `/state` endpoints in `server/app.py`.
710
+
711
+ **Replace the entire file** with this:
712
+
713
+ ```python
714
+ from fastapi import FastAPI, Query
715
+ from fastapi.responses import JSONResponse
716
+ import uvicorn
717
+
718
+ from server.models import TriageAction
719
+ from server.environment import LogTriageEnvironment
720
+
721
+ app = FastAPI(
722
+ title="LogTriageEnv",
723
+ description="OpenEnv environment for SRE incident triage",
724
+ version="1.0.0",
725
+ )
726
+
727
+ # One environment instance per server process
728
+ # (In production / HF Spaces, each request could get its own instance)
729
+ env = LogTriageEnvironment()
730
+
731
+
732
@app.get("/health")
def health():
    """Liveness probe for deploy/monitoring checks."""
    info = {"status": "ok", "environment": "logtriage-env", "version": "1.0.0"}
    return info
735
+
736
+
737
@app.post("/reset")
def reset(
    task: str = Query(default="single_crash", description="Task ID to run"),
    # BUGFIX: was `seed: int = Query(default=None)` — annotation/default
    # mismatch; FastAPI (pydantic v2) rejects None for a non-optional int.
    seed: int | None = Query(default=None, description="Random seed for reproducibility"),
):
    """Start a new episode. Returns the initial observation, or 400 for an unknown task."""
    try:
        obs = env.reset(task_id=task, seed=seed)
        return obs.model_dump()
    except ValueError as e:
        return JSONResponse(status_code=400, content={"error": str(e)})
747
+
748
+
749
@app.post("/step")
def step(action: TriageAction):
    """Apply one triage action to the running episode."""
    is_ok, problem = action.is_valid()
    if not is_ok:
        # Schema-level validation failure — reject before touching the env.
        return JSONResponse(status_code=422, content={"error": problem})
    try:
        return env.step(action).model_dump()
    except RuntimeError as e:
        # No episode in progress, or the episode already finished.
        return JSONResponse(status_code=400, content={"error": str(e)})
759
+
760
+
761
+ @app.get("/state")
762
+ def state():
763
+ try:
764
+ return env.state.model_dump()
765
+ except RuntimeError as e:
766
+ return JSONResponse(status_code=400, content={"error": str(e)})
767
+
768
+
769
+ @app.get("/tasks")
770
+ def get_tasks():
771
+ return {
772
+ "tasks": [
773
+ {
774
+ "id": "single_crash",
775
+ "name": "Single Service Crash",
776
+ "difficulty": "easy",
777
+ "max_steps": 8,
778
+ "description": "One service crashes. Classify severity, find root cause, remediate.",
779
+ "action_schema": {
780
+ "action_type": "classify_severity | identify_root_cause | escalate | remediate | request_more_logs | resolve | ignore",
781
+ "value": "string (depends on action_type β€” see README)",
782
+ "confidence": "float [0.0, 1.0]",
783
+ "reasoning": "string (optional)",
784
+ },
785
+ },
786
+ {
787
+ "id": "cascading_failure",
788
+ "name": "Cascading Failure",
789
+ "difficulty": "medium",
790
+ "max_steps": 12,
791
+ "description": "DB slowdown cascades upstream. Find the true root cause, not symptoms.",
792
+ "action_schema": {
793
+ "action_type": "classify_severity | identify_root_cause | escalate | remediate | request_more_logs | resolve | ignore",
794
+ "value": "string (depends on action_type β€” see README)",
795
+ "confidence": "float [0.0, 1.0]",
796
+ "reasoning": "string (optional)",
797
+ },
798
+ },
799
+ {
800
+ "id": "silent_degradation",
801
+ "name": "Silent Degradation with Noise",
802
+ "difficulty": "hard",
803
+ "max_steps": 15,
804
+ "description": "Slow degradation hidden in 60% noise. Nuanced P2 severity judgment.",
805
+ "action_schema": {
806
+ "action_type": "classify_severity | identify_root_cause | escalate | remediate | request_more_logs | resolve | ignore",
807
+ "value": "string (depends on action_type β€” see README)",
808
+ "confidence": "float [0.0, 1.0]",
809
+ "reasoning": "string (optional)",
810
+ },
811
+ },
812
+ ]
813
+ }
814
+
815
+
816
+ @app.post("/grader")
817
+ def grader():
818
+ score = env.get_grader_score()
819
+ return {
820
+ "score": score,
821
+ "episode_id": env.state.episode_id if env._state else None,
822
+ "task_id": env._task_id,
823
+ "steps_taken": env.state.step_count if env._state else 0,
824
+ }
825
+
826
+
827
+ @app.post("/baseline")
828
+ def baseline():
829
+ # TODO Day 5: wire to baseline.py
830
+ return {"message": "baseline endpoint β€” to be wired on Day 5"}
831
+
832
+
833
+ if __name__ == "__main__":
834
+ uvicorn.run("server.app:app", host="0.0.0.0", port=7860, reload=True)
835
+ ```
836
+
837
+ ---
838
+
839
+ ## Step 5 β€” Test Full Episode End-to-End
840
+
841
+ ### 5a. Start the server
842
+
843
+ ```bash
844
+ cd C:\Users\Rohit\Desktop\logtriage-env
845
+ python -m uvicorn server.app:app --host 0.0.0.0 --port 7860 --reload
846
+ ```
847
+
848
+ ### 5b. Play a full Task 1 episode (open second terminal)
849
+
850
+ Run these curl commands **in order** — this simulates a correct agent solving Task 1:
851
+
852
+ ```bash
853
+ # 1. Start episode
854
+ curl -X POST "http://localhost:7860/reset?task=single_crash&seed=42"
855
+
856
+ # 2. Classify severity correctly
857
+ curl -X POST http://localhost:7860/step ^
858
+ -H "Content-Type: application/json" ^
859
+ -d "{\"action_type\": \"classify_severity\", \"value\": \"P1\", \"confidence\": 0.95, \"reasoning\": \"error rate spike and circuit breaker open\"}"
860
+
861
+ # 3. Identify root cause correctly
862
+ curl -X POST http://localhost:7860/step ^
863
+ -H "Content-Type: application/json" ^
864
+ -d "{\"action_type\": \"identify_root_cause\", \"value\": \"payment-service\", \"confidence\": 0.9, \"reasoning\": \"NullPointerException in payment-service logs\"}"
865
+
866
+ # 4. Apply correct remediation
867
+ curl -X POST http://localhost:7860/step ^
868
+ -H "Content-Type: application/json" ^
869
+ -d "{\"action_type\": \"remediate\", \"value\": \"restart:payment-service\", \"confidence\": 0.85, \"reasoning\": \"NPE likely from bad deploy, restart clears it\"}"
870
+
871
+ # 5. Resolve the incident
872
+ curl -X POST http://localhost:7860/step ^
873
+ -H "Content-Type: application/json" ^
874
+ -d "{\"action_type\": \"resolve\", \"value\": \"resolved\", \"confidence\": 1.0, \"reasoning\": \"payment-service restarted and healthy\"}"
875
+
876
+ # 6. Check final grader score β€” should be ~0.9+
877
+ curl -X POST http://localhost:7860/grader
878
+
879
+ # 7. Check episode state
880
+ curl http://localhost:7860/state
881
+ ```
882
+
883
+ **Expected final score:** 0.90–1.00
884
+ - classify_severity P1 correct: +0.30
885
+ - identify_root_cause payment-service correct: +0.35
886
+ - remediate restart:payment-service correct: +0.25
887
+ - resolve within 4 steps (well under 8): +0.10 speed bonus
888
+ - **Total: 1.00**
889
+
890
+ ### 5c. Test a WRONG agent (should score lower)
891
+
892
+ ```bash
893
+ # Reset fresh
894
+ curl -X POST "http://localhost:7860/reset?task=single_crash&seed=42"
895
+
896
+ # Wrong severity
897
+ curl -X POST http://localhost:7860/step ^
898
+ -H "Content-Type: application/json" ^
899
+ -d "{\"action_type\": \"classify_severity\", \"value\": \"P3\", \"confidence\": 0.5, \"reasoning\": \"seems minor\"}"
900
+
901
+ # Wrong root cause
902
+ curl -X POST http://localhost:7860/step ^
903
+ -H "Content-Type: application/json" ^
904
+ -d "{\"action_type\": \"identify_root_cause\", \"value\": \"api-gateway\", \"confidence\": 0.5, \"reasoning\": \"gateway errors visible\"}"
905
+
906
+ # Check score β€” should be much lower (or negative)
907
+ curl -X POST http://localhost:7860/grader
908
+ ```
909
+
910
+ **This proves graders return VARYING scores — critical for disqualification avoidance.**
911
+
912
+ ---
913
+
914
+ ## Step 6 β€” Git Push
915
+
916
+ ```bash
917
+ cd C:\Users\Rohit\Desktop\logtriage-env
918
+ git add .
919
+ git commit -m "Day 2: environment.py, log_generator.py, single_crash scenario, real endpoints
920
+
921
+ - LogTriageEnvironment with real reset()/step()/state()
922
+ - Reward function with partial credit + penalties
923
+ - log_generator.py β€” realistic log synthesis with signal/noise mixing
924
+ - single_crash.py β€” Task 1 scenario with 8-step signal progression
925
+ - /reset, /step, /state endpoints now return real observations
926
+ - Full Task 1 episode playable end-to-end
927
+ - Grader returns varying scores (proven with correct vs wrong agent)"
928
+
929
+ git push origin main
930
+ ```
931
+
932
+ ---
933
+
934
+ ## Day 2 Done Checklist
935
+
936
+ - [ ] `server/log_generator.py` created β€” `generate_log_batch()` returns `list[LogLine]`
937
+ - [ ] `server/scenarios/single_crash.py` created β€” `GROUND_TRUTH`, `STEP_SIGNALS`, `get_step_data()`, `get_active_alerts()` all defined
938
+ - [ ] `server/environment.py` created β€” `LogTriageEnvironment` with `reset()`, `step()`, `state` property, `get_grader_score()`
939
+ - [ ] `server/app.py` updated β€” `/reset`, `/step`, `/state` return real data
940
+ - [ ] `uvicorn server.app:app` starts without errors
941
+ - [ ] `POST /reset?task=single_crash` returns real logs + system state (not placeholder text)
942
+ - [ ] `POST /step` with correct actions returns positive rewards
943
+ - [ ] `POST /step` with wrong actions returns negative/zero rewards
944
+ - [ ] `POST /grader` returns a score that varies between correct and wrong agents
945
+ - [ ] `GET /state` returns real episode state (step count, cumulative score, actions taken)
946
+ - [ ] Full correct episode scores 0.90+ on Task 1
947
+ - [ ] Full wrong episode scores differently (proves score variance)
948
+ - [ ] Git pushed
949
+
950
+ ---
951
+
952
+ ## What NOT to do today
953
+
954
+ - Do NOT start Tasks 2 or 3 scenarios (that is Day 3)
955
+ - Do NOT start grader files in `server/graders/` (that is Day 4)
956
+ - Do NOT touch HF Spaces or Docker beyond making sure it still builds
957
+ - Do NOT add complexity to reward function β€” the one above is final
958
+
959
+ ---
960
+
961
+ ## Tomorrow (Day 3 Preview)
962
+
963
+ You will write `server/scenarios/cascading.py` (Task 2) and `server/scenarios/silent_degrade.py` (Task 3), wire them into `environment.py`, and verify all 3 tasks produce real observations with the reward function working correctly across all scenarios.
FILE_INVENTORY.md CHANGED
@@ -1,4 +1,4 @@
1
- # LogTriageEnv β€” Complete File Inventory
2
 
3
  ## πŸ“‚ Project Root Files
4
 
 
1
+ ~~# LogTriageEnv β€” Complete File Inventory
2
 
3
  ## πŸ“‚ Project Root Files
4
 
server/app.py CHANGED
@@ -1,8 +1,9 @@
1
- from fastapi import FastAPI
2
  from fastapi.responses import JSONResponse
3
  import uvicorn
4
 
5
- from server.models import TriageAction, TriageObservation, EpisodeState
 
6
 
7
  app = FastAPI(
8
  title="LogTriageEnv",
@@ -10,6 +11,9 @@ app = FastAPI(
10
  version="1.0.0",
11
  )
12
 
 
 
 
13
 
14
  @app.get("/health")
15
  def health():
@@ -17,24 +21,35 @@ def health():
17
 
18
 
19
  @app.post("/reset")
20
- def reset(task: str = "single_crash", seed: int = None):
21
- # TODO Day 2: wire to LogTriageEnvironment
22
- return {"message": "reset endpoint placeholder", "task": task}
 
 
 
 
 
 
23
 
24
 
25
  @app.post("/step")
26
  def step(action: TriageAction):
27
- # TODO Day 2: wire to LogTriageEnvironment
28
  valid, err = action.is_valid()
29
  if not valid:
30
  return JSONResponse(status_code=422, content={"error": err})
31
- return {"message": "step endpoint placeholder", "action_received": action.model_dump()}
 
 
 
 
32
 
33
 
34
  @app.get("/state")
35
  def state():
36
- # TODO Day 2: wire to LogTriageEnvironment
37
- return {"message": "state endpoint placeholder"}
 
 
38
 
39
 
40
  @app.get("/tasks")
@@ -49,7 +64,7 @@ def get_tasks():
49
  "description": "One service crashes. Classify severity, find root cause, remediate.",
50
  "action_schema": {
51
  "action_type": "classify_severity | identify_root_cause | escalate | remediate | request_more_logs | resolve | ignore",
52
- "value": "string (depends on action_type)",
53
  "confidence": "float [0.0, 1.0]",
54
  "reasoning": "string (optional)",
55
  },
@@ -59,10 +74,10 @@ def get_tasks():
59
  "name": "Cascading Failure",
60
  "difficulty": "medium",
61
  "max_steps": 12,
62
- "description": "DB slowdown cascades upstream. Find the true root cause.",
63
  "action_schema": {
64
  "action_type": "classify_severity | identify_root_cause | escalate | remediate | request_more_logs | resolve | ignore",
65
- "value": "string (depends on action_type)",
66
  "confidence": "float [0.0, 1.0]",
67
  "reasoning": "string (optional)",
68
  },
@@ -72,10 +87,10 @@ def get_tasks():
72
  "name": "Silent Degradation with Noise",
73
  "difficulty": "hard",
74
  "max_steps": 15,
75
- "description": "Slow degradation hidden in 60% noise. Nuanced P2 judgment.",
76
  "action_schema": {
77
  "action_type": "classify_severity | identify_root_cause | escalate | remediate | request_more_logs | resolve | ignore",
78
- "value": "string (depends on action_type)",
79
  "confidence": "float [0.0, 1.0]",
80
  "reasoning": "string (optional)",
81
  },
@@ -86,14 +101,19 @@ def get_tasks():
86
 
87
  @app.post("/grader")
88
  def grader():
89
- # TODO Day 4: wire to grader logic
90
- return {"message": "grader endpoint placeholder", "score": 0.0}
 
 
 
 
 
91
 
92
 
93
  @app.post("/baseline")
94
  def baseline():
95
  # TODO Day 5: wire to baseline.py
96
- return {"message": "baseline endpoint placeholder"}
97
 
98
 
99
  if __name__ == "__main__":
 
1
+ from fastapi import FastAPI, Query
2
  from fastapi.responses import JSONResponse
3
  import uvicorn
4
 
5
+ from server.models import TriageAction
6
+ from server.environment import LogTriageEnvironment
7
 
8
  app = FastAPI(
9
  title="LogTriageEnv",
 
11
  version="1.0.0",
12
  )
13
 
14
+ # One environment instance per server process
15
+ env = LogTriageEnvironment()
16
+
17
 
18
  @app.get("/health")
19
  def health():
 
21
 
22
 
23
  @app.post("/reset")
24
+ def reset(
25
+ task: str = Query(default="single_crash", description="Task ID to run"),
26
+ seed: int = Query(default=None, description="Random seed for reproducibility"),
27
+ ):
28
+ try:
29
+ obs = env.reset(task_id=task, seed=seed)
30
+ return obs.model_dump()
31
+ except ValueError as e:
32
+ return JSONResponse(status_code=400, content={"error": str(e)})
33
 
34
 
35
  @app.post("/step")
36
  def step(action: TriageAction):
 
37
  valid, err = action.is_valid()
38
  if not valid:
39
  return JSONResponse(status_code=422, content={"error": err})
40
+ try:
41
+ obs = env.step(action)
42
+ return obs.model_dump()
43
+ except RuntimeError as e:
44
+ return JSONResponse(status_code=400, content={"error": str(e)})
45
 
46
 
47
  @app.get("/state")
48
  def state():
49
+ try:
50
+ return env.state.model_dump()
51
+ except RuntimeError as e:
52
+ return JSONResponse(status_code=400, content={"error": str(e)})
53
 
54
 
55
  @app.get("/tasks")
 
64
  "description": "One service crashes. Classify severity, find root cause, remediate.",
65
  "action_schema": {
66
  "action_type": "classify_severity | identify_root_cause | escalate | remediate | request_more_logs | resolve | ignore",
67
+ "value": "string (depends on action_type β€” see README)",
68
  "confidence": "float [0.0, 1.0]",
69
  "reasoning": "string (optional)",
70
  },
 
74
  "name": "Cascading Failure",
75
  "difficulty": "medium",
76
  "max_steps": 12,
77
+ "description": "DB slowdown cascades upstream. Find the true root cause, not symptoms.",
78
  "action_schema": {
79
  "action_type": "classify_severity | identify_root_cause | escalate | remediate | request_more_logs | resolve | ignore",
80
+ "value": "string (depends on action_type β€” see README)",
81
  "confidence": "float [0.0, 1.0]",
82
  "reasoning": "string (optional)",
83
  },
 
87
  "name": "Silent Degradation with Noise",
88
  "difficulty": "hard",
89
  "max_steps": 15,
90
+ "description": "Slow degradation hidden in 60% noise. Nuanced P2 severity judgment.",
91
  "action_schema": {
92
  "action_type": "classify_severity | identify_root_cause | escalate | remediate | request_more_logs | resolve | ignore",
93
+ "value": "string (depends on action_type β€” see README)",
94
  "confidence": "float [0.0, 1.0]",
95
  "reasoning": "string (optional)",
96
  },
 
101
 
102
  @app.post("/grader")
103
  def grader():
104
+ score = env.get_grader_score()
105
+ return {
106
+ "score": score,
107
+ "episode_id": env.state.episode_id if env._state else None,
108
+ "task_id": env._task_id,
109
+ "steps_taken": env.state.step_count if env._state else 0,
110
+ }
111
 
112
 
113
  @app.post("/baseline")
114
  def baseline():
115
  # TODO Day 5: wire to baseline.py
116
+ return {"message": "baseline endpoint β€” to be wired on Day 5"}
117
 
118
 
119
  if __name__ == "__main__":
server/environment.py CHANGED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core LogTriageEnvironment class.
3
+ Implements OpenEnv interface: reset(), step(), state property.
4
+ """
5
+ from __future__ import annotations
6
+ import random
7
+ from datetime import datetime
8
+ from uuid import uuid4
9
+
10
+ from server.models import (
11
+ TriageAction,
12
+ TriageObservation,
13
+ EpisodeState,
14
+ LogLine,
15
+ ServiceStatus,
16
+ )
17
+ from server.scenarios import single_crash
18
+ from server.log_generator import generate_healthy_system_state, _make_timestamp
19
+
20
# ─── TASK REGISTRY ─────────────────────────────────────────────────────────────

# Step budget per task; keys are the valid task_id values accepted by reset().
TASK_MAX_STEPS = {
    "single_crash": 8,
    "cascading_failure": 12,
    "silent_degradation": 15,
}

# ─── REWARD CONSTANTS ──────────────────────────────────────────────────────────

# Positive rewards. The four main components (severity + root cause +
# remediation + speed bonus) sum to 1.0 for a perfect episode.
R_CORRECT_SEVERITY = 0.30
R_CORRECT_ROOT_CAUSE = 0.35
R_CORRECT_REMEDIATION = 0.25
R_CORRECT_ESCALATION = 0.10
R_SPEED_BONUS = 0.10
# Partial credit: right service family / adjacent severity tier.
R_PARTIAL_SERVICE_FAM = 0.10
R_PARTIAL_SEVERITY_ADJ = 0.10

# Penalties (negative rewards).
P_WRONG_ESCALATION = -0.10
P_IGNORE_P1 = -0.50
P_REDUNDANT_ACTION = -0.05
P_EXCEEDED_BUDGET = -0.20
P_OVERESCALATE_P3_P1 = -0.15
43
+
44
+
45
class LogTriageEnvironment:
    """
    OpenEnv-compatible environment for SRE incident triage.

    Usage:
        env = LogTriageEnvironment()
        obs = env.reset(task_id="single_crash", seed=42)
        while not obs.done:
            action = agent.act(obs)
            obs = env.step(action)
        score = env.get_grader_score()
    """

    def __init__(self):
        self._state: EpisodeState | None = None
        self._rng: random.Random = random.Random()
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # kept as-is so the naive-UTC timestamps fed to the log generator
        # are unchanged — consider datetime.now(timezone.utc) later.
        self._base_time: datetime = datetime.utcnow()
        self._task_id: str = "single_crash"
        self._ground_truth: dict = {}
        self._current_obs: TriageObservation | None = None
        # One-time guard for escalation credit, mirroring the
        # correct_severity / correct_root_cause / correct_remediation
        # guards kept on EpisodeState. BUG FIX: without this, repeated
        # correct escalations farmed +R_CORRECT_ESCALATION every step.
        self._escalated_correctly: bool = False

    # ─── OPENENV INTERFACE ─────────────────────────────────────────────────────

    def reset(self, task_id: str = "single_crash", seed: int | None = None) -> TriageObservation:
        """Start a fresh episode. Returns the initial observation.

        Raises:
            ValueError: if *task_id* is not a registered task.
        """
        if task_id not in TASK_MAX_STEPS:
            raise ValueError(f"Unknown task_id '{task_id}'. Valid: {list(TASK_MAX_STEPS.keys())}")

        self._task_id = task_id
        self._rng = random.Random(seed)
        self._base_time = datetime.utcnow()
        self._escalated_correctly = False

        # Load ground truth for this task
        if task_id == "single_crash":
            self._ground_truth = single_crash.GROUND_TRUTH
        else:
            # Tasks 2 & 3 will be wired in Day 3
            self._ground_truth = {}

        # Initialize episode state
        self._state = EpisodeState(
            episode_id=str(uuid4()),
            task_id=task_id,
            step_count=0,
            max_steps=TASK_MAX_STEPS[task_id],
            done=False,
            cumulative_score=0.0,
            actions_taken=[],
            correct_severity=None,
            correct_root_cause=None,
            correct_remediation=False,
        )

        # Get initial observation (step 0)
        logs, system_state = self._get_step_data(0)
        alerts = self._get_alerts(0)

        obs = TriageObservation(
            logs=logs,
            system_state=system_state,
            incident_id=self._state.episode_id,
            task_id=task_id,
            step_count=0,
            time_elapsed_seconds=0,
            active_alerts=alerts,
            reward=0.0,
            cumulative_score=0.0,
            done=False,
            last_action_feedback="Incident detected. Analyze the logs and take action.",
            invalid_action_error=None,
        )
        self._current_obs = obs
        return obs

    def step(self, action: TriageAction) -> TriageObservation:
        """Take one action. Returns next observation + reward.

        Raises:
            RuntimeError: if called before reset() or after the episode ended.
        """
        if self._state is None:
            raise RuntimeError("Call reset() before step()")
        if self._state.done:
            raise RuntimeError("Episode is done. Call reset() to start a new episode.")

        # Invalid actions do not consume a step and score 0.
        valid, err = action.is_valid()
        if not valid:
            return self._make_obs(
                reward=0.0,
                feedback=f"Invalid action: {err}",
                invalid_action_error=err,
                advance_step=False,
            )

        # Calculate reward for this action
        reward, feedback = self._evaluate_action(action)

        # Update state
        self._state.cumulative_score = round(
            self._state.cumulative_score + reward, 4
        )
        self._state.actions_taken.append(action.action_type)
        self._state.step_count += 1

        # Check if episode should end
        done = self._check_done(action)
        self._state.done = done

        # BUG FIX: the original tested `step_count >= max_steps and not done`,
        # but _check_done() already returns True on budget exhaustion, so the
        # penalty branch was unreachable. Penalize only when the episode ended
        # because the budget ran out (not via resolve, not via the ignore-P1
        # catastrophic termination).
        ended_by_budget = (
            done
            and self._state.step_count >= self._state.max_steps
            and action.action_type != "resolve"
            and not (action.action_type == "ignore"
                     and self._ground_truth.get("severity") == "P1")
        )
        if ended_by_budget:
            self._state.cumulative_score = round(
                self._state.cumulative_score + P_EXCEEDED_BUDGET, 4
            )
            feedback += f" Step budget exceeded ({self._state.max_steps} steps). Penalty applied."

        return self._make_obs(reward=reward, feedback=feedback, advance_step=True)

    @property
    def state(self) -> EpisodeState:
        """Return current episode state.

        Raises:
            RuntimeError: if no episode has been started yet.
        """
        if self._state is None:
            raise RuntimeError("Call reset() first.")
        return self._state

    def get_grader_score(self) -> float:
        """
        Return final grader score for the completed episode.
        Score is normalized (clamped) to [0.0, 1.0]; 0.0 before any reset().
        """
        if self._state is None:
            return 0.0
        raw = self._state.cumulative_score
        return round(max(0.0, min(1.0, raw)), 4)

    # ─── INTERNAL HELPERS ──────────────────────────────────────────────────────

    def _evaluate_action(self, action: TriageAction) -> tuple[float, str]:
        """
        Evaluate the action against ground truth.
        Returns (reward: float, feedback: str).
        """
        gt = self._ground_truth
        reward = 0.0
        feedback_parts = []

        # Penalize redundant actions (same action_type issued again).
        if action.action_type in self._state.actions_taken:
            reward += P_REDUNDANT_ACTION
            feedback_parts.append("Redundant action β€” you've already done this.")

        # ── classify_severity ──────────────────────────────────────────────────
        if action.action_type == "classify_severity":
            correct_sev = gt.get("severity", "")
            if action.value == correct_sev:
                if self._state.correct_severity is None:  # only reward first time
                    reward += R_CORRECT_SEVERITY
                    feedback_parts.append(f"Correct severity: {action.value}. +{R_CORRECT_SEVERITY}")
                self._state.correct_severity = action.value
            else:
                # Partial credit: P1 vs P2 is close, P1 vs P3 is not
                if correct_sev == "P1" and action.value == "P3":
                    reward += P_OVERESCALATE_P3_P1  # wrong direction
                    feedback_parts.append(f"Incorrect severity: {action.value}. P1 expected. This is a customer-impacting incident.")
                elif correct_sev == "P1" and action.value == "P2":
                    reward += R_PARTIAL_SEVERITY_ADJ
                    feedback_parts.append(f"Close β€” {action.value} given, P1 expected. Partial credit.")
                else:
                    feedback_parts.append(f"Incorrect severity: {action.value}. Reassess.")

        # ── identify_root_cause ────────────────────────────────────────────────
        elif action.action_type == "identify_root_cause":
            correct_rc = gt.get("root_cause", "")
            if action.value == correct_rc:
                if self._state.correct_root_cause is None:  # only reward first time
                    reward += R_CORRECT_ROOT_CAUSE
                    feedback_parts.append(f"Correct root cause: {action.value}. +{R_CORRECT_ROOT_CAUSE}")
                self._state.correct_root_cause = action.value
            else:
                # Partial credit: same tier (e.g. payment-db instead of payment-service)
                if correct_rc.split("-")[0] == action.value.split("-")[0]:
                    reward += R_PARTIAL_SERVICE_FAM
                    feedback_parts.append(f"Close β€” {action.value} is in the right service family. Check more carefully.")
                else:
                    feedback_parts.append(f"Incorrect root cause: {action.value}. Look at which service is actually failing.")

        # ── escalate ──────────────────────────────────────────────────────────
        elif action.action_type == "escalate":
            correct_teams = gt.get("correct_teams", set())
            if action.value in correct_teams:
                # BUG FIX: credit correct escalation only once, like the other
                # reward components — otherwise it could be farmed each step.
                if not self._escalated_correctly:
                    reward += R_CORRECT_ESCALATION
                    self._escalated_correctly = True
                    feedback_parts.append(f"Correct escalation to {action.value}. +{R_CORRECT_ESCALATION}")
                else:
                    feedback_parts.append(f"Escalation to {action.value} already credited.")
            else:
                reward += P_WRONG_ESCALATION
                feedback_parts.append(f"Wrong team escalated: {action.value}. Penalty applied.")

        # ── remediate ─────────────────────────────────────────────────────────
        elif action.action_type == "remediate":
            # Value format is "<prefix>:<service>", e.g. "restart:payment-service".
            prefix = action.value.split(":")[0]
            service = action.value.split(":")[1] if ":" in action.value else ""
            correct_prefixes = gt.get("remediation_prefixes", set())
            correct_service = gt.get("remediation_service", "")

            if prefix in correct_prefixes and service == correct_service:
                if not self._state.correct_remediation:  # only reward first time
                    reward += R_CORRECT_REMEDIATION
                    feedback_parts.append(f"Correct remediation: {action.value}. +{R_CORRECT_REMEDIATION}")
                self._state.correct_remediation = True
            elif service == correct_service and prefix not in correct_prefixes:
                reward += 0.05  # right service, wrong action
                feedback_parts.append(f"Right service, but '{prefix}' may not fix this. Try another remediation type.")
            else:
                feedback_parts.append(f"Incorrect remediation: {action.value}. Reconsider which service needs fixing.")

        # ── ignore ────────────────────────────────────────────────────────────
        elif action.action_type == "ignore":
            correct_sev = gt.get("severity", "")
            if correct_sev == "P1":
                reward += P_IGNORE_P1
                feedback_parts.append("CRITICAL ERROR: Ignored a P1 incident! Major penalty applied.")
            else:
                feedback_parts.append("Marked as noise.")

        # ── request_more_logs ─────────────────────────────────────────────────
        elif action.action_type == "request_more_logs":
            feedback_parts.append(f"Fetching more logs for {action.value}...")

        # ── resolve ───────────────────────────────────────────────────────────
        elif action.action_type == "resolve":
            # Speed bonus if resolved within 60% of step budget
            step_budget = self._state.max_steps
            if self._state.step_count <= int(step_budget * 0.6):
                reward += R_SPEED_BONUS
                feedback_parts.append(f"Incident resolved efficiently. Speed bonus: +{R_SPEED_BONUS}")
            else:
                feedback_parts.append("Incident resolved.")

        return round(reward, 4), " | ".join(feedback_parts) or "Action processed."

    def _check_done(self, action: TriageAction) -> bool:
        """Episode ends on resolve, ignore of a P1, or step budget exhaustion."""
        if action.action_type == "resolve":
            return True
        if action.action_type == "ignore" and self._ground_truth.get("severity") == "P1":
            return True  # Catastrophic β€” episode ends immediately
        if self._state.step_count >= self._state.max_steps:
            return True
        return False

    def _get_step_data(self, step: int):
        """Get (logs, system_state) for the current step from the scenario."""
        if self._task_id == "single_crash":
            return single_crash.get_step_data(step, self._base_time, self._rng)
        # Tasks 2 & 3 wired in Day 3
        return [], generate_healthy_system_state(self._base_time)

    def _get_alerts(self, step: int) -> list[str]:
        """Get active alerts for the current step from the scenario."""
        if self._task_id == "single_crash":
            return single_crash.get_active_alerts(step)
        return []

    def _make_obs(
        self,
        reward: float,
        feedback: str,
        invalid_action_error: str | None = None,
        advance_step: bool = True,  # kept for interface compat; state already advanced by step()
    ) -> TriageObservation:
        """Build a TriageObservation reflecting the current episode state."""
        step = self._state.step_count
        logs, system_state = self._get_step_data(step)
        alerts = self._get_alerts(step)

        return TriageObservation(
            logs=logs,
            system_state=system_state,
            incident_id=self._state.episode_id,
            task_id=self._state.task_id,
            step_count=step,
            time_elapsed_seconds=step * 30,  # each step models ~30s of wall time
            active_alerts=alerts,
            reward=reward,
            cumulative_score=self._state.cumulative_score,
            done=self._state.done,
            last_action_feedback=feedback,
            invalid_action_error=invalid_action_error,
        )
server/log_generator.py CHANGED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Log generator for LogTriageEnv.
3
+ Produces realistic-looking log lines for the simulated microservice cluster.
4
+ """
5
+ from __future__ import annotations
6
+ import random
7
+ from datetime import datetime, timedelta
8
+ from server.models import LogLine, ServiceStatus
9
+
10
+ # ─── SERVICES ─────────────────────────────────────────────────────────────────
11
+
12
+ SERVICES = [
13
+ "api-gateway",
14
+ "auth-service",
15
+ "user-db",
16
+ "payment-service",
17
+ "payment-db",
18
+ "notification-service",
19
+ "email-queue",
20
+ ]
21
+
22
+ # ─── LOG TEMPLATES ────────────────────────────────────────────────────────────
23
+
24
+ # Noise logs β€” realistic but irrelevant to the incident
25
+ NOISE_TEMPLATES = {
26
+ "api-gateway": [
27
+ ("INFO", "health check passed β€” all upstream services reachable"),
28
+ ("INFO", "request completed: GET /api/v1/users/profile [200] 45ms"),
29
+ ("INFO", "rate limiter: 1240/5000 requests this minute"),
30
+ ("DEBUG", "connection pool: 12/100 active connections"),
31
+ ("INFO", "TLS certificate valid for 87 more days"),
32
+ ],
33
+ "auth-service": [
34
+ ("INFO", "JWT token issued for user_id=88142 [expires: 3600s]"),
35
+ ("INFO", "OAuth2 flow completed successfully"),
36
+ ("DEBUG", "session cache hit ratio: 94.2%"),
37
+ ("INFO", "password reset email queued for user_id=23019"),
38
+ ],
39
+ "user-db": [
40
+ ("INFO", "daily vacuum completed: 0 dead tuples removed"),
41
+ ("INFO", "checkpoint complete: wrote 142 buffers"),
42
+ ("DEBUG", "autovacuum: processing table 'sessions'"),
43
+ ("INFO", "replication lag: 12ms (within threshold)"),
44
+ ],
45
+ "payment-service": [
46
+ ("INFO", "payment processed: txn_id=TXN-8812 amount=299.00 INR [success]"),
47
+ ("INFO", "webhook delivered: stripe event=payment.succeeded"),
48
+ ("DEBUG", "idempotency key cache: 2341 keys active"),
49
+ ],
50
+ "payment-db": [
51
+ ("INFO", "connection pool: 8/50 active"),
52
+ ("DEBUG", "query plan cache: 88% hit ratio"),
53
+ ("INFO", "index usage: 99.1% queries using indexed scans"),
54
+ ],
55
+ "notification-service": [
56
+ ("INFO", "email dispatched: template=welcome_email to=user@example.com"),
57
+ ("INFO", "SMS delivered: +91XXXXXXXXXX [provider=twilio]"),
58
+ ("WARN", "email bounce rate: 1.2% (threshold: 5%)"),
59
+ ("INFO", "push notification sent: device_tokens=1240"),
60
+ ],
61
+ "email-queue": [
62
+ ("INFO", "queue depth: 42 messages pending"),
63
+ ("INFO", "consumer lag: 0.3s (healthy)"),
64
+ ("DEBUG", "partition rebalance completed in 120ms"),
65
+ ],
66
+ }
67
+
68
# Signal logs — actual incident indicators.
# Keys name a scenario signal stream; values are (LEVEL, message) tuples that
# scenario modules feed into generate_log_batch as the "signal" portion of a batch.
SIGNAL_TEMPLATES = {
    # Single service crash signals (Task 1 — payment-service crash)
    "single_crash_payment": [
        ("ERROR", "NullPointerException: Cannot invoke method processPayment() on null object — PaymentProcessor.java:142"),
        ("ERROR", "HTTP 500 Internal Server Error: payment gateway returned null response"),
        ("ERROR", "NullPointerException in PaymentService.execute() — retrying (attempt 1/3)"),
        ("ERROR", "NullPointerException in PaymentService.execute() — retrying (attempt 2/3)"),
        ("FATAL", "NullPointerException in PaymentService.execute() — all retries exhausted, request failed"),
        ("ERROR", "health check FAILED: payment-service returned 500 (was 200)"),
        ("ERROR", "circuit breaker OPEN: payment-service error rate 98.2% (threshold: 10%)"),
    ],
    # Cascading failure signals (Task 2 — user-db → auth-service → api-gateway)
    "cascading_userdb": [
        ("WARN", "slow query detected: SELECT * FROM sessions WHERE user_id=? [latency: 2847ms, threshold: 200ms]"),
        ("ERROR", "slow query detected: SELECT * FROM sessions WHERE user_id=? [latency: 4120ms]"),
        ("ERROR", "query timeout: SELECT * FROM active_sessions [timeout after 5000ms]"),
    ],
    "cascading_auth": [
        ("WARN", "db connection pool: 42/50 active connections (84% utilization)"),
        ("ERROR", "db connection pool exhausted: 50/50 connections in use — requests queuing"),
        ("ERROR", "authentication request timed out waiting for db connection [5200ms]"),
    ],
    "cascading_gateway": [
        # NOTE(review): "{req_id}" is literal text in these two templates — nothing in
        # this module substitutes it; presumably a consumer calls str.format. Confirm.
        ("ERROR", "upstream timeout: auth-service failed to respond within 5000ms [req-id: {req_id}]"),
        ("ERROR", "upstream timeout: auth-service [req-id: {req_id}] — returning 504 to client"),
        ("WARN", "error rate spike: 34.2% of requests failing (threshold: 5%)"),
    ],
    # Silent degradation signals (Task 3 — payment-db slow)
    "silent_paymentdb": [
        ("WARN", "query latency elevated: avg=450ms (normal: 80ms) — monitoring"),
        ("WARN", "query latency elevated: avg=620ms — possible memory pressure"),
        ("WARN", "query latency elevated: avg=890ms — recommend investigation"),
        ("WARN", "query latency elevated: avg=1200ms — approaching timeout threshold"),
        ("WARN", "buffer cache hit ratio degraded: 87% (normal: 98%) — possible memory issue"),
    ],
}
105
+
106
+
107
+ def _make_timestamp(base_time: datetime, offset_seconds: int = 0) -> str:
108
+ t = base_time + timedelta(seconds=offset_seconds)
109
+ return t.strftime("%Y-%m-%dT%H:%M:%SZ")
110
+
111
+
112
def _noise_log(service: str, base_time: datetime, offset: int, rng: random.Random = None) -> LogLine:
    """Build one routine background LogLine for *service*.

    Picks a (level, message) template from NOISE_TEMPLATES, falling back to a
    generic INFO line for services without templates.

    Args:
        service: Service name used to look up NOISE_TEMPLATES.
        base_time: Episode start time; the timestamp is base_time + offset.
        offset: Seconds past base_time for this line's timestamp.
        rng: Optional seeded Random so template choice is reproducible.
            When omitted, falls back to the module-level random (the previous,
            non-deterministic behavior), keeping existing callers working.
    """
    chooser = rng if rng is not None else random
    templates = NOISE_TEMPLATES.get(service, [("INFO", "routine operation completed")])
    level, message = chooser.choice(templates)
    return LogLine(
        timestamp=_make_timestamp(base_time, offset),
        level=level,
        service=service,
        request_id=None,  # background chatter carries no trace id
        message=message,
        latency_ms=None,
    )
123
+
124
+
125
def generate_log_batch(
    scenario_signals: list[tuple[str, str, str]],  # [(service, level, message), ...]
    step: int,
    base_time: datetime,
    noise_ratio: float = 0.3,
    batch_size: int = 8,
    rng: random.Random = None,
) -> list[LogLine]:
    """
    Generate a mixed batch of signal + noise log lines.

    Signal lines are never dropped: noise only fills whatever room is left up
    to batch_size, and the batch is no longer blindly sliced to batch_size
    after shuffling (the old ``logs[:batch_size]`` could silently discard
    random *signal* lines whenever len(scenario_signals) > batch_size).

    Args:
        scenario_signals: List of (service, level, message) tuples — the actual
            signals for this step.
        step: Current step number; each step advances timestamps by 30
            simulated seconds.
        base_time: Episode start time (used for timestamps).
        noise_ratio: Accepted for API symmetry with scenario configs, but
            currently unused — noise count is simply batch_size minus the
            signal count. TODO: honor it if scenarios need variable density.
        batch_size: Target number of log lines to return (may be exceeded only
            when there are more signals than batch_size).
        rng: Optional seeded Random for reproducibility.
            NOTE(review): noise *message* selection inside _noise_log still
            draws from the module-level random, so fully reproducible batches
            also require seeding that — confirm when tightening determinism.

    Returns:
        List of LogLine objects, shuffled (signal mixed into noise).
    """
    if rng is None:
        rng = random.Random()

    base_offset = step * 30  # 30 simulated seconds per step

    logs: list[LogLine] = []

    # Add signal logs first; they are the ground-truth incident indicators.
    for i, (service, level, message) in enumerate(scenario_signals):
        # Only problem-level lines carry a request id, mimicking traced failures.
        req_id = f"req-{rng.randint(1000, 9999)}" if level in ("ERROR", "WARN") else None
        lowered = message.lower()
        logs.append(LogLine(
            timestamp=_make_timestamp(base_time, base_offset + i),
            level=level,
            service=service,
            request_id=req_id,
            message=message,
            # Attach a plausible latency only to timeout/latency-themed messages.
            latency_ms=rng.randint(200, 5000) if "timeout" in lowered or "latency" in lowered else None,
        ))

    # Fill remaining slots with routine noise from randomly chosen services.
    noise_count = max(0, batch_size - len(logs))
    for i, svc in enumerate(rng.choices(SERVICES, k=noise_count)):
        logs.append(_noise_log(svc, base_time, base_offset + len(scenario_signals) + i))

    # Shuffle — signal should not always be first.
    rng.shuffle(logs)
    return logs
174
+
175
+
176
def generate_healthy_system_state(base_time: datetime, rng: random.Random = None) -> dict[str, ServiceStatus]:
    """Generate a fully healthy system state snapshot.

    Every service in SERVICES is reported "up" with a small randomized error
    rate and p99 latency so consecutive snapshots look organic.

    Args:
        base_time: Timestamp recorded as every service's last_updated.
        rng: Optional seeded Random so the jittered metrics are reproducible.
            When omitted, falls back to the module-level random (the previous,
            non-deterministic behavior), keeping existing callers working.
    """
    chooser = rng if rng is not None else random
    now = _make_timestamp(base_time)
    return {
        svc: ServiceStatus(
            name=svc,
            status="up",
            error_rate=round(chooser.uniform(0.001, 0.01), 4),
            latency_p99_ms=chooser.randint(20, 80),
            last_updated=now,
        )
        for svc in SERVICES
    }
server/scenarios/single_crash.py CHANGED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Task 1 β€” Single Service Crash (Easy)
3
+
4
+ Scenario: payment-service crashes with NullPointerException on every request.
5
+ All other services are healthy. Logs are mostly unambiguous.
6
+ Noise ratio: ~20%.
7
+
8
+ Ground truth:
9
+ - severity: P1
10
+ - root_cause: payment-service
11
+ - remediation: restart:payment-service
12
+ - correct_team: backend-team
13
+ """
14
+ from __future__ import annotations
15
+ import random
16
+ from datetime import datetime
17
+ from server.models import LogLine, ServiceStatus
18
+ from server.log_generator import (
19
+ generate_log_batch,
20
+ generate_healthy_system_state,
21
+ SIGNAL_TEMPLATES,
22
+ _make_timestamp,
23
+ )
24
+
25
+ # ─── GROUND TRUTH ─────────────────────────────────────────────────────────────
26
+
27
# Expected episode answers for Task 1, consumed by the grader and by
# get_step_data (noise_ratio) below.
GROUND_TRUTH = {
    "severity": "P1",                           # expected severity classification
    "root_cause": "payment-service",            # service the agent must identify
    "remediation_prefixes": {"restart"},  # restart:payment-service is correct
    "remediation_service": "payment-service",   # target of the remediation action
    "correct_teams": {"backend-team", "sre-team"},  # accepted escalation targets — presumably set-membership in the grader
    "max_steps": 8,       # episode step budget (matches len(STEP_SIGNALS))
    "noise_ratio": 0.20,  # forwarded to generate_log_batch by get_step_data
}
36
+
37
+ # ─── STEP-BY-STEP SIGNAL PLAN ─────────────────────────────────────────────────
38
+ # Each list = signals injected at that step index.
39
+ # Step 0 = after reset (first observation), Step 7 = last possible step.
40
+
41
# Each entry is the list of (service, LEVEL, message) signals injected at that
# step; get_step_data clamps step indices past the end to the final entry, so
# the loudest signals repeat until the episode ends.
STEP_SIGNALS = [
    # Step 0: first signs — circuit breaker opens, error rate spike
    [
        ("payment-service", "ERROR", "NullPointerException: Cannot invoke processPayment() on null — PaymentProcessor.java:142"),
        ("api-gateway", "WARN", "error rate spike: 28.4% of /payment requests failing"),
    ],
    # Step 1: escalating — more errors, health check fails
    [
        ("payment-service", "FATAL", "NullPointerException in PaymentService.execute() — all retries (3/3) exhausted"),
        ("payment-service", "ERROR", "health check FAILED: payment-service returned HTTP 500"),
    ],
    # Step 2: circuit breaker fully open
    [
        ("api-gateway", "ERROR", "circuit breaker OPEN: payment-service error rate 98.2% (threshold: 10%)"),
        ("payment-service", "ERROR", "NullPointerException: Cannot invoke processPayment() on null — PaymentProcessor.java:142"),
    ],
    # Step 3+: same signals repeat — incident ongoing until agent acts
    [
        ("payment-service", "ERROR", "NullPointerException in PaymentService.execute() — retrying (1/3)"),
        ("api-gateway", "ERROR", "upstream failure: payment-service unavailable [circuit breaker: OPEN]"),
    ],
    # Step 4: service formally marked DOWN
    [
        ("payment-service", "FATAL", "payment-service health check FAILED for 90s — marking as DOWN"),
        ("api-gateway", "WARN", "payment endpoint degraded — all requests returning 503"),
    ],
    # Step 5: near-total failure of payment routes
    [
        ("payment-service", "ERROR", "NullPointerException: Cannot invoke processPayment() on null — PaymentProcessor.java:142"),
        ("api-gateway", "ERROR", "error rate: 99.1% on /payment/* routes"),
    ],
    # Step 6: SLA breach reported
    [
        ("payment-service", "FATAL", "NullPointerException — service unresponsive for 180s"),
        ("api-gateway", "ERROR", "SLA breach: payment service uptime < 99.9%"),
    ],
    # Step 7 (last): maximum urgency — repeats if the agent still hasn't acted
    [
        ("payment-service", "FATAL", "CRITICAL: payment-service has been DOWN for 210s — immediate action required"),
        ("api-gateway", "ERROR", "all payment transactions failing — revenue impact ongoing"),
    ],
]
79
+
80
+
81
def get_system_state(step: int, base_time: datetime) -> dict[str, ServiceStatus]:
    """
    Snapshot of per-service health for this step.

    Starts from a fully healthy snapshot, then overrides payment-service alone
    to DOWN with a saturated error rate. Note: the healthy services keep a
    last_updated of base_time, while payment-service's advances with the step
    (30 simulated seconds per step).
    """
    snapshot = generate_healthy_system_state(base_time)
    snapshot["payment-service"] = ServiceStatus(
        name="payment-service",
        status="down",
        error_rate=0.982,
        latency_p99_ms=5000,
        last_updated=_make_timestamp(base_time, step * 30),
    )
    return snapshot
95
+
96
+
97
def get_step_data(step: int, base_time: datetime, rng: random.Random) -> tuple[list[LogLine], dict[str, ServiceStatus]]:
    """
    Build the (logs, system_state) observation for the given step.

    The signal schedule is clamped to the final STEP_SIGNALS entry, so the
    loudest signals simply repeat once the plan runs out and the agent still
    hasn't acted.
    """
    clamped = min(step, len(STEP_SIGNALS) - 1)
    batch = generate_log_batch(
        scenario_signals=STEP_SIGNALS[clamped],
        step=step,
        base_time=base_time,
        noise_ratio=GROUND_TRUTH["noise_ratio"],
        batch_size=8,
        rng=rng,
    )
    return batch, get_system_state(step, base_time)
115
+
116
+
117
def get_active_alerts(step: int) -> list[str]:
    """Alerts visible to the agent at this step; escalations accumulate as the outage persists."""
    base = [
        "payment-service: circuit breaker OPEN",
        "payment-service: health check FAILING",
    ]
    escalations = []
    if step >= 2:
        escalations.append("SLA_BREACH: payment availability < 99.9%")
    if step >= 5:
        escalations.append("CRITICAL: payment-service DOWN > 150s")
    return base + escalations