Spaces:
Sleeping
Sleeping
Pramod Basavaraj Menasi committed on
Commit ·
3cdafd1
1
Parent(s): d574597
fix: add tasks with graders to openenv.yaml and grade() method
Browse files- openenv.yaml +30 -0
- server/incidentops_env_environment.py +28 -0
openenv.yaml
CHANGED
|
@@ -5,3 +5,33 @@ runtime: fastapi
|
|
| 5 |
app: server.app:app
|
| 6 |
port: 8000
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
app: server.app:app
|
| 6 |
port: 8000
|
| 7 |
|
| 8 |
+
tasks:
|
| 9 |
+
- id: incident_easy
|
| 10 |
+
name: "Single Service Outage (Easy)"
|
| 11 |
+
description: "Diagnose and resolve a payment-service latency spike caused by a bad deployment."
|
| 12 |
+
difficulty: easy
|
| 13 |
+
reset_kwargs:
|
| 14 |
+
difficulty: easy
|
| 15 |
+
grader:
|
| 16 |
+
type: env_reward
|
| 17 |
+
success_threshold: 0.1
|
| 18 |
+
|
| 19 |
+
- id: incident_medium
|
| 20 |
+
name: "Dependency Failure (Medium)"
|
| 21 |
+
description: "Identify a DB timeout causing API gateway failures with no logs initially available."
|
| 22 |
+
difficulty: medium
|
| 23 |
+
reset_kwargs:
|
| 24 |
+
difficulty: medium
|
| 25 |
+
grader:
|
| 26 |
+
type: env_reward
|
| 27 |
+
success_threshold: 0.1
|
| 28 |
+
|
| 29 |
+
- id: incident_hard
|
| 30 |
+
name: "Multi-Service Root Cause (Hard)"
|
| 31 |
+
description: "Trace EU checkout failures across auth, payment, and checkout services to a DNS issue."
|
| 32 |
+
difficulty: hard
|
| 33 |
+
reset_kwargs:
|
| 34 |
+
difficulty: hard
|
| 35 |
+
grader:
|
| 36 |
+
type: env_reward
|
| 37 |
+
success_threshold: 0.1
|
server/incidentops_env_environment.py
CHANGED
|
@@ -276,7 +276,35 @@ class IncidentopsEnvironment(Environment):
|
|
| 276 |
}
|
| 277 |
self._last_observation = obs
|
| 278 |
return obs
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
@property
|
| 281 |
def state(self) -> State:
|
| 282 |
return self._state
|
|
|
|
| 276 |
}
|
| 277 |
self._last_observation = obs
|
| 278 |
return obs
|
| 279 |
+
def grade(self) -> dict:
    """Score the completed episode for the OpenEnv validator.

    Reads the episode snapshot stored on ``self._snapshot`` and derives a
    normalized score in ``[0.0, 1.0]``:

    * resolved within the SLA: ``0.5 + 0.5 * correctness_ratio``
    * resolved after the SLA:  ``0.3 + 0.3 * correctness_ratio``
    * not resolved:            ``0.1 * correctness_ratio``

    ``correctness_ratio`` is the number of entries in ``action_history``
    that also appear in ``correct_action_sequence``, divided by the length
    of that sequence and capped at 1.0.

    Returns:
        A dict with the keys ``score`` (rounded to 4 decimals),
        ``success``, ``incident_resolved``, ``steps_taken``, ``sla_met``,
        ``wrong_escalations`` and ``evidence_collected``.

    Raises:
        RuntimeError: If ``self._snapshot`` is ``None``.
    """
    s = self._snapshot
    # Explicit check instead of ``assert``: asserts are removed under
    # ``python -O`` and the failure would surface as an opaque
    # AttributeError on ``None`` further down.
    if s is None:
        raise RuntimeError("grade() called before an episode snapshot exists")

    sla_ok = s.step_count <= s.sla_steps

    expected = s.correct_action_sequence
    correct_actions = sum(1 for a in s.action_history if a in expected)
    # Cap at 1.0: repeating a correct action would otherwise push the ratio
    # past 1 and let an unresolved episode exceed its 0.1 score ceiling.
    # ``max(..., 1)`` guards against an empty expected sequence.
    correctness_ratio = min(1.0, correct_actions / max(len(expected), 1))

    if s.resolved and sla_ok:
        score = 0.5 + 0.5 * correctness_ratio
    elif s.resolved:
        score = 0.3 + 0.3 * correctness_ratio
    else:
        score = 0.1 * correctness_ratio

    return {
        "score": round(score, 4),
        "success": s.resolved and sla_ok,
        "incident_resolved": s.resolved,
        "steps_taken": s.step_count,
        "sla_met": sla_ok,
        "wrong_escalations": s.wrong_escalations,
        "evidence_collected": s.evidence_collected,
    }
|
| 308 |
@property
|
| 309 |
def state(self) -> State:
|
| 310 |
return self._state
|