Pramod Basavaraj Menasi committed on
Commit
3cdafd1
·
1 Parent(s): d574597

fix: add tasks with graders to openenv.yaml and grade() method

Browse files
openenv.yaml CHANGED
@@ -5,3 +5,33 @@ runtime: fastapi
5
  app: server.app:app
6
  port: 8000
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  app: server.app:app
6
  port: 8000
7
 
8
+ tasks:
9
+ - id: incident_easy
10
+ name: "Single Service Outage (Easy)"
11
+ description: "Diagnose and resolve a payment-service latency spike caused by a bad deployment."
12
+ difficulty: easy
13
+ reset_kwargs:
14
+ difficulty: easy
15
+ grader:
16
+ type: env_reward
17
+ success_threshold: 0.1
18
+
19
+ - id: incident_medium
20
+ name: "Dependency Failure (Medium)"
21
+ description: "Identify a DB timeout causing API gateway failures with no logs initially available."
22
+ difficulty: medium
23
+ reset_kwargs:
24
+ difficulty: medium
25
+ grader:
26
+ type: env_reward
27
+ success_threshold: 0.1
28
+
29
+ - id: incident_hard
30
+ name: "Multi-Service Root Cause (Hard)"
31
+ description: "Trace EU checkout failures across auth, payment, and checkout services to a DNS issue."
32
+ difficulty: hard
33
+ reset_kwargs:
34
+ difficulty: hard
35
+ grader:
36
+ type: env_reward
37
+ success_threshold: 0.1
server/incidentops_env_environment.py CHANGED
@@ -276,7 +276,35 @@ class IncidentopsEnvironment(Environment):
276
  }
277
  self._last_observation = obs
278
  return obs
 
 
 
 
279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  @property
281
  def state(self) -> State:
282
  return self._state
 
276
  }
277
  self._last_observation = obs
278
  return obs
279
def grade(self) -> dict:
    """Score the current episode from its snapshot.

    Per the original note, this is called by the OpenEnv validator to
    score a completed episode.

    Scoring (always within 0.0 - 1.0):
      * resolved within the SLA step budget: 0.5 + 0.5 * correctness
      * resolved but over the SLA budget:    0.3 + 0.3 * correctness
      * not resolved:                        0.1 * correctness

    ``correctness`` is the fraction of entries in the snapshot's
    ``correct_action_sequence`` that appear in ``action_history``.

    Returns:
        A dict with the rounded ``score``, the ``success`` flag
        (resolved and SLA met), and the raw episode facts read from the
        snapshot (``incident_resolved``, ``steps_taken``, ``sla_met``,
        ``wrong_escalations``, ``evidence_collected``).

    Raises:
        RuntimeError: if no episode snapshot exists yet.
    """
    snapshot = self._snapshot
    # Explicit check instead of ``assert``: asserts are stripped under -O,
    # which would turn this into an opaque AttributeError further down.
    if snapshot is None:
        raise RuntimeError("grade() called with no episode snapshot to score")

    sla_ok = snapshot.step_count <= snapshot.sla_steps

    # Count expected actions that were actually performed, rather than
    # history entries that happen to be expected. Repeating one correct
    # action many times therefore cannot push the ratio above 1.0.
    expected = snapshot.correct_action_sequence
    history = snapshot.action_history
    matched = sum(1 for action in expected if action in history)
    correctness_ratio = matched / max(len(expected), 1)

    if snapshot.resolved and sla_ok:
        score = 0.5 + 0.5 * correctness_ratio
    elif snapshot.resolved:
        score = 0.3 + 0.3 * correctness_ratio
    else:
        score = 0.1 * correctness_ratio

    return {
        "score": round(score, 4),
        "success": snapshot.resolved and sla_ok,
        "incident_resolved": snapshot.resolved,
        "steps_taken": snapshot.step_count,
        "sla_met": sla_ok,
        "wrong_escalations": snapshot.wrong_escalations,
        "evidence_collected": snapshot.evidence_collected,
    }
308
  @property
309
  def state(self) -> State:
310
  return self._state