Elliot89 committed
Commit ebe276c · verified · 1 Parent(s): e4ab55f

Delete scoring.py

Files changed (1):
  1. scoring.py +0 -294
scoring.py DELETED
@@ -1,294 +0,0 @@
"""
scoring.py - Deterministic graders for all 3 SRE Incident Response tasks.

Public API:
    grade(task_id, state, scenario) -> {"total": float, "breakdown": dict, "feedback": str}

All scores are in [0.0, 1.0]. Graders are deterministic and reproducible.
"""

from __future__ import annotations


def grade(task_id: str, state: dict, scenario: dict) -> dict:
    """
    Entry point. Routes to the correct task grader.

    Args:
        task_id: One of alert_classification, root_cause_analysis, remediation_planning
        state: Current episode state dict from IncidentEnvironment
        scenario: The scenario dict that was loaded for this episode

    Returns:
        {
            "total": float in [0.0, 1.0],
            "breakdown": dict of sub-scores,
            "feedback": human-readable string
        }
    """
    graders = {
        "alert_classification": _grade_alert_classification,
        "root_cause_analysis": _grade_root_cause_analysis,
        "remediation_planning": _grade_remediation_planning,
    }
    if task_id not in graders:
        return {"total": 0.0, "breakdown": {}, "feedback": f"Unknown task_id: {task_id}"}
    return graders[task_id](state, scenario)


# ── Task 1: Alert Classification ────────────────────────────────────────────

def _grade_alert_classification(state: dict, scenario: dict) -> dict:
    """
    Scoring:
        1.0  - exact severity match
        0.5  - adjacent severity (one level off)
        0.25 - two levels off
        0.0  - opposite end or no submission
    """
    action_history = state.get("action_history", [])
    correct = scenario.get("correct_severity", "P1")
    adjacent = scenario.get("adjacent_severities", [])

    submitted_severity = None
    for action in action_history:
        if action.get("action_type") == "submit_severity":
            submitted_severity = (
                action.get("parameters", {}).get("severity", "")
                .upper()
                .strip()
            )
            break

    if not submitted_severity:
        return {
            "total": 0.0,
            "breakdown": {"severity_match": 0.0, "submitted": False},
            "feedback": "No severity submitted - score 0.0",
        }

    severity_order = ["P1", "P2", "P3", "P4"]

    if submitted_severity == correct:
        score = 1.0
        feedback = f"Exact match: {submitted_severity} == {correct}"
    elif submitted_severity in adjacent:
        score = 0.5
        feedback = f"Adjacent severity: submitted {submitted_severity}, correct {correct}"
    else:
        # Distance-based fallback
        try:
            dist = abs(severity_order.index(submitted_severity) - severity_order.index(correct))
        except ValueError:
            dist = 4
        if dist == 2:
            score = 0.25
        else:
            score = 0.0
        feedback = f"Wrong severity: submitted {submitted_severity}, correct {correct} (dist={dist})"

    return {
        "total": score,
        "breakdown": {
            "submitted_severity": submitted_severity,
            "correct_severity": correct,
            "severity_match": score,
        },
        "feedback": feedback,
    }


# ── Task 2: Root Cause Analysis ─────────────────────────────────────────────

def _grade_root_cause_analysis(state: dict, scenario: dict) -> dict:
    """
    Scoring:
        Base score (0.0-0.6):
            0.6  - correct service AND correct failure_mode
            0.35 - correct service only
            0.0  - wrong service
        Efficiency bonus (0.0-0.4):
            Based on how many unique relevant services were queried before submitting.
            More targeted = higher bonus (penalises random querying).
    """
    action_history = state.get("action_history", [])
    correct_rc = scenario.get("correct_root_cause", {})
    correct_service = correct_rc.get("service", "").lower().strip()
    correct_mode = correct_rc.get("failure_mode", "").lower().strip()
    known_services = {s.lower() for s in scenario.get("known_services", set())}

    # Find the submit_root_cause action
    submitted_service = ""
    submitted_mode = ""
    submit_step = None
    for action in action_history:
        if action.get("action_type") == "submit_root_cause":
            params = action.get("parameters", {})
            submitted_service = params.get("service", "").lower().strip()
            submitted_mode = params.get("failure_mode", "").lower().strip()
            submit_step = action.get("step", len(action_history))
            break

    if not submitted_service:
        return {
            "total": 0.0,
            "breakdown": {"base": 0.0, "efficiency": 0.0, "submitted": False},
            "feedback": "No root cause submitted - score 0.0",
        }

    # Base score
    service_match = submitted_service == correct_service
    mode_keywords = [w for w in correct_mode.split() if len(w) > 3]
    mode_match = (
        (service_match and any(kw in submitted_mode for kw in mode_keywords))
        if mode_keywords
        else service_match
    )

    if mode_match:
        base = 0.6
        base_feedback = f"Correct service ({submitted_service}) + failure mode matched"
    elif service_match:
        base = 0.35
        base_feedback = f"Correct service ({submitted_service}) but failure mode unclear"
    else:
        base = 0.0
        base_feedback = f"Wrong service: submitted '{submitted_service}', correct '{correct_service}'"

    # Efficiency bonus - only awarded if service was correct
    efficiency = 0.0
    if service_match and submit_step is not None:
        diagnostic_actions = {"query_logs", "check_metrics", "check_dependencies",
                              "check_recent_deploys", "check_service_status"}
        queried = {
            a.get("parameters", {}).get("service", "").lower()
            for a in action_history[:submit_step]
            if a.get("action_type") in diagnostic_actions
        }
        relevant_queried = queried & known_services
        # Reward for querying relevant services efficiently:
        # full bonus for querying 2-3 key services; less for spraying all services
        total_queries = sum(
            1 for a in action_history[:submit_step]
            if a.get("action_type") in diagnostic_actions
        )
        if total_queries > 0:
            precision = len(relevant_queried) / max(total_queries, 1)
            efficiency = round(min(0.4, precision * 0.4 + min(len(relevant_queried), 3) * 0.05), 4)

    total = round(min(1.0, base + efficiency), 4)

    return {
        "total": total,
        "breakdown": {
            "base": base,
            "efficiency_bonus": efficiency,
            "service_match": service_match,
            "mode_match": mode_match,
            "submitted_service": submitted_service,
            "correct_service": correct_service,
        },
        "feedback": f"{base_feedback} | efficiency bonus: {efficiency:.2f} | total: {total:.2f}",
    }


# ── Task 3: Remediation Planning ────────────────────────────────────────────

def _grade_remediation_planning(state: dict, scenario: dict) -> dict:
    """
    Scoring:
        Resolution base (0.0 or 0.6):
            0.6 - submit_resolution with a non-empty summary after >= 1 investigation action
        Efficiency bonus (0.0-0.3):
            Fraction of correct remediation actions executed (from correct_remediation_sequence)
        Wrong action penalty (up to -0.15):
            -0.05 per wrong action (capped at -0.15)
        Summary quality bonus (0.0-0.1):
            +0.1 if the summary contains >= 3 resolution keywords from the scenario,
            +0.05 if it contains at least one
    """
    action_history = state.get("action_history", [])
    correct_seq = scenario.get("correct_remediation_sequence", [])
    wrong_actions_map = scenario.get("wrong_actions", {})
    resolution_keywords = scenario.get("resolution_keywords", [])

    diagnostic_actions = {"query_logs", "check_metrics", "check_dependencies",
                          "check_recent_deploys", "check_service_status"}
    remediation_actions = {"restart_service", "rollback_deploy", "scale_service",
                           "disable_feature_flag", "clear_cache", "execute_runbook_step"}

    # Find submit_resolution
    submitted_summary = ""
    for action in action_history:
        if action.get("action_type") == "submit_resolution":
            submitted_summary = action.get("parameters", {}).get("summary", "")
            break

    investigation_count = sum(
        1 for a in action_history
        if a.get("action_type") in diagnostic_actions | remediation_actions
    )

    if not submitted_summary or investigation_count < 1:
        return {
            "total": 0.0,
            "breakdown": {"base": 0.0, "efficiency": 0.0, "penalty": 0.0, "summary": 0.0},
            "feedback": "No resolution submitted or no investigation - score 0.0",
        }

    base = 0.6

    # Efficiency bonus - which correct actions were executed?
    executed_action_keys = set()
    for a in action_history:
        at = a.get("action_type", "")
        svc = a.get("parameters", {}).get("service", "")
        flag = a.get("parameters", {}).get("flag", "")
        step_action = a.get("parameters", {}).get("runbook_action", "")
        target = a.get("parameters", {}).get("target", "")
        # Build key variants that match the correct_remediation_sequence format
        executed_action_keys.add(at)
        if svc:
            executed_action_keys.add(f"{at}:{svc}")
        if flag:
            executed_action_keys.add(f"{at}:{flag}")
        if step_action:
            executed_action_keys.add(f"execute_runbook_step:{step_action}")
        if target:
            executed_action_keys.add(f"execute_runbook_step:{target}")

    matched = sum(1 for key in correct_seq if key in executed_action_keys)
    efficiency = round((matched / len(correct_seq)) * 0.3, 4) if correct_seq else 0.0

    # Wrong action penalty
    wrong_count = 0
    for a in action_history:
        at = a.get("action_type", "")
        svc = a.get("parameters", {}).get("service", "")
        key1 = at
        key2 = f"{at}:{svc}"
        if key1 in wrong_actions_map or key2 in wrong_actions_map:
            wrong_count += 1
    penalty = round(min(0.15, wrong_count * 0.05), 4)

    # Summary quality bonus (keyword matching is case-insensitive)
    summary_lower = submitted_summary.lower()
    keyword_hits = sum(1 for kw in resolution_keywords if kw.lower() in summary_lower)
    summary_bonus = 0.1 if keyword_hits >= 3 else 0.05 if keyword_hits >= 1 else 0.0

    total = round(max(0.0, min(1.0, base + efficiency - penalty + summary_bonus)), 4)

    return {
        "total": total,
        "breakdown": {
            "base": base,
            "efficiency_bonus": efficiency,
            "wrong_action_penalty": -penalty,
            "summary_bonus": summary_bonus,
            "correct_actions_matched": matched,
            "correct_actions_total": len(correct_seq),
            "wrong_actions_count": wrong_count,
            "summary_keywords_hit": keyword_hits,
        },
        "feedback": (
            f"base={base} | efficiency={efficiency:.2f} ({matched}/{len(correct_seq)} correct actions) "
            f"| penalty=-{penalty:.2f} | summary_bonus={summary_bonus:.2f} | total={total:.2f}"
        ),
    }
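
For context on what this deletion removes, a minimal, hypothetical usage sketch of the grade() entry point as it existed before this commit. The scenario and state dictionaries below are illustrative only: their field names are inferred from the grader code above, not taken from the IncidentEnvironment definition.

# Hypothetical smoke test for the removed module (not part of the repository).
from scoring import grade  # as the module existed prior to this commit

scenario = {
    "correct_severity": "P2",          # assumed scenario fields, per the grader above
    "adjacent_severities": ["P1", "P3"],
}
state = {
    "action_history": [
        {"action_type": "query_logs", "parameters": {"service": "checkout"}},
        {"action_type": "submit_severity", "parameters": {"severity": "p2"}},
    ]
}

result = grade("alert_classification", state, scenario)
print(result["total"])     # 1.0 - the submission is normalised to "P2" and matches exactly
print(result["feedback"])  # Exact match: P2 == P2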