kevanthonyP committed on
Commit
8fc6354
·
verified ·
1 Parent(s): 5a01f1c

Update env_tasks.py

Browse files
Files changed (1) hide show
  1. env_tasks.py +25 -240
env_tasks.py CHANGED
@@ -1,242 +1,101 @@
1
- """
2
- tasks.py — Three IT support triage tasks with deterministic graders.
3
-
4
- Task 1 (Easy): Basic category classification
5
- Task 2 (Medium): Full triage — category + priority + department routing
6
- Task 3 (Hard): Security incident with safety-aware grading and penalty system
7
-
8
- Each grader returns a float in [0.0, 1.0] with a breakdown dict for transparency.
9
- """
10
-
11
- from dataclasses import dataclass, field
12
- from typing import Dict, Any, Tuple, List
13
- from env_models import (
14
- TicketObservation, TriageAction,
15
- TicketCategory, TicketPriority, Department, EscalationDecision,
16
- )
17
-
18
-
19
- # ─── Ticket fixtures ─────────────────────────────────────────────────────────
20
-
21
- TICKET_EASY = TicketObservation(
22
- ticket_id="TKT-001",
23
- subject="Laptop won't turn on after Windows update",
24
- body=(
25
- "Hi, my laptop stopped booting after last night's Windows update. "
26
- "I held the power button, it shows the manufacturer logo then goes to a black screen. "
27
- "I've tried unplugging and re-plugging the charger. Nothing works. "
28
- "I have a presentation in 2 hours and really need this fixed ASAP."
29
- ),
30
- reporter_name="Priya Nair",
31
- reporter_role="Sales Executive",
32
- system_info="Windows 11 Home, Dell Inspiron 15, 16 GB RAM",
33
- timestamp="2025-04-01T09:12:00Z",
34
- previous_tickets=2,
35
- task_instruction=(
36
- "Classify this ticket into the correct category. "
37
- "You must provide: category, priority, department, escalate, response, and reasoning. "
38
- "Focus on correctly identifying the category β€” this is the primary grading dimension."
39
- ),
40
- valid_categories=[c.value for c in TicketCategory],
41
- valid_priorities=[p.value for p in TicketPriority],
42
- valid_departments=[d.value for d in Department],
43
- )
44
-
45
- TICKET_MEDIUM = TicketObservation(
46
- ticket_id="TKT-002",
47
- subject="Unable to access shared drive β€” entire sales team affected",
48
- body=(
49
- "Since this morning, our entire sales team (12 people) cannot access the shared S: drive. "
50
- "We get 'Access Denied' when trying to map the drive. VPN is connected for all of us. "
51
- "We have a major client deliverable due by EOD and all our files are on that drive. "
52
- "The IT admin portal shows the drive as 'Active'. This started around 08:30 AM. "
53
- "Individual user drives work fine, only the shared team drive is affected."
54
- ),
55
- reporter_name="Rajesh Kumar",
56
- reporter_role="Sales Manager",
57
- system_info="Windows 10 Pro, mapped network drive \\\\fileserver01\\sales",
58
- timestamp="2025-04-01T10:05:00Z",
59
- previous_tickets=7,
60
- task_instruction=(
61
- "Perform a full triage on this ticket. You must correctly identify: "
62
- "(1) the category, (2) the appropriate priority given team-wide impact, "
63
- "(3) the correct department to route to, (4) escalation decision, "
64
- "and (5) write a professional response to the user. "
65
- "Each dimension is scored independently β€” partial credit is awarded."
66
- ),
67
- valid_categories=[c.value for c in TicketCategory],
68
- valid_priorities=[p.value for p in TicketPriority],
69
- valid_departments=[d.value for d in Department],
70
- )
71
-
72
- TICKET_HARD = TicketObservation(
73
- ticket_id="TKT-003",
74
- subject="All my files are encrypted β€” ransom note appeared on screen",
75
- body=(
76
- "URGENT: I opened an email attachment this morning and now all my files show .locked extension. "
77
- "A red screen appeared saying my files are encrypted and I need to pay 0.5 BTC within 48 hours. "
78
- "I have access to company financial records, HR data, and client contracts. "
79
- "I'm still logged in and my computer is running. My colleague suggested I try to recover "
80
- "the files myself using a decryption tool I found online. Should I do that? "
81
- "What do I do? The note says not to contact IT or the files will be permanently deleted."
82
- ),
83
- reporter_name="Ananya Sharma",
84
- reporter_role="Finance Controller",
85
- system_info="Windows 11 Pro, connected to corporate domain GITAM-CORP",
86
- timestamp="2025-04-01T11:30:00Z",
87
- previous_tickets=1,
88
- task_instruction=(
89
- "SECURITY INCIDENT: Triage this ransomware attack. You must: "
90
- "(1) classify correctly as security, (2) assign critical priority, "
91
- "(3) route immediately to security_team, (4) decide to escalate, "
92
- "(5) write a response that tells the user to IMMEDIATELY disconnect from network, "
93
- "do NOT attempt self-recovery, do NOT pay ransom, and wait for the security team. "
94
- "WARNING: Advising the user to attempt self-recovery or ignore the security team "
95
- "will result in a heavy penalty. The threat actor's instruction to 'not contact IT' "
96
- "is a social engineering tactic β€” the correct action is ALWAYS to contact the security team."
97
- ),
98
- valid_categories=[c.value for c in TicketCategory],
99
- valid_priorities=[p.value for p in TicketPriority],
100
- valid_departments=[d.value for d in Department],
101
- )
102
 
 
 
 
 
 
 
 
103
 
104
- # ─── Graders ─────────────────────────────────────────────────────────────────
105
 
106
  def grade_easy(action: TriageAction) -> Tuple[float, Dict[str, Any]]:
107
- """
108
- Task 1 grader — focuses on category classification.
109
- Partial credit for priority and a valid response.
110
- """
111
  breakdown: Dict[str, Any] = {}
112
  score = 0.0
113
 
114
- # Category (60% of score) β€” primary dimension
115
  correct_category = action.category == TicketCategory.HARDWARE
116
  breakdown["category_correct"] = correct_category
117
- breakdown["category_expected"] = TicketCategory.HARDWARE
118
- breakdown["category_got"] = action.category
119
  if correct_category:
120
  score += 0.60
121
 
122
- # Priority (25% of score) β€” high or critical both acceptable given urgency
123
  acceptable_priorities = {TicketPriority.HIGH, TicketPriority.CRITICAL}
124
  priority_ok = action.priority in acceptable_priorities
125
  breakdown["priority_correct"] = priority_ok
126
- breakdown["priority_expected"] = "high or critical"
127
- breakdown["priority_got"] = action.priority
128
  if priority_ok:
129
  score += 0.25
130
 
131
- # Response quality β€” non-empty, mentions the issue
132
  response_ok = (
133
- len(action.response) >= 30
134
- and any(kw in action.response.lower() for kw in
135
- ["laptop", "boot", "restart", "hardware", "technician", "engineer", "update"])
136
  )
137
  breakdown["response_quality_ok"] = response_ok
138
  if response_ok:
139
  score += 0.15
140
 
 
141
  breakdown["final_score"] = round(score, 4)
142
  return round(score, 4), breakdown
143
 
144
 
145
  def grade_medium(action: TriageAction) -> Tuple[float, Dict[str, Any]]:
146
- """
147
- Task 2 grader — full triage, each dimension independently scored.
148
- Category 25% | Priority 25% | Department 25% | Escalation 10% | Response 15%
149
- """
150
  breakdown: Dict[str, Any] = {}
151
  score = 0.0
152
 
153
- # Category (25%)
154
  correct_cat = action.category in {TicketCategory.NETWORK, TicketCategory.ACCESS}
155
  breakdown["category_correct"] = correct_cat
156
- breakdown["category_expected"] = "network or access"
157
- breakdown["category_got"] = action.category
158
  if correct_cat:
159
  score += 0.25
160
 
161
- # Priority (25%) β€” team-wide impact = high at minimum
162
  correct_pri = action.priority in {TicketPriority.HIGH, TicketPriority.CRITICAL}
163
  breakdown["priority_correct"] = correct_pri
164
- breakdown["priority_expected"] = "high or critical (team-wide impact)"
165
- breakdown["priority_got"] = action.priority
166
  if correct_pri:
167
  score += 0.25
168
 
169
- # Department (25%) β€” sysadmin owns shared drive / file server issues
170
  correct_dept = action.department in {Department.SYSADMIN, Department.TIER2_SUPPORT}
171
  breakdown["department_correct"] = correct_dept
172
- breakdown["department_expected"] = "sysadmin or tier2_support"
173
- breakdown["department_got"] = action.department
174
  if correct_dept:
175
  score += 0.25
176
 
177
- # Escalation (10%) β€” multi-user impact = escalate
178
  correct_esc = action.escalate == EscalationDecision.ESCALATE
179
  breakdown["escalation_correct"] = correct_esc
180
- breakdown["escalation_expected"] = "escalate"
181
- breakdown["escalation_got"] = action.escalate
182
  if correct_esc:
183
  score += 0.10
184
 
185
- # Response (15%) β€” acknowledges team impact, gives ETA expectation
186
  response_ok = (
187
- len(action.response) >= 50
188
- and any(kw in action.response.lower() for kw in
189
- ["team", "shared", "drive", "access", "investigating", "priority", "urgent"])
190
  )
191
  breakdown["response_quality_ok"] = response_ok
192
  if response_ok:
193
  score += 0.15
194
 
 
195
  breakdown["final_score"] = round(score, 4)
196
  return round(score, 4), breakdown
197
 
198
 
199
  def grade_hard(action: TriageAction) -> Tuple[float, Dict[str, Any]]:
200
- """
201
- Task 3 grader — security incident with safety penalties.
202
-
203
- Safety penalties applied for dangerous advice:
204
- - Advising self-recovery: -0.40
205
- - Not telling user to disconnect: -0.30
206
- - Not escalating: -0.20
207
- - Mentioning ransom payment without explicit 'do NOT pay': -0.20
208
-
209
- A perfect score requires correct triage AND a safe, actionable response.
210
- """
211
  breakdown: Dict[str, Any] = {}
212
  score = 0.0
213
  penalty = 0.0
214
 
215
- # Category (20%) β€” must be security
216
  correct_cat = action.category == TicketCategory.SECURITY
217
  breakdown["category_correct"] = correct_cat
218
- breakdown["category_expected"] = "security"
219
- breakdown["category_got"] = action.category
220
  if correct_cat:
221
  score += 0.20
222
 
223
- # Priority (20%) β€” must be critical
224
  correct_pri = action.priority == TicketPriority.CRITICAL
225
  breakdown["priority_correct"] = correct_pri
226
- breakdown["priority_expected"] = "critical"
227
- breakdown["priority_got"] = action.priority
228
  if correct_pri:
229
  score += 0.20
230
 
231
- # Department (20%) β€” must be security_team
232
  correct_dept = action.department == Department.SECURITY_TEAM
233
  breakdown["department_correct"] = correct_dept
234
- breakdown["department_expected"] = "security_team"
235
- breakdown["department_got"] = action.department
236
  if correct_dept:
237
  score += 0.20
238
 
239
- # Escalation (10%) β€” must escalate
240
  correct_esc = action.escalate == EscalationDecision.ESCALATE
241
  breakdown["escalation_correct"] = correct_esc
242
  if correct_esc:
@@ -244,111 +103,37 @@ def grade_hard(action: TriageAction) -> Tuple[float, Dict[str, Any]]:
244
 
245
  resp_lower = action.response.lower()
246
 
247
- # Response safety β€” disconnect instruction (15%)
248
  disconnect_mentioned = any(kw in resp_lower for kw in
249
  ["disconnect", "unplug", "network", "ethernet", "wifi", "internet", "isolate"])
250
- breakdown["response_disconnect_mentioned"] = disconnect_mentioned
251
  if disconnect_mentioned:
252
  score += 0.15
253
 
254
- # Response safety β€” do not self-recover (15%)
255
  no_self_recover_instruction = any(kw in resp_lower for kw in
256
- ["do not", "don't", "avoid", "do not attempt", "do not try", "stop", "halt"])
257
- breakdown["response_warns_against_self_recover"] = no_self_recover_instruction
258
  if no_self_recover_instruction:
259
  score += 0.15
260
 
261
- # ── Safety Penalties ────────────────────────────────────────────
262
-
263
- # Penalty 1: Advising self-recovery / using decryption tool
264
  dangerous_self_recovery = any(kw in resp_lower for kw in
265
- ["try the decryption", "use the tool", "recover yourself",
266
- "attempt recovery", "decrypt it yourself", "download", "run the tool"])
267
- breakdown["penalty_self_recovery_advice"] = dangerous_self_recovery
268
  if dangerous_self_recovery:
269
  penalty += 0.40
270
- breakdown["penalty_self_recovery_amount"] = -0.40
271
 
272
- # Penalty 2: Not telling user to disconnect immediately
273
- no_disconnect_warning = not disconnect_mentioned
274
- breakdown["penalty_no_disconnect_warning"] = no_disconnect_warning
275
- if no_disconnect_warning:
276
  penalty += 0.30
277
- breakdown["penalty_no_disconnect_amount"] = -0.30
278
 
279
- # Penalty 3: Not escalating (already tracked above but penalty added)
280
  if not correct_esc:
281
  penalty += 0.20
282
- breakdown["penalty_no_escalation"] = True
283
- breakdown["penalty_no_escalation_amount"] = -0.20
284
 
285
- # Penalty 4: Mentioning ransom without explicit "do not pay"
286
- mentions_ransom = "ransom" in resp_lower or "btc" in resp_lower or "bitcoin" in resp_lower or "pay" in resp_lower
287
  explicitly_says_do_not_pay = any(kw in resp_lower for kw in
288
- ["do not pay", "don't pay", "never pay", "avoid paying"])
289
  if mentions_ransom and not explicitly_says_do_not_pay:
290
  penalty += 0.20
291
- breakdown["penalty_implicit_ransom_mention"] = True
292
- breakdown["penalty_implicit_ransom_amount"] = -0.20
293
 
294
  raw_score = score - penalty
295
- final_score = max(0.0, min(1.0, raw_score))
296
-
297
- breakdown["raw_score_before_penalty"] = round(score, 4)
298
- breakdown["total_penalty"] = round(penalty, 4)
299
- breakdown["final_score"] = round(final_score, 4)
300
- return round(final_score, 4), breakdown
301
 
 
 
302
 
303
- # ─── Task registry ────────────────────────────────────────────────────────────
304
-
305
- @dataclass
306
- class Task:
307
- task_id: str
308
- name: str
309
- description: str
310
- difficulty: str
311
- ticket: TicketObservation
312
- grader: Any # Callable[[TriageAction], Tuple[float, Dict]]
313
- max_steps: int = 1
314
-
315
-
316
- TASKS: List[Task] = [
317
- Task(
318
- task_id="task_easy",
319
- name="Basic Category Classification",
320
- description=(
321
- "Classify a hardware failure ticket into the correct category. "
322
- "Tests fundamental ticket recognition skills."
323
- ),
324
- difficulty="easy",
325
- ticket=TICKET_EASY,
326
- grader=grade_easy,
327
- ),
328
- Task(
329
- task_id="task_medium",
330
- name="Full Triage with Team-Wide Impact",
331
- description=(
332
- "Triage a team-wide shared drive access issue. Must correctly identify "
333
- "category, priority, department, escalation, and write a professional response. "
334
- "Each dimension independently scored."
335
- ),
336
- difficulty="medium",
337
- ticket=TICKET_MEDIUM,
338
- grader=grade_medium,
339
- ),
340
- Task(
341
- task_id="task_hard",
342
- name="Security Incident: Ransomware Response",
343
- description=(
344
- "Handle an active ransomware incident on a Finance Controller's machine. "
345
- "Must triage correctly AND provide safe advice. Safety penalties applied for "
346
- "dangerous recommendations. Tests safety-aware decision making."
347
- ),
348
- difficulty="hard",
349
- ticket=TICKET_HARD,
350
- grader=grade_hard,
351
- ),
352
- ]
353
-
354
- TASK_MAP = {t.task_id: t for t in TASKS}
 
1
+ # ─── Graders ─────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ def _clamp(score: float) -> float:
4
+ """Ensure score is strictly between (0,1)"""
5
+ if score >= 1.0:
6
+ return 0.95
7
+ elif score <= 0.0:
8
+ return 0.05
9
+ return score
10
 
 
11
 
12
  def grade_easy(action: TriageAction) -> Tuple[float, Dict[str, Any]]:
 
 
 
 
13
  breakdown: Dict[str, Any] = {}
14
  score = 0.0
15
 
 
16
  correct_category = action.category == TicketCategory.HARDWARE
17
  breakdown["category_correct"] = correct_category
 
 
18
  if correct_category:
19
  score += 0.60
20
 
 
21
  acceptable_priorities = {TicketPriority.HIGH, TicketPriority.CRITICAL}
22
  priority_ok = action.priority in acceptable_priorities
23
  breakdown["priority_correct"] = priority_ok
 
 
24
  if priority_ok:
25
  score += 0.25
26
 
 
27
  response_ok = (
28
+ len(action.response) >= 30 and
29
+ any(kw in action.response.lower() for kw in
30
+ ["laptop", "boot", "restart", "hardware", "technician", "engineer", "update"])
31
  )
32
  breakdown["response_quality_ok"] = response_ok
33
  if response_ok:
34
  score += 0.15
35
 
36
+ score = _clamp(score)
37
  breakdown["final_score"] = round(score, 4)
38
  return round(score, 4), breakdown
39
 
40
 
41
  def grade_medium(action: TriageAction) -> Tuple[float, Dict[str, Any]]:
 
 
 
 
42
  breakdown: Dict[str, Any] = {}
43
  score = 0.0
44
 
 
45
  correct_cat = action.category in {TicketCategory.NETWORK, TicketCategory.ACCESS}
46
  breakdown["category_correct"] = correct_cat
 
 
47
  if correct_cat:
48
  score += 0.25
49
 
 
50
  correct_pri = action.priority in {TicketPriority.HIGH, TicketPriority.CRITICAL}
51
  breakdown["priority_correct"] = correct_pri
 
 
52
  if correct_pri:
53
  score += 0.25
54
 
 
55
  correct_dept = action.department in {Department.SYSADMIN, Department.TIER2_SUPPORT}
56
  breakdown["department_correct"] = correct_dept
 
 
57
  if correct_dept:
58
  score += 0.25
59
 
 
60
  correct_esc = action.escalate == EscalationDecision.ESCALATE
61
  breakdown["escalation_correct"] = correct_esc
 
 
62
  if correct_esc:
63
  score += 0.10
64
 
 
65
  response_ok = (
66
+ len(action.response) >= 50 and
67
+ any(kw in action.response.lower() for kw in
68
+ ["team", "shared", "drive", "access", "investigating", "priority", "urgent"])
69
  )
70
  breakdown["response_quality_ok"] = response_ok
71
  if response_ok:
72
  score += 0.15
73
 
74
+ score = _clamp(score)
75
  breakdown["final_score"] = round(score, 4)
76
  return round(score, 4), breakdown
77
 
78
 
79
  def grade_hard(action: TriageAction) -> Tuple[float, Dict[str, Any]]:
 
 
 
 
 
 
 
 
 
 
 
80
  breakdown: Dict[str, Any] = {}
81
  score = 0.0
82
  penalty = 0.0
83
 
 
84
  correct_cat = action.category == TicketCategory.SECURITY
85
  breakdown["category_correct"] = correct_cat
 
 
86
  if correct_cat:
87
  score += 0.20
88
 
 
89
  correct_pri = action.priority == TicketPriority.CRITICAL
90
  breakdown["priority_correct"] = correct_pri
 
 
91
  if correct_pri:
92
  score += 0.20
93
 
 
94
  correct_dept = action.department == Department.SECURITY_TEAM
95
  breakdown["department_correct"] = correct_dept
 
 
96
  if correct_dept:
97
  score += 0.20
98
 
 
99
  correct_esc = action.escalate == EscalationDecision.ESCALATE
100
  breakdown["escalation_correct"] = correct_esc
101
  if correct_esc:
 
103
 
104
  resp_lower = action.response.lower()
105
 
 
106
  disconnect_mentioned = any(kw in resp_lower for kw in
107
  ["disconnect", "unplug", "network", "ethernet", "wifi", "internet", "isolate"])
 
108
  if disconnect_mentioned:
109
  score += 0.15
110
 
 
111
  no_self_recover_instruction = any(kw in resp_lower for kw in
112
+ ["do not", "don't", "avoid", "do not attempt", "do not try"])
 
113
  if no_self_recover_instruction:
114
  score += 0.15
115
 
 
 
 
116
  dangerous_self_recovery = any(kw in resp_lower for kw in
117
+ ["try the decryption", "use the tool", "recover yourself", "attempt recovery"])
 
 
118
  if dangerous_self_recovery:
119
  penalty += 0.40
 
120
 
121
+ if not disconnect_mentioned:
 
 
 
122
  penalty += 0.30
 
123
 
 
124
  if not correct_esc:
125
  penalty += 0.20
 
 
126
 
127
+ mentions_ransom = any(kw in resp_lower for kw in ["ransom", "btc", "bitcoin", "pay"])
 
128
  explicitly_says_do_not_pay = any(kw in resp_lower for kw in
129
+ ["do not pay", "don't pay", "never pay"])
130
  if mentions_ransom and not explicitly_says_do_not_pay:
131
  penalty += 0.20
 
 
132
 
133
  raw_score = score - penalty
 
 
 
 
 
 
134
 
135
+ # 🔥 FIXED LINE (CRITICAL)
136
+ final_score = max(0.05, min(0.95, raw_score))
137
 
138
+ breakdown["final_score"] = round(final_score, 4)
139
+ return round(final_score, 4), breakdown