Imaginephoenix committed on
Commit
30509d6
·
verified ·
1 Parent(s): d49ad1e

Delete graders.py

Browse files
Files changed (1) hide show
  1. graders.py +0 -315
graders.py DELETED
@@ -1,315 +0,0 @@
1
- """Deterministic graders for OpenEnv email triage tasks."""
2
-
3
- import re
4
-
5
- from models import RewardResult, TriageAction
6
-
7
# Canonical route category -> lowercase aliases. Aliases are looked up as
# substrings of the agent's normalized free-form route text.
ROUTE_ALIAS_MAP = {
    "billing": [
        "billing",
        "finance",
        "payments",
        "accounts",
    ],
    "safety": [
        "safety",
        "compliance",
        "risk",
    ],
    "engineering": [
        "engineering",
        "eng",
        "sre",
        "platform",
        "on-call",
    ],
    "support": [
        "support",
        "helpdesk",
        "customer support",
    ],
    "general": [
        "general",
        "inbox",
        "operations",
    ],
}

# Margin used to keep reported scores strictly inside the open range (0.0, 1.0).
SCORE_EPSILON = 1e-6
16
-
17
-
18
def _strict_binary_score(is_positive_case: bool) -> float:
    """Map a pass/fail outcome onto a score strictly inside (0.0, 1.0).

    Args:
        is_positive_case: Whether the graded check succeeded.

    Returns:
        ``1.0 - SCORE_EPSILON`` on success, ``SCORE_EPSILON`` otherwise.
    """
    if is_positive_case:
        return 1.0 - SCORE_EPSILON
    return SCORE_EPSILON
21
-
22
-
23
def _strict_ratio_score(raw_value: float) -> float:
    """Convert a ratio-like metric into a strictly in-range score.

    Args:
        raw_value: Raw ratio, possibly outside the unit interval.

    Returns:
        The value after being passed through ``_clip_score``.
    """
    strict_value = _clip_score(raw_value)
    return strict_value
26
-
27
-
28
def _clip_score(score_value: float) -> float:
    """Clip a score to the strict range (0.0, 1.0).

    Args:
        score_value: Raw score. May be out of range or NaN.

    Returns:
        ``SCORE_EPSILON`` when the raw score is NaN or at/below 0.0,
        ``1.0 - SCORE_EPSILON`` when it is at/above 1.0, and the raw score
        unchanged otherwise.
    """
    import math

    # NaN compares False against everything, so min(1.0, nan) keeps 1.0 and a
    # corrupted score would otherwise be reported as the best possible one.
    if math.isnan(score_value):
        return SCORE_EPSILON

    clipped = max(0.0, min(1.0, score_value))
    if clipped <= 0.0:
        return SCORE_EPSILON
    if clipped >= 1.0:
        return 1.0 - SCORE_EPSILON
    return clipped
43
-
44
-
45
- def _normalized_text(text_value: str) -> str:
46
- """Return normalized lowercase text for deterministic comparisons.
47
-
48
- Args:
49
- text_value: Input text.
50
-
51
- Returns:
52
- Normalized text.
53
- """
54
- return text_value.strip().lower()
55
-
56
-
57
def _route_matches(action_route: str, expected_route: str) -> bool:
    """Tell whether the agent's route covers the expected route category.

    The agent's free-form route is first mapped to canonical categories via
    ``_canonical_route_tokens``; the normalized expected route must be one of
    those categories.

    Args:
        action_route: Route provided by agent.
        expected_route: Route expected by ground truth.

    Returns:
        True when the normalized expected route is among the canonical
        categories derived from the action route; False when it is not, or
        when the expected route is blank.
    """
    wanted = _normalized_text(expected_route)
    return bool(wanted) and wanted in _canonical_route_tokens(action_route)
72
-
73
-
74
def _canonical_route_tokens(action_route: str) -> set[str]:
    """Map free-form route text to canonical route categories.

    The text is normalized, split on ``,``, ``;``, ``/`` and ``|``, and each
    non-empty fragment is checked against every alias in ``ROUTE_ALIAS_MAP``
    using substring matching. Every category with at least one matching alias
    is reported, so a fragment such as "safety and billing" yields both
    categories even though it contains no separator.

    Args:
        action_route: Route text provided by the agent.

    Returns:
        Set of canonical route names found in the text; empty when the text
        is blank or nothing matches.
    """
    normalized_action = _normalized_text(action_route)
    if not normalized_action:
        return set()

    canonical: set[str] = set()
    for raw_fragment in re.split(r"[,;/|]+", normalized_action):
        fragment = raw_fragment.strip()
        if not fragment:
            continue
        # Do not stop at the first matching category: one fragment may name
        # several teams joined by words like "and" or "&".
        for route_name, aliases in ROUTE_ALIAS_MAP.items():
            if any(alias in fragment for alias in aliases):
                canonical.add(route_name)

    return canonical
100
-
101
-
102
def _route_noise_penalty(action_route: str) -> float:
    """Compute the deduction for routing one action to too many teams.

    Args:
        action_route: Route text provided by the agent.

    Returns:
        0.0 when at most two canonical routes are named; otherwise 0.08 per
        route beyond the second, capped at 0.24.
    """
    extra_routes = len(_canonical_route_tokens(action_route)) - 2
    if extra_routes <= 0:
        return 0.0
    return min(0.24, 0.08 * extra_routes)
108
-
109
-
110
def _summary_keyword_score(summary_text: str, ground_truth: dict) -> float:
    """Rate a summary by how many expected keywords it contains.

    When the ground truth has no usable ``summary_keywords`` list, the score
    only checks that the stripped summary is at least 10 characters long.
    Otherwise the fraction of keywords found (substring match on normalized
    text) is scaled down for very short or very long summaries and for
    comma-heavy, list-like text.

    Args:
        summary_text: Summary text produced by the agent.
        ground_truth: Ground-truth dict that may include summary keywords.

    Returns:
        Score kept strictly inside (0.0, 1.0) by ``_strict_binary_score`` or
        ``_clip_score``.
    """
    configured = ground_truth.get("summary_keywords", [])

    keywords = []
    if isinstance(configured, list):
        for entry in configured:
            cleaned = _normalized_text(str(entry))
            if cleaned:
                keywords.append(cleaned)

    if not keywords:
        # No keywords to compare against: fall back to a minimum-length check.
        return _strict_binary_score(len(summary_text.strip()) >= 10)

    summary = _normalized_text(summary_text)
    hit_count = sum(1 for keyword in keywords if keyword in summary)
    coverage = hit_count / len(keywords)

    # Length factor: penalize near-empty summaries and overly verbose ones.
    token_total = len(re.findall(r"[a-z0-9'-]+", summary))
    if token_total > 40:
        length_factor = max(0.45, 1.0 - (token_total - 40) * 0.02)
    elif token_total >= 4:
        length_factor = 1.0
    else:
        length_factor = 0.6

    # Many commas together with several hits suggests a stuffed keyword list.
    looks_like_list = summary.count(",") >= 6 and hit_count >= 3
    stuffing_factor = 0.85 if looks_like_list else 1.0

    return _clip_score(coverage * length_factor * stuffing_factor)
151
-
152
-
153
def grade_easy(action: TriageAction, ground_truth: dict) -> RewardResult:
    """Score a single easy-task action with partial credit.

    Weights: 0.6 for the label, 0.25 for the route, up to 0.15 for the
    summary, minus the route-noise penalty.

    Args:
        action: Agent action for one email.
        ground_truth: Expected label and route.

    Returns:
        Reward result whose score is clipped strictly inside (0.0, 1.0).
    """
    wanted_label = _normalized_text(str(ground_truth.get("label", "")))
    wanted_route = _normalized_text(str(ground_truth.get("route_to", "")))

    label_ok = _normalized_text(action.label) == wanted_label
    route_ok = _route_matches(action.route_to, wanted_route)
    summary_score = _summary_keyword_score(action.summary, ground_truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    label_points = 0.6 if label_ok else 0.0
    route_points = 0.25 if route_ok else 0.0
    raw_total = label_points + route_points + 0.15 * summary_score - noise_penalty

    return RewardResult(
        score=_clip_score(raw_total),
        breakdown={
            "label_match": _strict_binary_score(label_ok),
            "route_match": _strict_binary_score(route_ok),
            "summary_match": round(summary_score, 4),
            "route_noise_penalty": round(noise_penalty, 4),
        },
        feedback="Easy-task grading completed with context summary scoring.",
    )
184
-
185
-
186
def _medium_email_components(action: TriageAction, truth: dict) -> dict:
    """Compute the unweighted score and its inputs for one medium-task email.

    Args:
        action: Agent action for one email.
        truth: Expected label, route, optional summary keywords and
            ``priority_weight`` for that email.

    Returns:
        Dict with ``label_correct`` and ``route_correct`` (bool),
        ``summary_score``, ``noise_penalty``, ``priority_weight`` (floored at
        0.1) and ``per_email_score`` (clipped, not yet priority-weighted).
    """
    expected_label = _normalized_text(str(truth.get("label", "")))
    expected_route = _normalized_text(str(truth.get("route_to", "")))
    priority_weight = max(float(truth.get("priority_weight", 1.0)), 0.1)

    label_correct = _normalized_text(action.label) == expected_label
    route_correct = _route_matches(action.route_to, expected_route)
    summary_score = _summary_keyword_score(action.summary, truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    per_email_score = (0.55 if label_correct else 0.0) + (0.3 if route_correct else 0.0)
    per_email_score += 0.15 * summary_score
    per_email_score -= noise_penalty

    return {
        "label_correct": label_correct,
        "route_correct": route_correct,
        "summary_score": summary_score,
        "noise_penalty": noise_penalty,
        "priority_weight": priority_weight,
        "per_email_score": _clip_score(per_email_score),
    }


def grade_medium_step(action: TriageAction, truth: dict) -> RewardResult:
    """Grade one medium-task step without cumulative history effects.

    Args:
        action: Agent action for one email.
        truth: Expected action details for that email.

    Returns:
        Reward result whose score is the per-email score multiplied by the
        priority weight (capped at 2.0) and clipped strictly inside
        (0.0, 1.0).
    """
    parts = _medium_email_components(action, truth)
    priority_weight = parts["priority_weight"]

    weighted_step_score = _clip_score(parts["per_email_score"] * min(priority_weight, 2.0))

    return RewardResult(
        score=weighted_step_score,
        breakdown={
            "label_match": _strict_binary_score(parts["label_correct"]),
            "route_match": _strict_binary_score(parts["route_correct"]),
            "summary_match": round(parts["summary_score"], 4),
            "priority_weight": round(priority_weight, 4),
            "route_noise_penalty": round(parts["noise_penalty"], 4),
        },
        feedback="Medium-task step grading completed.",
    )


def grade_medium(actions: list[TriageAction], ground_truths: list[dict]) -> RewardResult:
    """Grade medium task using weighted per-email partial scoring.

    Only the first ``min(len(actions), len(ground_truths))`` pairs are graded.
    The final score is the priority-weighted average of the unweighted
    per-email scores, with each weight capped at 2.0.

    Args:
        actions: Agent actions for the medium task email queue.
        ground_truths: Expected action details for each email.

    Returns:
        Reward result whose score is clipped strictly inside (0.0, 1.0).
    """
    comparable_count = min(len(actions), len(ground_truths))
    if comparable_count == 0:
        return RewardResult(
            score=SCORE_EPSILON,
            breakdown={"emails_scored": SCORE_EPSILON, "weighted_average": SCORE_EPSILON},
            feedback="No actions available for grading.",
        )

    weighted_score_sum = 0.0
    weight_sum = 0.0
    label_hits = 0
    route_hits = 0
    summary_total = 0.0
    noise_penalty_total = 0.0

    # zip() stops at the shorter sequence, i.e. after comparable_count pairs.
    for action, truth in zip(actions, ground_truths):
        parts = _medium_email_components(action, truth)
        weight = min(parts["priority_weight"], 2.0)

        # Weight the unclipped product: clipping each weighted score below 1.0
        # before dividing by the weights would cap a perfect high-priority
        # email at 1/weight.
        weighted_score_sum += parts["per_email_score"] * weight
        weight_sum += weight

        # Count hits from the booleans; the strict binary scores reported in
        # step breakdowns are positive even for a miss.
        label_hits += 1 if parts["label_correct"] else 0
        route_hits += 1 if parts["route_correct"] else 0
        summary_total += parts["summary_score"]
        noise_penalty_total += parts["noise_penalty"]

    weighted_average = weighted_score_sum / weight_sum if weight_sum > 0.0 else 0.0
    score_value = _clip_score(weighted_average)

    breakdown = {
        "emails_scored": _strict_ratio_score(float(comparable_count) / (comparable_count + 1.0)),
        "label_accuracy": _strict_ratio_score(label_hits / comparable_count),
        "route_accuracy": _strict_ratio_score(route_hits / comparable_count),
        "summary_accuracy": _strict_ratio_score(summary_total / comparable_count),
        "avg_route_noise_penalty": _strict_ratio_score(noise_penalty_total / comparable_count),
        "weighted_average": score_value,
    }
    feedback = "Weighted medium-task grading completed."
    return RewardResult(score=score_value, breakdown=breakdown, feedback=feedback)
269
-
270
-
271
def grade_hard(action: TriageAction, ground_truth: dict) -> RewardResult:
    """Score the hard-task action from weighted routing, urgency and summary parts.

    Weights: 0.35 for naming the primary route, 0.25 for the secondary (cc)
    route, 0.25 for the expected label and up to 0.15 for the summary. The
    route-noise penalty is subtracted, and a further spam penalty is
    subtracted when the agent labels the email "spam".

    Args:
        action: Agent action for hard task case.
        ground_truth: Expected routing and urgency intent. Missing keys fall
            back to label "urgent", route "safety", cc route "billing" and a
            spam penalty of 0.2.

    Returns:
        Reward result whose score is clipped strictly inside (0.0, 1.0).
    """
    wanted_label = _normalized_text(str(ground_truth.get("label", "urgent")))
    primary_route = _normalized_text(str(ground_truth.get("route_to", "safety")))
    secondary_route = _normalized_text(str(ground_truth.get("cc_route", "billing")))
    spam_penalty = float(ground_truth.get("penalize_spam", 0.2))

    submitted_route = _normalized_text(action.route_to)
    primary_hit = _route_matches(submitted_route, primary_route)
    secondary_hit = _route_matches(submitted_route, secondary_route)

    submitted_label = _normalized_text(action.label)
    label_hit = submitted_label == wanted_label
    flagged_as_spam = submitted_label == "spam"

    summary_score = _summary_keyword_score(action.summary, ground_truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    escalation_component = 0.35 if primary_hit else 0.0
    routing_component = 0.25 if secondary_hit else 0.0
    urgency_component = 0.25 if label_hit else 0.0
    summary_component = 0.15 * summary_score

    raw_score = escalation_component + routing_component + urgency_component + summary_component
    raw_score -= noise_penalty
    if flagged_as_spam:
        raw_score -= spam_penalty

    # Report the spam penalty only when it was actually applied.
    reported_spam_penalty = spam_penalty if flagged_as_spam else SCORE_EPSILON

    return RewardResult(
        score=_clip_score(raw_score),
        breakdown={
            "escalation_component": _strict_ratio_score(escalation_component),
            "routing_component": _strict_ratio_score(routing_component),
            "urgency_component": _strict_ratio_score(urgency_component),
            "summary_component": round(summary_component, 4),
            "route_noise_penalty": round(noise_penalty, 4),
            "spam_penalty": _strict_ratio_score(reported_spam_penalty),
        },
        feedback="Hard-task weighted policy grading completed.",
    )