Imaginephoenix committed on
Commit
6199458
·
verified ·
1 Parent(s): ddce2ff

Upload graders.py

Browse files
Files changed (1) hide show
  1. graders.py +315 -0
graders.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic graders for OpenEnv email triage tasks."""
2
+
3
+ import re
4
+
5
+ from models import RewardResult, TriageAction
6
+
7
# Canonical route category -> phrases an agent may use to name that team.
# Insertion order matters to any code that iterates the map and stops at the
# first matching category.
ROUTE_ALIAS_MAP: dict[str, list[str]] = {
    "billing": ["billing", "finance", "payments", "accounts"],
    "safety": ["safety", "compliance", "risk"],
    "engineering": ["engineering", "eng", "sre", "platform", "on-call"],
    "support": ["support", "helpdesk", "customer support"],
    "general": ["general", "inbox", "operations"],
}

# Margin used to keep reported scores strictly inside the open interval (0, 1).
SCORE_EPSILON: float = 1e-6
16
+
17
+
18
def _strict_binary_score(is_positive_case: bool) -> float:
    """Map a pass/fail outcome onto a score strictly inside (0, 1).

    Args:
        is_positive_case: Whether the graded check succeeded.

    Returns:
        ``1.0 - SCORE_EPSILON`` for a success, ``SCORE_EPSILON`` otherwise.
    """
    if is_positive_case:
        return 1.0 - SCORE_EPSILON
    return SCORE_EPSILON
21
+
22
+
23
def _strict_ratio_score(raw_value: float) -> float:
    """Bound a ratio-like metric to the strict score range.

    Args:
        raw_value: Unbounded ratio or component value.

    Returns:
        The result of passing ``raw_value`` through ``_clip_score``.
    """
    bounded_ratio = _clip_score(raw_value)
    return bounded_ratio
26
+
27
+
28
def _clip_score(score_value: float) -> float:
    """Clip a score to the strict open range (0.0, 1.0).

    Args:
        score_value: Raw score; may be out of range or NaN.

    Returns:
        ``SCORE_EPSILON`` for NaN and for values at or below 0.0,
        ``1.0 - SCORE_EPSILON`` for values at or above 1.0, and the value
        itself otherwise.
    """
    # NaN is the only float unequal to itself. Without this guard
    # ``max(0.0, min(1.0, nan))`` evaluates to 1.0, so an undefined score
    # would be reported as near-perfect instead of as a failure.
    if score_value != score_value:
        return SCORE_EPSILON
    if score_value <= 0.0:
        return SCORE_EPSILON
    if score_value >= 1.0:
        return 1.0 - SCORE_EPSILON
    return score_value
43
+
44
+
45
+ def _normalized_text(text_value: str) -> str:
46
+ """Return normalized lowercase text for deterministic comparisons.
47
+
48
+ Args:
49
+ text_value: Input text.
50
+
51
+ Returns:
52
+ Normalized text.
53
+ """
54
+ return text_value.strip().lower()
55
+
56
+
57
def _route_matches(action_route: str, expected_route: str) -> bool:
    """Tell whether the agent's route text resolves to the expected category.

    Args:
        action_route: Route provided by agent (free-form text).
        expected_route: Route expected by ground truth.

    Returns:
        True when the normalized expected route is one of the canonical
        categories derived from the action route. An empty expected route
        never matches.
    """
    wanted_category = _normalized_text(expected_route)
    if wanted_category:
        return wanted_category in _canonical_route_tokens(action_route)
    return False
72
+
73
+
74
def _canonical_route_tokens(action_route: str) -> set[str]:
    """Map free-form route text to canonical route categories.

    A category is included when any of its aliases in ``ROUTE_ALIAS_MAP``
    occurs as a substring of the normalized route text.

    Args:
        action_route: Route text provided by the agent; may name several
            teams, with or without separators ("safety, billing",
            "safety and billing").

    Returns:
        Set of canonical category names found in the text; empty when the
        text is blank or names no known team.
    """
    normalized_action = _normalized_text(action_route)
    if not normalized_action:
        return set()

    # Scan the whole text for every category. Stopping at the first category
    # matched per separator-delimited fragment made the result depend on map
    # order: "safety and billing" resolved to billing only, so the safety
    # route was never credited. Because matching is substring-based, scanning
    # the full text finds everything a per-fragment scan would.
    return {
        route_name
        for route_name, aliases in ROUTE_ALIAS_MAP.items()
        if any(alias in normalized_action for alias in aliases)
    }
100
+
101
+
102
def _route_noise_penalty(action_route: str) -> float:
    """Compute the deduction for routing one action to too many teams.

    Args:
        action_route: Route text provided by the agent.

    Returns:
        0.0 when at most two canonical categories are named; otherwise 0.08
        per category beyond the second, capped at 0.24.
    """
    extra_routes = len(_canonical_route_tokens(action_route)) - 2
    if extra_routes > 0:
        return min(0.24, 0.08 * extra_routes)
    return 0.0
108
+
109
+
110
def _summary_keyword_score(summary_text: str, ground_truth: dict) -> float:
    """Score a summary by deterministic keyword coverage.

    Args:
        summary_text: Summary text produced by the agent.
        ground_truth: Ground-truth dict; its optional ``summary_keywords``
            list drives the scoring.

    Returns:
        Score strictly inside (0.0, 1.0). Without usable keywords this is a
        binary check that the stripped summary has at least 10 characters;
        otherwise it is the matched-keyword fraction scaled by length and
        list-likeness factors, then clipped.
    """
    expected_keywords = ground_truth.get("summary_keywords", [])

    cleaned_keywords = []
    if isinstance(expected_keywords, list):
        for raw_keyword in expected_keywords:
            cleaned = _normalized_text(str(raw_keyword))
            if cleaned:
                cleaned_keywords.append(cleaned)

    if not cleaned_keywords:
        # Nothing to match against: only require a non-trivial summary.
        return _strict_binary_score(len(summary_text.strip()) >= 10)

    summary_lower = _normalized_text(summary_text)
    hit_count = sum(1 for keyword in cleaned_keywords if keyword in summary_lower)
    coverage = hit_count / len(cleaned_keywords)

    # Very short summaries and summaries past 40 words are scaled down; the
    # long-summary factor drops 0.02 per extra word and bottoms out at 0.45.
    token_total = len(re.findall(r"[a-z0-9'-]+", summary_lower))
    if token_total > 40:
        length_multiplier = max(0.45, 1.0 - (token_total - 40) * 0.02)
    elif token_total >= 4:
        length_multiplier = 1.0
    else:
        length_multiplier = 0.6

    # Many commas together with several keyword hits looks like a keyword list.
    looks_like_list = summary_lower.count(",") >= 6 and hit_count >= 3
    stuffing_multiplier = 0.85 if looks_like_list else 1.0
    return _clip_score(coverage * length_multiplier * stuffing_multiplier)
151
+
152
+
153
def grade_easy(action: TriageAction, ground_truth: dict) -> RewardResult:
    """Grade the easy task with deterministic partial credit.

    The score is 0.6 for a correct label plus 0.25 for a correct route plus
    up to 0.15 for the summary, minus the over-routing penalty, then clipped.

    Args:
        action: Agent action for one email.
        ground_truth: Expected ``label`` and ``route_to`` (plus optional
            summary keywords).

    Returns:
        Reward result whose score lies strictly inside (0.0, 1.0).
    """
    wanted_label = _normalized_text(str(ground_truth.get("label", "")))
    wanted_route = _normalized_text(str(ground_truth.get("route_to", "")))

    label_ok = _normalized_text(action.label) == wanted_label
    route_ok = _route_matches(action.route_to, wanted_route)
    summary_quality = _summary_keyword_score(action.summary, ground_truth)
    over_routing = _route_noise_penalty(action.route_to)

    label_points = 0.6 if label_ok else 0.0
    route_points = 0.25 if route_ok else 0.0
    total = label_points + route_points
    total += 0.15 * summary_quality
    total -= over_routing

    return RewardResult(
        score=_clip_score(total),
        breakdown={
            "label_match": _strict_binary_score(label_ok),
            "route_match": _strict_binary_score(route_ok),
            "summary_match": round(summary_quality, 4),
            "route_noise_penalty": round(over_routing, 4),
        },
        feedback="Easy-task grading completed with context summary scoring.",
    )
184
+
185
+
186
def grade_medium_step(action: TriageAction, truth: dict) -> RewardResult:
    """Grade one medium-task email independently of earlier steps.

    The per-email score is 0.55 for a correct label plus 0.3 for a correct
    route plus up to 0.15 for the summary, minus the over-routing penalty.
    It is clipped, multiplied by the priority weight (floored at 0.1, capped
    at 2.0 for the multiplication), and clipped again.

    Args:
        action: Agent action for one email.
        truth: Expected ``label``, ``route_to`` and optional
            ``priority_weight`` for that email.

    Returns:
        Reward result carrying the weighted, clipped step score.
    """
    wanted_label = _normalized_text(str(truth.get("label", "")))
    wanted_route = _normalized_text(str(truth.get("route_to", "")))
    priority_weight = max(float(truth.get("priority_weight", 1.0)), 0.1)

    label_ok = _normalized_text(action.label) == wanted_label
    route_ok = _route_matches(action.route_to, wanted_route)
    summary_quality = _summary_keyword_score(action.summary, truth)
    over_routing = _route_noise_penalty(action.route_to)

    label_points = 0.55 if label_ok else 0.0
    route_points = 0.3 if route_ok else 0.0
    email_score = label_points + route_points
    email_score += 0.15 * summary_quality
    email_score -= over_routing
    email_score = _clip_score(email_score)

    # The breakdown reports the floored weight; only the multiplier is capped.
    effective_weight = min(priority_weight, 2.0)

    return RewardResult(
        score=_clip_score(email_score * effective_weight),
        breakdown={
            "label_match": _strict_binary_score(label_ok),
            "route_match": _strict_binary_score(route_ok),
            "summary_match": round(summary_quality, 4),
            "priority_weight": round(priority_weight, 4),
            "route_noise_penalty": round(over_routing, 4),
        },
        feedback="Medium-task step grading completed.",
    )
215
+
216
+
217
def grade_medium(actions: list[TriageAction], ground_truths: list[dict]) -> RewardResult:
    """Grade medium task using weighted per-email partial scoring.

    Actions and ground truths are paired by position; surplus entries on
    either side are ignored.

    Args:
        actions: Agent actions for the medium task email queue.
        ground_truths: Expected action details for each email.

    Returns:
        Reward result whose score lies strictly inside (0.0, 1.0), with
        aggregate accuracy figures in the breakdown.
    """
    comparable_count = min(len(actions), len(ground_truths))
    if comparable_count == 0:
        return RewardResult(
            score=SCORE_EPSILON,
            breakdown={"emails_scored": SCORE_EPSILON, "weighted_average": SCORE_EPSILON},
            feedback="No actions available for grading.",
        )

    weighted_score_sum = 0.0
    weight_sum = 0.0
    label_hits = 0
    route_hits = 0
    summary_total = 0.0
    noise_penalty_total = 0.0

    # zip() stops at the shorter sequence, i.e. after comparable_count pairs.
    for action, truth in zip(actions, ground_truths):
        step_result = grade_medium_step(action, truth)
        step_breakdown = step_result.breakdown

        priority_weight = float(step_breakdown.get("priority_weight", 1.0))
        # NOTE(review): the step score is already multiplied by the weight
        # and clipped below 1.0, so for weights above 1 a perfect email
        # contributes less than its weight and drags the average down.
        # Confirm whether weights > 1 occur in the task data.
        weighted_score_sum += step_result.score
        weight_sum += min(priority_weight, 2.0)

        # Binary components encode a miss as SCORE_EPSILON rather than 0.0,
        # so a "> 0" test would count every email as a hit. Compare against
        # the midpoint instead.
        if step_breakdown.get("label_match", 0.0) > 0.5:
            label_hits += 1
        if step_breakdown.get("route_match", 0.0) > 0.5:
            route_hits += 1
        summary_total += float(step_breakdown.get("summary_match", 0.0))
        noise_penalty_total += float(step_breakdown.get("route_noise_penalty", 0.0))

    weighted_average = weighted_score_sum / weight_sum if weight_sum > 0.0 else 0.0
    score_value = _clip_score(weighted_average)

    breakdown = {
        # n / (n + 1) keeps the count indicator strictly below 1.0.
        "emails_scored": _strict_ratio_score(float(comparable_count) / (comparable_count + 1.0)),
        "label_accuracy": _strict_ratio_score(label_hits / comparable_count),
        "route_accuracy": _strict_ratio_score(route_hits / comparable_count),
        "summary_accuracy": _strict_ratio_score(summary_total / comparable_count),
        "avg_route_noise_penalty": _strict_ratio_score(noise_penalty_total / comparable_count),
        "weighted_average": score_value,
    }
    feedback = "Weighted medium-task grading completed."
    return RewardResult(score=score_value, breakdown=breakdown, feedback=feedback)
269
+
270
+
271
def grade_hard(action: TriageAction, ground_truth: dict) -> RewardResult:
    """Grade the hard task from weighted policy-sensitive components.

    Components: 0.35 for including the primary route, 0.25 for including the
    secondary (cc) route, 0.25 for the expected label, and up to 0.15 for
    the summary. The over-routing penalty is subtracted, and a further
    penalty is subtracted when the agent labels the email as spam.

    Args:
        action: Agent action for hard task case.
        ground_truth: Expected ``label`` (default "urgent"), ``route_to``
            (default "safety"), ``cc_route`` (default "billing") and
            ``penalize_spam`` (default 0.2).

    Returns:
        Reward result whose score lies strictly inside (0.0, 1.0).
    """
    wanted_label = _normalized_text(str(ground_truth.get("label", "urgent")))
    escalation_route = _normalized_text(str(ground_truth.get("route_to", "safety")))
    cc_route = _normalized_text(str(ground_truth.get("cc_route", "billing")))
    spam_deduction = float(ground_truth.get("penalize_spam", 0.2))

    agent_route = _normalized_text(action.route_to)
    agent_label = _normalized_text(action.label)
    labelled_spam = agent_label == "spam"

    escalation_component = 0.35 if _route_matches(agent_route, escalation_route) else 0.0
    routing_component = 0.25 if _route_matches(agent_route, cc_route) else 0.0
    urgency_component = 0.25 if agent_label == wanted_label else 0.0
    summary_component = 0.15 * _summary_keyword_score(action.summary, ground_truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    total = escalation_component + routing_component + urgency_component + summary_component
    total -= noise_penalty
    if labelled_spam:
        total -= spam_deduction

    # The reported spam penalty is the configured value only when it applied.
    reported_spam_penalty = spam_deduction if labelled_spam else SCORE_EPSILON

    return RewardResult(
        score=_clip_score(total),
        breakdown={
            "escalation_component": _strict_ratio_score(escalation_component),
            "routing_component": _strict_ratio_score(routing_component),
            "urgency_component": _strict_ratio_score(urgency_component),
            "summary_component": round(summary_component, 4),
            "route_noise_penalty": round(noise_penalty, 4),
            "spam_penalty": _strict_ratio_score(reported_spam_penalty),
        },
        feedback="Hard-task weighted policy grading completed.",
    )