Imaginephoenix committed on
Commit
ddce2ff
·
verified ·
1 Parent(s): e1f904b

Delete graders.py

Browse files
Files changed (1) hide show
  1. graders.py +0 -307
graders.py DELETED
@@ -1,307 +0,0 @@
1
- """Deterministic graders for OpenEnv email triage tasks."""
2
-
3
- import re
4
-
5
- from models import RewardResult, TriageAction
6
-
7
# Canonical route category -> alias strings that identify that category when
# they occur (as substrings) in the free-form route text supplied by an agent.
ROUTE_ALIAS_MAP = {
    "billing": [
        "billing",
        "finance",
        "payments",
        "accounts",
    ],
    "safety": [
        "safety",
        "compliance",
        "risk",
    ],
    "engineering": [
        "engineering",
        "eng",
        "sre",
        "platform",
        "on-call",
    ],
    "support": [
        "support",
        "helpdesk",
        "customer support",
    ],
    "general": [
        "general",
        "inbox",
        "operations",
    ],
}

# Small offset used to keep reported scores strictly inside (0.0, 1.0).
SCORE_EPSILON = 1e-6
16
-
17
def _strict_binary_score(is_positive_case: bool) -> float:
    """Map a pass/fail outcome to a score strictly inside (0.0, 1.0).

    Args:
        is_positive_case: Whether the outcome counts as a success.

    Returns:
        ``1.0 - SCORE_EPSILON`` for a success, ``SCORE_EPSILON`` otherwise.
    """
    if is_positive_case:
        return 1.0 - SCORE_EPSILON
    return SCORE_EPSILON
20
-
21
-
22
def _clip_score(score_value: float) -> float:
    """Clamp a raw score into ``[SCORE_EPSILON, 1.0 - SCORE_EPSILON]``.

    The result is always strictly inside (0.0, 1.0). Clamping is monotonic:
    a larger raw score never produces a smaller clipped score. NaN is treated
    as the worst possible score instead of slipping through as a near-perfect
    one.

    Args:
        score_value: Raw score; may be out of range or NaN.

    Returns:
        Clipped score.
    """
    lower_bound = SCORE_EPSILON
    upper_bound = 1.0 - SCORE_EPSILON

    # Written as ``not (x > lower)`` on purpose: every comparison with NaN is
    # False, so this branch also catches NaN and maps it to the minimum.
    if not score_value > lower_bound:
        return lower_bound
    if score_value > upper_bound:
        return upper_bound
    return score_value
37
-
38
-
39
- def _normalized_text(text_value: str) -> str:
40
- """Return normalized lowercase text for deterministic comparisons.
41
-
42
- Args:
43
- text_value: Input text.
44
-
45
- Returns:
46
- Normalized text.
47
- """
48
- return text_value.strip().lower()
49
-
50
-
51
def _route_matches(action_route: str, expected_route: str) -> bool:
    """Tell whether the agent's route text covers the expected route.

    The expected route is normalized and then looked up among the canonical
    categories that ``_canonical_route_tokens`` derives from ``action_route``.

    Args:
        action_route: Route provided by agent.
        expected_route: Route expected by ground truth.

    Returns:
        True when the normalized expected route is one of the canonical
        categories found in the action route; False when it is not, or when
        the expected route is empty after normalization.
    """
    wanted = _normalized_text(expected_route)
    if wanted:
        return wanted in _canonical_route_tokens(action_route)
    return False
66
-
67
-
68
def _canonical_route_tokens(action_route: str) -> set[str]:
    """Map free-form route text to the canonical route categories it names.

    A category is included when any of its aliases in ``ROUTE_ALIAS_MAP``
    occurs as a substring of the normalized route text. Every matching
    category is reported, so a phrase without separators such as
    "safety and billing" yields both ``safety`` and ``billing`` rather than
    only whichever category happens to come first in the alias map.

    Args:
        action_route: Route text provided by the agent.

    Returns:
        Set of canonical category names; empty when the text is blank or no
        alias matches.
    """
    normalized_action = _normalized_text(action_route)
    if not normalized_action:
        return set()

    # Scanning the whole string is equivalent to scanning each
    # separator-delimited fragment and unioning the results, because no alias
    # contains a separator character.
    # NOTE(review): matching is by substring, so short aliases such as "eng"
    # also match inside longer words (e.g. "challenge") — confirm that this
    # leniency is intended.
    return {
        route_name
        for route_name, aliases in ROUTE_ALIAS_MAP.items()
        if any(alias in normalized_action for alias in aliases)
    }
94
-
95
-
96
def _route_noise_penalty(action_route: str) -> float:
    """Compute the deduction for routing one action to too many teams.

    Up to two canonical routes are free; each additional route costs 0.08,
    capped at 0.24 in total.

    Args:
        action_route: Route text provided by the agent.

    Returns:
        Penalty to subtract from the raw score (0.0 when not over-routed).
    """
    extra_routes = len(_canonical_route_tokens(action_route)) - 2
    if extra_routes <= 0:
        return 0.0
    return min(0.24, 0.08 * extra_routes)
102
-
103
-
104
def _summary_keyword_score(summary_text: str, ground_truth: dict) -> float:
    """Score a summary by deterministic keyword overlap with the ground truth.

    When ``ground_truth["summary_keywords"]`` is missing, is not a list, or
    contains no non-blank entries, the score falls back to a binary check that
    the stripped summary has at least 10 characters.

    Args:
        summary_text: Summary text produced by the agent.
        ground_truth: Ground-truth dict that may include summary keywords.

    Returns:
        Fraction of keywords found in the summary, scaled by a length factor
        and a list-likeness factor, then passed through ``_clip_score``.
    """
    raw_keywords = ground_truth.get("summary_keywords", [])

    keywords = []
    if isinstance(raw_keywords, list):
        for raw_keyword in raw_keywords:
            cleaned = _normalized_text(str(raw_keyword))
            if cleaned:
                keywords.append(cleaned)

    if not keywords:
        # No usable keywords: only require a minimally long summary.
        return _strict_binary_score(len(summary_text.strip()) >= 10)

    normalized_summary = _normalized_text(summary_text)
    matches = sum(1 for keyword in keywords if keyword in normalized_summary)
    base_score = matches / len(keywords)

    # Very short summaries and summaries longer than 40 words are scaled
    # down; the long-summary factor never drops below 0.45.
    word_count = len(re.findall(r"[a-z0-9'-]+", normalized_summary))
    if word_count > 40:
        brevity_factor = max(0.45, 1.0 - (word_count - 40) * 0.02)
    elif word_count >= 4:
        brevity_factor = 1.0
    else:
        brevity_factor = 0.6

    # A comma-heavy summary that still hits several keywords looks like
    # keyword stuffing and is scaled down slightly.
    looks_like_list = normalized_summary.count(",") >= 6 and matches >= 3
    list_like_penalty = 0.85 if looks_like_list else 1.0

    return _clip_score(base_score * brevity_factor * list_like_penalty)
145
-
146
-
147
def grade_easy(action: TriageAction, ground_truth: dict) -> RewardResult:
    """Grade the easy task with deterministic partial credit.

    Weights: 0.6 for the label, 0.25 for the route, and 0.15 times the
    summary score, minus the route-noise penalty. The total is passed through
    ``_clip_score``.

    Args:
        action: Agent action for one email.
        ground_truth: Expected label and route (``label``, ``route_to``) and
            optionally ``summary_keywords``.

    Returns:
        Reward result carrying the clipped score and a per-component
        breakdown.
    """
    expected_label = _normalized_text(str(ground_truth.get("label", "")))
    expected_route = _normalized_text(str(ground_truth.get("route_to", "")))

    label_correct = _normalized_text(action.label) == expected_label
    route_correct = _route_matches(action.route_to, expected_route)
    summary_score = _summary_keyword_score(action.summary, ground_truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    label_points = 0.6 if label_correct else 0.0
    route_points = 0.25 if route_correct else 0.0
    score_value = label_points + route_points
    score_value += 0.15 * summary_score
    score_value -= noise_penalty

    return RewardResult(
        score=_clip_score(score_value),
        breakdown={
            "label_match": 1.0 if label_correct else 0.0,
            "route_match": 1.0 if route_correct else 0.0,
            "summary_match": round(summary_score, 4),
            "route_noise_penalty": round(noise_penalty, 4),
        },
        feedback="Easy-task grading completed with context summary scoring.",
    )
178
-
179
-
180
def _score_medium_email(
    action: TriageAction, truth: dict
) -> tuple[float, float, dict]:
    """Score one medium-task email before any priority weighting.

    Args:
        action: Agent action for one email.
        truth: Expected details for that email (``label``, ``route_to``,
            optional ``priority_weight`` and ``summary_keywords``).

    Returns:
        ``(per_email_score, priority_weight, breakdown)``:
        ``per_email_score`` is the clipped, unweighted score.
        ``priority_weight`` is the ground-truth weight floored at 0.1 (not
        yet capped). ``breakdown`` holds the per-component values reported to
        callers.
    """
    expected_label = _normalized_text(str(truth.get("label", "")))
    expected_route = _normalized_text(str(truth.get("route_to", "")))

    raw_weight = float(truth.get("priority_weight", 1.0))
    # Same as max(raw_weight, 0.1) for ordinary numbers, but a NaN weight
    # (for which every comparison is False) also falls back to the floor
    # instead of propagating into the score.
    priority_weight = raw_weight if raw_weight >= 0.1 else 0.1

    label_correct = _normalized_text(action.label) == expected_label
    route_correct = _route_matches(action.route_to, expected_route)
    summary_score = _summary_keyword_score(action.summary, truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    per_email_score = (0.55 if label_correct else 0.0) + (0.3 if route_correct else 0.0)
    per_email_score += 0.15 * summary_score
    per_email_score -= noise_penalty
    per_email_score = _clip_score(per_email_score)

    breakdown = {
        "label_match": 1.0 if label_correct else 0.0,
        "route_match": 1.0 if route_correct else 0.0,
        "summary_match": round(summary_score, 4),
        "priority_weight": round(priority_weight, 4),
        "route_noise_penalty": round(noise_penalty, 4),
    }
    return per_email_score, priority_weight, breakdown


def grade_medium_step(action: TriageAction, truth: dict) -> RewardResult:
    """Grade one medium-task step without cumulative history effects.

    The unweighted per-email score is multiplied by the priority weight
    (capped at 2.0) and the product is clipped again, so the step score stays
    strictly inside (0.0, 1.0).

    Args:
        action: Agent action for one email.
        truth: Expected details for that email.

    Returns:
        Reward result with the weighted, clipped step score and the
        per-component breakdown.
    """
    per_email_score, priority_weight, breakdown = _score_medium_email(action, truth)
    weighted_step_score = _clip_score(per_email_score * min(priority_weight, 2.0))

    return RewardResult(
        score=weighted_step_score,
        breakdown=breakdown,
        feedback="Medium-task step grading completed.",
    )


def grade_medium(actions: list[TriageAction], ground_truths: list[dict]) -> RewardResult:
    """Grade the medium task as a priority-weighted average of email scores.

    Only the first ``min(len(actions), len(ground_truths))`` pairs are graded.
    The aggregate is ``sum(score_i * w_i) / sum(w_i)`` over the unweighted
    per-email scores, with each weight capped at 2.0. Averaging the unweighted
    scores (rather than the already-clipped step scores) ensures that a
    perfect answer on a high-priority email counts fully instead of being
    capped below 1 and then divided by a weight of up to 2.

    Args:
        actions: Agent actions for the medium task email queue.
        ground_truths: Expected action details for each email.

    Returns:
        Reward result with the clipped weighted average and accuracy
        statistics.
    """
    comparable_count = min(len(actions), len(ground_truths))
    if comparable_count == 0:
        return RewardResult(
            score=SCORE_EPSILON,
            breakdown={"emails_scored": 0.0, "weighted_average": SCORE_EPSILON},
            feedback="No actions available for grading.",
        )

    weighted_score_sum = 0.0
    weight_sum = 0.0
    label_hits = 0
    route_hits = 0
    summary_total = 0.0
    noise_penalty_total = 0.0

    # zip() stops at the shorter sequence, matching comparable_count.
    for action, truth in zip(actions, ground_truths):
        per_email_score, priority_weight, step_breakdown = _score_medium_email(action, truth)

        weight = min(priority_weight, 2.0)
        weighted_score_sum += per_email_score * weight
        weight_sum += weight

        label_hits += 1 if step_breakdown["label_match"] > 0 else 0
        route_hits += 1 if step_breakdown["route_match"] > 0 else 0
        summary_total += float(step_breakdown["summary_match"])
        noise_penalty_total += float(step_breakdown["route_noise_penalty"])

    weighted_average = weighted_score_sum / weight_sum if weight_sum > 0.0 else 0.0
    score_value = _clip_score(weighted_average)

    breakdown = {
        "emails_scored": float(comparable_count),
        "label_accuracy": label_hits / comparable_count,
        "route_accuracy": route_hits / comparable_count,
        "summary_accuracy": summary_total / comparable_count,
        "avg_route_noise_penalty": noise_penalty_total / comparable_count,
        "weighted_average": score_value,
    }
    feedback = "Weighted medium-task grading completed."
    return RewardResult(score=score_value, breakdown=breakdown, feedback=feedback)
263
-
264
-
265
def grade_hard(action: TriageAction, ground_truth: dict) -> RewardResult:
    """Grade the hard task from weighted, policy-sensitive components.

    Components: 0.35 for including the primary route, 0.25 for including the
    secondary (cc) route, 0.25 for the expected label, and 0.15 times the
    summary score. The route-noise penalty is subtracted, and a further
    penalty applies when the action is labelled "spam". Missing ground-truth
    fields default to label "urgent", route "safety", cc route "billing", and
    a spam penalty of 0.2.

    Args:
        action: Agent action for hard task case.
        ground_truth: Expected routing and urgency intent.

    Returns:
        Reward result with the clipped score and a per-component breakdown.
    """
    expected_label = _normalized_text(str(ground_truth.get("label", "urgent")))
    primary_route = _normalized_text(str(ground_truth.get("route_to", "safety")))
    secondary_route = _normalized_text(str(ground_truth.get("cc_route", "billing")))
    spam_penalty = float(ground_truth.get("penalize_spam", 0.2))

    route_text = _normalized_text(action.route_to)
    has_primary_route = _route_matches(route_text, primary_route)
    has_secondary_route = _route_matches(route_text, secondary_route)

    action_label = _normalized_text(action.label)
    urgent_label = action_label == expected_label
    labelled_spam = action_label == "spam"

    summary_score = _summary_keyword_score(action.summary, ground_truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    escalation_component = 0.35 if has_primary_route else 0.0
    routing_component = 0.25 if has_secondary_route else 0.0
    urgency_component = 0.25 if urgent_label else 0.0
    summary_component = 0.15 * summary_score

    raw_score = escalation_component + routing_component + urgency_component + summary_component
    raw_score -= noise_penalty
    if labelled_spam:
        raw_score -= spam_penalty

    return RewardResult(
        score=_clip_score(raw_score),
        breakdown={
            "escalation_component": escalation_component,
            "routing_component": routing_component,
            "urgency_component": urgency_component,
            "summary_component": round(summary_component, 4),
            "route_noise_penalty": round(noise_penalty, 4),
            "spam_penalty": spam_penalty if labelled_spam else 0.0,
        },
        feedback="Hard-task weighted policy grading completed.",
    )