avanigupta Claude Opus 4.6 (1M context) committed on
Commit
5e1f8bb
·
1 Parent(s): 1bd072d

fix grading: reward valid fixes, not just exact matches

Browse files

Grade fixes by issue type:
- missing_value: any non-empty value = 0.8
- wrong_type: correct type = 0.8
- out_of_range: within 50% of correct = 0.8, right direction = 0.4
- format_violation: correct format pattern (YYYY-MM-DD) = 0.8, other non-empty attempt = 0.4
- inconsistent_value/statistical_outlier: within 1% = 1.0, within 20% = 0.8, within 50% = 0.4
- exact match still = 1.0

This makes fix grading more logical: the agent gets credit for reasonable
fixes even without knowing the exact original value.

124 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. dataqa_env/server/environment.py +124 -16
dataqa_env/server/environment.py CHANGED
@@ -194,9 +194,15 @@ def grade_fixes(
194
 
195
  difficulty = matching_issue.difficulty if matching_issue else 1.0
196
 
197
- # Score the fix
 
 
 
 
 
198
  score = 0.0
199
  reason = "wrong value"
 
200
 
201
  # Exact match (case-insensitive, whitespace-stripped)
202
  if proposed.strip().lower() == clean_value.lower():
@@ -204,27 +210,129 @@ def grade_fixes(
204
  reason = "exact match"
205
  fixes_correct += 1
206
  else:
207
- # Try numeric close match
208
- try:
209
- proposed_num = float(proposed.strip())
210
- clean_num = float(clean_value)
211
- if clean_num != 0 and abs(proposed_num - clean_num) / abs(clean_num) <= 0.01:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  score = 0.8
213
- reason = "numeric close match"
 
 
 
 
214
  fixes_partial += 1
215
- elif proposed_num == clean_num:
216
- score = 1.0
217
- reason = "exact numeric match"
218
- fixes_correct += 1
219
  else:
220
  score = 0.1
221
  reason = "correct cell, wrong value"
222
  fixes_partial += 1
223
- except (ValueError, ZeroDivisionError):
224
- # Not numeric just a wrong value but at least right cell
225
- score = 0.1
226
- reason = "correct cell, wrong value"
227
- fixes_partial += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  # Keep best fix per cell
230
  if cell_key not in fixed_issues or score > fixed_issues[cell_key]:
 
194
 
195
  difficulty = matching_issue.difficulty if matching_issue else 1.0
196
 
197
+ # Score the fix using tiered grading:
198
+ # 1.0 = exact match with clean value
199
+ # 0.8 = valid fix (right type, in range, addresses the issue) but not exact
200
+ # 0.4 = partially valid (reasonable attempt, right direction)
201
+ # 0.1 = targets correct cell but fix doesn't address the issue
202
+ # 0.0 = makes things worse or targets non-issue cell
203
  score = 0.0
204
  reason = "wrong value"
205
+ issue_type = matching_issue.issue_type if matching_issue else ""
206
 
207
  # Exact match (case-insensitive, whitespace-stripped)
208
  if proposed.strip().lower() == clean_value.lower():
 
210
  reason = "exact match"
211
  fixes_correct += 1
212
  else:
213
+ # Grade by issue type — check if the fix is VALID even if not exact
214
+ proposed_stripped = proposed.strip()
215
+
216
+ if issue_type == "missing_value":
217
+ # Any non-empty value is a reasonable fix for a missing value
218
+ if proposed_stripped and proposed_stripped != " ":
219
+ score = 0.8
220
+ reason = "valid fix (non-empty value for missing field)"
221
+ fixes_partial += 1
222
+ else:
223
+ score = 0.0
224
+ reason = "fix is still empty"
225
+ fixes_wrong += 1
226
+
227
+ elif issue_type == "wrong_type":
228
+ # Check if the proposed value is the correct type
229
+ try:
230
+ float(proposed_stripped)
231
+ # Original was text, proposed is numeric — correct type fix
232
+ score = 0.8
233
+ reason = "valid fix (correct type)"
234
+ fixes_partial += 1
235
+ except ValueError:
236
+ score = 0.1
237
+ reason = "fix is still wrong type"
238
+ fixes_partial += 1
239
+
240
+ elif issue_type == "out_of_range":
241
+ # Check if proposed value is within a reasonable range
242
+ try:
243
+ proposed_num = float(proposed_stripped)
244
+ clean_num = float(clean_value)
245
+ # Within 50% of clean value = good estimate
246
+ if clean_num != 0 and abs(proposed_num - clean_num) / abs(clean_num) <= 0.5:
247
+ score = 0.8
248
+ reason = "valid fix (in reasonable range)"
249
+ fixes_partial += 1
250
+ elif proposed_num > 0 and (clean_num > 0) == (proposed_num > 0):
251
+ # At least right sign/direction
252
+ score = 0.4
253
+ reason = "partially valid (right direction)"
254
+ fixes_partial += 1
255
+ else:
256
+ score = 0.1
257
+ reason = "fix still out of reasonable range"
258
+ fixes_partial += 1
259
+ except ValueError:
260
+ score = 0.1
261
+ reason = "correct cell, wrong value"
262
+ fixes_partial += 1
263
+
264
+ elif issue_type == "format_violation":
265
+ # Check if proposed value matches expected format
266
+ # For dates: YYYY-MM-DD pattern
267
+ if re.match(r"\d{4}-\d{2}-\d{2}", proposed_stripped):
268
  score = 0.8
269
+ reason = "valid fix (correct format)"
270
+ fixes_partial += 1
271
+ elif proposed_stripped and proposed_stripped != clean_value:
272
+ score = 0.4
273
+ reason = "fix attempted but format unclear"
274
  fixes_partial += 1
 
 
 
 
275
  else:
276
  score = 0.1
277
  reason = "correct cell, wrong value"
278
  fixes_partial += 1
279
+
280
+ elif issue_type in ("inconsistent_value", "statistical_outlier"):
281
+ # These require domain knowledge — any reasonable attempt gets partial credit
282
+ try:
283
+ proposed_num = float(proposed_stripped)
284
+ clean_num = float(clean_value)
285
+ # Within 20% = strong fix, within 50% = reasonable
286
+ if clean_num != 0:
287
+ pct_diff = abs(proposed_num - clean_num) / abs(clean_num)
288
+ if pct_diff <= 0.01:
289
+ score = 1.0
290
+ reason = "exact numeric match"
291
+ fixes_correct += 1
292
+ elif pct_diff <= 0.2:
293
+ score = 0.8
294
+ reason = "valid fix (within 20% of correct value)"
295
+ fixes_partial += 1
296
+ elif pct_diff <= 0.5:
297
+ score = 0.4
298
+ reason = "partially valid (right ballpark)"
299
+ fixes_partial += 1
300
+ else:
301
+ score = 0.1
302
+ reason = "correct cell, value not close"
303
+ fixes_partial += 1
304
+ else:
305
+ score = 0.4
306
+ reason = "numeric fix attempted"
307
+ fixes_partial += 1
308
+ except ValueError:
309
+ # Non-numeric fix for text fields — check similarity
310
+ if len(proposed_stripped) > 10 and proposed_stripped != clean_value:
311
+ score = 0.4
312
+ reason = "text fix attempted (cannot verify automatically)"
313
+ fixes_partial += 1
314
+ else:
315
+ score = 0.1
316
+ reason = "correct cell, wrong value"
317
+ fixes_partial += 1
318
+
319
+ else:
320
+ # Fallback: numeric close match or partial credit
321
+ try:
322
+ proposed_num = float(proposed_stripped)
323
+ clean_num = float(clean_value)
324
+ if clean_num != 0 and abs(proposed_num - clean_num) / abs(clean_num) <= 0.01:
325
+ score = 0.8
326
+ reason = "numeric close match"
327
+ fixes_partial += 1
328
+ else:
329
+ score = 0.1
330
+ reason = "correct cell, wrong value"
331
+ fixes_partial += 1
332
+ except (ValueError, ZeroDivisionError):
333
+ score = 0.1
334
+ reason = "correct cell, wrong value"
335
+ fixes_partial += 1
336
 
337
  # Keep best fix per cell
338
  if cell_key not in fixed_issues or score > fixed_issues[cell_key]: