kushalExplores committed on
Commit
9ba4021
·
verified ·
1 Parent(s): 15a0c0f

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. evaluation.py +9 -1
  2. tests/test_metric_tracker_rl.py +4 -2
evaluation.py CHANGED
@@ -11,6 +11,8 @@ except ImportError:
11
  from analysis_tools import preview_submission_rows, submission_row_key
12
  from models import MetricSubmissionRow, RewardBreakdown, SubmissionIssue, SubmissionPreview
13
 
 
 
14
 
15
  @dataclass(frozen=True)
16
  class EvaluationConfig:
@@ -42,6 +44,12 @@ class EvaluationResult:
42
  is_perfect: bool
43
 
44
 
 
 
 
 
 
 
45
  def evaluate_submission(
46
  submitted_rows: list[dict] | list[MetricSubmissionRow],
47
  expected_rows: list[MetricSubmissionRow],
@@ -133,7 +141,7 @@ def evaluate_submission(
133
  - invalid_penalty
134
  - exploit_penalty
135
  )
136
- total_score = max(0.0, min(1.0, round(total_score, 6)))
137
 
138
  breakdown = RewardBreakdown(
139
  precision=round(precision, 6),
 
11
  from analysis_tools import preview_submission_rows, submission_row_key
12
  from models import MetricSubmissionRow, RewardBreakdown, SubmissionIssue, SubmissionPreview
13
 
14
+ SCORE_EPSILON = 0.000001
15
+
16
 
17
  @dataclass(frozen=True)
18
  class EvaluationConfig:
 
44
  is_perfect: bool
45
 
46
 
47
+ def _bounded_total_score(score: float) -> float:
48
+ """Clamp evaluator scores to the open interval (0, 1)."""
49
+ rounded_score = round(score, 6)
50
+ return min(1.0 - SCORE_EPSILON, max(SCORE_EPSILON, rounded_score))
51
+
52
+
53
  def evaluate_submission(
54
  submitted_rows: list[dict] | list[MetricSubmissionRow],
55
  expected_rows: list[MetricSubmissionRow],
 
141
  - invalid_penalty
142
  - exploit_penalty
143
  )
144
+ total_score = _bounded_total_score(total_score)
145
 
146
  breakdown = RewardBreakdown(
147
  precision=round(precision, 6),
tests/test_metric_tracker_rl.py CHANGED
@@ -92,7 +92,8 @@ def test_evaluator_scores_perfect_submission():
92
  result = evaluate_submission(expected_rows, expected_rows)
93
 
94
  assert result.is_perfect is True
95
- assert result.reward_breakdown.total_score == 1.0
 
96
  assert result.reward_breakdown.extra_rows == 0
97
  assert result.reward_breakdown.duplicate_rows == 0
98
  assert result.reward_breakdown.invalid_rows == 0
@@ -117,7 +118,8 @@ def test_task_grader_scores_perfect_submission():
117
  result = task.grade_submission(episode.expected_rows, episode.expected_rows)
118
 
119
  assert result.is_perfect is True
120
- assert result.reward_breakdown.total_score == 1.0
 
121
 
122
 
123
  def test_duplicate_and_extra_rows_are_penalized():
 
92
  result = evaluate_submission(expected_rows, expected_rows)
93
 
94
  assert result.is_perfect is True
95
+ assert 0.0 < result.reward_breakdown.total_score < 1.0
96
+ assert result.reward_breakdown.total_score == 0.999999
97
  assert result.reward_breakdown.extra_rows == 0
98
  assert result.reward_breakdown.duplicate_rows == 0
99
  assert result.reward_breakdown.invalid_rows == 0
 
118
  result = task.grade_submission(episode.expected_rows, episode.expected_rows)
119
 
120
  assert result.is_perfect is True
121
+ assert 0.0 < result.reward_breakdown.total_score < 1.0
122
+ assert result.reward_breakdown.total_score == 0.999999
123
 
124
 
125
  def test_duplicate_and_extra_rows_are_penalized():