Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- evaluation.py +9 -1
- tests/test_metric_tracker_rl.py +4 -2
evaluation.py
CHANGED
|
@@ -11,6 +11,8 @@ except ImportError:
|
|
| 11 |
from analysis_tools import preview_submission_rows, submission_row_key
|
| 12 |
from models import MetricSubmissionRow, RewardBreakdown, SubmissionIssue, SubmissionPreview
|
| 13 |
|
|
|
|
|
|
|
| 14 |
|
| 15 |
@dataclass(frozen=True)
|
| 16 |
class EvaluationConfig:
|
|
@@ -42,6 +44,12 @@ class EvaluationResult:
|
|
| 42 |
is_perfect: bool
|
| 43 |
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
def evaluate_submission(
|
| 46 |
submitted_rows: list[dict] | list[MetricSubmissionRow],
|
| 47 |
expected_rows: list[MetricSubmissionRow],
|
|
@@ -133,7 +141,7 @@ def evaluate_submission(
|
|
| 133 |
- invalid_penalty
|
| 134 |
- exploit_penalty
|
| 135 |
)
|
| 136 |
-
total_score = … [right-hand side of this removed line was lost in extraction]
|
| 137 |
|
| 138 |
breakdown = RewardBreakdown(
|
| 139 |
precision=round(precision, 6),
|
|
|
|
| 11 |
from analysis_tools import preview_submission_rows, submission_row_key
|
| 12 |
from models import MetricSubmissionRow, RewardBreakdown, SubmissionIssue, SubmissionPreview
|
| 13 |
|
| 14 |
+
SCORE_EPSILON = 0.000001
|
| 15 |
+
|
| 16 |
|
| 17 |
@dataclass(frozen=True)
|
| 18 |
class EvaluationConfig:
|
|
|
|
| 44 |
is_perfect: bool
|
| 45 |
|
| 46 |
|
| 47 |
+
def _bounded_total_score(score: float) -> float:
|
| 48 |
+
"""Clamp evaluator scores to the open interval (0, 1)."""
|
| 49 |
+
rounded_score = round(score, 6)
|
| 50 |
+
return min(1.0 - SCORE_EPSILON, max(SCORE_EPSILON, rounded_score))
|
| 51 |
+
|
| 52 |
+
|
| 53 |
def evaluate_submission(
|
| 54 |
submitted_rows: list[dict] | list[MetricSubmissionRow],
|
| 55 |
expected_rows: list[MetricSubmissionRow],
|
|
|
|
| 141 |
- invalid_penalty
|
| 142 |
- exploit_penalty
|
| 143 |
)
|
| 144 |
+
total_score = _bounded_total_score(total_score)
|
| 145 |
|
| 146 |
breakdown = RewardBreakdown(
|
| 147 |
precision=round(precision, 6),
|
tests/test_metric_tracker_rl.py
CHANGED
|
@@ -92,7 +92,8 @@ def test_evaluator_scores_perfect_submission():
|
|
| 92 |
result = evaluate_submission(expected_rows, expected_rows)
|
| 93 |
|
| 94 |
assert result.is_perfect is True
|
| 95 |
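-
assert result.reward_breakdown.total_score … [comparison lost in extraction; presumably `== 1.0`, to be confirmed against the original commit]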
|
|
|
|
| 96 |
assert result.reward_breakdown.extra_rows == 0
|
| 97 |
assert result.reward_breakdown.duplicate_rows == 0
|
| 98 |
assert result.reward_breakdown.invalid_rows == 0
|
|
@@ -117,7 +118,8 @@ def test_task_grader_scores_perfect_submission():
|
|
| 117 |
result = task.grade_submission(episode.expected_rows, episode.expected_rows)
|
| 118 |
|
| 119 |
assert result.is_perfect is True
|
| 120 |
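-
assert result.reward_breakdown.total_score … [comparison lost in extraction; presumably `== 1.0`, to be confirmed against the original commit]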
|
|
|
|
| 121 |
|
| 122 |
|
| 123 |
def test_duplicate_and_extra_rows_are_penalized():
|
|
|
|
| 92 |
result = evaluate_submission(expected_rows, expected_rows)
|
| 93 |
|
| 94 |
assert result.is_perfect is True
|
| 95 |
+
assert 0.0 < result.reward_breakdown.total_score < 1.0
|
| 96 |
+
assert result.reward_breakdown.total_score == 0.999999
|
| 97 |
assert result.reward_breakdown.extra_rows == 0
|
| 98 |
assert result.reward_breakdown.duplicate_rows == 0
|
| 99 |
assert result.reward_breakdown.invalid_rows == 0
|
|
|
|
| 118 |
result = task.grade_submission(episode.expected_rows, episode.expected_rows)
|
| 119 |
|
| 120 |
assert result.is_perfect is True
|
| 121 |
+
assert 0.0 < result.reward_breakdown.total_score < 1.0
|
| 122 |
+
assert result.reward_breakdown.total_score == 0.999999
|
| 123 |
|
| 124 |
|
| 125 |
def test_duplicate_and_extra_rows_are_penalized():
|