Commit · 53ae9f0
Parent(s): fa53e30

Fix scoring
Files changed:
- inference.py +19 -2
- inference_log.txt +0 -0
- server/environment.py +16 -3
inference.py
CHANGED
@@ -112,6 +112,23 @@ def api_get(path: str) -> dict:
     resp.raise_for_status()
     return resp.json()
 
+# ------------------------------------------------------------------
+# Score sanitizer
+# ------------------------------------------------------------------
+
+def sanitize_score(score: float) -> float:
+    """
+    Ensures score is strictly within (0, 1)
+    required by hackathon validator.
+    """
+    EPS = 1e-4
+
+    if score >= 1.0:
+        return 1.0 - EPS
+    if score <= 0.0:
+        return EPS
+
+    return float(score)
 
 # ------------------------------------------------------------------
 # Agent loop

@@ -228,7 +245,7 @@ def run_task(task_id: int) -> float:
     finally:
         log_end(success=success, steps=steps_taken, rewards=rewards)
 
-    final_score = obs["current_score"]
+    final_score = sanitize_score(obs["current_score"])
     print(
         f"\n Task {task_id} final score: {final_score:.4f} (steps used: {obs['step_count']})",
         file=sys.stderr,

@@ -268,7 +285,7 @@ def main():
     print("="*60, file=sys.stderr)
     for k, v in scores.items():
         print(f" {k}: {v:.4f}", file=sys.stderr)
-    avg = sum(scores.values()) / len(scores)
+    avg = sanitize_score(sum(scores.values()) / len(scores))
     print(f" average: {avg:.4f}", file=sys.stderr)
     print("="*60, file=sys.stderr)
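Note: sanitize_score nudges boundary values just inside the open interval (0, 1), so the reported final score and average never land exactly on 0.0 or 1.0. A minimal standalone sketch of the boundary behaviour (the function body is copied from the hunk above; the probe values are illustrative):

    def sanitize_score(score: float) -> float:
        """Clamp a score into the open interval (0, 1)."""
        EPS = 1e-4
        if score >= 1.0:
            return 1.0 - EPS
        if score <= 0.0:
            return EPS
        return float(score)

    # Exact 0.0 and 1.0 are pulled just inside the interval,
    # out-of-range values are clamped, in-range values pass through.
    for s in (1.0, 1.25, 0.0, -0.5, 0.7317):
        print(f"{s:+.4f} -> {sanitize_score(s):.4f}")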
inference_log.txt
CHANGED
Binary files a/inference_log.txt and b/inference_log.txt differ
server/environment.py
CHANGED
@@ -111,11 +111,24 @@ class DataCleaningEnvironment:
 
     def _compute_score(self) -> float:
         if self._task_id == 1:
-
+            raw = t1.score(self._df, self._meta)
         elif self._task_id == 2:
-
+            raw = t2.score(self._df, self._meta)
         else:
-
+            raw = t3.score(self._df, self._meta)
+
+        EPS = 1e-4
+
+        # First round safely
+        raw = float(raw)
+
+        # HARD clamp AFTER rounding risk
+        if raw >= 1.0:
+            raw = 1.0 - EPS
+        elif raw <= 0.0:
+            raw = EPS
+
+        return round(raw, 4)
 
     def _count_errors(self) -> int:
         if self._task_id == 1: