Spaces:
Sleeping
Sleeping
Commit ·
3d6f059
1
Parent(s): 28070b8
fix score range
Browse files
- baseline_scores.json +4 -4
- inference.py +12 -25
- inference_log.txt +0 -0
- server/environment.py +5 -5
- server/tasks/task1_missing.py +2 -2
- server/tasks/task2_format.py +1 -1
- server/tasks/task3_pipeline.py +1 -1
baseline_scores.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"scores": {
|
| 3 |
-
"task1": 0.
|
| 4 |
-
"task2": 0.
|
| 5 |
-
"task3": 0.
|
| 6 |
},
|
| 7 |
-
"average": 0.
|
| 8 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"scores": {
|
| 3 |
+
"task1": 0.99,
|
| 4 |
+
"task2": 0.99,
|
| 5 |
+
"task3": 0.99
|
| 6 |
},
|
| 7 |
+
"average": 0.9899999999999999
|
| 8 |
}
|
inference.py
CHANGED
|
@@ -11,7 +11,7 @@ Required environment variables:
|
|
| 11 |
STDOUT FORMAT (OpenEnv spec):
|
| 12 |
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 13 |
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 14 |
-
[END]
|
| 15 |
"""
|
| 16 |
|
| 17 |
import json
|
|
@@ -90,9 +90,12 @@ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[
|
|
| 90 |
)
|
| 91 |
|
| 92 |
|
| 93 |
-
def log_end(
|
| 94 |
-
|
| 95 |
-
print(
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
|
| 98 |
# ------------------------------------------------------------------
|
|
@@ -112,23 +115,6 @@ def api_get(path: str) -> dict:
|
|
| 112 |
resp.raise_for_status()
|
| 113 |
return resp.json()
|
| 114 |
|
| 115 |
-
# ------------------------------------------------------------------
|
| 116 |
-
# Score sanitizer
|
| 117 |
-
# ------------------------------------------------------------------
|
| 118 |
-
|
| 119 |
-
def sanitize_score(score: float) -> float:
|
| 120 |
-
"""
|
| 121 |
-
Ensures score is strictly within (0, 1)
|
| 122 |
-
required by hackathon validator.
|
| 123 |
-
"""
|
| 124 |
-
EPS = 1e-4
|
| 125 |
-
|
| 126 |
-
if score >= 1.0:
|
| 127 |
-
return 1.0 - EPS
|
| 128 |
-
if score <= 0.0:
|
| 129 |
-
return EPS
|
| 130 |
-
|
| 131 |
-
return float(score)
|
| 132 |
|
| 133 |
# ------------------------------------------------------------------
|
| 134 |
# Agent loop
|
|
@@ -243,9 +229,10 @@ def run_task(task_id: int) -> float:
|
|
| 243 |
time.sleep(0.3)
|
| 244 |
|
| 245 |
finally:
|
| 246 |
-
|
|
|
|
| 247 |
|
| 248 |
-
final_score =
|
| 249 |
print(
|
| 250 |
f"\n Task {task_id} final score: {final_score:.4f} (steps used: {obs['step_count']})",
|
| 251 |
file=sys.stderr,
|
|
@@ -278,14 +265,14 @@ def main():
|
|
| 278 |
scores[f"task{task_id}"] = run_task(task_id)
|
| 279 |
except Exception as exc:
|
| 280 |
print(f"[ERROR] Task {task_id} failed: {exc}", file=sys.stderr)
|
| 281 |
-
scores[f"task{task_id}"] = 0.
|
| 282 |
|
| 283 |
print("\n" + "="*60, file=sys.stderr)
|
| 284 |
print(" BASELINE RESULTS", file=sys.stderr)
|
| 285 |
print("="*60, file=sys.stderr)
|
| 286 |
for k, v in scores.items():
|
| 287 |
print(f" {k}: {v:.4f}", file=sys.stderr)
|
| 288 |
-
avg =
|
| 289 |
print(f" average: {avg:.4f}", file=sys.stderr)
|
| 290 |
print("="*60, file=sys.stderr)
|
| 291 |
|
|
|
|
| 11 |
STDOUT FORMAT (OpenEnv spec):
|
| 12 |
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 13 |
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 14 |
+
[END] task=<task_name> score=<0.00> steps=<n>
|
| 15 |
"""
|
| 16 |
|
| 17 |
import json
|
|
|
|
| 90 |
)
|
| 91 |
|
| 92 |
|
| 93 |
+
def log_end(task_name: str, score: float, steps: int) -> None:
|
| 94 |
+
safe_score = max(0.01, min(0.99, float(score)))
|
| 95 |
+
print(
|
| 96 |
+
f"[END] task={task_name} score={safe_score:.4f} steps={steps}",
|
| 97 |
+
flush=True
|
| 98 |
+
)
|
| 99 |
|
| 100 |
|
| 101 |
# ------------------------------------------------------------------
|
|
|
|
| 115 |
resp.raise_for_status()
|
| 116 |
return resp.json()
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
# ------------------------------------------------------------------
|
| 120 |
# Agent loop
|
|
|
|
| 229 |
time.sleep(0.3)
|
| 230 |
|
| 231 |
finally:
|
| 232 |
+
final = obs.get("current_score", 0.01) if isinstance(obs, dict) else 0.01
|
| 233 |
+
log_end(task_name=task_name, score=final, steps=steps_taken)
|
| 234 |
|
| 235 |
+
final_score = obs["current_score"]
|
| 236 |
print(
|
| 237 |
f"\n Task {task_id} final score: {final_score:.4f} (steps used: {obs['step_count']})",
|
| 238 |
file=sys.stderr,
|
|
|
|
| 265 |
scores[f"task{task_id}"] = run_task(task_id)
|
| 266 |
except Exception as exc:
|
| 267 |
print(f"[ERROR] Task {task_id} failed: {exc}", file=sys.stderr)
|
| 268 |
+
scores[f"task{task_id}"] = 0.01
|
| 269 |
|
| 270 |
print("\n" + "="*60, file=sys.stderr)
|
| 271 |
print(" BASELINE RESULTS", file=sys.stderr)
|
| 272 |
print("="*60, file=sys.stderr)
|
| 273 |
for k, v in scores.items():
|
| 274 |
print(f" {k}: {v:.4f}", file=sys.stderr)
|
| 275 |
+
avg = sum(scores.values()) / len(scores)
|
| 276 |
print(f" average: {avg:.4f}", file=sys.stderr)
|
| 277 |
print("="*60, file=sys.stderr)
|
| 278 |
|
inference_log.txt
CHANGED
|
Binary files a/inference_log.txt and b/inference_log.txt differ
|
|
|
server/environment.py
CHANGED
|
@@ -33,7 +33,7 @@ class DataCleaningEnvironment:
|
|
| 33 |
self._step_count: int = 0
|
| 34 |
self._max_steps: int = 20
|
| 35 |
self._total_errors: int = 0
|
| 36 |
-
self._last_score: float = 0.
|
| 37 |
self._task_cycle: int = 0 # for round-robin default
|
| 38 |
|
| 39 |
# ------------------------------------------------------------------
|
|
@@ -78,16 +78,16 @@ class DataCleaningEnvironment:
|
|
| 78 |
|
| 79 |
delta = score_after - score_before
|
| 80 |
if not applied:
|
| 81 |
-
reward = 0.
|
| 82 |
elif delta <= 0:
|
| 83 |
-
reward = 0.
|
| 84 |
else:
|
| 85 |
reward = round(delta, 4)
|
| 86 |
|
| 87 |
done = (score_after >= 0.95) or (self._step_count >= self._max_steps)
|
| 88 |
|
| 89 |
-
# Clamp reward strictly within (0.
|
| 90 |
-
reward = round(max(0.
|
| 91 |
|
| 92 |
return self._build_obs(reward, done, message)
|
| 93 |
|
|
|
|
| 33 |
self._step_count: int = 0
|
| 34 |
self._max_steps: int = 20
|
| 35 |
self._total_errors: int = 0
|
| 36 |
+
self._last_score: float = 0.01
|
| 37 |
self._task_cycle: int = 0 # for round-robin default
|
| 38 |
|
| 39 |
# ------------------------------------------------------------------
|
|
|
|
| 78 |
|
| 79 |
delta = score_after - score_before
|
| 80 |
if not applied:
|
| 81 |
+
reward = 0.01
|
| 82 |
elif delta <= 0:
|
| 83 |
+
reward = 0.01
|
| 84 |
else:
|
| 85 |
reward = round(delta, 4)
|
| 86 |
|
| 87 |
done = (score_after >= 0.95) or (self._step_count >= self._max_steps)
|
| 88 |
|
| 89 |
+
# Clamp reward strictly within (0.01, 0.99) — no terminal bonus
|
| 90 |
+
reward = round(max(0.01, min(0.99, reward)), 4)
|
| 91 |
|
| 92 |
return self._build_obs(reward, done, message)
|
| 93 |
|
server/tasks/task1_missing.py
CHANGED
|
@@ -30,9 +30,9 @@ def load():
|
|
| 30 |
def score(current_df, original_nulls: int) -> float:
|
| 31 |
"""Score in [0, 1]: fraction of nulls filled."""
|
| 32 |
if original_nulls == 0:
|
| 33 |
-
return 0.
|
| 34 |
remaining = int(current_df.isnull().sum().sum())
|
| 35 |
-
return round(max(0.
|
| 36 |
|
| 37 |
|
| 38 |
def count_errors(current_df) -> int:
|
|
|
|
| 30 |
def score(current_df, original_nulls: int) -> float:
|
| 31 |
"""Score in [0, 1]: fraction of nulls filled."""
|
| 32 |
if original_nulls == 0:
|
| 33 |
+
return 0.99
|
| 34 |
remaining = int(current_df.isnull().sum().sum())
|
| 35 |
+
return round(max(0.01, min(0.99, 1.0 - remaining / original_nulls)), 4)
|
| 36 |
|
| 37 |
|
| 38 |
def count_errors(current_df) -> int:
|
server/tasks/task2_format.py
CHANGED
|
@@ -56,7 +56,7 @@ def score(current_df, meta: dict) -> float:
|
|
| 56 |
dupe_score = 1.0 - dupes / max(meta["orig_dupes"], 1)
|
| 57 |
|
| 58 |
combined = 0.35 * phone_score + 0.35 * date_score + 0.30 * dupe_score
|
| 59 |
-
return round(max(0.
|
| 60 |
|
| 61 |
|
| 62 |
def count_errors(current_df, meta: dict) -> int:
|
|
|
|
| 56 |
dupe_score = 1.0 - dupes / max(meta["orig_dupes"], 1)
|
| 57 |
|
| 58 |
combined = 0.35 * phone_score + 0.35 * date_score + 0.30 * dupe_score
|
| 59 |
+
return round(max(0.01, min(0.99, combined)), 4)
|
| 60 |
|
| 61 |
|
| 62 |
def count_errors(current_df, meta: dict) -> int:
|
server/tasks/task3_pipeline.py
CHANGED
|
@@ -87,7 +87,7 @@ def score(current_df, meta: dict) -> float:
|
|
| 87 |
|
| 88 |
combined = 0.25 * null_score + 0.20 * dupe_score + 0.20 * outlier_score \
|
| 89 |
+ 0.175 * country_score + 0.175 * date_score
|
| 90 |
-
return round(max(0.
|
| 91 |
|
| 92 |
|
| 93 |
def count_errors(current_df, meta: dict) -> int:
|
|
|
|
| 87 |
|
| 88 |
combined = 0.25 * null_score + 0.20 * dupe_score + 0.20 * outlier_score \
|
| 89 |
+ 0.175 * country_score + 0.175 * date_score
|
| 90 |
+
return round(max(0.01, min(0.99, combined)), 4)
|
| 91 |
|
| 92 |
|
| 93 |
def count_errors(current_df, meta: dict) -> int:
|