srishtichugh committed
Commit 3d6f059 · Parent(s): 28070b8

fix score range

baseline_scores.json CHANGED
@@ -1,8 +1,8 @@
 {
   "scores": {
-    "task1": 0.999,
-    "task2": 0.999,
-    "task3": 0.999
+    "task1": 0.99,
+    "task2": 0.99,
+    "task3": 0.99
   },
-  "average": 0.999
+  "average": 0.9899999999999999
 }
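Note: the long unrounded "average" is the ordinary floating-point result of averaging three 0.99 values without rounding (the avg = sum(...) / len(...) change in inference.py below). A minimal sketch reproducing it, using the values from this file:

scores = {"task1": 0.99, "task2": 0.99, "task3": 0.99}

# binary floats cannot hold 0.99 exactly, so the mean drifts one ulp below 0.99
avg = sum(scores.values()) / len(scores)
print(avg)            # 0.9899999999999999, as stored above
print(round(avg, 4))  # 0.99, if the value were rounded before writing the JSON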
inference.py CHANGED
@@ -11,7 +11,7 @@ Required environment variables:
 STDOUT FORMAT (OpenEnv spec):
 [START] task=<task_name> env=<benchmark> model=<model_name>
 [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
-[END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
+[END] task=<task_name> score=<0.00> steps=<n>
 """
 
 import json
@@ -90,9 +90,12 @@ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[
     )
 
 
-def log_end(success: bool, steps: int, rewards: List[float]) -> None:
-    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
-    print(f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}", flush=True)
+def log_end(task_name: str, score: float, steps: int) -> None:
+    safe_score = max(0.01, min(0.99, float(score)))
+    print(
+        f"[END] task={task_name} score={safe_score:.4f} steps={steps}",
+        flush=True
+    )
 
 
 # ------------------------------------------------------------------
@@ -112,23 +115,6 @@ def api_get(path: str) -> dict:
     resp.raise_for_status()
     return resp.json()
 
-# ------------------------------------------------------------------
-# Score sanitizer
-# ------------------------------------------------------------------
-
-def sanitize_score(score: float) -> float:
-    """
-    Ensures score is strictly within (0, 1)
-    required by hackathon validator.
-    """
-    EPS = 1e-4
-
-    if score >= 1.0:
-        return 1.0 - EPS
-    if score <= 0.0:
-        return EPS
-
-    return float(score)
 
 # ------------------------------------------------------------------
 # Agent loop
@@ -243,9 +229,10 @@ def run_task(task_id: int) -> float:
             time.sleep(0.3)
 
     finally:
-        log_end(success=success, steps=steps_taken, rewards=rewards)
+        final = obs.get("current_score", 0.01) if isinstance(obs, dict) else 0.01
+        log_end(task_name=task_name, score=final, steps=steps_taken)
 
-    final_score = sanitize_score(obs["current_score"])
+    final_score = obs["current_score"]
     print(
         f"\n Task {task_id} final score: {final_score:.4f} (steps used: {obs['step_count']})",
         file=sys.stderr,
@@ -278,14 +265,14 @@ def main():
             scores[f"task{task_id}"] = run_task(task_id)
         except Exception as exc:
             print(f"[ERROR] Task {task_id} failed: {exc}", file=sys.stderr)
-            scores[f"task{task_id}"] = 0.0
+            scores[f"task{task_id}"] = 0.01
 
     print("\n" + "="*60, file=sys.stderr)
     print(" BASELINE RESULTS", file=sys.stderr)
     print("="*60, file=sys.stderr)
     for k, v in scores.items():
         print(f" {k}: {v:.4f}", file=sys.stderr)
-    avg = sanitize_score(sum(scores.values()) / len(scores))
+    avg = sum(scores.values()) / len(scores)
     print(f" average: {avg:.4f}", file=sys.stderr)
     print("="*60, file=sys.stderr)
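Note: the replacement log_end now clamps whatever score it receives into [0.01, 0.99] before emitting the [END] line, which is what keeps the reported scores inside the validator's range. A standalone sketch of that behavior; the task names and score values in the calls are illustrative, not taken from the log:

def log_end(task_name: str, score: float, steps: int) -> None:
    # mirror of the committed function: clamp into [0.01, 0.99], then print
    safe_score = max(0.01, min(0.99, float(score)))
    print(f"[END] task={task_name} score={safe_score:.4f} steps={steps}", flush=True)

log_end("task1", 1.0, 12)     # [END] task=task1 score=0.9900 steps=12
log_end("task2", 0.0, 20)     # [END] task=task2 score=0.0100 steps=20
log_end("task3", 0.4237, 7)   # [END] task=task3 score=0.4237 steps=7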
inference_log.txt CHANGED
Binary files a/inference_log.txt and b/inference_log.txt differ
 
server/environment.py CHANGED
@@ -33,7 +33,7 @@ class DataCleaningEnvironment:
         self._step_count: int = 0
         self._max_steps: int = 20
         self._total_errors: int = 0
-        self._last_score: float = 0.001
+        self._last_score: float = 0.01
         self._task_cycle: int = 0  # for round-robin default
 
     # ------------------------------------------------------------------
@@ -78,16 +78,16 @@ class DataCleaningEnvironment:
 
         delta = score_after - score_before
         if not applied:
-            reward = 0.001
+            reward = 0.01
         elif delta <= 0:
-            reward = 0.001
+            reward = 0.01
         else:
             reward = round(delta, 4)
 
         done = (score_after >= 0.95) or (self._step_count >= self._max_steps)
 
-        # Clamp reward strictly within (0.001, 0.999) — no terminal bonus
-        reward = round(max(0.001, min(0.999, reward)), 4)
+        # Clamp reward strictly within (0.01, 0.99) — no terminal bonus
+        reward = round(max(0.01, min(0.99, reward)), 4)
 
         return self._build_obs(reward, done, message)
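For reference, the step reward now follows the same bounds as the task scores: the improvement delta is used when the action applied and improved the score, a 0.01 floor is used otherwise, and the result is clamped to [0.01, 0.99]. A small standalone sketch of that rule outside the class; the function name and the applied/delta values are hypothetical, for illustration only:

def step_reward(applied: bool, delta: float) -> float:
    # hypothetical standalone version of the committed reward logic
    if not applied or delta <= 0:
        reward = 0.01
    else:
        reward = round(delta, 4)
    # clamp strictly within [0.01, 0.99], no terminal bonus
    return round(max(0.01, min(0.99, reward)), 4)

print(step_reward(True, 0.1234))   # 0.1234
print(step_reward(True, -0.05))    # 0.01
print(step_reward(False, 0.40))    # 0.01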
server/tasks/task1_missing.py CHANGED
@@ -30,9 +30,9 @@ def load():
 def score(current_df, original_nulls: int) -> float:
     """Score in [0, 1]: fraction of nulls filled."""
     if original_nulls == 0:
-        return 0.999
+        return 0.99
     remaining = int(current_df.isnull().sum().sum())
-    return round(max(0.001, min(0.999, 1.0 - remaining / original_nulls)), 4)
+    return round(max(0.01, min(0.99, 1.0 - remaining / original_nulls)), 4)
 
 
 def count_errors(current_df) -> int:
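A worked instance of the task1 scorer under the new bounds; the toy DataFrame is made up for illustration:

import pandas as pd

df = pd.DataFrame({"a": [1.0, None, 3.0], "b": [None, None, 6.0]})
original_nulls = int(df.isnull().sum().sum())   # 3

df["a"] = df["a"].fillna(df["a"].mean())        # fills one null, two remain
remaining = int(df.isnull().sum().sum())        # 2

score = round(max(0.01, min(0.99, 1.0 - remaining / original_nulls)), 4)
print(score)  # 0.3333; a fully cleaned frame would clamp to 0.99 rather than 1.0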
server/tasks/task2_format.py CHANGED
@@ -56,7 +56,7 @@ def score(current_df, meta: dict) -> float:
     dupe_score = 1.0 - dupes / max(meta["orig_dupes"], 1)
 
     combined = 0.35 * phone_score + 0.35 * date_score + 0.30 * dupe_score
-    return round(max(0.001, min(0.999, combined)), 4)
+    return round(max(0.01, min(0.99, combined)), 4)
 
 
 def count_errors(current_df, meta: dict) -> int:
server/tasks/task3_pipeline.py CHANGED
@@ -87,7 +87,7 @@ def score(current_df, meta: dict) -> float:
 
     combined = 0.25 * null_score + 0.20 * dupe_score + 0.20 * outlier_score \
         + 0.175 * country_score + 0.175 * date_score
-    return round(max(0.001, min(0.999, combined)), 4)
+    return round(max(0.01, min(0.99, combined)), 4)
 
 
 def count_errors(current_df, meta: dict) -> int:
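As in task2, only the clamp bounds change for task3; the weighting itself is untouched. A quick worked instance with illustrative sub-scores (the values are hypothetical):

# hypothetical sub-scores for a partially cleaned pipeline; the five weights
# (0.25 + 0.20 + 0.20 + 0.175 + 0.175) sum to 1.0, so combined stays in [0, 1]
null_score, dupe_score, outlier_score, country_score, date_score = 1.0, 0.5, 1.0, 0.8, 0.8

combined = 0.25 * null_score + 0.20 * dupe_score + 0.20 * outlier_score \
    + 0.175 * country_score + 0.175 * date_score
print(round(max(0.01, min(0.99, combined)), 4))  # 0.83; a perfect 1.0 would clamp to 0.99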