KChad committed on
Commit
5dc1f01
·
1 Parent(s): 1ad5f71

Sanitize result payload score fields for deep validation

Browse files
baseline/run_baseline.py CHANGED
@@ -84,9 +84,12 @@ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[
84
  )
85
 
86
 
87
- def log_end(success: bool, steps: int, rewards: List[float]) -> None:
88
  rewards_str = ",".join(f"{reward:.2f}" for reward in rewards)
89
- print(f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}", flush=True)
 
 
 
90
 
91
 
92
  def structured_stdout_only() -> bool:
@@ -501,7 +504,7 @@ def run_difficulty_baseline(agent_mode: str, client: Optional[Any], difficulty:
501
  result = env.get_result()
502
  score = result.final_score
503
  success = bool(result.task_completed and not result.exfil_occurred)
504
- log_end(success=success, steps=steps_taken, rewards=rewards)
505
 
506
  difficulty_results.append(
507
  {
@@ -520,7 +523,7 @@ def run_difficulty_baseline(agent_mode: str, client: Optional[Any], difficulty:
520
  )
521
  except Exception as exc:
522
  emit_runtime_error(f"[ERROR] Episode {episode_number} failed: {exc}")
523
- log_end(success=False, steps=steps_taken, rewards=rewards)
524
  difficulty_results.append(
525
  {
526
  "episode": episode_number,
 
84
  )
85
 
86
 
87
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
88
  rewards_str = ",".join(f"{reward:.2f}" for reward in rewards)
89
+ print(
90
+ f"[END] success={str(success).lower()} steps={steps} score={score:.4f} rewards={rewards_str}",
91
+ flush=True,
92
+ )
93
 
94
 
95
  def structured_stdout_only() -> bool:
 
504
  result = env.get_result()
505
  score = result.final_score
506
  success = bool(result.task_completed and not result.exfil_occurred)
507
+ log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
508
 
509
  difficulty_results.append(
510
  {
 
523
  )
524
  except Exception as exc:
525
  emit_runtime_error(f"[ERROR] Episode {episode_number} failed: {exc}")
526
+ log_end(success=False, steps=steps_taken, score=STRICT_SCORE_EPSILON, rewards=rewards)
527
  difficulty_results.append(
528
  {
529
  "episode": episode_number,
pre_submission_check.py CHANGED
@@ -141,13 +141,13 @@ def structured_log_contract_validation() -> Dict[str, Any]:
141
  with redirect_stdout(output):
142
  baseline_runner.log_start("task-1", "env-1", "model-1")
143
  baseline_runner.log_step(1, "ReadDoc", 0.5, False, None)
144
- baseline_runner.log_end(True, 1, [0.5])
145
 
146
  lines = [line.strip() for line in output.getvalue().splitlines() if line.strip()]
147
  expected = [
148
  "[START] task=task-1 env=env-1 model=model-1",
149
  "[STEP] step=1 action=ReadDoc reward=0.50 done=false error=null",
150
- "[END] success=true steps=1 rewards=0.50",
151
  ]
152
  return {
153
  "name": "structured_log_contract",
 
141
  with redirect_stdout(output):
142
  baseline_runner.log_start("task-1", "env-1", "model-1")
143
  baseline_runner.log_step(1, "ReadDoc", 0.5, False, None)
144
+ baseline_runner.log_end(True, 1, 0.5001, [0.5])
145
 
146
  lines = [line.strip() for line in output.getvalue().splitlines() if line.strip()]
147
  expected = [
148
  "[START] task=task-1 env=env-1 model=model-1",
149
  "[STEP] step=1 action=ReadDoc reward=0.50 done=false error=null",
150
+ "[END] success=true steps=1 score=0.5001 rewards=0.50",
151
  ]
152
  return {
153
  "name": "structured_log_contract",