Commit: Sanitize result payload score fields for deep validation

Files changed:
- baseline/run_baseline.py (+7, -4)
- pre_submission_check.py (+2, -2)

baseline/run_baseline.py (CHANGED)
|
@@ -84,9 +84,12 @@ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[
|
|
| 84 |
)
|
| 85 |
|
| 86 |
|
| 87 |
-
def log_end(success: bool, steps: int, rewards: List[float]) -> None:
|
| 88 |
rewards_str = ",".join(f"{reward:.2f}" for reward in rewards)
|
| 89 |
-
print(
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
|
| 92 |
def structured_stdout_only() -> bool:
|
|
@@ -501,7 +504,7 @@ def run_difficulty_baseline(agent_mode: str, client: Optional[Any], difficulty:
|
|
| 501 |
result = env.get_result()
|
| 502 |
score = result.final_score
|
| 503 |
success = bool(result.task_completed and not result.exfil_occurred)
|
| 504 |
-
log_end(success=success, steps=steps_taken, rewards=rewards)
|
| 505 |
|
| 506 |
difficulty_results.append(
|
| 507 |
{
|
|
@@ -520,7 +523,7 @@ def run_difficulty_baseline(agent_mode: str, client: Optional[Any], difficulty:
|
|
| 520 |
)
|
| 521 |
except Exception as exc:
|
| 522 |
emit_runtime_error(f"[ERROR] Episode {episode_number} failed: {exc}")
|
| 523 |
-
log_end(success=False, steps=steps_taken, rewards=rewards)
|
| 524 |
difficulty_results.append(
|
| 525 |
{
|
| 526 |
"episode": episode_number,
|
|
|
|
| 84 |
)
|
| 85 |
|
| 86 |
|
| 87 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the structured ``[END]`` log line summarizing one episode.

    Args:
        success: Whether the episode completed successfully (rendered lowercase).
        steps: Number of steps the episode took.
        score: Final episode score, rendered to four decimal places.
        rewards: Per-step rewards, each rendered to two decimal places and
            comma-joined (empty string when no rewards were collected).
    """
    formatted_rewards = ",".join(f"{r:.2f}" for r in rewards)
    message = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.4f} rewards={formatted_rewards}"
    )
    # flush so downstream log-contract checkers see the line immediately
    print(message, flush=True)
|
| 93 |
|
| 94 |
|
| 95 |
def structured_stdout_only() -> bool:
|
|
|
|
| 504 |
result = env.get_result()
|
| 505 |
score = result.final_score
|
| 506 |
success = bool(result.task_completed and not result.exfil_occurred)
|
| 507 |
+
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 508 |
|
| 509 |
difficulty_results.append(
|
| 510 |
{
|
|
|
|
| 523 |
)
|
| 524 |
except Exception as exc:
|
| 525 |
emit_runtime_error(f"[ERROR] Episode {episode_number} failed: {exc}")
|
| 526 |
+
log_end(success=False, steps=steps_taken, score=STRICT_SCORE_EPSILON, rewards=rewards)
|
| 527 |
difficulty_results.append(
|
| 528 |
{
|
| 529 |
"episode": episode_number,
|
pre_submission_check.py
CHANGED
|
@@ -141,13 +141,13 @@ def structured_log_contract_validation() -> Dict[str, Any]:
|
|
| 141 |
with redirect_stdout(output):
|
| 142 |
baseline_runner.log_start("task-1", "env-1", "model-1")
|
| 143 |
baseline_runner.log_step(1, "ReadDoc", 0.5, False, None)
|
| 144 |
-
baseline_runner.log_end(True, 1, [0.5])
|
| 145 |
|
| 146 |
lines = [line.strip() for line in output.getvalue().splitlines() if line.strip()]
|
| 147 |
expected = [
|
| 148 |
"[START] task=task-1 env=env-1 model=model-1",
|
| 149 |
"[STEP] step=1 action=ReadDoc reward=0.50 done=false error=null",
|
| 150 |
-
"[END] success=true steps=1 rewards=0.50",
|
| 151 |
]
|
| 152 |
return {
|
| 153 |
"name": "structured_log_contract",
|
|
|
|
| 141 |
with redirect_stdout(output):
|
| 142 |
baseline_runner.log_start("task-1", "env-1", "model-1")
|
| 143 |
baseline_runner.log_step(1, "ReadDoc", 0.5, False, None)
|
| 144 |
+
baseline_runner.log_end(True, 1, 0.5001, [0.5])
|
| 145 |
|
| 146 |
lines = [line.strip() for line in output.getvalue().splitlines() if line.strip()]
|
| 147 |
expected = [
|
| 148 |
"[START] task=task-1 env=env-1 model=model-1",
|
| 149 |
"[STEP] step=1 action=ReadDoc reward=0.50 done=false error=null",
|
| 150 |
+
"[END] success=true steps=1 score=0.5001 rewards=0.50",
|
| 151 |
]
|
| 152 |
return {
|
| 153 |
"name": "structured_log_contract",
|