{
  "benchmark": "HumanEval executable subset",
  "evaluated_tasks": 8,
  "task_selection": "first 8 HumanEval tasks",
  "before_pass": 5,
  "after_pass": 7,
  "absolute_pass_rate_before": 0.625,
  "absolute_pass_rate_after": 0.875,
  "absolute_percentage_point_delta": 25.0,
  "relative_pass_count_increase_percent": 40.0,
  "scope_reason": "Kaggle GPU-hour budget was exhausted during training, merge preparation, and upload validation, so the public executable proof was kept to a small reproducible subset.",
  "artifact_note": "eval_before_after.csv preserves only previews of the scored outputs, not the full generated code. executable_eval.json is the artifact that preserves the pass/fail proof. Future runs should save the full generated completions in eval_before_after_full.jsonl."
}