{ "benchmark": "HumanEval executable subset", "evaluated_tasks": 8, "task_selection": "first 8 HumanEval tasks", "before_pass": 5, "after_pass": 7, "absolute_pass_rate_before": 0.625, "absolute_pass_rate_after": 0.875, "absolute_percentage_point_delta": 25.0, "relative_pass_count_increase_percent": 40.0, "scope_reason": "Kaggle GPU-hour budget was exhausted during training, merge preparation, and upload validation, so the public executable proof was kept to a small reproducible subset.", "artifact_note": "eval_before_after.csv preserves scored output previews, not full generated code. executable_eval.json is the preserved pass/fail proof artifact. Future runs should save full generated completions in eval_before_after_full.jsonl." }