""" Before/After evaluation for Prompt Golf. Loads the agent model (optionally with a LoRA adapter), runs it on every task in the task bank (or a specified subset), captures the prompt it produces, runs that prompt through the env, and writes a results JSONL + a readable summary table. Run twice — once without --adapter (base), once with --adapter (trained) — then diff the tables for the hackathon demo. """ from __future__ import annotations import argparse import json import os import sys import textwrap import time from pathlib import Path from typing import Dict, List import torch _HERE = Path(__file__).resolve().parent _REPO_ROOT = _HERE.parent sys.path.insert(0, str(_REPO_ROOT)) # Reuse the agent system prompt + user message + extractor from train_grpo # so eval mirrors training exactly. from training.train_grpo import ( # noqa: E402 SYSTEM_PROMPT, build_agent_user_message, extract_prompt, ) def parse_args() -> argparse.Namespace: p = argparse.ArgumentParser(description="Prompt Golf eval harness") p.add_argument("--agent-model", default="Qwen/Qwen3-1.7B") p.add_argument("--adapter", default=None, help="Optional LoRA adapter dir or HF repo id.") p.add_argument("--target-model", default="meta-llama/Llama-3.2-3B-Instruct") p.add_argument("--tasks", default="all", help="'all' or comma-separated task ids.") p.add_argument("--seeds-per-task", type=int, default=1, help="At temperature=0.0 the agent is deterministic and " "the env's test slice is fixed, so seeds>1 produces " "bit-identical duplicate rows. Keep at 1 unless " "running with temperature>0.") p.add_argument("--output-json", default="outputs/eval_results.jsonl") p.add_argument("--label", default="base", help="Label to tag this eval run (e.g. 'base', 'trained').") p.add_argument("--max-new-tokens", type=int, default=768, help="Bumped from 256 to fit Qwen3's ... " "block (200-600 tokens) plus the final prompt. " "Drop back to 256 if running with thinking=OFF.") p.add_argument("--enable-thinking", action="store_true", default=True, help="Apply Qwen3 chat template with thinking ON. " "Default. Use --no-enable-thinking when evaluating " "an adapter that was TRAINED with thinking=False.") p.add_argument("--no-enable-thinking", dest="enable_thinking", action="store_false") p.add_argument("--temperature", type=float, default=0.0) p.add_argument("--push-to-hub", default=None, help="HF model repo id to upload the eval JSONL under evals/eval_