"""
Before/After evaluation for Prompt Golf.
Loads the agent model (optionally with a LoRA adapter), runs it on every
task in the task bank (or a specified subset), captures the prompt it
produces, runs that prompt through the env, and writes a results JSONL +
a readable summary table.
Run twice — once without --adapter (base), once with --adapter (trained)
— then diff the tables for the hackathon demo.
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import textwrap
import time
from pathlib import Path
from typing import Dict, List
import torch
# Resolve the repo root (one level above this script's directory) and put it
# on sys.path BEFORE the `training.train_grpo` import below, so that running
# this file directly as a script (not via `python -m`) can still resolve the
# repo-local `training` package.
_HERE: Path = Path(__file__).resolve().parent
_REPO_ROOT: Path = _HERE.parent
# Prepend (index 0) so the repo copy shadows any installed package of the
# same name; inserted unconditionally — duplicates are harmless for imports.
sys.path.insert(0, str(_REPO_ROOT))
# Reuse the agent system prompt + user message + extractor from train_grpo
# so eval mirrors training exactly.
from training.train_grpo import ( # noqa: E402
SYSTEM_PROMPT,
build_agent_user_message,
extract_prompt,
)
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description="Prompt Golf eval harness")
p.add_argument("--agent-model", default="Qwen/Qwen3-1.7B")
p.add_argument("--adapter", default=None,
help="Optional LoRA adapter dir or HF repo id.")
p.add_argument("--target-model", default="meta-llama/Llama-3.2-3B-Instruct")
p.add_argument("--tasks", default="all",
help="'all' or comma-separated task ids.")
p.add_argument("--seeds-per-task", type=int, default=1,
help="At temperature=0.0 the agent is deterministic and "
"the env's test slice is fixed, so seeds>1 produces "
"bit-identical duplicate rows. Keep at 1 unless "
"running with temperature>0.")
p.add_argument("--output-json", default="outputs/eval_results.jsonl")
p.add_argument("--label", default="base",
help="Label to tag this eval run (e.g. 'base', 'trained').")
p.add_argument("--max-new-tokens", type=int, default=768,
help="Bumped from 256 to fit Qwen3's ... "
"block (200-600 tokens) plus the final prompt. "
"Drop back to 256 if running with thinking=OFF.")
p.add_argument("--enable-thinking", action="store_true", default=True,
help="Apply Qwen3 chat template with thinking ON. "
"Default. Use --no-enable-thinking when evaluating "
"an adapter that was TRAINED with thinking=False.")
p.add_argument("--no-enable-thinking", dest="enable_thinking",
action="store_false")
p.add_argument("--temperature", type=float, default=0.0)
p.add_argument("--push-to-hub", default=None,
help="HF model repo id to upload the eval JSONL under evals/eval_