"""
Before/After evaluation for Prompt Golf.
Loads the agent model (optionally with a LoRA adapter), runs it on every
task in the task bank (or a specified subset), captures the prompt it
produces, runs that prompt through the env, and writes a results JSONL +
a readable summary table.
Run twice — once without --adapter (base), once with --adapter (trained)
— then diff the tables for the hackathon demo.
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import textwrap
import time
from pathlib import Path
from typing import Dict, List
import torch
# Resolve the repo root (one level above this script's directory) and put it
# on sys.path BEFORE the `training.train_grpo` import below, so that running
# this file directly as a script (not via `python -m`) can still resolve the
# repo-local `training` package.
_HERE: Path = Path(__file__).resolve().parent
_REPO_ROOT: Path = _HERE.parent
# Prepend (index 0) so the repo copy shadows any installed package of the
# same name; inserted unconditionally — duplicates are harmless for imports.
sys.path.insert(0, str(_REPO_ROOT))
# Reuse the agent system prompt + user message + extractor from train_grpo
# so eval mirrors training exactly.
from training.train_grpo import ( # noqa: E402
SYSTEM_PROMPT,
build_agent_user_message,
extract_prompt,
)
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description="Prompt Golf eval harness")
p.add_argument("--agent-model", default="Qwen/Qwen3-1.7B")
p.add_argument("--adapter", default=None,
help="Optional LoRA adapter dir or HF repo id.")
p.add_argument("--target-model", default="meta-llama/Llama-3.2-3B-Instruct")
p.add_argument("--tasks", default="all",
help="'all' or comma-separated task ids.")
p.add_argument("--seeds-per-task", type=int, default=1,
help="At temperature=0.0 the agent is deterministic and "
"the env's test slice is fixed, so seeds>1 produces "
"bit-identical duplicate rows. Keep at 1 unless "
"running with temperature>0.")
p.add_argument("--output-json", default="outputs/eval_results.jsonl")
p.add_argument("--label", default="base",
help="Label to tag this eval run (e.g. 'base', 'trained').")
p.add_argument("--max-new-tokens", type=int, default=768,
help="Bumped from 256 to fit Qwen3's ... "
"block (200-600 tokens) plus the final prompt. "
"Drop back to 256 if running with thinking=OFF.")
p.add_argument("--enable-thinking", action="store_true", default=True,
help="Apply Qwen3 chat template with thinking ON. "
"Default. Use --no-enable-thinking when evaluating "
"an adapter that was TRAINED with thinking=False.")
p.add_argument("--no-enable-thinking", dest="enable_thinking",
action="store_false")
p.add_argument("--temperature", type=float, default=0.0)
p.add_argument("--push-to-hub", default=None,
help="HF model repo id to upload the eval JSONL under evals/eval_