#!/usr/bin/env bash
set -euo pipefail

# --- Run configuration -------------------------------------------------------
# Filesystem layout: project checkout, the two virtualenvs, the BFCL checkout
# inside the project, and the LoRA adapter under evaluation.
ROOT="/home/ubuntu/hermes-glm5-stagea-pilot"
MODEL_VENV="/home/ubuntu/qwen36-stagea-venv"
BFCL_VENV="/home/ubuntu/bfcl-venv"
BFCL_DIR="${ROOT}/bfcl-eval-src/berkeley-function-call-leaderboard"
ADAPTER_DIR="${ROOT}/outputs/qwen36_carnice_direct_v1b_lora_8192_split_200step/adapter"
LOGDIR="${ROOT}/benchmarks/logs"

# Each invocation gets a unique name derived from the UTC start time.
STAMP="$(date -u +%Y%m%d_%H%M%S)"
RUN_NAME="qwen36_short_public_ab_${STAMP}"

# Local inference server settings.
PORT="8030"
API_KEY="local-key"

# Value handed to lm_eval --limit; override via the environment (default 20).
: "${IFEVAL_LIMIT:=20}"

# Create the per-run log directory and the per-run benchmark output directory.
mkdir -p "$LOGDIR/$RUN_NAME" "$ROOT/benchmarks/$RUN_NAME"

# From here on, everything written to stdout or stderr is also appended to the
# run's driver.log (stderr is merged into stdout before the tee).
exec > >(tee -a "$LOGDIR/$RUN_NAME/driver.log") 2>&1

# Run banner: one key=value line per fact.
printf '%s\n' \
  "run_name=$RUN_NAME" \
  "started=$(date -u --iso-8601=seconds)" \
  "ifeval_limit=$IFEVAL_LIMIT"

#######################################
# Tear down the tmux session that hosts the model server.
# Best effort: a missing session (or missing tmux) is not an error, and the
# function always returns 0 so it is safe under `set -e` and in the EXIT trap.
#######################################
cleanup() {
  if ! tmux kill-session -t qwen36_short_ab_server 2>/dev/null; then
    : # nothing to kill; deliberately ignored
  fi
}

# Make sure the server session never outlives this script.
trap cleanup EXIT

#######################################
# Poll the local server's /health endpoint until it answers successfully.
# Globals:   PORT (read)
# Arguments: none
# Outputs:   a one-line reason on stderr when giving up
# Returns:   0 as soon as a probe succeeds; 1 if the tmux session hosting the
#            server has disappeared, or after 240 unsuccessful probes
#######################################
wait_for_server() {
  local -r max_attempts=240
  local -r poll_seconds=2
  local attempt

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # --max-time bounds each probe so a socket that accepts the connection but
    # never replies cannot stall the loop indefinitely.
    if curl -fsS --max-time 5 "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then
      return 0
    fi
    # The server runs inside this tmux session; once the session is gone the
    # server process has exited, so further polling can never succeed.
    if ! tmux has-session -t qwen36_short_ab_server 2>/dev/null; then
      echo "server session exited before becoming healthy" >&2
      return 1
    fi
    sleep "$poll_seconds"
  done
  echo "server did not become healthy" >&2
  return 1
}

#######################################
# (Re)start the model server in a detached tmux session and wait for it to
# report healthy.
# Globals:   ROOT, MODEL_VENV, ADAPTER_DIR, LOGDIR, RUN_NAME, PORT, API_KEY
#            (all read)
# Arguments: $1 - label; the value "adapter" adds --adapter-dir "$ADAPTER_DIR",
#                 any other value starts the server without an adapter. The
#                 label also names the server log file.
#            $2 - value passed to the server as --served-model-name
# Outputs:   server_<label>_ready=<timestamp> and any MODEL_LOADER /
#            LORA_ATTACHMENT_SUMMARY lines found in the server log on stdout;
#            on start-up failure, the tail of the server log on stderr
# Returns:   0 once the server is healthy, 1 if wait_for_server gives up
#######################################
start_server() {
  local label="$1"
  local served_name="$2"
  local server_log="$LOGDIR/$RUN_NAME/server_${label}.log"
  local -a server_cmd
  local quoted_cmd launch

  # Only one server session may exist at a time; drop any previous one.
  cleanup

  # Build the server command as an argv array so every value stays one word.
  server_cmd=(
    python serve_qwen35_hermes_openai.py
    --repo-root "$ROOT"
    --base-model Qwen/Qwen3.6-27B
  )
  if [[ "$label" == "adapter" ]]; then
    server_cmd+=(--adapter-dir "$ADAPTER_DIR")
  fi
  server_cmd+=(
    --served-model-name "$served_name"
    --host 127.0.0.1
    --port "$PORT"
    --api-key "$API_KEY"
    --max-new-tokens 512
    --temperature 0.0
    --precision bf16
  )

  # %q re-quotes each word for bash, so paths containing spaces or shell
  # metacharacters survive the trip through `bash -c` unchanged.
  printf -v quoted_cmd '%q ' "${server_cmd[@]}"
  printf -v launch 'source %q && cd %q && CUDA_VISIBLE_DEVICES=0 %s> %q 2>&1' \
    "$MODEL_VENV/bin/activate" "$ROOT" "$quoted_cmd" "$server_log"

  # Run the launch line under an explicit bash: it uses `source` and bash-style
  # quoting, so it must not depend on which shell tmux would pick on its own.
  tmux new-session -d -s qwen36_short_ab_server bash -c "$launch"

  if ! wait_for_server; then
    # Surface the reason in the driver log instead of leaving it buried in the
    # per-server log file.
    echo "server_${label}_failed=$(date -u --iso-8601=seconds)" >&2
    tail -n 40 "$server_log" >&2 || true
    return 1
  fi
  echo "server_${label}_ready=$(date -u --iso-8601=seconds)"
  # Informational only: the markers may legitimately be absent.
  grep -E "MODEL_LOADER|LORA_ATTACHMENT_SUMMARY" "$server_log" || true
}

#######################################
# Write the BFCL test-case id selection file into the BFCL checkout and keep a
# copy of it with this run's logs.
# Globals:   BFCL_DIR, LOGDIR, RUN_NAME (read)
# Arguments: none
# Outputs:   $BFCL_DIR/test_case_ids_to_generate.json and
#            $LOGDIR/$RUN_NAME/bfcl_test_case_ids_to_generate.json
# Note:      leaves the shell's working directory at $BFCL_DIR.
#######################################
write_bfcl_subset() {
  local ids_file=test_case_ids_to_generate.json
  local archived_copy="$LOGDIR/$RUN_NAME/bfcl_test_case_ids_to_generate.json"

  cd "$BFCL_DIR"
  # Quoted heredoc delimiter: the Python source is passed through verbatim.
  python3 - <<'PY'
import json

case_ids = ["multi_turn_base_0", "multi_turn_base_1"]
selection = {"multi_turn_base": case_ids}

with open("test_case_ids_to_generate.json", "w", encoding="utf-8") as handle:
    handle.write(json.dumps(selection, indent=2))
    handle.write("\n")
PY
  cp "$ids_file" "$archived_copy"
}

#######################################
# Run `bfcl generate` followed by `bfcl evaluate` for one model against the
# already-running local server.
# Globals:   BFCL_VENV, BFCL_DIR, PORT, API_KEY, LOGDIR, RUN_NAME (read);
#            exports BFCL_PROJECT_ROOT, REMOTE_OPENAI_BASE_URL,
#            REMOTE_OPENAI_API_KEY, REMOTE_OPENAI_TOKENIZER_PATH,
#            LOCAL_SERVER_ENDPOINT, LOCAL_SERVER_PORT
# Arguments: $1 - model name passed to bfcl --model
#            $2 - label used in the two log file names
# Outputs:   $LOGDIR/$RUN_NAME/bfcl_generate_<label>.log and
#            $LOGDIR/$RUN_NAME/bfcl_evaluate_<label>.log
# Note:      activates the BFCL virtualenv in the current shell and leaves the
#            working directory at $BFCL_DIR.
#######################################
run_bfcl_model() {
  local registry="$1"
  local label="$2"
  local result_dir="result_$RUN_NAME"
  local score_dir="score_$RUN_NAME"
  local log_prefix="$LOGDIR/$RUN_NAME/bfcl"
  local -a generate_args evaluate_args

  source "$BFCL_VENV/bin/activate"
  cd "$BFCL_DIR"

  # Environment read by bfcl; points it at the local server.
  export BFCL_PROJECT_ROOT="$BFCL_DIR"
  export REMOTE_OPENAI_BASE_URL="http://127.0.0.1:$PORT/v1"
  export REMOTE_OPENAI_API_KEY="$API_KEY"
  export REMOTE_OPENAI_TOKENIZER_PATH="Qwen/Qwen3.6-27B"
  export LOCAL_SERVER_ENDPOINT=127.0.0.1
  export LOCAL_SERVER_PORT=$PORT

  # --run-ids presumably restricts generation to the ids in
  # test_case_ids_to_generate.json; confirm against the bfcl CLI docs.
  generate_args=(
    --model "$registry"
    --run-ids
    --skip-server-setup
    --include-input-log
    --allow-overwrite
    --num-threads 1
    --temperature 0.0
    --result-dir "$result_dir"
  )
  bfcl generate "${generate_args[@]}" \
    > "${log_prefix}_generate_${label}.log" 2>&1

  evaluate_args=(
    --model "$registry"
    --test-category multi_turn_base
    --partial-eval
    --result-dir "$result_dir"
    --score-dir "$score_dir"
  )
  bfcl evaluate "${evaluate_args[@]}" \
    > "${log_prefix}_evaluate_${label}.log" 2>&1
}

#######################################
# BFCL A/B pass: write the test-case subset once, then for the adapter model
# and the base model in turn, start a server and run the BFCL benchmark.
# Arguments: none
# Outputs:   bfcl_start=<timestamp> and bfcl_done=<timestamp> on stdout
#######################################
run_bfcl_ab() {
  local variant label served_name

  printf 'bfcl_start=%s\n' "$(date -u --iso-8601=seconds)"
  write_bfcl_subset

  # Each entry is <label>:<served model name>; the name handed to bfcl is the
  # served name with an -FC suffix. Adapter first, then base.
  for variant in adapter:qwen36-carnice-v1-local base:qwen36-base-local; do
    label=${variant%%:*}
    served_name=${variant#*:}
    start_server "$label" "$served_name"
    run_bfcl_model "${served_name}-FC" "$label"
  done

  printf 'bfcl_done=%s\n' "$(date -u --iso-8601=seconds)"
}

#######################################
# Run the lm_eval `ifeval` task for one model configuration.
# Globals:   ROOT, RUN_NAME, LOGDIR, MODEL_VENV, IFEVAL_LIMIT (read);
#            exports TOKENIZERS_PARALLELISM=false
# Arguments: $1 - label used in the output directory and log file names
#            $2 - string passed verbatim to lm_eval --model_args
# Outputs:   results under $ROOT/benchmarks/$RUN_NAME/ifeval_<label>; all
#            lm_eval output in $LOGDIR/$RUN_NAME/ifeval_<label>.log
# Note:      activates the model virtualenv in the current shell and leaves
#            the working directory at $ROOT.
#######################################
run_ifeval_model() {
  local label="$1"
  local model_args="$2"
  local out="$ROOT/benchmarks/$RUN_NAME/ifeval_${label}"
  local log="$LOGDIR/$RUN_NAME/ifeval_${label}.log"
  local -a eval_args

  source "$MODEL_VENV/bin/activate"
  cd "$ROOT"
  export TOKENIZERS_PARALLELISM=false

  eval_args=(
    --model hf
    --model_args "$model_args"
    --tasks ifeval
    --batch_size 1
    --apply_chat_template
    --limit "$IFEVAL_LIMIT"
    --output_path "$out"
    --log_samples
  )
  # GPU 0 only, matching the server launch.
  CUDA_VISIBLE_DEVICES=0 lm_eval "${eval_args[@]}" > "$log" 2>&1
}

#######################################
# IFEval A/B pass: evaluate the adapter model, then the base model. The two
# --model_args strings are identical except that the adapter one also carries
# peft=$ADAPTER_DIR.
# Globals:   ADAPTER_DIR (read)
# Arguments: none
# Outputs:   ifeval_start=<timestamp> and ifeval_done=<timestamp> on stdout
#######################################
run_ifeval_ab() {
  local base_model="pretrained=Qwen/Qwen3.6-27B"
  local shared_opts="trust_remote_code=True,dtype=bfloat16,enable_thinking=False"

  printf 'ifeval_start=%s\n' "$(date -u --iso-8601=seconds)"

  run_ifeval_model adapter "${base_model},peft=${ADAPTER_DIR},${shared_opts}"
  run_ifeval_model base "${base_model},${shared_opts}"

  printf 'ifeval_done=%s\n' "$(date -u --iso-8601=seconds)"
}

#######################################
# Collect this run's BFCL scores and IFEval results into one summary.json.
# Globals:   MODEL_VENV, ROOT, RUN_NAME, BFCL_DIR (read)
# Arguments: none
# Outputs:   writes $ROOT/benchmarks/$RUN_NAME/summary.json and prints its
#            path on stdout; warnings go to stderr
# Note:      leaves the working directory at $ROOT.
#######################################
summarize() {
  # The summary itself needs only the Python standard library, so a venv that
  # fails to activate is tolerated.
  source "$MODEL_VENV/bin/activate" || true
  cd "$ROOT"
  # The heredoc is quoted (no shell expansion), so the run identity is handed
  # to Python through the environment. This summarizes exactly this run rather
  # than whichever run directory happens to sort last.
  SUMMARY_ROOT="$ROOT" SUMMARY_RUN_NAME="$RUN_NAME" SUMMARY_BFCL_DIR="$BFCL_DIR" \
    python3 - <<'PY'
import csv
import json
import os
import sys
from pathlib import Path

root = Path(os.environ["SUMMARY_ROOT"])
run = os.environ["SUMMARY_RUN_NAME"]
bench = root / "benchmarks" / run
bfcl = Path(os.environ["SUMMARY_BFCL_DIR"]) / f"score_{run}"

# Carried over from an earlier benchmark summary. It is optional context, so a
# missing or malformed file must not discard the results gathered below.
reference = root / "benchmarks/qwen36_carnice_benchmark_summary_20260425.json"
try:
    training_format_validation = json.loads(reference.read_text()).get("training_format_validation")
except (OSError, ValueError, AttributeError) as exc:
    print(f"warning: cannot load {reference}: {exc}", file=sys.stderr)
    training_format_validation = None

summary = {
    "run_name": run,
    "training_format_validation": training_format_validation,
    "bfcl": {},
    "ifeval": {},
}

overall = bfcl / "data_overall.csv"
if overall.exists():
    with overall.open(newline="", encoding="utf-8") as f:
        summary["bfcl"]["overall_rows"] = list(csv.DictReader(f))
for score in sorted(bfcl.glob("**/*_score.json")):
    rel = str(score.relative_to(bfcl))
    try:
        # Score files are read as JSON Lines; a single-line file is unwrapped.
        lines = [json.loads(line) for line in score.read_text().splitlines() if line.strip()]
        summary["bfcl"][rel] = lines if len(lines) != 1 else lines[0]
    except Exception as exc:
        summary["bfcl"][rel] = {"error": str(exc), "raw": score.read_text()[:1000]}

for label in ["adapter", "base"]:
    # Sorted so that, if several result files exist, the choice of the one
    # that ends up in the summary (the last in path order) is deterministic.
    for result_file in sorted((bench / f"ifeval_{label}").glob("**/results_*.json")):
        try:
            summary["ifeval"][label] = json.loads(result_file.read_text())
        except Exception as exc:
            summary["ifeval"][label] = {"error": str(exc), "path": str(result_file)}

out = bench / "summary.json"
out.parent.mkdir(parents=True, exist_ok=True)
out.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8")
print(out)
PY
}

#######################################
# Entry point: BFCL A/B pass, stop the server session before the IFEval pass
# (which does not start one), IFEval A/B pass, then the combined summary.
#######################################
main() {
  run_bfcl_ab
  cleanup
  run_ifeval_ab
  summarize

  printf 'completed=%s\n' "$(date -u --iso-8601=seconds)"
}

main "$@"