#!/usr/bin/env bash
#
# Short public A/B benchmark driver: LoRA adapter vs. base model.
#
# Flow (see the calls at the bottom of the file):
#   1. BFCL: write a two-case multi_turn_base subset, then for each arm
#      (adapter, base) start the local OpenAI-compatible server in a detached
#      tmux session and run `bfcl generate` + `bfcl evaluate` against it.
#   2. IFEval: run `lm_eval --tasks ifeval` for each arm directly (no server).
#   3. Summarize: collect BFCL scores and IFEval results for THIS run into
#      $ROOT/benchmarks/$RUN_NAME/summary.json.
#
# Environment:
#   IFEVAL_LIMIT  value passed to `lm_eval --limit` (default: 20)
#
# All stdout/stderr of this driver is tee'd to $LOGDIR/$RUN_NAME/driver.log.
set -euo pipefail

ROOT=/home/ubuntu/hermes-glm5-stagea-pilot
MODEL_VENV=/home/ubuntu/qwen36-stagea-venv
BFCL_VENV=/home/ubuntu/bfcl-venv
BFCL_DIR=$ROOT/bfcl-eval-src/berkeley-function-call-leaderboard
ADAPTER_DIR=$ROOT/outputs/qwen36_carnice_direct_v1b_lora_8192_split_200step/adapter
LOGDIR=$ROOT/benchmarks/logs
STAMP=$(date -u +%Y%m%d_%H%M%S)
RUN_NAME=qwen36_short_public_ab_$STAMP
PORT=8030
API_KEY=local-key
IFEVAL_LIMIT=${IFEVAL_LIMIT:-20}
# Name of the detached tmux session that hosts the model server.
SERVER_SESSION=qwen36_short_ab_server

mkdir -p "$LOGDIR/$RUN_NAME" "$ROOT/benchmarks/$RUN_NAME"
exec > >(tee -a "$LOGDIR/$RUN_NAME/driver.log") 2>&1

echo "run_name=$RUN_NAME"
echo "started=$(date -u --iso-8601=seconds)"
echo "ifeval_limit=$IFEVAL_LIMIT"

#######################################
# Kill the model-server tmux session if it exists.
# Best-effort by design: a missing session is not an error.
#######################################
cleanup() {
  tmux kill-session -t "$SERVER_SESSION" 2>/dev/null || true
}
trap cleanup EXIT

#######################################
# Wait until nothing answers the health endpoint on $PORT any more, so a
# server from the previous arm cannot be mistaken for the one about to start.
# Returns: 0 once the probe fails, 1 if it still succeeds after 30 tries.
#######################################
wait_for_port_free() {
  local attempt
  for ((attempt = 0; attempt < 30; attempt++)); do
    if ! curl -fsS --max-time 5 "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  echo "a previous server is still answering on port $PORT" >&2
  return 1
}

#######################################
# Poll the server health endpoint (up to 240 polls, 2 s apart).
# Each probe is bounded with --max-time so an accepted-but-silent connection
# cannot block the driver. Gives up early when the tmux session hosting the
# server no longer exists, since the server cannot become healthy after that.
# Returns: 0 when healthy, 1 on early exit of the session or on timeout.
#######################################
wait_for_server() {
  local attempt
  for ((attempt = 0; attempt < 240; attempt++)); do
    if curl -fsS --max-time 5 "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then
      return 0
    fi
    if ! tmux has-session -t "$SERVER_SESSION" 2>/dev/null; then
      echo "server session exited before becoming healthy" >&2
      return 1
    fi
    sleep 2
  done
  echo "server did not become healthy" >&2
  return 1
}

#######################################
# (Re)start the model server in a detached tmux session and wait for it.
# Arguments:
#   $1 - arm label; "adapter" adds --adapter-dir "$ADAPTER_DIR", anything
#        else serves the base model only. Also used in the log file name.
#   $2 - value for --served-model-name
# Outputs: readiness timestamp plus any MODEL_LOADER / LORA_ATTACHMENT_SUMMARY
#          lines found in the server log.
# Returns: non-zero if the server never becomes healthy.
#######################################
start_server() {
  local label="$1"
  local served_name="$2"
  local server_log="$LOGDIR/$RUN_NAME/server_${label}.log"

  # Build the argv as an array so every argument can be shell-quoted exactly
  # once when it is embedded in the command string handed to tmux.
  local -a server_cmd=(
    python serve_qwen35_hermes_openai.py
    --repo-root "$ROOT"
    --base-model Qwen/Qwen3.6-27B
  )
  if [[ "$label" == "adapter" ]]; then
    server_cmd+=(--adapter-dir "$ADAPTER_DIR")
  fi
  server_cmd+=(
    --served-model-name "$served_name"
    --host 127.0.0.1
    --port "$PORT"
    --api-key "$API_KEY"
    --max-new-tokens 512
    --temperature 0.0
    --precision bf16
  )

  cleanup
  wait_for_port_free

  local quoted_cmd tmux_cmd
  printf -v quoted_cmd '%q ' "${server_cmd[@]}"
  printf -v tmux_cmd 'source %q && cd %q && CUDA_VISIBLE_DEVICES=0 %s> %q 2>&1' \
    "$MODEL_VENV/bin/activate" "$ROOT" "$quoted_cmd" "$server_log"
  tmux new-session -d -s "$SERVER_SESSION" "$tmux_cmd"

  if ! wait_for_server; then
    # Surface the reason for the failure in the driver log.
    echo "--- tail of $server_log ---" >&2
    tail -n 40 "$server_log" >&2 || true
    return 1
  fi
  echo "server_${label}_ready=$(date -u --iso-8601=seconds)"
  grep -E "MODEL_LOADER|LORA_ATTACHMENT_SUMMARY" "$server_log" || true
}

#######################################
# Write test_case_ids_to_generate.json (two multi_turn_base cases) into
# $BFCL_DIR and keep a copy next to this run's logs.
# NOTE(review): presumably this is the file `bfcl generate --run-ids` reads;
# confirm against the BFCL documentation.
#######################################
write_bfcl_subset() {
  cd "$BFCL_DIR"
  python3 - <<'PY'
import json
from pathlib import Path

subset = {
    "multi_turn_base": [
        "multi_turn_base_0",
        "multi_turn_base_1",
    ],
}
Path("test_case_ids_to_generate.json").write_text(
    json.dumps(subset, indent=2) + "\n",
    encoding="utf-8",
)
PY
  cp test_case_ids_to_generate.json "$LOGDIR/$RUN_NAME/bfcl_test_case_ids_to_generate.json"
}

#######################################
# Run `bfcl generate` then `bfcl evaluate` for one arm against the server
# currently listening on $PORT.
# Arguments:
#   $1 - BFCL model registry name (passed to --model)
#   $2 - arm label, used only in log file names
# Side effects: activates $BFCL_VENV, changes directory to $BFCL_DIR and
#               exports the REMOTE_OPENAI_* / LOCAL_SERVER_* variables in the
#               current shell.
#######################################
run_bfcl_model() {
  local registry="$1"
  local label="$2"

  source "$BFCL_VENV/bin/activate"
  cd "$BFCL_DIR"
  export BFCL_PROJECT_ROOT="$BFCL_DIR"
  export REMOTE_OPENAI_BASE_URL="http://127.0.0.1:$PORT/v1"
  export REMOTE_OPENAI_API_KEY="$API_KEY"
  export REMOTE_OPENAI_TOKENIZER_PATH="Qwen/Qwen3.6-27B"
  export LOCAL_SERVER_ENDPOINT=127.0.0.1
  export LOCAL_SERVER_PORT="$PORT"

  bfcl generate \
    --model "$registry" \
    --run-ids \
    --skip-server-setup \
    --include-input-log \
    --allow-overwrite \
    --num-threads 1 \
    --temperature 0.0 \
    --result-dir "result_$RUN_NAME" \
    > "$LOGDIR/$RUN_NAME/bfcl_generate_${label}.log" 2>&1

  bfcl evaluate \
    --model "$registry" \
    --test-category multi_turn_base \
    --partial-eval \
    --result-dir "result_$RUN_NAME" \
    --score-dir "score_$RUN_NAME" \
    > "$LOGDIR/$RUN_NAME/bfcl_evaluate_${label}.log" 2>&1
}

#######################################
# BFCL A/B: adapter arm first, then base arm, restarting the server between.
#######################################
run_bfcl_ab() {
  echo "bfcl_start=$(date -u --iso-8601=seconds)"
  write_bfcl_subset
  start_server adapter qwen36-carnice-v1-local
  run_bfcl_model qwen36-carnice-v1-local-FC adapter
  start_server base qwen36-base-local
  run_bfcl_model qwen36-base-local-FC base
  echo "bfcl_done=$(date -u --iso-8601=seconds)"
}

#######################################
# Run the IFEval task through lm_eval for one arm.
# Arguments:
#   $1 - arm label, used in the output directory and log file names
#   $2 - string passed verbatim to `lm_eval --model_args`
# Side effects: activates $MODEL_VENV, changes directory to $ROOT and exports
#               TOKENIZERS_PARALLELISM=false in the current shell.
#######################################
run_ifeval_model() {
  local label="$1"
  local model_args="$2"
  local out="$ROOT/benchmarks/$RUN_NAME/ifeval_${label}"
  local log="$LOGDIR/$RUN_NAME/ifeval_${label}.log"

  source "$MODEL_VENV/bin/activate"
  cd "$ROOT"
  export TOKENIZERS_PARALLELISM=false

  CUDA_VISIBLE_DEVICES=0 lm_eval \
    --model hf \
    --model_args "$model_args" \
    --tasks ifeval \
    --batch_size 1 \
    --apply_chat_template \
    --limit "$IFEVAL_LIMIT" \
    --output_path "$out" \
    --log_samples \
    > "$log" 2>&1
}

#######################################
# IFEval A/B: adapter arm (peft=$ADAPTER_DIR) first, then base arm.
#######################################
run_ifeval_ab() {
  echo "ifeval_start=$(date -u --iso-8601=seconds)"
  run_ifeval_model adapter \
    "pretrained=Qwen/Qwen3.6-27B,peft=$ADAPTER_DIR,trust_remote_code=True,dtype=bfloat16,enable_thinking=False"
  run_ifeval_model base \
    "pretrained=Qwen/Qwen3.6-27B,trust_remote_code=True,dtype=bfloat16,enable_thinking=False"
  echo "ifeval_done=$(date -u --iso-8601=seconds)"
}

#######################################
# Collect this run's BFCL scores and IFEval results into
# $ROOT/benchmarks/$RUN_NAME/summary.json and print that path.
# The run name and directories are handed to Python through the environment
# so the summary always describes THIS run rather than whichever run
# directory happens to sort last.
#######################################
summarize() {
  # Activation is best-effort here: only the Python standard library is used.
  source "$MODEL_VENV/bin/activate" || true
  cd "$ROOT"
  AB_ROOT="$ROOT" AB_RUN_NAME="$RUN_NAME" AB_BFCL_DIR="$BFCL_DIR" python3 - <<'PY'
import csv
import json
import os
import sys
from pathlib import Path

root = Path(os.environ["AB_ROOT"])
run = os.environ["AB_RUN_NAME"]
bench = root / "benchmarks" / run
bfcl = Path(os.environ["AB_BFCL_DIR"]) / f"score_{run}"

# The earlier benchmark summary is optional context; a missing or malformed
# file must not discard the results that were just produced.
validation_path = root / "benchmarks/qwen36_carnice_benchmark_summary_20260425.json"
try:
    training_format_validation = json.loads(validation_path.read_text()).get(
        "training_format_validation"
    )
except (OSError, ValueError, AttributeError) as exc:
    print(f"warning: could not read {validation_path}: {exc}", file=sys.stderr)
    training_format_validation = None

summary = {
    "run_name": run,
    "training_format_validation": training_format_validation,
    "bfcl": {},
    "ifeval": {},
}

overall = bfcl / "data_overall.csv"
if overall.exists():
    with overall.open(newline="", encoding="utf-8") as f:
        summary["bfcl"]["overall_rows"] = list(csv.DictReader(f))

for score in sorted(bfcl.glob("**/*_score.json")):
    rel = str(score.relative_to(bfcl))
    try:
        # Score files are parsed line by line (one JSON document per line).
        lines = [json.loads(line) for line in score.read_text().splitlines() if line.strip()]
        summary["bfcl"][rel] = lines if len(lines) != 1 else lines[0]
    except Exception as exc:
        summary["bfcl"][rel] = {"error": str(exc), "raw": score.read_text()[:1000]}

for label in ["adapter", "base"]:
    # Sorted so that, if several result files exist, the winner is deterministic
    # (the lexicographically last path).
    for result_file in sorted((bench / f"ifeval_{label}").glob("**/results_*.json")):
        try:
            summary["ifeval"][label] = json.loads(result_file.read_text())
        except Exception as exc:
            summary["ifeval"][label] = {"error": str(exc), "path": str(result_file)}

out = bench / "summary.json"
out.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8")
print(out)
PY
}

run_bfcl_ab
# Free the GPU before lm_eval loads the model in-process.
cleanup
run_ifeval_ab
summarize
echo "completed=$(date -u --iso-8601=seconds)"