#!/usr/bin/env bash
# Short public A/B benchmark driver: runs a BFCL multi_turn_base subset and
# IFEval against a locally served model twice — once with a LoRA adapter
# attached, once with the plain base model — then writes a combined summary.
#
# Env overrides:
#   IFEVAL_LIMIT  number of IFEval examples per run (default: 20)
set -euo pipefail

# Fixed paths and run configuration; treated as constants by every function
# below, so mark them readonly to catch accidental reassignment.
readonly ROOT=/home/ubuntu/hermes-glm5-stagea-pilot
readonly MODEL_VENV=/home/ubuntu/qwen36-stagea-venv
readonly BFCL_VENV=/home/ubuntu/bfcl-venv
readonly BFCL_DIR=$ROOT/bfcl-eval-src/berkeley-function-call-leaderboard
readonly ADAPTER_DIR=$ROOT/outputs/qwen36_carnice_direct_v1b_lora_8192_split_200step/adapter
readonly LOGDIR=$ROOT/benchmarks/logs
STAMP=$(date -u +%Y%m%d_%H%M%S)
readonly STAMP
readonly RUN_NAME=qwen36_short_public_ab_$STAMP
readonly PORT=8030
readonly API_KEY=local-key
readonly IFEVAL_LIMIT=${IFEVAL_LIMIT:-20}

mkdir -p "$LOGDIR/$RUN_NAME" "$ROOT/benchmarks/$RUN_NAME"

# Mirror all driver stdout+stderr into a persistent per-run log.
exec > >(tee -a "$LOGDIR/$RUN_NAME/driver.log") 2>&1

echo "run_name=$RUN_NAME"
echo "started=$(date -u --iso-8601=seconds)"
echo "ifeval_limit=$IFEVAL_LIMIT"
|
|
# Best-effort teardown of the model-server tmux session; never fails even if
# the session (or tmux itself) is absent.
cleanup() {
  if ! tmux kill-session -t qwen36_short_ab_server 2>/dev/null; then
    :  # nothing to kill — that is fine
  fi
}
# Ensure the server session is torn down on every exit path.
trap cleanup EXIT
|
|
# Poll the local server's /health endpoint until it answers.
# Globals: PORT (read).
# Returns: 0 once healthy; 1 after 240 attempts (2s apart, ~8 minutes).
wait_for_server() {
  local attempt
  for (( attempt = 0; attempt < 240; attempt++ )); do
    curl -fsS "http://127.0.0.1:$PORT/health" >/dev/null 2>&1 && return 0
    sleep 2
  done
  echo "server did not become healthy" >&2
  return 1
}
|
|
# Launch the model server in a detached tmux session and block until healthy.
# Arguments:
#   $1 - label: "adapter" (attach the LoRA adapter) or "base"
#   $2 - model name exposed by the OpenAI-compatible API
# Globals (read): MODEL_VENV ROOT ADAPTER_DIR LOGDIR RUN_NAME PORT API_KEY
# Outputs: readiness timestamp + loader/attachment diagnostics from the log.
start_server() {
  local label="$1"
  local served_name="$2"
  # Kept as a plain string: the original empty array expanded via
  # ${adapter_arg[*]} is an "unbound variable" error under `set -u` on
  # bash <= 4.3, and the whole command line is handed to tmux as one shell
  # string anyway (paths here contain no whitespace).
  local adapter_arg=""

  cleanup  # kill any previous server session before starting a new one
  if [[ "$label" == "adapter" ]]; then
    adapter_arg="--adapter-dir $ADAPTER_DIR"
  fi

  local server_log="$LOGDIR/$RUN_NAME/server_${label}.log"
  tmux new-session -d -s qwen36_short_ab_server \
    "source $MODEL_VENV/bin/activate && cd $ROOT && CUDA_VISIBLE_DEVICES=0 python serve_qwen35_hermes_openai.py --repo-root $ROOT --base-model Qwen/Qwen3.6-27B $adapter_arg --served-model-name $served_name --host 127.0.0.1 --port $PORT --api-key $API_KEY --max-new-tokens 512 --temperature 0.0 --precision bf16 > $server_log 2>&1"
  wait_for_server
  echo "server_${label}_ready=$(date -u --iso-8601=seconds)"
  # Surface load/attachment diagnostics; absence of these lines is not fatal.
  grep -E "MODEL_LOADER|LORA_ATTACHMENT_SUMMARY" "$server_log" || true
}
|
|
# Restrict BFCL generation to two multi_turn_base cases by writing the
# subset file the harness reads, then archive a copy next to the run logs.
# Globals (read): BFCL_DIR LOGDIR RUN_NAME
write_bfcl_subset() {
  cd "$BFCL_DIR"
  # Static content, so a heredoc suffices; bytes match the JSON the previous
  # python3 json.dumps(..., indent=2) emitted.
  cat > test_case_ids_to_generate.json <<'JSON'
{
  "multi_turn_base": [
    "multi_turn_base_0",
    "multi_turn_base_1"
  ]
}
JSON
  cp test_case_ids_to_generate.json "$LOGDIR/$RUN_NAME/bfcl_test_case_ids_to_generate.json"
}
|
|
# Run BFCL generation + evaluation for one registered model against the
# locally served OpenAI-compatible endpoint.
# Arguments:
#   $1 - BFCL model registry name
#   $2 - label used to name the per-step log files
# Globals (read): BFCL_VENV BFCL_DIR PORT API_KEY LOGDIR RUN_NAME
run_bfcl_model() {
  local registry="$1"
  local label="$2"

  source "$BFCL_VENV/bin/activate"
  cd "$BFCL_DIR"
  # Point the harness at the local server.
  export BFCL_PROJECT_ROOT="$BFCL_DIR"
  export REMOTE_OPENAI_BASE_URL="http://127.0.0.1:$PORT/v1"
  export REMOTE_OPENAI_API_KEY="$API_KEY"
  export REMOTE_OPENAI_TOKENIZER_PATH="Qwen/Qwen3.6-27B"
  export LOCAL_SERVER_ENDPOINT=127.0.0.1
  export LOCAL_SERVER_PORT=$PORT

  local generate_flags=(
    --model "$registry"
    --run-ids
    --skip-server-setup
    --include-input-log
    --allow-overwrite
    --num-threads 1
    --temperature 0.0
    --result-dir "result_$RUN_NAME"
  )
  bfcl generate "${generate_flags[@]}" \
    > "$LOGDIR/$RUN_NAME/bfcl_generate_${label}.log" 2>&1

  local evaluate_flags=(
    --model "$registry"
    --test-category multi_turn_base
    --partial-eval
    --result-dir "result_$RUN_NAME"
    --score-dir "score_$RUN_NAME"
  )
  bfcl evaluate "${evaluate_flags[@]}" \
    > "$LOGDIR/$RUN_NAME/bfcl_evaluate_${label}.log" 2>&1
}
|
|
# BFCL A/B pass: adapter-backed server first, then the base model, both over
# the same generated subset.
run_bfcl_ab() {
  printf 'bfcl_start=%s\n' "$(date -u --iso-8601=seconds)"
  write_bfcl_subset

  # A: server with the LoRA adapter attached.
  start_server adapter qwen36-carnice-v1-local
  run_bfcl_model qwen36-carnice-v1-local-FC adapter

  # B: plain base model.
  start_server base qwen36-base-local
  run_bfcl_model qwen36-base-local-FC base

  printf 'bfcl_done=%s\n' "$(date -u --iso-8601=seconds)"
}
|
|
# Run lm-eval's IFEval task for one model configuration.
# Arguments:
#   $1 - label for output/log naming ("adapter" or "base")
#   $2 - lm-eval --model_args string (HF model spec)
# Globals (read): ROOT RUN_NAME LOGDIR MODEL_VENV IFEVAL_LIMIT
run_ifeval_model() {
  local label="$1"
  local model_args="$2"
  local out="$ROOT/benchmarks/$RUN_NAME/ifeval_${label}"
  local log="$LOGDIR/$RUN_NAME/ifeval_${label}.log"

  source "$MODEL_VENV/bin/activate"
  cd "$ROOT"
  export TOKENIZERS_PARALLELISM=false

  local eval_flags=(
    --model hf
    --model_args "$model_args"
    --tasks ifeval
    --batch_size 1
    --apply_chat_template
    --limit "$IFEVAL_LIMIT"
    --output_path "$out"
    --log_samples
  )
  CUDA_VISIBLE_DEVICES=0 lm_eval "${eval_flags[@]}" > "$log" 2>&1
}
|
|
# IFEval A/B pass: LoRA-adapter configuration first, then the base model.
run_ifeval_ab() {
  printf 'ifeval_start=%s\n' "$(date -u --iso-8601=seconds)"

  run_ifeval_model adapter \
    "pretrained=Qwen/Qwen3.6-27B,peft=$ADAPTER_DIR,trust_remote_code=True,dtype=bfloat16,enable_thinking=False"

  run_ifeval_model base \
    "pretrained=Qwen/Qwen3.6-27B,trust_remote_code=True,dtype=bfloat16,enable_thinking=False"

  printf 'ifeval_done=%s\n' "$(date -u --iso-8601=seconds)"
}
|
|
# Collect BFCL scores and IFEval results for this run into summary.json.
# Globals (read): MODEL_VENV ROOT RUN_NAME — root/run are handed to the
# Python summarizer via env so it no longer has to re-derive them.
summarize() {
  source "$MODEL_VENV/bin/activate" || true
  cd "$ROOT"
  # Pass the run identity explicitly; the Python side keeps the old
  # "latest log dir" glob as a fallback for standalone use.
  SUMMARY_ROOT="$ROOT" SUMMARY_RUN_NAME="$RUN_NAME" python3 - <<'PY'
import csv
import json
import os
from pathlib import Path

root = Path(os.environ.get("SUMMARY_ROOT", "/home/ubuntu/hermes-glm5-stagea-pilot"))
run = os.environ.get("SUMMARY_RUN_NAME") or sorted(
    (root / "benchmarks" / "logs").glob("qwen36_short_public_ab_*")
)[-1].name
bench = root / "benchmarks" / run
bfcl = root / "bfcl-eval-src/berkeley-function-call-leaderboard" / f"score_{run}"

# Training-format validation is optional context; a missing or unparseable
# file must not abort the whole summary (previously an uncaught exception).
try:
    training_validation = json.loads(
        (root / "benchmarks/qwen36_carnice_benchmark_summary_20260425.json").read_text()
    ).get("training_format_validation")
except Exception as exc:
    training_validation = {"error": str(exc)}

summary = {
    "run_name": run,
    "training_format_validation": training_validation,
    "bfcl": {},
    "ifeval": {},
}

# Overall leaderboard table, if the evaluator produced one.
overall = bfcl / "data_overall.csv"
if overall.exists():
    with overall.open(newline="", encoding="utf-8") as f:
        summary["bfcl"]["overall_rows"] = list(csv.DictReader(f))

# Per-category score files are JSONL; single-line files are unwrapped.
for score in sorted(bfcl.glob("**/*_score.json")):
    rel = str(score.relative_to(bfcl))
    try:
        lines = [json.loads(line) for line in score.read_text().splitlines() if line.strip()]
        summary["bfcl"][rel] = lines if len(lines) != 1 else lines[0]
    except Exception as exc:
        summary["bfcl"][rel] = {"error": str(exc), "raw": score.read_text()[:1000]}

# Latest lm-eval results file per label (glob yields nothing if a run failed).
for label in ["adapter", "base"]:
    for result_file in (bench / f"ifeval_{label}").glob("**/results_*.json"):
        try:
            summary["ifeval"][label] = json.loads(result_file.read_text())
        except Exception as exc:
            summary["ifeval"][label] = {"error": str(exc), "path": str(result_file)}

out = bench / "summary.json"
out.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8")
print(out)
PY
}
|
|
# Main sequence: BFCL A/B, explicit server teardown, IFEval A/B, summary.
run_bfcl_ab
cleanup
run_ifeval_ab
summarize

printf 'completed=%s\n' "$(date -u --iso-8601=seconds)"
|
|