# Carnice-V2-27b / benchmarks / scripts / qwen36_short_ab_queue.sh
# Uploaded by kai-os via the upload-large-folder tool (commit 31a7782, verified).
# NOTE: these header lines are upload-page residue; kept as comments so the
# script remains executable (invoke with `bash qwen36_short_ab_queue.sh`).
#!/usr/bin/env bash
set -euo pipefail
ROOT=/home/ubuntu/hermes-glm5-stagea-pilot
MODEL_VENV=/home/ubuntu/qwen36-stagea-venv
BFCL_VENV=/home/ubuntu/bfcl-venv
BFCL_DIR=$ROOT/bfcl-eval-src/berkeley-function-call-leaderboard
ADAPTER_DIR=$ROOT/outputs/qwen36_carnice_direct_v1b_lora_8192_split_200step/adapter
LOGDIR=$ROOT/benchmarks/logs
STAMP=$(date -u +%Y%m%d_%H%M%S)
RUN_NAME=qwen36_short_public_ab_$STAMP
PORT=8030
API_KEY=local-key
IFEVAL_LIMIT=${IFEVAL_LIMIT:-20}
mkdir -p "$LOGDIR/$RUN_NAME" "$ROOT/benchmarks/$RUN_NAME"
exec > >(tee -a "$LOGDIR/$RUN_NAME/driver.log") 2>&1
echo "run_name=$RUN_NAME"
echo "started=$(date -u --iso-8601=seconds)"
echo "ifeval_limit=$IFEVAL_LIMIT"
cleanup() {
tmux kill-session -t qwen36_short_ab_server 2>/dev/null || true
}
trap cleanup EXIT
wait_for_server() {
for _ in $(seq 1 240); do
if curl -fsS "http://127.0.0.1:$PORT/health" >/dev/null 2>&1; then
return 0
fi
sleep 2
done
echo "server did not become healthy" >&2
return 1
}
start_server() {
local label="$1"
local served_name="$2"
local adapter_arg=()
cleanup
if [[ "$label" == "adapter" ]]; then
adapter_arg=(--adapter-dir "$ADAPTER_DIR")
fi
local server_log="$LOGDIR/$RUN_NAME/server_${label}.log"
tmux new-session -d -s qwen36_short_ab_server \
"source $MODEL_VENV/bin/activate && cd $ROOT && CUDA_VISIBLE_DEVICES=0 python serve_qwen35_hermes_openai.py --repo-root $ROOT --base-model Qwen/Qwen3.6-27B ${adapter_arg[*]} --served-model-name $served_name --host 127.0.0.1 --port $PORT --api-key $API_KEY --max-new-tokens 512 --temperature 0.0 --precision bf16 > $server_log 2>&1"
wait_for_server
echo "server_${label}_ready=$(date -u --iso-8601=seconds)"
grep -E "MODEL_LOADER|LORA_ATTACHMENT_SUMMARY" "$server_log" || true
}
write_bfcl_subset() {
cd "$BFCL_DIR"
python3 - <<'PY'
import json
from pathlib import Path
subset = {
"multi_turn_base": [
"multi_turn_base_0",
"multi_turn_base_1",
],
}
Path("test_case_ids_to_generate.json").write_text(
json.dumps(subset, indent=2) + "\n",
encoding="utf-8",
)
PY
cp test_case_ids_to_generate.json "$LOGDIR/$RUN_NAME/bfcl_test_case_ids_to_generate.json"
}
run_bfcl_model() {
local registry="$1"
local label="$2"
source "$BFCL_VENV/bin/activate"
cd "$BFCL_DIR"
export BFCL_PROJECT_ROOT="$BFCL_DIR"
export REMOTE_OPENAI_BASE_URL="http://127.0.0.1:$PORT/v1"
export REMOTE_OPENAI_API_KEY="$API_KEY"
export REMOTE_OPENAI_TOKENIZER_PATH="Qwen/Qwen3.6-27B"
export LOCAL_SERVER_ENDPOINT=127.0.0.1
export LOCAL_SERVER_PORT=$PORT
bfcl generate \
--model "$registry" \
--run-ids \
--skip-server-setup \
--include-input-log \
--allow-overwrite \
--num-threads 1 \
--temperature 0.0 \
--result-dir "result_$RUN_NAME" \
> "$LOGDIR/$RUN_NAME/bfcl_generate_${label}.log" 2>&1
bfcl evaluate \
--model "$registry" \
--test-category multi_turn_base \
--partial-eval \
--result-dir "result_$RUN_NAME" \
--score-dir "score_$RUN_NAME" \
> "$LOGDIR/$RUN_NAME/bfcl_evaluate_${label}.log" 2>&1
}
run_bfcl_ab() {
echo "bfcl_start=$(date -u --iso-8601=seconds)"
write_bfcl_subset
start_server adapter qwen36-carnice-v1-local
run_bfcl_model qwen36-carnice-v1-local-FC adapter
start_server base qwen36-base-local
run_bfcl_model qwen36-base-local-FC base
echo "bfcl_done=$(date -u --iso-8601=seconds)"
}
run_ifeval_model() {
local label="$1"
local model_args="$2"
local out="$ROOT/benchmarks/$RUN_NAME/ifeval_${label}"
local log="$LOGDIR/$RUN_NAME/ifeval_${label}.log"
source "$MODEL_VENV/bin/activate"
cd "$ROOT"
export TOKENIZERS_PARALLELISM=false
CUDA_VISIBLE_DEVICES=0 lm_eval \
--model hf \
--model_args "$model_args" \
--tasks ifeval \
--batch_size 1 \
--apply_chat_template \
--limit "$IFEVAL_LIMIT" \
--output_path "$out" \
--log_samples \
> "$log" 2>&1
}
run_ifeval_ab() {
echo "ifeval_start=$(date -u --iso-8601=seconds)"
run_ifeval_model adapter \
"pretrained=Qwen/Qwen3.6-27B,peft=$ADAPTER_DIR,trust_remote_code=True,dtype=bfloat16,enable_thinking=False"
run_ifeval_model base \
"pretrained=Qwen/Qwen3.6-27B,trust_remote_code=True,dtype=bfloat16,enable_thinking=False"
echo "ifeval_done=$(date -u --iso-8601=seconds)"
}
summarize() {
source "$MODEL_VENV/bin/activate" || true
cd "$ROOT"
python3 - <<'PY'
import csv
import json
from pathlib import Path
root = Path("/home/ubuntu/hermes-glm5-stagea-pilot")
run = sorted((root / "benchmarks" / "logs").glob("qwen36_short_public_ab_*"))[-1].name
bench = root / "benchmarks" / run
bfcl = root / "bfcl-eval-src/berkeley-function-call-leaderboard" / f"score_{run}"
summary = {
"run_name": run,
"training_format_validation": json.loads((root / "benchmarks/qwen36_carnice_benchmark_summary_20260425.json").read_text()).get("training_format_validation"),
"bfcl": {},
"ifeval": {},
}
overall = bfcl / "data_overall.csv"
if overall.exists():
with overall.open(newline="", encoding="utf-8") as f:
summary["bfcl"]["overall_rows"] = list(csv.DictReader(f))
for score in sorted(bfcl.glob("**/*_score.json")):
rel = str(score.relative_to(bfcl))
try:
lines = [json.loads(line) for line in score.read_text().splitlines() if line.strip()]
summary["bfcl"][rel] = lines if len(lines) != 1 else lines[0]
except Exception as exc:
summary["bfcl"][rel] = {"error": str(exc), "raw": score.read_text()[:1000]}
for label in ["adapter", "base"]:
for result_file in (bench / f"ifeval_{label}").glob("**/results_*.json"):
try:
summary["ifeval"][label] = json.loads(result_file.read_text())
except Exception as exc:
summary["ifeval"][label] = {"error": str(exc), "path": str(result_file)}
out = bench / "summary.json"
out.write_text(json.dumps(summary, indent=2) + "\n", encoding="utf-8")
print(out)
PY
}
run_bfcl_ab
cleanup
run_ifeval_ab
summarize
echo "completed=$(date -u --iso-8601=seconds)"