#!/usr/bin/env bash
#
# Run a zero-shot baseline evaluation and print a per-split metric summary.
#
# Usage:
#   <this-script> [OUT_DIR]
#
# Arguments:
#   $1  Output directory (default: outputs/baselines/qwen3-8b-zero-shot).
#
# Environment overrides (all optional):
#   BASELINE_MODEL           model id passed to --model (default: Qwen/Qwen3-8B)
#   BASELINE_MAX_SAMPLES     --max_samples_per_split    (default: 200)
#   EVAL_BATCH_SIZE          --batch_size               (default: 4)
#   EVAL_MAX_NEW_TOKENS      --max_new_tokens           (default: 1536)
#   EVAL_GOLD_LENGTH_BUFFER  --gold_length_buffer       (default: 96)
#   EVAL_SAVE_EVERY          --save_every               (default: 25)
#   CUDA_VISIBLE_DEVICES     GPU selection              (default: 0)
#
# All paths (.venv/, scripts/, src/) are resolved relative to the current
# working directory, so invoke this from the directory that contains them.
set -euo pipefail

OUT_DIR="${1:-outputs/baselines/qwen3-8b-zero-shot}"
MODEL_ID="${BASELINE_MODEL:-Qwen/Qwen3-8B}"
MAX_SAMPLES="${BASELINE_MAX_SAMPLES:-200}"

# Activate the project virtualenv and set up the environment for the
# Python processes launched below.
# shellcheck source=/dev/null
source .venv/bin/activate
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false

# Under `set -e`, a failing check aborts the run before any evaluation starts.
python scripts/check_gpu.py
mkdir -p "$OUT_DIR"

# Evaluate the model; results are written under OUT_DIR.
python scripts/evaluate_model.py \
  --model "$MODEL_ID" \
  --dataset nraptisss/TMF921-intent-to-config-research-sota \
  --output_dir "$OUT_DIR" \
  --batch_size "${EVAL_BATCH_SIZE:-4}" \
  --max_samples_per_split "$MAX_SAMPLES" \
  --max_new_tokens "${EVAL_MAX_NEW_TOKENS:-1536}" \
  --gold_length_buffer "${EVAL_GOLD_LENGTH_BUFFER:-96}" \
  --save_every "${EVAL_SAVE_EVERY:-25}"

# The summary below reads all_normalized_metrics.json from OUT_DIR
# (presumably produced by this step -- confirm against the script).
python scripts/normalize_eval_metrics.py --eval_dir "$OUT_DIR"

# Print a one-line summary per split.
# The heredoc delimiter is quoted so the shell performs no expansion inside
# the Python source; OUT_DIR and MODEL_ID are passed as argv instead of being
# spliced into string literals (which broke on quotes or backslashes).
python - "$OUT_DIR" "$MODEL_ID" <<'PY'
import json
import sys
from pathlib import Path

out_dir, model_id = sys.argv[1], sys.argv[2]


def fmt(value):
    # A missing metric comes back from .get() as None, which cannot be
    # formatted with ".4f"; show a placeholder instead of crashing.
    return "n/a" if value is None else f"{value:.4f}"


p = Path(out_dir) / "all_normalized_metrics.json"
m = json.loads(p.read_text())
print("Zero-shot baseline:", model_id)
for split, s in m.items():
    print(
        f"{split}: n={s.get('num_examples')} "
        f"parse={fmt(s.get('parse_json'))} "
        f"norm_field_f1={fmt(s.get('norm_field_f1'))} "
        f"norm_key_f1={fmt(s.get('norm_key_f1'))} "
        f"norm_exact={fmt(s.get('norm_exact_match'))}"
    )
PY