#!/usr/bin/env bash
#
# Re-runs the two configs that OOM'd in the original sweep, now with the
# chunked-KL fix and PYTORCH_CUDA_ALLOC_CONF=expandable_segments in distill.py.
# (Fix landed in commit eb5278f: chunked KL with checkpointing, new
# kl_chunk_size config key; verified.)
#
# Reads HF_TOKEN, HUGGING_FACE_HUB_TOKEN, WANDB_API_KEY from the calling env.
#
# Launch with:
#   nohup ./scripts/run_sweep_rerun.sh > logs/sweep_rerun_master.log 2>&1 &
# NOTE: '-e' is deliberately omitted — a failed config must not abort the
# remaining reruns; we capture each exit code explicitly instead.
set -uo pipefail

# Run from the repo root (script lives in scripts/). Guard the cd: without
# 'set -e' a failed cd would otherwise launch jobs from the wrong directory.
cd "$(dirname "$0")/.." || { echo "cannot cd to repo root" >&2; exit 1; }

# The two configs that OOM'd in the original sweep.
CONFIGS=(
  "configs/replicate_zero4.toml"
  "configs/grow40_winning.toml"
)

LOG_DIR="logs"
mkdir -p "$LOG_DIR"

for cfg in "${CONFIGS[@]}"; do
  name="$(basename "$cfg" .toml)"
  log="$LOG_DIR/$name.log"

  echo ">>> [$(date '+%F %T')] starting $name -> $log"

  # Each run's full output goes to its own per-config log file.
  .venv/bin/accelerate launch \
    --config_file configs/accelerate.yaml \
    distill.py \
    --config "$cfg" \
    > "$log" 2>&1
  rc=$?

  echo "<<< [$(date '+%F %T')] finished $name (exit=$rc)"

  # On failure, surface the tail of the per-config log in the master log
  # so the nohup output alone is enough for a first diagnosis.
  if [[ $rc -ne 0 ]]; then
    echo "    last 20 lines of $log:"
    tail -n 20 "$log" | sed 's/^/    /'
  fi
done

echo ">>> [$(date '+%F %T')] rerun complete"