Add nohup run management and resumable checkpoint support
Browse files
- README.md +50 -5
- scripts/nohup_eval.sh +55 -0
- scripts/nohup_new_run.sh +88 -0
- scripts/nohup_resume.sh +67 -0
- scripts/status_run.sh +46 -0
- scripts/train_qlora.py +5 -1
README.md
CHANGED
|
@@ -58,7 +58,7 @@ gradient_accumulation_steps: 16
|
|
| 58 |
|
| 59 |
Do **not** reduce `max_length` unless you intentionally want a different training task.
|
| 60 |
|
| 61 |
-
## Quick start
|
| 62 |
|
| 63 |
```bash
|
| 64 |
git clone https://huggingface.co/nraptisss/tmf921-intent-training
|
|
@@ -72,15 +72,56 @@ python -m pip install -r requirements.txt
|
|
| 72 |
export HF_TOKEN=hf_...
|
| 73 |
export CUDA_VISIBLE_DEVICES=0
|
| 74 |
export PYTHONPATH="$PWD/src"
|
|
|
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
```
|
| 79 |
|
| 80 |
-
|
| 81 |
|
| 82 |
```bash
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
```
|
| 85 |
|
| 86 |
## Optional Trackio monitoring
|
|
@@ -229,6 +270,10 @@ scripts/
|
|
| 229 |
evaluate_model.py
|
| 230 |
merge_adapter.py
|
| 231 |
run_rtx6000ada.sh
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
src/tmf921_train/
|
| 233 |
utils.py
|
| 234 |
requirements.txt
|
|
|
|
| 58 |
|
| 59 |
Do **not** reduce `max_length` unless you intentionally want a different training task.
|
| 60 |
|
| 61 |
+
## Quick start with nohup, unique run dirs, and resumable checkpoints
|
| 62 |
|
| 63 |
```bash
|
| 64 |
git clone https://huggingface.co/nraptisss/tmf921-intent-training
|
|
|
|
| 72 |
export HF_TOKEN=hf_...
|
| 73 |
export CUDA_VISIBLE_DEVICES=0
|
| 74 |
export PYTHONPATH="$PWD/src"
|
| 75 |
+
export TOKENIZERS_PARALLELISM=false
|
| 76 |
|
| 77 |
+
# Optional Trackio dashboard
|
| 78 |
+
# export TRACKIO_SPACE_ID=nraptisss/tmf921-trackio
|
| 79 |
+
|
| 80 |
+
bash scripts/nohup_new_run.sh
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
The helper creates a fresh run directory every time:
|
| 84 |
+
|
| 85 |
+
```text
|
| 86 |
+
runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS/
|
| 87 |
+
configs/config.yaml
|
| 88 |
+
logs/train.log
|
| 89 |
+
outputs/adapter/checkpoint-*/
|
| 90 |
+
eval/
|
| 91 |
```
|
| 92 |
|
| 93 |
+
Monitor:
|
| 94 |
|
| 95 |
```bash
|
| 96 |
+
RUN_DIR=runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS
|
| 97 |
+
bash scripts/status_run.sh "$RUN_DIR"
|
| 98 |
+
tail -f "$RUN_DIR/logs/train.log"
|
| 99 |
+
watch -n 2 nvidia-smi
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
Resume after crash/reboot:
|
| 103 |
+
|
| 104 |
+
```bash
|
| 105 |
+
cd tmf921-intent-training
|
| 106 |
+
source .venv/bin/activate
|
| 107 |
+
export HF_TOKEN=hf_...
|
| 108 |
+
export CUDA_VISIBLE_DEVICES=0
|
| 109 |
+
export PYTHONPATH="$PWD/src"
|
| 110 |
+
|
| 111 |
+
bash scripts/nohup_resume.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
Evaluate after training:
|
| 115 |
+
|
| 116 |
+
```bash
|
| 117 |
+
bash scripts/nohup_eval.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
Manual training command, if you do not want nohup:
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
python scripts/train_qlora.py \
|
| 124 |
+
--config configs/rtx6000ada_qwen3_8b_qlora.yaml
|
| 125 |
```
|
| 126 |
|
| 127 |
## Optional Trackio monitoring
|
|
|
|
| 270 |
evaluate_model.py
|
| 271 |
merge_adapter.py
|
| 272 |
run_rtx6000ada.sh
|
| 273 |
+
nohup_new_run.sh
|
| 274 |
+
nohup_resume.sh
|
| 275 |
+
nohup_eval.sh
|
| 276 |
+
status_run.sh
|
| 277 |
src/tmf921_train/
|
| 278 |
utils.py
|
| 279 |
requirements.txt
|
scripts/nohup_eval.sh
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
set -euo pipefail

# Evaluate a completed or checkpointed run under nohup.
#
# Run from the repository root: .venv/, src/ and scripts/ are resolved
# relative to the current directory.
#
# Usage:
#   bash scripts/nohup_eval.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS [adapter_path]
#
# Arguments:
#   RUN_DIR       run directory; logs/ and eval/ are created inside it
#   ADAPTER_PATH  adapter directory (default: RUN_DIR/outputs/adapter)
#
# Environment:
#   CUDA_VISIBLE_DEVICES  GPU selection (default: 0)
#
# Writes:
#   RUN_DIR/logs/eval_<timestamp>.log  combined stdout/stderr of the job
#   RUN_DIR/EVAL_PID.txt               PID of the background job

if [ $# -lt 1 ]; then
  echo "Usage: $0 <RUN_DIR> [ADAPTER_PATH]" >&2
  exit 1
fi

RUN_DIR="$1"
ADAPTER="${2:-$RUN_DIR/outputs/adapter}"

if [ ! -d "$ADAPTER" ]; then
  echo "ERROR: adapter path not found: $ADAPTER" >&2
  exit 1
fi

source .venv/bin/activate
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false

mkdir -p "$RUN_DIR/logs" "$RUN_DIR/eval"
LOG="$RUN_DIR/logs/eval_$(date +%Y%m%d-%H%M%S).log"

# The job body is a fixed, single-quoted string. Caller-supplied values are
# passed as positional parameters ($1..$4) rather than spliced into the shell
# source, so paths containing quotes or other metacharacters cannot break or
# alter the command. The variables are re-exported inside the job because the
# login shell (-l) reads profile files that may override them.
nohup bash -lc '
source .venv/bin/activate
export CUDA_VISIBLE_DEVICES="$1"
export PYTHONPATH="$2"
export TOKENIZERS_PARALLELISM=false

python scripts/evaluate_model.py \
  --model Qwen/Qwen3-8B \
  --adapter "$3" \
  --dataset nraptisss/TMF921-intent-to-config-research-sota \
  --output_dir "$4" \
  --load_in_4bit
' nohup_eval "$CUDA_VISIBLE_DEVICES" "$PWD/src" "$ADAPTER" "$RUN_DIR/eval" \
  > "$LOG" 2>&1 &

# $! is the PID of the nohup'd wrapper shell that runs the evaluation.
EVAL_PID=$!
echo "$EVAL_PID" > "$RUN_DIR/EVAL_PID.txt"

cat <<EOF
Started nohup evaluation.
RUN_DIR=$RUN_DIR
ADAPTER=$ADAPTER
PID=$EVAL_PID
LOG=$LOG
RESULTS=$RUN_DIR/eval/all_metrics.json

Monitor:
tail -f "$LOG"
EOF
|
scripts/nohup_new_run.sh
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
set -euo pipefail

# Start a new unique TMF921 QLoRA run under nohup.
#
# Run from the repository root: .venv/, src/, scripts/ and runs/ are resolved
# relative to the current directory.
#
# Usage:
#   export HF_TOKEN=hf_...
#   bash scripts/nohup_new_run.sh [configs/rtx6000ada_qwen3_8b_qlora.yaml]
#
# Environment:
#   HF_TOKEN              required; must be exported by the caller
#   CUDA_VISIBLE_DEVICES  GPU selection (default: 0)
#   TRACKIO_SPACE_ID      optional; copied into the run config when non-empty
#
# Creates runs/qwen3-8b-qlora-<timestamp>/ containing configs/config.yaml,
# logs/train.log, outputs/, eval/, and the bookkeeping files RUN_ID.txt,
# RUN_DIR.txt, STARTED_AT.txt and TRAIN_PID.txt.

CONFIG_TEMPLATE="${1:-configs/rtx6000ada_qwen3_8b_qlora.yaml}"

if [ ! -f "$CONFIG_TEMPLATE" ]; then
  echo "ERROR: config not found: $CONFIG_TEMPLATE" >&2
  exit 1
fi

if [ -z "${HF_TOKEN:-}" ]; then
  echo "ERROR: HF_TOKEN is not set. Run: export HF_TOKEN=hf_..." >&2
  exit 1
fi

source .venv/bin/activate

# The token reaches the training job through the inherited environment only.
# It is deliberately never written into a command line, where any local user
# could read it from the process list.
export HF_TOKEN
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false

RUN_ID="qwen3-8b-qlora-$(date +%Y%m%d-%H%M%S)"
RUN_DIR="$PWD/runs/$RUN_ID"
mkdir -p "$RUN_DIR"/{logs,outputs,eval,configs}
cp "$CONFIG_TEMPLATE" "$RUN_DIR/configs/config.yaml"

# Specialise the copied config for this run. The heredoc delimiter is quoted
# so the shell performs no expansion inside the Python source; the run id and
# directory are passed through the environment instead.
RUN_ID="$RUN_ID" RUN_DIR="$RUN_DIR" python - <<'PY'
import os
from pathlib import Path

import yaml

run_id = os.environ["RUN_ID"]
run_dir = Path(os.environ["RUN_DIR"])
cfg_path = run_dir / "configs" / "config.yaml"
cfg = yaml.safe_load(cfg_path.read_text())

cfg["output_dir"] = str(run_dir / "outputs" / "adapter")
cfg["run_name"] = run_id
cfg["hub_model_id"] = f"nraptisss/Qwen3-8B-TMF921-Intent-QLoRA-{run_id}"
cfg["push_to_hub"] = True

# Trackio can also be passed via env TRACKIO_SPACE_ID.
trackio_space_id = os.environ.get("TRACKIO_SPACE_ID")
if trackio_space_id:
    cfg["trackio_space_id"] = trackio_space_id

cfg_path.write_text(yaml.safe_dump(cfg, sort_keys=False))
print(yaml.safe_dump(cfg, sort_keys=False))
PY

echo "$RUN_ID" > "$RUN_DIR/RUN_ID.txt"
echo "$RUN_DIR" > "$RUN_DIR/RUN_DIR.txt"
date -Is > "$RUN_DIR/STARTED_AT.txt"

# Fixed, single-quoted job body; non-secret values arrive as positional
# parameters ($1..$4) so they are never re-parsed as shell source. They are
# re-exported inside the job because the login shell (-l) reads profile files
# that may override them. HF_TOKEN is inherited from the environment.
nohup bash -lc '
source .venv/bin/activate
export CUDA_VISIBLE_DEVICES="$1"
export PYTHONPATH="$2"
export TOKENIZERS_PARALLELISM=false
export TRACKIO_SPACE_ID="$3"

python scripts/train_qlora.py \
  --config "$4"
' nohup_new_run "$CUDA_VISIBLE_DEVICES" "$PWD/src" "${TRACKIO_SPACE_ID:-}" \
  "$RUN_DIR/configs/config.yaml" > "$RUN_DIR/logs/train.log" 2>&1 &

# $! is the PID of the nohup'd wrapper shell that runs the training.
TRAIN_PID=$!
echo "$TRAIN_PID" > "$RUN_DIR/TRAIN_PID.txt"

cat <<EOF
Started new nohup training run.
RUN_ID=$RUN_ID
RUN_DIR=$RUN_DIR
PID=$TRAIN_PID
LOG=$RUN_DIR/logs/train.log

Monitor:
tail -f "$RUN_DIR/logs/train.log"
watch -n 2 nvidia-smi

Resume if interrupted:
bash scripts/nohup_resume.sh "$RUN_DIR"

Evaluate after training:
bash scripts/nohup_eval.sh "$RUN_DIR"
EOF
|
scripts/nohup_resume.sh
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
set -euo pipefail

# Resume a previous run from latest checkpoint under nohup.
#
# Run from the repository root: .venv/, src/ and scripts/ are resolved
# relative to the current directory.
#
# Usage:
#   export HF_TOKEN=hf_...
#   bash scripts/nohup_resume.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS
#
# Environment:
#   HF_TOKEN              required; must be exported by the caller
#   CUDA_VISIBLE_DEVICES  GPU selection (default: 0)
#   TRACKIO_SPACE_ID      optional; forwarded to the training job
#
# Writes:
#   RUN_DIR/logs/train_resume_<timestamp>.log  stdout/stderr of the job
#   RUN_DIR/TRAIN_RESUME_PID.txt               PID of the background job
#   RUN_DIR/RESUME_HISTORY.txt                 one appended line per resume

if [ $# -lt 1 ]; then
  echo "Usage: $0 <RUN_DIR>" >&2
  exit 1
fi

RUN_DIR="$1"
CONFIG="$RUN_DIR/configs/config.yaml"
OUT_DIR="$RUN_DIR/outputs/adapter"

if [ ! -f "$CONFIG" ]; then
  echo "ERROR: config not found: $CONFIG" >&2
  exit 1
fi

if [ -z "${HF_TOKEN:-}" ]; then
  echo "ERROR: HF_TOKEN is not set. Run: export HF_TOKEN=hf_..." >&2
  exit 1
fi

# Pick the checkpoint-* directory that sorts last in version order (sort -V),
# so checkpoint-1000 ranks after checkpoint-200. find errors (for example a
# missing OUT_DIR) are tolerated here and reported as "no checkpoint" below.
LATEST_CKPT=$(find "$OUT_DIR" -maxdepth 1 -type d -name "checkpoint-*" 2>/dev/null | sort -V | tail -n 1 || true)
if [ -z "$LATEST_CKPT" ]; then
  echo "ERROR: no checkpoint found in $OUT_DIR" >&2
  exit 1
fi

source .venv/bin/activate

# The token reaches the training job through the inherited environment only.
# It is deliberately never written into a command line, where any local user
# could read it from the process list.
export HF_TOKEN
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false

# The log redirection below fails if logs/ was removed, so make sure it exists.
mkdir -p "$RUN_DIR/logs"
LOG="$RUN_DIR/logs/train_resume_$(date +%Y%m%d-%H%M%S).log"

echo "$(date -Is) resume from $LATEST_CKPT" >> "$RUN_DIR/RESUME_HISTORY.txt"

# Fixed, single-quoted job body; non-secret values arrive as positional
# parameters ($1..$5) so they are never re-parsed as shell source. They are
# re-exported inside the job because the login shell (-l) reads profile files
# that may override them. HF_TOKEN is inherited from the environment.
nohup bash -lc '
source .venv/bin/activate
export CUDA_VISIBLE_DEVICES="$1"
export PYTHONPATH="$2"
export TOKENIZERS_PARALLELISM=false
export TRACKIO_SPACE_ID="$3"

python scripts/train_qlora.py \
  --config "$4" \
  --resume_from_checkpoint "$5"
' nohup_resume "$CUDA_VISIBLE_DEVICES" "$PWD/src" "${TRACKIO_SPACE_ID:-}" \
  "$CONFIG" "$LATEST_CKPT" > "$LOG" 2>&1 &

# $! is the PID of the nohup'd wrapper shell that runs the training.
RESUME_PID=$!
echo "$RESUME_PID" > "$RUN_DIR/TRAIN_RESUME_PID.txt"

cat <<EOF
Resumed nohup training run.
RUN_DIR=$RUN_DIR
CHECKPOINT=$LATEST_CKPT
PID=$RESUME_PID
LOG=$LOG

Monitor:
tail -f "$LOG"
EOF
|
scripts/status_run.sh
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
set -euo pipefail

# Show status for a run directory: recorded PIDs and whether they are alive,
# the most recent checkpoints, the log files with their last lines, and the
# location of the eval metrics file when present.
#
# Usage: bash scripts/status_run.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS

if [ $# -lt 1 ]; then
  echo "Usage: $0 <RUN_DIR>" >&2
  exit 1
fi

RUN_DIR="$1"

# Fail loudly on a mistyped path instead of printing an empty report.
if [ ! -d "$RUN_DIR" ]; then
  echo "ERROR: run directory not found: $RUN_DIR" >&2
  exit 1
fi

echo "RUN_DIR=$RUN_DIR"
if [ -f "$RUN_DIR/RUN_ID.txt" ]; then
  echo "RUN_ID=$(<"$RUN_DIR/RUN_ID.txt")"
fi

for f in TRAIN_PID.txt TRAIN_RESUME_PID.txt EVAL_PID.txt; do
  if [ -f "$RUN_DIR/$f" ]; then
    PID=$(<"$RUN_DIR/$f")
    # Only probe positive integers: "kill -0 0" or a negative value would
    # test a whole process group and wrongly report the job as running.
    # NOTE: a live PID only proves that some process owns that number; after
    # a reboot it may belong to an unrelated process.
    if [[ "$PID" =~ ^[1-9][0-9]*$ ]] && kill -0 "$PID" 2>/dev/null; then
      echo "$f PID=$PID running"
    else
      echo "$f PID=$PID not running"
    fi
  fi
done

echo
if [ -d "$RUN_DIR/outputs/adapter" ]; then
  echo "Latest checkpoints:"
  # Version sort so checkpoint-1000 ranks after checkpoint-200.
  find "$RUN_DIR/outputs/adapter" -maxdepth 1 -type d -name "checkpoint-*" | sort -V | tail -n 5 || true
fi

echo
# Collect log files with a glob; nullglob leaves the array empty when nothing
# matches, instead of holding the literal pattern.
shopt -s nullglob
LOG_FILES=("$RUN_DIR"/logs/*.log)
shopt -u nullglob
if (( ${#LOG_FILES[@]} > 0 )); then
  echo "Logs:"
  ls -lh "${LOG_FILES[@]}"
  echo
  echo "Last train/eval lines:"
  tail -n 20 "${LOG_FILES[@]}"
fi

echo
if [ -f "$RUN_DIR/eval/all_metrics.json" ]; then
  echo "Eval metrics: $RUN_DIR/eval/all_metrics.json"
fi
|
scripts/train_qlora.py
CHANGED
|
@@ -66,6 +66,7 @@ def parse_args():
|
|
| 66 |
p.add_argument("--no_push", action="store_true")
|
| 67 |
p.add_argument("--packing", action="store_true", help="Override config and enable packing. Requires compatible attention setup.")
|
| 68 |
p.add_argument("--flash_attn", action="store_true", help="Use flash_attention_2 in model_init_kwargs. Install flash-attn first.")
|
|
|
|
| 69 |
p.add_argument("--seed", type=int, default=42)
|
| 70 |
return p.parse_args()
|
| 71 |
|
|
@@ -181,7 +182,10 @@ def main():
|
|
| 181 |
callbacks=[TrackioAlertCallback()],
|
| 182 |
)
|
| 183 |
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
| 185 |
metrics = trainer.evaluate()
|
| 186 |
write_json(Path(cfg["output_dir"]) / "final_eval_metrics.json", metrics)
|
| 187 |
trainer.save_model(cfg["output_dir"])
|
|
|
|
| 66 |
p.add_argument("--no_push", action="store_true")
|
| 67 |
p.add_argument("--packing", action="store_true", help="Override config and enable packing. Requires compatible attention setup.")
|
| 68 |
p.add_argument("--flash_attn", action="store_true", help="Use flash_attention_2 in model_init_kwargs. Install flash-attn first.")
|
| 69 |
+
p.add_argument("--resume_from_checkpoint", default=None, help="Path to checkpoint dir, or 'true' to auto-resume latest checkpoint in output_dir")
|
| 70 |
p.add_argument("--seed", type=int, default=42)
|
| 71 |
return p.parse_args()
|
| 72 |
|
|
|
|
| 182 |
callbacks=[TrackioAlertCallback()],
|
| 183 |
)
|
| 184 |
|
| 185 |
+
resume_arg = args.resume_from_checkpoint
|
| 186 |
+
if resume_arg is not None and str(resume_arg).lower() == "true":
|
| 187 |
+
resume_arg = True
|
| 188 |
+
trainer.train(resume_from_checkpoint=resume_arg)
|
| 189 |
metrics = trainer.evaluate()
|
| 190 |
write_json(Path(cfg["output_dir"]) / "final_eval_metrics.json", metrics)
|
| 191 |
trainer.save_model(cfg["output_dir"])
|