PEFT
qlora
sft
trl
qwen3
tmf921
intent-based-networking
network-slicing
rtx-6000-ada
ml-intern
nraptisss commited on
Commit
a896ecd
·
verified ·
1 Parent(s): d9ba941

Add nohup run management and resumable checkpoint support

Browse files
README.md CHANGED
@@ -58,7 +58,7 @@ gradient_accumulation_steps: 16
58
 
59
  Do **not** reduce `max_length` unless you intentionally want a different training task.
60
 
61
- ## Quick start
62
 
63
  ```bash
64
  git clone https://huggingface.co/nraptisss/tmf921-intent-training
@@ -72,15 +72,56 @@ python -m pip install -r requirements.txt
72
  export HF_TOKEN=hf_...
73
  export CUDA_VISIBLE_DEVICES=0
74
  export PYTHONPATH="$PWD/src"
 
75
 
76
- python scripts/train_qlora.py \
77
- --config configs/rtx6000ada_qwen3_8b_qlora.yaml
 
 
 
 
 
 
 
 
 
 
 
 
78
  ```
79
 
80
- Or run the full train+eval script:
81
 
82
  ```bash
83
- bash scripts/run_rtx6000ada.sh
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  ```
85
 
86
  ## Optional Trackio monitoring
@@ -229,6 +270,10 @@ scripts/
229
  evaluate_model.py
230
  merge_adapter.py
231
  run_rtx6000ada.sh
 
 
 
 
232
  src/tmf921_train/
233
  utils.py
234
  requirements.txt
 
58
 
59
  Do **not** reduce `max_length` unless you intentionally want a different training task.
60
 
61
+ ## Quick start with nohup, unique run dirs, and resumable checkpoints
62
 
63
  ```bash
64
  git clone https://huggingface.co/nraptisss/tmf921-intent-training
 
72
  export HF_TOKEN=hf_...
73
  export CUDA_VISIBLE_DEVICES=0
74
  export PYTHONPATH="$PWD/src"
75
+ export TOKENIZERS_PARALLELISM=false
76
 
77
+ # Optional Trackio dashboard
78
+ # export TRACKIO_SPACE_ID=nraptisss/tmf921-trackio
79
+
80
+ bash scripts/nohup_new_run.sh
81
+ ```
82
+
83
+ The helper creates a fresh run directory every time:
84
+
85
+ ```text
86
+ runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS/
87
+ configs/config.yaml
88
+ logs/train.log
89
+ outputs/adapter/checkpoint-*/
90
+ eval/
91
  ```
92
 
93
+ Monitor:
94
 
95
  ```bash
96
+ RUN_DIR=runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS
97
+ bash scripts/status_run.sh "$RUN_DIR"
98
+ tail -f "$RUN_DIR/logs/train.log"
99
+ watch -n 2 nvidia-smi
100
+ ```
101
+
102
+ Resume after crash/reboot:
103
+
104
+ ```bash
105
+ cd tmf921-intent-training
106
+ source .venv/bin/activate
107
+ export HF_TOKEN=hf_...
108
+ export CUDA_VISIBLE_DEVICES=0
109
+ export PYTHONPATH="$PWD/src"
110
+
111
+ bash scripts/nohup_resume.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS
112
+ ```
113
+
114
+ Evaluate after training:
115
+
116
+ ```bash
117
+ bash scripts/nohup_eval.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS
118
+ ```
119
+
120
+ Manual training command, if you do not want nohup:
121
+
122
+ ```bash
123
+ python scripts/train_qlora.py \
124
+ --config configs/rtx6000ada_qwen3_8b_qlora.yaml
125
  ```
126
 
127
  ## Optional Trackio monitoring
 
270
  evaluate_model.py
271
  merge_adapter.py
272
  run_rtx6000ada.sh
273
+ nohup_new_run.sh
274
+ nohup_resume.sh
275
+ nohup_eval.sh
276
+ status_run.sh
277
  src/tmf921_train/
278
  utils.py
279
  requirements.txt
scripts/nohup_eval.sh ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# Evaluate a completed or checkpointed run under nohup.
# Usage:
#   bash scripts/nohup_eval.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS [adapter_path]

if [ $# -lt 1 ]; then
  echo "Usage: $0 <RUN_DIR> [ADAPTER_PATH]" >&2
  exit 1
fi

RUN_DIR="$1"
ADAPTER="${2:-$RUN_DIR/outputs/adapter}"

if [ ! -d "$ADAPTER" ]; then
  echo "ERROR: adapter path not found: $ADAPTER" >&2
  exit 1
fi

source .venv/bin/activate

# Exported once here; the nohup'd child inherits this environment directly.
# The previous version rebuilt the environment inside a quoted `bash -lc`
# string, which clobbered any pre-existing PYTHONPATH (it re-exported only
# $PWD/src) and exposed expanded values on the `ps` command line.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false

mkdir -p "$RUN_DIR/logs" "$RUN_DIR/eval"
LOG="$RUN_DIR/logs/eval_$(date +%Y%m%d-%H%M%S).log"

# Launch python directly: proper quoting of $ADAPTER/$RUN_DIR (no breakage
# on paths containing spaces or quotes), and no shell-injection surface.
nohup python scripts/evaluate_model.py \
  --model Qwen/Qwen3-8B \
  --adapter "$ADAPTER" \
  --dataset nraptisss/TMF921-intent-to-config-research-sota \
  --output_dir "$RUN_DIR/eval" \
  --load_in_4bit \
  > "$LOG" 2>&1 &

echo $! > "$RUN_DIR/EVAL_PID.txt"

cat <<EOF
Started nohup evaluation.
RUN_DIR=$RUN_DIR
ADAPTER=$ADAPTER
PID=$(cat "$RUN_DIR/EVAL_PID.txt")
LOG=$LOG
RESULTS=$RUN_DIR/eval/all_metrics.json

Monitor:
  tail -f "$LOG"
EOF
scripts/nohup_new_run.sh ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# Start a new unique TMF921 QLoRA run under nohup.
# Usage:
#   export HF_TOKEN=hf_...
#   bash scripts/nohup_new_run.sh [configs/rtx6000ada_qwen3_8b_qlora.yaml]

CONFIG_TEMPLATE="${1:-configs/rtx6000ada_qwen3_8b_qlora.yaml}"

if [ ! -f "$CONFIG_TEMPLATE" ]; then
  echo "ERROR: config not found: $CONFIG_TEMPLATE" >&2
  exit 1
fi

if [ -z "${HF_TOKEN:-}" ]; then
  echo "ERROR: HF_TOKEN is not set. Run: export HF_TOKEN=hf_..." >&2
  exit 1
fi

source .venv/bin/activate

# Export everything the trainer needs so the nohup'd child inherits it.
# SECURITY: do NOT splice HF_TOKEN into a quoted command string — command
# lines are world-readable via `ps`; environment inheritance keeps it private.
export HF_TOKEN
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false
export TRACKIO_SPACE_ID="${TRACKIO_SPACE_ID:-}"

RUN_ID="qwen3-8b-qlora-$(date +%Y%m%d-%H%M%S)"
RUN_DIR="$PWD/runs/$RUN_ID"
mkdir -p "$RUN_DIR"/{logs,outputs,eval,configs}
cp "$CONFIG_TEMPLATE" "$RUN_DIR/configs/config.yaml"

# Rewrite the copied config so outputs/checkpoints land inside the run dir.
# Quoted heredoc: values are passed through the environment instead of being
# interpolated into python source, so paths containing quotes cannot break
# or inject into the script.
RUN_ID="$RUN_ID" RUN_DIR="$RUN_DIR" python - <<'PY'
import os
from pathlib import Path

import yaml

run_id = os.environ["RUN_ID"]
run_dir = Path(os.environ["RUN_DIR"])
cfg_path = run_dir / "configs" / "config.yaml"
cfg = yaml.safe_load(cfg_path.read_text())

cfg["output_dir"] = str(run_dir / "outputs" / "adapter")
cfg["run_name"] = run_id
cfg["hub_model_id"] = f"nraptisss/Qwen3-8B-TMF921-Intent-QLoRA-{run_id}"
cfg["push_to_hub"] = True

# Trackio can also be passed via env TRACKIO_SPACE_ID.
if os.environ.get("TRACKIO_SPACE_ID"):
    cfg["trackio_space_id"] = os.environ["TRACKIO_SPACE_ID"]

cfg_path.write_text(yaml.safe_dump(cfg, sort_keys=False))
print(yaml.safe_dump(cfg, sort_keys=False))
PY

echo "$RUN_ID" > "$RUN_DIR/RUN_ID.txt"
echo "$RUN_DIR" > "$RUN_DIR/RUN_DIR.txt"
date -Is > "$RUN_DIR/STARTED_AT.txt"

# The child inherits HF_TOKEN etc. from this script's environment, so the
# nohup command line carries no secrets and no re-quoted variables.
nohup python scripts/train_qlora.py \
  --config "$RUN_DIR/configs/config.yaml" \
  > "$RUN_DIR/logs/train.log" 2>&1 &

echo $! > "$RUN_DIR/TRAIN_PID.txt"

cat <<EOF
Started new nohup training run.
RUN_ID=$RUN_ID
RUN_DIR=$RUN_DIR
PID=$(cat "$RUN_DIR/TRAIN_PID.txt")
LOG=$RUN_DIR/logs/train.log

Monitor:
  tail -f "$RUN_DIR/logs/train.log"
  watch -n 2 nvidia-smi

Resume if interrupted:
  bash scripts/nohup_resume.sh "$RUN_DIR"

Evaluate after training:
  bash scripts/nohup_eval.sh "$RUN_DIR"
EOF
scripts/nohup_resume.sh ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# Resume a previous run from latest checkpoint under nohup.
# Usage:
#   export HF_TOKEN=hf_...
#   bash scripts/nohup_resume.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS

if [ $# -lt 1 ]; then
  echo "Usage: $0 <RUN_DIR>" >&2
  exit 1
fi

RUN_DIR="$1"
CONFIG="$RUN_DIR/configs/config.yaml"
OUT_DIR="$RUN_DIR/outputs/adapter"

if [ ! -f "$CONFIG" ]; then
  echo "ERROR: config not found: $CONFIG" >&2
  exit 1
fi

if [ -z "${HF_TOKEN:-}" ]; then
  echo "ERROR: HF_TOKEN is not set. Run: export HF_TOKEN=hf_..." >&2
  exit 1
fi

# Highest-numbered checkpoint dir; sort -V orders checkpoint-1000 after checkpoint-999.
LATEST_CKPT=$(find "$OUT_DIR" -maxdepth 1 -type d -name "checkpoint-*" 2>/dev/null | sort -V | tail -n 1 || true)
if [ -z "$LATEST_CKPT" ]; then
  echo "ERROR: no checkpoint found in $OUT_DIR" >&2
  exit 1
fi

source .venv/bin/activate

# SECURITY: rely on environment inheritance instead of splicing HF_TOKEN
# into a quoted `bash -lc` string — command lines are readable via `ps`.
export HF_TOKEN
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false
export TRACKIO_SPACE_ID="${TRACKIO_SPACE_ID:-}"

# Ensure the logs dir exists even after a partial restore or manual cleanup.
mkdir -p "$RUN_DIR/logs"
LOG="$RUN_DIR/logs/train_resume_$(date +%Y%m%d-%H%M%S).log"

echo "$(date -Is) resume from $LATEST_CKPT" >> "$RUN_DIR/RESUME_HISTORY.txt"

nohup python scripts/train_qlora.py \
  --config "$CONFIG" \
  --resume_from_checkpoint "$LATEST_CKPT" \
  > "$LOG" 2>&1 &

echo $! > "$RUN_DIR/TRAIN_RESUME_PID.txt"

cat <<EOF
Resumed nohup training run.
RUN_DIR=$RUN_DIR
CHECKPOINT=$LATEST_CKPT
PID=$(cat "$RUN_DIR/TRAIN_RESUME_PID.txt")
LOG=$LOG

Monitor:
  tail -f "$LOG"
EOF
scripts/status_run.sh ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# Show status for a run directory.
# Usage: bash scripts/status_run.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS

if [ $# -lt 1 ]; then
  echo "Usage: $0 <RUN_DIR>" >&2
  exit 1
fi

RUN_DIR="$1"

# Report whether the PID recorded in a given marker file is still alive.
report_pid_file() {
  local marker="$1" pid
  if [ -f "$RUN_DIR/$marker" ]; then
    pid=$(cat "$RUN_DIR/$marker")
    # kill -0 probes existence without sending a signal.
    if kill -0 "$pid" 2>/dev/null; then
      echo "$marker PID=$pid running"
    else
      echo "$marker PID=$pid not running"
    fi
  fi
}

echo "RUN_DIR=$RUN_DIR"
if [ -f "$RUN_DIR/RUN_ID.txt" ]; then
  echo "RUN_ID=$(cat "$RUN_DIR/RUN_ID.txt")"
fi

report_pid_file TRAIN_PID.txt
report_pid_file TRAIN_RESUME_PID.txt
report_pid_file EVAL_PID.txt

echo
if [ -d "$RUN_DIR/outputs/adapter" ]; then
  echo "Latest checkpoints:"
  find "$RUN_DIR/outputs/adapter" -maxdepth 1 -type d -name "checkpoint-*" | sort -V | tail -n 5 || true
fi

echo
if ls "$RUN_DIR"/logs/*.log >/dev/null 2>&1; then
  echo "Logs:"
  ls -lh "$RUN_DIR"/logs/*.log
  echo
  echo "Last train/eval lines:"
  tail -n 20 "$RUN_DIR"/logs/*.log
fi

echo
if [ -f "$RUN_DIR/eval/all_metrics.json" ]; then
  echo "Eval metrics: $RUN_DIR/eval/all_metrics.json"
fi
scripts/train_qlora.py CHANGED
@@ -66,6 +66,7 @@ def parse_args():
66
  p.add_argument("--no_push", action="store_true")
67
  p.add_argument("--packing", action="store_true", help="Override config and enable packing. Requires compatible attention setup.")
68
  p.add_argument("--flash_attn", action="store_true", help="Use flash_attention_2 in model_init_kwargs. Install flash-attn first.")
 
69
  p.add_argument("--seed", type=int, default=42)
70
  return p.parse_args()
71
 
@@ -181,7 +182,10 @@ def main():
181
  callbacks=[TrackioAlertCallback()],
182
  )
183
 
184
- trainer.train()
 
 
 
185
  metrics = trainer.evaluate()
186
  write_json(Path(cfg["output_dir"]) / "final_eval_metrics.json", metrics)
187
  trainer.save_model(cfg["output_dir"])
 
66
  p.add_argument("--no_push", action="store_true")
67
  p.add_argument("--packing", action="store_true", help="Override config and enable packing. Requires compatible attention setup.")
68
  p.add_argument("--flash_attn", action="store_true", help="Use flash_attention_2 in model_init_kwargs. Install flash-attn first.")
69
+ p.add_argument("--resume_from_checkpoint", default=None, help="Path to checkpoint dir, or 'true' to auto-resume latest checkpoint in output_dir")
70
  p.add_argument("--seed", type=int, default=42)
71
  return p.parse_args()
72
 
 
182
  callbacks=[TrackioAlertCallback()],
183
  )
184
 
185
+ resume_arg = args.resume_from_checkpoint
186
+ if resume_arg is not None and str(resume_arg).lower() == "true":
187
+ resume_arg = True
188
+ trainer.train(resume_from_checkpoint=resume_arg)
189
  metrics = trainer.evaluate()
190
  write_json(Path(cfg["output_dir"]) / "final_eval_metrics.json", metrics)
191
  trainer.save_model(cfg["output_dir"])