PEFT
qlora
sft
trl
qwen3
tmf921
intent-based-networking
network-slicing
rtx-6000-ada
ml-intern
nraptisss commited on
Commit
a896ecd
·
verified ·
1 Parent(s): d9ba941

Add nohup run management and resumable checkpoint support

Browse files
README.md CHANGED
@@ -58,7 +58,7 @@ gradient_accumulation_steps: 16
58
 
59
  Do **not** reduce `max_length` unless you intentionally want a different training task.
60
 
61
- ## Quick start
62
 
63
  ```bash
64
  git clone https://huggingface.co/nraptisss/tmf921-intent-training
@@ -72,15 +72,56 @@ python -m pip install -r requirements.txt
72
  export HF_TOKEN=hf_...
73
  export CUDA_VISIBLE_DEVICES=0
74
  export PYTHONPATH="$PWD/src"
 
75
 
76
- python scripts/train_qlora.py \
77
- --config configs/rtx6000ada_qwen3_8b_qlora.yaml
 
 
 
 
 
 
 
 
 
 
 
 
78
  ```
79
 
80
- Or run the full train+eval script:
81
 
82
  ```bash
83
- bash scripts/run_rtx6000ada.sh
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  ```
85
 
86
  ## Optional Trackio monitoring
@@ -229,6 +270,10 @@ scripts/
229
  evaluate_model.py
230
  merge_adapter.py
231
  run_rtx6000ada.sh
 
 
 
 
232
  src/tmf921_train/
233
  utils.py
234
  requirements.txt
 
58
 
59
  Do **not** reduce `max_length` unless you intentionally want a different training task.
60
 
61
+ ## Quick start with nohup, unique run dirs, and resumable checkpoints
62
 
63
  ```bash
64
  git clone https://huggingface.co/nraptisss/tmf921-intent-training
 
72
  export HF_TOKEN=hf_...
73
  export CUDA_VISIBLE_DEVICES=0
74
  export PYTHONPATH="$PWD/src"
75
+ export TOKENIZERS_PARALLELISM=false
76
 
77
+ # Optional Trackio dashboard
78
+ # export TRACKIO_SPACE_ID=nraptisss/tmf921-trackio
79
+
80
+ bash scripts/nohup_new_run.sh
81
+ ```
82
+
83
+ The helper creates a fresh run directory every time:
84
+
85
+ ```text
86
+ runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS/
87
+ configs/config.yaml
88
+ logs/train.log
89
+ outputs/adapter/checkpoint-*/
90
+ eval/
91
  ```
92
 
93
+ Monitor:
94
 
95
  ```bash
96
+ RUN_DIR=runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS
97
+ bash scripts/status_run.sh "$RUN_DIR"
98
+ tail -f "$RUN_DIR/logs/train.log"
99
+ watch -n 2 nvidia-smi
100
+ ```
101
+
102
+ Resume after crash/reboot:
103
+
104
+ ```bash
105
+ cd tmf921-intent-training
106
+ source .venv/bin/activate
107
+ export HF_TOKEN=hf_...
108
+ export CUDA_VISIBLE_DEVICES=0
109
+ export PYTHONPATH="$PWD/src"
110
+
111
+ bash scripts/nohup_resume.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS
112
+ ```
113
+
114
+ Evaluate after training:
115
+
116
+ ```bash
117
+ bash scripts/nohup_eval.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS
118
+ ```
119
+
120
+ Manual training command, if you do not want nohup:
121
+
122
+ ```bash
123
+ python scripts/train_qlora.py \
124
+ --config configs/rtx6000ada_qwen3_8b_qlora.yaml
125
  ```
126
 
127
  ## Optional Trackio monitoring
 
270
  evaluate_model.py
271
  merge_adapter.py
272
  run_rtx6000ada.sh
273
+ nohup_new_run.sh
274
+ nohup_resume.sh
275
+ nohup_eval.sh
276
+ status_run.sh
277
  src/tmf921_train/
278
  utils.py
279
  requirements.txt
scripts/nohup_eval.sh ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# Evaluate a completed or checkpointed run under nohup.
# Usage:
#   bash scripts/nohup_eval.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS [adapter_path]

if [ $# -lt 1 ]; then
  echo "Usage: $0 <RUN_DIR> [ADAPTER_PATH]" >&2
  exit 1
fi

RUN_DIR="$1"
ADAPTER="${2:-$RUN_DIR/outputs/adapter}"

if [ ! -d "$ADAPTER" ]; then
  echo "ERROR: adapter path not found: $ADAPTER" >&2
  exit 1
fi

source .venv/bin/activate

# Exported once here; the nohup'd child inherits this environment directly.
# The previous version rebuilt the environment inside a quoted `bash -lc`
# string, which clobbered any pre-existing PYTHONPATH (it re-exported only
# $PWD/src) and exposed expanded values on the `ps` command line.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false

mkdir -p "$RUN_DIR/logs" "$RUN_DIR/eval"
LOG="$RUN_DIR/logs/eval_$(date +%Y%m%d-%H%M%S).log"

# Launch python directly: proper quoting of $ADAPTER/$RUN_DIR (no breakage
# on paths containing spaces or quotes), and no shell-injection surface.
nohup python scripts/evaluate_model.py \
  --model Qwen/Qwen3-8B \
  --adapter "$ADAPTER" \
  --dataset nraptisss/TMF921-intent-to-config-research-sota \
  --output_dir "$RUN_DIR/eval" \
  --load_in_4bit \
  > "$LOG" 2>&1 &

echo $! > "$RUN_DIR/EVAL_PID.txt"

cat <<EOF
Started nohup evaluation.
RUN_DIR=$RUN_DIR
ADAPTER=$ADAPTER
PID=$(cat "$RUN_DIR/EVAL_PID.txt")
LOG=$LOG
RESULTS=$RUN_DIR/eval/all_metrics.json

Monitor:
  tail -f "$LOG"
EOF
scripts/nohup_new_run.sh ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# Start a new unique TMF921 QLoRA run under nohup.
# Usage:
#   export HF_TOKEN=hf_...
#   bash scripts/nohup_new_run.sh [configs/rtx6000ada_qwen3_8b_qlora.yaml]

CONFIG_TEMPLATE="${1:-configs/rtx6000ada_qwen3_8b_qlora.yaml}"

if [ ! -f "$CONFIG_TEMPLATE" ]; then
  echo "ERROR: config not found: $CONFIG_TEMPLATE" >&2
  exit 1
fi

if [ -z "${HF_TOKEN:-}" ]; then
  echo "ERROR: HF_TOKEN is not set. Run: export HF_TOKEN=hf_..." >&2
  exit 1
fi

source .venv/bin/activate

# Export everything the trainer needs so the nohup'd child inherits it.
# SECURITY: do NOT splice HF_TOKEN into a quoted command string — command
# lines are world-readable via `ps`; environment inheritance keeps it private.
export HF_TOKEN
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false
export TRACKIO_SPACE_ID="${TRACKIO_SPACE_ID:-}"

RUN_ID="qwen3-8b-qlora-$(date +%Y%m%d-%H%M%S)"
RUN_DIR="$PWD/runs/$RUN_ID"
mkdir -p "$RUN_DIR"/{logs,outputs,eval,configs}
cp "$CONFIG_TEMPLATE" "$RUN_DIR/configs/config.yaml"

# Rewrite the copied config so outputs/checkpoints land inside the run dir.
# Quoted heredoc: values are passed through the environment instead of being
# interpolated into python source, so paths containing quotes cannot break
# or inject into the script.
RUN_ID="$RUN_ID" RUN_DIR="$RUN_DIR" python - <<'PY'
import os
from pathlib import Path

import yaml

run_id = os.environ["RUN_ID"]
run_dir = Path(os.environ["RUN_DIR"])
cfg_path = run_dir / "configs" / "config.yaml"
cfg = yaml.safe_load(cfg_path.read_text())

cfg["output_dir"] = str(run_dir / "outputs" / "adapter")
cfg["run_name"] = run_id
cfg["hub_model_id"] = f"nraptisss/Qwen3-8B-TMF921-Intent-QLoRA-{run_id}"
cfg["push_to_hub"] = True

# Trackio can also be passed via env TRACKIO_SPACE_ID.
if os.environ.get("TRACKIO_SPACE_ID"):
    cfg["trackio_space_id"] = os.environ["TRACKIO_SPACE_ID"]

cfg_path.write_text(yaml.safe_dump(cfg, sort_keys=False))
print(yaml.safe_dump(cfg, sort_keys=False))
PY

echo "$RUN_ID" > "$RUN_DIR/RUN_ID.txt"
echo "$RUN_DIR" > "$RUN_DIR/RUN_DIR.txt"
date -Is > "$RUN_DIR/STARTED_AT.txt"

# The child inherits HF_TOKEN etc. from this script's environment, so the
# nohup command line carries no secrets and no re-quoted variables.
nohup python scripts/train_qlora.py \
  --config "$RUN_DIR/configs/config.yaml" \
  > "$RUN_DIR/logs/train.log" 2>&1 &

echo $! > "$RUN_DIR/TRAIN_PID.txt"

cat <<EOF
Started new nohup training run.
RUN_ID=$RUN_ID
RUN_DIR=$RUN_DIR
PID=$(cat "$RUN_DIR/TRAIN_PID.txt")
LOG=$RUN_DIR/logs/train.log

Monitor:
  tail -f "$RUN_DIR/logs/train.log"
  watch -n 2 nvidia-smi

Resume if interrupted:
  bash scripts/nohup_resume.sh "$RUN_DIR"

Evaluate after training:
  bash scripts/nohup_eval.sh "$RUN_DIR"
EOF
scripts/nohup_resume.sh ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# Resume a previous run from latest checkpoint under nohup.
# Usage:
#   export HF_TOKEN=hf_...
#   bash scripts/nohup_resume.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS

if [ $# -lt 1 ]; then
  echo "Usage: $0 <RUN_DIR>" >&2
  exit 1
fi

RUN_DIR="$1"
CONFIG="$RUN_DIR/configs/config.yaml"
OUT_DIR="$RUN_DIR/outputs/adapter"

if [ ! -f "$CONFIG" ]; then
  echo "ERROR: config not found: $CONFIG" >&2
  exit 1
fi

if [ -z "${HF_TOKEN:-}" ]; then
  echo "ERROR: HF_TOKEN is not set. Run: export HF_TOKEN=hf_..." >&2
  exit 1
fi

# Highest-numbered checkpoint dir; sort -V orders checkpoint-1000 after checkpoint-999.
LATEST_CKPT=$(find "$OUT_DIR" -maxdepth 1 -type d -name "checkpoint-*" 2>/dev/null | sort -V | tail -n 1 || true)
if [ -z "$LATEST_CKPT" ]; then
  echo "ERROR: no checkpoint found in $OUT_DIR" >&2
  exit 1
fi

source .venv/bin/activate

# SECURITY: rely on environment inheritance instead of splicing HF_TOKEN
# into a quoted `bash -lc` string — command lines are readable via `ps`.
export HF_TOKEN
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false
export TRACKIO_SPACE_ID="${TRACKIO_SPACE_ID:-}"

# Ensure the logs dir exists even after a partial restore or manual cleanup.
mkdir -p "$RUN_DIR/logs"
LOG="$RUN_DIR/logs/train_resume_$(date +%Y%m%d-%H%M%S).log"

echo "$(date -Is) resume from $LATEST_CKPT" >> "$RUN_DIR/RESUME_HISTORY.txt"

nohup python scripts/train_qlora.py \
  --config "$CONFIG" \
  --resume_from_checkpoint "$LATEST_CKPT" \
  > "$LOG" 2>&1 &

echo $! > "$RUN_DIR/TRAIN_RESUME_PID.txt"

cat <<EOF
Resumed nohup training run.
RUN_DIR=$RUN_DIR
CHECKPOINT=$LATEST_CKPT
PID=$(cat "$RUN_DIR/TRAIN_RESUME_PID.txt")
LOG=$LOG

Monitor:
  tail -f "$LOG"
EOF
scripts/status_run.sh ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# Show status for a run directory.
# Usage: bash scripts/status_run.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS

if [ $# -lt 1 ]; then
  echo "Usage: $0 <RUN_DIR>" >&2
  exit 1
fi

RUN_DIR="$1"

# Report whether the PID recorded in a given marker file is still alive.
report_pid_file() {
  local marker="$1" pid
  if [ -f "$RUN_DIR/$marker" ]; then
    pid=$(cat "$RUN_DIR/$marker")
    # kill -0 probes existence without sending a signal.
    if kill -0 "$pid" 2>/dev/null; then
      echo "$marker PID=$pid running"
    else
      echo "$marker PID=$pid not running"
    fi
  fi
}

echo "RUN_DIR=$RUN_DIR"
if [ -f "$RUN_DIR/RUN_ID.txt" ]; then
  echo "RUN_ID=$(cat "$RUN_DIR/RUN_ID.txt")"
fi

report_pid_file TRAIN_PID.txt
report_pid_file TRAIN_RESUME_PID.txt
report_pid_file EVAL_PID.txt

echo
if [ -d "$RUN_DIR/outputs/adapter" ]; then
  echo "Latest checkpoints:"
  find "$RUN_DIR/outputs/adapter" -maxdepth 1 -type d -name "checkpoint-*" | sort -V | tail -n 5 || true
fi

echo
if ls "$RUN_DIR"/logs/*.log >/dev/null 2>&1; then
  echo "Logs:"
  ls -lh "$RUN_DIR"/logs/*.log
  echo
  echo "Last train/eval lines:"
  tail -n 20 "$RUN_DIR"/logs/*.log
fi

echo
if [ -f "$RUN_DIR/eval/all_metrics.json" ]; then
  echo "Eval metrics: $RUN_DIR/eval/all_metrics.json"
fi
scripts/train_qlora.py CHANGED
@@ -66,6 +66,7 @@ def parse_args():
66
  p.add_argument("--no_push", action="store_true")
67
  p.add_argument("--packing", action="store_true", help="Override config and enable packing. Requires compatible attention setup.")
68
  p.add_argument("--flash_attn", action="store_true", help="Use flash_attention_2 in model_init_kwargs. Install flash-attn first.")
 
69
  p.add_argument("--seed", type=int, default=42)
70
  return p.parse_args()
71
 
@@ -181,7 +182,10 @@ def main():
181
  callbacks=[TrackioAlertCallback()],
182
  )
183
 
184
- trainer.train()
 
 
 
185
  metrics = trainer.evaluate()
186
  write_json(Path(cfg["output_dir"]) / "final_eval_metrics.json", metrics)
187
  trainer.save_model(cfg["output_dir"])
 
66
  p.add_argument("--no_push", action="store_true")
67
  p.add_argument("--packing", action="store_true", help="Override config and enable packing. Requires compatible attention setup.")
68
  p.add_argument("--flash_attn", action="store_true", help="Use flash_attention_2 in model_init_kwargs. Install flash-attn first.")
69
+ p.add_argument("--resume_from_checkpoint", default=None, help="Path to checkpoint dir, or 'true' to auto-resume latest checkpoint in output_dir")
70
  p.add_argument("--seed", type=int, default=42)
71
  return p.parse_args()
72
 
 
182
  callbacks=[TrackioAlertCallback()],
183
  )
184
 
185
+ resume_arg = args.resume_from_checkpoint
186
+ if resume_arg is not None and str(resume_arg).lower() == "true":
187
+ resume_arg = True
188
+ trainer.train(resume_from_checkpoint=resume_arg)
189
  metrics = trainer.evaluate()
190
  write_json(Path(cfg["output_dir"]) / "final_eval_metrics.json", metrics)
191
  trainer.save_model(cfg["output_dir"])