PEFT
qlora
sft
trl
qwen3
tmf921
intent-based-networking
network-slicing
rtx-6000-ada
ml-intern
tmf921-intent-training / scripts /nohup_stage2_weak.sh
nraptisss's picture
Add stage2 weak-layer nohup runner
c3b8793 verified
#!/usr/bin/env bash
set -euo pipefail
# Stage 2 launcher: builds the weak-layer replay dataset and resumes training
# from an adapter produced by an earlier run.
#
# Usage:
#   export HF_TOKEN=hf_...
#   export DISABLE_TRACKIO=1
#   bash scripts/nohup_stage2_weak.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS [adapter_path]

# Print the given message on stderr and stop the script with status 1.
abort() {
  echo "$1" >&2
  exit 1
}

# The stage-1 run directory is the only mandatory argument.
(( $# >= 1 )) || abort "Usage: $0 <STAGE1_RUN_DIR> [ADAPTER_PATH]"

STAGE1_RUN_DIR="$1"
# When no adapter path is given, fall back to the stage-1 run's outputs/adapter.
ADAPTER="${2:-$STAGE1_RUN_DIR/outputs/adapter}"

# Refuse to continue without an adapter directory or a Hugging Face token.
[[ -d "$ADAPTER" ]] || abort "ERROR: adapter path not found: $ADAPTER"
[[ -n "${HF_TOKEN:-}" ]] || abort "ERROR: HF_TOKEN is not set"
# Activate the project virtualenv and pin the runtime environment.
. .venv/bin/activate
: "${CUDA_VISIBLE_DEVICES:=0}"   # default to GPU 0 unless the caller chose one
export CUDA_VISIBLE_DEVICES
PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export PYTHONPATH TOKENIZERS_PARALLELISM=false

# Run the GPU check script; under `set -e` a non-zero exit stops the launcher.
python scripts/check_gpu.py

# Each invocation gets its own timestamped run directory under ./runs.
RUN_ID="stage2-weak-$(date +%Y%m%d-%H%M%S)"
RUN_DIR="$PWD/runs/$RUN_ID"
for subdir in logs outputs configs weak_layer_data eval_merged; do
  mkdir -p "$RUN_DIR/$subdir"
done

# Work on a per-run copy of the config so the template stays untouched.
cp configs/stage2_weak_layer_qwen3_8b.yaml "$RUN_DIR/configs/config.yaml"
# Generate the weak-layer dataset inside the run directory. Both tuning knobs
# can be overridden from the environment; the defaults are 1500 and 0.30.
weak_data_dir="$RUN_DIR/weak_layer_data"
echo "Building weak-layer dataset at $weak_data_dir"
build_args=(
  --output_dir "$weak_data_dir"
  --rare_min_per_layer "${STAGE2_RARE_MIN_PER_LAYER:-1500}"
  --replay_ratio "${STAGE2_REPLAY_RATIO:-0.30}"
)
python scripts/build_weak_layer_dataset.py "${build_args[@]}"
# Patch the per-run config copy with this run's paths and identifiers, then
# print the resulting YAML.
#
# Values are handed to Python through the environment and the heredoc delimiter
# is quoted, so the shell never splices text into the Python source. A run or
# adapter path containing quotes, backslashes or newlines is therefore stored
# verbatim instead of breaking (or injecting into) the embedded program.
STAGE2_RUN_ID="$RUN_ID" \
STAGE2_RUN_DIR="$RUN_DIR" \
STAGE2_ADAPTER="$ADAPTER" \
python - <<'PY'
import os
from pathlib import Path
import yaml

run_id = os.environ["STAGE2_RUN_ID"]
run_dir = Path(os.environ["STAGE2_RUN_DIR"])
cfg_path = run_dir / "configs" / "config.yaml"
cfg = yaml.safe_load(cfg_path.read_text())
# Point the config at the adapter, dataset and output directory for this run.
cfg["adapter_path"] = os.environ["STAGE2_ADAPTER"]
cfg["dataset_dir"] = str(run_dir / "weak_layer_data")
cfg["output_dir"] = str(run_dir / "outputs" / "adapter")
cfg["run_name"] = run_id
cfg["hub_model_id"] = f"nraptisss/Qwen3-8B-TMF921-Intent-QLoRA-{run_id}"
cfg["push_to_hub"] = True
cfg_path.write_text(yaml.safe_dump(cfg, sort_keys=False))
print(yaml.safe_dump(cfg, sort_keys=False))
PY
# Record the run's identity and its stage-1 inputs as one-line marker files.
# printf is used instead of echo because STAGE1_RUN_DIR and ADAPTER come from
# the command line: echo would treat a value such as "-n" or "-e" as an option
# and write an empty file.
printf '%s\n' "$RUN_ID" > "$RUN_DIR/RUN_ID.txt"
printf '%s\n' "$RUN_DIR" > "$RUN_DIR/RUN_DIR.txt"
printf '%s\n' "$STAGE1_RUN_DIR" > "$RUN_DIR/STAGE1_RUN_DIR.txt"
printf '%s\n' "$ADAPTER" > "$RUN_DIR/STAGE1_ADAPTER.txt"
LOG="$RUN_DIR/logs/train_stage2.log"

# Hand every setting to the detached job through the environment rather than
# by pasting values into the command string:
#   * HF_TOKEN no longer appears in the child's argument list, where other
#     users on the host could read it with `ps`;
#   * values containing quotes can no longer break or inject into the command;
#   * the child sees the same PYTHONPATH as this script (including any entries
#     the caller already had), instead of a truncated copy.
export HF_TOKEN CUDA_VISIBLE_DEVICES PYTHONPATH
export TOKENIZERS_PARALLELISM=false
export DISABLE_TRACKIO="${DISABLE_TRACKIO:-1}"
export TRACKIO_SPACE_ID="${TRACKIO_SPACE_ID:-}"

# The script text is a constant single-quoted string; the config path arrives
# as positional parameter $1. `exec` replaces the wrapper shell with python so
# the PID recorded below is the training process itself.
nohup bash -lc '
  source .venv/bin/activate
  exec python scripts/train_continue_adapter.py --config "$1"
' stage2_train "$RUN_DIR/configs/config.yaml" > "$LOG" 2>&1 &

# Save the background job's PID for later status checks.
echo $! > "$RUN_DIR/TRAIN_PID.txt"
# Tell the operator where the run lives, how to watch it, and which follow-up
# commands to run once training has finished.
merged_dir="$RUN_DIR/outputs/merged"
printf '%s\n' \
  "Started stage2 weak-layer continuation." \
  "RUN_ID=$RUN_ID" \
  "RUN_DIR=$RUN_DIR" \
  "STAGE1_ADAPTER=$ADAPTER" \
  "PID=$(<"$RUN_DIR/TRAIN_PID.txt")" \
  "LOG=$LOG" \
  "Monitor:" \
  "tail -f \"$LOG\"" \
  "bash scripts/status_run.sh \"$RUN_DIR\"" \
  "After training:" \
  "python scripts/merge_adapter.py --base_model Qwen/Qwen3-8B --adapter \"$RUN_DIR/outputs/adapter\" --output_dir \"$merged_dir\"" \
  "EVAL_BATCH_SIZE=8 bash scripts/nohup_eval.sh \"$RUN_DIR\" \"$merged_dir\"" \
  "python scripts/normalize_eval_metrics.py --eval_dir \"$RUN_DIR/eval\""