#!/usr/bin/env bash set -euo pipefail # Build weak-layer replay dataset and continue training from an existing adapter. # Usage: # export HF_TOKEN=hf_... # export DISABLE_TRACKIO=1 # bash scripts/nohup_stage2_weak.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS [adapter_path] if [ $# -lt 1 ]; then echo "Usage: $0 [ADAPTER_PATH]" >&2 exit 1 fi STAGE1_RUN_DIR="$1" ADAPTER="${2:-$STAGE1_RUN_DIR/outputs/adapter}" if [ ! -d "$ADAPTER" ]; then echo "ERROR: adapter path not found: $ADAPTER" >&2 exit 1 fi if [ -z "${HF_TOKEN:-}" ]; then echo "ERROR: HF_TOKEN is not set" >&2 exit 1 fi source .venv/bin/activate export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}" export PYTHONPATH="$PWD/src:${PYTHONPATH:-}" export TOKENIZERS_PARALLELISM=false python scripts/check_gpu.py RUN_ID="stage2-weak-$(date +%Y%m%d-%H%M%S)" RUN_DIR="$PWD/runs/$RUN_ID" mkdir -p "$RUN_DIR"/{logs,outputs,configs,weak_layer_data,eval_merged} cp configs/stage2_weak_layer_qwen3_8b.yaml "$RUN_DIR/configs/config.yaml" echo "Building weak-layer dataset at $RUN_DIR/weak_layer_data" python scripts/build_weak_layer_dataset.py \ --output_dir "$RUN_DIR/weak_layer_data" \ --rare_min_per_layer "${STAGE2_RARE_MIN_PER_LAYER:-1500}" \ --replay_ratio "${STAGE2_REPLAY_RATIO:-0.30}" python - < "$RUN_DIR/RUN_ID.txt" echo "$RUN_DIR" > "$RUN_DIR/RUN_DIR.txt" echo "$STAGE1_RUN_DIR" > "$RUN_DIR/STAGE1_RUN_DIR.txt" echo "$ADAPTER" > "$RUN_DIR/STAGE1_ADAPTER.txt" LOG="$RUN_DIR/logs/train_stage2.log" nohup bash -lc " source .venv/bin/activate export HF_TOKEN='$HF_TOKEN' export CUDA_VISIBLE_DEVICES='$CUDA_VISIBLE_DEVICES' export PYTHONPATH='$PWD/src' export TOKENIZERS_PARALLELISM=false export DISABLE_TRACKIO='${DISABLE_TRACKIO:-1}' export TRACKIO_SPACE_ID='${TRACKIO_SPACE_ID:-}' python scripts/train_continue_adapter.py \\ --config '$RUN_DIR/configs/config.yaml' " > "$LOG" 2>&1 & echo $! > "$RUN_DIR/TRAIN_PID.txt" cat <