#!/usr/bin/env bash
#
# Launch a stage-2 "weak layer" continuation run on top of a stage-1 adapter.
#
# Usage:
#   <this script> <STAGE1_RUN_DIR> [ADAPTER_PATH]
#
#   STAGE1_RUN_DIR  Run directory of the stage-1 training.
#   ADAPTER_PATH    Adapter directory to continue from
#                   (default: <STAGE1_RUN_DIR>/outputs/adapter).
#
# Environment:
#   HF_TOKEN                   Required. Forwarded to the training process
#                              (presumably the Hugging Face token used for
#                              push_to_hub -- confirm against the trainer).
#   CUDA_VISIBLE_DEVICES       Optional, defaults to 0.
#   STAGE2_RARE_MIN_PER_LAYER  Optional, defaults to 1500
#                              (passed as --rare_min_per_layer).
#   STAGE2_REPLAY_RATIO        Optional, defaults to 0.30
#                              (passed as --replay_ratio).
#   DISABLE_TRACKIO            Optional, defaults to 1.
#   TRACKIO_SPACE_ID           Optional, defaults to empty.
#
# All relative paths (.venv/, scripts/, configs/, src/, runs/) are resolved
# against the current working directory, so run this from the directory that
# contains them.
set -euo pipefail

# --- Arguments ---------------------------------------------------------------
if (( $# < 1 )); then
  echo "Usage: $0 <STAGE1_RUN_DIR> [ADAPTER_PATH]" >&2
  exit 1
fi

STAGE1_RUN_DIR="$1"
ADAPTER="${2:-$STAGE1_RUN_DIR/outputs/adapter}"

# --- Preconditions -----------------------------------------------------------
if [[ ! -d "$ADAPTER" ]]; then
  echo "ERROR: adapter path not found: $ADAPTER" >&2
  exit 1
fi
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "ERROR: HF_TOKEN is not set" >&2
  exit 1
fi

# --- Python environment ------------------------------------------------------
# shellcheck disable=SC1091  # the venv activate script is not available to lint
source .venv/bin/activate
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export PYTHONPATH="$PWD/src:${PYTHONPATH:-}"
export TOKENIZERS_PARALLELISM=false
# Under `set -e` a non-zero exit from the GPU check aborts the launch here.
python scripts/check_gpu.py

# --- Run directory -----------------------------------------------------------
RUN_ID="stage2-weak-$(date +%Y%m%d-%H%M%S)"
RUN_DIR="$PWD/runs/$RUN_ID"
mkdir -p "$RUN_DIR"/{logs,outputs,configs,weak_layer_data,eval_merged}
cp configs/stage2_weak_layer_qwen3_8b.yaml "$RUN_DIR/configs/config.yaml"

# --- Dataset -----------------------------------------------------------------
echo "Building weak-layer dataset at $RUN_DIR/weak_layer_data"
python scripts/build_weak_layer_dataset.py \
  --output_dir "$RUN_DIR/weak_layer_data" \
  --rare_min_per_layer "${STAGE2_RARE_MIN_PER_LAYER:-1500}" \
  --replay_ratio "${STAGE2_REPLAY_RATIO:-0.30}"

# --- Per-run config ----------------------------------------------------------
# Patch the copied config with this run's paths and names. The heredoc
# delimiter is quoted and the values travel via argv, so paths containing
# quotes or backslashes can neither corrupt the Python source nor be
# interpreted as escape sequences.
python - "$RUN_ID" "$RUN_DIR" "$ADAPTER" <<'PY'
import sys
from pathlib import Path

import yaml

run_id, run_dir_arg, adapter_path = sys.argv[1:4]
run_dir = Path(run_dir_arg)
cfg_path = run_dir / "configs" / "config.yaml"
cfg = yaml.safe_load(cfg_path.read_text())
cfg["adapter_path"] = adapter_path
cfg["dataset_dir"] = str(run_dir / "weak_layer_data")
cfg["output_dir"] = str(run_dir / "outputs" / "adapter")
cfg["run_name"] = run_id
cfg["hub_model_id"] = f"nraptisss/Qwen3-8B-TMF921-Intent-QLoRA-{run_id}"
cfg["push_to_hub"] = True
cfg_path.write_text(yaml.safe_dump(cfg, sort_keys=False))
print(yaml.safe_dump(cfg, sort_keys=False))
PY

# --- Run metadata ------------------------------------------------------------
printf '%s\n' "$RUN_ID" > "$RUN_DIR/RUN_ID.txt"
printf '%s\n' "$RUN_DIR" > "$RUN_DIR/RUN_DIR.txt"
printf '%s\n' "$STAGE1_RUN_DIR" > "$RUN_DIR/STAGE1_RUN_DIR.txt"
printf '%s\n' "$ADAPTER" > "$RUN_DIR/STAGE1_ADAPTER.txt"

# --- Background training -----------------------------------------------------
LOG="$RUN_DIR/logs/train_stage2.log"

# Values are handed to the child through S2_TRAIN_* environment variables
# instead of being spliced into the command string: HF_TOKEN therefore never
# appears in the process argument list, and values containing quotes cannot
# break or inject into the child script. The child re-exports them under
# their real names after the login shell has read its profile, so a profile
# cannot override them, and then drops the temporary copies.
# shellcheck disable=SC2016  # expansion is intentionally deferred to the child
S2_TRAIN_HF_TOKEN="$HF_TOKEN" \
S2_TRAIN_CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" \
S2_TRAIN_PYTHONPATH="$PWD/src" \
S2_TRAIN_DISABLE_TRACKIO="${DISABLE_TRACKIO:-1}" \
S2_TRAIN_TRACKIO_SPACE_ID="${TRACKIO_SPACE_ID:-}" \
S2_TRAIN_CONFIG="$RUN_DIR/configs/config.yaml" \
  nohup bash -lc '
source .venv/bin/activate
export HF_TOKEN="$S2_TRAIN_HF_TOKEN"
export CUDA_VISIBLE_DEVICES="$S2_TRAIN_CUDA_VISIBLE_DEVICES"
export PYTHONPATH="$S2_TRAIN_PYTHONPATH"
export TOKENIZERS_PARALLELISM=false
export DISABLE_TRACKIO="$S2_TRAIN_DISABLE_TRACKIO"
export TRACKIO_SPACE_ID="$S2_TRAIN_TRACKIO_SPACE_ID"
train_config="$S2_TRAIN_CONFIG"
unset S2_TRAIN_HF_TOKEN S2_TRAIN_CUDA_VISIBLE_DEVICES S2_TRAIN_PYTHONPATH \
  S2_TRAIN_DISABLE_TRACKIO S2_TRAIN_TRACKIO_SPACE_ID S2_TRAIN_CONFIG

python scripts/train_continue_adapter.py \
  --config "$train_config"
' > "$LOG" 2>&1 &
TRAIN_PID=$!

# --- Summary -----------------------------------------------------------------
printf '%s\n' "$TRAIN_PID" > "$RUN_DIR/TRAIN_PID.txt"
cat <<EOF
Started stage2 weak-layer continuation.
RUN_ID=$RUN_ID
RUN_DIR=$RUN_DIR
STAGE1_ADAPTER=$ADAPTER
PID=$TRAIN_PID
LOG=$LOG

Monitor:
tail -f "$LOG"
bash scripts/status_run.sh "$RUN_DIR"

After training:
python scripts/merge_adapter.py --base_model Qwen/Qwen3-8B --adapter "$RUN_DIR/outputs/adapter" --output_dir "$RUN_DIR/outputs/merged"
EVAL_BATCH_SIZE=8 bash scripts/nohup_eval.sh "$RUN_DIR" "$RUN_DIR/outputs/merged"
python scripts/normalize_eval_metrics.py --eval_dir "$RUN_DIR/eval"
EOF