Buckets:

bochen2079
/

katherine-k0

Files

xet

bochen2079/katherine-k0 / logs /bootstrap-runpod.sh

bochen2079

15 days ago

download

raw

4.88 kB

	#!/usr/bin/env bash
	# One-shot bootstrap for fresh RunPod / Lambda Labs / similar Linux GPU pod.
	#
	# Usage on the pod:
	# curl -sSL https://raw.githubusercontent.com/bochen2029-pixel/katherine-k0-finetune/master/bootstrap-runpod.sh | bash
	#
	# Or after manual clone:
	# cd katherine-k0-finetune && ./bootstrap-runpod.sh
	#
	# Sets up: clone repo, install Python deps (unsloth + trl + transformers + hf),
	# verify CUDA, optional HF auth, leave you ready to run ./run-cloud-runpod.sh

	# Strict mode: stop on a failing command (errexit), on use of an unset
	# variable (nounset), and when any stage of a pipeline fails (pipefail).
	set -o errexit -o nounset -o pipefail
	# Where to clone from and where the checkout lives. A non-empty value that
	# is already present in the environment takes precedence over the default.
	: "${REPO_URL:=https://github.com/bochen2029-pixel/katherine-k0-finetune.git}"
	: "${REPO_DIR:=${HOME}/katherine-k0-finetune}"

	# Opening banner: records when (UTC), on which host, and as which user the
	# bootstrap runs, followed by one blank separator line.
	printf '%s\n' "============================================================"
	printf '%s\n' "Katherine k0 fine-tune — bootstrap"
	printf '%s\n' "============================================================"
	printf 'Date: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
	printf 'Host: %s\n' "$(hostname)"
	printf 'User: %s\n' "$(whoami)"
	printf '\n'

	# 1. CUDA toolkit check
	#
	# verify_cuda_toolkit: report the installed CUDA release when nvcc is available.
	# Globals:  PATH - prepended with /usr/local/cuda/bin (and exported) when nvcc
	#           is not already resolvable and that directory exists.
	# Outputs:  the "release" line(s) of `nvcc --version`, or a two-line warning
	#           when nvcc cannot be found.
	# Returns:  0 always; the check is informational and must not stop the script.
	verify_cuda_toolkit() {
	  if ! command -v nvcc >/dev/null; then
	    echo " nvcc not in PATH; trying /usr/local/cuda/bin"
	    if [ -d /usr/local/cuda/bin ]; then
	      export PATH="/usr/local/cuda/bin:$PATH"
	    fi
	  fi
	  if command -v nvcc >/dev/null; then
	    # Real pipe into grep. `|| true` keeps a version banner without a
	    # "release" line from failing the pipeline, which would abort the whole
	    # bootstrap under the script's errexit + pipefail settings.
	    nvcc --version | grep release || true
	  else
	    echo " WARN: nvcc not found. Unsloth doesn't strictly need it, but llama.cpp"
	    echo " compilation for GGUF export may. Continue at your own risk."
	  fi
	}
	echo "[1/6] Verifying CUDA toolkit..."
	verify_cuda_toolkit

	# 2. GPU inventory: one CSV row per GPU with its name, total memory and
	# driver version, preceded by a blank line and the step header.
	printf '\n%s\n' "[2/6] Detecting GPUs..."
	nvidia-smi \
	  --query-gpu=name,memory.total,driver_version \
	  --format=csv

	# 3. Clone repo
	# No checkout yet -> clone it. Existing checkout -> fast-forward it (a pull
	# that cannot fast-forward fails rather than merging). Both paths leave the
	# working directory inside the checkout; the dataset checks further down use
	# paths relative to it.
	printf '\n'
	printf '%s\n' "[3/6] Cloning repo..."
	if [ ! -d "$REPO_DIR/.git" ]; then
	  git clone "$REPO_URL" "$REPO_DIR"
	  cd "$REPO_DIR"
	else
	  printf ' repo already at %s; pulling latest\n' "$REPO_DIR"
	  cd "$REPO_DIR"
	  git pull --ff-only
	fi
	printf '[3/6] In: %s\n' "$(pwd)"

	# 4. Install Python deps
	printf '\n%s\n' "[4/6] Installing Python dependencies..."
	echo " (this can take 5-10 min on first run — unsloth pulls a lot)"
	# Packages go into the system interpreter (no virtualenv is created here):
	# python3 when it is on PATH, otherwise plain python.
	if command -v python3 >/dev/null; then
	  PY=python3
	else
	  PY=python
	fi
	$PY -m pip install --quiet --upgrade pip
	# Core training stack. unsloth is installed from the head of its git
	# repository; the other constrained packages carry lower bounds only (>=),
	# so newer releases are accepted.
	pip_requirements=(
	  "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
	  "transformers>=4.50.0"
	  "trl>=0.12.0"
	  "peft>=0.12.0"
	  "bitsandbytes>=0.43.0"
	  "accelerate>=1.0.0"
	  "datasets>=2.20.0"
	  "huggingface_hub>=0.27.0"
	  "sentencepiece"
	  "protobuf"
	  "xformers"
	)
	$PY -m pip install --quiet "${pip_requirements[@]}"
	# Smoke test: import each key library in its own interpreter and print the
	# version it reports; a failed import stops the bootstrap.
	for py_module in unsloth transformers trl peft; do
	  $PY -c "import ${py_module}; print(f' ${py_module}: {${py_module}.__version__}')"
	done

	# 5. HF CLI + auth
	#
	# hf_cli_auth ARGS...: run an auth subcommand (login / whoami) with whichever
	# Hugging Face CLI is installed: `hf auth ARGS...` when `hf` is on PATH,
	# otherwise the legacy `huggingface-cli ARGS...`.
	# Returns: the CLI's exit status, or 127 when neither CLI is available.
	hf_cli_auth() {
	  if command -v hf >/dev/null; then
	    hf auth "$@"
	  elif command -v huggingface-cli >/dev/null; then
	    huggingface-cli "$@"
	  else
	    return 127
	  fi
	}
	# hf_cli_user: print the logged-in user name parsed from the whoami output.
	# Prefers the second field of a "user:" line; when no such line exists, falls
	# back to the first output line (assumes CLIs without a "user:" label print
	# the bare user name first — confirm against the installed CLI version).
	hf_cli_user() {
	  hf_cli_auth whoami 2>&1 | awk '
	    NR == 1 { first = $0 }
	    /user:/ { print $2; found = 1 }
	    END     { if (!found && NR > 0) print first }'
	}
	# setup_hf_auth: report the hf CLI version and log in when a token is known.
	# Globals:  HF_TOKEN - read; when unset/empty and $HOME/.hf_token exists, it
	#           is loaded from that file and exported.
	#           HF_VER   - set to the first line of `hf --version`, or "unknown".
	# Outputs:  progress and warning lines on stdout.
	# Returns:  0 in every login outcome; login is best-effort by design.
	# NOTE(review): the token is passed as a command-line argument, so it can be
	# visible in process listings while the login command runs.
	setup_hf_auth() {
	  if command -v hf >/dev/null; then
	    HF_VER=$(hf --version 2>&1 | head -n 1) || HF_VER=""
	    HF_VER=${HF_VER:-unknown}
	    echo " hf CLI: $HF_VER"
	  else
	    echo " WARN: hf command not in PATH after install; trying huggingface-cli fallback"
	  fi
	  if [ -n "${HF_TOKEN:-}" ]; then
	    if hf_cli_auth login --token "$HF_TOKEN" >/dev/null 2>&1; then
	      echo " HF logged in as $(hf_cli_user)"
	    else
	      echo " WARN: HF login failed; HF sync will skip at run-time"
	    fi
	  elif [ -f "$HOME/.hf_token" ]; then
	    HF_TOKEN=$(<"$HOME/.hf_token")
	    export HF_TOKEN
	    # Best-effort: the exported token is still available to later steps even
	    # if the CLI login itself fails, so a failure here is deliberately ignored.
	    hf_cli_auth login --token "$HF_TOKEN" >/dev/null 2>&1 || true
	    echo " HF token loaded from \$HOME/.hf_token"
	  else
	    echo " HF_TOKEN not set; HF sync will be disabled at run-time"
	    echo " To enable: export HF_TOKEN=<your_token> before running ./run-cloud-runpod.sh"
	  fi
	}
	echo
	echo "[5/6] HF CLI + auth..."
	setup_hf_auth

	# 6. Verify dataset
	#
	# count_records FILE: print the number of non-blank lines in FILE.
	# Unlike `wc -l`, this counts a final line that lacks a trailing newline and
	# ignores blank lines, so the figure matches the number of JSONL records.
	# Prints 0 for an empty file.
	count_records() {
	  awk 'NF { n++ } END { print n + 0 }' "$1"
	}
	echo
	echo "[6/6] Verifying canonical datasets..."
	# Best-effort: make the pipeline scripts executable. Errors (for example a
	# script missing from this checkout) are intentionally ignored so they cannot
	# abort the bootstrap.
	chmod +x run-cloud-runpod.sh _supervise-cloud.sh bootstrap-runpod.sh 2>/dev/null || true
	# Paths are relative to the current directory, which the clone step leaves at
	# the repository checkout.
	if [ -f data/k0_canonical.jsonl ]; then
	  SFT_LINES=$(count_records data/k0_canonical.jsonl)
	  echo " ✓ data/k0_canonical.jsonl ($SFT_LINES SFT examples)"
	else
	  echo " WARN: data/k0_canonical.jsonl missing; rebuild with prep_dataset.py if you have raw sources"
	fi
	if [ -f data/k0_dpo_curated.jsonl ]; then
	  DPO_LINES=$(count_records data/k0_dpo_curated.jsonl)
	  echo " ✓ data/k0_dpo_curated.jsonl ($DPO_LINES DPO pairs)"
	else
	  echo " (no DPO data; DPO stage will skip)"
	fi

	# Closing summary: how to start the training pipeline from the checkout, the
	# stages it runs, and the expected wall-clock time as stated by the script.
	printf '%s\n' \
	  "" \
	  "============================================================" \
	  "Bootstrap complete." \
	  "" \
	  "To launch the full pipeline:" \
	  " cd $REPO_DIR" \
	  " export HF_TOKEN=<your_token> # if not already set" \
	  " ./run-cloud-runpod.sh" \
	  "" \
	  "Stages: SFT → DPO → merge+GGUF (3 quants) → push to HF bucket" \
	  "Total wallclock: ~50-70 min on H200, ~75-90 min on H100" \
	  "============================================================"

Xet Storage Details

Size:: 4.88 kB
Xet hash:: 37c831ea3fd265386806b406ce7b0d7098cbd38d1cf8314c9afc4e5aedd96470

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.