#!/usr/bin/env bash
# ============================================================================
# LUNA 100M — LoRA SFT on RAG/MCP data (GPU instance one-shot script)
# ============================================================================
# Clones code from HF, downloads the SFT model + dataset, runs LoRA training.
#
# Usage on a fresh GPU instance (RunPod / Lambda / Vast.ai / etc.):
#   export HF_TOKEN="hf_your_token_here"
#   bash gpu_train.sh
#
# Environment:
#   HF_TOKEN      (required) Hugging Face access token.
#   UPLOAD_TO_HF  (optional) set to 1 to upload the adapter after training.
# ============================================================================
set -euo pipefail

# Abort early with a clear message when the token is missing. The explicit
# export guarantees the Python child processes can read it from os.environ.
HF_TOKEN="${HF_TOKEN:?Set HF_TOKEN env var}"
export HF_TOKEN

CODE_REPO="ASTERIZER/LUNA-Training"
MODEL_REPO="ASTERIZER/LUNA-100M"
DATASET_REPO="ASTERIZER/LUNA-RAG-MCP-SFT-10M"
WORK_DIR="/workspace/luna"

echo "============================================================"
echo " LUNA 100M — LoRA SFT (RAG/MCP) — GPU Setup"
echo "============================================================"

# ── 1. System deps ──────────────────────────────────────────────
echo "[1/6] Installing system dependencies..."
# Run each command as its own statement: inside `a && b`, a failure of `a` is
# exempt from `set -e`, so a failed `apt-get update` used to be ignored.
# Only stdout is discarded; stderr stays visible so a failure explains itself.
apt-get update -qq > /dev/null
apt-get install -y -qq git git-lfs python3-pip > /dev/null
git lfs install --skip-smudge > /dev/null

# ── 2. Clone code ──────────────────────────────────────────────
echo "[2/6] Cloning training code from $CODE_REPO..."
mkdir -p "$WORK_DIR"
cd "$WORK_DIR"

# Download the code snapshot only when one of the two entry-point scripts is
# missing, so re-running the script on the same instance skips this step.
if [[ ! -f "lora_sft_train.py" || ! -f "upload_lora_to_hf.py" ]]; then
  pip install -q huggingface_hub
  # Values are passed through the environment and the heredoc is quoted, so the
  # shell never expands anything inside the Python source and the token does
  # not end up in the interpreter's command line.
  LUNA_CODE_REPO="$CODE_REPO" LUNA_WORK_DIR="$WORK_DIR" python3 - <<'PY'
import os
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id=os.environ['LUNA_CODE_REPO'],
    local_dir=os.environ['LUNA_WORK_DIR'],
    token=os.environ['HF_TOKEN'],
)
print('Code downloaded.')
PY
fi

# ── 3. Python deps ─────────────────────────────────────────────
echo "[3/6] Installing Python dependencies..."
# Best-effort, as before: a failure of the CUDA 12.1 torch install is not
# fatal and the requirements.txt install still runs. It is now reported
# instead of being silently discarded.
if ! pip install -q torch --index-url https://download.pytorch.org/whl/cu121; then
  echo "WARNING: torch install from the cu121 index failed; continuing." >&2
fi
# Fatal on failure (set -e); stderr is left visible so the reason is shown.
pip install -q -r requirements.txt

# ── 4. Download SFT model checkpoint ──────────────────────────
echo "[4/6] Downloading SFT base model from $MODEL_REPO..."
# Step 4 payload: fetch the SFT checkpoint unless it is already on disk.
# The repo id is handed over via the environment and the heredoc is quoted, so
# the shell performs no expansion inside the Python source.
LUNA_MODEL_REPO="$MODEL_REPO" python3 - <<'PY'
import os
from pathlib import Path
from huggingface_hub import hf_hub_download

ckpt_dir = Path('Base/out/input_models/luna_sft_v1')
target = ckpt_dir / 'sft_v1' / 'final' / 'model.pth'
if target.exists():
    print(f'Checkpoint already exists: {target}')
else:
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    hf_hub_download(
        repo_id=os.environ['LUNA_MODEL_REPO'],
        filename='sft_v1/final/model.pth',
        local_dir=str(ckpt_dir),
        token=os.environ.get('HF_TOKEN'),
    )
    print('Model downloaded.')
PY

# ── 5. Download RAG/MCP SFT dataset ───────────────────────────
echo "[5/6] Downloading RAG/MCP dataset from $DATASET_REPO..."
# Each split is skipped individually when its file already exists.
LUNA_DATASET_REPO="$DATASET_REPO" python3 - <<'PY'
import os
from pathlib import Path
from huggingface_hub import hf_hub_download

data_dir = Path('Base/Datasets/rag_mcp_sft')
data_dir.mkdir(parents=True, exist_ok=True)
for fname in ['train.json', 'val.json']:
    target = data_dir / fname
    if target.exists():
        print(f'Already exists: {target}')
        continue
    hf_hub_download(
        repo_id=os.environ['LUNA_DATASET_REPO'],
        filename=fname,
        local_dir=str(data_dir),
        repo_type='dataset',
        token=os.environ.get('HF_TOKEN'),
    )
    print(f'Downloaded: {fname}')
PY

# ── 6. Launch LoRA SFT training ───────────────────────────────
echo "[6/6] Starting LoRA SFT training..."
echo "============================================================"
# Informational only: a missing or failing nvidia-smi must not abort the run.
nvidia-smi --query-gpu=name,memory.total --format=csv,noheader || true
echo ""

CUDA_VISIBLE_DEVICES=0 python3 lora_sft_train.py \
  --config rag_mcp_lora_config.yaml

# Single source of truth for the adapter location and upload destination, so
# the printed hint, the helper-script call and the inline fallback cannot
# drift apart. The upload repo follows MODEL_REPO instead of a repeated literal.
LORA_OUT_DIR="Base/out/sft/rag_mcp_lora"
LORA_REPO_PATH="rag_mcp_lora"

echo "============================================================"
echo " Training complete!"
echo " Adapter saved to: ${LORA_OUT_DIR}/final/"
echo " Full run folder : ${LORA_OUT_DIR}/"
echo " To upload it to Hugging Face, run:"
echo " python3 upload_lora_to_hf.py --repo-id ${MODEL_REPO} --folder ${LORA_OUT_DIR} --path-in-repo ${LORA_REPO_PATH}"

# Optional upload, enabled with UPLOAD_TO_HF=1.
if [[ "${UPLOAD_TO_HF:-0}" == "1" ]]; then
  echo " UPLOAD_TO_HF=1 detected. Uploading adapter to Hugging Face..."
  if [[ -f "upload_lora_to_hf.py" ]]; then
    python3 upload_lora_to_hf.py \
      --repo-id "$MODEL_REPO" \
      --folder "$LORA_OUT_DIR" \
      --path-in-repo "$LORA_REPO_PATH"
  else
    # Fallback when the helper script is absent: verify the expected adapter
    # files exist, then upload the whole run folder via huggingface_hub.
    LUNA_MODEL_REPO="$MODEL_REPO" \
    LUNA_LORA_DIR="$LORA_OUT_DIR" \
    LUNA_LORA_REPO_PATH="$LORA_REPO_PATH" \
      python3 - <<'PY'
import os
from pathlib import Path
from huggingface_hub import HfApi

repo_id = os.environ['LUNA_MODEL_REPO']
path_in_repo = os.environ['LUNA_LORA_REPO_PATH']
folder = Path(os.environ['LUNA_LORA_DIR'])
required = [folder / 'final' / 'adapter_model.pt', folder / 'final' / 'adapter_bundle.pt']
missing = [str(path) for path in required if not path.exists()]
if missing:
    raise FileNotFoundError('Missing expected adapter files: ' + ', '.join(missing))
api = HfApi(token=os.environ['HF_TOKEN'])
api.create_repo(repo_id=repo_id, repo_type='model', exist_ok=True)
api.upload_folder(
    repo_id=repo_id,
    repo_type='model',
    folder_path=str(folder),
    path_in_repo=path_in_repo,
)
print(f'uploaded_lora url=https://huggingface.co/{repo_id}/tree/main/{path_in_repo}')
PY
  fi
fi
echo "============================================================"