#!/usr/bin/env bash
# =============================================================================
#  LUNA 100M — Cloud Setup & Train Entrypoint
#  Runs on RunPod, Vast.ai, Lambda Labs, or any Linux GPU pod.
#
#  USAGE (after cloning repo):
#    bash setup_and_train.sh [gdrive|huggingface|local] [SOURCE_ID] [MAX_TOKENS]
#
#  ARGUMENTS (all optional, positional):
#    $1  Data source: gdrive | huggingface | local.
#        Default: local
#    $2  Source identifier, handed to fetch_data.py as --gdrive_id, --hf_repo
#        or --local_path depending on $1.
#        Default: Base/data/litdata_pretrain_final
#    $3  Value handed to train.py as --max_tokens.
#        Default: 4515286950
#
#  ENVIRONMENT:
#    HF_TOKEN  Optional. Handed to fetch_data.py as --hf_token when the source
#              is huggingface; an empty string is passed when it is unset.
#
#  EXAMPLES:
#    # Full dataset from Google Drive folder:
#    bash setup_and_train.sh gdrive 1AbCdEfGhIjKlMnOpQrStUvWx
#
#    # Full dataset from HuggingFace:
#    bash setup_and_train.sh huggingface ASTERIZER/Luna_Dataset
#
#    # Quick smoke test (10M tokens only):
#    bash setup_and_train.sh huggingface ASTERIZER/Luna_Dataset 10000000
#
#    # Dataset already on disk:
#    bash setup_and_train.sh local /workspace/data/litdata_pretrain_final
# =============================================================================

# Abort on the first failing command, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# Positional arguments with their defaults.
DATA_SOURCE="${1:-local}"
DATA_ID="${2:-Base/data/litdata_pretrain_final}"
MAX_TOKENS="${3:-4515286950}"

# Fixed locations: where the dataset is placed and where training writes to.
DATA_DIR="/workspace/data/litdata_pretrain_final"
OUT_DIR="/workspace/out/pretrain/luna-100m"

# Reject an unsupported source now, before any package is installed, so a
# mistyped argument is reported immediately.
case "$DATA_SOURCE" in
  gdrive | huggingface | local) ;;
  *)
    echo "Unknown source: $DATA_SOURCE (use: gdrive | huggingface | local)" >&2
    exit 1
    ;;
esac

echo "=========================================="
echo " LUNA 100M — Cloud Setup"
echo " Source : $DATA_SOURCE"
echo " ID/Path : $DATA_ID"
echo " Tokens : $MAX_TOKENS"
echo "=========================================="

# ── 1. Python packages ────────────────────────────────────────────────────────
echo ""
echo "[1/4] Installing dependencies..."

# `python -m pip` installs into the same interpreter that later runs
# fetch_data.py and train.py; a bare `pip` on PATH may belong to another one.
python -m pip install -q --upgrade pip

pip_packages=(
  torch torchvision
  psutil
  huggingface_hub
  gdown
  tensorboard
  litgpt
)

# Installation stays best-effort (a failure does not stop the script), but the
# failure is reported and pip's own error output is left visible instead of
# being discarded.
if ! python -m pip install -q "${pip_packages[@]}"; then
  echo " WARNING: dependency installation failed; continuing with the packages already present." >&2
fi
echo " Done."

# ── 2. Download dataset ───────────────────────────────────────────────────────
echo ""
echo "[2/4] Fetching dataset..."
# Dispatch on the requested data source. Every branch hands the dataset
# location to fetch_data.py and asks it to place the result in $DATA_DIR.
case "$DATA_SOURCE" in
  gdrive)
    python fetch_data.py --source gdrive \
      --gdrive_id "$DATA_ID" \
      --out_dir "$DATA_DIR"
    ;;
  huggingface)
    # HF_TOKEN is optional; an empty string is passed when it is not set.
    HF_TOKEN="${HF_TOKEN:-}"
    python fetch_data.py --source huggingface \
      --hf_repo "$DATA_ID" \
      --out_dir "$DATA_DIR" \
      --hf_token "$HF_TOKEN"
    ;;
  local)
    python fetch_data.py --source local \
      --local_path "$DATA_ID" \
      --out_dir "$DATA_DIR"
    ;;
  *)
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    echo "Unknown source: $DATA_SOURCE (use: gdrive | huggingface | local)" >&2
    exit 1
    ;;
esac

# ── 3. System + batch size probe ──────────────────────────────────────────────
echo ""
echo "[3/4] System probe (auto-detects VRAM, RAM, CPU)..."

# Print GPU name and memory (when CUDA is available), total RAM and CPU count.
# The quoted here-document delimiter stops the shell from expanding anything
# inside the Python source, so no quote escaping is needed.
python - <<'PY'
import os

import psutil
import torch

props = torch.cuda.get_device_properties(0) if torch.cuda.is_available() else None
if props:
    print(f' GPU : {props.name} ({props.total_memory/1024**3:.1f} GB)')
else:
    print(' GPU: None')
print(f' RAM : {psutil.virtual_memory().total/1024**3:.1f} GB')
print(f' CPUs : {os.cpu_count()}')
PY

# ── 4. Train ──────────────────────────────────────────────────────────────────
echo ""
echo "[4/4] Starting training (auto_config reads from train_config.yaml)..."
echo ""

python train.py \
  --config train_config.yaml \
  --data_path "$DATA_DIR" \
  --out_dir "$OUT_DIR" \
  --max_tokens "$MAX_TOKENS"

echo ""
echo "=========================================="
echo " Training complete! Output: $OUT_DIR"
echo "=========================================="