#!/usr/bin/env bash
#
# LUNA 100M cloud setup: installs the Python dependencies, fetches the
# pre-training dataset, prints a short hardware summary and launches training.
#
# Usage:
#   bash <this-script> [SOURCE] [ID_OR_PATH] [MAX_TOKENS]
#
# Positional arguments (all optional):
#   SOURCE      gdrive | huggingface | local   (default: local)
#   ID_OR_PATH  Handed to fetch_data.py as --gdrive_id, --hf_repo or
#               --local_path, depending on SOURCE
#               (default: Base/data/litdata_pretrain_final)
#   MAX_TOKENS  Forwarded to train.py as --max_tokens (default: 4515286950)
#
# Environment:
#   HF_TOKEN    Forwarded to fetch_data.py as --hf_token when SOURCE is
#               "huggingface"; an empty string is passed when it is unset.
#
# fetch_data.py, train.py and train_config.yaml are referenced by relative
# path, so the script has to be started from the directory that holds them.

set -euo pipefail

readonly DATA_SOURCE="${1:-local}"
readonly DATA_ID="${2:-Base/data/litdata_pretrain_final}"
readonly MAX_TOKENS="${3:-4515286950}"
readonly DATA_DIR="/workspace/data/litdata_pretrain_final"
readonly OUT_DIR="/workspace/out/pretrain/luna-100m"

# Packages installed in step 1.
readonly -a PY_PACKAGES=(
  torch torchvision
  psutil
  huggingface_hub
  gdown
  tensorboard
  litgpt
)

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Echo the effective settings so they end up in the job log.
print_banner() {
  echo "=========================================="
  echo " LUNA 100M β Cloud Setup"
  echo " Source : $DATA_SOURCE"
  echo " ID/Path : $DATA_ID"
  echo " Tokens : $MAX_TOKENS"
  echo "=========================================="
}

# Fail fast, before any slow install/download work, on problems that would
# otherwise only surface several minutes into the run.
check_prerequisites() {
  case "$DATA_SOURCE" in
    gdrive | huggingface | local) ;;
    *) die "Unknown source: $DATA_SOURCE (use: gdrive | huggingface | local)" ;;
  esac

  command -v python >/dev/null || die "python not found on PATH"

  local script
  for script in fetch_data.py train.py; do
    [[ -f "$script" ]] || die "$script not found in $PWD (run this script from the project directory)"
  done
}

# Step 1: upgrade pip, then install the runtime packages.
install_dependencies() {
  echo ""
  echo "[1/4] Installing dependencies..."

  # `python -m pip` guarantees the packages land in the same interpreter that
  # later runs fetch_data.py and train.py.
  python -m pip install -q --upgrade pip

  # The package install is best-effort (the original script tolerated a
  # failure here), but the failure is reported instead of being hidden.
  if ! python -m pip install -q "${PY_PACKAGES[@]}"; then
    printf 'WARNING: dependency installation failed; continuing with the packages already present.\n' >&2
  fi

  echo " Done."
}

# Step 2: materialise the dataset under DATA_DIR via fetch_data.py.
fetch_dataset() {
  echo ""
  echo "[2/4] Fetching dataset..."

  case "$DATA_SOURCE" in
    gdrive)
      python fetch_data.py --source gdrive --gdrive_id "$DATA_ID" --out_dir "$DATA_DIR"
      ;;
    huggingface)
      # NOTE(review): the token is passed on the command line and is therefore
      # visible in the process list; prefer an env-only hand-off if
      # fetch_data.py supports it.
      python fetch_data.py --source huggingface --hf_repo "$DATA_ID" --out_dir "$DATA_DIR" --hf_token "${HF_TOKEN:-}"
      ;;
    local)
      python fetch_data.py --source local --local_path "$DATA_ID" --out_dir "$DATA_DIR"
      ;;
    *)
      # Unreachable after check_prerequisites; kept as a safety net.
      die "Unknown source: $DATA_SOURCE (use: gdrive | huggingface | local)"
      ;;
  esac
}

# Step 3: print GPU name/memory (device 0), total RAM and CPU count.
probe_system() {
  echo ""
  echo "[3/4] System probe (auto-detects VRAM, RAM, CPU)..."

  # Quoted heredoc delimiter: the Python source reaches the interpreter
  # verbatim, with no shell expansion or quote escaping involved.
  python - <<'PY'
import os

import psutil
import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f' GPU : {props.name} ({props.total_memory/1024**3:.1f} GB)')
else:
    print(' GPU: None')
print(f' RAM : {psutil.virtual_memory().total/1024**3:.1f} GB')
print(f' CPUs : {os.cpu_count()}')
PY
}

# Step 4: run train.py against the fetched dataset.
run_training() {
  echo ""
  echo "[4/4] Starting training (auto_config reads from train_config.yaml)..."
  echo ""

  python train.py \
    --config train_config.yaml \
    --data_path "$DATA_DIR" \
    --out_dir "$OUT_DIR" \
    --max_tokens "$MAX_TOKENS"

  echo ""
  echo "=========================================="
  echo " Training complete! Output: $OUT_DIR"
  echo "=========================================="
}

main() {
  print_banner
  check_prerequisites
  install_dependencies
  fetch_dataset
  probe_system
  run_training
}

main "$@"