ASTERIZER committed on
Commit
5411740
·
verified ·
1 Parent(s): 88d5091

Upload setup_and_train.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. setup_and_train.sh +94 -0
setup_and_train.sh ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # LUNA 100M β€” Cloud Setup & Train Entrypoint
4
+ # Runs on RunPod, Vast.ai, Lambda Labs, or any Linux GPU pod.
5
+ #
6
+ # USAGE (after cloning repo):
7
+ # bash setup_and_train.sh [gdrive|huggingface] [SOURCE_ID] [MAX_TOKENS]
8
+ #
9
+ # EXAMPLES:
10
+ # # Full dataset from Google Drive folder:
11
+ # bash setup_and_train.sh gdrive 1AbCdEfGhIjKlMnOpQrStUvWx
12
+ #
13
+ # # Full dataset from HuggingFace:
14
+ # bash setup_and_train.sh huggingface ASTERIZER/Luna_Dataset
15
+ #
16
+ # # Quick smoke test (10M tokens only):
17
+ # bash setup_and_train.sh huggingface ASTERIZER/Luna_Dataset 10000000
18
+ #
19
+ # # Dataset already on disk:
20
+ # bash setup_and_train.sh local /workspace/data/litdata_pretrain_final
21
+ # =============================================================================
22
+
23
+ set -e
24
+
25
+ DATA_SOURCE="${1:-local}"
26
+ DATA_ID="${2:-Base/data/litdata_pretrain_final}"
27
+ MAX_TOKENS="${3:-4515286950}"
28
+ DATA_DIR="/workspace/data/litdata_pretrain_final"
29
+ OUT_DIR="/workspace/out/pretrain/luna-100m"
30
+
31
+ echo "=========================================="
32
+ echo " LUNA 100M β€” Cloud Setup"
33
+ echo " Source : $DATA_SOURCE"
34
+ echo " ID/Path : $DATA_ID"
35
+ echo " Tokens : $MAX_TOKENS"
36
+ echo "=========================================="
37
+
38
+ # ── 1. Python packages ────────────────────────────────────────────────────────
39
+ echo ""
40
+ echo "[1/4] Installing dependencies..."
41
+
42
+ pip install -q --upgrade pip
43
+ pip install -q \
44
+ torch torchvision \
45
+ psutil \
46
+ huggingface_hub \
47
+ gdown \
48
+ tensorboard \
49
+ litgpt 2>/dev/null || true
50
+
51
+ echo " Done."
52
+
53
+ # ── 2. Download dataset ───────────────────────────────────────────────────────
54
+ echo ""
55
+ echo "[2/4] Fetching dataset..."
56
+
57
+ if [ "$DATA_SOURCE" = "gdrive" ]; then
58
+ python fetch_data.py --source gdrive --gdrive_id "$DATA_ID" --out_dir "$DATA_DIR"
59
+ elif [ "$DATA_SOURCE" = "huggingface" ]; then
60
+ HF_TOKEN="${HF_TOKEN:-}"
61
+ python fetch_data.py --source huggingface --hf_repo "$DATA_ID" --out_dir "$DATA_DIR" --hf_token "$HF_TOKEN"
62
+ elif [ "$DATA_SOURCE" = "local" ]; then
63
+ python fetch_data.py --source local --local_path "$DATA_ID" --out_dir "$DATA_DIR"
64
+ else
65
+ echo "Unknown source: $DATA_SOURCE (use: gdrive | huggingface | local)"
66
+ exit 1
67
+ fi
68
+
69
+ # ── 3. System + batch size probe ──────────────────────────────────────────────
70
+ echo ""
71
+ echo "[3/4] System probe (auto-detects VRAM, RAM, CPU)..."
72
+ python -c "
73
+ import torch, psutil, os
74
+ props = torch.cuda.get_device_properties(0) if torch.cuda.is_available() else None
75
+ print(f' GPU : {props.name if props else \"CPU only\"} ({props.total_memory/1024**3:.1f} GB)' if props else ' GPU: None')
76
+ print(f' RAM : {psutil.virtual_memory().total/1024**3:.1f} GB')
77
+ print(f' CPUs : {os.cpu_count()}')
78
+ "
79
+
80
+ # ── 4. Train ──────────────────────────────────────────────────────────────────
81
+ echo ""
82
+ echo "[4/4] Starting training (auto_config reads from train_config.yaml)..."
83
+ echo ""
84
+
85
+ python train.py \
86
+ --config train_config.yaml \
87
+ --data_path "$DATA_DIR" \
88
+ --out_dir "$OUT_DIR" \
89
+ --max_tokens "$MAX_TOKENS"
90
+
91
+ echo ""
92
+ echo "=========================================="
93
+ echo " Training complete! Output: $OUT_DIR"
94
+ echo "=========================================="