Spaces:
Paused
Paused
Commit ·
0980a17
1
Parent(s): 225cdfe
training: split remote runner into in-repo script
Browse files
- Add training/hf_remote_run.sh, which is executed inside the HF Job container.
- Shrink training/hf_run_space_train_job.sh to a minimal bootstrap
(clone Space + exec in-repo script) to avoid argv "File name too long".
- Default flavor a100x4 (4xA100, 48 vCPU, 568 GB RAM, 320 GB VRAM).
Made-with: Cursor
- training/hf_remote_run.sh +67 -0
- training/hf_run_space_train_job.sh +18 -16
training/hf_remote_run.sh
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# Remote runner executed inside the HF Job container.
#
# The launcher (training/hf_run_space_train_job.sh) clones the Space into /work
# and then exec's this script. Keeping the heavy lifting in a file (instead of
# inlining via `bash -lc`) avoids the argv "File name too long" failure.
#
# Steps:
#   1. install the notebook tooling with pip
#   2. execute training/train_grpo.ipynb with papermill
#   3. upload the executed notebook, plots and LoRA adapters to the Space
#      under run-output/
#
# Step 3 is attempted even when step 2 fails, so a partially executed notebook
# (with its traceback) and any artifacts written so far are still pushed back.
# In that case the script exits with papermill's status after the upload.
#
# Required env (set by launcher):
#   SPACE_REPO        e.g. ycwhencpp/train-new
#   HF_TOKEN          secret, injected via --secrets
# Optional env:
#   NB_EXEC_TIMEOUT   papermill --execution-timeout in seconds (default 14400)
#   SMOKE_MODE        0 = full training, 1 = smoke (default 0 here)

set -euo pipefail

: "${SPACE_REPO:?SPACE_REPO must be set}"
: "${HF_TOKEN:?HF_TOKEN must be set (use --secrets HF_TOKEN)}"

NB_EXEC_TIMEOUT="${NB_EXEC_TIMEOUT:-14400}"
export SMOKE_MODE="${SMOKE_MODE:-0}"
export HF_HUB_ENABLE_HF_TRANSFER="${HF_HUB_ENABLE_HF_TRANSFER:-1}"
export TOKENIZERS_PARALLELISM="${TOKENIZERS_PARALLELISM:-false}"
export DEBIAN_FRONTEND=noninteractive

readonly NOTEBOOK_IN="training/train_grpo.ipynb"
readonly NOTEBOOK_OUT="training/train_grpo.executed.ipynb"

# All paths below are relative to the repo root. The launcher already cd's
# there; only when the notebook is not visible from the current directory do we
# fall back to the root derived from this script's location (training/..).
if [[ ! -f "${NOTEBOOK_IN}" ]]; then
  script_path="${BASH_SOURCE[0]:-}"
  if [[ -n "${script_path}" ]]; then
    cd -- "$(dirname -- "${script_path}")/.."
  fi
fi

echo "===== hf_remote_run.sh starting at $(date -u +%FT%TZ) ====="
echo "SPACE_REPO=${SPACE_REPO}"
echo "SMOKE_MODE=${SMOKE_MODE}"
echo "NB_EXEC_TIMEOUT=${NB_EXEC_TIMEOUT}"

# Diagnostic only: a missing or failing nvidia-smi must not abort the run.
nvidia-smi || true

echo "----- pip installs -----"
pip install -q --root-user-action=ignore --upgrade \
  "typing_extensions>=4.15.0" \
  jupyter nbconvert nbclient ipykernel \
  huggingface_hub hf_transfer papermill

echo "----- executing notebook with papermill -----"
mkdir -p plots run-output checkpoints

# Record papermill's status instead of letting `set -e` abort here, so the
# upload below still gets a chance to run after a failed execution.
nb_status=0
papermill --log-output --progress-bar \
  --execution-timeout "${NB_EXEC_TIMEOUT}" \
  "${NOTEBOOK_IN}" \
  "${NOTEBOOK_OUT}" || nb_status=$?
if (( nb_status != 0 )); then
  echo "papermill exited with status ${nb_status}; uploading available artifacts anyway" >&2
fi

echo "----- uploading artifacts back to Space (run-output/) -----"
upload_status=0
python - <<'PY' || upload_status=$?
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_folder(
    folder_path=".",
    path_in_repo="run-output",
    repo_id=os.environ["SPACE_REPO"],
    repo_type="space",
    allow_patterns=[
        "training/train_grpo.executed.ipynb",
        "plots/**",
        "checkpoints/**/adapter_*",
        "checkpoints/**/lora-*/**",
        "**/lora-*/**",
    ],
    commit_message="HF Job: train_grpo run output",
)
print("upload complete")
PY

# A notebook failure takes precedence over an upload failure when choosing the
# exit status, since it is the root cause the caller needs to see.
if (( nb_status != 0 )); then
  echo "===== hf_remote_run.sh failed (papermill status ${nb_status}) at $(date -u +%FT%TZ) =====" >&2
  exit "${nb_status}"
fi
if (( upload_status != 0 )); then
  echo "===== hf_remote_run.sh failed (upload status ${upload_status}) at $(date -u +%FT%TZ) =====" >&2
  exit "${upload_status}"
fi

echo "===== hf_remote_run.sh done at $(date -u +%FT%TZ) ====="
training/hf_run_space_train_job.sh
CHANGED
|
@@ -1,36 +1,37 @@
|
|
| 1 |
#!/usr/bin/env bash
|
| 2 |
-
#
|
| 3 |
-
#
|
|
|
|
| 4 |
#
|
| 5 |
-
#
|
| 6 |
-
#
|
|
|
|
|
|
|
| 7 |
|
| 8 |
set -euo pipefail
|
| 9 |
|
| 10 |
IMAGE="${HF_JOB_IMAGE:-pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime}"
|
| 11 |
FLAVOR="${HF_JOB_FLAVOR:-a100x4}"
|
| 12 |
TIMEOUT="${HF_JOB_TIMEOUT:-8h}"
|
| 13 |
-
SPACE_REPO="${HF_SPACE_REPO_ID:-
|
| 14 |
-
NB_EXEC_TIMEOUT="${NB_EXEC_TIMEOUT:-
|
|
|
|
| 15 |
|
| 16 |
if ! hf auth whoami &>/dev/null; then
|
| 17 |
echo "Run: hf auth login" >&2
|
| 18 |
exit 1
|
| 19 |
fi
|
| 20 |
|
| 21 |
-
|
| 22 |
-
set -euo pipefail
|
| 23 |
export DEBIAN_FRONTEND=noninteractive
|
| 24 |
-
apt-get update -qq
|
| 25 |
-
|
| 26 |
rm -rf /work
|
| 27 |
git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work
|
| 28 |
cd /work
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
python -c "import os; from huggingface_hub import HfApi; HfApi().upload_folder(folder_path='.', path_in_repo='run-output', repo_id=os.environ['SPACE_REPO'], repo_type='space', allow_patterns=['training/train_grpo.executed.ipynb','plots/**','**/lora-*/**'])"
|
| 32 |
-
EOS
|
| 33 |
-
)
|
| 34 |
|
| 35 |
exec hf jobs run \
|
| 36 |
--flavor "$FLAVOR" \
|
|
@@ -39,5 +40,6 @@ exec hf jobs run \
|
|
| 39 |
--secrets HF_TOKEN \
|
| 40 |
--env "SPACE_REPO=$SPACE_REPO" \
|
| 41 |
--env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT" \
|
|
|
|
| 42 |
"$IMAGE" \
|
| 43 |
-
bash -lc "$
|
|
|
|
#!/usr/bin/env bash
# Launch a HF Job that:
#   1. clones the Space repo (default: ycwhencpp/train-new) into /work
#   2. execs training/hf_remote_run.sh from that clone (heavy lifting lives there)
#
# We keep the inline bootstrap intentionally tiny — anything larger risks the
# "File name too long" failure when the whole command becomes argv to bash -lc.
#
# Requires: the `hf` CLI on PATH and `hf auth login` (token forwarded via
#           --secrets HF_TOKEN)
#
# Optional env overrides:
#   HF_JOB_IMAGE      container image (default pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime)
#   HF_JOB_FLAVOR     job hardware flavor (default a100x4)
#   HF_JOB_TIMEOUT    job timeout (default 8h)
#   HF_SPACE_REPO_ID  Space to clone and upload to (default ycwhencpp/train-new)
#   NB_EXEC_TIMEOUT   forwarded to the job as NB_EXEC_TIMEOUT (default 14400)
#   SMOKE_MODE        forwarded to the job as SMOKE_MODE (default 0)

set -euo pipefail

IMAGE="${HF_JOB_IMAGE:-pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime}"
FLAVOR="${HF_JOB_FLAVOR:-a100x4}"
# NOTE(review): TIMEOUT is presumably consumed by the `hf jobs run` invocation
# further down — confirm it is actually passed there.
TIMEOUT="${HF_JOB_TIMEOUT:-8h}"
SPACE_REPO="${HF_SPACE_REPO_ID:-ycwhencpp/train-new}"
NB_EXEC_TIMEOUT="${NB_EXEC_TIMEOUT:-14400}"
SMOKE_MODE="${SMOKE_MODE:-0}"

# Distinguish "CLI missing" from "not logged in": without this check a missing
# `hf` binary is silenced by the redirect below and misreported as an auth issue.
if ! command -v hf >/dev/null 2>&1; then
  echo "hf CLI not found on PATH (it ships with the huggingface_hub Python package)" >&2
  exit 1
fi

if ! hf auth whoami &>/dev/null; then
  echo "Run: hf auth login" >&2
  exit 1
fi

# Tiny bootstrap: install git, clone the Space, hand off to the in-repo script.
# Single-quoted on purpose: ${HF_TOKEN} and ${SPACE_REPO} must be expanded by
# the shell inside the job container, not by this launcher.
# NOTE(review): the token is embedded in the clone URL, so git will record it
# in /work/.git/config inside the container — acceptable only if that
# filesystem is never published; consider a credential helper if that changes.
BOOTSTRAP='set -euo pipefail
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y --no-install-recommends git curl ca-certificates >/dev/null
rm -rf /work
git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work
cd /work
chmod +x training/hf_remote_run.sh
exec bash training/hf_remote_run.sh'
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
exec hf jobs run \
|
| 37 |
--flavor "$FLAVOR" \
|
|
|
|
| 40 |
--secrets HF_TOKEN \
|
| 41 |
--env "SPACE_REPO=$SPACE_REPO" \
|
| 42 |
--env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT" \
|
| 43 |
+
--env "SMOKE_MODE=$SMOKE_MODE" \
|
| 44 |
"$IMAGE" \
|
| 45 |
+
bash -lc "$BOOTSTRAP"
|