anuragredbus committed on
Commit
0980a17
·
1 Parent(s): 225cdfe

training: split remote runner into in-repo script

Browse files

- Add training/hf_remote_run.sh executed inside the HF Job container.
- Shrink training/hf_run_space_train_job.sh to a minimal bootstrap
(clone Space + exec in-repo script) to avoid argv "File name too long".
- Default flavor a100x4 (4xA100, 48 vCPU, 568 GB RAM, 320 GB VRAM).

Made-with: Cursor

training/hf_remote_run.sh ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Remote runner executed inside the HF Job container.
3
+ # The launcher (training/hf_run_space_train_job.sh) clones the Space into /work
4
+ # and then exec's this script. Keeping the heavy lifting in a file (instead of
5
+ # inlining via `bash -lc`) avoids the argv "File name too long" failure.
6
+ #
7
+ # Required env (set by launcher):
8
+ # SPACE_REPO e.g. ycwhencpp/train-new
9
+ # HF_TOKEN secret, injected via --secrets
10
+ # NB_EXEC_TIMEOUT papermill execution timeout in seconds (default 14400)
11
+ # SMOKE_MODE 0 = full training, 1 = smoke (default 0 here)
12
+
13
+ set -euo pipefail
14
+
15
+ : "${SPACE_REPO:?SPACE_REPO must be set}"
16
+ : "${HF_TOKEN:?HF_TOKEN must be set (use --secrets HF_TOKEN)}"
17
+
18
+ NB_EXEC_TIMEOUT="${NB_EXEC_TIMEOUT:-14400}"
19
+ export SMOKE_MODE="${SMOKE_MODE:-0}"
20
+ export HF_HUB_ENABLE_HF_TRANSFER="${HF_HUB_ENABLE_HF_TRANSFER:-1}"
21
+ export TOKENIZERS_PARALLELISM="${TOKENIZERS_PARALLELISM:-false}"
22
+ export DEBIAN_FRONTEND=noninteractive
23
+
24
+ echo "===== hf_remote_run.sh starting at $(date -u +%FT%TZ) ====="
25
+ echo "SPACE_REPO=${SPACE_REPO}"
26
+ echo "SMOKE_MODE=${SMOKE_MODE}"
27
+ echo "NB_EXEC_TIMEOUT=${NB_EXEC_TIMEOUT}"
28
+
29
+ nvidia-smi || true
30
+
31
+ echo "----- pip installs -----"
32
+ pip install -q --root-user-action=ignore --upgrade \
33
+ "typing_extensions>=4.15.0" \
34
+ jupyter nbconvert nbclient ipykernel \
35
+ huggingface_hub hf_transfer papermill
36
+
37
+ echo "----- executing notebook with papermill -----"
38
+ mkdir -p plots run-output checkpoints
39
+ papermill --log-output --progress-bar \
40
+ --execution-timeout "${NB_EXEC_TIMEOUT}" \
41
+ training/train_grpo.ipynb \
42
+ training/train_grpo.executed.ipynb
43
+
44
+ echo "----- uploading artifacts back to Space (run-output/) -----"
45
+ python - <<'PY'
46
+ import os
47
+ from huggingface_hub import HfApi
48
+
49
+ api = HfApi(token=os.environ["HF_TOKEN"])
50
+ api.upload_folder(
51
+ folder_path=".",
52
+ path_in_repo="run-output",
53
+ repo_id=os.environ["SPACE_REPO"],
54
+ repo_type="space",
55
+ allow_patterns=[
56
+ "training/train_grpo.executed.ipynb",
57
+ "plots/**",
58
+ "checkpoints/**/adapter_*",
59
+ "checkpoints/**/lora-*/**",
60
+ "**/lora-*/**",
61
+ ],
62
+ commit_message="HF Job: train_grpo run output",
63
+ )
64
+ print("upload complete")
65
+ PY
66
+
67
+ echo "===== hf_remote_run.sh done at $(date -u +%FT%TZ) ====="
training/hf_run_space_train_job.sh CHANGED
@@ -1,36 +1,37 @@
1
  #!/usr/bin/env bash
2
- # Same environment as your HF Job (Space clone + nbconvert + upload to Space).
3
- # Old UI command was invalid shell (no &&); this version is a proper chain.
 
4
  #
5
- # Requires: hf auth login (token is sent via --secrets HF_TOKEN from the CLI cache)
6
- # Optional: HF_SPACE_REPO_ID (default vaibhavkhandare/train-bhai-train)
 
 
7
 
8
  set -euo pipefail
9
 
10
  IMAGE="${HF_JOB_IMAGE:-pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime}"
11
  FLAVOR="${HF_JOB_FLAVOR:-a100x4}"
12
  TIMEOUT="${HF_JOB_TIMEOUT:-8h}"
13
- SPACE_REPO="${HF_SPACE_REPO_ID:-vaibhavkhandare/train-bhai-train}"
14
- NB_EXEC_TIMEOUT="${NB_EXEC_TIMEOUT:-3600}"
 
15
 
16
  if ! hf auth whoami &>/dev/null; then
17
  echo "Run: hf auth login" >&2
18
  exit 1
19
  fi
20
 
21
- REMOTE_SCRIPT=$(cat <<'EOS'
22
- set -euo pipefail
23
  export DEBIAN_FRONTEND=noninteractive
24
- apt-get update -qq && apt-get install -y --no-install-recommends git curl ca-certificates
25
- pip install -q --root-user-action=ignore --upgrade "typing_extensions>=4.15.0" jupyter nbconvert nbclient ipykernel huggingface_hub papermill
26
  rm -rf /work
27
  git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work
28
  cd /work
29
- papermill --log-output --progress-bar --execution-timeout "${NB_EXEC_TIMEOUT}" \
30
- training/train_grpo.ipynb training/train_grpo.executed.ipynb
31
- python -c "import os; from huggingface_hub import HfApi; HfApi().upload_folder(folder_path='.', path_in_repo='run-output', repo_id=os.environ['SPACE_REPO'], repo_type='space', allow_patterns=['training/train_grpo.executed.ipynb','plots/**','**/lora-*/**'])"
32
- EOS
33
- )
34
 
35
  exec hf jobs run \
36
  --flavor "$FLAVOR" \
@@ -39,5 +40,6 @@ exec hf jobs run \
39
  --secrets HF_TOKEN \
40
  --env "SPACE_REPO=$SPACE_REPO" \
41
  --env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT" \
 
42
  "$IMAGE" \
43
- bash -lc "$REMOTE_SCRIPT"
 
1
  #!/usr/bin/env bash
2
+ # Launch an HF Job that:
3
+ # 1. clones the Space repo (default: ycwhencpp/train-new) into /work
4
+ # 2. execs training/hf_remote_run.sh from that clone (heavy lifting lives there)
5
  #
6
+ # We keep the inline bootstrap intentionally tiny; anything larger risks the
7
+ # "File name too long" failure when the whole command becomes argv to bash -lc.
8
+ #
9
+ # Requires: hf auth login (token forwarded via --secrets HF_TOKEN)
10
 
11
  set -euo pipefail
12
 
13
  IMAGE="${HF_JOB_IMAGE:-pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime}"
14
  FLAVOR="${HF_JOB_FLAVOR:-a100x4}"
15
  TIMEOUT="${HF_JOB_TIMEOUT:-8h}"
16
+ SPACE_REPO="${HF_SPACE_REPO_ID:-ycwhencpp/train-new}"
17
+ NB_EXEC_TIMEOUT="${NB_EXEC_TIMEOUT:-14400}"
18
+ SMOKE_MODE="${SMOKE_MODE:-0}"
19
 
20
  if ! hf auth whoami &>/dev/null; then
21
  echo "Run: hf auth login" >&2
22
  exit 1
23
  fi
24
 
25
+ # Tiny bootstrap: install git, clone the Space, hand off to the in-repo script.
26
+ BOOTSTRAP='set -euo pipefail
27
  export DEBIAN_FRONTEND=noninteractive
28
+ apt-get update -qq
29
+ apt-get install -y --no-install-recommends git curl ca-certificates >/dev/null
30
  rm -rf /work
31
  git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work
32
  cd /work
33
+ chmod +x training/hf_remote_run.sh
34
+ exec bash training/hf_remote_run.sh'
 
 
 
35
 
36
  exec hf jobs run \
37
  --flavor "$FLAVOR" \
 
40
  --secrets HF_TOKEN \
41
  --env "SPACE_REPO=$SPACE_REPO" \
42
  --env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT" \
43
+ --env "SMOKE_MODE=$SMOKE_MODE" \
44
  "$IMAGE" \
45
+ bash -lc "$BOOTSTRAP"