Fix: split remote script to avoid ARG_MAX / ENAMETOOLONG on bash -lc
Browse files- training/hf_remote_run.sh +43 -0
- training/hf_run_space_train_job.sh +2 -63
training/hf_remote_run.sh
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# Executed inside the HF Job container after the repo is cloned to /work.
|
| 3 |
+
# Called by hf_run_space_train_job.sh bootstrap command.
|
| 4 |
+
set -euo pipefail
|
| 5 |
+
|
| 6 |
+
cd /work
|
| 7 |
+
|
| 8 |
+
papermill --log-output --progress-bar --execution-timeout "${NB_EXEC_TIMEOUT}" \
|
| 9 |
+
training/train_grpo.ipynb training/train_grpo.executed.ipynb
|
| 10 |
+
|
| 11 |
+
STAGE="/work/_run_upload"
|
| 12 |
+
rm -rf "$STAGE" && mkdir -p "$STAGE/training"
|
| 13 |
+
cp -a training/train_grpo.executed.ipynb "$STAGE/training/"
|
| 14 |
+
[ -d plots ] && cp -a plots "$STAGE/"
|
| 15 |
+
[ -d viraltest_trained_adapter ] && cp -a viraltest_trained_adapter "$STAGE/"
|
| 16 |
+
if [ "${INCLUDE_CHECKPOINTS:-0}" = "1" ] && [ -d checkpoints ]; then
|
| 17 |
+
cp -a checkpoints "$STAGE/"
|
| 18 |
+
fi
|
| 19 |
+
|
| 20 |
+
TARBALL="/tmp/r.tgz"
|
| 21 |
+
( cd "$STAGE" && tar -czf "$TARBALL" . )
|
| 22 |
+
|
| 23 |
+
export HOME=/tmp HF_HOME=/tmp/hf HUGGINGFACE_HUB_CACHE=/tmp/hfc HF_HUB_CACHE=/tmp/hfc TMPDIR=/tmp XDG_CACHE_HOME=/tmp
|
| 24 |
+
mkdir -p "$HF_HOME" "$HF_HUB_CACHE"
|
| 25 |
+
|
| 26 |
+
python3 -c "
|
| 27 |
+
import os
|
| 28 |
+
os.environ.setdefault('HOME', '/tmp')
|
| 29 |
+
os.environ.setdefault('HF_HOME', '/tmp/hf')
|
| 30 |
+
os.environ.setdefault('HUGGINGFACE_HUB_CACHE', '/tmp/hfc')
|
| 31 |
+
os.environ.setdefault('HF_HUB_CACHE', '/tmp/hfc')
|
| 32 |
+
os.environ.setdefault('TMPDIR', '/tmp')
|
| 33 |
+
os.environ.setdefault('XDG_CACHE_HOME', '/tmp')
|
| 34 |
+
from huggingface_hub import HfApi
|
| 35 |
+
with open('/tmp/r.tgz', 'rb') as f:
|
| 36 |
+
HfApi(token=os.environ.get('HF_TOKEN')).upload_file(
|
| 37 |
+
path_or_fileobj=f,
|
| 38 |
+
path_in_repo='run-output/artifacts.tar.gz',
|
| 39 |
+
repo_id=os.environ['SPACE_REPO'],
|
| 40 |
+
repo_type='space',
|
| 41 |
+
)
|
| 42 |
+
print('Artifacts uploaded.')
|
| 43 |
+
"
|
training/hf_run_space_train_job.sh
CHANGED
|
@@ -32,68 +32,7 @@ if ! hf auth list &>/dev/null; then
|
|
| 32 |
exit 1
|
| 33 |
fi
|
| 34 |
|
| 35 |
-
|
| 36 |
-
set -euo pipefail
|
| 37 |
-
export DEBIAN_FRONTEND=noninteractive
|
| 38 |
-
apt-get update -qq && apt-get install -y --no-install-recommends git curl ca-certificates
|
| 39 |
-
pip install -q --root-user-action=ignore --upgrade "typing_extensions>=4.15.0" jupyter nbconvert nbclient ipykernel huggingface_hub papermill
|
| 40 |
-
rm -rf /work
|
| 41 |
-
git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work
|
| 42 |
-
cd /work
|
| 43 |
-
papermill --log-output --progress-bar --execution-timeout "${NB_EXEC_TIMEOUT}" \
|
| 44 |
-
training/train_grpo.ipynb training/train_grpo.executed.ipynb
|
| 45 |
-
|
| 46 |
-
# Artifacts: keep paths short; tar once, upload one file. Large uploads: huggingface_hub
|
| 47 |
-
# uses Xet (hf_xet) when the file is a path + Xet is installed, which can hit ENAMETOOLONG
|
| 48 |
-
# in ~/.cache/.../xet on some job images. Open the tarball in binary mode and pass the
|
| 49 |
-
# file object so the client does not offer Xet and falls back to LFS/HTTP
|
| 50 |
-
# (see huggingface_hub _commit_api._upload_preupload_* / transfers list).
|
| 51 |
-
# LoRA: viraltest_trained_adapter. Checkpoints: optional INCLUDE_CHECKPOINTS=1.
|
| 52 |
-
STAGE="/work/_run_upload"
|
| 53 |
-
rm -rf "$STAGE" && mkdir -p "$STAGE/training"
|
| 54 |
-
cp -a training/train_grpo.executed.ipynb "$STAGE/training/"
|
| 55 |
-
[ -d plots ] && cp -a plots "$STAGE/"
|
| 56 |
-
[ -d viraltest_trained_adapter ] && cp -a viraltest_trained_adapter "$STAGE/"
|
| 57 |
-
if [ "${INCLUDE_CHECKPOINTS:-0}" = "1" ] && [ -d checkpoints ]; then
|
| 58 |
-
cp -a checkpoints "$STAGE/"
|
| 59 |
-
fi
|
| 60 |
-
|
| 61 |
-
TARBALL="/tmp/r.tgz"
|
| 62 |
-
( cd "$STAGE" && tar -czf "$TARBALL" . )
|
| 63 |
-
|
| 64 |
-
# Short cache/home so anything still touching ~/.cache stays under /tmp
|
| 65 |
-
export HOME=/tmp
|
| 66 |
-
export HF_HOME=/tmp/hf
|
| 67 |
-
export HUGGINGFACE_HUB_CACHE=/tmp/hfc
|
| 68 |
-
export HF_HUB_CACHE=/tmp/hfc
|
| 69 |
-
export TMPDIR=/tmp
|
| 70 |
-
export XDG_CACHE_HOME=/tmp
|
| 71 |
-
mkdir -p "$HF_HOME" "$HF_HUB_CACHE"
|
| 72 |
-
|
| 73 |
-
python <<'PY'
|
| 74 |
-
import os
|
| 75 |
-
|
| 76 |
-
# Before importing huggingface_hub (constants read cache paths at import)
|
| 77 |
-
os.environ.setdefault("HOME", "/tmp")
|
| 78 |
-
os.environ.setdefault("HF_HOME", "/tmp/hf")
|
| 79 |
-
os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/tmp/hfc")
|
| 80 |
-
os.environ.setdefault("HF_HUB_CACHE", "/tmp/hfc")
|
| 81 |
-
os.environ.setdefault("TMPDIR", "/tmp")
|
| 82 |
-
os.environ.setdefault("XDG_CACHE_HOME", "/tmp")
|
| 83 |
-
|
| 84 |
-
from huggingface_hub import HfApi
|
| 85 |
-
|
| 86 |
-
# Buffered file object: hub omits "xet" from LFS transfer offers (avoids xet chunk-cache path blowups)
|
| 87 |
-
with open("/tmp/r.tgz", "rb") as f:
|
| 88 |
-
HfApi(token=os.environ.get("HF_TOKEN")).upload_file(
|
| 89 |
-
path_or_fileobj=f,
|
| 90 |
-
path_in_repo="run-output/artifacts.tar.gz",
|
| 91 |
-
repo_id=os.environ["SPACE_REPO"],
|
| 92 |
-
repo_type="space",
|
| 93 |
-
)
|
| 94 |
-
PY
|
| 95 |
-
EOS
|
| 96 |
-
)
|
| 97 |
|
| 98 |
exec hf jobs run \
|
| 99 |
--namespace "$NAMESPACE" \
|
|
@@ -105,4 +44,4 @@ exec hf jobs run \
|
|
| 105 |
--env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT" \
|
| 106 |
--env "INCLUDE_CHECKPOINTS=$INCLUDE_CHECKPOINTS" \
|
| 107 |
"$IMAGE" \
|
| 108 |
-
bash -lc "$
|
|
|
|
| 32 |
exit 1
|
| 33 |
fi
|
| 34 |
|
| 35 |
# Remote bootstrap, kept deliberately small: the heavy lifting lives in
# training/hf_remote_run.sh inside the cloned repo, so the `bash -lc` command
# string stays far below ARG_MAX (the original inline script tripped
# ARG_MAX / ENAMETOOLONG limits).
# NOTE: single-quoted heredoc — ${HF_TOKEN} and ${SPACE_REPO} are NOT expanded
# here; they expand inside the job container, where they arrive via --env.
BOOTSTRAP=$(cat <<'EOS'
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq && apt-get install -y --no-install-recommends git curl ca-certificates
pip install -q --root-user-action=ignore --upgrade "typing_extensions>=4.15.0" jupyter nbconvert nbclient ipykernel huggingface_hub papermill
rm -rf /work
git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work
cd /work
bash training/hf_remote_run.sh
EOS
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
exec hf jobs run \
|
| 38 |
--namespace "$NAMESPACE" \
|
|
|
|
| 44 |
--env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT" \
|
| 45 |
--env "INCLUDE_CHECKPOINTS=$INCLUDE_CHECKPOINTS" \
|
| 46 |
"$IMAGE" \
|
| 47 |
+
bash -lc "$BOOTSTRAP"
|