ycwhencpp committed on
Commit
e1fcc90
·
verified ·
1 Parent(s): 5e9fb2f

Fix: split remote script to avoid ARG_MAX / ENAMETOOLONG on bash -lc

Browse files
training/hf_remote_run.sh ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Executed inside the HF Job container after the repo is cloned to /work.
# Called by hf_run_space_train_job.sh bootstrap command.
set -euo pipefail

# Fail fast with a readable message instead of an opaque `set -u` abort if the
# bootstrap forgot to --env these through.
: "${NB_EXEC_TIMEOUT:?NB_EXEC_TIMEOUT must be set (papermill per-cell timeout)}"
: "${SPACE_REPO:?SPACE_REPO must be set (target Space repo id for artifact upload)}"

cd /work

papermill --log-output --progress-bar --execution-timeout "${NB_EXEC_TIMEOUT}" \
  training/train_grpo.ipynb training/train_grpo.executed.ipynb

# Artifacts: keep paths short; tar once, upload one file.
# LoRA adapter: viraltest_trained_adapter. Checkpoints: opt-in via INCLUDE_CHECKPOINTS=1.
STAGE="/work/_run_upload"
rm -rf "$STAGE" && mkdir -p "$STAGE/training"
cp -a training/train_grpo.executed.ipynb "$STAGE/training/"
[ -d plots ] && cp -a plots "$STAGE/"
[ -d viraltest_trained_adapter ] && cp -a viraltest_trained_adapter "$STAGE/"
if [ "${INCLUDE_CHECKPOINTS:-0}" = "1" ] && [ -d checkpoints ]; then
  cp -a checkpoints "$STAGE/"
fi

TARBALL="/tmp/r.tgz"
( cd "$STAGE" && tar -czf "$TARBALL" . )

# Short cache/home so anything still touching ~/.cache stays under /tmp
# (some job images hit ENAMETOOLONG in ~/.cache/.../xet otherwise).
export HOME=/tmp HF_HOME=/tmp/hf HUGGINGFACE_HUB_CACHE=/tmp/hfc HF_HUB_CACHE=/tmp/hfc TMPDIR=/tmp XDG_CACHE_HOME=/tmp
mkdir -p "$HF_HOME" "$HF_HUB_CACHE"

# Quoted heredoc (no shell expansion inside the Python source) instead of a
# double-quoted -c string, so future edits with `$` cannot be mangled by bash.
python3 - <<'PY'
import os

# Before importing huggingface_hub (constants read cache paths at import time).
os.environ.setdefault('HOME', '/tmp')
os.environ.setdefault('HF_HOME', '/tmp/hf')
os.environ.setdefault('HUGGINGFACE_HUB_CACHE', '/tmp/hfc')
os.environ.setdefault('HF_HUB_CACHE', '/tmp/hfc')
os.environ.setdefault('TMPDIR', '/tmp')
os.environ.setdefault('XDG_CACHE_HOME', '/tmp')

from huggingface_hub import HfApi

# Buffered file object (not a path): the hub then omits "xet" from the LFS
# transfer offers and falls back to LFS/HTTP, avoiding xet chunk-cache path blowups.
with open('/tmp/r.tgz', 'rb') as f:
    HfApi(token=os.environ.get('HF_TOKEN')).upload_file(
        path_or_fileobj=f,
        path_in_repo='run-output/artifacts.tar.gz',
        repo_id=os.environ['SPACE_REPO'],
        repo_type='space',
    )
print('Artifacts uploaded.')
PY
training/hf_run_space_train_job.sh CHANGED
@@ -32,68 +32,7 @@ if ! hf auth list &>/dev/null; then
32
  exit 1
33
  fi
34
 
35
- REMOTE_SCRIPT=$(cat <<'EOS'
36
- set -euo pipefail
37
- export DEBIAN_FRONTEND=noninteractive
38
- apt-get update -qq && apt-get install -y --no-install-recommends git curl ca-certificates
39
- pip install -q --root-user-action=ignore --upgrade "typing_extensions>=4.15.0" jupyter nbconvert nbclient ipykernel huggingface_hub papermill
40
- rm -rf /work
41
- git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work
42
- cd /work
43
- papermill --log-output --progress-bar --execution-timeout "${NB_EXEC_TIMEOUT}" \
44
- training/train_grpo.ipynb training/train_grpo.executed.ipynb
45
-
46
- # Artifacts: keep paths short; tar once, upload one file. Large uploads: huggingface_hub
47
- # uses Xet (hf_xet) when the file is a path + Xet is installed, which can hit ENAMETOOLONG
48
- # in ~/.cache/.../xet on some job images. Open the tarball in binary mode and pass the
49
- # file object so the client does not offer Xet and falls back to LFS/HTTP
50
- # (see huggingface_hub _commit_api._upload_preupload_* / transfers list).
51
- # LoRA: viraltest_trained_adapter. Checkpoints: optional INCLUDE_CHECKPOINTS=1.
52
- STAGE="/work/_run_upload"
53
- rm -rf "$STAGE" && mkdir -p "$STAGE/training"
54
- cp -a training/train_grpo.executed.ipynb "$STAGE/training/"
55
- [ -d plots ] && cp -a plots "$STAGE/"
56
- [ -d viraltest_trained_adapter ] && cp -a viraltest_trained_adapter "$STAGE/"
57
- if [ "${INCLUDE_CHECKPOINTS:-0}" = "1" ] && [ -d checkpoints ]; then
58
- cp -a checkpoints "$STAGE/"
59
- fi
60
-
61
- TARBALL="/tmp/r.tgz"
62
- ( cd "$STAGE" && tar -czf "$TARBALL" . )
63
-
64
- # Short cache/home so anything still touching ~/.cache stays under /tmp
65
- export HOME=/tmp
66
- export HF_HOME=/tmp/hf
67
- export HUGGINGFACE_HUB_CACHE=/tmp/hfc
68
- export HF_HUB_CACHE=/tmp/hfc
69
- export TMPDIR=/tmp
70
- export XDG_CACHE_HOME=/tmp
71
- mkdir -p "$HF_HOME" "$HF_HUB_CACHE"
72
-
73
- python <<'PY'
74
- import os
75
-
76
- # Before importing huggingface_hub (constants read cache paths at import)
77
- os.environ.setdefault("HOME", "/tmp")
78
- os.environ.setdefault("HF_HOME", "/tmp/hf")
79
- os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/tmp/hfc")
80
- os.environ.setdefault("HF_HUB_CACHE", "/tmp/hfc")
81
- os.environ.setdefault("TMPDIR", "/tmp")
82
- os.environ.setdefault("XDG_CACHE_HOME", "/tmp")
83
-
84
- from huggingface_hub import HfApi
85
-
86
- # Buffered file object: hub omits "xet" from LFS transfer offers (avoids xet chunk-cache path blowups)
87
- with open("/tmp/r.tgz", "rb") as f:
88
- HfApi(token=os.environ.get("HF_TOKEN")).upload_file(
89
- path_or_fileobj=f,
90
- path_in_repo="run-output/artifacts.tar.gz",
91
- repo_id=os.environ["SPACE_REPO"],
92
- repo_type="space",
93
- )
94
- PY
95
- EOS
96
- )
97
 
98
  exec hf jobs run \
99
  --namespace "$NAMESPACE" \
@@ -105,4 +44,4 @@ exec hf jobs run \
105
  --env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT" \
106
  --env "INCLUDE_CHECKPOINTS=$INCLUDE_CHECKPOINTS" \
107
  "$IMAGE" \
108
- bash -lc "$REMOTE_SCRIPT"
 
32
  exit 1
33
  fi
34
 
35
# One short bootstrap string keeps `bash -lc "$BOOTSTRAP"` far under ARG_MAX;
# the heavy lifting lives in training/hf_remote_run.sh inside the cloned repo.
# Single quotes are deliberate: ${HF_TOKEN}/${SPACE_REPO} stay literal here and
# are expanded by the remote shell from the job's --env values, so the token
# never appears in the local command line. Adjacent quoted segments joined by
# line continuations concatenate into one string.
BOOTSTRAP='set -euo pipefail; export DEBIAN_FRONTEND=noninteractive; '\
'apt-get update -qq && apt-get install -y --no-install-recommends git curl ca-certificates; '\
'pip install -q --root-user-action=ignore --upgrade "typing_extensions>=4.15.0" jupyter nbconvert nbclient ipykernel huggingface_hub papermill; '\
'rm -rf /work; git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work; '\
'cd /work; bash training/hf_remote_run.sh'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  exec hf jobs run \
38
  --namespace "$NAMESPACE" \
 
44
  --env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT" \
45
  --env "INCLUDE_CHECKPOINTS=$INCLUDE_CHECKPOINTS" \
46
  "$IMAGE" \
47
+ bash -lc "$BOOTSTRAP"