File size: 2,313 Bytes
0980a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95d0045
0980a17
 
 
 
 
 
 
95d0045
0980a17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env bash
# Remote runner executed inside the HF Job container.
# The launcher (training/hf_run_space_train_job.sh) clones the Space into /work
# and then exec's this script. Keeping the heavy lifting in a file (instead of
# inlining via `bash -lc`) avoids the argv "File name too long" failure.
#
# Required env (set by launcher):
#   SPACE_REPO          e.g. ycwhencpp/train-new
#   HF_TOKEN            secret, injected via --secrets
#   NB_EXEC_TIMEOUT     papermill execution timeout in seconds (default 14400)
#   SMOKE_MODE          0 = full training, 1 = smoke (default 0 here)
#   TEST_ONLY           defaulted to 0 and exported below; NOTE(review):
#                       presumably an eval-only/skip-training switch read by
#                       the notebook -- confirm against train_grpo.ipynb.

# Abort on any unhandled error, on use of unset variables, and on failures
# in any pipeline stage.
set -euo pipefail

# Fail fast with a clear message when the launcher did not inject the
# mandatory variables (':?' aborts the script if the var is unset or empty).
: "${SPACE_REPO:?SPACE_REPO must be set}"
: "${HF_TOKEN:?HF_TOKEN must be set (use --secrets HF_TOKEN)}"

# Fill in defaults for every tunable the launcher may omit, then export the
# ones that child processes (papermill / the notebook / HF libs) must see.
# NB_EXEC_TIMEOUT stays un-exported: only this script passes it on, via argv.
NB_EXEC_TIMEOUT=${NB_EXEC_TIMEOUT:-14400}

SMOKE_MODE=${SMOKE_MODE:-0}
TEST_ONLY=${TEST_ONLY:-0}
HF_HUB_ENABLE_HF_TRANSFER=${HF_HUB_ENABLE_HF_TRANSFER:-1}
TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM:-false}
DEBIAN_FRONTEND=noninteractive
export SMOKE_MODE TEST_ONLY HF_HUB_ENABLE_HF_TRANSFER \
  TOKENIZERS_PARALLELISM DEBIAN_FRONTEND

# Startup banner: timestamp plus the effective configuration, for the job log.
printf '===== hf_remote_run.sh starting at %s =====\n' "$(date -u +%FT%TZ)"
printf '%s\n' \
  "SPACE_REPO=${SPACE_REPO}" \
  "SMOKE_MODE=${SMOKE_MODE}" \
  "TEST_ONLY=${TEST_ONLY}" \
  "NB_EXEC_TIMEOUT=${NB_EXEC_TIMEOUT}"

# Best-effort GPU inventory; tolerated to fail on CPU-only images.
nvidia-smi || true

echo "----- pip installs -----"
# Tooling for headless notebook execution plus fast Hub transfers.
# -q keeps the job log readable; --root-user-action=ignore silences the
# "running pip as root" warning inside the container.
pip_pkgs=(
  "typing_extensions>=4.15.0"
  jupyter nbconvert nbclient ipykernel
  huggingface_hub hf_transfer papermill
)
pip install -q --root-user-action=ignore --upgrade "${pip_pkgs[@]}"

echo "----- executing notebook with papermill -----"

# Output locations the notebook writes into; created up front so later steps
# can rely on them existing even if the notebook produces nothing.
mkdir -p plots run-output checkpoints

# Run the training notebook headlessly, streaming cell output into the job
# log; the executed copy is written alongside the source notebook.
papermill \
  --log-output \
  --progress-bar \
  --execution-timeout "${NB_EXEC_TIMEOUT}" \
  training/train_grpo.ipynb \
  training/train_grpo.executed.ipynb

echo "----- uploading artifacts back to Space (run-output/) -----"
# Persist results before the ephemeral Job container is torn down: push the
# executed notebook, plots, and adapter/LoRA checkpoints into the Space repo
# under the run-output/ prefix. The here-doc delimiter is quoted ('PY') so
# the shell expands nothing inside -- the Python reads HF_TOKEN/SPACE_REPO
# from its own environment. allow_patterns whitelists the interesting files;
# everything else in the work tree (repo clone, caches) is excluded.
python - <<'PY'
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_folder(
    folder_path=".",
    path_in_repo="run-output",
    repo_id=os.environ["SPACE_REPO"],
    repo_type="space",
    allow_patterns=[
        "training/train_grpo.executed.ipynb",
        "plots/**",
        "checkpoints/**/adapter_*",
        "checkpoints/**/lora-*/**",
        "**/lora-*/**",
    ],
    commit_message="HF Job: train_grpo run output",
)
print("upload complete")
PY

echo "===== hf_remote_run.sh done at $(date -u +%FT%TZ) ====="