# Same environment as your HF Job (Space clone + nbconvert + upload to Space).
# Old UI command was invalid shell (no &&); this version is a proper chain.
#
# Submits a detached Hugging Face Job whose container clones the Space repo
# into /work and then runs training/hf_remote_run.sh from that checkout.
#
# Requires: hf auth login (token is sent via --secrets HF_TOKEN from the CLI cache)
# Optional env:
#   HF_SPACE_REPO_ID    — Space to clone and upload into (default: ycwhencpp/train-new).
#                         Must be exactly owner/name.
#   HF_JOB_NAMESPACE    — Hub namespace for the Job (default: part before / in HF_SPACE_REPO_ID).
#                         The HF CLI otherwise calls /whoami-v2 to get it, which is rate-limited.
#   HF_JOB_IMAGE        — container image for the Job
#                         (default: pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime).
#   HF_JOB_FLAVOR       — value passed to --flavor (default: l40sx1).
#   HF_JOB_TIMEOUT      — value passed to --timeout (default: 8h).
#   NB_EXEC_TIMEOUT     — forwarded unchanged into the Job environment (default: 3600).
#   INCLUDE_CHECKPOINTS — set to 1 to bundle checkpoints/ in artifacts.tar.gz (default: 0)
#
# The token must have job.write in that namespace. Use a token from the same org/user as the Space.
set -euo pipefail

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

readonly IMAGE="${HF_JOB_IMAGE:-pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime}"
readonly FLAVOR="${HF_JOB_FLAVOR:-l40sx1}"
readonly TIMEOUT="${HF_JOB_TIMEOUT:-8h}"
readonly SPACE_REPO="${HF_SPACE_REPO_ID:-ycwhencpp/train-new}"
readonly NB_EXEC_TIMEOUT="${NB_EXEC_TIMEOUT:-3600}"
readonly INCLUDE_CHECKPOINTS="${INCLUDE_CHECKPOINTS:-0}"

# Require exactly "owner/name": one slash, both sides non-empty, no whitespace.
# A bare */* glob would also accept "/name" (empty namespace), "owner/" and
# "a/b/c", all of which break the namespace derivation or the clone URL below.
readonly SPACE_REPO_PATTERN='^[^/[:space:]]+/[^/[:space:]]+$'
if [[ ! "$SPACE_REPO" =~ $SPACE_REPO_PATTERN ]]; then
  die "HF_SPACE_REPO_ID must be org/name (e.g. ycwhencpp/my-space), got: $SPACE_REPO"
fi

# hf jobs run → HfApi.run_job() calls whoami() when --namespace is omitted; /whoami-v2 is strict.
readonly NAMESPACE="${HF_JOB_NAMESPACE:-${SPACE_REPO%%/*}}"

# Distinguish "CLI not installed" from "not logged in" so the hint is accurate.
if ! command -v hf >/dev/null 2>&1; then
  die "hf CLI not found on PATH (install it with: pip install -U huggingface_hub)"
fi

# Local auth is enough to submit; token is passed as --secrets HF_TOKEN on the job.
# Output is discarded on purpose: only the exit status matters here.
if ! hf auth list &>/dev/null; then
  die "Run: hf auth login"
fi

# Commands executed inside the Job container. They are single-quoted on
# purpose: ${HF_TOKEN} and ${SPACE_REPO} must be expanded by the remote shell
# (from --secrets / --env), not by this script.
# apt-get update and apt-get install are separate steps rather than an &&
# list, because `set -e` ignores a failure of a non-final command in an &&
# list; as separate steps a failed update aborts the job immediately.
# NOTE(review): the token is embedded in the clone URL, so it presumably ends
# up in /work/.git/config inside the container — confirm that is acceptable.
# shellcheck disable=SC2016 — expansion is deliberately deferred to the container
bootstrap_steps=(
  'set -euo pipefail'
  'export DEBIAN_FRONTEND=noninteractive'
  'apt-get update -qq'
  'apt-get install -y --no-install-recommends git curl ca-certificates'
  'pip install -q --root-user-action=ignore --upgrade "typing_extensions>=4.15.0" jupyter nbconvert nbclient ipykernel huggingface_hub papermill'
  'rm -rf /work'
  'git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work'
  'cd /work'
  'bash training/hf_remote_run.sh'
)

# Join the steps into one "; "-separated line, then drop the trailing separator.
printf -v BOOTSTRAP '%s; ' "${bootstrap_steps[@]}"
BOOTSTRAP="${BOOTSTRAP%; }"
readonly BOOTSTRAP

# Arguments for `hf jobs run`, kept in an array so each value stays one word.
job_args=(
  --namespace "$NAMESPACE"
  --flavor "$FLAVOR"
  --detach
  --timeout "$TIMEOUT"
  --secrets HF_TOKEN
  --env "SPACE_REPO=$SPACE_REPO"
  --env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT"
  --env "INCLUDE_CHECKPOINTS=$INCLUDE_CHECKPOINTS"
  "$IMAGE"
  bash -lc "$BOOTSTRAP"
)

# Replace this process with the CLI so its exit status becomes the script's.
exec hf jobs run "${job_args[@]}"