#!/usr/bin/env bash
# Submit a detached Hugging Face Job that mirrors the Space workflow
# (Space clone + nbconvert + upload to Space).
# The old UI command was invalid shell (steps were not chained); this version
# runs the remote steps as a proper sequence under `set -euo pipefail`.
#
# Requires: hf auth login (token is sent via --secrets HF_TOKEN from the CLI cache)
# Optional env:
#   HF_SPACE_REPO_ID    — Space to clone and upload into, as org/name
#                         (default: ycwhencpp/train-new).
#   HF_JOB_NAMESPACE    — Hub namespace for the Job (default: part before / in
#                         HF_SPACE_REPO_ID). The HF CLI otherwise calls
#                         /whoami-v2 to get it, which is rate-limited.
#   HF_JOB_IMAGE        — Docker image for the Job
#                         (default: pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime).
#   HF_JOB_FLAVOR       — hardware flavor passed to --flavor (default: l40sx1).
#   HF_JOB_TIMEOUT      — value passed to --timeout (default: 8h).
#   NB_EXEC_TIMEOUT     — forwarded to the Job environment unchanged (default: 3600).
#                         NOTE(review): presumably the notebook execution timeout
#                         in seconds — confirm in training/hf_remote_run.sh.
#   INCLUDE_CHECKPOINTS — set to 1 to bundle checkpoints/ in artifacts.tar.gz
#                         (default: 0).
#
# The token must have job.write in that namespace. Use a token from the same
# org/user as the Space.
set -euo pipefail

IMAGE="${HF_JOB_IMAGE:-pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime}"
FLAVOR="${HF_JOB_FLAVOR:-l40sx1}"
TIMEOUT="${HF_JOB_TIMEOUT:-8h}"
SPACE_REPO="${HF_SPACE_REPO_ID:-ycwhencpp/train-new}"
NB_EXEC_TIMEOUT="${NB_EXEC_TIMEOUT:-3600}"
INCLUDE_CHECKPOINTS="${INCLUDE_CHECKPOINTS:-0}"

# Require exactly "<org>/<name>" with both halves non-empty. A looser "*/*"
# match would let "/name" through (empty namespace below) and "a/b/c" through
# (clone URL that can only fail once the job is already running).
repo_id_re='^[^/]+/[^/]+$'
if [[ ! "$SPACE_REPO" =~ $repo_id_re ]]; then
  echo "HF_SPACE_REPO_ID must be org/name (e.g. ycwhencpp/my-space), got: $SPACE_REPO" >&2
  exit 1
fi

# hf jobs run → HfApi.run_job() calls whoami() when --namespace is omitted;
# /whoami-v2 is strict, so always pass an explicit namespace.
NAMESPACE="${HF_JOB_NAMESPACE:-${SPACE_REPO%%/*}}"

# Distinguish "CLI not installed" from "not logged in" so the hint is accurate.
if ! command -v hf >/dev/null 2>&1; then
  echo "hf CLI not found on PATH; install the Hugging Face CLI, then run: hf auth login" >&2
  exit 1
fi

# Local auth is enough to submit; token is passed as --secrets HF_TOKEN on the job.
if ! hf auth list &>/dev/null; then
  echo "Run: hf auth login" >&2
  exit 1
fi

# Commands executed inside the job container, in order. They are single-quoted
# on purpose: ${HF_TOKEN} and ${SPACE_REPO} must be expanded by the remote
# shell (from --secrets / --env), not by this script.
#
# `apt-get update` and `apt-get install` are separate steps rather than an
# `&&` list: under `set -e` a failing non-final command of an `&&` list does
# not abort the shell, which would silently skip the install.
#
# NOTE(review): the clone URL embeds the token, so it ends up in
# /work/.git/config inside the container; switch to a credential helper if
# training/hf_remote_run.sh does not rely on the authenticated remote.
# shellcheck disable=SC2016
bootstrap_steps=(
  'set -euo pipefail'
  'export DEBIAN_FRONTEND=noninteractive'
  'apt-get update -qq'
  'apt-get install -y --no-install-recommends git curl ca-certificates'
  'pip install -q --root-user-action=ignore --upgrade "typing_extensions>=4.15.0" jupyter nbconvert nbclient ipykernel huggingface_hub papermill'
  'rm -rf /work'
  'git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work'
  'cd /work'
  'bash training/hf_remote_run.sh'
)

# Join the steps into a single "a; b; c" command line for `bash -lc`.
BOOTSTRAP="$(printf '%s; ' "${bootstrap_steps[@]}")"
BOOTSTRAP="${BOOTSTRAP%; }"

# exec: nothing is left to do locally, so hand the process over to the CLI and
# let its exit status become this script's exit status.
exec hf jobs run \
  --namespace "$NAMESPACE" \
  --flavor "$FLAVOR" \
  --detach \
  --timeout "$TIMEOUT" \
  --secrets HF_TOKEN \
  --env "SPACE_REPO=$SPACE_REPO" \
  --env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT" \
  --env "INCLUDE_CHECKPOINTS=$INCLUDE_CHECKPOINTS" \
  "$IMAGE" \
  bash -lc "$BOOTSTRAP"