File size: 2,266 Bytes
0813516
 
 
 
 
5e9fb2f
 
 
 
 
 
 
0813516
 
 
 
 
 
5e9fb2f
 
 
 
 
0813516
5e9fb2f
 
 
0813516
5e9fb2f
 
0813516
 
 
 
e1fcc90
0813516
 
5e9fb2f
0813516
 
 
 
 
 
5e9fb2f
0813516
e1fcc90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env bash
# Same environment as your HF Job (Space clone + nbconvert + upload to Space).
# Old UI command was invalid shell (no &&); this version is a proper chain.
#
# Requires: hf auth login (token is sent via --secrets HF_TOKEN from the CLI cache)
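# Optional env:
#   HF_SPACE_REPO_ID  — Space to clone and upload into (default: ycwhencpp/train-new).
#   HF_JOB_NAMESPACE   — Hub namespace for the Job (default: part before / in HF_SPACE_REPO_ID).
#                        The HF CLI otherwise calls /whoami-v2 to get it, which is rate-limited.
#   HF_JOB_IMAGE       — container image for the Job (default: pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime).
#   HF_JOB_FLAVOR      — hardware flavor for the Job (default: l40sx1).
#   HF_JOB_TIMEOUT     — Job timeout (default: 8h).
#   NB_EXEC_TIMEOUT    — forwarded to the Job as NB_EXEC_TIMEOUT (default: 3600).
#   INCLUDE_CHECKPOINTS — set to 1 to bundle checkpoints/ in artifacts.tar.gz (default: 0)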
#
# The token must have job.write in that namespace. Use a token from the same org/user as the Space.

set -euo pipefail

#######################################
# Check that a Space repo id has the form "<owner>/<name>".
# Arguments: $1 - candidate repo id
# Returns:   0 when $1 is exactly two non-empty, slash-free parts joined by
#            a single "/"; 1 otherwise
#######################################
is_valid_space_repo() {
  local -r repo_id_re='^[^/]+/[^/]+$'
  [[ "$1" =~ $repo_id_re ]]
}

# Job settings; each falls back to the default shown when its env var is unset or empty.
IMAGE="${HF_JOB_IMAGE:-pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime}"
FLAVOR="${HF_JOB_FLAVOR:-l40sx1}"
TIMEOUT="${HF_JOB_TIMEOUT:-8h}"
SPACE_REPO="${HF_SPACE_REPO_ID:-ycwhencpp/train-new}"
# A bare */* glob would also accept "/name", "org/" and "a/b/c"; "/name" in
# particular would leave NAMESPACE empty below, so require both parts to be non-empty.
if ! is_valid_space_repo "$SPACE_REPO"; then
  echo "HF_SPACE_REPO_ID must be org/name (e.g. ycwhencpp/my-space), got: $SPACE_REPO" >&2
  exit 1
fi
NB_EXEC_TIMEOUT="${NB_EXEC_TIMEOUT:-3600}"
INCLUDE_CHECKPOINTS="${INCLUDE_CHECKPOINTS:-0}"
# hf jobs run → HfApi.run_job() calls whoami() when --namespace is omitted; /whoami-v2 is strict.
# Default to the owner part of the Space id so --namespace is always passed explicitly.
NAMESPACE="${HF_JOB_NAMESPACE:-${SPACE_REPO%%/*}}"
# Pre-flight checks before submitting the job.
# First make sure the hf CLI exists at all: without this, a missing binary makes
# the auth probe below fail with "command not found" and the user is wrongly
# told to log in.
if ! command -v hf >/dev/null 2>&1; then
  echo "hf CLI not found on PATH (it is provided by the huggingface_hub package)" >&2
  exit 1
fi

# Local auth is enough to submit; token is passed as --secrets HF_TOKEN on the job
if ! hf auth list &>/dev/null; then
  echo "Run: hf auth login" >&2
  exit 1
fi

# Command line executed inside the job container (via `bash -lc`).
# Every step is kept in single quotes on purpose: ${HF_TOKEN} and ${SPACE_REPO}
# must be expanded by the shell inside the job, not by this script.
# Steps are joined with "; " rather than "&&": under `set -e` a failing command
# that is not the last one of an && list does NOT abort the script, so
# "apt-get update && apt-get install" would silently continue after a failed update.
# NOTE(review): the clone URL embeds the token, so it ends up in /work/.git/config —
# confirm training/hf_remote_run.sh never uploads the .git directory.
# shellcheck disable=SC2016
bootstrap_steps=(
  'set -euo pipefail'
  'export DEBIAN_FRONTEND=noninteractive'
  'apt-get update -qq'
  'apt-get install -y --no-install-recommends git curl ca-certificates'
  'pip install -q --root-user-action=ignore --upgrade "typing_extensions>=4.15.0" jupyter nbconvert nbclient ipykernel huggingface_hub papermill'
  'rm -rf /work'
  'git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work'
  'cd /work'
  'bash training/hf_remote_run.sh'
)
# Join into a single-line script and drop the trailing separator.
BOOTSTRAP=$(printf '%s; ' "${bootstrap_steps[@]}")
BOOTSTRAP=${BOOTSTRAP%; }

# Submit the job. Options are collected in an array so each flag/value pair
# stays one properly quoted word, then this shell is replaced by the hf CLI.
job_opts=(
  --namespace "$NAMESPACE"
  --flavor "$FLAVOR"
  --detach
  --timeout "$TIMEOUT"
  # Name only, no value: the token itself is supplied from the CLI's cached login.
  --secrets HF_TOKEN
  # Plain environment variables forwarded to the job for the bootstrap/remote script.
  --env "SPACE_REPO=$SPACE_REPO"
  --env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT"
  --env "INCLUDE_CHECKPOINTS=$INCLUDE_CHECKPOINTS"
)

# Image first, then the command to run in it: a login bash executing BOOTSTRAP.
exec hf jobs run "${job_opts[@]}" "$IMAGE" bash -lc "$BOOTSTRAP"