#!/usr/bin/env bash
# Launch a Hugging Face Job that:
#   1. clones the Space repo (default: ycwhencpp/final-iteration, override with
#      HF_SPACE_REPO_ID) into /work
#   2. execs training/hf_remote_run.sh from that clone (heavy lifting lives there)
#
# We keep the inline bootstrap intentionally tiny — anything larger risks the
# "File name too long" failure when the whole command becomes argv to bash -c.
#
# Requires: hf auth login (token forwarded via --secrets HF_TOKEN)
# Optional overrides: HF_JOB_IMAGE, HF_JOB_FLAVOR, HF_JOB_TIMEOUT,
#   HF_SPACE_REPO_ID, NB_EXEC_TIMEOUT, SMOKE_MODE, TEST_ONLY
# Strict mode: abort on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Job settings. Each falls back to the default shown when the corresponding
# environment variable is unset or empty.
IMAGE=${HF_JOB_IMAGE:-pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime}
FLAVOR=${HF_JOB_FLAVOR:-a100x4}
TIMEOUT=${HF_JOB_TIMEOUT:-8h}
SPACE_REPO=${HF_SPACE_REPO_ID:-ycwhencpp/final-iteration}

# These settings keep their own names, so they are defaulted in place.
: "${NB_EXEC_TIMEOUT:=14400}"
: "${SMOKE_MODE:=0}"
: "${TEST_ONLY:=0}"
# Preflight checks, run before anything is submitted.
# A missing hf CLI and a missing login need different remedies, so they are
# reported separately rather than telling the user to log in with a tool that
# is not installed.
if ! command -v hf >/dev/null 2>&1; then
  echo "hf CLI not found on PATH. Install it with: pip install -U huggingface_hub" >&2
  exit 1
fi
# A non-zero exit from `hf auth whoami` is treated as "not logged in"; its
# output is discarded because only the exit status matters here.
if ! hf auth whoami &>/dev/null; then
  echo "Run: hf auth login" >&2
  exit 1
fi
# Script run inside the job container. Kept minimal on purpose: it installs git,
# curl and CA certificates, clones the Space into /work, then replaces itself
# with the in-repo training/hf_remote_run.sh.
# The heredoc delimiter is quoted, so nothing is expanded on the launching
# machine; ${HF_TOKEN} and ${SPACE_REPO} are resolved by the remote shell.
# NOTE(review): the token is embedded in the clone URL, so git records it as
# the clone's origin URL — confirm that is acceptable for this job.
BOOTSTRAP=$(cat <<'EOF'
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y --no-install-recommends git curl ca-certificates >/dev/null
rm -rf /work
git clone --depth 1 "https://user:${HF_TOKEN}@huggingface.co/spaces/${SPACE_REPO}" /work
cd /work
chmod +x training/hf_remote_run.sh
exec bash training/hf_remote_run.sh
EOF
)
# Submit the job and replace this shell with the hf process.
# Options for the hf CLI are collected in an array so each flag sits next to
# its value; HF_TOKEN is forwarded as a secret and the remaining settings as
# plain environment variables for the remote script.
job_opts=(
  --flavor "$FLAVOR"
  --detach
  --timeout "$TIMEOUT"
  --secrets HF_TOKEN
  --env "SPACE_REPO=$SPACE_REPO"
  --env "NB_EXEC_TIMEOUT=$NB_EXEC_TIMEOUT"
  --env "SMOKE_MODE=$SMOKE_MODE"
  --env "TEST_ONLY=$TEST_ONLY"
)

# `--` ends hf CLI option parsing. As originally noted, without it a command
# such as `bash -lc <script>` is read as `--label c <script>` because typer
# consumes the `-l` short flag; the same guard keeps the `-c` used here away
# from hf's own option parser.
exec hf jobs run "${job_opts[@]}" -- "$IMAGE" bash -c "$BOOTSTRAP"
|