train-new / training /hf_remote_run.sh
ycwhencpp's picture
Fix: split remote script to avoid ARG_MAX / ENAMETOOLONG on bash -lc
e1fcc90 verified
#!/usr/bin/env bash
# Executed inside the HF Job container after the repo is cloned to /work.
# Called by hf_run_space_train_job.sh bootstrap command.
#
# Steps:
#   1. Execute training/train_grpo.ipynb with papermill.
#   2. Stage the executed notebook plus any produced artifact directories.
#   3. Pack the staging directory into a gzip tarball.
#   4. Upload the tarball to the Space repo as run-output/artifacts.tar.gz.
#
# Environment:
#   NB_EXEC_TIMEOUT      (required) passed to papermill --execution-timeout
#   SPACE_REPO           (required) repo id of the Space receiving the upload
#   HF_TOKEN             (optional) token handed to HfApi
#   INCLUDE_CHECKPOINTS  (optional) set to "1" to also ship checkpoints/
set -euo pipefail

# Fail fast: SPACE_REPO is only consumed by the upload step at the very end,
# so validate it (and the timeout) before spending time on the notebook run.
: "${NB_EXEC_TIMEOUT:?NB_EXEC_TIMEOUT must be set}"
: "${SPACE_REPO:?SPACE_REPO must be set}"

cd /work

papermill --log-output --progress-bar \
  --execution-timeout "${NB_EXEC_TIMEOUT}" \
  training/train_grpo.ipynb training/train_grpo.executed.ipynb

# Build a clean staging tree holding only what should be uploaded.
STAGE="/work/_run_upload"
rm -rf -- "${STAGE:?}"
mkdir -p "$STAGE/training"
cp -a training/train_grpo.executed.ipynb "$STAGE/training/"

# These directories are copied only when the run actually produced them.
for artifact_dir in plots viraltest_trained_adapter; do
  if [[ -d "$artifact_dir" ]]; then
    cp -a "$artifact_dir" "$STAGE/"
  fi
done

# Checkpoints are opt-in via INCLUDE_CHECKPOINTS=1.
if [[ "${INCLUDE_CHECKPOINTS:-0}" == "1" && -d checkpoints ]]; then
  cp -a checkpoints "$STAGE/"
fi

# Archive the staging tree with paths relative to its root.
TARBALL="/tmp/r.tgz"
tar -czf "$TARBALL" -C "$STAGE" .

# Redirect HOME and the Hugging Face / XDG cache locations to /tmp before the
# upload (presumably because the default locations are not writable inside the
# job container -- confirm against the job image).
export HOME=/tmp HF_HOME=/tmp/hf HUGGINGFACE_HUB_CACHE=/tmp/hfc HF_HUB_CACHE=/tmp/hfc TMPDIR=/tmp XDG_CACHE_HOME=/tmp
mkdir -p "$HF_HOME" "$HF_HUB_CACHE"

# The quoted heredoc delimiter keeps the shell from expanding anything inside
# the Python source; the tarball path is handed over as argv[1] so it is
# defined in exactly one place. The exported variables above are inherited by
# the Python process.
python3 - "$TARBALL" <<'PY'
import os
import sys

from huggingface_hub import HfApi

with open(sys.argv[1], 'rb') as f:
    HfApi(token=os.environ.get('HF_TOKEN')).upload_file(
        path_or_fileobj=f,
        path_in_repo='run-output/artifacts.tar.gz',
        repo_id=os.environ['SPACE_REPO'],
        repo_type='space',
    )
print('Artifacts uploaded.')
PY