# physix-live / train / submit.py
# (Hugging Face Hub viewer chrome, preserved as comments so the file parses:
#  "Pratyush-01's picture", "Upload folder using huggingface_hub",
#  commit 08f8699 verified)
"""Submit job_train.py to Hugging Face Jobs.
The `hf jobs uv run` CLI hangs intermittently on the whoami/encoding step
(observed twice — see the conversation log around 2026-04-26). Submitting
via huggingface_hub.HfApi.run_uv_job directly is reliable and lets us pass
the dataset volume mount that the CLI version captures via `-v`.
Usage:
export HF_TOKEN=hf_...
export WANDB_API_KEY=wandb_v1_...
python submit.py
"""
from __future__ import annotations
import os
import sys
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError
# Mount the dataset that contains physix-live/ source at /physix-live in the
# container. _stage_physix_live() in job_train.py expects this exact path.
DATASET_REPO: str = "Pratyush-01/physix-live-src"
MOUNT_PATH: str = "/physix-live"
# Fully-pinned Unsloth studio image (PyTorch 2.9.0 / vLLM 0.16.0 / CUDA 12.8)
# so reruns are reproducible regardless of upstream tag churn.
IMAGE: str = "unsloth/unsloth:2026.3.8-pt2.9.0-vllm-0.16.0-cu12.8-studio-release"
# Switched from a100-large (80 GB, $2.50/hr) to l40sx1 (48 GB, $1.80/hr).
#
# Why: the a100-large pool is heavily queue-bound right now — the unsloth-
# jobs blog drove a wave of users onto exactly that flavor and our last
# submission sat in SCHEDULING for 17+ min before we cancelled. l40sx1 is
# in a separate (less contested) pool and is *cheaper*. It uses an Ada
# Lovelace L40S GPU instead of an Ampere A100 — Unsloth and vLLM both
# fully support L40S, and 3B + LoRA-32 + vLLM standby uses <20 GB, so the
# 48 GB allotment is comfortable. Per-step throughput is roughly comparable
# to A100 for small (≤7B) LoRA workloads.
#
# HF Jobs flavors verified via https://huggingface.co/docs/hub/jobs-pricing
# (note: there is NO a100-medium — only a100-large/x4/x8 in the A100 line).
FLAVOR: str = "l40sx1"
# Hard wall-clock limit passed to run_uv_job() as a duration string
# (presumably parsed server-side by the Jobs API — confirm format in docs).
TIMEOUT: str = "3h"
# Resolve job_train.py relative to this file so submission works from any cwd.
SCRIPT: str = os.path.join(os.path.dirname(__file__), "job_train.py")
# Pass namespace explicitly so run_uv_job() skips its internal whoami() call
# (it needs the namespace to upload the script as <ns>/job-...). The /whoami-v2
# endpoint is heavily rate-limited and was tripping us when chained after
# dataset uploads. Hardcoding our own namespace avoids the round-trip.
NAMESPACE: str = "Pratyush-01"
def main() -> None:
    """Submit job_train.py to Hugging Face Jobs via HfApi.run_uv_job().

    Requires HF_TOKEN and WANDB_API_KEY in the environment; exits with a
    message if either is missing, or with the server error if submission
    fails. On success, prints the job id/url/status and a log-tail hint.
    """
    token = os.environ.get("HF_TOKEN")
    wandb_token = os.environ.get("WANDB_API_KEY")
    if not token:
        sys.exit("HF_TOKEN env var is required")
    if not wandb_token:
        sys.exit("WANDB_API_KEY env var is required")

    api = HfApi(token=token)

    # Imported here rather than at module top (presumably so the module still
    # imports on hub versions predating Volume — TODO confirm).
    from huggingface_hub import Volume

    mounts = [Volume(type="dataset", source=DATASET_REPO, mount_path=MOUNT_PATH)]

    # One write, identical output to the original line-by-line prints.
    print(
        "\n".join(
            [
                f"Submitting job_train.py from {SCRIPT}",
                f" image={IMAGE}",
                f" flavor={FLAVOR}",
                f" volume={DATASET_REPO} -> {MOUNT_PATH}",
                f" timeout={TIMEOUT}",
                f" namespace={NAMESPACE} (skips whoami round-trip)",
            ]
        )
    )

    try:
        job = api.run_uv_job(
            script=SCRIPT,
            image=IMAGE,
            flavor=FLAVOR,
            secrets={"HF_TOKEN": token, "WANDB_API_KEY": wandb_token},
            volumes=mounts,
            timeout=TIMEOUT,
            namespace=NAMESPACE,
        )
    except HfHubHTTPError as exc:
        sys.exit(f"FAILED: {exc}")

    print("\n=== Submitted ===")
    print(f" job_id: {job.id}")
    print(f" url: {job.url}")
    print(f" status: {job.status.stage}")
    print(f"\nTail logs with:\n hf jobs logs {job.id}")
# Entry-point guard: submit only when run as a script, never on import.
if __name__ == "__main__":
    main()