"""Submit job_train.py to Hugging Face Jobs. The `hf jobs uv run` CLI hangs intermittently on the whoami/encoding step (observed twice — see the conversation log around 2026-04-26). Submitting via huggingface_hub.HfApi.run_uv_job directly is reliable and lets us pass the dataset volume mount that the CLI version captures via `-v`. Usage: export HF_TOKEN=hf_... export WANDB_API_KEY=wandb_v1_... python submit.py """ from __future__ import annotations import os import sys from huggingface_hub import HfApi from huggingface_hub.utils import HfHubHTTPError # Mount the dataset that contains physix-live/ source at /physix-live in the # container. _stage_physix_live() in job_train.py expects this exact path. DATASET_REPO = "Pratyush-01/physix-live-src" MOUNT_PATH = "/physix-live" IMAGE = "unsloth/unsloth:2026.3.8-pt2.9.0-vllm-0.16.0-cu12.8-studio-release" # Switched from a100-large (80 GB, $2.50/hr) to l40sx1 (48 GB, $1.80/hr). # # Why: the a100-large pool is heavily queue-bound right now — the unsloth- # jobs blog drove a wave of users onto exactly that flavor and our last # submission sat in SCHEDULING for 17+ min before we cancelled. l40sx1 is # in a separate (less contested) pool and is *cheaper*. It uses an Ada # Lovelace L40S GPU instead of an Ampere A100 — Unsloth and vLLM both # fully support L40S, and 3B + LoRA-32 + vLLM standby uses <20 GB, so the # 48 GB allotment is comfortable. Per-step throughput is roughly comparable # to A100 for small (≤7B) LoRA workloads. # # HF Jobs flavors verified via https://huggingface.co/docs/hub/jobs-pricing # (note: there is NO a100-medium — only a100-large/x4/x8 in the A100 line). FLAVOR = "l40sx1" TIMEOUT = "3h" SCRIPT = os.path.join(os.path.dirname(__file__), "job_train.py") # Pass namespace explicitly so run_uv_job() skips its internal whoami() call # (it needs the namespace to upload the script as /job-...). The /whoami-v2 # endpoint is heavily rate-limited and was tripping us when chained after # dataset uploads. 
# Hardcoding our own namespace avoids the whoami round-trip.
NAMESPACE = "Pratyush-01"


def main() -> None:
    """Submit job_train.py as a Hugging Face uv job and print its details.

    Reads ``HF_TOKEN`` and ``WANDB_API_KEY`` from the environment, forwards
    both to the job as secrets, mounts ``DATASET_REPO`` at ``MOUNT_PATH`` as
    a dataset volume, and calls ``HfApi.run_uv_job`` with the module-level
    ``IMAGE``, ``FLAVOR``, ``TIMEOUT`` and ``NAMESPACE`` settings. On success
    the job id, URL and status stage are printed.

    Raises:
        SystemExit: with a message when either environment variable is
            missing or empty, when ``SCRIPT`` is not an existing file, when
            ``huggingface_hub.Volume`` cannot be imported, or when the
            submission raises ``HfHubHTTPError``.
    """
    hf_token = os.environ.get("HF_TOKEN")
    wandb_key = os.environ.get("WANDB_API_KEY")
    if not hf_token:
        sys.exit("HF_TOKEN env var is required")
    if not wandb_key:
        sys.exit("WANDB_API_KEY env var is required")

    # Fail fast on a missing script instead of discovering it after the job
    # has been created.
    # NOTE(review): run_uv_job() appears to also accept URLs/commands for
    # `script`, so a bad local path may not be rejected client-side — confirm
    # against the huggingface_hub docs.
    if not os.path.isfile(SCRIPT):
        sys.exit(f"Script not found: {SCRIPT}")

    # Kept as a function-local import, as in the original; turn an import
    # failure into the same clean exit used for the other preconditions.
    try:
        from huggingface_hub import Volume
    except ImportError as exc:
        sys.exit(
            f"Cannot import huggingface_hub.Volume ({exc}); "
            "a newer huggingface_hub may be required"
        )

    api = HfApi(token=hf_token)

    volumes = [
        Volume(
            type="dataset",
            source=DATASET_REPO,
            mount_path=MOUNT_PATH,
        ),
    ]

    print(f"Submitting job_train.py from {SCRIPT}")
    print(f" image={IMAGE}")
    print(f" flavor={FLAVOR}")
    print(f" volume={DATASET_REPO} -> {MOUNT_PATH}")
    print(f" timeout={TIMEOUT}")
    print(f" namespace={NAMESPACE} (skips whoami round-trip)")

    try:
        job = api.run_uv_job(
            script=SCRIPT,
            image=IMAGE,
            flavor=FLAVOR,
            # Secrets are forwarded to the job; they are never printed here.
            secrets={"HF_TOKEN": hf_token, "WANDB_API_KEY": wandb_key},
            volumes=volumes,
            timeout=TIMEOUT,
            namespace=NAMESPACE,
        )
    except HfHubHTTPError as exc:
        sys.exit(f"FAILED: {exc}")

    print("\n=== Submitted ===")
    print(f" job_id: {job.id}")
    print(f" url: {job.url}")
    print(f" status: {job.status.stage}")
    print(f"\nTail logs with:\n hf jobs logs {job.id}")


if __name__ == "__main__":
    main()