| """Submit job_train.py to Hugging Face Jobs. | |
| The `hf jobs uv run` CLI hangs intermittently on the whoami/encoding step | |
| (observed twice — see the conversation log around 2026-04-26). Submitting | |
| via huggingface_hub.HfApi.run_uv_job directly is reliable and lets us pass | |
| the dataset volume mount that the CLI version captures via `-v`. | |
| Usage: | |
| export HF_TOKEN=hf_... | |
| export WANDB_API_KEY=wandb_v1_... | |
| python submit.py | |
| """ | |
from __future__ import annotations

import os
import sys

from huggingface_hub import HfApi, Volume
from huggingface_hub.utils import HfHubHTTPError
# Mount the dataset that contains the physix-live/ source at /physix-live in
# the container. _stage_physix_live() in job_train.py expects this exact path.
DATASET_REPO = "Pratyush-01/physix-live-src"
MOUNT_PATH = "/physix-live"

IMAGE = "unsloth/unsloth:2026.3.8-pt2.9.0-vllm-0.16.0-cu12.8-studio-release"
# Switched from a100-large (80 GB, $2.50/hr) to l40sx1 (48 GB, $1.80/hr).
#
# Why: the a100-large pool is heavily queue-bound right now; the unsloth-jobs
# blog drove a wave of users onto exactly that flavor, and our last submission
# sat in SCHEDULING for 17+ min before we cancelled. l40sx1 is in a separate
# (less contested) pool and is *cheaper*. It uses an Ada Lovelace L40S GPU
# instead of an Ampere A100; Unsloth and vLLM both fully support the L40S,
# and 3B + LoRA-32 + vLLM standby uses <20 GB, so the 48 GB allotment is
# comfortable. Per-step throughput is roughly comparable to A100 for small
# (≤7B) LoRA workloads.
#
# HF Jobs flavors verified via https://huggingface.co/docs/hub/jobs-pricing
# (note: there is NO a100-medium; only a100-large/x4/x8 in the A100 line).
FLAVOR = "l40sx1"
TIMEOUT = "3h"
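
# Back-of-envelope worst case from the rates quoted above: at the 3h cap,
# l40sx1 costs at most 3 * $1.80 = $5.40 vs 3 * $2.50 = $7.50 on a100-large,
# so the switch stays cheaper even if the job runs the full timeout.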

SCRIPT = os.path.join(os.path.dirname(__file__), "job_train.py")

# Pass the namespace explicitly so run_uv_job() skips its internal whoami()
# call (it needs the namespace to upload the script as <ns>/job-...). The
# /whoami-v2 endpoint is heavily rate-limited and was tripping us when chained
# after dataset uploads. Hardcoding our own namespace avoids the round-trip.
NAMESPACE = "Pratyush-01"
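# If this script is forked under another account, update NAMESPACE by hand.
# The dynamic alternative (sketch only; it reintroduces exactly the /whoami-v2
# round-trip this constant exists to avoid) would be:
#   NAMESPACE = HfApi(token=os.environ["HF_TOKEN"]).whoami()["name"]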


def main() -> None:
    hf_token = os.environ.get("HF_TOKEN")
    wandb_key = os.environ.get("WANDB_API_KEY")
    if not hf_token:
        sys.exit("HF_TOKEN env var is required")
    if not wandb_key:
        sys.exit("WANDB_API_KEY env var is required")

    api = HfApi(token=hf_token)

    volumes = [
        Volume(
            type="dataset",
            source=DATASET_REPO,
            mount_path=MOUNT_PATH,
        ),
    ]
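
    # Optional pre-flight check (a sketch; assumes a huggingface_hub release
    # that ships HfApi.repo_exists, which any version new enough to have the
    # Jobs API does): fail fast if the source dataset is missing rather than
    # letting the job error out after it has already been scheduled.
    if not api.repo_exists(DATASET_REPO, repo_type="dataset"):
        sys.exit(f"Dataset repo {DATASET_REPO} not found; push it before submitting")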
| print(f"Submitting job_train.py from {SCRIPT}") | |
| print(f" image={IMAGE}") | |
| print(f" flavor={FLAVOR}") | |
| print(f" volume={DATASET_REPO} -> {MOUNT_PATH}") | |
| print(f" timeout={TIMEOUT}") | |
| print(f" namespace={NAMESPACE} (skips whoami round-trip)") | |
| try: | |
| job = api.run_uv_job( | |
| script=SCRIPT, | |
| image=IMAGE, | |
| flavor=FLAVOR, | |
| secrets={"HF_TOKEN": hf_token, "WANDB_API_KEY": wandb_key}, | |
| volumes=volumes, | |
| timeout=TIMEOUT, | |
| namespace=NAMESPACE, | |
| ) | |
| except HfHubHTTPError as exc: | |
| sys.exit(f"FAILED: {exc}") | |
| print("\n=== Submitted ===") | |
| print(f" job_id: {job.id}") | |
| print(f" url: {job.url}") | |
| print(f" status: {job.status.stage}") | |
| print(f"\nTail logs with:\n hf jobs logs {job.id}") | |


if __name__ == "__main__":
    main()