"""Submit job_train.py to Hugging Face Jobs.

The `hf jobs uv run` CLI hangs intermittently on the whoami/encoding step
(observed twice — see the conversation log around 2026-04-26). Submitting
via huggingface_hub.HfApi.run_uv_job directly is reliable and lets us pass
the dataset volume mount that the CLI version accepts via `-v`.

Usage:

    export HF_TOKEN=hf_...
    export WANDB_API_KEY=wandb_v1_...
    python submit.py
"""
from __future__ import annotations

import os
import sys
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError


# Mount the dataset that contains physix-live/ source at /physix-live in the
# container. _stage_physix_live() in job_train.py expects this exact path.
DATASET_REPO = "Pratyush-01/physix-live-src"
MOUNT_PATH = "/physix-live"

IMAGE = "unsloth/unsloth:2026.3.8-pt2.9.0-vllm-0.16.0-cu12.8-studio-release"
# Switched from a100-large (80 GB, $2.50/hr) to l40sx1 (48 GB, $1.80/hr).
#
# Why: the a100-large pool is heavily queue-bound right now — the unsloth-
# jobs blog drove a wave of users onto exactly that flavor and our last
# submission sat in SCHEDULING for 17+ min before we cancelled. l40sx1 is
# in a separate (less contested) pool and is *cheaper*. It uses an Ada
# Lovelace L40S GPU instead of an Ampere A100 — Unsloth and vLLM both
# fully support L40S, and 3B + LoRA-32 + vLLM standby uses <20 GB, so the
# 48 GB allotment is comfortable. Per-step throughput is roughly comparable
# to A100 for small (≤7B) LoRA workloads.
#
# HF Jobs flavors verified via https://huggingface.co/docs/hub/jobs-pricing
# (note: there is NO a100-medium — only a100-large/x4/x8 in the A100 line).
FLAVOR = "l40sx1"
TIMEOUT = "3h"
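# job_train.py lives next to this file; resolving it relative to submit.py
# keeps the submission working no matter which directory it is run from.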
SCRIPT = os.path.join(os.path.dirname(__file__), "job_train.py")

# Pass namespace explicitly so run_uv_job() skips its internal whoami() call
# (it needs the namespace to upload the script as <ns>/job-...). The /whoami-v2
# endpoint is heavily rate-limited and was tripping us when chained after
# dataset uploads. Hardcoding our own namespace avoids the round-trip.
NAMESPACE = "Pratyush-01"


def main() -> None:
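    """Validate credentials from the environment, then submit the UV job."""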
    hf_token = os.environ.get("HF_TOKEN")
    wandb_key = os.environ.get("WANDB_API_KEY")
    if not hf_token:
        sys.exit("HF_TOKEN env var is required")
    if not wandb_key:
        sys.exit("WANDB_API_KEY env var is required")

    api = HfApi(token=hf_token)

    from huggingface_hub import Volume
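    # Volume describes the dataset mount: DATASET_REPO appears at MOUNT_PATH
    # inside the job container, the same mapping the CLI would take via `-v`.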

    volumes = [
        Volume(
            type="dataset",
            source=DATASET_REPO,
            mount_path=MOUNT_PATH,
        ),
    ]

    print(f"Submitting job_train.py from {SCRIPT}")
    print(f"  image={IMAGE}")
    print(f"  flavor={FLAVOR}")
    print(f"  volume={DATASET_REPO} -> {MOUNT_PATH}")
    print(f"  timeout={TIMEOUT}")
    print(f"  namespace={NAMESPACE} (skips whoami round-trip)")

    try:
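        # run_uv_job() uploads the script under NAMESPACE (as <ns>/job-...)
        # and launches it; the secrets are exposed to the job as env vars.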
        job = api.run_uv_job(
            script=SCRIPT,
            image=IMAGE,
            flavor=FLAVOR,
            secrets={"HF_TOKEN": hf_token, "WANDB_API_KEY": wandb_key},
            volumes=volumes,
            timeout=TIMEOUT,
            namespace=NAMESPACE,
        )
    except HfHubHTTPError as exc:
        sys.exit(f"FAILED: {exc}")

    print("\n=== Submitted ===")
    print(f"  job_id: {job.id}")
    print(f"  url:    {job.url}")
    print(f"  status: {job.status.stage}")
    print(f"\nTail logs with:\n  hf jobs logs {job.id}")


if __name__ == "__main__":
    main()