# physix-live / train / submit.py
# (Hugging Face Hub viewer chrome, preserved as comments so the file parses:
#  "Pratyush-01's picture", "Upload folder using huggingface_hub",
#  commit 08f8699 verified)
"""Submit job_train.py to Hugging Face Jobs.
The `hf jobs uv run` CLI hangs intermittently on the whoami/encoding step
(observed twice — see the conversation log around 2026-04-26). Submitting
via huggingface_hub.HfApi.run_uv_job directly is reliable and lets us pass
the dataset volume mount that the CLI version captures via `-v`.
Usage:
export HF_TOKEN=hf_...
export WANDB_API_KEY=wandb_v1_...
python submit.py
"""
from __future__ import annotations
import os
import sys
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError
# Mount the dataset that contains physix-live/ source at /physix-live in the
# container. _stage_physix_live() in job_train.py expects this exact path.
DATASET_REPO: str = "Pratyush-01/physix-live-src"
MOUNT_PATH: str = "/physix-live"
# Fully-pinned Unsloth studio image (PyTorch 2.9.0 / vLLM 0.16.0 / CUDA 12.8)
# so reruns are reproducible regardless of upstream tag churn.
IMAGE: str = "unsloth/unsloth:2026.3.8-pt2.9.0-vllm-0.16.0-cu12.8-studio-release"
# Switched from a100-large (80 GB, $2.50/hr) to l40sx1 (48 GB, $1.80/hr).
#
# Why: the a100-large pool is heavily queue-bound right now — the unsloth-
# jobs blog drove a wave of users onto exactly that flavor and our last
# submission sat in SCHEDULING for 17+ min before we cancelled. l40sx1 is
# in a separate (less contested) pool and is *cheaper*. It uses an Ada
# Lovelace L40S GPU instead of an Ampere A100 — Unsloth and vLLM both
# fully support L40S, and 3B + LoRA-32 + vLLM standby uses <20 GB, so the
# 48 GB allotment is comfortable. Per-step throughput is roughly comparable
# to A100 for small (≤7B) LoRA workloads.
#
# HF Jobs flavors verified via https://huggingface.co/docs/hub/jobs-pricing
# (note: there is NO a100-medium — only a100-large/x4/x8 in the A100 line).
FLAVOR: str = "l40sx1"
# Hard wall-clock limit passed to run_uv_job() as a duration string
# (presumably parsed server-side by the Jobs API — confirm format in docs).
TIMEOUT: str = "3h"
# Resolve job_train.py relative to this file so submission works from any cwd.
SCRIPT: str = os.path.join(os.path.dirname(__file__), "job_train.py")
# Pass namespace explicitly so run_uv_job() skips its internal whoami() call
# (it needs the namespace to upload the script as <ns>/job-...). The /whoami-v2
# endpoint is heavily rate-limited and was tripping us when chained after
# dataset uploads. Hardcoding our own namespace avoids the round-trip.
NAMESPACE: str = "Pratyush-01"
def main() -> None:
    """Submit job_train.py to Hugging Face Jobs via HfApi.run_uv_job().

    Requires HF_TOKEN and WANDB_API_KEY in the environment; exits with a
    message if either is missing, or with the server error if submission
    fails. On success, prints the job id/url/status and a log-tail hint.
    """
    token = os.environ.get("HF_TOKEN")
    wandb_token = os.environ.get("WANDB_API_KEY")
    if not token:
        sys.exit("HF_TOKEN env var is required")
    if not wandb_token:
        sys.exit("WANDB_API_KEY env var is required")

    api = HfApi(token=token)

    # Imported here rather than at module top (presumably so the module still
    # imports on hub versions predating Volume — TODO confirm).
    from huggingface_hub import Volume

    mounts = [Volume(type="dataset", source=DATASET_REPO, mount_path=MOUNT_PATH)]

    # One write, identical output to the original line-by-line prints.
    print(
        "\n".join(
            [
                f"Submitting job_train.py from {SCRIPT}",
                f" image={IMAGE}",
                f" flavor={FLAVOR}",
                f" volume={DATASET_REPO} -> {MOUNT_PATH}",
                f" timeout={TIMEOUT}",
                f" namespace={NAMESPACE} (skips whoami round-trip)",
            ]
        )
    )

    try:
        job = api.run_uv_job(
            script=SCRIPT,
            image=IMAGE,
            flavor=FLAVOR,
            secrets={"HF_TOKEN": token, "WANDB_API_KEY": wandb_token},
            volumes=mounts,
            timeout=TIMEOUT,
            namespace=NAMESPACE,
        )
    except HfHubHTTPError as exc:
        sys.exit(f"FAILED: {exc}")

    print("\n=== Submitted ===")
    print(f" job_id: {job.id}")
    print(f" url: {job.url}")
    print(f" status: {job.status.stage}")
    print(f"\nTail logs with:\n hf jobs logs {job.id}")
# Entry-point guard: submit only when run as a script, never on import.
if __name__ == "__main__":
    main()