| """Card 9 runtime device bootstrap helpers (AMD/ROCm-friendly).""" | |
| from __future__ import annotations | |
| import logging | |
| import os | |
| from dataclasses import asdict, dataclass | |
| from typing import Optional | |
| import torch | |
| LOGGER = logging.getLogger(__name__) | |

@dataclass
class RuntimeHealthReport:
    """Runtime/backend detection report for startup health checks."""

    requested_device: str
    selected_device: str
    backend: str
    cuda_available: bool
    rocm_available: bool
    mps_available: bool
    strict_mode: bool
    reason: str

    def to_dict(self) -> dict:
        return asdict(self)
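
# Illustrative to_dict() payload (hypothetical values; the actual fields
# depend on the host and the installed torch build):
#   {"requested_device": "auto", "selected_device": "cuda:0", "backend": "rocm",
#    "cuda_available": True, "rocm_available": True, "mps_available": False,
#    "strict_mode": False, "reason": "ok"}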

def _env_bool(name: str, default: bool = False) -> bool:
    raw = os.environ.get(name)
    if raw is None:
        return default
    return str(raw).strip().lower() in ("1", "true", "yes", "on")

def _normalize_requested_device(requested: Optional[str]) -> str:
    value = requested or os.environ.get("KIMODO_DEVICE") or os.environ.get("DEVICE") or "auto"
    return str(value).strip().lower()
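
# e.g. with KIMODO_DEVICE=" CUDA:0 " set, _normalize_requested_device(None)
# returns "cuda:0"; with no argument and no env vars set it returns "auto".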

def _has_mps() -> bool:
    backends = getattr(torch, "backends", None)
    mps = getattr(backends, "mps", None)
    if mps is None:
        return False
    is_available = getattr(mps, "is_available", None)
    if callable(is_available):
        try:
            return bool(is_available())
        except Exception:  # pragma: no cover
            return False
    return False

def _backend_name(cuda_available: bool, rocm_available: bool, mps_available: bool) -> str:
    if rocm_available:
        return "rocm"
    if cuda_available:
        return "cuda"
    if mps_available:
        return "mps"
    return "cpu"

def select_runtime_device(requested: Optional[str] = None) -> str:
    """Resolve the runtime device with ROCm/CUDA/MPS/CPU fallback.

    Resolution order:
      - explicit ``requested`` argument
      - environment variable ``KIMODO_DEVICE`` (or ``DEVICE``)
      - ``auto``

    If ``KIMODO_STRICT_DEVICE=true`` and the requested accelerator is
    unavailable, a ``ValueError`` is raised instead of falling back to CPU.
    """
    LOGGER.info("card9.select_runtime_device.start requested=%s", requested)
    strict_mode = _env_bool("KIMODO_STRICT_DEVICE", default=False)
    req = _normalize_requested_device(requested)
    # ROCm builds of torch surface HIP devices through the torch.cuda API,
    # so cuda_available covers both NVIDIA and AMD accelerators here.
    cuda_available = bool(torch.cuda.is_available())
    mps_available = _has_mps()
    accelerator_aliases = {"cuda", "cuda:0", "gpu", "rocm", "hip", "amd"}
    if req == "cpu":
        selected = "cpu"
        reason = "explicit_cpu"
    elif req in ("mps", "apple"):
        if mps_available:
            selected = "mps"
            reason = "explicit_mps"
        elif strict_mode:
            raise ValueError("Requested MPS device but MPS backend is unavailable")
        else:
            selected = "cpu"
            reason = "mps_unavailable_fallback_cpu"
    elif req in accelerator_aliases:
        if cuda_available:
            selected = "cuda:0"
            reason = "explicit_accelerator_available"
        elif strict_mode:
            raise ValueError(f"Requested accelerator '{req}' but no torch accelerator is available")
        else:
            selected = "cpu"
            reason = "accelerator_unavailable_fallback_cpu"
    elif req == "auto":
        if cuda_available:
            selected = "cuda:0"
            reason = "auto_accelerator"
        elif mps_available:
            selected = "mps"
            reason = "auto_mps"
        else:
            selected = "cpu"
            reason = "auto_cpu"
    else:
        # Preserve explicit torch device strings (e.g. cuda:1) when possible;
        # plain "cpu" is already handled by the first branch above.
        if req.startswith("cuda"):
            if cuda_available:
                selected = req
                reason = "explicit_cuda_index"
            elif strict_mode:
                raise ValueError(f"Requested device '{req}' but CUDA/ROCm backend is unavailable")
            else:
                selected = "cpu"
                reason = "explicit_cuda_unavailable_fallback_cpu"
        else:
            if strict_mode:
                raise ValueError(f"Unknown device specifier '{req}'")
            selected = "cpu"
            reason = "unknown_device_fallback_cpu"
    LOGGER.info("card9.select_runtime_device.exit selected=%s reason=%s", selected, reason)
    return selected
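
# Illustrative resolution on a CPU-only torch build with no MPS (an assumed
# host, not a guarantee):
#   select_runtime_device("cuda")  -> "cpu"  (reason: accelerator_unavailable_fallback_cpu)
#   select_runtime_device("auto")  -> "cpu"  (reason: auto_cpu)
# With KIMODO_STRICT_DEVICE=true the same "cuda" request raises ValueError
# instead of falling back.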

def runtime_health_report(requested: Optional[str] = None) -> RuntimeHealthReport:
    """Return a startup runtime report suitable for health checks and logs."""
    LOGGER.info("card9.runtime_health_report.start requested=%s", requested)
    strict_mode = _env_bool("KIMODO_STRICT_DEVICE", default=False)
    req = _normalize_requested_device(requested)
    cuda_available = bool(torch.cuda.is_available())
    # torch.version.hip is set on ROCm builds, where torch.cuda reports
    # availability for AMD GPUs as well.
    rocm_available = cuda_available and bool(getattr(torch.version, "hip", None))
    mps_available = _has_mps()
    selected = select_runtime_device(req)
    reason = "ok"
    if selected == "cpu" and req in {"cuda", "cuda:0", "gpu", "rocm", "hip", "amd"}:
        reason = "fallback_cpu"
    report = RuntimeHealthReport(
        requested_device=req,
        selected_device=selected,
        backend=_backend_name(cuda_available, rocm_available, mps_available),
        cuda_available=cuda_available,
        rocm_available=rocm_available,
        mps_available=mps_available,
        strict_mode=strict_mode,
        reason=reason,
    )
    LOGGER.info(
        "card9.runtime_health_report.exit requested=%s selected=%s backend=%s reason=%s",
        report.requested_device,
        report.selected_device,
        report.backend,
        report.reason,
    )
    return report
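
if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original card; assumes a
    # local run where stdout logging is acceptable).
    import json

    logging.basicConfig(level=logging.INFO)
    print("selected:", select_runtime_device())
    print(json.dumps(runtime_health_report().to_dict(), indent=2))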