| """Card 9 runtime device bootstrap helpers (AMD/ROCm-friendly).""" | |
| from __future__ import annotations | |
| import logging | |
| import os | |
| from dataclasses import asdict, dataclass | |
| from typing import Optional | |
| import torch | |
| LOGGER = logging.getLogger(__name__) | |

@dataclass
class RuntimeHealthReport:
    """Runtime/backend detection report for startup health checks."""

    requested_device: str
    selected_device: str
    backend: str
    cuda_available: bool
    rocm_available: bool
    mps_available: bool
    strict_mode: bool
    reason: str

    def to_dict(self) -> dict:
        return asdict(self)
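
# Illustrative to_dict() payload (hypothetical values; the actual fields
# depend on the host and the installed torch build):
#   {"requested_device": "auto", "selected_device": "cuda:0", "backend": "rocm",
#    "cuda_available": True, "rocm_available": True, "mps_available": False,
#    "strict_mode": False, "reason": "ok"}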

def _env_bool(name: str, default: bool = False) -> bool:
    raw = os.environ.get(name)
    if raw is None:
        return default
    return str(raw).strip().lower() in ("1", "true", "yes", "on")

def _normalize_requested_device(requested: Optional[str]) -> str:
    value = requested or os.environ.get("KIMODO_DEVICE") or os.environ.get("DEVICE") or "auto"
    return str(value).strip().lower()
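
# e.g. with KIMODO_DEVICE=" CUDA:0 " set, _normalize_requested_device(None)
# returns "cuda:0"; with no argument and no env vars set it returns "auto".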

def _has_mps() -> bool:
    backends = getattr(torch, "backends", None)
    mps = getattr(backends, "mps", None)
    if mps is None:
        return False
    is_available = getattr(mps, "is_available", None)
    if callable(is_available):
        try:
            return bool(is_available())
        except Exception:  # pragma: no cover
            return False
    return False

def _backend_name(cuda_available: bool, rocm_available: bool, mps_available: bool) -> str:
    if rocm_available:
        return "rocm"
    if cuda_available:
        return "cuda"
    if mps_available:
        return "mps"
    return "cpu"

def select_runtime_device(requested: Optional[str] = None) -> str:
    """Resolve the runtime device with ROCm/CUDA/MPS/CPU fallback.

    Resolution order:
      - explicit ``requested`` argument
      - environment variable ``KIMODO_DEVICE`` (or ``DEVICE``)
      - ``auto``

    If ``KIMODO_STRICT_DEVICE=true`` and the requested accelerator is
    unavailable, a ``ValueError`` is raised instead of falling back to CPU.
    """
    LOGGER.info("card9.select_runtime_device.start requested=%s", requested)
    strict_mode = _env_bool("KIMODO_STRICT_DEVICE", default=False)
    req = _normalize_requested_device(requested)
    # ROCm builds of torch surface HIP devices through the torch.cuda API,
    # so cuda_available covers both NVIDIA and AMD accelerators here.
    cuda_available = bool(torch.cuda.is_available())
    mps_available = _has_mps()
    accelerator_aliases = {"cuda", "cuda:0", "gpu", "rocm", "hip", "amd"}
    if req == "cpu":
        selected = "cpu"
        reason = "explicit_cpu"
    elif req in ("mps", "apple"):
        if mps_available:
            selected = "mps"
            reason = "explicit_mps"
        elif strict_mode:
            raise ValueError("Requested MPS device but MPS backend is unavailable")
        else:
            selected = "cpu"
            reason = "mps_unavailable_fallback_cpu"
    elif req in accelerator_aliases:
        if cuda_available:
            selected = "cuda:0"
            reason = "explicit_accelerator_available"
        elif strict_mode:
            raise ValueError(f"Requested accelerator '{req}' but no torch accelerator is available")
        else:
            selected = "cpu"
            reason = "accelerator_unavailable_fallback_cpu"
    elif req == "auto":
        if cuda_available:
            selected = "cuda:0"
            reason = "auto_accelerator"
        elif mps_available:
            selected = "mps"
            reason = "auto_mps"
        else:
            selected = "cpu"
            reason = "auto_cpu"
    else:
        # Preserve explicit torch device strings (e.g. cuda:1) when possible;
        # plain "cpu" is already handled by the first branch above.
        if req.startswith("cuda"):
            if cuda_available:
                selected = req
                reason = "explicit_cuda_index"
            elif strict_mode:
                raise ValueError(f"Requested device '{req}' but CUDA/ROCm backend is unavailable")
            else:
                selected = "cpu"
                reason = "explicit_cuda_unavailable_fallback_cpu"
        else:
            if strict_mode:
                raise ValueError(f"Unknown device specifier '{req}'")
            selected = "cpu"
            reason = "unknown_device_fallback_cpu"
    LOGGER.info("card9.select_runtime_device.exit selected=%s reason=%s", selected, reason)
    return selected
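
# Illustrative resolution on a CPU-only torch build with no MPS (an assumed
# host, not a guarantee):
#   select_runtime_device("cuda")  -> "cpu"  (reason: accelerator_unavailable_fallback_cpu)
#   select_runtime_device("auto")  -> "cpu"  (reason: auto_cpu)
# With KIMODO_STRICT_DEVICE=true the same "cuda" request raises ValueError
# instead of falling back.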

def runtime_health_report(requested: Optional[str] = None) -> RuntimeHealthReport:
    """Return a startup runtime report suitable for health checks and logs."""
    LOGGER.info("card9.runtime_health_report.start requested=%s", requested)
    strict_mode = _env_bool("KIMODO_STRICT_DEVICE", default=False)
    req = _normalize_requested_device(requested)
    cuda_available = bool(torch.cuda.is_available())
    # torch.version.hip is set on ROCm builds, where torch.cuda reports
    # availability for AMD GPUs as well.
    rocm_available = cuda_available and bool(getattr(torch.version, "hip", None))
    mps_available = _has_mps()
    selected = select_runtime_device(req)
    reason = "ok"
    if selected == "cpu" and req in {"cuda", "cuda:0", "gpu", "rocm", "hip", "amd"}:
        reason = "fallback_cpu"
    report = RuntimeHealthReport(
        requested_device=req,
        selected_device=selected,
        backend=_backend_name(cuda_available, rocm_available, mps_available),
        cuda_available=cuda_available,
        rocm_available=rocm_available,
        mps_available=mps_available,
        strict_mode=strict_mode,
        reason=reason,
    )
    LOGGER.info(
        "card9.runtime_health_report.exit requested=%s selected=%s backend=%s reason=%s",
        report.requested_device,
        report.selected_device,
        report.backend,
        report.reason,
    )
    return report
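
if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original card; assumes a
    # local run where stdout logging is acceptable).
    import json

    logging.basicConfig(level=logging.INFO)
    print("selected:", select_runtime_device())
    print(json.dumps(runtime_health_report().to_dict(), indent=2))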