#!/usr/bin/env python3
"""Fail-fast GPU/CUDA preflight for RTX 6000 Ada training."""

import os
import subprocess
import sys

import torch

print("=== GPU PREFLIGHT ===")
print("python:", sys.version.replace("\n", " "))
print("torch:", torch.__version__)
print("torch.version.cuda:", torch.version.cuda)
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.cuda.is_available:", torch.cuda.is_available())

try:
    out = subprocess.run(
        ["nvidia-smi"], check=False, text=True, capture_output=True, timeout=20
    )
    print("nvidia-smi returncode:", out.returncode)
    print(out.stdout[:4000])
    if out.stderr:
        print(out.stderr[:2000])
except Exception as e:
    print("nvidia-smi failed:", repr(e))

if not torch.cuda.is_available():
    raise SystemExit(
        "ERROR: PyTorch cannot see CUDA. This run would use CPU and fail/underperform.\n"
        "Fix: install CUDA PyTorch with scripts/install_rtx6000ada.sh, "
        "check NVIDIA driver, and set CUDA_VISIBLE_DEVICES=0."
    )

n = torch.cuda.device_count()
print("cuda device_count:", n)
for i in range(n):
    props = torch.cuda.get_device_properties(i)
    print(
        f"gpu[{i}]: {props.name}, capability={props.major}.{props.minor}, "
        f"total_vram_gb={props.total_memory / 1024**3:.2f}"
    )

# Allocate a tiny tensor to ensure the CUDA runtime works.
x = torch.ones((1,), device="cuda")
print("cuda allocation test:", x.item())

print("=== GPU PREFLIGHT OK ===")
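
# --- Optional extra check (a sketch, not part of the original preflight). ---
# A bare allocation can succeed while compute kernels still fail (e.g. a
# driver/toolkit mismatch). This opt-in block runs a small fp16 matmul to
# exercise a real kernel. GPU_PREFLIGHT_MATMUL is a hypothetical env var
# chosen here for illustration.
if os.environ.get("GPU_PREFLIGHT_MATMUL") == "1":
    a = torch.randn((256, 256), device="cuda", dtype=torch.float16)
    b = torch.randn((256, 256), device="cuda", dtype=torch.float16)
    c = a @ b
    torch.cuda.synchronize()  # surface any asynchronously raised kernel errors
    print("cuda matmul test: ok, mean =", c.float().mean().item())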