"""Fail-fast GPU/CUDA preflight for RTX 6000 Ada training."""
import os
import subprocess
import sys

import torch
|
|
# Banner plus environment snapshot: every training log starts with the exact
# interpreter / torch build / CUDA wiring that was in effect for the run.
print("=== GPU PREFLIGHT ===")
report = [
    ("python:", sys.version.replace("\n", " ")),
    ("torch:", torch.__version__),
    ("torch.version.cuda:", torch.version.cuda),
    ("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES")),
    ("torch.cuda.is_available:", torch.cuda.is_available()),
]
for label, value in report:
    print(label, value)
|
|
# Best-effort dump of `nvidia-smi` so the log captures driver/GPU state even
# when PyTorch itself cannot see CUDA. Failure here is non-fatal: the hard
# availability gate below decides whether the run proceeds.
try:
    smi = subprocess.run(
        ["nvidia-smi"],
        check=False,
        text=True,
        capture_output=True,
        timeout=20,  # a wedged driver must not stall the whole preflight
    )
    print("nvidia-smi returncode:", smi.returncode)
    print(smi.stdout[:4000])  # truncate: full table can be huge on multi-GPU hosts
    if smi.stderr:
        print(smi.stderr[:2000])
except (OSError, subprocess.SubprocessError) as e:
    # OSError: binary missing or not executable; SubprocessError covers
    # TimeoutExpired. Narrower than a bare `Exception` so genuine bugs in
    # this script still surface instead of being logged as "smi failed".
    print("nvidia-smi failed:", repr(e))
|
|
# Hard gate: abort immediately rather than silently falling back to CPU,
# which would make the training run fail or crawl much later.
if not torch.cuda.is_available():
    msg = (
        "ERROR: PyTorch cannot see CUDA. This run would use CPU and fail/underperform.\n"
        "Fix: install CUDA PyTorch with scripts/install_rtx6000ada.sh, check NVIDIA driver, and set CUDA_VISIBLE_DEVICES=0."
    )
    raise SystemExit(msg)
|
|
# Enumerate every visible device with name, compute capability, and VRAM so
# the log shows exactly which hardware the job landed on.
n = torch.cuda.device_count()
print("cuda device_count:", n)
for idx in range(n):
    p = torch.cuda.get_device_properties(idx)
    vram_gb = p.total_memory / 1024**3
    print(f"gpu[{idx}]: {p.name}, capability={p.major}.{p.minor}, total_vram_gb={vram_gb:.2f}")
|
|
| |
| x = torch.ones((1,), device="cuda") |
| print("cuda allocation test:", x.item()) |
| print("=== GPU PREFLIGHT OK ===") |
|
|