#!/usr/bin/env python3
"""Fail-fast GPU/CUDA preflight for RTX 6000 Ada training."""
import os
import sys
import subprocess
import torch
print("=== GPU PREFLIGHT ===")
print("python:", sys.version.replace("\n", " "))
print("torch:", torch.__version__)
print("torch.version.cuda:", torch.version.cuda)
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.cuda.is_available:", torch.cuda.is_available())
# Dump nvidia-smi output for driver/GPU diagnostics; tolerate its absence.
try:
    out = subprocess.run(["nvidia-smi"], check=False, text=True, capture_output=True, timeout=20)
    print("nvidia-smi returncode:", out.returncode)
    print(out.stdout[:4000])
    if out.stderr:
        print(out.stderr[:2000])
except Exception as e:
    print("nvidia-smi failed:", repr(e))
if not torch.cuda.is_available():
    raise SystemExit(
        "ERROR: PyTorch cannot see CUDA. This run would use CPU and fail/underperform.\n"
        "Fix: install CUDA PyTorch with scripts/install_rtx6000ada.sh, check NVIDIA driver, and set CUDA_VISIBLE_DEVICES=0."
    )
n = torch.cuda.device_count()
print("cuda device_count:", n)
for i in range(n):
props = torch.cuda.get_device_properties(i)
print(f"gpu[{i}]: {props.name}, capability={props.major}.{props.minor}, total_vram_gb={props.total_memory/1024**3:.2f}")
# Allocate a tiny tensor to ensure CUDA runtime works.
x = torch.ones((1,), device="cuda")
print("cuda allocation test:", x.item())
print("=== GPU PREFLIGHT OK ===")