Topics: PEFT, qlora, sft, trl, qwen3, tmf921, intent-based-networking, network-slicing, rtx-6000-ada, ml-intern
File size: 1,435 bytes · commit 91d636a
#!/usr/bin/env python3
"""Fail-fast GPU/CUDA preflight for RTX 6000 Ada training."""
import os
import sys
import subprocess

import torch

print("=== GPU PREFLIGHT ===")
print("python:", sys.version.replace("\n", " "))
print("torch:", torch.__version__)
print("torch.version.cuda:", torch.version.cuda)
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.cuda.is_available:", torch.cuda.is_available())

try:
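    # Log nvidia-smi output so driver and GPU visibility are recorded even when torch is CPU-only.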
    out = subprocess.run(["nvidia-smi"], check=False, text=True, capture_output=True, timeout=20)
    print("nvidia-smi returncode:", out.returncode)
    print(out.stdout[:4000])
    if out.stderr:
        print(out.stderr[:2000])
except Exception as e:
    print("nvidia-smi failed:", repr(e))

if not torch.cuda.is_available():
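    # Abort before any training code runs rather than silently falling back to CPU.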
    raise SystemExit(
        "ERROR: PyTorch cannot see CUDA. This run would use CPU and fail/underperform.\n"
        "Fix: install CUDA PyTorch with scripts/install_rtx6000ada.sh, check NVIDIA driver, and set CUDA_VISIBLE_DEVICES=0."
    )

n = torch.cuda.device_count()
print("cuda device_count:", n)
for i in range(n):
    props = torch.cuda.get_device_properties(i)
    print(f"gpu[{i}]: {props.name}, capability={props.major}.{props.minor}, total_vram_gb={props.total_memory/1024**3:.2f}")

# Allocate a tiny tensor to ensure CUDA runtime works.
x = torch.ones((1,), device="cuda")
print("cuda allocation test:", x.item())
print("=== GPU PREFLIGHT OK ===")