Add GPU/CUDA diagnostics at startup
- app.py +34 -1
- autotune_cache.json +12 -0
app.py
@@ -4,8 +4,41 @@ import argparse
 import math
 import time
 import shutil
-import cv2
+
+# ============================================================================
+# GPU / CUDA Environment Diagnostics
+# ============================================================================
 import torch
+print("=" * 60)
+print("[Diagnostics] PyTorch version:", torch.__version__)
+print("[Diagnostics] CUDA available:", torch.cuda.is_available())
+if torch.cuda.is_available():
+    print("[Diagnostics] CUDA version:", torch.version.cuda)
+    print("[Diagnostics] cuDNN version:", torch.backends.cudnn.version())
+    for i in range(torch.cuda.device_count()):
+        name = torch.cuda.get_device_name(i)
+        cap = torch.cuda.get_device_capability(i)
+        mem = torch.cuda.get_device_properties(i).total_memory / 1024**3
+        print(f"[Diagnostics] GPU {i}: {name}, compute capability: sm_{cap[0]}{cap[1]}, memory: {mem:.1f} GB")
+else:
+    print("[Diagnostics] WARNING: No CUDA GPU detected!")
+
+try:
+    import flash_attn_3
+    print("[Diagnostics] flash_attn_3 imported OK")
+    from flash_attn_interface import flash_attn_func
+    print("[Diagnostics] flash_attn_func imported OK")
+except Exception as e:
+    print(f"[Diagnostics] flash_attn_3 import FAILED: {e}")
+
+try:
+    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True, timeout=10)
+    print("[Diagnostics] nvidia-smi:\n" + result.stdout[:500])
+except Exception as e:
+    print(f"[Diagnostics] nvidia-smi failed: {e}")
+print("=" * 60)
+
+import cv2
 import numpy as np
 import base64
 import io
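Two details are worth flagging in this diagnostics block. First, the nvidia-smi probe relies on `subprocess`, which must already be imported earlier in app.py, outside the visible hunk. Second, this Space runs on ZeroGPU ("Running on Zero"), where the GPU is attached only inside functions decorated with `@spaces.GPU`; module-level `torch.cuda.is_available()` therefore usually reports False at startup even when GPU work succeeds later. A minimal sketch of running the same checks in both contexts (the `run_gpu_diagnostics` name and `tag` parameter are illustrative, not part of the commit):

import torch
import spaces  # HF Spaces SDK, present on ZeroGPU Spaces

def run_gpu_diagnostics(tag: str) -> None:
    # Same checks as the commit, packaged so they can run both at
    # import time and inside a ZeroGPU-allocated context.
    print(f"[Diagnostics/{tag}] CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            print(f"[Diagnostics/{tag}] GPU {i}: {props.name}, "
                  f"sm_{props.major}{props.minor}, "
                  f"{props.total_memory / 1024**3:.1f} GB")

run_gpu_diagnostics("startup")  # on ZeroGPU this typically sees no CUDA

@spaces.GPU
def diagnostics_on_gpu() -> None:
    # Inside a @spaces.GPU function the device is attached, so the
    # same checks should now report the allocated GPU.
    run_gpu_diagnostics("zerogpu")

Calling diagnostics_on_gpu() once after startup distinguishes a genuinely missing GPU from ZeroGPU's deferred allocation.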
autotune_cache.json
@@ -25039,6 +25039,18 @@
             "maxnreg": null,
             "pre_hook": null,
             "ir_override": null
+        },
+        "(23, 7739451, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": {
+            "kwargs": {
+                "BM": 16,
+                "BK": 8
+            },
+            "num_warps": 2,
+            "num_ctas": 1,
+            "num_stages": 3,
+            "maxnreg": null,
+            "pre_hook": null,
+            "ir_override": null
         }
     },
     "flex_gemm.kernels.triton.spconv.sparse_conv_implicit_gemm.sparse_conv_fwd_implicit_gemm_kernel": {},
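Each entry in autotune_cache.json maps a problem key (the stringified tuple of shape dimensions and dtypes above) to the winning kernel launch configuration, so the kernel can skip re-benchmarking that shape on later runs. The recorded fields mirror the parameters of `triton.Config`. A minimal sketch of rebuilding configs from the cache, assuming the two-level {kernel name -> {key -> entry}} layout visible in the hunk (the `configs_from_cache` helper is hypothetical, not flex_gemm's actual loader):

import json
import triton

def configs_from_cache(path: str) -> dict:
    # Rebuild a triton.Config for every cached autotune entry, keyed by
    # (kernel qualname, problem key). A sketch, not flex_gemm's loader.
    with open(path) as f:
        cache = json.load(f)
    configs = {}
    for kernel, entries in cache.items():
        for key, e in entries.items():
            configs[(kernel, key)] = triton.Config(
                e["kwargs"],                 # block sizes, e.g. {"BM": 16, "BK": 8}
                num_warps=e["num_warps"],    # 2 in the new entry
                num_stages=e["num_stages"],  # 3 (software pipelining depth)
            )
    return configs

Keying the rebuilt configs by (kernel, problem key) mirrors how an autotuner consults such a cache: an unseen shape falls back to benchmarking, while a cached shape launches immediately with the stored block sizes.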