Spaces:
Running on Zero
Running on Zero
Upload 22 files
Browse files
- app.py +5 -36
- optimization.py +0 -2
app.py
CHANGED
|
@@ -89,7 +89,7 @@ scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
|
|
| 89 |
# Load the model pipeline
|
| 90 |
from safetensors.torch import load_file
|
| 91 |
import torch.nn.functional as F
|
| 92 |
-
|
| 93 |
|
| 94 |
|
| 95 |
# ---------------------------------------------------------------------------
|
|
@@ -106,16 +106,6 @@ def setup_intelligent_memory_pipeline(pipe, apply_quantization=True):
|
|
| 106 |
|
| 107 |
print(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 108 |
print(f"Total VRAM: {total_vram:.1f}GB")
|
| 109 |
-
print(f"Base model size: ~55GB unquantized, ~20GB with quantization")
|
| 110 |
-
|
| 111 |
-
# Apply quantization to reduce memory footprint (only if requested)
|
| 112 |
-
# ZeroGPU has limited VRAM - use Int8 only to avoid NVRTC issues
|
| 113 |
-
if apply_quantization:
|
| 114 |
-
print("Applying Int8 quantization for ZeroGPU...")
|
| 115 |
-
if hasattr(pipe, 'text_encoder') and pipe.text_encoder is not None:
|
| 116 |
-
quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
|
| 117 |
-
if hasattr(pipe, 'transformer') and pipe.transformer is not None:
|
| 118 |
-
quantize_(pipe.transformer, Int8WeightOnlyConfig())
|
| 119 |
|
| 120 |
# Enable VAE optimizations for all GPUs
|
| 121 |
print("Enabling VAE slicing and tiling...")
|
|
@@ -123,31 +113,10 @@ def setup_intelligent_memory_pipeline(pipe, apply_quantization=True):
|
|
| 123 |
pipe.vae.enable_slicing()
|
| 124 |
pipe.vae.enable_tiling()
|
| 125 |
|
| 126 |
-
#
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
if hasattr(pipe, 'text_encoder') and pipe.text_encoder is not None:
|
| 131 |
-
pipe.text_encoder = pipe.text_encoder.to('cuda')
|
| 132 |
-
clear_vram()
|
| 133 |
-
|
| 134 |
-
print(" - Moving transformer...")
|
| 135 |
-
if hasattr(pipe, 'transformer') and pipe.transformer is not None:
|
| 136 |
-
pipe.transformer = pipe.transformer.to('cuda')
|
| 137 |
-
clear_vram()
|
| 138 |
-
|
| 139 |
-
print(" - Moving VAE...")
|
| 140 |
-
if hasattr(pipe, 'vae') and pipe.vae is not None:
|
| 141 |
-
pipe.vae = pipe.vae.to('cuda')
|
| 142 |
-
clear_vram()
|
| 143 |
-
|
| 144 |
-
return pipe
|
| 145 |
-
elif total_vram >= 16:
|
| 146 |
-
print("Mid-range GPU: Using model CPU offloading")
|
| 147 |
-
pipe.enable_model_cpu_offload()
|
| 148 |
-
return pipe
|
| 149 |
-
else:
|
| 150 |
-
print("Low VRAM GPU: Using sequential CPU offloading")
|
| 151 |
pipe.enable_sequential_cpu_offload()
|
| 152 |
return pipe
|
| 153 |
|
|
|
|
| 89 |
# Load the model pipeline
|
| 90 |
from safetensors.torch import load_file
|
| 91 |
import torch.nn.functional as F
|
| 92 |
+
# REMOVED: torchao quantization - causes NVRTC kernel compilation errors
|
| 93 |
|
| 94 |
|
| 95 |
# ---------------------------------------------------------------------------
|
|
|
|
| 106 |
|
| 107 |
print(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 108 |
print(f"Total VRAM: {total_vram:.1f}GB")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
# Enable VAE optimizations for all GPUs
|
| 111 |
print("Enabling VAE slicing and tiling...")
|
|
|
|
| 113 |
pipe.vae.enable_slicing()
|
| 114 |
pipe.vae.enable_tiling()
|
| 115 |
|
| 116 |
+
# Use CPU offloading for ZeroGPU - no quantization to avoid NVRTC issues
|
| 117 |
+
print("Using model CPU offloading for ZeroGPU")
|
| 118 |
+
pipe.enable_model_cpu_offload()
|
| 119 |
+
return pipe
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
pipe.enable_sequential_cpu_offload()
|
| 121 |
return pipe
|
| 122 |
|
optimization.py
CHANGED
|
@@ -3,8 +3,6 @@
|
|
| 3 |
from typing import Any
|
| 4 |
from typing import Callable
|
| 5 |
from typing import ParamSpec
|
| 6 |
-
from torchao.quantization import quantize_
|
| 7 |
-
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
|
| 8 |
# spaces import REMOVED — spaces.GPU / spaces.aoti_* are HuggingFace ZeroGPU-only APIs.
|
| 9 |
# They do not exist outside of HF infrastructure and would crash on a local VPS.
|
| 10 |
# Replaced below with standard torch.compile() which gives equivalent or better
|
|
|
|
| 3 |
from typing import Any
|
| 4 |
from typing import Callable
|
| 5 |
from typing import ParamSpec
|
|
|
|
|
|
|
| 6 |
# spaces import REMOVED — spaces.GPU / spaces.aoti_* are HuggingFace ZeroGPU-only APIs.
|
| 7 |
# They do not exist outside of HF infrastructure and would crash on a local VPS.
|
| 8 |
# Replaced below with standard torch.compile() which gives equivalent or better
|