devindevine committed on
Commit
02be60b
·
verified ·
1 Parent(s): 9747003

Upload 22 files

Browse files
Files changed (2) hide show
  1. app.py +5 -36
  2. optimization.py +0 -2
app.py CHANGED
@@ -89,7 +89,7 @@ scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
89
  # Load the model pipeline
90
  from safetensors.torch import load_file
91
  import torch.nn.functional as F
92
- from torchao.quantization import quantize_, Int8WeightOnlyConfig, Float8DynamicActivationFloat8WeightConfig
93
 
94
 
95
  # ---------------------------------------------------------------------------
@@ -106,16 +106,6 @@ def setup_intelligent_memory_pipeline(pipe, apply_quantization=True):
106
 
107
  print(f"GPU: {torch.cuda.get_device_name(0)}")
108
  print(f"Total VRAM: {total_vram:.1f}GB")
109
- print(f"Base model size: ~55GB unquantized, ~20GB with quantization")
110
-
111
- # Apply quantization to reduce memory footprint (only if requested)
112
- # ZeroGPU has limited VRAM - use Int8 only to avoid NVRTC issues
113
- if apply_quantization:
114
- print("Applying Int8 quantization for ZeroGPU...")
115
- if hasattr(pipe, 'text_encoder') and pipe.text_encoder is not None:
116
- quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
117
- if hasattr(pipe, 'transformer') and pipe.transformer is not None:
118
- quantize_(pipe.transformer, Int8WeightOnlyConfig())
119
 
120
  # Enable VAE optimizations for all GPUs
121
  print("Enabling VAE slicing and tiling...")
@@ -123,31 +113,10 @@ def setup_intelligent_memory_pipeline(pipe, apply_quantization=True):
123
  pipe.vae.enable_slicing()
124
  pipe.vae.enable_tiling()
125
 
126
- # Memory strategy based on VRAM
127
- if total_vram >= 40:
128
- print("High VRAM GPU: Loading fully on GPU")
129
- print(" - Moving text_encoder...")
130
- if hasattr(pipe, 'text_encoder') and pipe.text_encoder is not None:
131
- pipe.text_encoder = pipe.text_encoder.to('cuda')
132
- clear_vram()
133
-
134
- print(" - Moving transformer...")
135
- if hasattr(pipe, 'transformer') and pipe.transformer is not None:
136
- pipe.transformer = pipe.transformer.to('cuda')
137
- clear_vram()
138
-
139
- print(" - Moving VAE...")
140
- if hasattr(pipe, 'vae') and pipe.vae is not None:
141
- pipe.vae = pipe.vae.to('cuda')
142
- clear_vram()
143
-
144
- return pipe
145
- elif total_vram >= 16:
146
- print("Mid-range GPU: Using model CPU offloading")
147
- pipe.enable_model_cpu_offload()
148
- return pipe
149
- else:
150
- print("Low VRAM GPU: Using sequential CPU offloading")
151
  pipe.enable_sequential_cpu_offload()
152
  return pipe
153
 
 
89
  # Load the model pipeline
90
  from safetensors.torch import load_file
91
  import torch.nn.functional as F
92
+ # REMOVED: torchao quantization - causes NVRTC kernel compilation errors
93
 
94
 
95
  # ---------------------------------------------------------------------------
 
106
 
107
  print(f"GPU: {torch.cuda.get_device_name(0)}")
108
  print(f"Total VRAM: {total_vram:.1f}GB")
 
 
 
 
 
 
 
 
 
 
109
 
110
  # Enable VAE optimizations for all GPUs
111
  print("Enabling VAE slicing and tiling...")
 
113
  pipe.vae.enable_slicing()
114
  pipe.vae.enable_tiling()
115
 
116
+ # Use CPU offloading for ZeroGPU - no quantization to avoid NVRTC issues
117
+ print("Using model CPU offloading for ZeroGPU")
118
+ pipe.enable_model_cpu_offload()
119
+ return pipe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  pipe.enable_sequential_cpu_offload()
121
  return pipe
122
 
optimization.py CHANGED
@@ -3,8 +3,6 @@
3
  from typing import Any
4
  from typing import Callable
5
  from typing import ParamSpec
6
- from torchao.quantization import quantize_
7
- from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
8
  # spaces import REMOVED — spaces.GPU / spaces.aoti_* are HuggingFace ZeroGPU-only APIs.
9
  # They do not exist outside of HF infrastructure and would crash on a local VPS.
10
  # Replaced below with standard torch.compile() which gives equivalent or better
 
3
  from typing import Any
4
  from typing import Callable
5
  from typing import ParamSpec
 
 
6
  # spaces import REMOVED — spaces.GPU / spaces.aoti_* are HuggingFace ZeroGPU-only APIs.
7
  # They do not exist outside of HF infrastructure and would crash on a local VPS.
8
  # Replaced below with standard torch.compile() which gives equivalent or better