devindevine committed on
Commit
02be60b
·
verified ·
1 Parent(s): 9747003

Upload 22 files

Browse files
Files changed (2) hide show
  1. app.py +5 -36
  2. optimization.py +0 -2
app.py CHANGED
@@ -89,7 +89,7 @@ scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
89
  # Load the model pipeline
90
  from safetensors.torch import load_file
91
  import torch.nn.functional as F
92
- from torchao.quantization import quantize_, Int8WeightOnlyConfig, Float8DynamicActivationFloat8WeightConfig
93
 
94
 
95
  # ---------------------------------------------------------------------------
@@ -106,16 +106,6 @@ def setup_intelligent_memory_pipeline(pipe, apply_quantization=True):
106
 
107
  print(f"GPU: {torch.cuda.get_device_name(0)}")
108
  print(f"Total VRAM: {total_vram:.1f}GB")
109
- print(f"Base model size: ~55GB unquantized, ~20GB with quantization")
110
-
111
- # Apply quantization to reduce memory footprint (only if requested)
112
- # ZeroGPU has limited VRAM - use Int8 only to avoid NVRTC issues
113
- if apply_quantization:
114
- print("Applying Int8 quantization for ZeroGPU...")
115
- if hasattr(pipe, 'text_encoder') and pipe.text_encoder is not None:
116
- quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
117
- if hasattr(pipe, 'transformer') and pipe.transformer is not None:
118
- quantize_(pipe.transformer, Int8WeightOnlyConfig())
119
 
120
  # Enable VAE optimizations for all GPUs
121
  print("Enabling VAE slicing and tiling...")
@@ -123,31 +113,10 @@ def setup_intelligent_memory_pipeline(pipe, apply_quantization=True):
123
  pipe.vae.enable_slicing()
124
  pipe.vae.enable_tiling()
125
 
126
- # Memory strategy based on VRAM
127
- if total_vram >= 40:
128
- print("High VRAM GPU: Loading fully on GPU")
129
- print(" - Moving text_encoder...")
130
- if hasattr(pipe, 'text_encoder') and pipe.text_encoder is not None:
131
- pipe.text_encoder = pipe.text_encoder.to('cuda')
132
- clear_vram()
133
-
134
- print(" - Moving transformer...")
135
- if hasattr(pipe, 'transformer') and pipe.transformer is not None:
136
- pipe.transformer = pipe.transformer.to('cuda')
137
- clear_vram()
138
-
139
- print(" - Moving VAE...")
140
- if hasattr(pipe, 'vae') and pipe.vae is not None:
141
- pipe.vae = pipe.vae.to('cuda')
142
- clear_vram()
143
-
144
- return pipe
145
- elif total_vram >= 16:
146
- print("Mid-range GPU: Using model CPU offloading")
147
- pipe.enable_model_cpu_offload()
148
- return pipe
149
- else:
150
- print("Low VRAM GPU: Using sequential CPU offloading")
151
  pipe.enable_sequential_cpu_offload()
152
  return pipe
153
 
 
89
  # Load the model pipeline
90
  from safetensors.torch import load_file
91
  import torch.nn.functional as F
92
+ # REMOVED: torchao quantization - causes NVRTC kernel compilation errors
93
 
94
 
95
  # ---------------------------------------------------------------------------
 
106
 
107
  print(f"GPU: {torch.cuda.get_device_name(0)}")
108
  print(f"Total VRAM: {total_vram:.1f}GB")
 
 
 
 
 
 
 
 
 
 
109
 
110
  # Enable VAE optimizations for all GPUs
111
  print("Enabling VAE slicing and tiling...")
 
113
  pipe.vae.enable_slicing()
114
  pipe.vae.enable_tiling()
115
 
116
+ # Use CPU offloading for ZeroGPU - no quantization to avoid NVRTC issues
117
+ print("Using model CPU offloading for ZeroGPU")
118
+ pipe.enable_model_cpu_offload()
119
+ return pipe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  pipe.enable_sequential_cpu_offload()
121
  return pipe
122
 
optimization.py CHANGED
@@ -3,8 +3,6 @@
3
  from typing import Any
4
  from typing import Callable
5
  from typing import ParamSpec
6
- from torchao.quantization import quantize_
7
- from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
8
  # spaces import REMOVED — spaces.GPU / spaces.aoti_* are HuggingFace ZeroGPU-only APIs.
9
  # They do not exist outside of HF infrastructure and would crash on a local VPS.
10
  # Replaced below with standard torch.compile() which gives equivalent or better
 
3
  from typing import Any
4
  from typing import Callable
5
  from typing import ParamSpec
 
 
6
  # spaces import REMOVED — spaces.GPU / spaces.aoti_* are HuggingFace ZeroGPU-only APIs.
7
  # They do not exist outside of HF infrastructure and would crash on a local VPS.
8
  # Replaced below with standard torch.compile() which gives equivalent or better