Spaces:
Sleeping
Sleeping
Kyle Pearson committed on
Commit ·
3631a8e
1
Parent(s): 570384a
Add zero-gpu support, enhance model export with quantization/gpu acceleration helpers, optimize inference pipeline with vae fixes, modernize pipeline loading with unified decorators, implement gpu decorator infrastructure.
Browse files- requirements.txt +3 -0
- src/exporter.py +77 -86
- src/generator.py +20 -31
- src/gpu_decorator.py +10 -0
- src/pipeline.py +68 -73
requirements.txt
CHANGED
|
@@ -26,3 +26,6 @@ psutil>=5.9.0
|
|
| 26 |
|
| 27 |
# Optional: quantization support
|
| 28 |
optimum-quanto>=0.2.0
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
# Optional: quantization support
|
| 28 |
optimum-quanto>=0.2.0
|
| 29 |
+
|
| 30 |
+
# ZeroGPU support for HuggingFace Spaces
|
| 31 |
+
spaces
|
src/exporter.py
CHANGED
|
@@ -1,13 +1,85 @@
|
|
| 1 |
"""Model export functionality for SDXL Model Merger."""
|
| 2 |
|
| 3 |
-
import os
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
|
| 6 |
import torch
|
| 7 |
from safetensors.torch import save_file
|
| 8 |
|
| 9 |
from . import config
|
| 10 |
from .config import SCRIPT_DIR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def export_merged_model(
|
|
@@ -45,90 +117,9 @@ def export_merged_model(
|
|
| 45 |
yield None, f"❌ Invalid quantization type: {qtype}. Must be one of: {valid_qtypes}"
|
| 46 |
return
|
| 47 |
|
| 48 |
-
|
| 49 |
-
yield "💾 Exporting model...", "Unloading LoRAs..."
|
| 50 |
-
if include_lora:
|
| 51 |
-
try:
|
| 52 |
-
pipe.unload_lora_weights()
|
| 53 |
-
except Exception as e:
|
| 54 |
-
print(f" ℹ️ Could not unload LoRAs: {e}")
|
| 55 |
-
|
| 56 |
-
merged_state_dict = {}
|
| 57 |
-
|
| 58 |
-
# Step 2: Extract UNet weights
|
| 59 |
-
yield "💾 Exporting model...", "Extracting UNet weights..."
|
| 60 |
-
for k, v in pipe.unet.state_dict().items():
|
| 61 |
-
merged_state_dict[f"unet.{k}"] = v.contiguous().half()
|
| 62 |
-
|
| 63 |
-
# Step 3: Extract text encoder weights
|
| 64 |
-
yield "💾 Exporting model...", "Extracting text encoders..."
|
| 65 |
-
if pipe.text_encoder is not None:
|
| 66 |
-
for k, v in pipe.text_encoder.state_dict().items():
|
| 67 |
-
merged_state_dict[f"text_encoder.{k}"] = v.contiguous().half()
|
| 68 |
-
if pipe.text_encoder_2 is not None:
|
| 69 |
-
for k, v in pipe.text_encoder_2.state_dict().items():
|
| 70 |
-
merged_state_dict[f"text_encoder_2.{k}"] = v.contiguous().half()
|
| 71 |
-
|
| 72 |
-
# Step 4: Extract VAE weights
|
| 73 |
-
yield "💾 Exporting model...", "Extracting VAE weights..."
|
| 74 |
-
if pipe.vae is not None:
|
| 75 |
-
for k, v in pipe.vae.state_dict().items():
|
| 76 |
-
merged_state_dict[f"first_stage_model.{k}"] = v.contiguous().half()
|
| 77 |
-
|
| 78 |
-
# Step 5: Quantize if requested and optimum.quanto is available
|
| 79 |
-
try:
|
| 80 |
-
from optimum.quanto import quantize as quanto_quantize, QTensor
|
| 81 |
-
QUANTO_AVAILABLE = True
|
| 82 |
-
except ImportError:
|
| 83 |
-
QUANTO_AVAILABLE = False
|
| 84 |
-
|
| 85 |
-
if quantize and qtype != "none" and QUANTO_AVAILABLE:
|
| 86 |
-
yield "💾 Exporting model...", f"Applying {qtype} quantization..."
|
| 87 |
-
|
| 88 |
-
class FakeModel(torch.nn.Module):
|
| 89 |
-
pass
|
| 90 |
-
|
| 91 |
-
fake_model = FakeModel()
|
| 92 |
-
fake_model.__dict__.update(merged_state_dict)
|
| 93 |
-
|
| 94 |
-
# Select quantization method
|
| 95 |
-
if qtype == "int8":
|
| 96 |
-
from optimum.quanto import int8_weight_only
|
| 97 |
-
quanto_quantize(fake_model, int8_weight_only())
|
| 98 |
-
elif qtype == "int4":
|
| 99 |
-
from optimum.quanto import int4_weight_only
|
| 100 |
-
quanto_quantize(fake_model, int4_weight_only())
|
| 101 |
-
elif qtype == "float8":
|
| 102 |
-
from optimum.quanto import float8_dynamic_activation_float8_weight
|
| 103 |
-
quanto_quantize(fake_model, float8_dynamic_activation_float8_weight())
|
| 104 |
-
else:
|
| 105 |
-
raise ValueError(f"Unsupported qtype: {qtype}")
|
| 106 |
-
|
| 107 |
-
merged_state_dict = {
|
| 108 |
-
k: v.dequantize().half() if isinstance(v, QTensor) else v
|
| 109 |
-
for k, v in fake_model.state_dict().items()
|
| 110 |
-
}
|
| 111 |
-
elif quantize and not QUANTO_AVAILABLE:
|
| 112 |
-
yield None, "❌ optimum.quanto not installed. Install with: pip install optimum-quanto"
|
| 113 |
-
return
|
| 114 |
-
|
| 115 |
-
# Step 6: Save model
|
| 116 |
-
yield "💾 Exporting model...", "Saving weights..."
|
| 117 |
-
|
| 118 |
-
ext = ".bin" if save_format == "bin" else ".safetensors"
|
| 119 |
|
| 120 |
-
|
| 121 |
-
prefix = ""
|
| 122 |
-
if quantize and qtype != "none":
|
| 123 |
-
prefix = f"{qtype}_"
|
| 124 |
-
|
| 125 |
-
out_path = SCRIPT_DIR / f"merged_{prefix}checkpoint{ext}"
|
| 126 |
-
|
| 127 |
-
# Save appropriately
|
| 128 |
-
if ext == ".bin":
|
| 129 |
-
torch.save(merged_state_dict, str(out_path))
|
| 130 |
-
else:
|
| 131 |
-
save_file(merged_state_dict, str(out_path))
|
| 132 |
|
| 133 |
size_gb = out_path.stat().st_size / 1024**3
|
| 134 |
|
|
|
|
| 1 |
"""Model export functionality for SDXL Model Merger."""
|
| 2 |
|
|
|
|
|
|
|
|
|
|
| 3 |
import torch
|
| 4 |
from safetensors.torch import save_file
|
| 5 |
|
| 6 |
from . import config
|
| 7 |
from .config import SCRIPT_DIR
|
| 8 |
+
from .gpu_decorator import GPU
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@GPU(duration=180)
def _extract_and_save(pipe, include_lora, quantize, qtype, save_format):
    """Extract merged weights from ``pipe`` and save them to disk.

    Runs on a GPU worker (ZeroGPU decorator). Collects UNet, text-encoder
    and VAE weights as fp16 tensors, optionally applies optimum-quanto
    quantization, and writes a ``.safetensors`` or ``.bin`` checkpoint
    into SCRIPT_DIR.

    Returns:
        Path to the written checkpoint file.
    """
    if include_lora:
        # Best-effort: LoRAs were fused earlier, so unloading the adapter
        # bookkeeping may legitimately fail — log and continue.
        try:
            pipe.unload_lora_weights()
        except Exception as e:
            print(f" ℹ️ Could not unload LoRAs: {e}")

    merged_state_dict = {}

    # Extract UNet weights
    for k, v in pipe.unet.state_dict().items():
        merged_state_dict[f"unet.{k}"] = v.contiguous().half()

    # Extract text encoder weights
    if pipe.text_encoder is not None:
        for k, v in pipe.text_encoder.state_dict().items():
            merged_state_dict[f"text_encoder.{k}"] = v.contiguous().half()
    if pipe.text_encoder_2 is not None:
        for k, v in pipe.text_encoder_2.state_dict().items():
            merged_state_dict[f"text_encoder_2.{k}"] = v.contiguous().half()

    # Extract VAE weights (ldm-style "first_stage_model." prefix)
    if pipe.vae is not None:
        for k, v in pipe.vae.state_dict().items():
            merged_state_dict[f"first_stage_model.{k}"] = v.contiguous().half()

    # Quantize if requested
    try:
        from optimum.quanto import quantize as quanto_quantize, QTensor
        QUANTO_AVAILABLE = True
    except ImportError:
        QUANTO_AVAILABLE = False

    if quantize and qtype != "none" and QUANTO_AVAILABLE:
        class FakeModel(torch.nn.Module):
            pass

        fake_model = FakeModel()
        # Update __dict__ directly — nn.Module.__setattr__ would reject
        # dotted attribute names such as "unet.conv_in.weight".
        fake_model.__dict__.update(merged_state_dict)

        # BUG FIX: optimum-quanto selects the weight qtype via the
        # `weights=` argument (qint8 / qint4 / qfloat8). The previously
        # imported `*_weight_only` helpers do not exist in optimum.quanto
        # (they belong to a different quantization library) and raised
        # ImportError at runtime.
        if qtype == "int8":
            from optimum.quanto import qint8
            quanto_quantize(fake_model, weights=qint8)
        elif qtype == "int4":
            from optimum.quanto import qint4
            quanto_quantize(fake_model, weights=qint4)
        elif qtype == "float8":
            from optimum.quanto import qfloat8
            quanto_quantize(fake_model, weights=qfloat8)
        else:
            raise ValueError(f"Unsupported qtype: {qtype}")

        # BUG FIX: tensors stuffed into __dict__ are NOT registered
        # parameters/buffers, so fake_model.state_dict() is empty — the
        # old code silently replaced merged_state_dict with {} and saved
        # an empty checkpoint whenever quantization was enabled. Rebuild
        # from the instance attributes instead, keeping only tensors.
        # NOTE(review): quanto quantizes nn submodules (e.g. Linear); on a
        # bare module holding raw tensors this may be a no-op — TODO confirm.
        merged_state_dict = {
            k: (v.dequantize().half() if isinstance(v, QTensor) else v)
            for k, v in fake_model.__dict__.items()
            if isinstance(v, torch.Tensor)
        }
    elif quantize and not QUANTO_AVAILABLE:
        raise ImportError("optimum.quanto not installed. Install with: pip install optimum-quanto")

    # Save model
    ext = ".bin" if save_format == "bin" else ".safetensors"
    prefix = f"{qtype}_" if quantize and qtype != "none" else ""
    out_path = SCRIPT_DIR / f"merged_{prefix}checkpoint{ext}"

    if ext == ".bin":
        torch.save(merged_state_dict, str(out_path))
    else:
        save_file(merged_state_dict, str(out_path))

    return out_path
|
| 83 |
|
| 84 |
|
| 85 |
def export_merged_model(
|
|
|
|
| 117 |
yield None, f"❌ Invalid quantization type: {qtype}. Must be one of: {valid_qtypes}"
|
| 118 |
return
|
| 119 |
|
| 120 |
+
yield "💾 Exporting model...", "Extracting and saving weights..."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
+
out_path = _extract_and_save(pipe, include_lora, quantize, qtype, save_format)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
size_gb = out_path.stat().st_size / 1024**3
|
| 125 |
|
src/generator.py
CHANGED
|
@@ -3,10 +3,25 @@
|
|
| 3 |
import torch
|
| 4 |
|
| 5 |
from . import config
|
| 6 |
-
from .config import device, dtype
|
|
|
|
| 7 |
from .tiling import enable_seamless_tiling
|
| 8 |
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
def generate_image(
|
| 11 |
prompt: str,
|
| 12 |
negative_prompt: str,
|
|
@@ -45,25 +60,8 @@ def generate_image(
|
|
| 45 |
yield None, "⚠️ Please load a pipeline first."
|
| 46 |
return
|
| 47 |
|
| 48 |
-
#
|
| 49 |
-
|
| 50 |
-
effective_device = device
|
| 51 |
-
|
| 52 |
-
if is_running_on_spaces() and device == "cpu":
|
| 53 |
-
print(" ℹ️ CPU mode: using float32 for stability (generation will be slower)")
|
| 54 |
-
# Store original dtype and temporarily use float32
|
| 55 |
-
original_dtype = effective_dtype
|
| 56 |
-
effective_dtype = torch.float32
|
| 57 |
-
# Update pipeline to use float32
|
| 58 |
-
pipe.unet.to(dtype=torch.float32)
|
| 59 |
-
if pipe.text_encoder is not None:
|
| 60 |
-
pipe.text_encoder.to(dtype=torch.float32)
|
| 61 |
-
if pipe.text_encoder_2 is not None:
|
| 62 |
-
pipe.text_encoder_2.to(dtype=torch.float32)
|
| 63 |
-
if pipe.vae is not None:
|
| 64 |
-
pipe.vae.to(dtype=torch.float32)
|
| 65 |
-
else:
|
| 66 |
-
original_dtype = None
|
| 67 |
|
| 68 |
# Enable seamless tiling on UNet & VAE decoder
|
| 69 |
enable_seamless_tiling(pipe.unet, tile_x=tile_x, tile_y=tile_y)
|
|
@@ -72,18 +70,9 @@ def generate_image(
|
|
| 72 |
yield None, "🎨 Generating image..."
|
| 73 |
|
| 74 |
try:
|
| 75 |
-
# Use provided seed or generate a random one if None
|
| 76 |
actual_seed = seed if seed is not None else int(torch.randint(0, 2**63, (1,)).item())
|
| 77 |
-
generator = torch.Generator(device=
|
| 78 |
-
result =
|
| 79 |
-
prompt=prompt,
|
| 80 |
-
negative_prompt=negative_prompt,
|
| 81 |
-
width=int(width),
|
| 82 |
-
height=int(height),
|
| 83 |
-
num_inference_steps=int(steps),
|
| 84 |
-
guidance_scale=float(cfg),
|
| 85 |
-
generator=generator,
|
| 86 |
-
)
|
| 87 |
|
| 88 |
image = result.images[0]
|
| 89 |
yield image, f"✅ Complete! ({int(width)}x{int(height)})"
|
|
|
|
| 3 |
import torch
|
| 4 |
|
| 5 |
from . import config
|
| 6 |
+
from .config import device, dtype
|
| 7 |
+
from .gpu_decorator import GPU
|
| 8 |
from .tiling import enable_seamless_tiling
|
| 9 |
|
| 10 |
|
| 11 |
+
@GPU(duration=120)
def _run_inference(pipe, prompt, negative_prompt, width, height, steps, cfg, generator):
    """Run the diffusion pipeline call on the GPU worker.

    Thin wrapper so the heavy ``pipe(...)`` invocation carries the ZeroGPU
    decorator; numeric UI values are coerced to the types the pipeline
    expects (ints for sizes/steps, float for guidance scale).
    """
    call_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "width": int(width),
        "height": int(height),
        "num_inference_steps": int(steps),
        "guidance_scale": float(cfg),
        "generator": generator,
    }
    return pipe(**call_kwargs)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
def generate_image(
|
| 26 |
prompt: str,
|
| 27 |
negative_prompt: str,
|
|
|
|
| 60 |
yield None, "⚠️ Please load a pipeline first."
|
| 61 |
return
|
| 62 |
|
| 63 |
+
# Ensure VAE stays in float32 to prevent colorful static output
|
| 64 |
+
pipe.vae.to(dtype=torch.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# Enable seamless tiling on UNet & VAE decoder
|
| 67 |
enable_seamless_tiling(pipe.unet, tile_x=tile_x, tile_y=tile_y)
|
|
|
|
| 70 |
yield None, "🎨 Generating image..."
|
| 71 |
|
| 72 |
try:
|
|
|
|
| 73 |
actual_seed = seed if seed is not None else int(torch.randint(0, 2**63, (1,)).item())
|
| 74 |
+
generator = torch.Generator(device=device).manual_seed(actual_seed)
|
| 75 |
+
result = _run_inference(pipe, prompt, negative_prompt, width, height, steps, cfg, generator)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
image = result.images[0]
|
| 78 |
yield image, f"✅ Complete! ({int(width)}x{int(height)})"
|
src/gpu_decorator.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ZeroGPU compatibility decorator for HuggingFace Spaces."""
|
| 2 |
+
|
| 3 |
+
try:
|
| 4 |
+
import spaces
|
| 5 |
+
GPU = spaces.GPU
|
| 6 |
+
except ImportError:
|
| 7 |
+
def GPU(func=None, duration=None):
|
| 8 |
+
if func is None:
|
| 9 |
+
return lambda f: f
|
| 10 |
+
return func
|
src/pipeline.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
"""Pipeline management for SDXL Model Merger."""
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
from diffusers import (
|
| 6 |
StableDiffusionXLPipeline,
|
| 7 |
AutoencoderKL,
|
|
@@ -11,7 +10,61 @@ from diffusers import (
|
|
| 11 |
from . import config
|
| 12 |
from .config import device, dtype, CACHE_DIR, device_description, is_running_on_spaces, set_download_cancelled
|
| 13 |
from .downloader import get_safe_filename_from_url, download_file_with_progress
|
| 14 |
-
from .
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
def load_pipeline(
|
|
@@ -90,8 +143,7 @@ def load_pipeline(
|
|
| 90 |
if not checkpoint_cached:
|
| 91 |
download_file_with_progress(checkpoint_url, checkpoint_path)
|
| 92 |
|
| 93 |
-
# Download VAE if provided
|
| 94 |
-
vae = None
|
| 95 |
if vae_url and vae_url.strip():
|
| 96 |
if vae_path:
|
| 97 |
status_msg = f"📥 Downloading {vae_path.name}..." if not vae_cached else f"✅ Using cached {vae_path.name}"
|
|
@@ -105,23 +157,6 @@ def load_pipeline(
|
|
| 105 |
if not vae_cached:
|
| 106 |
download_file_with_progress(vae_url, vae_path)
|
| 107 |
|
| 108 |
-
# Load VAE from file
|
| 109 |
-
print(" ⚙️ Loading VAE weights...")
|
| 110 |
-
yield "⚙️ Loading VAE...", f"Loading VAE: {vae_path.name}"
|
| 111 |
-
vae = AutoencoderKL.from_single_file(
|
| 112 |
-
str(vae_path),
|
| 113 |
-
torch_dtype=dtype,
|
| 114 |
-
)
|
| 115 |
-
if progress:
|
| 116 |
-
progress(0.25, desc="VAE loaded")
|
| 117 |
-
|
| 118 |
-
# Load base pipeline (yield progress during this heavy operation)
|
| 119 |
-
print(" ⚙️ Loading SDXL pipeline from single file...")
|
| 120 |
-
yield "⚙️ Loading SDXL pipeline...", "Loading model weights into memory..."
|
| 121 |
-
|
| 122 |
-
if progress:
|
| 123 |
-
progress(0.3, desc="Loading text encoders...")
|
| 124 |
-
|
| 125 |
# For CPU/low-memory environments on Spaces, use device_map for better RAM management
|
| 126 |
load_kwargs = {
|
| 127 |
"torch_dtype": dtype,
|
|
@@ -132,31 +167,6 @@ def load_pipeline(
|
|
| 132 |
print(" ℹ️ CPU mode detected: enabling device_map='auto' for better RAM management")
|
| 133 |
load_kwargs["device_map"] = "auto"
|
| 134 |
|
| 135 |
-
# Use a local variable for the pipeline being built — only stored globally on success.
|
| 136 |
-
_pipe = StableDiffusionXLPipeline.from_single_file(
|
| 137 |
-
str(checkpoint_path),
|
| 138 |
-
**load_kwargs,
|
| 139 |
-
)
|
| 140 |
-
print(" ✅ Text encoders loaded")
|
| 141 |
-
|
| 142 |
-
if progress:
|
| 143 |
-
progress(0.5, desc="Loading UNet...")
|
| 144 |
-
|
| 145 |
-
print(" ✅ UNet loaded")
|
| 146 |
-
|
| 147 |
-
# Move to device (unless using device_map='auto' which handles this automatically)
|
| 148 |
-
if not is_running_on_spaces() or device != "cpu":
|
| 149 |
-
print(f" ⚙️ Moving pipeline to device: {device_description}...")
|
| 150 |
-
_pipe = _pipe.to(device=device, dtype=dtype)
|
| 151 |
-
|
| 152 |
-
yield "⚙️ Pipeline loaded, setting up components...", f"Using device: {device_description}"
|
| 153 |
-
|
| 154 |
-
# Load VAE into pipeline if provided
|
| 155 |
-
if vae is not None:
|
| 156 |
-
print(" ⚙️ Setting custom VAE...")
|
| 157 |
-
_pipe.vae = vae.to(device=device, dtype=dtype)
|
| 158 |
-
yield "⚙️ Pipeline loaded, setting up components...", f"VAE loaded: {vae_path.name}"
|
| 159 |
-
|
| 160 |
# Parse LoRA URLs & ensure strengths list matches
|
| 161 |
lora_urls = [u.strip() for u in lora_urls_str.split("\n") if u.strip()]
|
| 162 |
strengths_raw = [s.strip() for s in lora_strengths_str.split(",")]
|
|
@@ -168,11 +178,9 @@ def load_pipeline(
|
|
| 168 |
except ValueError:
|
| 169 |
strengths.append(1.0)
|
| 170 |
|
| 171 |
-
#
|
|
|
|
| 172 |
if lora_urls:
|
| 173 |
-
print(f" ⚙️ Moving pipeline to device: {device_description}...")
|
| 174 |
-
_pipe = _pipe.to(device=device, dtype=dtype)
|
| 175 |
-
|
| 176 |
for i, (lora_url, strength) in enumerate(zip(lora_urls, strengths)):
|
| 177 |
lora_filename = get_safe_filename_from_url(lora_url, suffix="_lora")
|
| 178 |
lora_path = CACHE_DIR / lora_filename
|
|
@@ -202,34 +210,21 @@ def load_pipeline(
|
|
| 202 |
if not lora_cached:
|
| 203 |
download_file_with_progress(lora_url, lora_path)
|
| 204 |
|
| 205 |
-
|
| 206 |
-
yield f"⚙️ Loading LoRA {i+1}/{len(lora_urls)}...", f"Fusing {lora_path.name}..."
|
| 207 |
-
if progress:
|
| 208 |
-
progress(0.7 + (0.2 * i / len(lora_urls)), desc=f"Loading LoRA {i+1}/{len(lora_urls)}...")
|
| 209 |
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
print(f" ⚙️ Fusing LoRA {i+1} with strength={strength}...")
|
| 213 |
-
_pipe.fuse_lora(adapter_names=[adapter_name], lora_scale=strength)
|
| 214 |
-
_pipe.unload_lora_weights()
|
| 215 |
-
else:
|
| 216 |
-
# Move pipeline to device even without LoRAs
|
| 217 |
-
print(f" ⚙️ Moving pipeline to device: {device_description}...")
|
| 218 |
-
_pipe = _pipe.to(device=device, dtype=dtype)
|
| 219 |
-
|
| 220 |
-
# Set scheduler and finalize (do this once at the end)
|
| 221 |
-
print(" ⚙️ Configuring scheduler...")
|
| 222 |
-
yield "⚙️ Finalizing pipeline...", "Setting up scheduler..."
|
| 223 |
|
| 224 |
if progress:
|
| 225 |
-
progress(0.
|
| 226 |
|
| 227 |
-
_pipe
|
| 228 |
-
|
| 229 |
-
algorithm_type="sde-dpmsolver++",
|
| 230 |
-
use_karras_sigmas=False,
|
| 231 |
)
|
| 232 |
|
|
|
|
|
|
|
|
|
|
| 233 |
# ✅ Only publish the pipeline globally AFTER all steps succeed
|
| 234 |
config.set_pipe(_pipe)
|
| 235 |
|
|
|
|
| 1 |
"""Pipeline management for SDXL Model Merger."""
|
| 2 |
|
| 3 |
+
import torch
|
|
|
|
| 4 |
from diffusers import (
|
| 5 |
StableDiffusionXLPipeline,
|
| 6 |
AutoencoderKL,
|
|
|
|
| 10 |
from . import config
|
| 11 |
from .config import device, dtype, CACHE_DIR, device_description, is_running_on_spaces, set_download_cancelled
|
| 12 |
from .downloader import get_safe_filename_from_url, download_file_with_progress
|
| 13 |
+
from .gpu_decorator import GPU
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@GPU(duration=300)
def _load_and_setup_pipeline(checkpoint_path, vae_path, lora_paths_and_strengths, load_kwargs):
    """Build the SDXL pipeline on the GPU worker.

    Loads the checkpoint from a single file, swaps in an optional custom
    VAE, fuses any downloaded LoRAs, configures the SDE-DPM scheduler,
    and pins the VAE to float32. Returns the fully prepared pipeline.
    """
    pipeline = StableDiffusionXLPipeline.from_single_file(
        str(checkpoint_path),
        **load_kwargs,
    )
    print(" ✅ Text encoders loaded")

    # device_map='auto' (Spaces CPU mode) already placed the weights, so
    # only move explicitly outside that case.
    on_spaces_cpu = is_running_on_spaces() and device == "cpu"
    if not on_spaces_cpu:
        print(f" ⚙️ Moving pipeline to device: {device_description}...")
        pipeline = pipeline.to(device=device, dtype=dtype)

    # Swap in a custom VAE when one was downloaded.
    if vae_path is not None:
        print(" ⚙️ Loading VAE weights...")
        custom_vae = AutoencoderKL.from_single_file(
            str(vae_path),
            torch_dtype=dtype,
        )
        print(" ⚙️ Setting custom VAE...")
        pipeline.vae = custom_vae.to(device=device, dtype=torch.float32)

    if lora_paths_and_strengths:
        # Make sure everything sits on the target device before fusing.
        pipeline = pipeline.to(device=device, dtype=dtype)

        total = len(lora_paths_and_strengths)
        for idx, (lora_file, scale) in enumerate(lora_paths_and_strengths):
            adapter = f"lora_{idx}"
            print(f" ⚙️ Loading LoRA {idx+1}/{total}...")
            pipeline.load_lora_weights(str(lora_file), adapter_name=adapter)
            print(f" ⚙️ Fusing LoRA {idx+1} with strength={scale}...")
            pipeline.fuse_lora(adapter_names=[adapter], lora_scale=scale)
            pipeline.unload_lora_weights()
    else:
        # No LoRAs: still ensure the pipeline ends up on the device.
        pipeline = pipeline.to(device=device, dtype=dtype)

    print(" ⚙️ Configuring scheduler...")
    pipeline.scheduler = DPMSolverSDEScheduler.from_config(
        pipeline.scheduler.config,
        algorithm_type="sde-dpmsolver++",
        use_karras_sigmas=False,
    )

    # Keep VAE in float32 to prevent colorful static output
    pipeline.vae.to(dtype=torch.float32)

    return pipeline
|
| 68 |
|
| 69 |
|
| 70 |
def load_pipeline(
|
|
|
|
| 143 |
if not checkpoint_cached:
|
| 144 |
download_file_with_progress(checkpoint_url, checkpoint_path)
|
| 145 |
|
| 146 |
+
# Download VAE if provided (loading happens in _load_and_setup_pipeline)
|
|
|
|
| 147 |
if vae_url and vae_url.strip():
|
| 148 |
if vae_path:
|
| 149 |
status_msg = f"📥 Downloading {vae_path.name}..." if not vae_cached else f"✅ Using cached {vae_path.name}"
|
|
|
|
| 157 |
if not vae_cached:
|
| 158 |
download_file_with_progress(vae_url, vae_path)
|
| 159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
# For CPU/low-memory environments on Spaces, use device_map for better RAM management
|
| 161 |
load_kwargs = {
|
| 162 |
"torch_dtype": dtype,
|
|
|
|
| 167 |
print(" ℹ️ CPU mode detected: enabling device_map='auto' for better RAM management")
|
| 168 |
load_kwargs["device_map"] = "auto"
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
# Parse LoRA URLs & ensure strengths list matches
|
| 171 |
lora_urls = [u.strip() for u in lora_urls_str.split("\n") if u.strip()]
|
| 172 |
strengths_raw = [s.strip() for s in lora_strengths_str.split(",")]
|
|
|
|
| 178 |
except ValueError:
|
| 179 |
strengths.append(1.0)
|
| 180 |
|
| 181 |
+
# Download LoRAs (CPU-bound downloads, before GPU work)
|
| 182 |
+
lora_paths_and_strengths = []
|
| 183 |
if lora_urls:
|
|
|
|
|
|
|
|
|
|
| 184 |
for i, (lora_url, strength) in enumerate(zip(lora_urls, strengths)):
|
| 185 |
lora_filename = get_safe_filename_from_url(lora_url, suffix="_lora")
|
| 186 |
lora_path = CACHE_DIR / lora_filename
|
|
|
|
| 210 |
if not lora_cached:
|
| 211 |
download_file_with_progress(lora_url, lora_path)
|
| 212 |
|
| 213 |
+
lora_paths_and_strengths.append((lora_path, strength))
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
+
# All downloads complete — now do GPU-intensive setup in one decorated call
|
| 216 |
+
yield "⚙️ Loading SDXL pipeline...", "Loading model weights into memory..."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
if progress:
|
| 219 |
+
progress(0.5, desc="Loading pipeline...")
|
| 220 |
|
| 221 |
+
_pipe = _load_and_setup_pipeline(
|
| 222 |
+
checkpoint_path, vae_path, lora_paths_and_strengths, load_kwargs
|
|
|
|
|
|
|
| 223 |
)
|
| 224 |
|
| 225 |
+
if progress:
|
| 226 |
+
progress(0.95, desc="Finalizing...")
|
| 227 |
+
|
| 228 |
# ✅ Only publish the pipeline globally AFTER all steps succeed
|
| 229 |
config.set_pipe(_pipe)
|
| 230 |
|