Dervlex commited on
Commit
926efac
·
verified ·
1 Parent(s): b98ab32

Upload 17 files

Browse files
ComfyUI-TiledDiffusion/.gitignore ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ backup*
163
+ **/.DS_Store
164
+ **/.venv
165
+ **/.vscode
166
+
167
+ .*
ComfyUI-TiledDiffusion/.patches.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def calc_cond_batch(model, conds, x_in, timestep, model_options):
    """Tiled-diffusion override of ComfyUI's `calc_cond_batch`.

    When tiled diffusion is not active, delegates to the saved original
    implementation. Otherwise it batches compatible conds together (sized to
    free VRAM), runs the model once per batch, and accumulates the weighted
    outputs per cond slot, normalizing by the accumulated weights at the end.

    Returns a list of denoised tensors, one per entry in `conds`.
    """
    # Fall through to the stashed original when this sampler run is not tiled.
    if 'tiled_diffusion' not in model_options:
        return calc_cond_batch_original_tiled_diffusion_875b8c8d(model, conds, x_in, timestep, model_options)
    out_conds = []
    out_counts = []
    to_run = []

    for i in range(len(conds)):
        out_conds.append(torch.zeros_like(x_in))
        # Tiny epsilon instead of zeros so the final division never hits 0/0.
        out_counts.append(torch.ones_like(x_in) * 1e-37)

        cond = conds[i]
        if cond is not None:
            for x in cond:
                # p bundles (input_x, mult, conditioning, area, control, patches);
                # None means this cond does not apply at the current timestep.
                p = get_area_and_mult(x, x_in, timestep)
                if p is None:
                    continue

                to_run += [(p, i)]

    while len(to_run) > 0:
        first = to_run[0]
        first_shape = first[0][0].shape
        to_batch_temp = []
        # Collect every pending cond that can be concatenated with the first one.
        for x in range(len(to_run)):
            if can_concat_cond(to_run[x][0], first[0]):
                to_batch_temp += [x]

        # Reversed so later pops remove from the tail first (indices stay valid).
        to_batch_temp.reverse()
        to_batch = to_batch_temp[:1]

        # Grow the batch as large as free VRAM allows (with a 1.5x safety margin).
        free_memory = model_management.get_free_memory(x_in.device)
        for i in range(1, len(to_batch_temp) + 1):
            batch_amount = to_batch_temp[:len(to_batch_temp)//i]
            input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
            if model.memory_required(input_shape) * 1.5 < free_memory:
                to_batch = batch_amount
                break

        input_x = []
        mult = []
        c = []
        cond_or_uncond = []
        area = []
        control = None
        patches = None
        for x in to_batch:
            o = to_run.pop(x)
            p = o[0]
            input_x.append(p.input_x)
            mult.append(p.mult)
            c.append(p.conditioning)
            area.append(p.area)
            cond_or_uncond.append(o[1])
            control = p.control
            patches = p.patches

        batch_chunks = len(cond_or_uncond)
        input_x = torch.cat(input_x)
        c = cond_cat(c)
        timestep_ = torch.cat([timestep] * batch_chunks)

        if control is not None:
            # Key difference from stock comfy: under tiled diffusion the raw
            # ControlNet object is passed through so the tiling wrapper can
            # crop/apply hints per tile instead of computing control here.
            c['control'] = control if 'tiled_diffusion' in model_options else control.get_control(input_x, timestep_, c, len(cond_or_uncond))

        transformer_options = {}
        if 'transformer_options' in model_options:
            transformer_options = model_options['transformer_options'].copy()

        if patches is not None:
            # Merge per-cond patches on top of any global transformer patches.
            if "patches" in transformer_options:
                cur_patches = transformer_options["patches"].copy()
                for p in patches:
                    if p in cur_patches:
                        cur_patches[p] = cur_patches[p] + patches[p]
                    else:
                        cur_patches[p] = patches[p]
                transformer_options["patches"] = cur_patches
            else:
                transformer_options["patches"] = patches

        transformer_options["cond_or_uncond"] = cond_or_uncond[:]
        transformer_options["sigmas"] = timestep

        c['transformer_options'] = transformer_options

        if 'model_function_wrapper' in model_options:
            output = model_options['model_function_wrapper'](model.apply_model, {"input": input_x, "timestep": timestep_, "c": c, "cond_or_uncond": cond_or_uncond}).chunk(batch_chunks)
        else:
            output = model.apply_model(input_x, timestep_, **c).chunk(batch_chunks)

        for o in range(batch_chunks):
            cond_index = cond_or_uncond[o]
            a = area[o]
            if a is None:
                # Full-canvas cond: accumulate everywhere.
                out_conds[cond_index] += output[o] * mult[o]
                out_counts[cond_index] += mult[o]
            else:
                # Area-restricted cond: narrow into the sub-region and
                # accumulate in place (narrow returns a view).
                out_c = out_conds[cond_index]
                out_cts = out_counts[cond_index]
                dims = len(a) // 2
                for i in range(dims):
                    out_c = out_c.narrow(i + 2, a[i + dims], a[i])
                    out_cts = out_cts.narrow(i + 2, a[i + dims], a[i])
                out_c += output[o] * mult[o]
                out_cts += mult[o]

    # Normalize each accumulator by its coverage weights.
    for i in range(len(out_conds)):
        out_conds[i] /= out_counts[i]

    return out_conds
113
def create_blur_map(x0, attn, sigma=3.0, threshold=1.0):
    """Blend `x0` with a Gaussian-blurred copy of itself, gated by attention.

    Tokens whose pooled attention score exceeds `threshold` select the blurred
    version; everything else keeps the original pixels (self-attention-guidance
    style masking — presumably; confirm against caller).
    """
    _, hw1, hw2 = attn.shape
    batch, _, lat_h, lat_w = x0.shape

    # Average over heads, then sum over queries, to score each key token.
    attn = attn.reshape(batch, -1, hw1, hw2)
    selection = attn.mean(1, keepdim=False).sum(1, keepdim=False) > threshold

    def _closest_factor_pair(n):
        # Factor pair of n whose members are as close as possible.
        for cand in range(int(math.sqrt(n)), 0, -1):
            if n % cand == 0:
                return (cand, n // cand)

    pair = _closest_factor_pair(hw1)
    # Orient the pair so the taller grid axis follows the taller latent axis.
    grid_h = max(pair) if lat_h > lat_w else min(pair)
    grid_w = pair[1] if grid_h == pair[0] else pair[0]

    # Un-flatten the token mask into its 2D grid and upsample to latent size.
    selection = selection.reshape(batch, grid_h, grid_w).unsqueeze(1).type(attn.dtype)
    selection = F.interpolate(selection, (lat_h, lat_w))

    smoothed = gaussian_blur_2d(x0, kernel_size=9, sigma=sigma)
    return smoothed * selection + x0 * (1 - selection)
143
+
144
def pre_run_control(model, conds):
    """Patched `pre_run_control`: resets the tiled-diffusion wrapper state,
    then prepares each cond's ControlNet for the upcoming sampling run."""
    s = model.model_sampling

    def find_outer_instance(target:str, target_type):
        # Walk up to 7 caller frames looking for a local named `target` of
        # type `target_type`. Fragile by nature — depends on comfy's call
        # stack shape at the time this patch runs.
        import inspect
        frame = inspect.currentframe()
        i = 0
        while frame and i < 7:
            if (found:=frame.f_locals.get(target, None)) is not None:
                if isinstance(found, target_type):
                    return found
            frame = frame.f_back
            i += 1
        return None
    from comfy.model_patcher import ModelPatcher
    # If the caller holds a ModelPatcher whose model_function_wrapper is a
    # tiled-diffusion AbstractDiffusion, reset its per-run caches now.
    if (_model:=find_outer_instance('model', ModelPatcher)) is not None:
        if (model_function_wrapper:=_model.model_options.get('model_function_wrapper', None)) is not None:
            import sys
            # The module name depends on how the custom node was loaded, so
            # fall back to scanning sys.modules for any 'tiled_diffusion'.
            tiled_diffusion = sys.modules.get('ComfyUI-TiledDiffusion.tiled_diffusion', None)
            if tiled_diffusion is None:
                for key in sys.modules:
                    if 'tiled_diffusion' in key:
                        tiled_diffusion = sys.modules[key]
                        break
            if (AbstractDiffusion:=getattr(tiled_diffusion, 'AbstractDiffusion', None)) is not None:
                if isinstance(model_function_wrapper, AbstractDiffusion):
                    model_function_wrapper.reset()

    for t in range(len(conds)):
        x = conds[t]

        # NOTE(review): timestep_start/timestep_end are assigned but unused —
        # likely left over from the upstream function this patch mirrors.
        timestep_start = None
        timestep_end = None
        percent_to_timestep_function = lambda a: s.percent_to_sigma(a)
        if 'control' in x:
            # Best-effort cleanup of any previous run's hint tensors.
            try: x['control'].cleanup()
            except Exception: ...
            x['control'].pre_run(model, percent_to_timestep_function)
182
def _set_position(self, boxes, masks, positive_embeddings):
    """Build a per-transformer-block callback that injects position conditioning.

    The conditioning tensor is computed once from `position_net`; the returned
    closure applies the block indexed by `extra_options["transformer_index"]`.
    """
    position_objs = self.position_net(boxes, masks, positive_embeddings)

    def apply(x, extra_options):
        idx = extra_options["transformer_index"]
        block = self.module_list[idx]
        # Move/cast the conditioning lazily to match the activation tensor.
        return block(x, position_objs.to(device=x.device, dtype=x.dtype))

    return apply
189
+
ComfyUI-TiledDiffusion/README.md ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Tiled Diffusion & VAE for ComfyUI
2
+
3
+ Check out the [SD-WebUI extension](https://github.com/pkuliyi2015/multidiffusion-upscaler-for-automatic1111/) for more information.
4
+
5
+ This extension enables **large image drawing & upscaling with limited VRAM** via the following techniques:
6
+
7
+ 1. Two SOTA diffusion tiling algorithms: [Mixture of Diffusers](https://github.com/albarji/mixture-of-diffusers) <a href="https://arxiv.org/abs/2302.02412"><img width="32" alt="Mixture of Diffusers Paper" src="https://github.com/shiimizu/ComfyUI-TiledDiffusion/assets/54494639/b753b7f6-f9c0-405d-bace-792b9bbce5d5"></a> and [MultiDiffusion](https://github.com/omerbt/MultiDiffusion) <a href="https://arxiv.org/abs/2302.08113"><img width="32" alt="MultiDiffusion Paper" src="https://github.com/shiimizu/ComfyUI-TiledDiffusion/assets/54494639/b753b7f6-f9c0-405d-bace-792b9bbce5d5"></a>
8
+ 2. pkuliyi2015 & Kahsolt's Tiled VAE algorithm.
9
+ 3. ~~pkuliyi2015 & Kahsolt's Tiled Noise Inversion for better upscaling.~~
10
+
11
+ > [!NOTE]
12
+ > Sizes/dimensions are in pixels and then converted to latent-space sizes.
13
+
14
+
15
+ ## Features
16
+ - [x] SDXL model support
17
+ - [x] ControlNet support
18
+ - [ ] ~~StableSR support~~
19
+ - [ ] ~~Tiled Noise Inversion~~
20
+ - [x] Tiled VAE
21
+ - [ ] Regional Prompt Control
22
+ - [x] Img2img upscale
23
+ - [x] Ultra-Large image generation
24
+
25
+ ## Tiled Diffusion
26
+
27
+ <div align="center">
28
+ <img width="500" alt="Tiled_Diffusion" src="https://github.com/shiimizu/ComfyUI-TiledDiffusion/assets/54494639/7cb897a3-a645-426f-8742-d6ba5cf04b64">
29
+ </div>
30
+
31
+ > [!TIP]
32
+ > Set `tile_overlap` to 0 and `denoise` to 1 to see the tile seams and then adjust the options to your needs. Also, increase `tile_batch_size` to increase speed (if your machine can handle it).
33
+
34
+ | Name | Description |
35
+ |-------------------|--------------------------------------------------------------|
36
+ | `method` | Tiling [strategy](https://github.com/pkuliyi2015/multidiffusion-upscaler-for-automatic1111/blob/fbb24736c9bc374c7f098f82b575fcd14a73936a/scripts/tilediffusion.py#L39-L46). `MultiDiffusion` or `Mixture of Diffusers`. |
37
+ | `tile_width` | Tile's width |
38
+ | `tile_height` | Tile's height |
39
+ | `tile_overlap` | Tile's overlap |
40
+ | `tile_batch_size` | The number of tiles to process in a batch |
41
+
42
+ ### How can I specify the tiles' arrangement?
43
+
44
+ If you have the [Math Expression](https://github.com/pythongosssss/ComfyUI-Custom-Scripts#math-expression) node (or something similar), you can use that to pass in the latent that's passed in your KSampler and divide the `tile_height`/`tile_width` by the number of rows/columns you want.
45
+
46
+ `C` = number of columns you want
47
+ `R` = number of rows you want
48
+
49
+ `pixel width of input image or latent // C` = `tile_width`
50
+ `pixel height of input image or latent // R` = `tile_height`
51
+
52
+ <img width="800" alt="Tile_arrangement" src="https://github.com/shiimizu/ComfyUI-TiledDiffusion/assets/54494639/9952e7d8-909e-436f-a284-c00f0fb71665">
53
+
54
+ ## Tiled VAE
55
+
56
+ <div align="center">
57
+ <img width="900" alt="Tiled_VAE" src="https://github.com/shiimizu/ComfyUI-TiledDiffusion/assets/54494639/b5850e03-2cac-49ce-b1fe-a67906bf4c9d">
58
+ </div>
59
+
60
+ <br>
61
+
62
+ The recommended tile sizes are given upon the creation of the node based on the available VRAM.
63
+
64
+ > [!NOTE]
65
+ > Enabling `fast` for the decoder may produce images with slightly higher contrast and brightness.
66
+
67
+
68
+ | Name | Description |
69
+ |-------------|----------------------------------------------------------------------------------------------------------------------------------------------|
70
+ | `tile_size` | <blockquote>The image is split into tiles, which are then padded with 11/32 pixels in the decoder/encoder.</blockquote> |
71
+ | `fast` | <blockquote><p>When Fast Mode is disabled:</p> <ol> <li>The original VAE forward is decomposed into a task queue and a task worker, which starts to process each tile.</li> <li>When GroupNorm is needed, it suspends, stores current GroupNorm mean and var, send everything to RAM, and turns to the next tile.</li> <li>After all GroupNorm means and vars are summarized, it applies group norm to tiles and continues. </li> <li>A zigzag execution order is used to reduce unnecessary data transfer.</li> </ol> <p>When Fast Mode is enabled:</p> <ol> <li>The original input is downsampled and passed to a separate task queue.</li> <li>Its group norm parameters are recorded and used by all tiles&#39; task queues.</li> <li>Each tile is separately processed without any RAM-VRAM data transfer.</li> </ol> <p>After all tiles are processed, tiles are written to a result buffer and returned.</p></blockquote> |
72
+ | `color_fix` | <blockquote>Only estimate GroupNorm before downsampling, i.e., run in a semi-fast mode.</blockquote><p>Only for the encoder. Can restore colors if tiles are too small.</p> |
73
+
74
+
75
+
76
+ ## Workflows
77
+
78
+ The following images can be loaded in ComfyUI.
79
+
80
+
81
+ <div align="center">
82
+ <img alt="ComfyUI_07501_" src="https://github.com/shiimizu/ComfyUI-TiledDiffusion/assets/54494639/c3713cfb-e083-4df4-a310-9467827ee666">
83
+ <p>Simple upscale.</p>
84
+ </div>
85
+
86
+ <br>
87
+
88
+ <div align="center">
89
+
90
+ <img alt="ComfyUI_07503_" src="https://github.com/shiimizu/ComfyUI-TiledDiffusion/assets/54494639/b681b617-4bb1-49e5-b85a-ef5a0f6e4830">
91
+ <p>4x upscale. 3 passes.</p>
92
+ </div>
93
+
94
+ ## Citation
95
+
96
+ ```bibtex
97
+ @article{jimenez2023mixtureofdiffusers,
98
+ title={Mixture of Diffusers for scene composition and high resolution image generation},
99
+ author={Álvaro Barbero Jiménez},
100
+ journal={arXiv preprint arXiv:2302.02412},
101
+ year={2023}
102
+ }
103
+ ```
104
+
105
+ ```bibtex
106
+ @article{bar2023multidiffusion,
107
+ title={MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation},
108
+ author={Bar-Tal, Omer and Yariv, Lior and Lipman, Yaron and Dekel, Tali},
109
+ journal={arXiv preprint arXiv:2302.08113},
110
+ year={2023}
111
+ }
112
+ ```
ComfyUI-TiledDiffusion/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
from .tiled_diffusion import NODE_CLASS_MAPPINGS as TD_NCM, NODE_DISPLAY_NAME_MAPPINGS as TD_NDCM
from .tiled_vae import NODE_CLASS_MAPPINGS as TV_NCM, NODE_DISPLAY_NAME_MAPPINGS as TV_NDCM

# Merge the node registries from both submodules. Later entries win on key
# clashes, matching the original update() ordering (tiled_vae last).
NODE_CLASS_MAPPINGS = {**TD_NCM, **TV_NCM}
NODE_DISPLAY_NAME_MAPPINGS = {**TD_NDCM, **TV_NDCM}

__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
ComfyUI-TiledDiffusion/__pycache__/.patches.cpython-310.pyc ADDED
Binary file (5 kB). View file
 
ComfyUI-TiledDiffusion/__pycache__/.patches.cpython-311.pyc ADDED
Binary file (10.4 kB). View file
 
ComfyUI-TiledDiffusion/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (431 Bytes). View file
 
ComfyUI-TiledDiffusion/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (717 Bytes). View file
 
ComfyUI-TiledDiffusion/__pycache__/tiled_diffusion.cpython-310.pyc ADDED
Binary file (19 kB). View file
 
ComfyUI-TiledDiffusion/__pycache__/tiled_diffusion.cpython-311.pyc ADDED
Binary file (37 kB). View file
 
ComfyUI-TiledDiffusion/__pycache__/tiled_vae.cpython-310.pyc ADDED
Binary file (24.9 kB). View file
 
ComfyUI-TiledDiffusion/__pycache__/tiled_vae.cpython-311.pyc ADDED
Binary file (45.7 kB). View file
 
ComfyUI-TiledDiffusion/__pycache__/utils.cpython-310.pyc ADDED
Binary file (7.83 kB). View file
 
ComfyUI-TiledDiffusion/__pycache__/utils.cpython-311.pyc ADDED
Binary file (12.9 kB). View file
 
ComfyUI-TiledDiffusion/tiled_diffusion.py ADDED
@@ -0,0 +1,650 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import division
2
+ import torch
3
+ from torch import Tensor
4
+ import comfy.model_management
5
+ from comfy.model_patcher import ModelPatcher
6
+ import comfy.model_patcher
7
+ from comfy.model_base import BaseModel
8
+ from typing import List, Union, Tuple, Dict
9
+ from nodes import ImageScale
10
+ import comfy.utils
11
+ from comfy.controlnet import ControlNet, T2IAdapter
12
+
13
opt_C = 4  # latent channel count — presumably SD-style 4-channel latents; confirm for other model families
opt_f = 8  # VAE spatial downscale factor; used to convert pixel coords/sizes to latent space and back
15
+
16
def ceildiv(big, small):
    """Integer ceiling division, equivalent to math.ceil(big / small) but exact
    (no float round-trip): negate, floor-divide, negate back."""
    return -(-big // small)
19
+
20
from enum import Enum


class BlendMode(Enum):
    """Layer type used when blending regions (i.e. LayerType)."""
    FOREGROUND = 'Foreground'
    BACKGROUND = 'Background'
24
+
25
class Processing:
    """Empty stub kept for code ported from the A1111 extension."""


class Device:
    """Bare attribute namespace; `devices.device` holds the torch device."""


devices = Device()
devices.device = comfy.model_management.get_torch_device()
29
+
30
def null_decorator(fn):
    """No-op decorator used purely as a semantic tag on methods.

    Returns `fn` unchanged. The previous implementation wrapped `fn` in a
    pass-through closure, which added a pointless call layer and discarded
    the function's metadata (`__name__`, `__doc__`, signature) since it did
    not use functools.wraps. Returning the function itself is call-compatible
    for every decorated method and keeps introspection intact.
    """
    return fn


# Semantic aliases: mark which feature area a method belongs to; no behavior.
keep_signature = null_decorator
controlnet = null_decorator
stablesr = null_decorator
grid_bbox = null_decorator
custom_bbox = null_decorator
noise_inverse = null_decorator
41
+
42
class BBox:
    """Axis-aligned grid tile in latent coordinates.

    Stores the corner/size, the `[x1, y1, x2, y2]` box, and a 4D slicer
    (`[:, :, y:y+h, x:x+w]`) for indexing NCHW tensors.
    """

    def __init__(self, x:int, y:int, w:int, h:int):
        self.x, self.y = x, y
        self.w, self.h = w, h
        self.box = [x, y, x + w, y + h]
        self.slicer = (slice(None), slice(None), slice(y, y + h), slice(x, x + w))

    def __getitem__(self, idx:int) -> int:
        """Index into the [x1, y1, x2, y2] box."""
        return self.box[idx]
55
+
56
def split_bboxes(w:int, h:int, tile_w:int, tile_h:int, overlap:int=16, init_weight:Union[Tensor, float]=1.0) -> Tuple[List[BBox], Tensor]:
    """Split a (h, w) latent canvas into a grid of overlapping tiles.

    Returns the tile BBoxes (row-major) and a (1, 1, h, w) coverage map that
    accumulates `init_weight` for every tile covering each pixel.
    """
    n_cols = ceildiv(w - overlap, tile_w - overlap)
    n_rows = ceildiv(h - overlap, tile_h - overlap)
    # Even spacing of tile origins; a single tile per axis needs no stride.
    step_x = (w - tile_w) / (n_cols - 1) if n_cols > 1 else 0
    step_y = (h - tile_h) / (n_rows - 1) if n_rows > 1 else 0

    tiles: List[BBox] = []
    coverage = torch.zeros((1, 1, h, w), device=devices.device, dtype=torch.float32)
    for row_idx in range(n_rows):
        top = min(int(row_idx * step_y), h - tile_h)
        for col_idx in range(n_cols):
            left = min(int(col_idx * step_x), w - tile_w)
            tile = BBox(left, top, tile_w, tile_h)
            tiles.append(tile)
            coverage[tile.slicer] += init_weight

    return tiles, coverage
74
+
75
class CustomBBox(BBox):
    """Region-control bbox (regional prompt); currently identical to BBox."""
78
+
79
+ class AbstractDiffusion:
80
    def __init__(self):
        """Shared state for tiled-diffusion samplers (MultiDiffusion / Mixture of Diffusers subclasses)."""
        # `method` is the concrete subclass name, used as a label.
        self.method = self.__class__.__name__
        self.pbar = None


        # latent-space canvas size; set before tiling begins
        self.w: int = 0
        self.h: int = 0
        # user-facing tile settings, populated externally before sampling
        self.tile_width: int = None
        self.tile_height: int = None
        self.tile_overlap: int = None
        self.tile_batch_size: int = None

        # cache. final result of current sampling step, [B, C=4, H//8, W//8]
        # avoiding overhead of creating new tensors and weight summing
        self.x_buffer: Tensor = None
        # self.w: int = int(self.p.width // opt_f) # latent size
        # self.h: int = int(self.p.height // opt_f)
        # weights for background & grid bboxes
        self._weights: Tensor = None
        # self.weights: Tensor = torch.zeros((1, 1, self.h, self.w), device=devices.device, dtype=torch.float32)
        self._init_grid_bbox = None
        self._init_done = None

        # count the step correctly
        self.step_count = 0
        self.inner_loop_count = 0
        self.kdiff_step = -1

        # ext. Grid tiling painting (grid bbox)
        self.enable_grid_bbox: bool = False
        self.tile_w: int = None
        self.tile_h: int = None
        self.tile_bs: int = None
        self.num_tiles: int = None
        self.num_batches: int = None
        self.batched_bboxes: List[List[BBox]] = []

        # ext. Region Prompt Control (custom bbox)
        self.enable_custom_bbox: bool = False
        self.custom_bboxes: List[CustomBBox] = []
        # self.cond_basis: Cond = None
        # self.uncond_basis: Uncond = None
        # self.draw_background: bool = True # by default we draw major prompts in grid tiles
        # self.causal_layers: bool = None

        # ext. ControlNet
        self.enable_controlnet: bool = False
        # self.controlnet_script: ModuleType = None
        self.control_tensor_batch_dict = {}
        self.control_tensor_batch: List[List[Tensor]] = [[]]
        # self.control_params: Dict[str, Tensor] = None # {}
        self.control_params: Dict[Tuple, List[List[Tensor]]] = {}
        self.control_tensor_cpu: bool = None
        self.control_tensor_custom: List[List[Tensor]] = []

        self.draw_background: bool = True # by default we draw major prompts in grid tiles
        # NOTE(review): control_tensor_cpu is annotated None above and then
        # unconditionally overwritten here — the earlier line looks redundant.
        self.control_tensor_cpu = False
        self.weights = None
        self.imagescale = ImageScale()
139
+
140
+ def reset(self):
141
+ tile_width = self.tile_width
142
+ tile_height = self.tile_height
143
+ tile_overlap = self.tile_overlap
144
+ tile_batch_size = self.tile_batch_size
145
+ self.__init__()
146
+ self.tile_width = tile_width
147
+ self.tile_height = tile_height
148
+ self.tile_overlap = tile_overlap
149
+ self.tile_batch_size = tile_batch_size
150
+
151
+ def repeat_tensor(self, x:Tensor, n:int, concat=False, concat_to=0) -> Tensor:
152
+ ''' repeat the tensor on it's first dim '''
153
+ if n == 1: return x
154
+ B = x.shape[0]
155
+ r_dims = len(x.shape) - 1
156
+ if B == 1: # batch_size = 1 (not `tile_batch_size`)
157
+ shape = [n] + [-1] * r_dims # [N, -1, ...]
158
+ return x.expand(shape) # `expand` is much lighter than `tile`
159
+ else:
160
+ if concat:
161
+ return torch.cat([x for _ in range(n)], dim=0)[:concat_to]
162
+ shape = [n] + [1] * r_dims # [N, 1, ...]
163
+ return x.repeat(shape)
164
    def update_pbar(self):
        """Advance the tiling progress bar, once per inner (per-tile-batch) pass.

        NOTE(review): `sampling_step` is hard-coded to 20 but only used as a
        sentinel to tell "subsequent call within a step" from "first call of a
        step" — the literal value looks arbitrary; confirm intent.
        """
        if self.pbar.n >= self.pbar.total:
            self.pbar.close()
        else:
            # self.pbar.update()
            sampling_step = 20
            if self.step_count == sampling_step:
                # Subsequent call within the same step: count one inner pass.
                self.inner_loop_count += 1
                if self.inner_loop_count < self.total_bboxes:
                    self.pbar.update()
            else:
                # First call of a new step: mark it and reset the inner counter.
                self.step_count = sampling_step
                self.inner_loop_count = 0
177
+ def reset_buffer(self, x_in:Tensor):
178
+ # Judge if the shape of x_in is the same as the shape of x_buffer
179
+ if self.x_buffer is None or self.x_buffer.shape != x_in.shape:
180
+ self.x_buffer = torch.zeros_like(x_in, device=x_in.device, dtype=x_in.dtype)
181
+ else:
182
+ self.x_buffer.zero_()
183
+
184
    @grid_bbox
    def init_grid_bbox(self, tile_w:int, tile_h:int, overlap:int, tile_bs:int):
        """Split the (self.h, self.w) latent into overlapping tiles and batch them.

        Populates `self.weights` (per-pixel coverage), the tile grid counts,
        and `self.batched_bboxes` (tiles grouped into ~tile_bs-sized batches).
        All sizes are in latent units.
        """
        # if self._init_grid_bbox is not None: return
        # self._init_grid_bbox = True
        self.weights = torch.zeros((1, 1, self.h, self.w), device=devices.device, dtype=torch.float32)
        self.enable_grid_bbox = True

        # clamp tile size to the canvas; cap overlap so tiles still advance
        self.tile_w = min(tile_w, self.w)
        self.tile_h = min(tile_h, self.h)
        overlap = max(0, min(overlap, min(tile_w, tile_h) - 4))
        # split the latent into overlapped tiles, then batching
        # weights basically indicate how many times a pixel is painted
        bboxes, weights = split_bboxes(self.w, self.h, self.tile_w, self.tile_h, overlap, self.get_tile_weights())
        self.weights += weights
        self.num_tiles = len(bboxes)
        self.num_batches = ceildiv(self.num_tiles , tile_bs)
        # re-derive the batch size so batches come out evenly sized
        self.tile_bs = ceildiv(len(bboxes) , self.num_batches) # optimal_batch_size
        self.batched_bboxes = [bboxes[i*self.tile_bs:(i+1)*self.tile_bs] for i in range(self.num_batches)]
202
+
203
+ @grid_bbox
204
+ def get_tile_weights(self) -> Union[Tensor, float]:
205
+ return 1.0
206
+
207
+ @noise_inverse
208
+ def init_noise_inverse(self, steps:int, retouch:float, get_cache_callback, set_cache_callback, renoise_strength:float, renoise_kernel:int):
209
+ self.noise_inverse_enabled = True
210
+ self.noise_inverse_steps = steps
211
+ self.noise_inverse_retouch = float(retouch)
212
+ self.noise_inverse_renoise_strength = float(renoise_strength)
213
+ self.noise_inverse_renoise_kernel = int(renoise_kernel)
214
+ self.noise_inverse_set_cache = set_cache_callback
215
+ self.noise_inverse_get_cache = get_cache_callback
216
+
217
+ def init_done(self):
218
+ '''
219
+ Call this after all `init_*`, settings are done, now perform:
220
+ - settings sanity check
221
+ - pre-computations, cache init
222
+ - anything thing needed before denoising starts
223
+ '''
224
+
225
+ # if self._init_done is not None: return
226
+ # self._init_done = True
227
+ self.total_bboxes = 0
228
+ if self.enable_grid_bbox: self.total_bboxes += self.num_batches
229
+ if self.enable_custom_bbox: self.total_bboxes += len(self.custom_bboxes)
230
+ assert self.total_bboxes > 0, "Nothing to paint! No background to draw and no custom bboxes were provided."
231
+
232
+ # sampling_steps = _steps
233
+ # self.pbar = tqdm(total=(self.total_bboxes) * sampling_steps, desc=f"{self.method} Sampling: ")
234
+
235
    @controlnet
    def prepare_controlnet_tensors(self, refresh:bool=False, tensor=None):
        ''' Crop the control tensor into tiles and cache them '''
        # Skip when caches already exist, unless the caller forces a refresh.
        if not refresh:
            if self.control_tensor_batch is not None or self.control_params is not None: return
        tensors = [tensor]
        self.org_control_tensor_batch = tensors
        self.control_tensor_batch = []
        for i in range(len(tensors)):
            control_tile_list = []
            control_tensor = tensors[i]
            for bboxes in self.batched_bboxes:
                single_batch_tensors = []
                for bbox in bboxes:
                    # ensure NCHW before slicing (mutates in place on purpose)
                    if len(control_tensor.shape) == 3:
                        control_tensor.unsqueeze_(0)
                    # bbox coords are latent-space; scale by opt_f to index the pixel-space hint
                    control_tile = control_tensor[:, :, bbox[1]*opt_f:bbox[3]*opt_f, bbox[0]*opt_f:bbox[2]*opt_f]
                    single_batch_tensors.append(control_tile)
                # one tensor per tile batch, stacked along the batch dim
                control_tile = torch.cat(single_batch_tensors, dim=0)
                if self.control_tensor_cpu:
                    # optionally park the cache in RAM to save VRAM
                    control_tile = control_tile.cpu()
                control_tile_list.append(control_tile)
            self.control_tensor_batch.append(control_tile_list)

            # custom (region) bboxes are cached as individual, unbatched crops
            if len(self.custom_bboxes) > 0:
                custom_control_tile_list = []
                for bbox in self.custom_bboxes:
                    if len(control_tensor.shape) == 3:
                        control_tensor.unsqueeze_(0)
                    control_tile = control_tensor[:, :, bbox[1]*opt_f:bbox[3]*opt_f, bbox[0]*opt_f:bbox[2]*opt_f]
                    if self.control_tensor_cpu:
                        control_tile = control_tile.cpu()
                    custom_control_tile_list.append(control_tile)
                self.control_tensor_custom.append(custom_control_tile_list)
269
+
270
    @controlnet
    def switch_controlnet_tensors(self, batch_id:int, x_batch_size:int, tile_batch_size:int, is_denoise=False):
        """Broadcast the cached control tiles for `batch_id` to the latent batch size.

        NOTE(review): this rewrites `self.control_tensor_batch` in place, so a
        second call with x_batch_size > 1 would re-expand already-expanded
        tiles — presumably the caller guards against that; confirm.
        """
        # if not self.enable_controlnet: return
        if self.control_tensor_batch is None: return
        # self.control_params = [0]

        # for param_id in range(len(self.control_params)):
        for param_id in range(len(self.control_tensor_batch)):
            # tensor that was concatenated in `prepare_controlnet_tensors`
            control_tile = self.control_tensor_batch[param_id][batch_id]
            # broadcast to latent batch size
            if x_batch_size > 1: # self.is_kdiff:
                all_control_tile = []
                for i in range(tile_batch_size):
                    # duplicate each tile's hint once per latent in the batch
                    this_control_tile = [control_tile[i].unsqueeze(0)] * x_batch_size
                    all_control_tile.append(torch.cat(this_control_tile, dim=0))
                control_tile = torch.cat(all_control_tile, dim=0) # [:x_tile.shape[0]]
                self.control_tensor_batch[param_id][batch_id] = control_tile
            # else:
            # control_tile = control_tile.repeat([x_batch_size if is_denoise else x_batch_size * 2, 1, 1, 1])
            # self.control_params[param_id].hint_cond = control_tile.to(devices.device)
292
    def process_controlnet(self, x_shape, x_dtype, c_in: dict, cond_or_uncond: List, bboxes, batch_size: int, batch_id: int):
        """Prepare (upscale, broadcast, and tile) the hint image of every chained ControlNet.

        Walks the `previous_controlnet` chain, resizing each `cond_hint_original` to the full
        pixel resolution, then slicing it into the current tile bboxes and caching the result
        in `self.control_params` keyed by (cond_or_uncond..., *x_shape).

        @param x_shape: shape of the concatenated tile latent (used as part of the cache key)
        @param x_dtype: fallback dtype for the hint when the control model declares none
        @param c_in: conditioning dict; `c_in['control']` is the head of the ControlNet chain
        @param cond_or_uncond: ComfyUI cond/uncond marker list (part of the cache key)
        @param bboxes: latent-space tile bboxes [x0, y0, x1, y1] for this tile batch
        @param batch_size: latent batch size N to broadcast the hint across
        @param batch_id: index of the current tile batch (cache slot)
        """
        control: ControlNet = c_in['control']
        param_id = -1 # current controlnet & previous_controlnets
        tuple_key = tuple(cond_or_uncond) + tuple(x_shape)
        while control is not None:
            param_id += 1
            # full pixel-space resolution; assumes an 8x latent->pixel factor — TODO confirm for non-SD VAEs
            PH, PW = self.h*8, self.w*8

            # lazily grow the nested cache: control_params[key][param_id][batch_id]
            if self.control_params.get(tuple_key, None) is None:
                self.control_params[tuple_key] = [[None]]
            val = self.control_params[tuple_key]
            if param_id+1 >= len(val):
                val.extend([[None] for _ in range(param_id+1)])
            if len(self.batched_bboxes) >= len(val[param_id]):
                val[param_id].extend([[None] for _ in range(len(self.batched_bboxes))])

            while len(self.control_params[tuple_key]) <= param_id:
                self.control_params[tuple_key].extend([None])

            while len(self.control_params[tuple_key][param_id]) <= batch_id:
                self.control_params[tuple_key][param_id].extend([None])

            # Below is taken from comfy.controlnet.py, but we need to additionally tile the cnets.
            # Recompute when the grid was refreshed, the hint was never built, or the cache slot is cold.
            if self.refresh or control.cond_hint is None or not isinstance(self.control_params[tuple_key][param_id][batch_id], Tensor):
                dtype = getattr(control, 'manual_cast_dtype', None)
                if dtype is None: dtype = getattr(getattr(control, 'control_model', None), 'dtype', None)
                if dtype is None: dtype = x_dtype
                if isinstance(control, T2IAdapter):
                    width, height = control.scale_image_to(PW, PH)
                    control.cond_hint = comfy.utils.common_upscale(control.cond_hint_original, width, height, 'nearest-exact', "center").float().to(control.device)
                    if control.channels_in == 1 and control.cond_hint.shape[1] > 1:
                        control.cond_hint = torch.mean(control.cond_hint, 1, keepdim=True)
                elif control.__class__.__name__ == 'ControlLLLiteAdvanced':
                    if control.sub_idxs is not None and control.cond_hint_original.shape[0] >= control.full_latent_length:
                        control.cond_hint = comfy.utils.common_upscale(control.cond_hint_original[control.sub_idxs], PW, PH, 'nearest-exact', "center").to(dtype=dtype, device=control.device)
                    else:
                        if (PH, PW) == (control.cond_hint_original.shape[-2], control.cond_hint_original.shape[-1]):
                            control.cond_hint = control.cond_hint_original.clone().to(dtype=dtype, device=control.device)
                        else:
                            control.cond_hint = comfy.utils.common_upscale(control.cond_hint_original, PW, PH, 'nearest-exact', "center").to(dtype=dtype, device=control.device)
                else:
                    if (PH, PW) == (control.cond_hint_original.shape[-2], control.cond_hint_original.shape[-1]):
                        control.cond_hint = control.cond_hint_original.clone().to(dtype=dtype, device=control.device)
                    else:
                        control.cond_hint = comfy.utils.common_upscale(control.cond_hint_original, PW, PH, 'nearest-exact', 'center').to(dtype=dtype, device=control.device)

                # Broadcast then tile
                #
                # Below can be in the parent's if clause because self.refresh will trigger on resolution change, e.g. cause of ConditioningSetArea
                # so that particular case isn't cached atm.
                cond_hint_pre_tile = control.cond_hint
                if control.cond_hint.shape[0] < batch_size :
                    cond_hint_pre_tile = self.repeat_tensor(control.cond_hint, ceildiv(batch_size, control.cond_hint.shape[0]))[:batch_size]
                # pixel-space slices of the hint, one per latent tile bbox
                cns = [cond_hint_pre_tile[:, :, bbox[1]*opt_f:bbox[3]*opt_f, bbox[0]*opt_f:bbox[2]*opt_f] for bbox in bboxes]
                control.cond_hint = torch.cat(cns, dim=0)
                self.control_params[tuple_key][param_id][batch_id]=control.cond_hint
            else:
                control.cond_hint = self.control_params[tuple_key][param_id][batch_id]
            control = control.previous_controlnet
355
import numpy as np
from numpy import pi, exp, sqrt

def gaussian_weights(tile_w:int, tile_h:int) -> Tensor:
    '''
    Copy from the original implementation of Mixture of Diffusers
    https://github.com/albarji/mixture-of-diffusers/blob/master/mixdiff/tiling.py
    This generates gaussian weights to smooth the noise of each tile.
    This is critical for this method to work.

    Returns a (tile_h, tile_w) float32 tensor on `devices.device`.
    '''
    # NOTE(review): the variance scale uses tile_w*tile_w for BOTH axes, and the y midpoint
    # is tile_h/2 while x uses (tile_w-1)/2 — this asymmetry matches upstream; kept as-is.
    f = lambda x, midpoint, var=0.01: exp(-(x-midpoint)*(x-midpoint) / (tile_w*tile_w) / (2*var)) / sqrt(2*pi*var)
    x_probs = [f(x, (tile_w - 1) / 2) for x in range(tile_w)] # -1 because index goes from 0 to latent_width - 1
    y_probs = [f(y, tile_h / 2) for y in range(tile_h)]

    # outer product gives the separable 2D weight map
    w = np.outer(y_probs, x_probs)
    return torch.from_numpy(w).to(devices.device, dtype=torch.float32)
371
class CondDict: ...  # empty placeholder type used only for type-hint readability
372
+
373
class MultiDiffusion(AbstractDiffusion):
    """MultiDiffusion tiling: overlapping tile predictions are summed into a canvas
    and divided by the per-pixel overlap count (uniform averaging)."""

    @torch.no_grad()
    def __call__(self, model_function: BaseModel.apply_model, args: dict):
        """UNet wrapper (ComfyUI `set_model_unet_function_wrapper` contract).

        Splits the latent into grid tiles, runs the model per tile batch, and
        fuses the per-tile outputs back into a full-size latent.
        """
        x_in: Tensor = args["input"]              # noisy latent [N, C, H, W]
        t_in: Tensor = args["timestep"]
        c_in: dict = args["c"]
        cond_or_uncond: List = args["cond_or_uncond"]
        c_crossattn: Tensor = c_in['c_crossattn']

        N, C, H, W = x_in.shape

        # comfyui can feed in a latent that's a different size cause of SetArea, so we'll refresh in that case.
        self.refresh = False
        if self.weights is None or self.h != H or self.w != W:
            self.h, self.w = H, W
            self.refresh = True
            self.init_grid_bbox(self.tile_width, self.tile_height, self.tile_overlap, self.tile_batch_size)
            # init everything done, perform sanity check & pre-computations
            self.init_done()
        self.h, self.w = H, W
        # clear buffer canvas
        self.reset_buffer(x_in)

        # Background sampling (grid bbox)
        if self.draw_background:
            for batch_id, bboxes in enumerate(self.batched_bboxes):
                if comfy.model_management.processing_interrupted():
                    return x_in

                # batching & compute tiles
                x_tile = torch.cat([x_in[bbox.slicer] for bbox in bboxes], dim=0) # [TB, C, TH, TW]
                n_rep = len(bboxes)
                ts_tile = self.repeat_tensor(t_in, n_rep)
                cond_tile = self.repeat_tensor(c_crossattn, n_rep)
                c_tile = c_in.copy()
                c_tile['c_crossattn'] = cond_tile
                if 'time_context' in c_in:
                    c_tile['time_context'] = self.repeat_tensor(c_in['time_context'], n_rep)
                for key in c_tile:
                    if key in ['y', 'c_concat']:
                        icond = c_tile[key]
                        # spatial conds (same H/W as the latent) are sliced per tile;
                        # everything else is just repeated per tile
                        if icond.shape[2:] == (self.h, self.w):
                            c_tile[key] = torch.cat([icond[bbox.slicer] for bbox in bboxes])
                        else:
                            c_tile[key] = self.repeat_tensor(icond, n_rep)

                # controlnet tiling
                if 'control' in c_in:
                    control = c_in['control']
                    self.process_controlnet(x_tile.shape, x_tile.dtype, c_in, cond_or_uncond, bboxes, N, batch_id)
                    c_tile['control'] = control.get_control(x_tile, ts_tile, c_tile, len(cond_or_uncond))

                x_tile_out = model_function(x_tile, ts_tile, **c_tile)

                # de-batch: accumulate each tile's prediction into the shared canvas
                for i, bbox in enumerate(bboxes):
                    self.x_buffer[bbox.slicer] += x_tile_out[i*N:(i+1)*N, :, :, :]
                del x_tile_out, x_tile, ts_tile, c_tile

        # Averaging background buffer: where tiles overlap, divide by the overlap count
        x_out = torch.where(self.weights > 1, self.x_buffer / self.weights, self.x_buffer)

        return x_out
445
class MixtureOfDiffusers(AbstractDiffusion):
    """
    Mixture-of-Diffusers Implementation
    https://github.com/albarji/mixture-of-diffusers

    Like MultiDiffusion, but each tile's prediction is blended with a Gaussian
    weight map, so tile seams fade smoothly instead of being uniformly averaged.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # weights for custom bboxes
        self.custom_weights: List[Tensor] = []
        self.get_weight = gaussian_weights

    def init_done(self):
        """Finish initialization: pre-compute the rescaling needed for numerical stability."""
        super().init_done()
        # The original gaussian weights can be extremely small, so we rescale them for numerical stability
        self.rescale_factor = 1 / self.weights
        # Meanwhile, we rescale the custom weights in advance to save time of slicing
        for bbox_id, bbox in enumerate(self.custom_bboxes):
            if bbox.blend_mode == BlendMode.BACKGROUND:
                self.custom_weights[bbox_id] *= self.rescale_factor[bbox.slicer]

    @grid_bbox
    def get_tile_weights(self) -> Tensor:
        """Return the Gaussian weight map for a grid tile (recomputed every call)."""
        # x_in can change sizes cause of ConditioningSetArea, so we have to recalculate each time
        self.tile_weights = self.get_weight(self.tile_w, self.tile_h)
        return self.tile_weights

    @torch.no_grad()
    def __call__(self, model_function: BaseModel.apply_model, args: dict):
        """UNet wrapper: tile the latent, denoise per tile, and blend with Gaussian weights."""
        x_in: Tensor = args["input"]
        t_in: Tensor = args["timestep"]
        c_in: dict = args["c"]
        cond_or_uncond: List = args["cond_or_uncond"]
        c_crossattn: Tensor = c_in['c_crossattn']

        N, C, H, W = x_in.shape

        # refresh the grid when the latent size changes (e.g. ConditioningSetArea)
        self.refresh = False
        if self.weights is None or self.h != H or self.w != W:
            self.h, self.w = H, W
            self.refresh = True
            self.init_grid_bbox(self.tile_width, self.tile_height, self.tile_overlap, self.tile_batch_size)
            # init everything done, perform sanity check & pre-computations
            self.init_done()
        self.h, self.w = H, W
        # clear buffer canvas
        self.reset_buffer(x_in)

        # Global sampling
        if self.draw_background:
            for batch_id, bboxes in enumerate(self.batched_bboxes): # batch_id is the `Latent tile batch size`
                if comfy.model_management.processing_interrupted():
                    return x_in

                # batching
                x_tile_list = []
                t_tile_list = []
                icond_map = {}
                for bbox in bboxes:
                    x_tile_list.append(x_in[bbox.slicer])
                    t_tile_list.append(t_in)
                    if isinstance(c_in, dict):
                        # spatial conds present in sdxl ('y', 'c_concat') are sliced per tile
                        for key in ['y', 'c_concat']:
                            if key in c_in:
                                icond = c_in[key]
                                if icond.shape[2:] == (self.h, self.w):
                                    icond = icond[bbox.slicer]
                                if icond_map.get(key, None) is None:
                                    icond_map[key] = []
                                icond_map[key].append(icond)
                    else:
                        print('>> [WARN] not supported, make an issue on github!!')
                n_rep = len(bboxes)
                x_tile = torch.cat(x_tile_list, dim=0)               # differs per tile
                t_tile = self.repeat_tensor(t_in, n_rep)             # just repeat
                tcond_tile = self.repeat_tensor(c_crossattn, n_rep)  # just repeat
                c_tile = c_in.copy()
                c_tile['c_crossattn'] = tcond_tile
                if 'time_context' in c_in:
                    c_tile['time_context'] = self.repeat_tensor(c_in['time_context'], n_rep) # just repeat
                for key in c_tile:
                    if key in ['y', 'c_concat']:
                        icond_tile = torch.cat(icond_map[key], dim=0) # differs per tile
                        c_tile[key] = icond_tile

                # controlnet
                if 'control' in c_in:
                    control = c_in['control']
                    self.process_controlnet(x_tile.shape, x_tile.dtype, c_in, cond_or_uncond, bboxes, N, batch_id)
                    c_tile['control'] = control.get_control(x_tile, t_tile, c_tile, len(cond_or_uncond))

                # denoising: here the x is the noise
                x_tile_out = model_function(x_tile, t_tile, **c_tile)

                # de-batching
                for i, bbox in enumerate(bboxes):
                    # These weights can be calculated in advance, but will cost a lot of vram
                    # when you have many tiles. So we calculate it here.
                    w = self.tile_weights * self.rescale_factor[bbox.slicer]
                    self.x_buffer[bbox.slicer] += x_tile_out[i*N:(i+1)*N, :, :, :] * w
                del x_tile_out, x_tile, t_tile, c_tile

        # weights already sum to 1 after rescaling, so no final division is needed
        x_out = self.x_buffer

        return x_out
578
from .utils import hook_all
hook_all()  # install the extension's runtime patches at import time (see utils.hook_all)
+
581
MAX_RESOLUTION = 8192  # upper bound for pixel-space tile sizes exposed in the node UI

class TiledDiffusion():
    """ComfyUI node: wrap a model so its UNet runs tiled (MultiDiffusion or Mixture of Diffusers)."""

    @classmethod
    def INPUT_TYPES(s):
        # tile_* values are in PIXELS here; `apply` converts them to latent units (// opt_f)
        return {"required": {"model": ("MODEL", ),
                             "method": (["MultiDiffusion", "Mixture of Diffusers"], {"default": "Mixture of Diffusers"}),
                             "tile_width": ("INT", {"default": 96*opt_f, "min": 16, "max": MAX_RESOLUTION, "step": 16}),
                             "tile_height": ("INT", {"default": 96*opt_f, "min": 16, "max": MAX_RESOLUTION, "step": 16}),
                             "tile_overlap": ("INT", {"default": 8*opt_f, "min": 0, "max": 256*opt_f, "step": 4*opt_f}),
                             "tile_batch_size": ("INT", {"default": 4, "min": 1, "max": MAX_RESOLUTION, "step": 1}),
                             }}
    RETURN_TYPES = ("MODEL",)
    FUNCTION = "apply"
    CATEGORY = "_for_testing"

    def apply(self, model: ModelPatcher, method, tile_width, tile_height, tile_overlap, tile_batch_size):
        """Clone `model` and install the chosen tiling implementation as its UNet wrapper.

        @param model: the ComfyUI model to wrap (cloned; the input is not mutated)
        @param method: "MultiDiffusion" or "Mixture of Diffusers"
        @return: 1-tuple with the wrapped model clone
        """
        if method == "Mixture of Diffusers":
            implement = MixtureOfDiffusers()
        else:
            implement = MultiDiffusion()

        # convert pixel-space UI values to latent units
        implement.tile_width = tile_width // opt_f
        implement.tile_height = tile_height // opt_f
        implement.tile_overlap = tile_overlap // opt_f
        implement.tile_batch_size = tile_batch_size
        # hijack the behaviours
        model = model.clone()
        model.set_model_unet_function_wrapper(implement)
        model.model_options['tiled_diffusion'] = True
        return (model,)
+
623
class NoiseInversion():
    """ComfyUI node stub for noise inversion.

    Currently unregistered and unimplemented: `sample` passes the latent
    through unchanged.
    """

    RETURN_TYPES = ("LATENT",)
    FUNCTION = "sample"
    CATEGORY = "sampling"

    @classmethod
    def INPUT_TYPES(s):
        required = {
            "model": ("MODEL", ),
            "positive": ("CONDITIONING", ),
            "negative": ("CONDITIONING", ),
            "latent_image": ("LATENT", ),
            "image": ("IMAGE", ),
            "steps": ("INT", {"default": 10, "min": 1, "max": 208, "step": 1}),
            "retouch": ("FLOAT", {"default": 1, "min": 1, "max": 100, "step": 0.1}),
            "renoise_strength": ("FLOAT", {"default": 1, "min": 1, "max": 2, "step": 0.01}),
            "renoise_kernel_size": ("INT", {"default": 2, "min": 2, "max": 512, "step": 1}),
        }
        return {"required": required}

    def sample(self, model: ModelPatcher, positive, negative,
               latent_image, image, steps, retouch, renoise_strength, renoise_kernel_size):
        # Placeholder: return the input latent untouched.
        return (latent_image,)
+
643
# ComfyUI registration tables: node class and human-readable display name.
NODE_CLASS_MAPPINGS = {
    "TiledDiffusion": TiledDiffusion,
    # "NoiseInversion": NoiseInversion,
}
NODE_DISPLAY_NAME_MAPPINGS = {
    "TiledDiffusion": "Tiled Diffusion",
    # "NoiseInversion": "Noise Inversion",
}
ComfyUI-TiledDiffusion/tiled_vae.py ADDED
@@ -0,0 +1,868 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ # ------------------------------------------------------------------------
3
+ #
4
+ # Tiled VAE
5
+ #
6
+ # Introducing a revolutionary new optimization designed to make
7
+ # the VAE work with giant images on limited VRAM!
8
+ # Say goodbye to the frustration of OOM and hello to seamless output!
9
+ #
10
+ # ------------------------------------------------------------------------
11
+ #
12
+ # This script is a wild hack that splits the image into tiles,
13
+ # encodes each tile separately, and merges the result back together.
14
+ #
15
+ # Advantages:
16
+ # - The VAE can now work with giant images on limited VRAM
17
+ # (~10 GB for 8K images!)
18
+ # - The merged output is completely seamless without any post-processing.
19
+ #
20
+ # Drawbacks:
21
+ # - NaNs always appear for 8k images when you use a fp16 (half) VAE
22
+ # You must use --no-half-vae to disable half VAE for that giant image.
23
+ # - The gradient calculation is not compatible with this hack. It
24
+ # will break any backward() or torch.autograd.grad() that passes VAE.
25
+ # (But you can still use the VAE to generate training data.)
26
+ #
27
+ # How it works:
28
+ # 1. The image is split into tiles, which are then padded with 11/32 pixels in the decoder/encoder.
29
+ # 2. When Fast Mode is disabled:
30
+ # 1. The original VAE forward is decomposed into a task queue and a task worker, which starts to process each tile.
31
+ # 2. When GroupNorm is needed, it suspends, stores current GroupNorm mean and var, send everything to RAM, and turns to the next tile.
32
+ # 3. After all GroupNorm means and vars are summarized, it applies group norm to tiles and continues.
33
+ # 4. A zigzag execution order is used to reduce unnecessary data transfer.
34
+ # 3. When Fast Mode is enabled:
35
+ # 1. The original input is downsampled and passed to a separate task queue.
36
+ # 2. Its group norm parameters are recorded and used by all tiles' task queues.
37
+ # 3. Each tile is separately processed without any RAM-VRAM data transfer.
38
+ # 4. After all tiles are processed, tiles are written to a result buffer and returned.
39
+ # Encoder color fix = only estimate GroupNorm before downsampling, i.e., run in a semi-fast mode.
40
+ #
41
+ # Enjoy!
42
+ #
43
+ # @Author: LI YI @ Nanyang Technological University - Singapore
44
+ # @Date: 2023-03-02
45
+ # @License: CC BY-NC-SA 4.0
46
+ #
47
+ # Please give https://github.com/pkuliyi2015/multidiffusion-upscaler-for-automatic1111
48
+ # a star if you like the project!
49
+ #
50
+ # -------------------------------------------------------------------------
51
+ '''
52
+
53
+ import gc
54
+ import math
55
+ from time import time
56
+ from tqdm import tqdm
57
+
58
+ import torch
59
+ import torch.version
60
+ import torch.nn.functional as F
61
+ # import gradio as gr
62
+
63
+ # import modules.scripts as scripts
64
+ # from .modules import devices
65
+ # from modules.shared import state
66
+ # from modules.ui import gr_show
67
+ # from modules.processing import opt_f
68
+ # from modules.sd_vae_approx import cheap_approximation
69
+ # from ldm.modules.diffusionmodules.model import AttnBlock, MemoryEfficientAttnBlock
70
+
71
+ # from tile_utils.attn import get_attn_func
72
+ # from tile_utils.typing import Processing
73
+
74
+ import comfy
75
+ import comfy.model_management
76
+ from comfy.model_management import processing_interrupted
77
+ import contextlib
78
+
79
opt_C = 4             # latent channel count (SD 1.x/2.x/XL latents)
opt_f = 8             # latent -> pixel scale factor
is_sdxl = False       # toggles the SDXL latent->RGB coefficients in cheap_approximation
disable_nan_check = True  # when True, test_for_nans is a no-op

# Minimal shim replicating the A1111 `modules.devices` interface on top of ComfyUI.
class Device: ...

devices = Device()
devices.device = comfy.model_management.get_torch_device()
devices.cpu = torch.device('cpu')
devices.torch_gc = lambda: comfy.model_management.soft_empty_cache()
devices.get_optimal_device = lambda: comfy.model_management.get_torch_device()
+
91
class NansException(Exception): ...  # raised when a tensor is entirely NaN

def test_for_nans(x, where):
    """Raise NansException with a context-specific hint if `x` is ALL NaN.

    No-op when `disable_nan_check` is True (the default in this port).

    @param x: tensor to check
    @param where: "unet", "vae", or anything else (selects the error message)
    @raises NansException: when every element of x is NaN
    """
    if disable_nan_check:
        return
    # note: fires only when the WHOLE tensor is NaN, matching the A1111 check
    if not torch.all(torch.isnan(x)).item():
        return
    if where == "unet":
        message = "A tensor with all NaNs was produced in Unet."
        if comfy.model_management.unet_dtype(x.device) != torch.float32:
            message += " This could be either because there's not enough precision to represent the picture, or because your video card does not support half type. Try setting the \"Upcast cross attention layer to float32\" option in Settings > Stable Diffusion or using the --no-half commandline argument to fix this."
    elif where == "vae":
        message = "A tensor with all NaNs was produced in VAE."
        if comfy.model_management.unet_dtype(x.device) != torch.float32 and comfy.model_management.vae_dtype() != torch.float32:
            message += " This could be because there's not enough precision to represent the picture. Try adding --no-half-vae commandline argument to fix this."
    else:
        message = "A tensor with all NaNs was produced."
    message += " Use --disable-nan-check commandline argument to disable this check."
    raise NansException(message)
+
110
def _autocast(disable=False):
    """Return an autocast context for the current device, or a null context.

    Null context when: explicitly disabled, the UNet runs in fp32, or the
    device is MPS (autocast unsupported there).
    """
    if disable:
        return contextlib.nullcontext()

    if comfy.model_management.unet_dtype() == torch.float32 or comfy.model_management.get_torch_device() == torch.device("mps"):
        return contextlib.nullcontext()

    # only cuda reaches this point
    autocast_device = comfy.model_management.get_autocast_device(comfy.model_management.get_torch_device())
    return torch.autocast(autocast_device)
+
121
def without_autocast(disable=False):
    """Return a context that suspends CUDA autocast if it is currently active.

    @param disable: when True, never suspend autocast (always a null context)
    """
    if torch.is_autocast_enabled() and not disable:
        return torch.autocast("cuda", enabled=False)
    return contextlib.nullcontext()
+
124
# Expose the shims through the A1111-style `devices` namespace used by the ported code.
devices.test_for_nans = test_for_nans
devices.autocast = _autocast
devices.without_autocast = without_autocast
+
128
def cheap_approximation(sample):
    """Approximate latent -> RGB with a fixed linear projection (no VAE decode).

    https://discuss.huggingface.co/t/decoding-latents-to-rgb-without-upscaling/23204/2

    @param sample: latent tensor with 4 channels in dimension -3 (shape ...,4,H,W)
    @return: tensor with 3 RGB channels in place of the 4 latent channels
    """
    # per-model projection matrices (rows = latent channels, cols = RGB)
    if is_sdxl:
        matrix = [
            [ 0.3448,  0.4168,  0.4395],
            [-0.1953, -0.0290,  0.0250],
            [ 0.1074,  0.0886, -0.0163],
            [-0.3730, -0.2499, -0.2088],
        ]
    else:
        matrix = [
            [ 0.298,  0.207,  0.208],
            [ 0.187,  0.286,  0.173],
            [-0.158,  0.189,  0.264],
            [-0.184, -0.271, -0.473],
        ]

    projection = torch.tensor(matrix).to(sample.device)
    return torch.einsum("...lxy,lr -> ...rxy", sample, projection)
+
152
def get_rcmd_enc_tsize():
    """Recommended encoder tile size (in pixels) based on total device VRAM."""
    if not (torch.cuda.is_available() and devices.device not in ['cpu', devices.cpu]):
        return 512
    total_memory = torch.cuda.get_device_properties(devices.device).total_memory // 2**20
    # thresholds are in MB (16 GB, 12 GB, 8 GB)
    if total_memory > 16*1000:
        return 3072
    if total_memory > 12*1000:
        return 2048
    if total_memory > 8*1000:
        return 1536
    return 960
+
162
+
163
def get_rcmd_dec_tsize():
    """Recommended decoder tile size (in latent units) based on total device VRAM."""
    if not (torch.cuda.is_available() and devices.device not in ['cpu', devices.cpu]):
        return 64
    total_memory = torch.cuda.get_device_properties(devices.device).total_memory // 2**20
    # thresholds are in MB (30 GB, 16 GB, 12 GB, 8 GB)
    if total_memory > 30*1000:
        return 256
    if total_memory > 16*1000:
        return 192
    if total_memory > 12*1000:
        return 128
    if total_memory > 8*1000:
        return 96
    return 64
+
174
+
175
def inplace_nonlinearity(t):
    """SiLU activation applied in place (saves one allocation; also a NaN-fix test)."""
    return F.silu(t, inplace=True)
+
179
def _attn_forward(self, x):
    """VAE attention block forward without the residual add and pre-norm.

    Mirrors comfy.ldm.modules.diffusionmodules.model.AttnBlock.forward; the
    residual/normalization steps are handled separately by the task queue.
    """
    q = self.q(x)
    k = self.k(x)
    v = self.v(x)
    attended = self.optimized_attention(q, k, v)
    return self.proj_out(attended)
+
190
def get_attn_func():
    """Return the attention forward used for VAE tiles (hook point for alternatives)."""
    return _attn_forward
+
193
def attn2task(task_queue, net):
    """Decompose an attention block into task-queue steps.

    @param task_queue: target list of (name, callable) / [name, payload] tasks
    @param net: the attention block module (provides .norm and q/k/v/proj_out)
    """
    attn_forward = get_attn_func()
    # store_res/add_res bracket the block so the residual is applied after attention
    task_queue.append(('store_res', lambda x: x))
    task_queue.append(('pre_norm', net.norm))
    # bind net as a default arg so each task captures its own block
    task_queue.append(('attn', lambda x, net=net: attn_forward(net, x)))
    task_queue.append(['add_res', None])
201
+
202
def resblock2task(queue, block):
    """Turn a ResNetBlock into a sequence of tasks and append to the task queue.

    @param queue: the target task queue
    @param block: ResNetBlock
    """
    # choose the residual path: identity when channels match, else the block's shortcut conv
    if block.in_channels != block.out_channels:
        shortcut = block.conv_shortcut if block.use_conv_shortcut else block.nin_shortcut
    else:
        shortcut = lambda x: x
    queue.append(('store_res', shortcut))
    # main path: norm -> silu -> conv, twice
    queue.extend([
        ('pre_norm', block.norm1),
        ('silu', inplace_nonlinearity),
        ('conv1', block.conv1),
        ('pre_norm', block.norm2),
        ('silu', inplace_nonlinearity),
        ('conv2', block.conv2),
    ])
    # mutable task: the runner fills in the stored residual here
    queue.append(['add_res', None])
+
225
+
226
def build_sampling(task_queue, net, is_decoder):
    """Build the up/down-sampling trunk of a task queue.

    @param task_queue: the target task queue
    @param net: the network
    @param is_decoder: currently building decoder or encoder
    """
    if is_decoder:
        # decoder: middle blocks first, then upsample from coarse to fine
        resblock2task(task_queue, net.mid.block_1)
        attn2task(task_queue, net.mid.attn_1)
        resblock2task(task_queue, net.mid.block_2)
        level_iter = reversed(range(net.num_resolutions))
        blocks_per_level = net.num_res_blocks + 1
        last_level = 0
        stages = net.up
        resample_name = 'upsample'
    else:
        level_iter = range(net.num_resolutions)
        blocks_per_level = net.num_res_blocks
        last_level = net.num_resolutions - 1
        stages = net.down
        resample_name = 'downsample'

    for i_level in level_iter:
        stage = stages[i_level]
        for i_block in range(blocks_per_level):
            resblock2task(task_queue, stage.block[i_block])
        # no resampling after the final level
        if i_level != last_level:
            task_queue.append((resample_name, getattr(stage, resample_name)))

    if not is_decoder:
        # encoder: middle blocks come after downsampling
        resblock2task(task_queue, net.mid.block_1)
        attn2task(task_queue, net.mid.attn_1)
        resblock2task(task_queue, net.mid.block_2)
+
260
+
261
def build_task_queue(net, is_decoder):
    """Build a single task queue for the encoder or decoder.

    @param net: the VAE decoder or encoder network
    @param is_decoder: currently building decoder or encoder
    @return: the task queue (list of (name, callable) / [name, payload] steps)
    """
    queue = [('conv_in', net.conv_in)]

    # encoder and decoder share the same trunk architecture, so the
    # sampling part is factored out into build_sampling
    build_sampling(queue, net, is_decoder)

    # output head (skipped when the decoder is configured to give the pre-end tensor)
    if not is_decoder or not net.give_pre_end:
        queue.append(('pre_norm', net.norm_out))
        queue.append(('silu', inplace_nonlinearity))
        queue.append(('conv_out', net.conv_out))
        if is_decoder and net.tanh_out:
            queue.append(('tanh', torch.tanh))

    return queue
+
284
+
285
def clone_task_queue(task_queue):
    """Clone a task queue: each task becomes a fresh (mutable) list.

    @param task_queue: the task queue to be cloned
    @return: the cloned task queue
    """
    return [list(task) for task in task_queue]
+
293
+
294
def get_var_mean(input, num_groups, eps=1e-6):
    """Compute per-group variance and mean exactly as GroupNorm would.

    @param input: (B, C, H, W) tensor
    @param num_groups: number of groups; C must be divisible by it
    @param eps: unused, kept for signature compatibility
    @return: (var, mean), each of shape (B * num_groups,)
    """
    b, c = input.size(0), input.size(1)
    channels_per_group = int(c / num_groups)
    # fold batch and group into one axis, then reduce over everything else
    grouped = input.contiguous().view(1, int(b * num_groups), channels_per_group, *input.size()[2:])
    var, mean = torch.var_mean(grouped, dim=[0, 2, 3, 4], unbiased=False)
    return var, mean
+
304
+
305
def custom_group_norm(input, num_groups, mean, var, weight=None, bias=None, eps=1e-6):
    """Apply group norm using externally supplied statistics.

    @param input: input tensor (B, C, H, W)
    @param num_groups: number of groups. by default, num_groups = 32
    @param mean: mean, must be pre-calculated by get_var_mean
    @param var: var, must be pre-calculated by get_var_mean
    @param weight: weight, should be fetched from the original group norm
    @param bias: bias, should be fetched from the original group norm
    @param eps: epsilon, by default, eps = 1e-6 to match the original group norm

    @return: normalized tensor
    """
    b, c = input.size(0), input.size(1)
    channels_per_group = int(c / num_groups)
    grouped = input.contiguous().view(
        1, int(b * num_groups), channels_per_group, *input.size()[2:])

    # batch_norm in eval mode normalizes each (batch, group) channel with the given stats
    normed = F.batch_norm(grouped, mean, var, weight=None, bias=None, training=False, momentum=0, eps=eps)
    normed = normed.view(b, c, *input.size()[2:])

    # post affine transform using the original GroupNorm's weight/bias
    if weight is not None:
        normed *= weight.view(1, -1, 1, 1)
    if bias is not None:
        normed += bias.view(1, -1, 1, 1)
    return normed
+
334
+
335
def crop_valid_region(x, input_bbox, target_bbox, is_decoder):
    """Crop the valid (non-padding) region from a processed tile.

    @param x: input tile (after encode/decode)
    @param input_bbox: original padded bounding box [x0, x1, y0, y1] in input space
    @param target_bbox: desired output bounding box in output space
    @param is_decoder: decoder upscales coordinates 8x, encoder downscales 8x
    @return: view of x cropped to the target region
    """
    # map the padded input bbox into the output coordinate space
    if is_decoder:
        padded_bbox = [v * 8 for v in input_bbox]
    else:
        padded_bbox = [v // 8 for v in input_bbox]
    dx0, dx1, dy0, dy1 = (target_bbox[i] - padded_bbox[i] for i in range(4))
    # negative end offsets trim padding from the right/bottom edges
    return x[:, :, dy0:x.size(2) + dy1, dx0:x.size(3) + dx1]
+
348
+
349
+ # ↓↓↓ https://github.com/Kahsolt/stable-diffusion-webui-vae-tile-infer ↓↓↓
350
+
351
def perfcount(fn):
    """Decorator: report wall time and peak VRAM of a Tiled VAE call.

    Resets CUDA peak-memory stats and garbage-collects before the call, then
    garbage-collects again and prints elapsed time (plus max VRAM on CUDA).

    Fix: use functools.wraps so the wrapped function keeps its __name__,
    __doc__ and signature metadata (the original wrapper clobbered them).

    @param fn: the function to instrument
    @return: the instrumented wrapper
    """
    from functools import wraps

    @wraps(fn)
    def wrapper(*args, **kwargs):
        ts = time()

        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats(devices.device)
        devices.torch_gc()
        gc.collect()

        ret = fn(*args, **kwargs)

        devices.torch_gc()
        gc.collect()
        if torch.cuda.is_available():
            vram = torch.cuda.max_memory_allocated(devices.device) / 2**20
            print(f'[Tiled VAE]: Done in {time() - ts:.3f}s, max VRAM alloc {vram:.3f} MB')
        else:
            print(f'[Tiled VAE]: Done in {time() - ts:.3f}s')

        return ret
    return wrapper
+
373
+ # ↑↑↑ https://github.com/Kahsolt/stable-diffusion-webui-vae-tile-infer ↑↑↑
374
+
375
+
376
class GroupNormParam:
    """Accumulates per-tile group-norm statistics so a single, globally
    consistent group normalization can be applied across all tiles."""

    def __init__(self):
        # Per-tile statistics collected by add_tile(); weight/bias come from
        # the last group-norm layer seen.
        self.var_list = []
        self.mean_list = []
        self.pixel_list = []
        self.weight = None
        self.bias = None

    def add_tile(self, tile, layer):
        """Record the 32-group var/mean of one tile plus the layer's affine params."""
        var, mean = get_var_mean(tile, 32)
        # For giant images, the variance can be larger than max float16
        # In this case we create a copy to float32
        if var.dtype == torch.float16 and var.isinf().any():
            fp32_tile = tile.float()
            var, mean = get_var_mean(fp32_tile, 32)
        # ============= DEBUG: test for infinite =============
        # if torch.isinf(var).any():
        #     print('[Tiled VAE]: inf test', var)
        # ====================================================
        self.var_list.append(var)
        self.mean_list.append(mean)
        # pixel count is used later as the weighting for this tile's stats
        self.pixel_list.append(
            tile.shape[2]*tile.shape[3])
        if hasattr(layer, 'weight'):
            self.weight = layer.weight
            self.bias = layer.bias
        else:
            self.weight = None
            self.bias = None

    def summary(self):
        """
        summarize the mean and var and return a function
        that apply group norm on each tile

        Tiles' statistics are combined as a pixel-count-weighted average
        (normalized by the largest tile first to avoid fp overflow).
        Returns None when no tile was added.
        """
        if len(self.var_list) == 0: return None

        var = torch.vstack(self.var_list)
        mean = torch.vstack(self.mean_list)
        max_value = max(self.pixel_list)
        pixels = torch.tensor(self.pixel_list, dtype=torch.float32, device=devices.device) / max_value
        sum_pixels = torch.sum(pixels)
        pixels = pixels.unsqueeze(1) / sum_pixels
        # var = torch.sum(var * pixels.to(var.device), dim=0)
        # mean = torch.sum(mean * pixels.to(var.device), dim=0)
        var = torch.sum(var * pixels, dim=0)
        mean = torch.sum(mean * pixels, dim=0)
        return lambda x: custom_group_norm(x, 32, mean, var, self.weight, self.bias)

    @staticmethod
    def from_tile(tile, norm):
        """
        create a function from a single tile without summary

        Used by fast-mode estimation: the stats of one (downsampled) tile
        stand in for the whole image.
        """
        var, mean = get_var_mean(tile, 32)
        if var.dtype == torch.float16 and var.isinf().any():
            fp32_tile = tile.float()
            var, mean = get_var_mean(fp32_tile, 32)
            # if it is a macbook, we need to convert back to float16
            if var.device.type == 'mps':
                # clamp to avoid overflow
                var = torch.clamp(var, 0, 60000)
                var = var.half()
                mean = mean.half()
        if hasattr(norm, 'weight'):
            weight = norm.weight
            bias = norm.bias
        else:
            weight = None
            bias = None

        # Bind stats as defaults so the closure is self-contained.
        def group_norm_func(x, mean=mean, var=var, weight=weight, bias=bias):
            return custom_group_norm(x, 32, mean, var, weight, bias, 1e-6)
        return group_norm_func
451
+
452
+
453
class VAEHook:
    """Replacement for a VAE encoder/decoder ``forward`` that runs inference
    tile-by-tile with padding, sharing group-norm statistics across tiles.
    Install as ``encoder.forward = VAEHook(encoder, ...)``."""

    def __init__(self, net, tile_size, is_decoder:bool, fast_decoder:bool, fast_encoder:bool, color_fix:bool, to_gpu:bool=False):
        self.net = net # encoder | decoder
        self.tile_size = tile_size
        self.is_decoder = is_decoder
        # fast mode estimates group-norm params on a downsampled copy instead
        # of synchronizing statistics between tiles every pass
        self.fast_mode = (fast_encoder and not is_decoder) or (fast_decoder and is_decoder)
        self.color_fix = color_fix and not is_decoder
        self.to_gpu = to_gpu
        self.pad = 11 if is_decoder else 32 # FIXME: magic number

    def __call__(self, x):
        # original_device = next(self.net.parameters()).device
        try:
            # if self.to_gpu:
            #     self.net = self.net.to(devices.get_optimal_device())
            B, C, H, W = x.shape
            # NOTE(review): tiny-input shortcut is disabled (`if False`);
            # tiling is always used regardless of input size.
            if False:#max(H, W) <= self.pad * 2 + self.tile_size:
                print("[Tiled VAE]: the input size is tiny and unnecessary to tile.", x.shape, self.pad * 2 + self.tile_size)
                return self.net.original_forward(x)
            else:
                return self.vae_tile_forward(x)
        finally:
            pass
            # self.net = self.net.to(original_device)

    def get_best_tile_size(self, lowerbound, upperbound):
        """
        Get the best tile size for GPU memory

        Rounds *lowerbound* up to the nearest multiple of the largest
        power-of-two divider (32 down to 2) that still fits *upperbound*;
        falls back to *lowerbound* unchanged.
        """
        divider = 32
        while divider >= 2:
            remainer = lowerbound % divider
            if remainer == 0:
                return lowerbound
            candidate = lowerbound - remainer + divider
            if candidate <= upperbound:
                return candidate
            divider //= 2
        return lowerbound

    def split_tiles(self, h, w):
        """
        Tool function to split the image into tiles
        @param h: height of the image
        @param w: width of the image
        @return: tile_input_bboxes, tile_output_bboxes

        Bboxes are [x1, x2, y1, y2]. Input bboxes are in the incoming
        coordinate space and include the pad; output bboxes are scaled
        (x8 for decoder, /8 for encoder) to the result's coordinate space.
        """
        tile_input_bboxes, tile_output_bboxes = [], []
        tile_size = self.tile_size
        pad = self.pad
        num_height_tiles = math.ceil((h - 2 * pad) / tile_size)
        num_width_tiles = math.ceil((w - 2 * pad) / tile_size)
        # If any of the numbers are 0, we let it be 1
        # This is to deal with long and thin images
        num_height_tiles = max(num_height_tiles, 1)
        num_width_tiles = max(num_width_tiles, 1)

        # Suggestions from https://github.com/Kahsolt: auto shrink the tile size
        real_tile_height = math.ceil((h - 2 * pad) / num_height_tiles)
        real_tile_width = math.ceil((w - 2 * pad) / num_width_tiles)
        real_tile_height = self.get_best_tile_size(real_tile_height, tile_size)
        real_tile_width = self.get_best_tile_size(real_tile_width, tile_size)

        print(f'[Tiled VAE]: split to {num_height_tiles}x{num_width_tiles} = {num_height_tiles*num_width_tiles} tiles. ' +
              f'Optimal tile size {real_tile_width}x{real_tile_height}, original tile size {tile_size}x{tile_size}')

        for i in range(num_height_tiles):
            for j in range(num_width_tiles):
                # bbox: [x1, x2, y1, y2]
                # the padding is is unnessary for image borders. So we directly start from (32, 32)
                input_bbox = [
                    pad + j * real_tile_width,
                    min(pad + (j + 1) * real_tile_width, w),
                    pad + i * real_tile_height,
                    min(pad + (i + 1) * real_tile_height, h),
                ]

                # if the output bbox is close to the image boundary, we extend it to the image boundary
                output_bbox = [
                    input_bbox[0] if input_bbox[0] > pad else 0,
                    input_bbox[1] if input_bbox[1] < w - pad else w,
                    input_bbox[2] if input_bbox[2] > pad else 0,
                    input_bbox[3] if input_bbox[3] < h - pad else h,
                ]

                # scale to get the final output bbox
                output_bbox = [x * 8 if self.is_decoder else x // 8 for x in output_bbox]
                tile_output_bboxes.append(output_bbox)

                # indistinguishable expand the input bbox by pad pixels
                tile_input_bboxes.append([
                    max(0, input_bbox[0] - pad),
                    min(w, input_bbox[1] + pad),
                    max(0, input_bbox[2] - pad),
                    min(h, input_bbox[3] + pad),
                ])

        return tile_input_bboxes, tile_output_bboxes

    @torch.no_grad()
    def estimate_group_norm(self, z, task_queue, color_fix):
        """Fast-mode helper: run the task queue once on the (downsampled)
        input *z*, converting every 'pre_norm' task into a concrete
        'apply_norm' closure in place. Returns True on success, False when
        NaNs appear (caller then disables fast mode). Mutates *task_queue*."""
        device = z.device
        tile = z
        last_id = len(task_queue) - 1
        while last_id >= 0 and task_queue[last_id][0] != 'pre_norm':
            last_id -= 1
        if last_id <= 0 or task_queue[last_id][0] != 'pre_norm':
            raise ValueError('No group norm found in the task queue')
        # estimate until the last group norm
        for i in range(last_id + 1):
            task = task_queue[i]
            if task[0] == 'pre_norm':
                group_norm_func = GroupNormParam.from_tile(tile, task[1])
                task_queue[i] = ('apply_norm', group_norm_func)
                if i == last_id:
                    return True
                tile = group_norm_func(tile)
            elif task[0] == 'store_res':
                task_id = i + 1
                while task_id < last_id and task_queue[task_id][0] != 'add_res':
                    task_id += 1
                if task_id >= last_id:
                    continue
                task_queue[task_id][1] = task[1](tile)
            elif task[0] == 'add_res':
                tile += task[1].to(device)
                task[1] = None
            elif color_fix and task[0] == 'downsample':
                # color-fix: keep residuals on CPU from the first downsample on
                for j in range(i, last_id + 1):
                    if task_queue[j][0] == 'store_res':
                        task_queue[j] = ('store_res_cpu', task_queue[j][1])
                return True
            else:
                tile = task[1](tile)
            # NOTE(review): bare except intentionally catches the NaN error
            # raised by devices.test_for_nans and degrades to non-fast mode.
            try:
                devices.test_for_nans(tile, "vae")
            except:
                print(f'Nan detected in fast mode estimation. Fast mode disabled.')
                return False

        raise IndexError('Should not reach here')

    @perfcount
    @torch.no_grad()
    def vae_tile_forward(self, z):
        """
        Decode a latent vector z into an image in a tiled manner.
        @param z: latent vector
        @return: image

        Tiles are processed incrementally, pausing at each 'pre_norm' task so
        that group-norm statistics can be aggregated across all tiles before
        normalization is applied; execution alternates direction between
        passes to keep one tile resident on the GPU.
        """
        device = next(self.net.parameters()).device
        net = self.net
        tile_size = self.tile_size
        is_decoder = self.is_decoder

        z = z.detach() # detach the input to avoid backprop

        N, height, width = z.shape[0], z.shape[2], z.shape[3]
        net.last_z_shape = z.shape

        # Split the input into tiles and build a task queue for each tile
        print(f'[Tiled VAE]: input_size: {z.shape}, tile_size: {tile_size}, padding: {self.pad}')

        in_bboxes, out_bboxes = self.split_tiles(height, width)

        # Prepare tiles by split the input latents
        tiles = []
        for input_bbox in in_bboxes:
            tile = z[:, :, input_bbox[2]:input_bbox[3], input_bbox[0]:input_bbox[1]].cpu()
            tiles.append(tile)

        num_tiles = len(tiles)
        num_completed = 0

        # Build task queues
        single_task_queue = build_task_queue(net, is_decoder)
        if self.fast_mode:
            # Fast mode: downsample the input image to the tile size,
            # then estimate the group norm parameters on the downsampled image
            scale_factor = tile_size / max(height, width)
            z = z.to(device)
            downsampled_z = F.interpolate(z, scale_factor=scale_factor, mode='nearest-exact')
            # use nearest-exact to keep statictics as close as possible
            print(f'[Tiled VAE]: Fast mode enabled, estimating group norm parameters on {downsampled_z.shape[3]} x {downsampled_z.shape[2]} image')

            # ======= Special thanks to @Kahsolt for distribution shift issue ======= #
            # The downsampling will heavily distort its mean and std, so we need to recover it.
            std_old, mean_old = torch.std_mean(z, dim=[0, 2, 3], keepdim=True)
            std_new, mean_new = torch.std_mean(downsampled_z, dim=[0, 2, 3], keepdim=True)
            downsampled_z = (downsampled_z - mean_new) / std_new * std_old + mean_old
            del std_old, mean_old, std_new, mean_new
            # occasionally the std_new is too small or too large, which exceeds the range of float16
            # so we need to clamp it to max z's range.
            downsampled_z = torch.clamp_(downsampled_z, min=z.min(), max=z.max())
            estimate_task_queue = clone_task_queue(single_task_queue)
            if self.estimate_group_norm(downsampled_z, estimate_task_queue, color_fix=self.color_fix):
                single_task_queue = estimate_task_queue
            del downsampled_z

        task_queues = [clone_task_queue(single_task_queue) for _ in range(num_tiles)]

        # Dummy result
        result = None
        # cheap preview used as fallback if the real result never completes
        result_approx = None
        try:
            with devices.autocast():
                result_approx = torch.cat([F.interpolate(cheap_approximation(x).unsqueeze(0), scale_factor=opt_f, mode='nearest-exact') for x in z], dim=0).cpu()
        except: pass
        # Free memory of input latent tensor
        del z

        # Task queue execution
        pbar = tqdm(total=num_tiles * len(task_queues[0]), desc=f"[Tiled VAE]: Executing {'Decoder' if is_decoder else 'Encoder'} Task Queue: ")
        pbar_comfy = comfy.utils.ProgressBar(num_tiles * len(task_queues[0]))

        # execute the task back and forth when switch tiles so that we always
        # keep one tile on the GPU to reduce unnecessary data transfer
        forward = True
        interrupted = False
        # NOTE(review): interruption state is sampled once, before the loop —
        # a mid-run interrupt is presumably picked up on the next invocation.
        state_interrupted = processing_interrupted()
        #state.interrupted = interrupted
        while True:
            if state_interrupted: interrupted = True ; break

            group_norm_param = GroupNormParam()
            for i in range(num_tiles) if forward else reversed(range(num_tiles)):
                if state_interrupted: interrupted = True ; break

                tile = tiles[i].to(device)
                input_bbox = in_bboxes[i]
                task_queue = task_queues[i]

                interrupted = False
                while len(task_queue) > 0:
                    if state_interrupted: interrupted = True ; break

                    # DEBUG: current task
                    # print('Running task: ', task_queue[0][0], ' on tile ', i, '/', num_tiles, ' with shape ', tile.shape)
                    task = task_queue.pop(0)
                    if task[0] == 'pre_norm':
                        # pause this tile: stats are aggregated across tiles,
                        # then an 'apply_norm' task is prepended for the next pass
                        group_norm_param.add_tile(tile, task[1])
                        break
                    elif task[0] == 'store_res' or task[0] == 'store_res_cpu':
                        task_id = 0
                        res = task[1](tile)
                        if not self.fast_mode or task[0] == 'store_res_cpu':
                            res = res.cpu()
                        while task_queue[task_id][0] != 'add_res':
                            task_id += 1
                        task_queue[task_id][1] = res
                    elif task[0] == 'add_res':
                        tile += task[1].to(device)
                        task[1] = None
                    else:
                        tile = task[1](tile)
                    pbar.update(1)
                    pbar_comfy.update(1)


                if interrupted: break

                # check for NaNs in the tile.
                # If there are NaNs, we abort the process to save user's time
                devices.test_for_nans(tile, "vae")

                if len(task_queue) == 0:
                    tiles[i] = None
                    num_completed += 1
                    if result is None: # NOTE: dim C varies from different cases, can only be inited dynamically
                        result = torch.zeros((N, tile.shape[1], height * 8 if is_decoder else height // 8, width * 8 if is_decoder else width // 8), device=device, requires_grad=False)
                    result[:, :, out_bboxes[i][2]:out_bboxes[i][3], out_bboxes[i][0]:out_bboxes[i][1]] = crop_valid_region(tile, in_bboxes[i], out_bboxes[i], is_decoder)
                    del tile
                elif i == num_tiles - 1 and forward:
                    # last tile of a forward pass stays on GPU for the reverse pass
                    forward = False
                    tiles[i] = tile
                elif i == 0 and not forward:
                    forward = True
                    tiles[i] = tile
                else:
                    tiles[i] = tile.cpu()
                    del tile

            if interrupted: break
            if num_completed == num_tiles: break

            # insert the group norm task to the head of each task queue
            group_norm_func = group_norm_param.summary()
            if group_norm_func is not None:
                for i in range(num_tiles):
                    task_queue = task_queues[i]
                    task_queue.insert(0, ('apply_norm', group_norm_func))

        # Done!
        pbar.close()
        if interrupted:
            del result, result_approx
            comfy.model_management.throw_exception_if_processing_interrupted()
        vae_dtype = comfy.model_management.vae_dtype()
        return result.to(dtype=vae_dtype, device=device) if result is not None else result_approx.to(device=device, dtype=vae_dtype)
753
+
754
+ # from .tiled_vae import VAEHook, get_rcmd_enc_tsize, get_rcmd_dec_tsize
755
+ from nodes import VAEEncode, VAEDecode
756
class TiledVAE:
    """Shared implementation for the tiled VAE encode/decode nodes.

    Subclasses set ``self.is_decoder`` in ``__init__``. ``process``
    temporarily replaces the VAE encoder/decoder ``forward`` with a
    ``VAEHook`` that runs tiled inference, delegates to the stock
    VAEEncode/VAEDecode nodes, and always restores the original
    ``forward`` afterwards.
    """

    def process(self, *args, **kwargs):
        """Run tiled VAE encode or decode.

        Inputs may arrive positionally or by keyword depending on how the
        node is invoked, so both forms are supported.
        @return: one-tuple with the LATENT (encoder) or IMAGE (decoder)
        """
        samples = kwargs['samples'] if 'samples' in kwargs else (kwargs['pixels'] if 'pixels' in kwargs else args[0])
        _vae = kwargs['vae'] if 'vae' in kwargs else args[1]
        tile_size = kwargs['tile_size'] if 'tile_size' in kwargs else args[2]
        fast = kwargs['fast'] if 'fast' in kwargs else args[3]
        color_fix = kwargs['color_fix'] if 'color_fix' in kwargs else False
        is_decoder = self.is_decoder

        # for shorthand
        vae = _vae.first_stage_model
        encoder = vae.encoder
        decoder = vae.decoder

        # Undo a stale hijack first (e.g. if a previous run crashed before cleanup).
        if isinstance(encoder.forward, VAEHook):
            encoder.forward.net = None
            encoder.forward = encoder.original_forward
        if isinstance(decoder.forward, VAEHook):
            decoder.forward.net = None
            decoder.forward = decoder.original_forward

        # save original forward (only once)
        if not hasattr(encoder, 'original_forward'): setattr(encoder, 'original_forward', encoder.forward)
        if not hasattr(decoder, 'original_forward'): setattr(decoder, 'original_forward', decoder.forward)

        # Install the tiled forward. The decoder works in latent space, so
        # its tile size is divided by 8 to match.
        fn = VAEHook(net=decoder if is_decoder else encoder, tile_size=tile_size // 8 if is_decoder else tile_size,
                     is_decoder=is_decoder, fast_decoder=fast, fast_encoder=fast,
                     color_fix=color_fix, to_gpu=comfy.model_management.vae_device().type != 'cpu')
        if is_decoder:
            decoder.forward = fn
        else:
            encoder.forward = fn

        ret = (None,)
        try:
            with devices.without_autocast():
                # (fixed) the decode branch previously repeated the
                # `if is_decoder` test in a redundant nested conditional
                if is_decoder:
                    ret = VAEDecode().decode(_vae, samples)
                else:
                    ret = VAEEncode().encode(_vae, samples)
        finally:
            # Always unhook, even on error/interrupt, so the VAE is left clean.
            if isinstance(encoder.forward, VAEHook):
                encoder.forward.net = None
                encoder.forward = encoder.original_forward
            if isinstance(decoder.forward, VAEHook):
                decoder.forward.net = None
                decoder.forward = decoder.original_forward
        return ret
824
+
825
class VAEEncodeTiled_TiledDiffusion(TiledVAE):
    """ComfyUI node: tiled VAE encode (IMAGE -> LATENT)."""

    @classmethod
    def INPUT_TYPES(s):
        fast = True
        tile_size = get_rcmd_enc_tsize()
        # NOTE(review): color_fix default reuses `fast` (True) — looks like a
        # copy-paste; presumably a dedicated default was intended. Confirm.
        return {"required": {"pixels": ("IMAGE", ),
                             "vae": ("VAE", ),
                             "tile_size": ("INT", {"default": tile_size, "min": 256, "max": 4096, "step": 16}),
                             "fast": ("BOOLEAN", {"default": fast}),
                             "color_fix": ("BOOLEAN", {"default": fast}),
                             }}
    RETURN_TYPES = ("LATENT",)
    FUNCTION = "process"
    CATEGORY = "_for_testing"

    def __init__(self):
        # TiledVAE.process() dispatches on this flag
        self.is_decoder = False
        super().__init__()
843
+
844
class VAEDecodeTiled_TiledDiffusion(TiledVAE):
    """ComfyUI node: tiled VAE decode (LATENT -> IMAGE)."""

    @classmethod
    def INPUT_TYPES(s):
        # tile size is expressed in image pixels; opt_f converts from latent units
        tile_size = get_rcmd_dec_tsize() * opt_f
        return {"required": {"samples": ("LATENT", ),
                             "vae": ("VAE", ),
                             "tile_size": ("INT", {"default": tile_size, "min": 48*opt_f, "max": 4096, "step": 16}),
                             "fast": ("BOOLEAN", {"default": True}),
                             }}
    RETURN_TYPES = ("IMAGE",)
    FUNCTION = "process"
    CATEGORY = "_for_testing"

    def __init__(self):
        # TiledVAE.process() dispatches on this flag
        self.is_decoder = True
        super().__init__()
860
+
861
# ComfyUI node registration: internal name -> node class
NODE_CLASS_MAPPINGS = {
    "VAEEncodeTiled_TiledDiffusion": VAEEncodeTiled_TiledDiffusion,
    "VAEDecodeTiled_TiledDiffusion": VAEDecodeTiled_TiledDiffusion,
}
# internal name -> label shown in the ComfyUI node picker
NODE_DISPLAY_NAME_MAPPINGS = {
    "VAEEncodeTiled_TiledDiffusion": "Tiled VAE Encode",
    "VAEDecodeTiled_TiledDiffusion": "Tiled VAE Decode",
}
ComfyUI-TiledDiffusion/utils.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import inspect
2
+ import importlib
3
+ from textwrap import dedent, indent
4
+ from copy import copy
5
+ import types
6
+ import functools
7
+ import os
8
+ import sys
9
+ import binascii
10
+ from typing import List, NamedTuple
11
+
12
class Hook(NamedTuple):
    """Description of one monkey-patch: a replacement callable plus where to install it."""
    fn: object              # replacement function to install
    module_name: str        # dotted module path, e.g. 'comfy.samplers'
    target: str             # attribute (possibly dotted) to overwrite in that module
    orig_key: str           # attribute name under which the original is stashed
    module_name_path: str   # module_name as an OS path fragment, used for suffix matching
18
+
19
def gen_id():
    """Return a random 8-character lowercase hex identifier.

    Equivalent to the former ``hexlify(os.urandom(1024))[64:72]`` slice —
    8 hex digits encode 4 random bytes — without wasting 1 KiB of entropy.
    """
    return os.urandom(4).hex()
21
+
22
def hook_calc_cond_uncond_batch():
    """Build a Hook that patches comfy's cond batching so that, when
    'tiled_diffusion' is active in model_options, control outputs are passed
    through as-is; otherwise the untouched original is called."""
    # newer comfy exposes calc_cond_batch; fall back to the legacy name
    try:
        from comfy.samplers import calc_cond_batch
        calc_cond_batch_ = calc_cond_batch
    except Exception:
        from comfy.samplers import calc_cond_uncond_batch
        calc_cond_batch_ = calc_cond_uncond_batch
    # this function should only be run by us
    orig_key = f"{calc_cond_batch_.__name__}_original_tiled_diffusion_{gen_id()}"
    payload = [{
        "mode": "replace",
        "target_line": 'control.get_control',
        "code_to_insert": """control if 'tiled_diffusion' in model_options else control.get_control"""
    },
    {
        "dedent": False,
        "target_line": calc_cond_batch_.__name__,
        # early-exit guard inserted right after the def line: delegate to the
        # stashed original whenever tiled diffusion is not in play
        "code_to_insert": f"""
    if 'tiled_diffusion' not in model_options:
        return {orig_key}{inspect.signature(calc_cond_batch_)}"""
    }]
    # 'w' truncates the patch file: this is the first patch written each load
    fn = inject_code(calc_cond_batch_, payload, 'w')
    return create_hook(fn, 'comfy.samplers', orig_key=orig_key)
45
+
46
def hook_sag_create_blur_map():
    """Build a Hook for comfy_extras' SAG blur-map helper so its mid_shape is
    derived from the closest factor pair of hw1 (robust to non-square latents
    produced by tiling). Returns None when comfy_extras is unavailable."""
    imported = False
    try:
        import comfy_extras
        from comfy_extras import nodes_sag
        imported = True
    except Exception: ...
    if not imported: return
    import comfy_extras
    from comfy_extras import nodes_sag
    import re
    source=inspect.getsource(nodes_sag.create_blur_map)
    # replacement body: factor hw1 into the pair (b, c) closest to a square,
    # orienting the larger factor along the longer latent side
    replace_str="""
    def calc_closest_factors(a):
        for b in range(int(math.sqrt(a)), 0, -1):
            if a%b == 0:
                c = a // b
                return (b,c)
    m = calc_closest_factors(hw1)
    mh = max(m) if lh > lw else min(m)
    mw = m[1] if mh == m[0] else m[0]
    mid_shape = mh, mw"""
    # NOTE(review): the inserted code uses `math`, which must already be
    # imported in nodes_sag's module namespace — confirm.
    modified_source = re.sub(r"ratio =.*\s+mid_shape =.*", replace_str, source, flags=re.MULTILINE)
    fn = write_to_file_and_return_fn(nodes_sag.create_blur_map, modified_source)
    return create_hook(fn, 'comfy_extras.nodes_sag')
71
+
72
def hook_samplers_pre_run_control():
    """Build a Hook for comfy.samplers.pre_run_control that (a) cleans up any
    lingering control object and (b) resets a tiled-diffusion
    model_function_wrapper found on the enclosing ModelPatcher before a run."""
    from comfy.samplers import pre_run_control
    payload = [{
        "dedent": False,
        "target_line": "if 'control' in x:",
        # best-effort cleanup; exceptions deliberately swallowed
        "code_to_insert": """ try: x['control'].cleanup()\n except Exception: ..."""
    },
    {
        "target_line": "s = model.model_sampling",
        # walk up the call stack (max 7 frames) looking for the ModelPatcher,
        # then reset its AbstractDiffusion wrapper if one is installed
        "code_to_insert": """
def find_outer_instance(target:str, target_type):
    import inspect
    frame = inspect.currentframe()
    i = 0
    while frame and i < 7:
        if (found:=frame.f_locals.get(target, None)) is not None:
            if isinstance(found, target_type):
                return found
        frame = frame.f_back
        i += 1
    return None
from comfy.model_patcher import ModelPatcher
if (_model:=find_outer_instance('model', ModelPatcher)) is not None:
    if (model_function_wrapper:=_model.model_options.get('model_function_wrapper', None)) is not None:
        import sys
        tiled_diffusion = sys.modules.get('ComfyUI-TiledDiffusion.tiled_diffusion', None)
        if tiled_diffusion is None:
            for key in sys.modules:
                if 'tiled_diffusion' in key:
                    tiled_diffusion = sys.modules[key]
                    break
        if (AbstractDiffusion:=getattr(tiled_diffusion, 'AbstractDiffusion', None)) is not None:
            if isinstance(model_function_wrapper, AbstractDiffusion):
                model_function_wrapper.reset()
"""}]
    fn = inject_code(pre_run_control, payload)
    return create_hook(fn, 'comfy.samplers')
109
+
110
def hook_gligen__set_position():
    """Build a Hook for comfy.gligen.Gligen._set_position that tiles `objs`
    along the batch dimension when the incoming batch is larger (as happens
    with tiled batching), instead of failing on the size mismatch."""
    from comfy.gligen import Gligen
    source=inspect.getsource(Gligen._set_position)
    # -(a // -b) is ceiling division: repeat objs enough times to cover x's batch
    replace_str="""
            nonlocal objs
            if x.shape[0] > objs.shape[0]:
                _objs = objs.repeat(-(x.shape[0] // -objs.shape[0]),1,1)
            else:
                _objs = objs
            return module(x, _objs)"""
    modified_source = dedent(source.replace(" return module(x, objs)", replace_str, 1))
    fn = write_to_file_and_return_fn(Gligen._set_position, modified_source)
    return create_hook(fn, 'comfy.gligen', 'Gligen._set_position')
123
+
124
def create_hook(fn, module_name:str, target = None, orig_key = None):
    """Bundle a replacement callable and its install location into a Hook.

    Defaults: *target* falls back to the function's own name, and *orig_key*
    to '<target>_original'. module_name_path is the dotted module path turned
    into a normalized OS path fragment for suffix matching in hook_all().
    """
    target = fn.__name__ if target is None else target
    orig_key = f'{target}_original' if orig_key is None else orig_key
    as_path = module_name.replace('.', '/')
    return Hook(fn, module_name, target, orig_key, os.path.normpath(as_path))
129
+
130
+ def _getattr(obj, name:str, default=None):
131
+ """multi-level getattr"""
132
+ for attr in name.split('.'):
133
+ obj = getattr(obj, attr, default)
134
+ return obj
135
+
136
def _hasattr(obj, name:str):
    """Multi-level hasattr: True iff the dotted path resolves to a non-None value."""
    resolved = _getattr(obj, name)
    return resolved is not None
139
+
140
+ def _setattr(obj, name:str, value=None):
141
+ """multi-level setattr"""
142
+ split = name.split('.')
143
+ if not split[:-1]:
144
+ return setattr(obj, name, value)
145
+ else:
146
+ name = split[-1]
147
+ for attr in split[:-1]:
148
+ obj = getattr(obj, attr, None)
149
+ return setattr(obj, name, value)
150
+
151
def hook_all(restore=False, hooks=None):
    """Install (or, with restore=True, uninstall) every monkey-patch.

    For each loaded module whose name matches a hook's module path, the
    original attribute is stashed once under hook.orig_key, then either the
    replacement (install) or the stashed original (restore) is assigned.

    @param restore: when True, put the stashed originals back
    @param hooks: explicit list of Hook objects; defaults to all known hooks
    """
    if hooks is None:
        hooks: List[Hook] = [
            hook_calc_cond_uncond_batch(),
            hook_sag_create_blur_map(),
            hook_samplers_pre_run_control(),
            hook_gligen__set_position(),
        ]
    # hook_sag_create_blur_map() returns None when comfy_extras is missing;
    # drop such entries so attribute access below cannot fail on None.
    hooks = [hook for hook in hooks if hook is not None]
    for key, module in sys.modules.items():
        for hook in hooks:
            # match the exact dotted name, or a path-suffix for modules
            # registered under a custom-node prefix
            if key == hook.module_name or key.endswith(hook.module_name_path):
                if _hasattr(module, hook.target):
                    # stash the original exactly once
                    if not _hasattr(module, hook.orig_key):
                        if (orig_fn:=_getattr(module, hook.target, None)) is not None:
                            _setattr(module, hook.orig_key, orig_fn)
                    if restore:
                        _setattr(module, hook.target, _getattr(module, hook.orig_key, None))
                    else:
                        _setattr(module, hook.target, hook.fn)
170
+
171
def inject_code(original_func, data, mode='a'):
    """Return a copy of *original_func* whose source has been edited.

    @param original_func: function whose source is fetched via inspect
    @param data: list of edit dicts with keys:
        'target_line'    — substring locating the anchor line
        'code_to_insert' — replacement text ('replace' mode) or code inserted
                           after the anchor ('insert' mode, the default)
        'mode'           — 'insert' (default) or 'replace'
        'dedent'         — dedent code_to_insert before re-indenting (default True)
    @param mode: file open mode for the patch file ('a' append / 'w' truncate)
    @return: the recompiled function (see write_to_file_and_return_fn)
    """
    # Get the source code of the original function
    original_source = inspect.getsource(original_func)

    # Split the source code into lines
    lines = original_source.split("\n")

    for item in data:
        # Find the line number of the target line
        # Only the FIRST matching line is edited; later matches are ignored.
        target_line_number = None
        for i, line in enumerate(lines):
            if item['target_line'] not in line: continue
            target_line_number = i + 1
            if item.get("mode","insert") == "replace":
                lines[i] = lines[i].replace(item['target_line'], item['code_to_insert'])
                break

            # Find the indentation of the line where the new code will be inserted
            indentation = ''
            for char in line:
                if char == ' ':
                    indentation += char
                else:
                    break

            # Indent the new code to match the original
            code_to_insert = item['code_to_insert']
            if item.get("dedent",True):
                code_to_insert = dedent(item['code_to_insert'])
            code_to_insert = indent(code_to_insert, indentation)

            break

        # Insert the code to be injected after the target line
        # (skipped when the anchor was never found, so code_to_insert is
        # only read on the path where it was assigned above)
        if item.get("mode","insert") == "insert" and target_line_number is not None:
            lines.insert(target_line_number, code_to_insert)

    # Recreate the modified source code
    modified_source = "\n".join(lines)
    modified_source = dedent(modified_source.strip("\n"))
    return write_to_file_and_return_fn(original_func, modified_source, mode)
212
+
213
def write_to_file_and_return_fn(original_func, source:str, mode='a'):
    """Compile *source* by writing it to a side-car module and importing it,
    then return a copy of *original_func* carrying the new code object.

    Keeping the text on disk (".patches.py") means the patched source and its
    stack traces remain inspectable while debugging. The returned function
    reuses original_func's __globals__/closure, so the new code runs in the
    original module's namespace.
    """
    # Write the modified source code to a temporary file so the
    # source code and stack traces can still be viewed when debugging.
    custom_name = ".patches.py"
    current_dir = os.path.dirname(os.path.abspath(__file__))
    temp_file_path = os.path.join(current_dir, custom_name)
    with open(temp_file_path, mode) as temp_file:
        temp_file.write(source)
        temp_file.write("\n")
        temp_file.flush()

        # import the patch file as a throwaway module to compile it
        MODULE_PATH = temp_file.name
        MODULE_NAME = __name__.split('.')[0].replace('-','_') + "_patch_modules"
        spec = importlib.util.spec_from_file_location(MODULE_NAME, MODULE_PATH)
        module = importlib.util.module_from_spec(spec)
        sys.modules[spec.name] = module
        spec.loader.exec_module(module)

    # Retrieve the modified function from the module
    # NOTE(review): with mode 'a' the file accumulates every patch; this
    # presumably relies on each patched function name being unique — confirm.
    modified_function = getattr(module, original_func.__name__)

    # Adapted from https://stackoverflow.com/a/49077211
    def copy_func(f, globals=None, module=None, code=None, update_wrapper=True):
        # clone f, optionally swapping in a different code object/globals
        if globals is None: globals = f.__globals__
        if code is None: code = f.__code__
        g = types.FunctionType(code, globals, name=f.__name__,
                               argdefs=f.__defaults__, closure=f.__closure__)
        if update_wrapper: g = functools.update_wrapper(g, f)
        if module is not None: g.__module__ = module
        g.__kwdefaults__ = copy(f.__kwdefaults__)
        return g

    return copy_func(original_func, code=modified_function.__code__, update_wrapper=False)
246
+