Jashan887 committed
Commit 234f949 · verified
1 Parent(s): 2c37abe

Upload folder using huggingface_hub
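The commit message indicates the folder was pushed with the huggingface_hub client. A minimal sketch of such an upload, assuming a placeholder repo id and local path (neither is recorded in this diff):

#+begin_src python
# Hedged sketch: repo_id and folder_path below are placeholders, not values taken from this commit.
from huggingface_hub import HfApi

api = HfApi()  # authenticates via `huggingface-cli login` or the HF_TOKEN environment variable
api.upload_folder(
    folder_path=".",                      # local folder to upload
    repo_id="your-username/your-repo",    # hypothetical target repository
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
#+end_src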

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +14 -0
  2. .gitignore +30 -0
  3. ComfyUI_AEMatter/AEMatter.py +1248 -0
  4. ComfyUI_AEMatter/AEMatter.run.sh +3 -0
  5. ComfyUI_AEMatter/README.org +1357 -0
  6. ComfyUI_AEMatter/__init__.py +1248 -0
  7. ComfyUI_MVANet/MVANet_inference.py +1548 -0
  8. ComfyUI_MVANet/MVANet_inference.run.sh +3 -0
  9. ComfyUI_MVANet/README.org +1694 -0
  10. ComfyUI_MVANet/__init__.py +1548 -0
  11. ComfyUI_MVANet/download.sh +13 -0
  12. ComfyUI_MVANet/requirements.txt +3 -0
  13. LICENSE +21 -0
  14. MVANet_Inference/README.org +2179 -0
  15. README.md +131 -0
  16. checkpoints/AEMatter/AEM_RWA.ckpt +3 -0
  17. checkpoints/MVANet/garment.pth +3 -0
  18. checkpoints/MVANet/skin.pth +3 -0
  19. checkpoints/Model_80.pth +3 -0
  20. checkpoints/StableDiffusion/90c7c97574f8db765509b6a5d2e7b2551b430a10cac03e37d368654eac5e8169cd149644d188be4b5b2f1b9f29e66b64a02535f622f2bf284c319b076224cb2b +3 -0
  21. checkpoints/StableDiffusion/b970812225cfb95427c13e73b75eef66430e2a525876dddac494d70fe4ed0524cb197043e0ac3dc3026b32a45cd1d6d126ec2fe74a5bc3ef5df21836ca022b30 +3 -0
  22. checkpoints/StableDiffusion/hash +2 -0
  23. checkpoints/atr.pth +3 -0
  24. checkpoints/lip.pth +3 -0
  25. checkpoints/pascal.pth +3 -0
  26. datasets/__init__.py +0 -0
  27. datasets/datasets.py +201 -0
  28. datasets/simple_extractor_dataset.py +78 -0
  29. datasets/target_generation.py +40 -0
  30. demo/demo.jpg +3 -0
  31. demo/demo_atr.png +0 -0
  32. demo/demo_lip.png +0 -0
  33. demo/demo_pascal.png +0 -0
  34. demo/lip-visualization.jpg +3 -0
  35. environment.yaml +49 -0
  36. evaluate.py +209 -0
  37. main.org +663 -0
  38. mhp_extension/README.md +38 -0
  39. mhp_extension/coco_style_annotation_creator/__pycache__/pycococreatortools.cpython-37.pyc +0 -0
  40. mhp_extension/coco_style_annotation_creator/human_to_coco.py +166 -0
  41. mhp_extension/coco_style_annotation_creator/pycococreatortools.py +114 -0
  42. mhp_extension/coco_style_annotation_creator/test_human2coco_format.py +74 -0
  43. mhp_extension/demo.ipynb +0 -0
  44. mhp_extension/detectron2/.circleci/config.yml +179 -0
  45. mhp_extension/detectron2/.clang-format +85 -0
  46. mhp_extension/detectron2/.flake8 +9 -0
  47. mhp_extension/detectron2/.gitignore +46 -0
  48. mhp_extension/detectron2/GETTING_STARTED.md +79 -0
  49. mhp_extension/detectron2/INSTALL.md +184 -0
  50. mhp_extension/detectron2/LICENSE +201 -0
.gitattributes CHANGED
@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ checkpoints/atr.pth filter=lfs diff=lfs merge=lfs -text
+ checkpoints/lip.pth filter=lfs diff=lfs merge=lfs -text
+ checkpoints/pascal.pth filter=lfs diff=lfs merge=lfs -text
+ checkpoints/Model_80.pth filter=lfs diff=lfs merge=lfs -text
+ checkpoints/AEMatter/AEM_RWA.ckpt filter=lfs diff=lfs merge=lfs -text
+ checkpoints/StableDiffusion/90c7c97574f8db765509b6a5d2e7b2551b430a10cac03e37d368654eac5e8169cd149644d188be4b5b2f1b9f29e66b64a02535f622f2bf284c319b076224cb2b filter=lfs diff=lfs merge=lfs -text
+ checkpoints/StableDiffusion/b970812225cfb95427c13e73b75eef66430e2a525876dddac494d70fe4ed0524cb197043e0ac3dc3026b32a45cd1d6d126ec2fe74a5bc3ef5df21836ca022b30 filter=lfs diff=lfs merge=lfs -text
+ checkpoints/MVANet/skin.pth filter=lfs diff=lfs merge=lfs -text
+ checkpoints/MVANet/garment.pth filter=lfs diff=lfs merge=lfs -text
+ demo/demo_lip.png filter=lfs diff=lfs merge=lfs -text
+ demo/lip-visualization.jpg filter=lfs diff=lfs merge=lfs -text
+ demo/demo_pascal.png filter=lfs diff=lfs merge=lfs -text
+ demo/demo_atr.png filter=lfs diff=lfs merge=lfs -text
+ demo/demo.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,30 @@
+ /ComfyUI_MVANet/__pycache__/__init__.cpython-310.pyc
+ /ComfyUI_MVANet/#README.org#
+ /ComfyUI_MVANet/.#README.org
+ /ComfyUI_MVANet/README.org~
+ /ComfyUI_MVANet/.README.org.~undo-tree~
+ /#main.org#
+ /.#main.org
+ /main.org~
+ /.main.org.~undo-tree~
+ /.README.md.~undo-tree~
+ /ComfyUI_MVANet/.#README.org
+ /ComfyUI_AEMatter/__pycache__/__init__.cpython-310.pyc
+ /ComfyUI_AEMatter/AEMatter.class.py
+ /ComfyUI_AEMatter/AEMatter.execute.py
+ /ComfyUI_AEMatter/AEMatter.function.py
+ /ComfyUI_AEMatter/AEMatter.import.py
+ /ComfyUI_MVANet/MVANet_inference.class.py
+ /ComfyUI_MVANet/MVANet_inference.execute.py
+ /ComfyUI_MVANet/MVANet_inference.function.py
+ /ComfyUI_MVANet/MVANet_inference.import.py
+ /ComfyUI_MVANet/MVANet_inference.unify.sh
+ /ComfyUI_AEMatter/AEMatter.unify.sh
+ /git_add.txt
+ /git_lfs_track.txt
+ /gitignore.txt
+ /rm.txt
+ /work.sh
+ log/
+ pretrain_model/
+ commit_and_push.sh
ComfyUI_AEMatter/AEMatter.py ADDED
@@ -0,0 +1,1248 @@
1
+ #!/usr/bin/python3
2
+ import cv2
3
+ import math
4
+ import numpy as np
5
+ import os
6
+ import random
7
+ import wget
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from torch.nn import init
12
+ import torch.nn.functional as F
13
+ import torch.utils.checkpoint as checkpoint
14
+
15
+ from collections import OrderedDict
16
+ from einops import rearrange, repeat
17
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
18
+
19
+ import folder_paths
20
+ from folder_paths import models_dir
21
+
22
+
23
+ #!/usr/bin/python3
24
+ def mkdir_safe(out_path):
25
+ if type(out_path) == str:
26
+ if len(out_path) > 0:
27
+ if not os.path.exists(out_path):
28
+ os.mkdir(out_path)
29
+
30
+
31
+ def get_model_path():
32
+ import folder_paths
33
+ from folder_paths import models_dir
34
+
35
+ path_file_model = models_dir
36
+ mkdir_safe(out_path=path_file_model)
37
+
38
+ path_file_model = os.path.join(path_file_model, 'AEMatter')
39
+ mkdir_safe(out_path=path_file_model)
40
+
41
+ path_file_model = os.path.join(path_file_model, 'AEM_RWA.ckpt')
42
+
43
+ return path_file_model
44
+
45
+
46
+ def download_model(path):
47
+ if not os.path.exists(path):
48
+ wget.download(
49
+ 'https://huggingface.co/aravindhv10/Self-Correction-Human-Parsing/resolve/main/checkpoints/AEMatter/AEM_RWA.ckpt?download=true',
50
+ out=path)
51
+
52
+
53
+ def from_torch_image(image):
54
+ image = image.cpu().numpy() * 255.0
55
+ image = np.clip(image, 0, 255).astype(np.uint8)
56
+ return image
57
+
58
+
59
+ def to_torch_image(image):
60
+ image = image.astype(dtype=np.float32)
61
+ image /= 255.0
62
+ image = torch.from_numpy(image)
63
+ return image
64
+
65
+
66
+ def window_partition(x, window_size):
67
+ """
68
+ Args:
69
+ x: (B, H, W, C)
70
+ window_size (int): window size
71
+ Returns:
72
+ windows: (num_windows*B, window_size, window_size, C)
73
+ """
74
+ B, H, W, C = x.shape
75
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size,
76
+ C)
77
+ windows = x.permute(0, 1, 3, 2, 4,
78
+ 5).contiguous().view(-1, window_size, window_size, C)
79
+ return windows
80
+
81
+
82
+ def window_reverse(windows, window_size, H, W):
83
+ """
84
+ Args:
85
+ windows: (num_windows*B, window_size, window_size, C)
86
+ window_size (int): Window size
87
+ H (int): Height of image
88
+ W (int): Width of image
89
+ Returns:
90
+ x: (B, H, W, C)
91
+ """
92
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
93
+ x = windows.view(B, H // window_size, W // window_size, window_size,
94
+ window_size, -1)
95
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
96
+ return x
97
+
98
+
99
+ def get_AEMatter_model(path_model_checkpoint):
100
+
101
+ download_model(path=path_model_checkpoint)
102
+
103
+ matmodel = AEMatter()
104
+ matmodel.load_state_dict(
105
+ torch.load(path_model_checkpoint, map_location='cpu')['model'])
106
+
107
+ matmodel = matmodel.cuda()
108
+ matmodel.eval()
109
+
110
+ return matmodel
111
+
112
+
113
+ def do_infer(rawimg, trimap, matmodel):
114
+ trimap_nonp = trimap.copy()
115
+ h, w, c = rawimg.shape
116
+ nonph, nonpw, _ = rawimg.shape
117
+ newh = (((h - 1) // 32) + 1) * 32
118
+ neww = (((w - 1) // 32) + 1) * 32
119
+ padh = newh - h
120
+ padh1 = int(padh / 2)
121
+ padh2 = padh - padh1
122
+ padw = neww - w
123
+ padw1 = int(padw / 2)
124
+ padw2 = padw - padw1
125
+
126
+ rawimg_pad = cv2.copyMakeBorder(rawimg, padh1, padh2, padw1, padw2,
127
+ cv2.BORDER_REFLECT)
128
+
129
+ trimap_pad = cv2.copyMakeBorder(trimap, padh1, padh2, padw1, padw2,
130
+ cv2.BORDER_REFLECT)
131
+
132
+ h_pad, w_pad, _ = rawimg_pad.shape
133
+ tritemp = np.zeros([*trimap_pad.shape, 3], np.float32)
134
+ tritemp[:, :, 0] = (trimap_pad == 0)
135
+ tritemp[:, :, 1] = (trimap_pad == 128)
136
+ tritemp[:, :, 2] = (trimap_pad == 255)
137
+ tritempimgs = np.transpose(tritemp, (2, 0, 1))
138
+ tritempimgs = tritempimgs[np.newaxis, :, :, :]
139
+ img = np.transpose(rawimg_pad, (2, 0, 1))[np.newaxis, ::-1, :, :]
140
+ img = np.array(img, np.float32)
141
+ img = img / 255.
142
+ img = torch.from_numpy(img).cuda()
143
+ tritempimgs = torch.from_numpy(tritempimgs).cuda()
144
+ with torch.no_grad():
145
+ pred = matmodel(img, tritempimgs)
146
+ pred = pred.detach().cpu().numpy()[0]
147
+ pred = pred[:, padh1:padh1 + h, padw1:padw1 + w]
148
+ preda = pred[
149
+ 0:1,
150
+ ] * 255
151
+ preda = np.transpose(preda, (1, 2, 0))
152
+ preda = preda * (trimap_nonp[:, :, None]
153
+ == 128) + (trimap_nonp[:, :, None] == 255) * 255
154
+ preda = np.array(preda, np.uint8)
155
+ return preda
156
+
157
+
158
+ def main():
159
+ ptrimap = '/home/asd/Desktop/demo/retriever_trimap.png'
160
+ pimgs = '/home/asd/Desktop/demo/retriever_rgb.png'
161
+ p_outs = 'alpha.png'
162
+
163
+ matmodel = get_AEMatter_model(
164
+ path_model_checkpoint='/home/asd/Desktop/AEM_RWA.ckpt')
165
+
166
+ # matmodel = AEMatter()
167
+ # matmodel.load_state_dict(
168
+ # torch.load('/home/asd/Desktop/AEM_RWA.ckpt',
169
+ # map_location='cpu')['model'])
170
+
171
+ # matmodel = matmodel.cuda()
172
+ # matmodel.eval()
173
+
174
+ rawimg = pimgs
175
+ trimap = ptrimap
176
+ rawimg = cv2.imread(rawimg, cv2.IMREAD_COLOR)
177
+ trimap = cv2.imread(trimap, cv2.IMREAD_GRAYSCALE)
178
+ trimap_nonp = trimap.copy()
179
+ h, w, c = rawimg.shape
180
+ nonph, nonpw, _ = rawimg.shape
181
+ newh = (((h - 1) // 32) + 1) * 32
182
+ neww = (((w - 1) // 32) + 1) * 32
183
+ padh = newh - h
184
+ padh1 = int(padh / 2)
185
+ padh2 = padh - padh1
186
+ padw = neww - w
187
+ padw1 = int(padw / 2)
188
+ padw2 = padw - padw1
189
+ rawimg_pad = cv2.copyMakeBorder(rawimg, padh1, padh2, padw1, padw2,
190
+ cv2.BORDER_REFLECT)
191
+ trimap_pad = cv2.copyMakeBorder(trimap, padh1, padh2, padw1, padw2,
192
+ cv2.BORDER_REFLECT)
193
+ h_pad, w_pad, _ = rawimg_pad.shape
194
+ tritemp = np.zeros([*trimap_pad.shape, 3], np.float32)
195
+ tritemp[:, :, 0] = (trimap_pad == 0)
196
+ tritemp[:, :, 1] = (trimap_pad == 128)
197
+ tritemp[:, :, 2] = (trimap_pad == 255)
198
+ tritempimgs = np.transpose(tritemp, (2, 0, 1))
199
+ tritempimgs = tritempimgs[np.newaxis, :, :, :]
200
+ img = np.transpose(rawimg_pad, (2, 0, 1))[np.newaxis, ::-1, :, :]
201
+ img = np.array(img, np.float32)
202
+ img = img / 255.
203
+ img = torch.from_numpy(img).cuda()
204
+ tritempimgs = torch.from_numpy(tritempimgs).cuda()
205
+ with torch.no_grad():
206
+ pred = matmodel(img, tritempimgs)
207
+ pred = pred.detach().cpu().numpy()[0]
208
+ pred = pred[:, padh1:padh1 + h, padw1:padw1 + w]
209
+ preda = pred[
210
+ 0:1,
211
+ ] * 255
212
+ preda = np.transpose(preda, (1, 2, 0))
213
+ preda = preda * (trimap_nonp[:, :, None]
214
+ == 128) + (trimap_nonp[:, :, None] == 255) * 255
215
+ preda = np.array(preda, np.uint8)
216
+ cv2.imwrite(p_outs, preda)
217
+
218
+
219
+ #!/usr/bin/python3
220
+ class WindowAttention(nn.Module):
221
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
222
+ It supports both of shifted and non-shifted window.
223
+ Args:
224
+ dim (int): Number of input channels.
225
+ window_size (tuple[int]): The height and width of the window.
226
+ num_heads (int): Number of attention heads.
227
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
228
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
229
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
230
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
231
+ """
232
+
233
+ def __init__(self,
234
+ dim,
235
+ window_size,
236
+ num_heads,
237
+ qkv_bias=True,
238
+ qk_scale=None,
239
+ attn_drop=0.,
240
+ proj_drop=0.):
241
+
242
+ super().__init__()
243
+ self.dim = dim
244
+ self.window_size = window_size # Wh, Ww
245
+ self.num_heads = num_heads
246
+ head_dim = dim // num_heads
247
+ self.scale = qk_scale or head_dim**-0.5
248
+
249
+ # define a parameter table of relative position bias
250
+ self.relative_position_bias_table = nn.Parameter(
251
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
252
+ num_heads)) # 2*Wh-1 * 2*Ww-1, nH
253
+
254
+ # get pair-wise relative position index for each token inside the window
255
+ coords_h = torch.arange(self.window_size[0])
256
+ coords_w = torch.arange(self.window_size[1])
257
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
258
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
259
+ relative_coords = coords_flatten[:, :,
260
+ None] - coords_flatten[:,
261
+ None, :] # 2, Wh*Ww, Wh*Ww
262
+ relative_coords = relative_coords.permute(
263
+ 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
264
+ relative_coords[:, :,
265
+ 0] += self.window_size[0] - 1 # shift to start from 0
266
+ relative_coords[:, :, 1] += self.window_size[1] - 1
267
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
268
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
269
+ self.register_buffer("relative_position_index",
270
+ relative_position_index)
271
+
272
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
273
+ self.attn_drop = nn.Dropout(attn_drop)
274
+ self.proj = nn.Linear(dim, dim)
275
+ self.proj_drop = nn.Dropout(proj_drop)
276
+
277
+ trunc_normal_(self.relative_position_bias_table, std=.02)
278
+ self.softmax = nn.Softmax(dim=-1)
279
+
280
+ def forward(self, x, mask=None):
281
+ """ Forward function.
282
+ Args:
283
+ x: input features with shape of (num_windows*B, N, C)
284
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
285
+ """
286
+ B_, N, C = x.shape
287
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
288
+ C // self.num_heads).permute(2, 0, 3, 1, 4)
289
+ q, k, v = qkv[0], qkv[1], qkv[
290
+ 2] # make torchscript happy (cannot use tensor as tuple)
291
+
292
+ q = q * self.scale
293
+ attn = (q @ k.transpose(-2, -1))
294
+
295
+ relative_position_bias = self.relative_position_bias_table[
296
+ self.relative_position_index.view(-1)].view(
297
+ self.window_size[0] * self.window_size[1],
298
+ self.window_size[0] * self.window_size[1],
299
+ -1) # Wh*Ww,Wh*Ww,nH
300
+ relative_position_bias = relative_position_bias.permute(
301
+ 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
302
+ attn = attn + relative_position_bias.unsqueeze(0)
303
+
304
+ if mask is not None:
305
+ nW = mask.shape[0]
306
+ attn = attn.view(B_ // nW, nW, self.num_heads, N,
307
+ N) + mask.unsqueeze(1).unsqueeze(0)
308
+ attn = attn.view(-1, self.num_heads, N, N)
309
+ attn = self.softmax(attn)
310
+ else:
311
+ attn = self.softmax(attn)
312
+
313
+ attn = self.attn_drop(attn)
314
+
315
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
316
+ x = self.proj(x)
317
+ x = self.proj_drop(x)
318
+ return x
319
+
320
+
321
+ class SwinTransformerBlock(nn.Module):
322
+ """ Swin Transformer Block.
323
+ Args:
324
+ dim (int): Number of input channels.
325
+ num_heads (int): Number of attention heads.
326
+ window_size (int): Window size.
327
+ shift_size (int): Shift size for SW-MSA.
328
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
329
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
330
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
331
+ drop (float, optional): Dropout rate. Default: 0.0
332
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
333
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
334
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
335
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
336
+ """
337
+
338
+ def __init__(self,
339
+ dim,
340
+ num_heads,
341
+ window_size=7,
342
+ shift_size=0,
343
+ mlp_ratio=4.,
344
+ qkv_bias=True,
345
+ qk_scale=None,
346
+ drop=0.,
347
+ attn_drop=0.,
348
+ drop_path=0.,
349
+ act_layer=nn.GELU,
350
+ norm_layer=nn.LayerNorm):
351
+ super().__init__()
352
+ self.dim = dim
353
+ self.num_heads = num_heads
354
+ self.window_size = window_size
355
+ self.shift_size = shift_size
356
+ self.mlp_ratio = mlp_ratio
357
+ assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
358
+
359
+ self.norm1 = norm_layer(dim)
360
+ self.attn = WindowAttention(dim,
361
+ window_size=to_2tuple(self.window_size),
362
+ num_heads=num_heads,
363
+ qkv_bias=qkv_bias,
364
+ qk_scale=qk_scale,
365
+ attn_drop=attn_drop,
366
+ proj_drop=drop)
367
+
368
+ self.drop_path = DropPath(
369
+ drop_path) if drop_path > 0. else nn.Identity()
370
+ self.norm2 = norm_layer(dim)
371
+ mlp_hidden_dim = int(dim * mlp_ratio)
372
+ self.mlp = Mlp(in_features=dim,
373
+ hidden_features=mlp_hidden_dim,
374
+ act_layer=act_layer,
375
+ drop=drop)
376
+
377
+ self.H = None
378
+ self.W = None
379
+
380
+ def forward(self, x, mask_matrix):
381
+ """ Forward function.
382
+ Args:
383
+ x: Input feature, tensor size (B, H*W, C).
384
+ H, W: Spatial resolution of the input feature.
385
+ mask_matrix: Attention mask for cyclic shift.
386
+ """
387
+ B, L, C = x.shape
388
+ H, W = self.H, self.W
389
+ assert L == H * W, "input feature has wrong size"
390
+
391
+ shortcut = x
392
+ x = self.norm1(x)
393
+ x = x.view(B, H, W, C)
394
+
395
+ # pad feature maps to multiples of window size
396
+ pad_l = pad_t = 0
397
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
398
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
399
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
400
+ _, Hp, Wp, _ = x.shape
401
+
402
+ # cyclic shift
403
+ if self.shift_size > 0:
404
+ shifted_x = torch.roll(x,
405
+ shifts=(-self.shift_size, -self.shift_size),
406
+ dims=(1, 2))
407
+ attn_mask = mask_matrix
408
+ else:
409
+ shifted_x = x
410
+ attn_mask = None
411
+
412
+ # partition windows
413
+ x_windows = window_partition(
414
+ shifted_x, self.window_size) # nW*B, window_size, window_size, C
415
+ x_windows = x_windows.view(-1, self.window_size * self.window_size,
416
+ C) # nW*B, window_size*window_size, C
417
+
418
+ # W-MSA/SW-MSA
419
+ attn_windows = self.attn(
420
+ x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
421
+
422
+ # merge windows
423
+ attn_windows = attn_windows.view(-1, self.window_size,
424
+ self.window_size, C)
425
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp,
426
+ Wp) # B H' W' C
427
+
428
+ # reverse cyclic shift
429
+ if self.shift_size > 0:
430
+ x = torch.roll(shifted_x,
431
+ shifts=(self.shift_size, self.shift_size),
432
+ dims=(1, 2))
433
+ else:
434
+ x = shifted_x
435
+
436
+ if pad_r > 0 or pad_b > 0:
437
+ x = x[:, :H, :W, :].contiguous()
438
+
439
+ x = x.view(B, H * W, C)
440
+
441
+ # FFN
442
+ x = shortcut + self.drop_path(x)
443
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
444
+
445
+ return x
446
+
447
+
448
+ class PatchMerging(nn.Module):
449
+ """ Patch Merging Layer
450
+ Args:
451
+ dim (int): Number of input channels.
452
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
453
+ """
454
+
455
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
456
+ super().__init__()
457
+ self.dim = dim
458
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
459
+ self.norm = norm_layer(4 * dim)
460
+
461
+ def forward(self, x, H, W):
462
+ """ Forward function.
463
+ Args:
464
+ x: Input feature, tensor size (B, H*W, C).
465
+ H, W: Spatial resolution of the input feature.
466
+ """
467
+ B, L, C = x.shape
468
+ assert L == H * W, "input feature has wrong size"
469
+
470
+ x = x.view(B, H, W, C)
471
+
472
+ # padding
473
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
474
+ if pad_input:
475
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
476
+
477
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
478
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
479
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
480
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
481
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
482
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
483
+
484
+ x = self.norm(x)
485
+ x = self.reduction(x)
486
+
487
+ return x
488
+
489
+
490
+ class BasicLayer(nn.Module):
491
+ """ A basic Swin Transformer layer for one stage.
492
+ Args:
493
+ dim (int): Number of feature channels
494
+ depth (int): Depths of this stage.
495
+ num_heads (int): Number of attention head.
496
+ window_size (int): Local window size. Default: 7.
497
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
498
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
499
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
500
+ drop (float, optional): Dropout rate. Default: 0.0
501
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
502
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
503
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
504
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
505
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
506
+ """
507
+
508
+ def __init__(self,
509
+ dim,
510
+ depth,
511
+ num_heads,
512
+ window_size=7,
513
+ mlp_ratio=4.,
514
+ qkv_bias=True,
515
+ qk_scale=None,
516
+ drop=0.,
517
+ attn_drop=0.,
518
+ drop_path=0.,
519
+ norm_layer=nn.LayerNorm,
520
+ downsample=None,
521
+ use_checkpoint=False):
522
+
523
+ super().__init__()
524
+ self.window_size = window_size
525
+ self.shift_size = window_size // 2
526
+ self.depth = depth
527
+ self.use_checkpoint = use_checkpoint
528
+
529
+ # build blocks
530
+ self.blocks = nn.ModuleList([
531
+ SwinTransformerBlock(dim=dim,
532
+ num_heads=num_heads,
533
+ window_size=window_size,
534
+ shift_size=0 if
535
+ (i % 2 == 0) else window_size // 2,
536
+ mlp_ratio=mlp_ratio,
537
+ qkv_bias=qkv_bias,
538
+ qk_scale=qk_scale,
539
+ drop=drop,
540
+ attn_drop=attn_drop,
541
+ drop_path=drop_path[i] if isinstance(
542
+ drop_path, list) else drop_path,
543
+ norm_layer=norm_layer) for i in range(depth)
544
+ ])
545
+
546
+ # patch merging layer
547
+ if downsample is not None:
548
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
549
+ else:
550
+ self.downsample = None
551
+
552
+ def forward(self, x, H, W):
553
+ """ Forward function.
554
+ Args:
555
+ x: Input feature, tensor size (B, H*W, C).
556
+ H, W: Spatial resolution of the input feature.
557
+ """
558
+ # print(x.shape,H,W)
559
+ # calculate attention mask for SW-MSA
560
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
561
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
562
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
563
+ h_slices = (slice(0, -self.window_size),
564
+ slice(-self.window_size,
565
+ -self.shift_size), slice(-self.shift_size, None))
566
+ w_slices = (slice(0, -self.window_size),
567
+ slice(-self.window_size,
568
+ -self.shift_size), slice(-self.shift_size, None))
569
+ cnt = 0
570
+ for h in h_slices:
571
+ for w in w_slices:
572
+ img_mask[:, h, w, :] = cnt
573
+ cnt += 1
574
+
575
+ mask_windows = window_partition(
576
+ img_mask, self.window_size) # nW, window_size, window_size, 1
577
+
578
+ mask_windows = mask_windows.view(-1,
579
+ self.window_size * self.window_size)
580
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(
581
+ 2) # nW, ww window_size*window_size
582
+ attn_mask = attn_mask.masked_fill(attn_mask != 0,
583
+ float(-100.0)).masked_fill(
584
+ attn_mask == 0, float(0.0))
585
+
586
+ for blk in self.blocks:
587
+ blk.H, blk.W = H, W
588
+ if self.use_checkpoint:
589
+ x = checkpoint.checkpoint(blk, x, attn_mask)
590
+ else:
591
+ x = blk(x, attn_mask)
592
+
593
+ if self.downsample is not None:
594
+ x_down = self.downsample(x, H, W)
595
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
596
+ return x, H, W, x_down, Wh, Ww
597
+ else:
598
+ return x, H, W, x, H, W
599
+
600
+
601
+ class PatchEmbed(nn.Module):
602
+ """ Image to Patch Embedding
603
+ Args:
604
+ patch_size (int): Patch token size. Default: 4.
605
+ in_chans (int): Number of input image channels. Default: 3.
606
+ embed_dim (int): Number of linear projection output channels. Default: 96.
607
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
608
+ """
609
+
610
+ def __init__(self,
611
+ patch_size=4,
612
+ in_chans=3,
613
+ embed_dim=96,
614
+ norm_layer=None):
615
+
616
+ super().__init__()
617
+ patch_size = to_2tuple(patch_size)
618
+ self.patch_size = patch_size
619
+
620
+ self.in_chans = in_chans
621
+ self.embed_dim = embed_dim
622
+
623
+ self.proj = nn.Conv2d(in_chans,
624
+ embed_dim,
625
+ kernel_size=patch_size,
626
+ stride=patch_size)
627
+ if norm_layer is not None:
628
+ self.norm = norm_layer(embed_dim)
629
+ else:
630
+ self.norm = None
631
+
632
+ def forward(self, x):
633
+ """Forward function."""
634
+ # padding
635
+ _, _, H, W = x.size()
636
+ if W % self.patch_size[1] != 0:
637
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
638
+ if H % self.patch_size[0] != 0:
639
+ x = F.pad(x,
640
+ (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
641
+
642
+ x = self.proj(x) # B C Wh Ww
643
+ if self.norm is not None:
644
+ Wh, Ww = x.size(2), x.size(3)
645
+ x = x.flatten(2).transpose(1, 2)
646
+ x = self.norm(x)
647
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
648
+
649
+ return x
650
+
651
+
652
+ class SwinTransformer(nn.Module):
653
+ """ Swin Transformer backbone.
654
+ A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
655
+ https://arxiv.org/pdf/2103.14030
656
+ Args:
657
+ pretrain_img_size (int): Input image size for training the pretrained model,
658
+ used in absolute postion embedding. Default 224.
659
+ patch_size (int | tuple(int)): Patch size. Default: 4.
660
+ in_chans (int): Number of input image channels. Default: 3.
661
+ embed_dim (int): Number of linear projection output channels. Default: 96.
662
+ depths (tuple[int]): Depths of each Swin Transformer stage.
663
+ num_heads (tuple[int]): Number of attention head of each stage.
664
+ window_size (int): Window size. Default: 7.
665
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
666
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
667
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
668
+ drop_rate (float): Dropout rate.
669
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
670
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
671
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
672
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
673
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
674
+ out_indices (Sequence[int]): Output from which stages.
675
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
676
+ -1 means not freezing any parameters.
677
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
678
+ """
679
+
680
+ def __init__(self,
681
+ pretrain_img_size=224,
682
+ patch_size=4,
683
+ in_chans=3,
684
+ embed_dim=96,
685
+ depths=[2, 2, 6, 2],
686
+ num_heads=[3, 6, 12, 24],
687
+ window_size=7,
688
+ mlp_ratio=4.,
689
+ qkv_bias=True,
690
+ qk_scale=None,
691
+ drop_rate=0.,
692
+ attn_drop_rate=0.,
693
+ drop_path_rate=0.2,
694
+ norm_layer=nn.LayerNorm,
695
+ ape=False,
696
+ patch_norm=True,
697
+ out_indices=(0, 1, 2, 3),
698
+ frozen_stages=-1,
699
+ use_checkpoint=False):
700
+
701
+ super().__init__()
702
+
703
+ self.pretrain_img_size = pretrain_img_size
704
+ self.num_layers = len(depths)
705
+ self.embed_dim = embed_dim
706
+ self.ape = ape
707
+ self.patch_norm = patch_norm
708
+ self.out_indices = out_indices
709
+ self.frozen_stages = frozen_stages
710
+
711
+ # split image into non-overlapping patches
712
+ self.patch_embed = PatchEmbed(
713
+ patch_size=patch_size,
714
+ in_chans=in_chans,
715
+ embed_dim=embed_dim,
716
+ norm_layer=norm_layer if self.patch_norm else None)
717
+
718
+ # absolute position embedding
719
+ if self.ape:
720
+ pretrain_img_size = to_2tuple(pretrain_img_size)
721
+ patch_size = to_2tuple(patch_size)
722
+ patches_resolution = [
723
+ pretrain_img_size[0] // patch_size[0],
724
+ pretrain_img_size[1] // patch_size[1]
725
+ ]
726
+
727
+ self.absolute_pos_embed = nn.Parameter(
728
+ torch.zeros(1, embed_dim, patches_resolution[0],
729
+ patches_resolution[1]))
730
+ trunc_normal_(self.absolute_pos_embed, std=.02)
731
+
732
+ self.pos_drop = nn.Dropout(p=drop_rate)
733
+
734
+ # stochastic depth
735
+ dpr = [
736
+ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
737
+ ] # stochastic depth decay rule
738
+
739
+ # build layers
740
+ self.layers = nn.ModuleList()
741
+ for i_layer in range(self.num_layers):
742
+ layer = BasicLayer(
743
+ dim=int(embed_dim * 2**i_layer),
744
+ depth=depths[i_layer],
745
+ num_heads=num_heads[i_layer],
746
+ window_size=window_size,
747
+ mlp_ratio=mlp_ratio,
748
+ qkv_bias=qkv_bias,
749
+ qk_scale=qk_scale,
750
+ drop=drop_rate,
751
+ attn_drop=attn_drop_rate,
752
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
753
+ norm_layer=norm_layer,
754
+ downsample=PatchMerging if
755
+ (i_layer < self.num_layers - 1) else None,
756
+ use_checkpoint=use_checkpoint)
757
+ self.layers.append(layer)
758
+
759
+ num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
760
+ self.num_features = num_features
761
+
762
+ # add a norm layer for each output
763
+ for i_layer in out_indices:
764
+ layer = norm_layer(num_features[i_layer])
765
+ layer_name = f'norm{i_layer}'
766
+ self.add_module(layer_name, layer)
767
+
768
+ self._freeze_stages()
769
+
770
+ def _freeze_stages(self):
771
+ if self.frozen_stages >= 0:
772
+ self.patch_embed.eval()
773
+ for param in self.patch_embed.parameters():
774
+ param.requires_grad = False
775
+
776
+ if self.frozen_stages >= 1 and self.ape:
777
+ self.absolute_pos_embed.requires_grad = False
778
+
779
+ if self.frozen_stages >= 2:
780
+ self.pos_drop.eval()
781
+ for i in range(0, self.frozen_stages - 1):
782
+ m = self.layers[i]
783
+ m.eval()
784
+ for param in m.parameters():
785
+ param.requires_grad = False
786
+
787
+ def init_weights(self, pretrained=None):
788
+ """Initialize the weights in backbone.
789
+ Args:
790
+ pretrained (str, optional): Path to pre-trained weights.
791
+ Defaults to None.
792
+ """
793
+
794
+ def forward(self, x):
795
+ """Forward function."""
796
+ x = self.patch_embed(x)
797
+
798
+ Wh, Ww = x.size(2), x.size(3)
799
+ if self.ape:
800
+ # interpolate the position embedding to the corresponding size
801
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed,
802
+ size=(Wh, Ww),
803
+ mode='bicubic')
804
+ x = (x + absolute_pos_embed).flatten(2).transpose(1,
805
+ 2) # B Wh*Ww C
806
+ else:
807
+ x = x.flatten(2).transpose(1, 2)
808
+ x = self.pos_drop(x)
809
+
810
+ outs = []
811
+ for i in range(self.num_layers):
812
+ layer = self.layers[i]
813
+
814
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
815
+
816
+ if i in self.out_indices:
817
+ norm_layer = getattr(self, f'norm{i}')
818
+ x_out = norm_layer(x_out)
819
+
820
+ out = x_out.view(-1, H, W,
821
+ self.num_features[i]).permute(0, 3, 1,
822
+ 2).contiguous()
823
+ outs.append(out)
824
+
825
+ return tuple(outs)
826
+
827
+ def train(self, mode=True):
828
+ """Convert the model into training mode while keep layers freezed."""
829
+ super(SwinTransformer, self).train(mode)
830
+ self._freeze_stages()
831
+
832
+
833
+ class Mlp(nn.Module):
834
+ """ Multilayer perceptron."""
835
+
836
+ def __init__(self,
837
+ in_features,
838
+ hidden_features=None,
839
+ out_features=None,
840
+ act_layer=nn.GELU,
841
+ drop=0.):
842
+ super().__init__()
843
+ out_features = out_features or in_features
844
+ hidden_features = hidden_features or in_features
845
+ self.fc1 = nn.Linear(in_features, hidden_features)
846
+ self.act = act_layer()
847
+ self.fc2 = nn.Linear(hidden_features, out_features)
848
+ self.drop = nn.Dropout(drop)
849
+
850
+ def forward(self, x):
851
+ x = self.fc1(x)
852
+ x = self.act(x)
853
+ x = self.drop(x)
854
+ x = self.fc2(x)
855
+ x = self.drop(x)
856
+ return x
857
+
858
+
859
+ class ResBlock(nn.Module):
860
+
861
+ def __init__(self, inc, midc):
862
+ super(ResBlock, self).__init__()
863
+ self.conv1 = nn.Conv2d(inc,
864
+ midc,
865
+ kernel_size=1,
866
+ stride=1,
867
+ padding=0,
868
+ bias=True)
869
+ self.gn1 = nn.GroupNorm(16, midc)
870
+ self.conv2 = nn.Conv2d(midc,
871
+ midc,
872
+ kernel_size=3,
873
+ stride=1,
874
+ padding=1,
875
+ bias=True)
876
+ self.gn2 = nn.GroupNorm(16, midc)
877
+ self.conv3 = nn.Conv2d(midc,
878
+ inc,
879
+ kernel_size=1,
880
+ stride=1,
881
+ padding=0,
882
+ bias=True)
883
+ self.relu = nn.LeakyReLU(0.1)
884
+
885
+ def forward(self, x):
886
+ x_ = x
887
+ x = self.conv1(x)
888
+ x = self.gn1(x)
889
+ x = self.relu(x)
890
+ x = self.conv2(x)
891
+ x = self.gn2(x)
892
+ x = self.relu(x)
893
+ x = self.conv3(x)
894
+ x = x + x_
895
+ x = self.relu(x)
896
+ return x
897
+
898
+
899
+ class AEALblock(nn.Module):
900
+
901
+ def __init__(self,
902
+ d_model,
903
+ nhead,
904
+ dim_feedforward=512,
905
+ dropout=0.0,
906
+ layer_norm_eps=1e-5,
907
+ batch_first=True,
908
+ norm_first=False,
909
+ width=5):
910
+ super(AEALblock, self).__init__()
911
+ self.self_attn2 = nn.MultiheadAttention(d_model // 2,
912
+ nhead // 2,
913
+ dropout=dropout,
914
+ batch_first=batch_first)
915
+ self.self_attn1 = nn.MultiheadAttention(d_model // 2,
916
+ nhead // 2,
917
+ dropout=dropout,
918
+ batch_first=batch_first)
919
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
920
+ self.dropout = nn.Dropout(dropout)
921
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
922
+ self.norm_first = norm_first
923
+ self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
924
+ self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
925
+ self.dropout1 = nn.Dropout(dropout)
926
+ self.dropout2 = nn.Dropout(dropout)
927
+ self.activation = nn.ReLU()
928
+ self.width = width
929
+ self.trans = nn.Sequential(
930
+ nn.Conv2d(d_model + 512, d_model // 2, 1, 1, 0),
931
+ ResBlock(d_model // 2, d_model // 4),
932
+ nn.Conv2d(d_model // 2, d_model, 1, 1, 0))
933
+ self.gamma = nn.Parameter(torch.zeros(1))
934
+
935
+ def forward(
936
+ self,
937
+ src,
938
+ feats,
939
+ ):
940
+ src = self.gamma * self.trans(torch.cat([src, feats], 1)) + src
941
+ b, c, h, w = src.shape
942
+ x1 = src[:, 0:c // 2]
943
+ x1_ = rearrange(x1, 'b c (h1 h2) w -> b c h1 h2 w', h2=self.width)
944
+ x1_ = rearrange(x1_, 'b c h1 h2 w -> (b h1) (h2 w) c')
945
+ x2 = src[:, c // 2:]
946
+ x2_ = rearrange(x2, 'b c h (w1 w2) -> b c h w1 w2', w2=self.width)
947
+ x2_ = rearrange(x2_, 'b c h w1 w2 -> (b w1) (h w2) c')
948
+ x = rearrange(src, 'b c h w-> b (h w) c')
949
+ x = self.norm1(x + self._sa_block(x1_, x2_, h, w))
950
+ x = self.norm2(x + self._ff_block(x))
951
+ x = rearrange(x, 'b (h w) c->b c h w', h=h, w=w)
952
+ return x
953
+
954
+ def _sa_block(self, x1, x2, h, w):
955
+ x1 = self.self_attn1(x1,
956
+ x1,
957
+ x1,
958
+ attn_mask=None,
959
+ key_padding_mask=None,
960
+ need_weights=False)[0]
961
+
962
+ x2 = self.self_attn2(x2,
963
+ x2,
964
+ x2,
965
+ attn_mask=None,
966
+ key_padding_mask=None,
967
+ need_weights=False)[0]
968
+
969
+ x1 = rearrange(x1,
970
+ '(b h1) (h2 w) c-> b (h1 h2 w) c',
971
+ h2=self.width,
972
+ h1=h // self.width)
973
+ x2 = rearrange(x2,
974
+ ' (b w1) (h w2) c-> b (h w1 w2) c',
975
+ w2=self.width,
976
+ w1=w // self.width)
977
+ x = torch.cat([x1, x2], dim=2)
978
+ return self.dropout1(x)
979
+
980
+ def _ff_block(self, x):
981
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
982
+ return self.dropout2(x)
983
+
984
+
985
+ class AEMatter(nn.Module):
986
+
987
+ def __init__(self):
988
+ super(AEMatter, self).__init__()
989
+ trans = SwinTransformer(pretrain_img_size=224,
990
+ embed_dim=96,
991
+ depths=[2, 2, 6, 2],
992
+ num_heads=[3, 6, 12, 24],
993
+ window_size=7,
994
+ ape=False,
995
+ drop_path_rate=0.2,
996
+ patch_norm=True,
997
+ use_checkpoint=False)
998
+
999
+ # trans.load_state_dict(torch.load(
1000
+ # '/home/asd/Desktop/swin_tiny_patch4_window7_224.pth',
1001
+ # map_location="cpu")["model"],
1002
+ # strict=False)
1003
+
1004
+ trans.patch_embed.proj = nn.Conv2d(64, 96, 3, 2, 1)
1005
+
1006
+ self.start_conv0 = nn.Sequential(nn.Conv2d(6, 48, 3, 1, 1),
1007
+ nn.PReLU(48))
1008
+
1009
+ self.start_conv = nn.Sequential(nn.Conv2d(48, 64, 3, 2,
1010
+ 1), nn.PReLU(64),
1011
+ nn.Conv2d(64, 64, 3, 1, 1),
1012
+ nn.PReLU(64))
1013
+
1014
+ self.trans = trans
1015
+ self.conv1 = nn.Sequential(
1016
+ nn.Conv2d(in_channels=640 + 768,
1017
+ out_channels=256,
1018
+ kernel_size=1,
1019
+ stride=1,
1020
+ padding=0,
1021
+ bias=True))
1022
+ self.conv2 = nn.Sequential(
1023
+ nn.Conv2d(in_channels=256 + 384,
1024
+ out_channels=256,
1025
+ kernel_size=1,
1026
+ stride=1,
1027
+ padding=0,
1028
+ bias=True), )
1029
+ self.conv3 = nn.Sequential(
1030
+ nn.Conv2d(in_channels=256 + 192,
1031
+ out_channels=192,
1032
+ kernel_size=1,
1033
+ stride=1,
1034
+ padding=0,
1035
+ bias=True), )
1036
+ self.conv4 = nn.Sequential(
1037
+ nn.Conv2d(in_channels=192 + 96,
1038
+ out_channels=128,
1039
+ kernel_size=1,
1040
+ stride=1,
1041
+ padding=0,
1042
+ bias=True), )
1043
+ self.ctran0 = BasicLayer(256, 3, 8, 7, drop_path=0.09)
1044
+ self.ctran1 = BasicLayer(256, 3, 8, 7, drop_path=0.07)
1045
+ self.ctran2 = BasicLayer(192, 3, 6, 7, drop_path=0.05)
1046
+ self.ctran3 = BasicLayer(128, 3, 4, 7, drop_path=0.03)
1047
+ self.conv5 = nn.Sequential(
1048
+ nn.Conv2d(in_channels=192,
1049
+ out_channels=64,
1050
+ kernel_size=3,
1051
+ stride=1,
1052
+ padding=1,
1053
+ bias=True), nn.PReLU(64),
1054
+ nn.Conv2d(in_channels=64,
1055
+ out_channels=64,
1056
+ kernel_size=3,
1057
+ stride=1,
1058
+ padding=1,
1059
+ bias=True), nn.PReLU(64),
1060
+ nn.Conv2d(in_channels=64,
1061
+ out_channels=48,
1062
+ kernel_size=3,
1063
+ stride=1,
1064
+ padding=1,
1065
+ bias=True), nn.PReLU(48))
1066
+ self.convo = nn.Sequential(
1067
+ nn.Conv2d(in_channels=48 + 48 + 6,
1068
+ out_channels=32,
1069
+ kernel_size=3,
1070
+ stride=1,
1071
+ padding=1,
1072
+ bias=True), nn.PReLU(32),
1073
+ nn.Conv2d(in_channels=32,
1074
+ out_channels=32,
1075
+ kernel_size=3,
1076
+ stride=1,
1077
+ padding=1,
1078
+ bias=True), nn.PReLU(32),
1079
+ nn.Conv2d(in_channels=32,
1080
+ out_channels=1,
1081
+ kernel_size=3,
1082
+ stride=1,
1083
+ padding=1,
1084
+ bias=True))
1085
+ self.up = nn.Upsample(scale_factor=2,
1086
+ mode='bilinear',
1087
+ align_corners=False)
1088
+ self.upn = nn.Upsample(scale_factor=2, mode='nearest')
1089
+ self.apptrans = nn.Sequential(
1090
+ nn.Conv2d(256 + 384, 256, 1, 1, bias=True), ResBlock(256, 128),
1091
+ ResBlock(256, 128), nn.Conv2d(256, 512, 2, 2, bias=True),
1092
+ ResBlock(512, 128))
1093
+ self.emb = nn.Sequential(nn.Conv2d(768, 640, 1, 1, 0),
1094
+ ResBlock(640, 160))
1095
+ self.embdp = nn.Sequential(nn.Conv2d(640, 640, 1, 1, 0))
1096
+ self.h2l = nn.Conv2d(768, 256, 1, 1, 0)
1097
+ self.width = 5
1098
+ self.trans1 = AEALblock(d_model=640,
1099
+ nhead=20,
1100
+ dim_feedforward=2048,
1101
+ dropout=0.2,
1102
+ width=self.width)
1103
+ self.trans2 = AEALblock(d_model=640,
1104
+ nhead=20,
1105
+ dim_feedforward=2048,
1106
+ dropout=0.2,
1107
+ width=self.width)
1108
+ self.trans3 = AEALblock(d_model=640,
1109
+ nhead=20,
1110
+ dim_feedforward=2048,
1111
+ dropout=0.2,
1112
+ width=self.width)
1113
+
1114
+ def aeal(self, x, sem):
1115
+ xe = self.emb(x)
1116
+ x_ = xe
1117
+ x_ = self.embdp(x_)
1118
+ b, c, h1, w1 = x_.shape
1119
+ bnew_ph = int(np.ceil(h1 / self.width) * self.width) - h1
1120
+ bnew_pw = int(np.ceil(w1 / self.width) * self.width) - w1
1121
+ newph1 = bnew_ph // 2
1122
+ newph2 = bnew_ph - newph1
1123
+ newpw1 = bnew_pw // 2
1124
+ newpw2 = bnew_pw - newpw1
1125
+ x_ = F.pad(x_, (newpw1, newpw2, newph1, newph2))
1126
+ sem = F.pad(sem, (newpw1, newpw2, newph1, newph2))
1127
+ x_ = self.trans1(x_, sem)
1128
+ x_ = self.trans2(x_, sem)
1129
+ x_ = self.trans3(x_, sem)
1130
+ x_ = x_[:, :, newph1:h1 + newph1, newpw1:w1 + newpw1]
1131
+ return x_
1132
+
1133
+ def forward(self, x, y):
1134
+ inputs = torch.cat((x, y), 1)
1135
+ x = self.start_conv0(inputs)
1136
+ x_ = self.start_conv(x)
1137
+ x1, x2, x3, x4 = self.trans(x_)
1138
+ x4h = self.h2l(x4)
1139
+ x3s = self.apptrans(torch.cat([x3, self.upn(x4h)], 1))
1140
+ x4_ = self.aeal(x4, x3s)
1141
+ x4 = torch.cat((x4, x4_), 1)
1142
+ X4 = self.conv1(x4)
1143
+ wh, ww = X4.shape[2], X4.shape[3]
1144
+ X4 = rearrange(X4, 'b c h w -> b (h w) c')
1145
+ X4, _, _, _, _, _ = self.ctran0(X4, wh, ww)
1146
+ X4 = rearrange(X4, 'b (h w) c -> b c h w', h=wh, w=ww)
1147
+ X3 = self.up(X4)
1148
+ X3 = torch.cat((x3, X3), 1)
1149
+ X3 = self.conv2(X3)
1150
+ wh, ww = X3.shape[2], X3.shape[3]
1151
+ X3 = rearrange(X3, 'b c h w -> b (h w) c')
1152
+ X3, _, _, _, _, _ = self.ctran1(X3, wh, ww)
1153
+ X3 = rearrange(X3, 'b (h w) c -> b c h w', h=wh, w=ww)
1154
+ X2 = self.up(X3)
1155
+ X2 = torch.cat((x2, X2), 1)
1156
+ X2 = self.conv3(X2)
1157
+ wh, ww = X2.shape[2], X2.shape[3]
1158
+ X2 = rearrange(X2, 'b c h w -> b (h w) c')
1159
+ X2, _, _, _, _, _ = self.ctran2(X2, wh, ww)
1160
+ X2 = rearrange(X2, 'b (h w) c -> b c h w', h=wh, w=ww)
1161
+ X1 = self.up(X2)
1162
+ X1 = torch.cat((x1, X1), 1)
1163
+ X1 = self.conv4(X1)
1164
+ wh, ww = X1.shape[2], X1.shape[3]
1165
+ X1 = rearrange(X1, 'b c h w -> b (h w) c')
1166
+ X1, _, _, _, _, _ = self.ctran3(X1, wh, ww)
1167
+ X1 = rearrange(X1, 'b (h w) c -> b c h w', h=wh, w=ww)
1168
+ X0 = self.up(X1)
1169
+ X0 = torch.cat((x_, X0), 1)
1170
+ X0 = self.conv5(X0)
1171
+ X = self.up(X0)
1172
+ X = torch.cat((inputs, x, X), 1)
1173
+ alpha = self.convo(X)
1174
+ alpha = torch.clamp(alpha, min=0, max=1)
1175
+ return alpha
1176
+
1177
+
1178
+ class load_AEMatter_Model:
1179
+
1180
+ def __init__(self):
1181
+ pass
1182
+
1183
+ @classmethod
1184
+ def INPUT_TYPES(s):
1185
+ return {
1186
+ "required": {},
1187
+ }
1188
+
1189
+ RETURN_TYPES = ("AEMatter_Model", )
1190
+ FUNCTION = "test"
1191
+ CATEGORY = "AEMatter"
1192
+
1193
+ def test(self):
1194
+ return (get_AEMatter_model(get_model_path()), )
1195
+
1196
+
1197
+ class run_AEMatter_inference:
1198
+
1199
+ def __init__(self):
1200
+ pass
1201
+
1202
+ @classmethod
1203
+ def INPUT_TYPES(s):
1204
+ return {
1205
+ "required": {
1206
+ "image": ("IMAGE", ),
1207
+ "trimap": ("MASK", ),
1208
+ "AEMatter_Model": ("AEMatter_Model", ),
1209
+ },
1210
+ }
1211
+
1212
+ RETURN_TYPES = ("MASK", )
1213
+ FUNCTION = "test"
1214
+ CATEGORY = "AEMatter"
1215
+
1216
+ def test(
1217
+ self,
1218
+ image,
1219
+ trimap,
1220
+ AEMatter_Model,
1221
+ ):
1222
+
1223
+ ret = []
1224
+ batch_size = image.shape[0]
1225
+
1226
+ for i in range(batch_size):
1227
+ tmp_i = from_torch_image(image[i])
1228
+ tmp_m = from_torch_image(trimap[i])
1229
+ tmp = do_infer(tmp_i, tmp_m, AEMatter_Model)
1230
+ ret.append(tmp)
1231
+
1232
+ ret = to_torch_image(np.array(ret))
1233
+ ret = ret.squeeze(-1)
1234
+ print(ret.shape)
1235
+
1236
+ return (ret, )
1237
+
1238
+
1239
+ #!/usr/bin/python3
1240
+ NODE_CLASS_MAPPINGS = {
1241
+ 'load_AEMatter_Model': load_AEMatter_Model,
1242
+ 'run_AEMatter_inference': run_AEMatter_inference,
1243
+ }
1244
+
1245
+ NODE_DISPLAY_NAME_MAPPINGS = {
1246
+ 'load_AEMatter_Model': 'load_AEMatter_Model',
1247
+ 'run_AEMatter_inference': 'run_AEMatter_inference',
1248
+ }
ComfyUI_AEMatter/AEMatter.run.sh ADDED
@@ -0,0 +1,3 @@
+ #!/bin/sh
+ . "${HOME}/dbnew.sh"
+ python3 './AEMatter.py'
ComfyUI_AEMatter/README.org ADDED
@@ -0,0 +1,1357 @@
1
+ * COMMENT SAMPLE
2
+
3
+ ** AEMatter.import.py
4
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.import.py
5
+ #+end_src
6
+
7
+ ** AEMatter.function.py
8
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.function.py
9
+ #+end_src
10
+
11
+ ** AEMatter.class.py
12
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
13
+ #+end_src
14
+
15
+ ** AEMatter.execute.py
16
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.execute.py
17
+ #+end_src
18
+
19
+ ** AEMatter.unify.sh
20
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./AEMatter.unify.sh
21
+ #+end_src
22
+
23
+ ** AEMatter.run.sh
24
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./AEMatter.run.sh
25
+ #+end_src
26
+
27
+ * Code for AEMatter inference
28
+
29
+ ** AEMatter.import.py
30
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.import.py
31
+ import cv2
32
+ import math
33
+ import numpy as np
34
+ import os
35
+ import random
36
+ import wget
37
+
38
+ import torch
39
+ import torch.nn as nn
40
+ from torch.nn import init
41
+ import torch.nn.functional as F
42
+ import torch.utils.checkpoint as checkpoint
43
+
44
+ from collections import OrderedDict
45
+ from einops import rearrange, repeat
46
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
47
+
48
+ import folder_paths
49
+ from folder_paths import models_dir
50
+ #+end_src
51
+
52
+ ** Functions to prepare directory structure and download models
53
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.function.py
54
+ def mkdir_safe(out_path):
55
+ if type(out_path) == str:
56
+ if len(out_path) > 0:
57
+ if not os.path.exists(out_path):
58
+ os.mkdir(out_path)
59
+
60
+
61
+ def get_model_path():
62
+ import folder_paths
63
+ from folder_paths import models_dir
64
+
65
+ path_file_model = models_dir
66
+ mkdir_safe(out_path=path_file_model)
67
+
68
+ path_file_model = os.path.join(path_file_model, 'AEMatter')
69
+ mkdir_safe(out_path=path_file_model)
70
+
71
+ path_file_model = os.path.join(path_file_model, 'AEM_RWA.ckpt')
72
+
73
+ return path_file_model
74
+
75
+
76
+ def download_model(path):
77
+ if not os.path.exists(path):
78
+ wget.download(
79
+ 'https://huggingface.co/aravindhv10/Self-Correction-Human-Parsing/resolve/main/checkpoints/AEMatter/AEM_RWA.ckpt?download=true',
80
+ out=path)
81
+
82
+
83
+ def from_torch_image(image):
84
+ image = image.cpu().numpy() * 255.0
85
+ image = np.clip(image, 0, 255).astype(np.uint8)
86
+ return image
87
+
88
+
89
+ def to_torch_image(image):
90
+ image = image.astype(dtype=np.float32)
91
+ image /= 255.0
92
+ image = torch.from_numpy(image)
93
+ return image
94
+ #+end_src
95
+
96
+ ** AEMatter.function.py
97
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.function.py
98
+ def window_partition(x, window_size):
99
+ """
100
+ Args:
101
+ x: (B, H, W, C)
102
+ window_size (int): window size
103
+ Returns:
104
+ windows: (num_windows*B, window_size, window_size, C)
105
+ """
106
+ B, H, W, C = x.shape
107
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
108
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
109
+ return windows
110
+ #+end_src
111
+
112
+ ** AEMatter.function.py
113
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.function.py
114
+ def window_reverse(windows, window_size, H, W):
115
+ """
116
+ Args:
117
+ windows: (num_windows*B, window_size, window_size, C)
118
+ window_size (int): Window size
119
+ H (int): Height of image
120
+ W (int): Width of image
121
+ Returns:
122
+ x: (B, H, W, C)
123
+ """
124
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
125
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
126
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
127
+ return x
128
+ #+end_src
129
+
130
+ ** AEMatter.class.py
131
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
132
+ class WindowAttention(nn.Module):
133
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
134
+ It supports both of shifted and non-shifted window.
135
+ Args:
136
+ dim (int): Number of input channels.
137
+ window_size (tuple[int]): The height and width of the window.
138
+ num_heads (int): Number of attention heads.
139
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
140
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
141
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
142
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
143
+ """
144
+
145
+ def __init__(self,
146
+ dim,
147
+ window_size,
148
+ num_heads,
149
+ qkv_bias=True,
150
+ qk_scale=None,
151
+ attn_drop=0.,
152
+ proj_drop=0.):
153
+
154
+ super().__init__()
155
+ self.dim = dim
156
+ self.window_size = window_size # Wh, Ww
157
+ self.num_heads = num_heads
158
+ head_dim = dim // num_heads
159
+ self.scale = qk_scale or head_dim**-0.5
160
+
161
+ # define a parameter table of relative position bias
162
+ self.relative_position_bias_table = nn.Parameter(
163
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
164
+ num_heads)) # 2*Wh-1 * 2*Ww-1, nH
165
+
166
+ # get pair-wise relative position index for each token inside the window
167
+ coords_h = torch.arange(self.window_size[0])
168
+ coords_w = torch.arange(self.window_size[1])
169
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
170
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
171
+ relative_coords = coords_flatten[:, :,
172
+ None] - coords_flatten[:,
173
+ None, :] # 2, Wh*Ww, Wh*Ww
174
+ relative_coords = relative_coords.permute(
175
+ 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
176
+ relative_coords[:, :,
177
+ 0] += self.window_size[0] - 1 # shift to start from 0
178
+ relative_coords[:, :, 1] += self.window_size[1] - 1
179
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
180
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
181
+ self.register_buffer("relative_position_index",
182
+ relative_position_index)
183
+
184
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
185
+ self.attn_drop = nn.Dropout(attn_drop)
186
+ self.proj = nn.Linear(dim, dim)
187
+ self.proj_drop = nn.Dropout(proj_drop)
188
+
189
+ trunc_normal_(self.relative_position_bias_table, std=.02)
190
+ self.softmax = nn.Softmax(dim=-1)
191
+
192
+ def forward(self, x, mask=None):
193
+ """ Forward function.
194
+ Args:
195
+ x: input features with shape of (num_windows*B, N, C)
196
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
197
+ """
198
+ B_, N, C = x.shape
199
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
200
+ C // self.num_heads).permute(2, 0, 3, 1, 4)
201
+ q, k, v = qkv[0], qkv[1], qkv[
202
+ 2] # make torchscript happy (cannot use tensor as tuple)
203
+
204
+ q = q * self.scale
205
+ attn = (q @ k.transpose(-2, -1))
206
+
207
+ relative_position_bias = self.relative_position_bias_table[
208
+ self.relative_position_index.view(-1)].view(
209
+ self.window_size[0] * self.window_size[1],
210
+ self.window_size[0] * self.window_size[1],
211
+ -1) # Wh*Ww,Wh*Ww,nH
212
+ relative_position_bias = relative_position_bias.permute(
213
+ 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
214
+ attn = attn + relative_position_bias.unsqueeze(0)
215
+
216
+ if mask is not None:
217
+ nW = mask.shape[0]
218
+ attn = attn.view(B_ // nW, nW, self.num_heads, N,
219
+ N) + mask.unsqueeze(1).unsqueeze(0)
220
+ attn = attn.view(-1, self.num_heads, N, N)
221
+ attn = self.softmax(attn)
222
+ else:
223
+ attn = self.softmax(attn)
224
+
225
+ attn = self.attn_drop(attn)
226
+
227
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
228
+ x = self.proj(x)
229
+ x = self.proj_drop(x)
230
+ return x
231
+ #+end_src
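+ 
+ A minimal shape check for the attention module above (assuming the imports from AEMatter.import.py, including timm's trunc_normal_, are available). It maps window tokens to window tokens of the same shape:
+ #+begin_src python :results output :tangle no
+ import torch
+ 
+ attn = WindowAttention(dim=96, window_size=(7, 7), num_heads=3)
+ tokens = torch.randn(8, 49, 96)   # (num_windows*B, Wh*Ww, C)
+ out = attn(tokens)                # no mask: plain W-MSA
+ assert out.shape == (8, 49, 96)
+ #+end_src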
232
+
233
+ ** AEMatter.class.py
234
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
235
+ class SwinTransformerBlock(nn.Module):
236
+ """ Swin Transformer Block.
237
+ Args:
238
+ dim (int): Number of input channels.
239
+ num_heads (int): Number of attention heads.
240
+ window_size (int): Window size.
241
+ shift_size (int): Shift size for SW-MSA.
242
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
243
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
244
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
245
+ drop (float, optional): Dropout rate. Default: 0.0
246
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
247
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
248
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
249
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
250
+ """
251
+
252
+ def __init__(self,
253
+ dim,
254
+ num_heads,
255
+ window_size=7,
256
+ shift_size=0,
257
+ mlp_ratio=4.,
258
+ qkv_bias=True,
259
+ qk_scale=None,
260
+ drop=0.,
261
+ attn_drop=0.,
262
+ drop_path=0.,
263
+ act_layer=nn.GELU,
264
+ norm_layer=nn.LayerNorm):
265
+ super().__init__()
266
+ self.dim = dim
267
+ self.num_heads = num_heads
268
+ self.window_size = window_size
269
+ self.shift_size = shift_size
270
+ self.mlp_ratio = mlp_ratio
271
+ assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
272
+
273
+ self.norm1 = norm_layer(dim)
274
+ self.attn = WindowAttention(dim,
275
+ window_size=to_2tuple(self.window_size),
276
+ num_heads=num_heads,
277
+ qkv_bias=qkv_bias,
278
+ qk_scale=qk_scale,
279
+ attn_drop=attn_drop,
280
+ proj_drop=drop)
281
+
282
+ self.drop_path = DropPath(
283
+ drop_path) if drop_path > 0. else nn.Identity()
284
+ self.norm2 = norm_layer(dim)
285
+ mlp_hidden_dim = int(dim * mlp_ratio)
286
+ self.mlp = Mlp(in_features=dim,
287
+ hidden_features=mlp_hidden_dim,
288
+ act_layer=act_layer,
289
+ drop=drop)
290
+
291
+ self.H = None
292
+ self.W = None
293
+
294
+ def forward(self, x, mask_matrix):
295
+ """ Forward function.
296
+ Args:
297
+ x: Input feature, tensor size (B, H*W, C).
298
+ H, W: Spatial resolution of the input feature.
299
+ mask_matrix: Attention mask for cyclic shift.
300
+ """
301
+ B, L, C = x.shape
302
+ H, W = self.H, self.W
303
+ assert L == H * W, "input feature has wrong size"
304
+
305
+ shortcut = x
306
+ x = self.norm1(x)
307
+ x = x.view(B, H, W, C)
308
+
309
+ # pad feature maps to multiples of window size
310
+ pad_l = pad_t = 0
311
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
312
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
313
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
314
+ _, Hp, Wp, _ = x.shape
315
+
316
+ # cyclic shift
317
+ if self.shift_size > 0:
318
+ shifted_x = torch.roll(x,
319
+ shifts=(-self.shift_size, -self.shift_size),
320
+ dims=(1, 2))
321
+ attn_mask = mask_matrix
322
+ else:
323
+ shifted_x = x
324
+ attn_mask = None
325
+
326
+ # partition windows
327
+ x_windows = window_partition(
328
+ shifted_x, self.window_size) # nW*B, window_size, window_size, C
329
+ x_windows = x_windows.view(-1, self.window_size * self.window_size,
330
+ C) # nW*B, window_size*window_size, C
331
+
332
+ # W-MSA/SW-MSA
333
+ attn_windows = self.attn(
334
+ x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
335
+
336
+ # merge windows
337
+ attn_windows = attn_windows.view(-1, self.window_size,
338
+ self.window_size, C)
339
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp,
340
+ Wp) # B H' W' C
341
+
342
+ # reverse cyclic shift
343
+ if self.shift_size > 0:
344
+ x = torch.roll(shifted_x,
345
+ shifts=(self.shift_size, self.shift_size),
346
+ dims=(1, 2))
347
+ else:
348
+ x = shifted_x
349
+
350
+ if pad_r > 0 or pad_b > 0:
351
+ x = x[:, :H, :W, :].contiguous()
352
+
353
+ x = x.view(B, H * W, C)
354
+
355
+ # FFN
356
+ x = shortcut + self.drop_path(x)
357
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
358
+
359
+ return x
360
+ #+end_src
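+ 
+ The block reads its spatial resolution from the H and W attributes and only needs an attention mask when shift_size > 0. A minimal sketch (assuming all classes tangled into AEMatter.class.py are defined, since the block uses Mlp internally):
+ #+begin_src python :results output :tangle no
+ import torch
+ 
+ blk = SwinTransformerBlock(dim=96, num_heads=3, window_size=7, shift_size=0)
+ blk.H, blk.W = 14, 14                  # spatial size must be set before forward
+ x = torch.randn(1, 14 * 14, 96)        # (B, H*W, C)
+ out = blk(x, mask_matrix=None)         # shift_size == 0, so no mask is needed
+ assert out.shape == (1, 196, 96)
+ #+end_src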
361
+
362
+ ** AEMatter.class.py
363
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
364
+ class PatchMerging(nn.Module):
365
+ """ Patch Merging Layer
366
+ Args:
367
+ dim (int): Number of input channels.
368
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
369
+ """
370
+
371
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
372
+ super().__init__()
373
+ self.dim = dim
374
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
375
+ self.norm = norm_layer(4 * dim)
376
+
377
+ def forward(self, x, H, W):
378
+ """ Forward function.
379
+ Args:
380
+ x: Input feature, tensor size (B, H*W, C).
381
+ H, W: Spatial resolution of the input feature.
382
+ """
383
+ B, L, C = x.shape
384
+ assert L == H * W, "input feature has wrong size"
385
+
386
+ x = x.view(B, H, W, C)
387
+
388
+ # padding
389
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
390
+ if pad_input:
391
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
392
+
393
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
394
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
395
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
396
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
397
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
398
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
399
+
400
+ x = self.norm(x)
401
+ x = self.reduction(x)
402
+
403
+ return x
404
+ #+end_src
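+ 
+ Patch merging halves the spatial resolution and doubles the channel count by concatenating each 2x2 neighbourhood and projecting 4C to 2C. A minimal shape check under the same assumptions as above:
+ #+begin_src python :results output :tangle no
+ import torch
+ 
+ pm = PatchMerging(dim=96)
+ x = torch.randn(1, 14 * 14, 96)    # (B, H*W, C)
+ y = pm(x, 14, 14)
+ assert y.shape == (1, 7 * 7, 192)  # H/2 * W/2 tokens, 2*C channels
+ #+end_src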
405
+
406
+
407
+ ** AEMatter.class.py
408
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
409
+ class BasicLayer(nn.Module):
410
+ """ A basic Swin Transformer layer for one stage.
411
+ Args:
412
+ dim (int): Number of feature channels
413
+ depth (int): Number of blocks in this stage.
414
+ num_heads (int): Number of attention heads.
415
+ window_size (int): Local window size. Default: 7.
416
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
417
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
418
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
419
+ drop (float, optional): Dropout rate. Default: 0.0
420
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
421
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
422
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
423
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
424
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
425
+ """
426
+
427
+ def __init__(self,
428
+ dim,
429
+ depth,
430
+ num_heads,
431
+ window_size=7,
432
+ mlp_ratio=4.,
433
+ qkv_bias=True,
434
+ qk_scale=None,
435
+ drop=0.,
436
+ attn_drop=0.,
437
+ drop_path=0.,
438
+ norm_layer=nn.LayerNorm,
439
+ downsample=None,
440
+ use_checkpoint=False):
441
+
442
+ super().__init__()
443
+ self.window_size = window_size
444
+ self.shift_size = window_size // 2
445
+ self.depth = depth
446
+ self.use_checkpoint = use_checkpoint
447
+
448
+ # build blocks
449
+ self.blocks = nn.ModuleList([
450
+ SwinTransformerBlock(dim=dim,
451
+ num_heads=num_heads,
452
+ window_size=window_size,
453
+ shift_size=0 if
454
+ (i % 2 == 0) else window_size // 2,
455
+ mlp_ratio=mlp_ratio,
456
+ qkv_bias=qkv_bias,
457
+ qk_scale=qk_scale,
458
+ drop=drop,
459
+ attn_drop=attn_drop,
460
+ drop_path=drop_path[i] if isinstance(
461
+ drop_path, list) else drop_path,
462
+ norm_layer=norm_layer) for i in range(depth)
463
+ ])
464
+
465
+ # patch merging layer
466
+ if downsample is not None:
467
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
468
+ else:
469
+ self.downsample = None
470
+
471
+ def forward(self, x, H, W):
472
+ """ Forward function.
473
+ Args:
474
+ x: Input feature, tensor size (B, H*W, C).
475
+ H, W: Spatial resolution of the input feature.
476
+ """
477
+ # print(x.shape,H,W)
478
+ # calculate attention mask for SW-MSA
479
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
480
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
481
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
482
+ h_slices = (slice(0, -self.window_size),
483
+ slice(-self.window_size,
484
+ -self.shift_size), slice(-self.shift_size, None))
485
+ w_slices = (slice(0, -self.window_size),
486
+ slice(-self.window_size,
487
+ -self.shift_size), slice(-self.shift_size, None))
488
+ cnt = 0
489
+ for h in h_slices:
490
+ for w in w_slices:
491
+ img_mask[:, h, w, :] = cnt
492
+ cnt += 1
493
+
494
+ mask_windows = window_partition(
495
+ img_mask, self.window_size) # nW, window_size, window_size, 1
496
+
497
+ mask_windows = mask_windows.view(-1,
498
+ self.window_size * self.window_size)
499
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(
500
+ 2)  # nW, window_size*window_size, window_size*window_size
501
+ attn_mask = attn_mask.masked_fill(attn_mask != 0,
502
+ float(-100.0)).masked_fill(
503
+ attn_mask == 0, float(0.0))
504
+
505
+ for blk in self.blocks:
506
+ blk.H, blk.W = H, W
507
+ if self.use_checkpoint:
508
+ x = checkpoint.checkpoint(blk, x, attn_mask)
509
+ else:
510
+ x = blk(x, attn_mask)
511
+
512
+ if self.downsample is not None:
513
+ x_down = self.downsample(x, H, W)
514
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
515
+ return x, H, W, x_down, Wh, Ww
516
+ else:
517
+ return x, H, W, x, H, W
518
+ #+end_src
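+ 
+ One stage alternates W-MSA and SW-MSA blocks and can end with a PatchMerging downsample; the forward pass returns both the pre-downsample and post-downsample features together with their resolutions. A minimal sketch:
+ #+begin_src python :results output :tangle no
+ import torch
+ 
+ layer = BasicLayer(dim=96, depth=2, num_heads=3, window_size=7, downsample=PatchMerging)
+ x = torch.randn(1, 14 * 14, 96)
+ x_out, H, W, x_down, Wh, Ww = layer(x, 14, 14)
+ assert x_out.shape == (1, 196, 96)      # features at the input resolution
+ assert x_down.shape == (1, 49, 192)     # features after patch merging
+ assert (H, W, Wh, Ww) == (14, 14, 7, 7)
+ #+end_src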
519
+
520
+ ** AEMatter.class.py
521
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
522
+ class PatchEmbed(nn.Module):
523
+ """ Image to Patch Embedding
524
+ Args:
525
+ patch_size (int): Patch token size. Default: 4.
526
+ in_chans (int): Number of input image channels. Default: 3.
527
+ embed_dim (int): Number of linear projection output channels. Default: 96.
528
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
529
+ """
530
+
531
+ def __init__(self,
532
+ patch_size=4,
533
+ in_chans=3,
534
+ embed_dim=96,
535
+ norm_layer=None):
536
+
537
+ super().__init__()
538
+ patch_size = to_2tuple(patch_size)
539
+ self.patch_size = patch_size
540
+
541
+ self.in_chans = in_chans
542
+ self.embed_dim = embed_dim
543
+
544
+ self.proj = nn.Conv2d(in_chans,
545
+ embed_dim,
546
+ kernel_size=patch_size,
547
+ stride=patch_size)
548
+ if norm_layer is not None:
549
+ self.norm = norm_layer(embed_dim)
550
+ else:
551
+ self.norm = None
552
+
553
+ def forward(self, x):
554
+ """Forward function."""
555
+ # padding
556
+ _, _, H, W = x.size()
557
+ if W % self.patch_size[1] != 0:
558
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
559
+ if H % self.patch_size[0] != 0:
560
+ x = F.pad(x,
561
+ (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
562
+
563
+ x = self.proj(x) # B C Wh Ww
564
+ if self.norm is not None:
565
+ Wh, Ww = x.size(2), x.size(3)
566
+ x = x.flatten(2).transpose(1, 2)
567
+ x = self.norm(x)
568
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
569
+
570
+ return x
571
+ #+end_src
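+ 
+ The patch embedding is just a strided convolution, so with the default patch_size=4 a 224x224 image becomes a 56x56 grid of 96-channel tokens. A minimal shape check:
+ #+begin_src python :results output :tangle no
+ import torch
+ import torch.nn as nn
+ 
+ pe = PatchEmbed(patch_size=4, in_chans=3, embed_dim=96, norm_layer=nn.LayerNorm)
+ img = torch.randn(1, 3, 224, 224)
+ feat = pe(img)
+ assert feat.shape == (1, 96, 56, 56)
+ #+end_src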
572
+
573
+
574
+ ** AEMatter.class.py
575
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
576
+ class SwinTransformer(nn.Module):
577
+ """ Swin Transformer backbone.
578
+ A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
579
+ https://arxiv.org/pdf/2103.14030
580
+ Args:
581
+ pretrain_img_size (int): Input image size for training the pretrained model,
582
+ used in absolute position embedding. Default 224.
583
+ patch_size (int | tuple(int)): Patch size. Default: 4.
584
+ in_chans (int): Number of input image channels. Default: 3.
585
+ embed_dim (int): Number of linear projection output channels. Default: 96.
586
+ depths (tuple[int]): Depths of each Swin Transformer stage.
587
+ num_heads (tuple[int]): Number of attention heads in each stage.
588
+ window_size (int): Window size. Default: 7.
589
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
590
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
591
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
592
+ drop_rate (float): Dropout rate.
593
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
594
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
595
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
596
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
597
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
598
+ out_indices (Sequence[int]): Output from which stages.
599
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
600
+ -1 means not freezing any parameters.
601
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
602
+ """
603
+
604
+ def __init__(self,
605
+ pretrain_img_size=224,
606
+ patch_size=4,
607
+ in_chans=3,
608
+ embed_dim=96,
609
+ depths=[2, 2, 6, 2],
610
+ num_heads=[3, 6, 12, 24],
611
+ window_size=7,
612
+ mlp_ratio=4.,
613
+ qkv_bias=True,
614
+ qk_scale=None,
615
+ drop_rate=0.,
616
+ attn_drop_rate=0.,
617
+ drop_path_rate=0.2,
618
+ norm_layer=nn.LayerNorm,
619
+ ape=False,
620
+ patch_norm=True,
621
+ out_indices=(0, 1, 2, 3),
622
+ frozen_stages=-1,
623
+ use_checkpoint=False):
624
+
625
+ super().__init__()
626
+
627
+ self.pretrain_img_size = pretrain_img_size
628
+ self.num_layers = len(depths)
629
+ self.embed_dim = embed_dim
630
+ self.ape = ape
631
+ self.patch_norm = patch_norm
632
+ self.out_indices = out_indices
633
+ self.frozen_stages = frozen_stages
634
+
635
+ # split image into non-overlapping patches
636
+ self.patch_embed = PatchEmbed(
637
+ patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
638
+ norm_layer=norm_layer if self.patch_norm else None)
639
+
640
+ # absolute position embedding
641
+ if self.ape:
642
+ pretrain_img_size = to_2tuple(pretrain_img_size)
643
+ patch_size = to_2tuple(patch_size)
644
+ patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]
645
+
646
+ self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
647
+ trunc_normal_(self.absolute_pos_embed, std=.02)
648
+
649
+ self.pos_drop = nn.Dropout(p=drop_rate)
650
+
651
+ # stochastic depth
652
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
653
+
654
+ # build layers
655
+ self.layers = nn.ModuleList()
656
+ for i_layer in range(self.num_layers):
657
+ layer = BasicLayer(
658
+ dim=int(embed_dim * 2 ** i_layer),
659
+ depth=depths[i_layer],
660
+ num_heads=num_heads[i_layer],
661
+ window_size=window_size,
662
+ mlp_ratio=mlp_ratio,
663
+ qkv_bias=qkv_bias,
664
+ qk_scale=qk_scale,
665
+ drop=drop_rate,
666
+ attn_drop=attn_drop_rate,
667
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
668
+ norm_layer=norm_layer,
669
+ downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
670
+ use_checkpoint=use_checkpoint)
671
+ self.layers.append(layer)
672
+
673
+ num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
674
+ self.num_features = num_features
675
+
676
+ # add a norm layer for each output
677
+ for i_layer in out_indices:
678
+ layer = norm_layer(num_features[i_layer])
679
+ layer_name = f'norm{i_layer}'
680
+ self.add_module(layer_name, layer)
681
+
682
+ self._freeze_stages()
683
+
684
+ def _freeze_stages(self):
685
+ if self.frozen_stages >= 0:
686
+ self.patch_embed.eval()
687
+ for param in self.patch_embed.parameters():
688
+ param.requires_grad = False
689
+
690
+ if self.frozen_stages >= 1 and self.ape:
691
+ self.absolute_pos_embed.requires_grad = False
692
+
693
+ if self.frozen_stages >= 2:
694
+ self.pos_drop.eval()
695
+ for i in range(0, self.frozen_stages - 1):
696
+ m = self.layers[i]
697
+ m.eval()
698
+ for param in m.parameters():
699
+ param.requires_grad = False
700
+
701
+ def init_weights(self, pretrained=None):
702
+ """Initialize the weights in backbone.
703
+ Args:
704
+ pretrained (str, optional): Path to pre-trained weights.
705
+ Defaults to None.
706
+ """
707
+
708
+
709
+ def forward(self, x):
710
+ """Forward function."""
711
+ x = self.patch_embed(x)
712
+
713
+ Wh, Ww = x.size(2), x.size(3)
714
+ if self.ape:
715
+ # interpolate the position embedding to the corresponding size
716
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
717
+ x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
718
+ else:
719
+ x = x.flatten(2).transpose(1, 2)
720
+ x = self.pos_drop(x)
721
+
722
+ outs = []
723
+ for i in range(self.num_layers):
724
+ layer = self.layers[i]
725
+
726
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
727
+
728
+ if i in self.out_indices:
729
+ norm_layer = getattr(self, f'norm{i}')
730
+ x_out = norm_layer(x_out)
731
+
732
+ out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
733
+ outs.append(out)
734
+
735
+ return tuple(outs)
736
+
737
+ def train(self, mode=True):
738
+ """Convert the model into training mode while keep layers freezed."""
739
+ super(SwinTransformer, self).train(mode)
740
+ self._freeze_stages()
741
+ #+end_src
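+ 
+ With the default configuration the backbone returns four feature maps at strides 4, 8, 16 and 32 with 96, 192, 384 and 768 channels. A minimal forward-pass sketch on a 3-channel image (note that AEMatter below swaps the patch-embedding convolution for a 64-channel input):
+ #+begin_src python :results output :tangle no
+ import torch
+ 
+ backbone = SwinTransformer(embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24])
+ img = torch.randn(1, 3, 224, 224)
+ with torch.no_grad():
+     feats = backbone(img)
+ print([tuple(f.shape) for f in feats])
+ # expected: [(1, 96, 56, 56), (1, 192, 28, 28), (1, 384, 14, 14), (1, 768, 7, 7)]
+ #+end_src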
742
+
743
+ ** AEMatter.class.py
744
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
745
+ class Mlp(nn.Module):
746
+ """ Multilayer perceptron."""
747
+
748
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
749
+ super().__init__()
750
+ out_features = out_features or in_features
751
+ hidden_features = hidden_features or in_features
752
+ self.fc1 = nn.Linear(in_features, hidden_features)
753
+ self.act = act_layer()
754
+ self.fc2 = nn.Linear(hidden_features, out_features)
755
+ self.drop = nn.Dropout(drop)
756
+
757
+ def forward(self, x):
758
+ x = self.fc1(x)
759
+ x = self.act(x)
760
+ x = self.drop(x)
761
+ x = self.fc2(x)
762
+ x = self.drop(x)
763
+ return x
764
+ #+end_src
765
+
766
+
767
+ ** AEMatter.class.py
768
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
769
+ class ResBlock(nn.Module):
770
+
771
+ def __init__(self, inc, midc):
772
+ super(ResBlock, self).__init__()
773
+ self.conv1 = nn.Conv2d(inc,
774
+ midc,
775
+ kernel_size=1,
776
+ stride=1,
777
+ padding=0,
778
+ bias=True)
779
+ self.gn1 = nn.GroupNorm(16, midc)
780
+ self.conv2 = nn.Conv2d(midc,
781
+ midc,
782
+ kernel_size=3,
783
+ stride=1,
784
+ padding=1,
785
+ bias=True)
786
+ self.gn2 = nn.GroupNorm(16, midc)
787
+ self.conv3 = nn.Conv2d(midc,
788
+ inc,
789
+ kernel_size=1,
790
+ stride=1,
791
+ padding=0,
792
+ bias=True)
793
+ self.relu = nn.LeakyReLU(0.1)
794
+
795
+ def forward(self, x):
796
+ x_ = x
797
+ x = self.conv1(x)
798
+ x = self.gn1(x)
799
+ x = self.relu(x)
800
+ x = self.conv2(x)
801
+ x = self.gn2(x)
802
+ x = self.relu(x)
803
+ x = self.conv3(x)
804
+ x = x + x_
805
+ x = self.relu(x)
806
+ return x
807
+ #+end_src
808
+
809
+ ** AEMatter.class.py
810
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
811
+ class AEALblock(nn.Module):
812
+
813
+ def __init__(self,
814
+ d_model,
815
+ nhead,
816
+ dim_feedforward=512,
817
+ dropout=0.0,
818
+ layer_norm_eps=1e-5,
819
+ batch_first=True,
820
+ norm_first=False,
821
+ width=5):
822
+ super(AEALblock, self).__init__()
823
+ self.self_attn2 = nn.MultiheadAttention(d_model // 2,
824
+ nhead // 2,
825
+ dropout=dropout,
826
+ batch_first=batch_first)
827
+ self.self_attn1 = nn.MultiheadAttention(d_model // 2,
828
+ nhead // 2,
829
+ dropout=dropout,
830
+ batch_first=batch_first)
831
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
832
+ self.dropout = nn.Dropout(dropout)
833
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
834
+ self.norm_first = norm_first
835
+ self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
836
+ self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
837
+ self.dropout1 = nn.Dropout(dropout)
838
+ self.dropout2 = nn.Dropout(dropout)
839
+ self.activation = nn.ReLU()
840
+ self.width = width
841
+ self.trans = nn.Sequential(
842
+ nn.Conv2d(d_model + 512, d_model // 2, 1, 1, 0),
843
+ ResBlock(d_model // 2, d_model // 4),
844
+ nn.Conv2d(d_model // 2, d_model, 1, 1, 0))
845
+ self.gamma = nn.Parameter(torch.zeros(1))
846
+
847
+ def forward(
848
+ self,
849
+ src,
850
+ feats,
851
+ ):
852
+ src = self.gamma * self.trans(torch.cat([src, feats], 1)) + src
853
+ b, c, h, w = src.shape
854
+ x1 = src[:, 0:c // 2]
855
+ x1_ = rearrange(x1, 'b c (h1 h2) w -> b c h1 h2 w', h2=self.width)
856
+ x1_ = rearrange(x1_, 'b c h1 h2 w -> (b h1) (h2 w) c')
857
+ x2 = src[:, c // 2:]
858
+ x2_ = rearrange(x2, 'b c h (w1 w2) -> b c h w1 w2', w2=self.width)
859
+ x2_ = rearrange(x2_, 'b c h w1 w2 -> (b w1) (h w2) c')
860
+ x = rearrange(src, 'b c h w-> b (h w) c')
861
+ x = self.norm1(x + self._sa_block(x1_, x2_, h, w))
862
+ x = self.norm2(x + self._ff_block(x))
863
+ x = rearrange(x, 'b (h w) c->b c h w', h=h, w=w)
864
+ return x
865
+
866
+ def _sa_block(self, x1, x2, h, w):
867
+ x1 = self.self_attn1(x1,
868
+ x1,
869
+ x1,
870
+ attn_mask=None,
871
+ key_padding_mask=None,
872
+ need_weights=False)[0]
873
+
874
+ x2 = self.self_attn2(x2,
875
+ x2,
876
+ x2,
877
+ attn_mask=None,
878
+ key_padding_mask=None,
879
+ need_weights=False)[0]
880
+
881
+ x1 = rearrange(x1,
882
+ '(b h1) (h2 w) c-> b (h1 h2 w) c',
883
+ h2=self.width,
884
+ h1=h // self.width)
885
+ x2 = rearrange(x2,
886
+ ' (b w1) (h w2) c-> b (h w1 w2) c',
887
+ w2=self.width,
888
+ w1=w // self.width)
889
+ x = torch.cat([x1, x2], dim=2)
890
+ return self.dropout1(x)
891
+
892
+ def _ff_block(self, x):
893
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
894
+ return self.dropout2(x)
895
+ #+end_src
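+ 
+ AEALblock first injects the 512-channel context features through a convolutional branch scaled by a zero-initialised gamma, then splits the channels in half and runs axial self-attention: the first half attends within horizontal strips of height width, the second half within vertical strips of width width, followed by a shared feed-forward layer. A minimal shape sketch (H and W must be divisible by width; assumes torch and einops are imported as in AEMatter.import.py):
+ #+begin_src python :results output :tangle no
+ import torch
+ 
+ block = AEALblock(d_model=640, nhead=20, dim_feedforward=2048, dropout=0.0, width=5)
+ src = torch.randn(1, 640, 20, 20)     # transformer features
+ feats = torch.randn(1, 512, 20, 20)   # context features concatenated inside the block
+ out = block(src, feats)
+ assert out.shape == (1, 640, 20, 20)
+ #+end_src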
896
+
897
+ ** AEMatter.class.py
898
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
899
+ class AEMatter(nn.Module):
900
+
901
+ def __init__(self):
902
+ super(AEMatter, self).__init__()
903
+ trans = SwinTransformer(pretrain_img_size=224,
904
+ embed_dim=96,
905
+ depths=[2, 2, 6, 2],
906
+ num_heads=[3, 6, 12, 24],
907
+ window_size=7,
908
+ ape=False,
909
+ drop_path_rate=0.2,
910
+ patch_norm=True,
911
+ use_checkpoint=False)
912
+
913
+ # trans.load_state_dict(torch.load(
914
+ # '/home/asd/Desktop/swin_tiny_patch4_window7_224.pth',
915
+ # map_location="cpu")["model"],
916
+ # strict=False)
917
+
918
+ trans.patch_embed.proj = nn.Conv2d(64, 96, 3, 2, 1)
919
+
920
+ self.start_conv0 = nn.Sequential(nn.Conv2d(6, 48, 3, 1, 1),
921
+ nn.PReLU(48))
922
+
923
+ self.start_conv = nn.Sequential(nn.Conv2d(48, 64, 3, 2,
924
+ 1), nn.PReLU(64),
925
+ nn.Conv2d(64, 64, 3, 1, 1),
926
+ nn.PReLU(64))
927
+
928
+ self.trans = trans
929
+ self.conv1 = nn.Sequential(
930
+ nn.Conv2d(in_channels=640 + 768,
931
+ out_channels=256,
932
+ kernel_size=1,
933
+ stride=1,
934
+ padding=0,
935
+ bias=True))
936
+ self.conv2 = nn.Sequential(
937
+ nn.Conv2d(in_channels=256 + 384,
938
+ out_channels=256,
939
+ kernel_size=1,
940
+ stride=1,
941
+ padding=0,
942
+ bias=True), )
943
+ self.conv3 = nn.Sequential(
944
+ nn.Conv2d(in_channels=256 + 192,
945
+ out_channels=192,
946
+ kernel_size=1,
947
+ stride=1,
948
+ padding=0,
949
+ bias=True), )
950
+ self.conv4 = nn.Sequential(
951
+ nn.Conv2d(in_channels=192 + 96,
952
+ out_channels=128,
953
+ kernel_size=1,
954
+ stride=1,
955
+ padding=0,
956
+ bias=True), )
957
+ self.ctran0 = BasicLayer(256, 3, 8, 7, drop_path=0.09)
958
+ self.ctran1 = BasicLayer(256, 3, 8, 7, drop_path=0.07)
959
+ self.ctran2 = BasicLayer(192, 3, 6, 7, drop_path=0.05)
960
+ self.ctran3 = BasicLayer(128, 3, 4, 7, drop_path=0.03)
961
+ self.conv5 = nn.Sequential(
962
+ nn.Conv2d(in_channels=192,
963
+ out_channels=64,
964
+ kernel_size=3,
965
+ stride=1,
966
+ padding=1,
967
+ bias=True), nn.PReLU(64),
968
+ nn.Conv2d(in_channels=64,
969
+ out_channels=64,
970
+ kernel_size=3,
971
+ stride=1,
972
+ padding=1,
973
+ bias=True), nn.PReLU(64),
974
+ nn.Conv2d(in_channels=64,
975
+ out_channels=48,
976
+ kernel_size=3,
977
+ stride=1,
978
+ padding=1,
979
+ bias=True), nn.PReLU(48))
980
+ self.convo = nn.Sequential(
981
+ nn.Conv2d(in_channels=48 + 48 + 6,
982
+ out_channels=32,
983
+ kernel_size=3,
984
+ stride=1,
985
+ padding=1,
986
+ bias=True), nn.PReLU(32),
987
+ nn.Conv2d(in_channels=32,
988
+ out_channels=32,
989
+ kernel_size=3,
990
+ stride=1,
991
+ padding=1,
992
+ bias=True), nn.PReLU(32),
993
+ nn.Conv2d(in_channels=32,
994
+ out_channels=1,
995
+ kernel_size=3,
996
+ stride=1,
997
+ padding=1,
998
+ bias=True))
999
+ self.up = nn.Upsample(scale_factor=2,
1000
+ mode='bilinear',
1001
+ align_corners=False)
1002
+ self.upn = nn.Upsample(scale_factor=2, mode='nearest')
1003
+ self.apptrans = nn.Sequential(
1004
+ nn.Conv2d(256 + 384, 256, 1, 1, bias=True), ResBlock(256, 128),
1005
+ ResBlock(256, 128), nn.Conv2d(256, 512, 2, 2, bias=True),
1006
+ ResBlock(512, 128))
1007
+ self.emb = nn.Sequential(nn.Conv2d(768, 640, 1, 1, 0),
1008
+ ResBlock(640, 160))
1009
+ self.embdp = nn.Sequential(nn.Conv2d(640, 640, 1, 1, 0))
1010
+ self.h2l = nn.Conv2d(768, 256, 1, 1, 0)
1011
+ self.width = 5
1012
+ self.trans1 = AEALblock(d_model=640,
1013
+ nhead=20,
1014
+ dim_feedforward=2048,
1015
+ dropout=0.2,
1016
+ width=self.width)
1017
+ self.trans2 = AEALblock(d_model=640,
1018
+ nhead=20,
1019
+ dim_feedforward=2048,
1020
+ dropout=0.2,
1021
+ width=self.width)
1022
+ self.trans3 = AEALblock(d_model=640,
1023
+ nhead=20,
1024
+ dim_feedforward=2048,
1025
+ dropout=0.2,
1026
+ width=self.width)
1027
+
1028
+ def aeal(self, x, sem):
1029
+ xe = self.emb(x)
1030
+ x_ = xe
1031
+ x_ = self.embdp(x_)
1032
+ b, c, h1, w1 = x_.shape
1033
+ bnew_ph = int(np.ceil(h1 / self.width) * self.width) - h1
1034
+ bnew_pw = int(np.ceil(w1 / self.width) * self.width) - w1
1035
+ newph1 = bnew_ph // 2
1036
+ newph2 = bnew_ph - newph1
1037
+ newpw1 = bnew_pw // 2
1038
+ newpw2 = bnew_pw - newpw1
1039
+ x_ = F.pad(x_, (newpw1, newpw2, newph1, newph2))
1040
+ sem = F.pad(sem, (newpw1, newpw2, newph1, newph2))
1041
+ x_ = self.trans1(x_, sem)
1042
+ x_ = self.trans2(x_, sem)
1043
+ x_ = self.trans3(x_, sem)
1044
+ x_ = x_[:, :, newph1:h1 + newph1, newpw1:w1 + newpw1]
1045
+ return x_
1046
+
1047
+ def forward(self, x, y):
1048
+ inputs = torch.cat((x, y), 1)
1049
+ x = self.start_conv0(inputs)
1050
+ x_ = self.start_conv(x)
1051
+ x1, x2, x3, x4 = self.trans(x_)
1052
+ x4h = self.h2l(x4)
1053
+ x3s = self.apptrans(torch.cat([x3, self.upn(x4h)], 1))
1054
+ x4_ = self.aeal(x4, x3s)
1055
+ x4 = torch.cat((x4, x4_), 1)
1056
+ X4 = self.conv1(x4)
1057
+ wh, ww = X4.shape[2], X4.shape[3]
1058
+ X4 = rearrange(X4, 'b c h w -> b (h w) c')
1059
+ X4, _, _, _, _, _ = self.ctran0(X4, wh, ww)
1060
+ X4 = rearrange(X4, 'b (h w) c -> b c h w', h=wh, w=ww)
1061
+ X3 = self.up(X4)
1062
+ X3 = torch.cat((x3, X3), 1)
1063
+ X3 = self.conv2(X3)
1064
+ wh, ww = X3.shape[2], X3.shape[3]
1065
+ X3 = rearrange(X3, 'b c h w -> b (h w) c')
1066
+ X3, _, _, _, _, _ = self.ctran1(X3, wh, ww)
1067
+ X3 = rearrange(X3, 'b (h w) c -> b c h w', h=wh, w=ww)
1068
+ X2 = self.up(X3)
1069
+ X2 = torch.cat((x2, X2), 1)
1070
+ X2 = self.conv3(X2)
1071
+ wh, ww = X2.shape[2], X2.shape[3]
1072
+ X2 = rearrange(X2, 'b c h w -> b (h w) c')
1073
+ X2, _, _, _, _, _ = self.ctran2(X2, wh, ww)
1074
+ X2 = rearrange(X2, 'b (h w) c -> b c h w', h=wh, w=ww)
1075
+ X1 = self.up(X2)
1076
+ X1 = torch.cat((x1, X1), 1)
1077
+ X1 = self.conv4(X1)
1078
+ wh, ww = X1.shape[2], X1.shape[3]
1079
+ X1 = rearrange(X1, 'b c h w -> b (h w) c')
1080
+ X1, _, _, _, _, _ = self.ctran3(X1, wh, ww)
1081
+ X1 = rearrange(X1, 'b (h w) c -> b c h w', h=wh, w=ww)
1082
+ X0 = self.up(X1)
1083
+ X0 = torch.cat((x_, X0), 1)
1084
+ X0 = self.conv5(X0)
1085
+ X = self.up(X0)
1086
+ X = torch.cat((inputs, x, X), 1)
1087
+ alpha = self.convo(X)
1088
+ alpha = torch.clamp(alpha, min=0, max=1)
1089
+ return alpha
1090
+ #+end_src
1091
+
1092
+ ** Function to load model
1093
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.function.py
1094
+ def get_AEMatter_model(path_model_checkpoint):
1095
+
1096
+ download_model(path=path_model_checkpoint)
1097
+
1098
+ matmodel = AEMatter()
1099
+ matmodel.load_state_dict(
1100
+ torch.load(path_model_checkpoint, map_location='cpu')['model'])
1101
+
1102
+ matmodel = matmodel.cuda()
1103
+ matmodel.eval()
1104
+
1105
+ return matmodel
1106
+ #+end_src
1107
+
1108
+ ** Function to do inference
1109
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.function.py
1110
+ def do_infer(rawimg, trimap, matmodel):
1111
+ trimap_nonp = trimap.copy()
1112
+ h, w, c = rawimg.shape
1113
+ nonph, nonpw, _ = rawimg.shape
1114
+ newh = (((h - 1) // 32) + 1) * 32
1115
+ neww = (((w - 1) // 32) + 1) * 32
1116
+ padh = newh - h
1117
+ padh1 = int(padh / 2)
1118
+ padh2 = padh - padh1
1119
+ padw = neww - w
1120
+ padw1 = int(padw / 2)
1121
+ padw2 = padw - padw1
1122
+
1123
+ rawimg_pad = cv2.copyMakeBorder(rawimg, padh1, padh2, padw1, padw2,
1124
+ cv2.BORDER_REFLECT)
1125
+
1126
+ trimap_pad = cv2.copyMakeBorder(trimap, padh1, padh2, padw1, padw2,
1127
+ cv2.BORDER_REFLECT)
1128
+
1129
+ h_pad, w_pad, _ = rawimg_pad.shape
1130
+ tritemp = np.zeros([*trimap_pad.shape, 3], np.float32)
1131
+ tritemp[:, :, 0] = (trimap_pad == 0)
1132
+ tritemp[:, :, 1] = (trimap_pad == 128)
1133
+ tritemp[:, :, 2] = (trimap_pad == 255)
1134
+ tritempimgs = np.transpose(tritemp, (2, 0, 1))
1135
+ tritempimgs = tritempimgs[np.newaxis, :, :, :]
1136
+ img = np.transpose(rawimg_pad, (2, 0, 1))[np.newaxis, ::-1, :, :]
1137
+ img = np.array(img, np.float32)
1138
+ img = img / 255.
1139
+ img = torch.from_numpy(img).cuda()
1140
+ tritempimgs = torch.from_numpy(tritempimgs).cuda()
1141
+ with torch.no_grad():
1142
+ pred = matmodel(img, tritempimgs)
1143
+ pred = pred.detach().cpu().numpy()[0]
1144
+ pred = pred[:, padh1:padh1 + h, padw1:padw1 + w]
1145
+ preda = pred[
1146
+ 0:1,
1147
+ ] * 255
1148
+ preda = np.transpose(preda, (1, 2, 0))
1149
+ preda = preda * (trimap_nonp[:, :, None]
1150
+ == 128) + (trimap_nonp[:, :, None] == 255) * 255
1151
+ preda = np.array(preda, np.uint8)
1152
+ return preda
1153
+ #+end_src
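+ 
+ A minimal standalone usage sketch (the image and trimap paths below are placeholders; the trimap must contain only the values 0, 128 and 255, and a CUDA device is required since the model is moved to the GPU):
+ #+begin_src python :results output :tangle no
+ import cv2
+ 
+ rawimg = cv2.imread('input_rgb.png', cv2.IMREAD_COLOR)         # BGR uint8
+ trimap = cv2.imread('input_trimap.png', cv2.IMREAD_GRAYSCALE)  # 0 / 128 / 255
+ 
+ # download_model() fetches the checkpoint to this path if it is missing
+ matmodel = get_AEMatter_model('checkpoints/AEMatter/AEM_RWA.ckpt')
+ 
+ alpha = do_infer(rawimg, trimap, matmodel)  # uint8 alpha matte, shape (H, W, 1)
+ cv2.imwrite('alpha.png', alpha)
+ #+end_src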
1154
+
1155
+ ** Load ComfyUI AEMatter model
1156
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.class.py
1157
+ class load_AEMatter_Model:
1158
+
1159
+ def __init__(self):
1160
+ pass
1161
+
1162
+ @classmethod
1163
+ def INPUT_TYPES(s):
1164
+ return {
1165
+ "required": {},
1166
+ }
1167
+
1168
+ RETURN_TYPES = ("AEMatter_Model", )
1169
+ FUNCTION = "test"
1170
+ CATEGORY = "AEMatter"
1171
+
1172
+ def test(self):
1173
+ return (get_AEMatter_model(get_model_path()), )
1174
+
1175
+
1176
+ class run_AEMatter_inference:
1177
+
1178
+ def __init__(self):
1179
+ pass
1180
+
1181
+ @classmethod
1182
+ def INPUT_TYPES(s):
1183
+ return {
1184
+ "required": {
1185
+ "image": ("IMAGE", ),
1186
+ "trimap": ("MASK", ),
1187
+ "AEMatter_Model": ("AEMatter_Model", ),
1188
+ },
1189
+ }
1190
+
1191
+ RETURN_TYPES = ("MASK", )
1192
+ FUNCTION = "test"
1193
+ CATEGORY = "AEMatter"
1194
+
1195
+ def test(
1196
+ self,
1197
+ image,
1198
+ trimap,
1199
+ AEMatter_Model,
1200
+ ):
1201
+
1202
+ ret = []
1203
+ batch_size = image.shape[0]
1204
+
1205
+ for i in range(batch_size):
1206
+ tmp_i = from_torch_image(image[i])
1207
+ tmp_m = from_torch_image(trimap[i])
1208
+ tmp = do_infer(tmp_i, tmp_m, AEMatter_Model)
1209
+ ret.append(tmp)
1210
+
1211
+ ret = to_torch_image(np.array(ret))
1212
+ ret = ret.squeeze(-1)
1213
+ print(ret.shape)
1214
+
1215
+ return (ret, )
1216
+ #+end_src
1217
+
1218
+ ** Main function
1219
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.function.py
1220
+ def main():
1221
+ ptrimap = '/home/asd/Desktop/demo/retriever_trimap.png'
1222
+ pimgs = '/home/asd/Desktop/demo/retriever_rgb.png'
1223
+ p_outs = 'alpha.png'
1224
+
1225
+ matmodel = get_AEMatter_model(
1226
+ path_model_checkpoint='/home/asd/Desktop/AEM_RWA.ckpt')
1227
+
1228
+ # matmodel = AEMatter()
1229
+ # matmodel.load_state_dict(
1230
+ # torch.load('/home/asd/Desktop/AEM_RWA.ckpt',
1231
+ # map_location='cpu')['model'])
1232
+
1233
+ # matmodel = matmodel.cuda()
1234
+ # matmodel.eval()
1235
+
1236
+ rawimg = pimgs
1237
+ trimap = ptrimap
1238
+ rawimg = cv2.imread(rawimg, cv2.IMREAD_COLOR)
1239
+ trimap = cv2.imread(trimap, cv2.IMREAD_GRAYSCALE)
1240
+ trimap_nonp = trimap.copy()
1241
+ h, w, c = rawimg.shape
1242
+ nonph, nonpw, _ = rawimg.shape
1243
+ newh = (((h - 1) // 32) + 1) * 32
1244
+ neww = (((w - 1) // 32) + 1) * 32
1245
+ padh = newh - h
1246
+ padh1 = int(padh / 2)
1247
+ padh2 = padh - padh1
1248
+ padw = neww - w
1249
+ padw1 = int(padw / 2)
1250
+ padw2 = padw - padw1
1251
+ rawimg_pad = cv2.copyMakeBorder(rawimg, padh1, padh2, padw1, padw2,
1252
+ cv2.BORDER_REFLECT)
1253
+ trimap_pad = cv2.copyMakeBorder(trimap, padh1, padh2, padw1, padw2,
1254
+ cv2.BORDER_REFLECT)
1255
+ h_pad, w_pad, _ = rawimg_pad.shape
1256
+ tritemp = np.zeros([*trimap_pad.shape, 3], np.float32)
1257
+ tritemp[:, :, 0] = (trimap_pad == 0)
1258
+ tritemp[:, :, 1] = (trimap_pad == 128)
1259
+ tritemp[:, :, 2] = (trimap_pad == 255)
1260
+ tritempimgs = np.transpose(tritemp, (2, 0, 1))
1261
+ tritempimgs = tritempimgs[np.newaxis, :, :, :]
1262
+ img = np.transpose(rawimg_pad, (2, 0, 1))[np.newaxis, ::-1, :, :]
1263
+ img = np.array(img, np.float32)
1264
+ img = img / 255.
1265
+ img = torch.from_numpy(img).cuda()
1266
+ tritempimgs = torch.from_numpy(tritempimgs).cuda()
1267
+ with torch.no_grad():
1268
+ pred = matmodel(img, tritempimgs)
1269
+ pred = pred.detach().cpu().numpy()[0]
1270
+ pred = pred[:, padh1:padh1 + h, padw1:padw1 + w]
1271
+ preda = pred[
1272
+ 0:1,
1273
+ ] * 255
1274
+ preda = np.transpose(preda, (1, 2, 0))
1275
+ preda = preda * (trimap_nonp[:, :, None]
1276
+ == 128) + (trimap_nonp[:, :, None] == 255) * 255
1277
+ preda = np.array(preda, np.uint8)
1278
+ cv2.imwrite(p_outs, preda)
1279
+
1280
+ #+end_src
1281
+
1282
+ ** ComfyUI Dictionary
1283
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.execute.py
1284
+ NODE_CLASS_MAPPINGS = {
1285
+ 'load_AEMatter_Model': load_AEMatter_Model,
1286
+ 'run_AEMatter_inference': run_AEMatter_inference,
1287
+ }
1288
+
1289
+ NODE_DISPLAY_NAME_MAPPINGS = {
1290
+ 'load_AEMatter_Model': 'load_AEMatter_Model',
1291
+ 'run_AEMatter_inference': 'run_AEMatter_inference',
1292
+ }
1293
+ #+end_src
1294
+
1295
+ ** COMMENT AEMatter.execute.py
1296
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./AEMatter.execute.py
1297
+ if __name__ == '__main__':
1298
+ # main()
1299
+
1300
+ rawimg = cv2.imread('/home/asd/Desktop/demo/retriever_rgb.png',
1301
+ cv2.IMREAD_COLOR)
1302
+
1303
+ trimap = cv2.imread('/home/asd/Desktop/demo/retriever_trimap.png',
1304
+ cv2.IMREAD_GRAYSCALE)
1305
+
1306
+ do_infer(rawimg, trimap,
1307
+ get_AEMatter_model('/home/asd/Desktop/AEM_RWA.ckpt'))
1308
+ #+end_src
1309
+
1310
+ ** AEMatter.unify.sh
1311
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./AEMatter.unify.sh
1312
+ . "${HOME}/dbnew.sh"
1313
+
1314
+ cat \
1315
+ 'AEMatter.import.py' \
1316
+ 'AEMatter.function.py' \
1317
+ 'AEMatter.class.py' \
1318
+ 'AEMatter.execute.py' \
1319
+ | expand | yapf3 \
1320
+ > 'AEMatter.py' \
1321
+ ;
1322
+
1323
+ cp 'AEMatter.py' '__init__.py'
1324
+ #+end_src
1325
+
1326
+ ** AEMatter.run.sh
1327
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./AEMatter.run.sh
1328
+ . "${HOME}/dbnew.sh"
1329
+ python3 './AEMatter.py'
1330
+ #+end_src
1331
+
1332
+ #+RESULTS:
1333
+
1334
+ * COMMENT WORK SPACE
1335
+
1336
+ ** ESHELL
1337
+ #+begin_src elisp
1338
+ (save-buffer)
1339
+ (org-babel-tangle)
1340
+ (shell-command "./AEMatter.unify.sh")
1341
+ #+end_src
1342
+
1343
+ #+RESULTS:
1344
+ : 0
1345
+
1346
+ ** SHELL
1347
+ #+begin_src sh :shebang #!/bin/sh :results output
1348
+ realpath .
1349
+ cd /home/asd/GITHUB/aravind-h-v/dreambooth_experiments/AEMatter
1350
+ #+end_src
1351
+
1352
+ #+RESULTS:
1353
+
1354
+ ** SHELL
1355
+ #+begin_src sh :shebang #!/bin/sh :results output
1356
+ ls
1357
+ #+end_src
ComfyUI_AEMatter/__init__.py ADDED
@@ -0,0 +1,1248 @@
1
+ #!/usr/bin/python3
2
+ import cv2
3
+ import math
4
+ import numpy as np
5
+ import os
6
+ import random
7
+ import wget
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from torch.nn import init
12
+ import torch.nn.functional as F
13
+ import torch.utils.checkpoint as checkpoint
14
+
15
+ from collections import OrderedDict
16
+ from einops import rearrange, repeat
17
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
18
+
19
+ import folder_paths
20
+ from folder_paths import models_dir
21
+
22
+
23
+ #!/usr/bin/python3
24
+ def mkdir_safe(out_path):
25
+ if type(out_path) == str:
26
+ if len(out_path) > 0:
27
+ if not os.path.exists(out_path):
28
+ os.mkdir(out_path)
29
+
30
+
31
+ def get_model_path():
32
+ import folder_paths
33
+ from folder_paths import models_dir
34
+
35
+ path_file_model = models_dir
36
+ mkdir_safe(out_path=path_file_model)
37
+
38
+ path_file_model = os.path.join(path_file_model, 'AEMatter')
39
+ mkdir_safe(out_path=path_file_model)
40
+
41
+ path_file_model = os.path.join(path_file_model, 'AEM_RWA.ckpt')
42
+
43
+ return path_file_model
44
+
45
+
46
+ def download_model(path):
47
+ if not os.path.exists(path):
48
+ wget.download(
49
+ 'https://huggingface.co/aravindhv10/Self-Correction-Human-Parsing/resolve/main/checkpoints/AEMatter/AEM_RWA.ckpt?download=true',
50
+ out=path)
51
+
52
+
53
+ def from_torch_image(image):
54
+ image = image.cpu().numpy() * 255.0
55
+ image = np.clip(image, 0, 255).astype(np.uint8)
56
+ return image
57
+
58
+
59
+ def to_torch_image(image):
60
+ image = image.astype(dtype=np.float32)
61
+ image /= 255.0
62
+ image = torch.from_numpy(image)
63
+ return image
64
+
65
+
66
+ def window_partition(x, window_size):
67
+ """
68
+ Args:
69
+ x: (B, H, W, C)
70
+ window_size (int): window size
71
+ Returns:
72
+ windows: (num_windows*B, window_size, window_size, C)
73
+ """
74
+ B, H, W, C = x.shape
75
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size,
76
+ C)
77
+ windows = x.permute(0, 1, 3, 2, 4,
78
+ 5).contiguous().view(-1, window_size, window_size, C)
79
+ return windows
80
+
81
+
82
+ def window_reverse(windows, window_size, H, W):
83
+ """
84
+ Args:
85
+ windows: (num_windows*B, window_size, window_size, C)
86
+ window_size (int): Window size
87
+ H (int): Height of image
88
+ W (int): Width of image
89
+ Returns:
90
+ x: (B, H, W, C)
91
+ """
92
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
93
+ x = windows.view(B, H // window_size, W // window_size, window_size,
94
+ window_size, -1)
95
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
96
+ return x
97
+
98
+
99
+ def get_AEMatter_model(path_model_checkpoint):
100
+
101
+ download_model(path=path_model_checkpoint)
102
+
103
+ matmodel = AEMatter()
104
+ matmodel.load_state_dict(
105
+ torch.load(path_model_checkpoint, map_location='cpu')['model'])
106
+
107
+ matmodel = matmodel.cuda()
108
+ matmodel.eval()
109
+
110
+ return matmodel
111
+
112
+
113
+ def do_infer(rawimg, trimap, matmodel):
114
+ trimap_nonp = trimap.copy()
115
+ h, w, c = rawimg.shape
116
+ nonph, nonpw, _ = rawimg.shape
117
+ newh = (((h - 1) // 32) + 1) * 32
118
+ neww = (((w - 1) // 32) + 1) * 32
119
+ padh = newh - h
120
+ padh1 = int(padh / 2)
121
+ padh2 = padh - padh1
122
+ padw = neww - w
123
+ padw1 = int(padw / 2)
124
+ padw2 = padw - padw1
125
+
126
+ rawimg_pad = cv2.copyMakeBorder(rawimg, padh1, padh2, padw1, padw2,
127
+ cv2.BORDER_REFLECT)
128
+
129
+ trimap_pad = cv2.copyMakeBorder(trimap, padh1, padh2, padw1, padw2,
130
+ cv2.BORDER_REFLECT)
131
+
132
+ h_pad, w_pad, _ = rawimg_pad.shape
133
+ tritemp = np.zeros([*trimap_pad.shape, 3], np.float32)
134
+ tritemp[:, :, 0] = (trimap_pad == 0)
135
+ tritemp[:, :, 1] = (trimap_pad == 128)
136
+ tritemp[:, :, 2] = (trimap_pad == 255)
137
+ tritempimgs = np.transpose(tritemp, (2, 0, 1))
138
+ tritempimgs = tritempimgs[np.newaxis, :, :, :]
139
+ img = np.transpose(rawimg_pad, (2, 0, 1))[np.newaxis, ::-1, :, :]
140
+ img = np.array(img, np.float32)
141
+ img = img / 255.
142
+ img = torch.from_numpy(img).cuda()
143
+ tritempimgs = torch.from_numpy(tritempimgs).cuda()
144
+ with torch.no_grad():
145
+ pred = matmodel(img, tritempimgs)
146
+ pred = pred.detach().cpu().numpy()[0]
147
+ pred = pred[:, padh1:padh1 + h, padw1:padw1 + w]
148
+ preda = pred[
149
+ 0:1,
150
+ ] * 255
151
+ preda = np.transpose(preda, (1, 2, 0))
152
+ preda = preda * (trimap_nonp[:, :, None]
153
+ == 128) + (trimap_nonp[:, :, None] == 255) * 255
154
+ preda = np.array(preda, np.uint8)
155
+ return preda
156
+
157
+
158
+ def main():
159
+ ptrimap = '/home/asd/Desktop/demo/retriever_trimap.png'
160
+ pimgs = '/home/asd/Desktop/demo/retriever_rgb.png'
161
+ p_outs = 'alpha.png'
162
+
163
+ matmodel = get_AEMatter_model(
164
+ path_model_checkpoint='/home/asd/Desktop/AEM_RWA.ckpt')
165
+
166
+ # matmodel = AEMatter()
167
+ # matmodel.load_state_dict(
168
+ # torch.load('/home/asd/Desktop/AEM_RWA.ckpt',
169
+ # map_location='cpu')['model'])
170
+
171
+ # matmodel = matmodel.cuda()
172
+ # matmodel.eval()
173
+
174
+ rawimg = pimgs
175
+ trimap = ptrimap
176
+ rawimg = cv2.imread(rawimg, cv2.IMREAD_COLOR)
177
+ trimap = cv2.imread(trimap, cv2.IMREAD_GRAYSCALE)
178
+ trimap_nonp = trimap.copy()
179
+ h, w, c = rawimg.shape
180
+ nonph, nonpw, _ = rawimg.shape
181
+ newh = (((h - 1) // 32) + 1) * 32
182
+ neww = (((w - 1) // 32) + 1) * 32
183
+ padh = newh - h
184
+ padh1 = int(padh / 2)
185
+ padh2 = padh - padh1
186
+ padw = neww - w
187
+ padw1 = int(padw / 2)
188
+ padw2 = padw - padw1
189
+ rawimg_pad = cv2.copyMakeBorder(rawimg, padh1, padh2, padw1, padw2,
190
+ cv2.BORDER_REFLECT)
191
+ trimap_pad = cv2.copyMakeBorder(trimap, padh1, padh2, padw1, padw2,
192
+ cv2.BORDER_REFLECT)
193
+ h_pad, w_pad, _ = rawimg_pad.shape
194
+ tritemp = np.zeros([*trimap_pad.shape, 3], np.float32)
195
+ tritemp[:, :, 0] = (trimap_pad == 0)
196
+ tritemp[:, :, 1] = (trimap_pad == 128)
197
+ tritemp[:, :, 2] = (trimap_pad == 255)
198
+ tritempimgs = np.transpose(tritemp, (2, 0, 1))
199
+ tritempimgs = tritempimgs[np.newaxis, :, :, :]
200
+ img = np.transpose(rawimg_pad, (2, 0, 1))[np.newaxis, ::-1, :, :]
201
+ img = np.array(img, np.float32)
202
+ img = img / 255.
203
+ img = torch.from_numpy(img).cuda()
204
+ tritempimgs = torch.from_numpy(tritempimgs).cuda()
205
+ with torch.no_grad():
206
+ pred = matmodel(img, tritempimgs)
207
+ pred = pred.detach().cpu().numpy()[0]
208
+ pred = pred[:, padh1:padh1 + h, padw1:padw1 + w]
209
+ preda = pred[
210
+ 0:1,
211
+ ] * 255
212
+ preda = np.transpose(preda, (1, 2, 0))
213
+ preda = preda * (trimap_nonp[:, :, None]
214
+ == 128) + (trimap_nonp[:, :, None] == 255) * 255
215
+ preda = np.array(preda, np.uint8)
216
+ cv2.imwrite(p_outs, preda)
217
+
218
+
219
+ #!/usr/bin/python3
220
+ class WindowAttention(nn.Module):
221
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
222
+ It supports both of shifted and non-shifted window.
223
+ Args:
224
+ dim (int): Number of input channels.
225
+ window_size (tuple[int]): The height and width of the window.
226
+ num_heads (int): Number of attention heads.
227
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
228
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
229
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
230
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
231
+ """
232
+
233
+ def __init__(self,
234
+ dim,
235
+ window_size,
236
+ num_heads,
237
+ qkv_bias=True,
238
+ qk_scale=None,
239
+ attn_drop=0.,
240
+ proj_drop=0.):
241
+
242
+ super().__init__()
243
+ self.dim = dim
244
+ self.window_size = window_size # Wh, Ww
245
+ self.num_heads = num_heads
246
+ head_dim = dim // num_heads
247
+ self.scale = qk_scale or head_dim**-0.5
248
+
249
+ # define a parameter table of relative position bias
250
+ self.relative_position_bias_table = nn.Parameter(
251
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
252
+ num_heads)) # 2*Wh-1 * 2*Ww-1, nH
253
+
254
+ # get pair-wise relative position index for each token inside the window
255
+ coords_h = torch.arange(self.window_size[0])
256
+ coords_w = torch.arange(self.window_size[1])
257
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
258
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
259
+ relative_coords = coords_flatten[:, :,
260
+ None] - coords_flatten[:,
261
+ None, :] # 2, Wh*Ww, Wh*Ww
262
+ relative_coords = relative_coords.permute(
263
+ 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
264
+ relative_coords[:, :,
265
+ 0] += self.window_size[0] - 1 # shift to start from 0
266
+ relative_coords[:, :, 1] += self.window_size[1] - 1
267
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
268
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
269
+ self.register_buffer("relative_position_index",
270
+ relative_position_index)
271
+
272
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
273
+ self.attn_drop = nn.Dropout(attn_drop)
274
+ self.proj = nn.Linear(dim, dim)
275
+ self.proj_drop = nn.Dropout(proj_drop)
276
+
277
+ trunc_normal_(self.relative_position_bias_table, std=.02)
278
+ self.softmax = nn.Softmax(dim=-1)
279
+
280
+ def forward(self, x, mask=None):
281
+ """ Forward function.
282
+ Args:
283
+ x: input features with shape of (num_windows*B, N, C)
284
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
285
+ """
286
+ B_, N, C = x.shape
287
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
288
+ C // self.num_heads).permute(2, 0, 3, 1, 4)
289
+ q, k, v = qkv[0], qkv[1], qkv[
290
+ 2] # make torchscript happy (cannot use tensor as tuple)
291
+
292
+ q = q * self.scale
293
+ attn = (q @ k.transpose(-2, -1))
294
+
295
+ relative_position_bias = self.relative_position_bias_table[
296
+ self.relative_position_index.view(-1)].view(
297
+ self.window_size[0] * self.window_size[1],
298
+ self.window_size[0] * self.window_size[1],
299
+ -1) # Wh*Ww,Wh*Ww,nH
300
+ relative_position_bias = relative_position_bias.permute(
301
+ 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
302
+ attn = attn + relative_position_bias.unsqueeze(0)
303
+
304
+ if mask is not None:
305
+ nW = mask.shape[0]
306
+ attn = attn.view(B_ // nW, nW, self.num_heads, N,
307
+ N) + mask.unsqueeze(1).unsqueeze(0)
308
+ attn = attn.view(-1, self.num_heads, N, N)
309
+ attn = self.softmax(attn)
310
+ else:
311
+ attn = self.softmax(attn)
312
+
313
+ attn = self.attn_drop(attn)
314
+
315
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
316
+ x = self.proj(x)
317
+ x = self.proj_drop(x)
318
+ return x
319
+
320
+
321
+ class SwinTransformerBlock(nn.Module):
322
+ """ Swin Transformer Block.
323
+ Args:
324
+ dim (int): Number of input channels.
325
+ num_heads (int): Number of attention heads.
326
+ window_size (int): Window size.
327
+ shift_size (int): Shift size for SW-MSA.
328
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
329
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
330
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
331
+ drop (float, optional): Dropout rate. Default: 0.0
332
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
333
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
334
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
335
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
336
+ """
337
+
338
+ def __init__(self,
339
+ dim,
340
+ num_heads,
341
+ window_size=7,
342
+ shift_size=0,
343
+ mlp_ratio=4.,
344
+ qkv_bias=True,
345
+ qk_scale=None,
346
+ drop=0.,
347
+ attn_drop=0.,
348
+ drop_path=0.,
349
+ act_layer=nn.GELU,
350
+ norm_layer=nn.LayerNorm):
351
+ super().__init__()
352
+ self.dim = dim
353
+ self.num_heads = num_heads
354
+ self.window_size = window_size
355
+ self.shift_size = shift_size
356
+ self.mlp_ratio = mlp_ratio
357
+ assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
358
+
359
+ self.norm1 = norm_layer(dim)
360
+ self.attn = WindowAttention(dim,
361
+ window_size=to_2tuple(self.window_size),
362
+ num_heads=num_heads,
363
+ qkv_bias=qkv_bias,
364
+ qk_scale=qk_scale,
365
+ attn_drop=attn_drop,
366
+ proj_drop=drop)
367
+
368
+ self.drop_path = DropPath(
369
+ drop_path) if drop_path > 0. else nn.Identity()
370
+ self.norm2 = norm_layer(dim)
371
+ mlp_hidden_dim = int(dim * mlp_ratio)
372
+ self.mlp = Mlp(in_features=dim,
373
+ hidden_features=mlp_hidden_dim,
374
+ act_layer=act_layer,
375
+ drop=drop)
376
+
377
+ self.H = None
378
+ self.W = None
379
+
380
+ def forward(self, x, mask_matrix):
381
+ """ Forward function.
382
+ Args:
383
+ x: Input feature, tensor size (B, H*W, C).
384
+ H, W: Spatial resolution of the input feature.
385
+ mask_matrix: Attention mask for cyclic shift.
386
+ """
387
+ B, L, C = x.shape
388
+ H, W = self.H, self.W
389
+ assert L == H * W, "input feature has wrong size"
390
+
391
+ shortcut = x
392
+ x = self.norm1(x)
393
+ x = x.view(B, H, W, C)
394
+
395
+ # pad feature maps to multiples of window size
396
+ pad_l = pad_t = 0
397
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
398
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
399
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
400
+ _, Hp, Wp, _ = x.shape
401
+
402
+ # cyclic shift
403
+ if self.shift_size > 0:
404
+ shifted_x = torch.roll(x,
405
+ shifts=(-self.shift_size, -self.shift_size),
406
+ dims=(1, 2))
407
+ attn_mask = mask_matrix
408
+ else:
409
+ shifted_x = x
410
+ attn_mask = None
411
+
412
+ # partition windows
413
+ x_windows = window_partition(
414
+ shifted_x, self.window_size) # nW*B, window_size, window_size, C
415
+ x_windows = x_windows.view(-1, self.window_size * self.window_size,
416
+ C) # nW*B, window_size*window_size, C
417
+
418
+ # W-MSA/SW-MSA
419
+ attn_windows = self.attn(
420
+ x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
421
+
422
+ # merge windows
423
+ attn_windows = attn_windows.view(-1, self.window_size,
424
+ self.window_size, C)
425
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp,
426
+ Wp) # B H' W' C
427
+
428
+ # reverse cyclic shift
429
+ if self.shift_size > 0:
430
+ x = torch.roll(shifted_x,
431
+ shifts=(self.shift_size, self.shift_size),
432
+ dims=(1, 2))
433
+ else:
434
+ x = shifted_x
435
+
436
+ if pad_r > 0 or pad_b > 0:
437
+ x = x[:, :H, :W, :].contiguous()
438
+
439
+ x = x.view(B, H * W, C)
440
+
441
+ # FFN
442
+ x = shortcut + self.drop_path(x)
443
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
444
+
445
+ return x
446
+
447
+
448
+ class PatchMerging(nn.Module):
449
+ """ Patch Merging Layer
450
+ Args:
451
+ dim (int): Number of input channels.
452
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
453
+ """
454
+
455
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
456
+ super().__init__()
457
+ self.dim = dim
458
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
459
+ self.norm = norm_layer(4 * dim)
460
+
461
+ def forward(self, x, H, W):
462
+ """ Forward function.
463
+ Args:
464
+ x: Input feature, tensor size (B, H*W, C).
465
+ H, W: Spatial resolution of the input feature.
466
+ """
467
+ B, L, C = x.shape
468
+ assert L == H * W, "input feature has wrong size"
469
+
470
+ x = x.view(B, H, W, C)
471
+
472
+ # padding
473
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
474
+ if pad_input:
475
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
476
+
477
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
478
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
479
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
480
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
481
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
482
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
483
+
484
+ x = self.norm(x)
485
+ x = self.reduction(x)
486
+
487
+ return x
488
+
489
+
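# Illustrative sketch, separate from the file contents above: PatchMerging
# concatenates each 2x2 token neighbourhood and projects 4*C channels down to
# 2*C, halving the spatial resolution while doubling the channels. Shapes are
# assumptions chosen only for demonstration and rely on the imports at the top
# of this file.
pm = PatchMerging(dim=96)
tokens = torch.randn(1, 56 * 56, 96)   # (B, H*W, C) token layout used above
merged = pm(tokens, 56, 56)            # -> (1, 28 * 28, 192)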
490
+ class BasicLayer(nn.Module):
491
+ """ A basic Swin Transformer layer for one stage.
492
+ Args:
493
+ dim (int): Number of feature channels
494
+ depth (int): Depths of this stage.
495
+ num_heads (int): Number of attention head.
496
+ window_size (int): Local window size. Default: 7.
497
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
498
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
499
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
500
+ drop (float, optional): Dropout rate. Default: 0.0
501
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
502
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
503
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
504
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
505
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
506
+ """
507
+
508
+ def __init__(self,
509
+ dim,
510
+ depth,
511
+ num_heads,
512
+ window_size=7,
513
+ mlp_ratio=4.,
514
+ qkv_bias=True,
515
+ qk_scale=None,
516
+ drop=0.,
517
+ attn_drop=0.,
518
+ drop_path=0.,
519
+ norm_layer=nn.LayerNorm,
520
+ downsample=None,
521
+ use_checkpoint=False):
522
+
523
+ super().__init__()
524
+ self.window_size = window_size
525
+ self.shift_size = window_size // 2
526
+ self.depth = depth
527
+ self.use_checkpoint = use_checkpoint
528
+
529
+ # build blocks
530
+ self.blocks = nn.ModuleList([
531
+ SwinTransformerBlock(dim=dim,
532
+ num_heads=num_heads,
533
+ window_size=window_size,
534
+ shift_size=0 if
535
+ (i % 2 == 0) else window_size // 2,
536
+ mlp_ratio=mlp_ratio,
537
+ qkv_bias=qkv_bias,
538
+ qk_scale=qk_scale,
539
+ drop=drop,
540
+ attn_drop=attn_drop,
541
+ drop_path=drop_path[i] if isinstance(
542
+ drop_path, list) else drop_path,
543
+ norm_layer=norm_layer) for i in range(depth)
544
+ ])
545
+
546
+ # patch merging layer
547
+ if downsample is not None:
548
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
549
+ else:
550
+ self.downsample = None
551
+
552
+ def forward(self, x, H, W):
553
+ """ Forward function.
554
+ Args:
555
+ x: Input feature, tensor size (B, H*W, C).
556
+ H, W: Spatial resolution of the input feature.
557
+ """
558
+ # print(x.shape,H,W)
559
+ # calculate attention mask for SW-MSA
560
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
561
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
562
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
563
+ h_slices = (slice(0, -self.window_size),
564
+ slice(-self.window_size,
565
+ -self.shift_size), slice(-self.shift_size, None))
566
+ w_slices = (slice(0, -self.window_size),
567
+ slice(-self.window_size,
568
+ -self.shift_size), slice(-self.shift_size, None))
569
+ cnt = 0
570
+ for h in h_slices:
571
+ for w in w_slices:
572
+ img_mask[:, h, w, :] = cnt
573
+ cnt += 1
574
+
575
+ mask_windows = window_partition(
576
+ img_mask, self.window_size) # nW, window_size, window_size, 1
577
+
578
+ mask_windows = mask_windows.view(-1,
579
+ self.window_size * self.window_size)
580
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(
581
+ 2)  # nW, window_size*window_size, window_size*window_size
582
+ attn_mask = attn_mask.masked_fill(attn_mask != 0,
583
+ float(-100.0)).masked_fill(
584
+ attn_mask == 0, float(0.0))
585
+
586
+ for blk in self.blocks:
587
+ blk.H, blk.W = H, W
588
+ if self.use_checkpoint:
589
+ x = checkpoint.checkpoint(blk, x, attn_mask)
590
+ else:
591
+ x = blk(x, attn_mask)
592
+
593
+ if self.downsample is not None:
594
+ x_down = self.downsample(x, H, W)
595
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
596
+ return x, H, W, x_down, Wh, Ww
597
+ else:
598
+ return x, H, W, x, H, W
599
+
600
+
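# Sketch of one stage in isolation (an assumption-laden example, not part of
# the file; it presumes the attention modules defined earlier in this file run
# on the default device): BasicLayer returns the stage output and, when a
# downsample module is given, the merged tokens for the next stage.
stage = BasicLayer(dim=96, depth=2, num_heads=3, window_size=7,
                   downsample=PatchMerging)
x_out, H, W, x_down, Wh, Ww = stage(torch.randn(1, 56 * 56, 96), 56, 56)
# x_out: (1, 3136, 96) at 56x56; x_down: (1, 784, 192) at Wh = Ww = 28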
601
+ class PatchEmbed(nn.Module):
602
+ """ Image to Patch Embedding
603
+ Args:
604
+ patch_size (int): Patch token size. Default: 4.
605
+ in_chans (int): Number of input image channels. Default: 3.
606
+ embed_dim (int): Number of linear projection output channels. Default: 96.
607
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
608
+ """
609
+
610
+ def __init__(self,
611
+ patch_size=4,
612
+ in_chans=3,
613
+ embed_dim=96,
614
+ norm_layer=None):
615
+
616
+ super().__init__()
617
+ patch_size = to_2tuple(patch_size)
618
+ self.patch_size = patch_size
619
+
620
+ self.in_chans = in_chans
621
+ self.embed_dim = embed_dim
622
+
623
+ self.proj = nn.Conv2d(in_chans,
624
+ embed_dim,
625
+ kernel_size=patch_size,
626
+ stride=patch_size)
627
+ if norm_layer is not None:
628
+ self.norm = norm_layer(embed_dim)
629
+ else:
630
+ self.norm = None
631
+
632
+ def forward(self, x):
633
+ """Forward function."""
634
+ # padding
635
+ _, _, H, W = x.size()
636
+ if W % self.patch_size[1] != 0:
637
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
638
+ if H % self.patch_size[0] != 0:
639
+ x = F.pad(x,
640
+ (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
641
+
642
+ x = self.proj(x) # B C Wh Ww
643
+ if self.norm is not None:
644
+ Wh, Ww = x.size(2), x.size(3)
645
+ x = x.flatten(2).transpose(1, 2)
646
+ x = self.norm(x)
647
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
648
+
649
+ return x
650
+
651
+
652
+ class SwinTransformer(nn.Module):
653
+ """ Swin Transformer backbone.
654
+ A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
655
+ https://arxiv.org/pdf/2103.14030
656
+ Args:
657
+ pretrain_img_size (int): Input image size for training the pretrained model,
658
+ used in absolute position embedding. Default 224.
659
+ patch_size (int | tuple(int)): Patch size. Default: 4.
660
+ in_chans (int): Number of input image channels. Default: 3.
661
+ embed_dim (int): Number of linear projection output channels. Default: 96.
662
+ depths (tuple[int]): Depths of each Swin Transformer stage.
663
+ num_heads (tuple[int]): Number of attention head of each stage.
664
+ window_size (int): Window size. Default: 7.
665
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
666
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
667
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
668
+ drop_rate (float): Dropout rate.
669
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
670
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
671
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
672
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
673
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
674
+ out_indices (Sequence[int]): Output from which stages.
675
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
676
+ -1 means not freezing any parameters.
677
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
678
+ """
679
+
680
+ def __init__(self,
681
+ pretrain_img_size=224,
682
+ patch_size=4,
683
+ in_chans=3,
684
+ embed_dim=96,
685
+ depths=[2, 2, 6, 2],
686
+ num_heads=[3, 6, 12, 24],
687
+ window_size=7,
688
+ mlp_ratio=4.,
689
+ qkv_bias=True,
690
+ qk_scale=None,
691
+ drop_rate=0.,
692
+ attn_drop_rate=0.,
693
+ drop_path_rate=0.2,
694
+ norm_layer=nn.LayerNorm,
695
+ ape=False,
696
+ patch_norm=True,
697
+ out_indices=(0, 1, 2, 3),
698
+ frozen_stages=-1,
699
+ use_checkpoint=False):
700
+
701
+ super().__init__()
702
+
703
+ self.pretrain_img_size = pretrain_img_size
704
+ self.num_layers = len(depths)
705
+ self.embed_dim = embed_dim
706
+ self.ape = ape
707
+ self.patch_norm = patch_norm
708
+ self.out_indices = out_indices
709
+ self.frozen_stages = frozen_stages
710
+
711
+ # split image into non-overlapping patches
712
+ self.patch_embed = PatchEmbed(
713
+ patch_size=patch_size,
714
+ in_chans=in_chans,
715
+ embed_dim=embed_dim,
716
+ norm_layer=norm_layer if self.patch_norm else None)
717
+
718
+ # absolute position embedding
719
+ if self.ape:
720
+ pretrain_img_size = to_2tuple(pretrain_img_size)
721
+ patch_size = to_2tuple(patch_size)
722
+ patches_resolution = [
723
+ pretrain_img_size[0] // patch_size[0],
724
+ pretrain_img_size[1] // patch_size[1]
725
+ ]
726
+
727
+ self.absolute_pos_embed = nn.Parameter(
728
+ torch.zeros(1, embed_dim, patches_resolution[0],
729
+ patches_resolution[1]))
730
+ trunc_normal_(self.absolute_pos_embed, std=.02)
731
+
732
+ self.pos_drop = nn.Dropout(p=drop_rate)
733
+
734
+ # stochastic depth
735
+ dpr = [
736
+ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
737
+ ] # stochastic depth decay rule
738
+
739
+ # build layers
740
+ self.layers = nn.ModuleList()
741
+ for i_layer in range(self.num_layers):
742
+ layer = BasicLayer(
743
+ dim=int(embed_dim * 2**i_layer),
744
+ depth=depths[i_layer],
745
+ num_heads=num_heads[i_layer],
746
+ window_size=window_size,
747
+ mlp_ratio=mlp_ratio,
748
+ qkv_bias=qkv_bias,
749
+ qk_scale=qk_scale,
750
+ drop=drop_rate,
751
+ attn_drop=attn_drop_rate,
752
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
753
+ norm_layer=norm_layer,
754
+ downsample=PatchMerging if
755
+ (i_layer < self.num_layers - 1) else None,
756
+ use_checkpoint=use_checkpoint)
757
+ self.layers.append(layer)
758
+
759
+ num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
760
+ self.num_features = num_features
761
+
762
+ # add a norm layer for each output
763
+ for i_layer in out_indices:
764
+ layer = norm_layer(num_features[i_layer])
765
+ layer_name = f'norm{i_layer}'
766
+ self.add_module(layer_name, layer)
767
+
768
+ self._freeze_stages()
769
+
770
+ def _freeze_stages(self):
771
+ if self.frozen_stages >= 0:
772
+ self.patch_embed.eval()
773
+ for param in self.patch_embed.parameters():
774
+ param.requires_grad = False
775
+
776
+ if self.frozen_stages >= 1 and self.ape:
777
+ self.absolute_pos_embed.requires_grad = False
778
+
779
+ if self.frozen_stages >= 2:
780
+ self.pos_drop.eval()
781
+ for i in range(0, self.frozen_stages - 1):
782
+ m = self.layers[i]
783
+ m.eval()
784
+ for param in m.parameters():
785
+ param.requires_grad = False
786
+
787
+ def init_weights(self, pretrained=None):
788
+ """Initialize the weights in backbone.
789
+ Args:
790
+ pretrained (str, optional): Path to pre-trained weights.
791
+ Defaults to None.
792
+ """
793
+
794
+ def forward(self, x):
795
+ """Forward function."""
796
+ x = self.patch_embed(x)
797
+
798
+ Wh, Ww = x.size(2), x.size(3)
799
+ if self.ape:
800
+ # interpolate the position embedding to the corresponding size
801
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed,
802
+ size=(Wh, Ww),
803
+ mode='bicubic')
804
+ x = (x + absolute_pos_embed).flatten(2).transpose(1,
805
+ 2) # B Wh*Ww C
806
+ else:
807
+ x = x.flatten(2).transpose(1, 2)
808
+ x = self.pos_drop(x)
809
+
810
+ outs = []
811
+ for i in range(self.num_layers):
812
+ layer = self.layers[i]
813
+
814
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
815
+
816
+ if i in self.out_indices:
817
+ norm_layer = getattr(self, f'norm{i}')
818
+ x_out = norm_layer(x_out)
819
+
820
+ out = x_out.view(-1, H, W,
821
+ self.num_features[i]).permute(0, 3, 1,
822
+ 2).contiguous()
823
+ outs.append(out)
824
+
825
+ return tuple(outs)
826
+
827
+ def train(self, mode=True):
828
+ """Convert the model into training mode while keep layers freezed."""
829
+ super(SwinTransformer, self).train(mode)
830
+ self._freeze_stages()
831
+
832
+
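# Sketch of the backbone contract for a Swin-T style configuration (shapes are
# illustrative and assume the attention modules above run on the default
# device): the forward pass returns one feature map per stage at strides
# 4/8/16/32 with 96/192/384/768 channels.
backbone = SwinTransformer(embed_dim=96, depths=[2, 2, 6, 2],
                           num_heads=[3, 6, 12, 24], window_size=7)
backbone = backbone.eval()
feats = backbone(torch.randn(1, 3, 224, 224))
# feats[i].shape == (1, 96 * 2 ** i, 224 // (4 * 2 ** i), 224 // (4 * 2 ** i))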
833
+ class Mlp(nn.Module):
834
+ """ Multilayer perceptron."""
835
+
836
+ def __init__(self,
837
+ in_features,
838
+ hidden_features=None,
839
+ out_features=None,
840
+ act_layer=nn.GELU,
841
+ drop=0.):
842
+ super().__init__()
843
+ out_features = out_features or in_features
844
+ hidden_features = hidden_features or in_features
845
+ self.fc1 = nn.Linear(in_features, hidden_features)
846
+ self.act = act_layer()
847
+ self.fc2 = nn.Linear(hidden_features, out_features)
848
+ self.drop = nn.Dropout(drop)
849
+
850
+ def forward(self, x):
851
+ x = self.fc1(x)
852
+ x = self.act(x)
853
+ x = self.drop(x)
854
+ x = self.fc2(x)
855
+ x = self.drop(x)
856
+ return x
857
+
858
+
859
+ class ResBlock(nn.Module):
860
+
861
+ def __init__(self, inc, midc):
862
+ super(ResBlock, self).__init__()
863
+ self.conv1 = nn.Conv2d(inc,
864
+ midc,
865
+ kernel_size=1,
866
+ stride=1,
867
+ padding=0,
868
+ bias=True)
869
+ self.gn1 = nn.GroupNorm(16, midc)
870
+ self.conv2 = nn.Conv2d(midc,
871
+ midc,
872
+ kernel_size=3,
873
+ stride=1,
874
+ padding=1,
875
+ bias=True)
876
+ self.gn2 = nn.GroupNorm(16, midc)
877
+ self.conv3 = nn.Conv2d(midc,
878
+ inc,
879
+ kernel_size=1,
880
+ stride=1,
881
+ padding=0,
882
+ bias=True)
883
+ self.relu = nn.LeakyReLU(0.1)
884
+
885
+ def forward(self, x):
886
+ x_ = x
887
+ x = self.conv1(x)
888
+ x = self.gn1(x)
889
+ x = self.relu(x)
890
+ x = self.conv2(x)
891
+ x = self.gn2(x)
892
+ x = self.relu(x)
893
+ x = self.conv3(x)
894
+ x = x + x_
895
+ x = self.relu(x)
896
+ return x
897
+
898
+
899
+ class AEALblock(nn.Module):
900
+
901
+ def __init__(self,
902
+ d_model,
903
+ nhead,
904
+ dim_feedforward=512,
905
+ dropout=0.0,
906
+ layer_norm_eps=1e-5,
907
+ batch_first=True,
908
+ norm_first=False,
909
+ width=5):
910
+ super(AEALblock, self).__init__()
911
+ self.self_attn2 = nn.MultiheadAttention(d_model // 2,
912
+ nhead // 2,
913
+ dropout=dropout,
914
+ batch_first=batch_first)
915
+ self.self_attn1 = nn.MultiheadAttention(d_model // 2,
916
+ nhead // 2,
917
+ dropout=dropout,
918
+ batch_first=batch_first)
919
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
920
+ self.dropout = nn.Dropout(dropout)
921
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
922
+ self.norm_first = norm_first
923
+ self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
924
+ self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
925
+ self.dropout1 = nn.Dropout(dropout)
926
+ self.dropout2 = nn.Dropout(dropout)
927
+ self.activation = nn.ReLU()
928
+ self.width = width
929
+ self.trans = nn.Sequential(
930
+ nn.Conv2d(d_model + 512, d_model // 2, 1, 1, 0),
931
+ ResBlock(d_model // 2, d_model // 4),
932
+ nn.Conv2d(d_model // 2, d_model, 1, 1, 0))
933
+ self.gamma = nn.Parameter(torch.zeros(1))
934
+
935
+ def forward(
936
+ self,
937
+ src,
938
+ feats,
939
+ ):
940
+ src = self.gamma * self.trans(torch.cat([src, feats], 1)) + src
941
+ b, c, h, w = src.shape
942
+ x1 = src[:, 0:c // 2]
943
+ x1_ = rearrange(x1, 'b c (h1 h2) w -> b c h1 h2 w', h2=self.width)
944
+ x1_ = rearrange(x1_, 'b c h1 h2 w -> (b h1) (h2 w) c')
945
+ x2 = src[:, c // 2:]
946
+ x2_ = rearrange(x2, 'b c h (w1 w2) -> b c h w1 w2', w2=self.width)
947
+ x2_ = rearrange(x2_, 'b c h w1 w2 -> (b w1) (h w2) c')
948
+ x = rearrange(src, 'b c h w-> b (h w) c')
949
+ x = self.norm1(x + self._sa_block(x1_, x2_, h, w))
950
+ x = self.norm2(x + self._ff_block(x))
951
+ x = rearrange(x, 'b (h w) c->b c h w', h=h, w=w)
952
+ return x
953
+
954
+ def _sa_block(self, x1, x2, h, w):
955
+ x1 = self.self_attn1(x1,
956
+ x1,
957
+ x1,
958
+ attn_mask=None,
959
+ key_padding_mask=None,
960
+ need_weights=False)[0]
961
+
962
+ x2 = self.self_attn2(x2,
963
+ x2,
964
+ x2,
965
+ attn_mask=None,
966
+ key_padding_mask=None,
967
+ need_weights=False)[0]
968
+
969
+ x1 = rearrange(x1,
970
+ '(b h1) (h2 w) c-> b (h1 h2 w) c',
971
+ h2=self.width,
972
+ h1=h // self.width)
973
+ x2 = rearrange(x2,
974
+ ' (b w1) (h w2) c-> b (h w1 w2) c',
975
+ w2=self.width,
976
+ w1=w // self.width)
977
+ x = torch.cat([x1, x2], dim=2)
978
+ return self.dropout1(x)
979
+
980
+ def _ff_block(self, x):
981
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
982
+ return self.dropout2(x)
983
+
984
+
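# Sketch of the axis-split attention block above: one half of the channels
# attends within horizontal strips of height `width`, the other half within
# vertical strips of width `width`, so h and w must be multiples of `width`
# (AEMatter.aeal below pads to guarantee this). Shapes are illustrative only.
blk = AEALblock(d_model=640, nhead=20, width=5)
src = torch.randn(1, 640, 40, 40)    # features being refined
ctx = torch.randn(1, 512, 40, 40)    # appearance context passed as `feats`
out = blk(src, ctx)                  # -> (1, 640, 40, 40)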
985
+ class AEMatter(nn.Module):
986
+
987
+ def __init__(self):
988
+ super(AEMatter, self).__init__()
989
+ trans = SwinTransformer(pretrain_img_size=224,
990
+ embed_dim=96,
991
+ depths=[2, 2, 6, 2],
992
+ num_heads=[3, 6, 12, 24],
993
+ window_size=7,
994
+ ape=False,
995
+ drop_path_rate=0.2,
996
+ patch_norm=True,
997
+ use_checkpoint=False)
998
+
999
+ # trans.load_state_dict(torch.load(
1000
+ # '/home/asd/Desktop/swin_tiny_patch4_window7_224.pth',
1001
+ # map_location="cpu")["model"],
1002
+ # strict=False)
1003
+
1004
+ trans.patch_embed.proj = nn.Conv2d(64, 96, 3, 2, 1)
1005
+
1006
+ self.start_conv0 = nn.Sequential(nn.Conv2d(6, 48, 3, 1, 1),
1007
+ nn.PReLU(48))
1008
+
1009
+ self.start_conv = nn.Sequential(nn.Conv2d(48, 64, 3, 2,
1010
+ 1), nn.PReLU(64),
1011
+ nn.Conv2d(64, 64, 3, 1, 1),
1012
+ nn.PReLU(64))
1013
+
1014
+ self.trans = trans
1015
+ self.conv1 = nn.Sequential(
1016
+ nn.Conv2d(in_channels=640 + 768,
1017
+ out_channels=256,
1018
+ kernel_size=1,
1019
+ stride=1,
1020
+ padding=0,
1021
+ bias=True))
1022
+ self.conv2 = nn.Sequential(
1023
+ nn.Conv2d(in_channels=256 + 384,
1024
+ out_channels=256,
1025
+ kernel_size=1,
1026
+ stride=1,
1027
+ padding=0,
1028
+ bias=True), )
1029
+ self.conv3 = nn.Sequential(
1030
+ nn.Conv2d(in_channels=256 + 192,
1031
+ out_channels=192,
1032
+ kernel_size=1,
1033
+ stride=1,
1034
+ padding=0,
1035
+ bias=True), )
1036
+ self.conv4 = nn.Sequential(
1037
+ nn.Conv2d(in_channels=192 + 96,
1038
+ out_channels=128,
1039
+ kernel_size=1,
1040
+ stride=1,
1041
+ padding=0,
1042
+ bias=True), )
1043
+ self.ctran0 = BasicLayer(256, 3, 8, 7, drop_path=0.09)
1044
+ self.ctran1 = BasicLayer(256, 3, 8, 7, drop_path=0.07)
1045
+ self.ctran2 = BasicLayer(192, 3, 6, 7, drop_path=0.05)
1046
+ self.ctran3 = BasicLayer(128, 3, 4, 7, drop_path=0.03)
1047
+ self.conv5 = nn.Sequential(
1048
+ nn.Conv2d(in_channels=192,
1049
+ out_channels=64,
1050
+ kernel_size=3,
1051
+ stride=1,
1052
+ padding=1,
1053
+ bias=True), nn.PReLU(64),
1054
+ nn.Conv2d(in_channels=64,
1055
+ out_channels=64,
1056
+ kernel_size=3,
1057
+ stride=1,
1058
+ padding=1,
1059
+ bias=True), nn.PReLU(64),
1060
+ nn.Conv2d(in_channels=64,
1061
+ out_channels=48,
1062
+ kernel_size=3,
1063
+ stride=1,
1064
+ padding=1,
1065
+ bias=True), nn.PReLU(48))
1066
+ self.convo = nn.Sequential(
1067
+ nn.Conv2d(in_channels=48 + 48 + 6,
1068
+ out_channels=32,
1069
+ kernel_size=3,
1070
+ stride=1,
1071
+ padding=1,
1072
+ bias=True), nn.PReLU(32),
1073
+ nn.Conv2d(in_channels=32,
1074
+ out_channels=32,
1075
+ kernel_size=3,
1076
+ stride=1,
1077
+ padding=1,
1078
+ bias=True), nn.PReLU(32),
1079
+ nn.Conv2d(in_channels=32,
1080
+ out_channels=1,
1081
+ kernel_size=3,
1082
+ stride=1,
1083
+ padding=1,
1084
+ bias=True))
1085
+ self.up = nn.Upsample(scale_factor=2,
1086
+ mode='bilinear',
1087
+ align_corners=False)
1088
+ self.upn = nn.Upsample(scale_factor=2, mode='nearest')
1089
+ self.apptrans = nn.Sequential(
1090
+ nn.Conv2d(256 + 384, 256, 1, 1, bias=True), ResBlock(256, 128),
1091
+ ResBlock(256, 128), nn.Conv2d(256, 512, 2, 2, bias=True),
1092
+ ResBlock(512, 128))
1093
+ self.emb = nn.Sequential(nn.Conv2d(768, 640, 1, 1, 0),
1094
+ ResBlock(640, 160))
1095
+ self.embdp = nn.Sequential(nn.Conv2d(640, 640, 1, 1, 0))
1096
+ self.h2l = nn.Conv2d(768, 256, 1, 1, 0)
1097
+ self.width = 5
1098
+ self.trans1 = AEALblock(d_model=640,
1099
+ nhead=20,
1100
+ dim_feedforward=2048,
1101
+ dropout=0.2,
1102
+ width=self.width)
1103
+ self.trans2 = AEALblock(d_model=640,
1104
+ nhead=20,
1105
+ dim_feedforward=2048,
1106
+ dropout=0.2,
1107
+ width=self.width)
1108
+ self.trans3 = AEALblock(d_model=640,
1109
+ nhead=20,
1110
+ dim_feedforward=2048,
1111
+ dropout=0.2,
1112
+ width=self.width)
1113
+
1114
+ def aeal(self, x, sem):
1115
+ xe = self.emb(x)
1116
+ x_ = xe
1117
+ x_ = self.embdp(x_)
1118
+ b, c, h1, w1 = x_.shape
1119
+ bnew_ph = int(np.ceil(h1 / self.width) * self.width) - h1
1120
+ bnew_pw = int(np.ceil(w1 / self.width) * self.width) - w1
1121
+ newph1 = bnew_ph // 2
1122
+ newph2 = bnew_ph - newph1
1123
+ newpw1 = bnew_pw // 2
1124
+ newpw2 = bnew_pw - newpw1
1125
+ x_ = F.pad(x_, (newpw1, newpw2, newph1, newph2))
1126
+ sem = F.pad(sem, (newpw1, newpw2, newph1, newph2))
1127
+ x_ = self.trans1(x_, sem)
1128
+ x_ = self.trans2(x_, sem)
1129
+ x_ = self.trans3(x_, sem)
1130
+ x_ = x_[:, :, newph1:h1 + newph1, newpw1:w1 + newpw1]
1131
+ return x_
1132
+
1133
+ def forward(self, x, y):
1134
+ inputs = torch.cat((x, y), 1)
1135
+ x = self.start_conv0(inputs)
1136
+ x_ = self.start_conv(x)
1137
+ x1, x2, x3, x4 = self.trans(x_)
1138
+ x4h = self.h2l(x4)
1139
+ x3s = self.apptrans(torch.cat([x3, self.upn(x4h)], 1))
1140
+ x4_ = self.aeal(x4, x3s)
1141
+ x4 = torch.cat((x4, x4_), 1)
1142
+ X4 = self.conv1(x4)
1143
+ wh, ww = X4.shape[2], X4.shape[3]
1144
+ X4 = rearrange(X4, 'b c h w -> b (h w) c')
1145
+ X4, _, _, _, _, _ = self.ctran0(X4, wh, ww)
1146
+ X4 = rearrange(X4, 'b (h w) c -> b c h w', h=wh, w=ww)
1147
+ X3 = self.up(X4)
1148
+ X3 = torch.cat((x3, X3), 1)
1149
+ X3 = self.conv2(X3)
1150
+ wh, ww = X3.shape[2], X3.shape[3]
1151
+ X3 = rearrange(X3, 'b c h w -> b (h w) c')
1152
+ X3, _, _, _, _, _ = self.ctran1(X3, wh, ww)
1153
+ X3 = rearrange(X3, 'b (h w) c -> b c h w', h=wh, w=ww)
1154
+ X2 = self.up(X3)
1155
+ X2 = torch.cat((x2, X2), 1)
1156
+ X2 = self.conv3(X2)
1157
+ wh, ww = X2.shape[2], X2.shape[3]
1158
+ X2 = rearrange(X2, 'b c h w -> b (h w) c')
1159
+ X2, _, _, _, _, _ = self.ctran2(X2, wh, ww)
1160
+ X2 = rearrange(X2, 'b (h w) c -> b c h w', h=wh, w=ww)
1161
+ X1 = self.up(X2)
1162
+ X1 = torch.cat((x1, X1), 1)
1163
+ X1 = self.conv4(X1)
1164
+ wh, ww = X1.shape[2], X1.shape[3]
1165
+ X1 = rearrange(X1, 'b c h w -> b (h w) c')
1166
+ X1, _, _, _, _, _ = self.ctran3(X1, wh, ww)
1167
+ X1 = rearrange(X1, 'b (h w) c -> b c h w', h=wh, w=ww)
1168
+ X0 = self.up(X1)
1169
+ X0 = torch.cat((x_, X0), 1)
1170
+ X0 = self.conv5(X0)
1171
+ X = self.up(X0)
1172
+ X = torch.cat((inputs, x, X), 1)
1173
+ alpha = self.convo(X)
1174
+ alpha = torch.clamp(alpha, min=0, max=1)
1175
+ return alpha
1176
+
1177
+
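# Sketch of a raw forward pass (randomly initialised here, since the pretrained
# Swin checkpoint load is commented out above): the network takes an RGB image
# and a 3-channel trimap encoding, concatenates them internally to 6 channels,
# and returns a single-channel alpha matte clamped to [0, 1]. Running the full
# model on CPU like this is slow and only meant to illustrate the shapes.
net = AEMatter().eval()
rgb = torch.randn(1, 3, 512, 512)
trimap = torch.randn(1, 3, 512, 512)
with torch.no_grad():
    alpha = net(rgb, trimap)   # -> (1, 1, 512, 512)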
1178
+ class load_AEMatter_Model:
1179
+
1180
+ def __init__(self):
1181
+ pass
1182
+
1183
+ @classmethod
1184
+ def INPUT_TYPES(s):
1185
+ return {
1186
+ "required": {},
1187
+ }
1188
+
1189
+ RETURN_TYPES = ("AEMatter_Model", )
1190
+ FUNCTION = "test"
1191
+ CATEGORY = "AEMatter"
1192
+
1193
+ def test(self):
1194
+ return (get_AEMatter_model(get_model_path()), )
1195
+
1196
+
1197
+ class run_AEMatter_inference:
1198
+
1199
+ def __init__(self):
1200
+ pass
1201
+
1202
+ @classmethod
1203
+ def INPUT_TYPES(s):
1204
+ return {
1205
+ "required": {
1206
+ "image": ("IMAGE", ),
1207
+ "trimap": ("MASK", ),
1208
+ "AEMatter_Model": ("AEMatter_Model", ),
1209
+ },
1210
+ }
1211
+
1212
+ RETURN_TYPES = ("MASK", )
1213
+ FUNCTION = "test"
1214
+ CATEGORY = "AEMatter"
1215
+
1216
+ def test(
1217
+ self,
1218
+ image,
1219
+ trimap,
1220
+ AEMatter_Model,
1221
+ ):
1222
+
1223
+ ret = []
1224
+ batch_size = image.shape[0]
1225
+
1226
+ for i in range(batch_size):
1227
+ tmp_i = from_torch_image(image[i])
1228
+ tmp_m = from_torch_image(trimap[i])
1229
+ tmp = do_infer(tmp_i, tmp_m, AEMatter_Model)
1230
+ ret.append(tmp)
1231
+
1232
+ ret = to_torch_image(np.array(ret))
1233
+ ret = ret.squeeze(-1)
1234
+ print(ret.shape)
1235
+
1236
+ return (ret, )
1237
+
1238
+
1239
+ #!/usr/bin/python3
1240
+ NODE_CLASS_MAPPINGS = {
1241
+ 'load_AEMatter_Model': load_AEMatter_Model,
1242
+ 'run_AEMatter_inference': run_AEMatter_inference,
1243
+ }
1244
+
1245
+ NODE_DISPLAY_NAME_MAPPINGS = {
1246
+ 'load_AEMatter_Model': 'load_AEMatter_Model',
1247
+ 'run_AEMatter_inference': 'run_AEMatter_inference',
1248
+ }
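# Sketch of manual use outside the ComfyUI graph. ComfyUI imports this package
# and registers the two node classes through NODE_CLASS_MAPPINGS above; the
# helpers relied on here (get_AEMatter_model, get_model_path, do_infer,
# from_torch_image, to_torch_image) are defined earlier in this file, and
# image_batch / trimap_batch are hypothetical tensors in ComfyUI's IMAGE
# (B, H, W, C) and MASK (B, H, W) layouts.
loader = load_AEMatter_Model()
(model, ) = loader.test()
runner = run_AEMatter_inference()
(mask, ) = runner.test(image=image_batch, trimap=trimap_batch,
                       AEMatter_Model=model)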
ComfyUI_MVANet/MVANet_inference.py ADDED
@@ -0,0 +1,1548 @@
1
+ #!/usr/bin/python3
2
+ import os
3
+ import sys
4
+
5
+ HOME_DIR = os.environ.get('HOME', '/root')
6
+ MVANET_SOURCE_DIR = HOME_DIR + '/GITHUB/qianyu-dlut/MVANet'
7
+ finetuned_MVANet_model_path = MVANET_SOURCE_DIR + '/model/Model_80.pth'
8
+ pretrained_SwinB_model_path = MVANET_SOURCE_DIR + '/model/swin_base_patch4_window12_384_22kto1k.pth'
9
+
10
+ import math
11
+ import numpy as np
12
+ import cv2
13
+ import wget
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ import torch.utils.checkpoint as checkpoint
19
+ from torch.autograd import Variable
20
+ from torch import nn
21
+ from torchvision import transforms
22
+
23
+ from einops import rearrange
24
+
25
+ from timm.models import load_checkpoint
26
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
27
+
28
+ torch_device = 'cuda'
29
+ torch_dtype = torch.float16
30
+
31
+
32
+ def check_mkdir(dir_name):
33
+ if not os.path.isdir(dir_name):
34
+ os.makedirs(dir_name)
35
+
36
+
37
+ def SwinT(pretrained=True):
38
+ model = SwinTransformer(embed_dim=96,
39
+ depths=[2, 2, 6, 2],
40
+ num_heads=[3, 6, 12, 24],
41
+ window_size=7)
42
+ if pretrained is True:
43
+ model.load_state_dict(torch.load(
44
+ 'data/backbone_ckpt/swin_tiny_patch4_window7_224.pth',
45
+ map_location='cpu')['model'],
46
+ strict=False)
47
+
48
+ return model
49
+
50
+
51
+ def SwinS(pretrained=True):
52
+ model = SwinTransformer(embed_dim=96,
53
+ depths=[2, 2, 18, 2],
54
+ num_heads=[3, 6, 12, 24],
55
+ window_size=7)
56
+ if pretrained is True:
57
+ model.load_state_dict(torch.load(
58
+ 'data/backbone_ckpt/swin_small_patch4_window7_224.pth',
59
+ map_location='cpu')['model'],
60
+ strict=False)
61
+
62
+ return model
63
+
64
+
65
+ def SwinB(pretrained=True):
66
+ model = SwinTransformer(embed_dim=128,
67
+ depths=[2, 2, 18, 2],
68
+ num_heads=[4, 8, 16, 32],
69
+ window_size=12)
70
+ if pretrained is True:
71
+ import os
72
+ model.load_state_dict(torch.load(pretrained_SwinB_model_path,
73
+ map_location='cpu')['model'],
74
+ strict=False)
75
+ return model
76
+
77
+
78
+ def SwinL(pretrained=True):
79
+ model = SwinTransformer(embed_dim=192,
80
+ depths=[2, 2, 18, 2],
81
+ num_heads=[6, 12, 24, 48],
82
+ window_size=12)
83
+ if pretrained is True:
84
+ model.load_state_dict(torch.load(
85
+ 'data/backbone_ckpt/swin_large_patch4_window12_384_22kto1k.pth',
86
+ map_location='cpu')['model'],
87
+ strict=False)
88
+
89
+ return model
90
+
91
+
92
+ def get_activation_fn(activation):
93
+ """Return an activation function given a string"""
94
+ if activation == "relu":
95
+ return F.relu
96
+ if activation == "gelu":
97
+ return F.gelu
98
+ if activation == "glu":
99
+ return F.glu
100
+ raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
101
+
102
+
103
+ def make_cbr(in_dim, out_dim):
104
+ return nn.Sequential(nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1),
105
+ nn.BatchNorm2d(out_dim), nn.PReLU())
106
+
107
+
108
+ def make_cbg(in_dim, out_dim):
109
+ return nn.Sequential(nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1),
110
+ nn.BatchNorm2d(out_dim), nn.GELU())
111
+
112
+
113
+ def rescale_to(x, scale_factor: float = 2, interpolation='nearest'):
114
+ return F.interpolate(x, scale_factor=scale_factor, mode=interpolation)
115
+
116
+
117
+ def resize_as(x, y, interpolation='bilinear'):
118
+ return F.interpolate(x, size=y.shape[-2:], mode=interpolation)
119
+
120
+
121
+ def image2patches(x):
122
+ """b c (hg h) (wg w) -> (hg wg b) c h w"""
123
+ x = rearrange(x, 'b c (hg h) (wg w) -> (hg wg b) c h w', hg=2, wg=2)
124
+ return x
125
+
126
+
127
+ def patches2image(x):
128
+ """(hg wg b) c h w -> b c (hg h) (wg w)"""
129
+ x = rearrange(x, '(hg wg b) c h w -> b c (hg h) (wg w)', hg=2, wg=2)
130
+ return x
131
+
132
+
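# Sketch: image2patches splits an image into a 2x2 grid of crops stacked along
# the batch axis, and patches2image is its exact inverse. Sizes below are
# illustrative; both spatial dimensions must be even.
img = torch.randn(1, 3, 1024, 1024)
patches = image2patches(img)        # -> (4, 3, 512, 512)
restored = patches2image(patches)   # -> (1, 3, 1024, 1024), equal to img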
133
+ def window_partition(x, window_size):
134
+ """
135
+ Args:
136
+ x: (B, H, W, C)
137
+ window_size (int): window size
138
+
139
+ Returns:
140
+ windows: (num_windows*B, window_size, window_size, C)
141
+ """
142
+ B, H, W, C = x.shape
143
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size,
144
+ C)
145
+ windows = x.permute(0, 1, 3, 2, 4,
146
+ 5).contiguous().view(-1, window_size, window_size, C)
147
+ return windows
148
+
149
+
150
+ def window_reverse(windows, window_size, H, W):
151
+ """
152
+ Args:
153
+ windows: (num_windows*B, window_size, window_size, C)
154
+ window_size (int): Window size
155
+ H (int): Height of image
156
+ W (int): Width of image
157
+
158
+ Returns:
159
+ x: (B, H, W, C)
160
+ """
161
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
162
+ x = windows.view(B, H // window_size, W // window_size, window_size,
163
+ window_size, -1)
164
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
165
+ return x
166
+
167
+
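# Sketch: window_partition and window_reverse are inverses whenever H and W
# are multiples of the window size (the Swin blocks below pad to make sure of
# that). Shapes are illustrative.
x = torch.randn(2, 14, 14, 96)            # (B, H, W, C)
wins = window_partition(x, 7)             # -> (8, 7, 7, 96): 4 windows per image
x_back = window_reverse(wins, 7, 14, 14)  # -> (2, 14, 14, 96)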
168
+ def mkdir_safe(out_path):
169
+ if type(out_path) == str:
170
+ if len(out_path) > 0:
171
+ if not os.path.exists(out_path):
172
+ os.mkdir(out_path)
173
+
174
+
175
+ def get_model_path():
176
+ import folder_paths
177
+ from folder_paths import models_dir
178
+
179
+ path_file_model = models_dir
180
+ mkdir_safe(out_path=path_file_model)
181
+
182
+ path_file_model = os.path.join(path_file_model, 'MVANet')
183
+ mkdir_safe(out_path=path_file_model)
184
+
185
+ path_file_model = os.path.join(path_file_model, 'Model_80.pth')
186
+
187
+ return path_file_model
188
+
189
+
190
+ def download_model(path):
191
+ if not os.path.exists(path):
192
+ wget.download(
193
+ 'https://huggingface.co/aravindhv10/Self-Correction-Human-Parsing/resolve/main/checkpoints/Model_80.pth',
194
+ out=path)
195
+
196
+
197
+ def load_model(model_checkpoint_path):
198
+ download_model(path=model_checkpoint_path)
199
+ torch.cuda.set_device(0)
200
+
201
+ net = inf_MVANet().to(dtype=torch_dtype, device=torch_device)
202
+
203
+ pretrained_dict = torch.load(model_checkpoint_path,
204
+ map_location=torch_device)
205
+
206
+ model_dict = net.state_dict()
207
+ pretrained_dict = {
208
+ k: v
209
+ for k, v in pretrained_dict.items() if k in model_dict
210
+ }
211
+ model_dict.update(pretrained_dict)
212
+ net.load_state_dict(model_dict)
213
+ net = net.to(dtype=torch_dtype, device=torch_device)
214
+ net.eval()
215
+ return net
216
+
217
+
218
+ def do_infer_tensor2tensor(img, net):
219
+
220
+ img_transform = transforms.Compose(
221
+ [transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
222
+
223
+ h_, w_ = img.shape[1], img.shape[2]
224
+
225
+ with torch.no_grad():
226
+
227
+ img = rearrange(img, 'B H W C -> B C H W')
228
+
229
+ img_resize = torch.nn.functional.interpolate(input=img,
230
+ size=(1024, 1024),
231
+ mode='bicubic',
232
+ antialias=True)
233
+
234
+ img_var = img_transform(img_resize)
235
+ img_var = Variable(img_var)
236
+ img_var = img_var.to(dtype=torch_dtype, device=torch_device)
237
+
238
+ mask = []
239
+
240
+ mask.append(net(img_var))
241
+
242
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
243
+ prediction = prediction.sigmoid()
244
+
245
+ prediction = torch.nn.functional.interpolate(input=prediction,
246
+ size=(h_, w_),
247
+ mode='bicubic',
248
+ antialias=True)
249
+
250
+ prediction = prediction.squeeze(0)
251
+ prediction = prediction.clamp(0, 1)
252
+ prediction = prediction.detach()
253
+ prediction = prediction.to(dtype=torch.float32, device='cpu')
254
+
255
+ return prediction
256
+
257
+
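# Sketch of the intended call, under several assumptions: a ComfyUI
# environment (get_model_path imports folder_paths), a CUDA device (load_model
# and torch_device above require one), and an inf_MVANet whose eval-mode
# forward returns a single logit map as the averaging code above expects. The
# input follows ComfyUI's IMAGE layout (B, H, W, C) with values in [0, 1].
net = load_model(get_model_path())
img = torch.rand(1, 720, 1280, 3)
mask = do_infer_tensor2tensor(img, net)   # -> (1, 720, 1280) float32 on CPU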
258
+ class Mlp(nn.Module):
259
+ """ Multilayer perceptron."""
260
+
261
+ def __init__(self,
262
+ in_features,
263
+ hidden_features=None,
264
+ out_features=None,
265
+ act_layer=nn.GELU,
266
+ drop=0.):
267
+ super().__init__()
268
+ out_features = out_features or in_features
269
+ hidden_features = hidden_features or in_features
270
+ self.fc1 = nn.Linear(in_features, hidden_features)
271
+ self.act = act_layer()
272
+ self.fc2 = nn.Linear(hidden_features, out_features)
273
+ self.drop = nn.Dropout(drop)
274
+
275
+ def forward(self, x):
276
+ x = self.fc1(x)
277
+ x = self.act(x)
278
+ x = self.drop(x)
279
+ x = self.fc2(x)
280
+ x = self.drop(x)
281
+ return x
282
+
283
+
284
+ class WindowAttention(nn.Module):
285
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
286
+ It supports both of shifted and non-shifted window.
287
+
288
+ Args:
289
+ dim (int): Number of input channels.
290
+ window_size (tuple[int]): The height and width of the window.
291
+ num_heads (int): Number of attention heads.
292
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
293
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
294
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
295
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
296
+ """
297
+
298
+ def __init__(self,
299
+ dim,
300
+ window_size,
301
+ num_heads,
302
+ qkv_bias=True,
303
+ qk_scale=None,
304
+ attn_drop=0.,
305
+ proj_drop=0.):
306
+
307
+ super().__init__()
308
+ self.dim = dim
309
+ self.window_size = window_size # Wh, Ww
310
+ self.num_heads = num_heads
311
+ head_dim = dim // num_heads
312
+ self.scale = qk_scale or head_dim**-0.5
313
+
314
+ # define a parameter table of relative position bias
315
+ self.relative_position_bias_table = nn.Parameter(
316
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
317
+ num_heads)) # 2*Wh-1 * 2*Ww-1, nH
318
+
319
+ # get pair-wise relative position index for each token inside the window
320
+ coords_h = torch.arange(self.window_size[0])
321
+ coords_w = torch.arange(self.window_size[1])
322
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
323
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
324
+ relative_coords = coords_flatten[:, :,
325
+ None] - coords_flatten[:,
326
+ None, :] # 2, Wh*Ww, Wh*Ww
327
+ relative_coords = relative_coords.permute(
328
+ 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
329
+ relative_coords[:, :,
330
+ 0] += self.window_size[0] - 1 # shift to start from 0
331
+ relative_coords[:, :, 1] += self.window_size[1] - 1
332
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
333
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
334
+ self.register_buffer("relative_position_index",
335
+ relative_position_index)
336
+
337
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
338
+ self.attn_drop = nn.Dropout(attn_drop)
339
+ self.proj = nn.Linear(dim, dim)
340
+ self.proj_drop = nn.Dropout(proj_drop)
341
+
342
+ trunc_normal_(self.relative_position_bias_table, std=.02)
343
+ self.softmax = nn.Softmax(dim=-1)
344
+
345
+ def forward(self, x, mask=None):
346
+ """ Forward function.
347
+
348
+ Args:
349
+ x: input features with shape of (num_windows*B, N, C)
350
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
351
+ """
352
+ x = x.to(dtype=torch_dtype, device=torch_device)
353
+ B_, N, C = x.shape
354
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
355
+ C // self.num_heads).permute(2, 0, 3, 1, 4)
356
+ q, k, v = qkv[0], qkv[1], qkv[
357
+ 2] # make torchscript happy (cannot use tensor as tuple)
358
+
359
+ q = q * self.scale
360
+ attn = (q @ k.transpose(-2, -1))
361
+
362
+ relative_position_bias = self.relative_position_bias_table[
363
+ self.relative_position_index.view(-1)].view(
364
+ self.window_size[0] * self.window_size[1],
365
+ self.window_size[0] * self.window_size[1],
366
+ -1) # Wh*Ww,Wh*Ww,nH
367
+ relative_position_bias = relative_position_bias.permute(
368
+ 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
369
+ attn = attn + relative_position_bias.unsqueeze(0)
370
+
371
+ if mask is not None:
372
+ nW = mask.shape[0]
373
+ attn = attn.view(B_ // nW, nW, self.num_heads, N,
374
+ N) + mask.unsqueeze(1).unsqueeze(0)
375
+ attn = attn.view(-1, self.num_heads, N, N)
376
+ attn = self.softmax(attn)
377
+ else:
378
+ attn = self.softmax(attn)
379
+
380
+ attn = self.attn_drop(attn)
381
+ attn = attn.to(dtype=torch_dtype, device=torch_device)
382
+ v = v.to(dtype=torch_dtype, device=torch_device)
383
+
384
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
385
+ x = self.proj(x)
386
+ x = self.proj_drop(x)
387
+ return x
388
+
389
+
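# Sketch: for a 7x7 window the relative-position bias table holds
# (2*7 - 1) * (2*7 - 1) = 169 learnable entries per head, gathered through the
# 49x49 relative_position_index buffer built above. Construction alone does
# not touch the GPU, so this inspection runs anywhere.
wa = WindowAttention(dim=96, window_size=(7, 7), num_heads=3)
print(wa.relative_position_bias_table.shape)   # torch.Size([169, 3])
print(wa.relative_position_index.shape)        # torch.Size([49, 49])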
390
+ class SwinTransformerBlock(nn.Module):
391
+ """ Swin Transformer Block.
392
+
393
+ Args:
394
+ dim (int): Number of input channels.
395
+ num_heads (int): Number of attention heads.
396
+ window_size (int): Window size.
397
+ shift_size (int): Shift size for SW-MSA.
398
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
399
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
400
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
401
+ drop (float, optional): Dropout rate. Default: 0.0
402
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
403
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
404
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
405
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
406
+ """
407
+
408
+ def __init__(self,
409
+ dim,
410
+ num_heads,
411
+ window_size=7,
412
+ shift_size=0,
413
+ mlp_ratio=4.,
414
+ qkv_bias=True,
415
+ qk_scale=None,
416
+ drop=0.,
417
+ attn_drop=0.,
418
+ drop_path=0.,
419
+ act_layer=nn.GELU,
420
+ norm_layer=nn.LayerNorm):
421
+ super().__init__()
422
+ self.dim = dim
423
+ self.num_heads = num_heads
424
+ self.window_size = window_size
425
+ self.shift_size = shift_size
426
+ self.mlp_ratio = mlp_ratio
427
+ assert 0 <= self.shift_size < self.window_size, "shift_size must be in 0-window_size"
428
+
429
+ self.norm1 = norm_layer(dim)
430
+ self.attn = WindowAttention(dim,
431
+ window_size=to_2tuple(self.window_size),
432
+ num_heads=num_heads,
433
+ qkv_bias=qkv_bias,
434
+ qk_scale=qk_scale,
435
+ attn_drop=attn_drop,
436
+ proj_drop=drop)
437
+
438
+ self.drop_path = DropPath(
439
+ drop_path) if drop_path > 0. else nn.Identity()
440
+ self.norm2 = norm_layer(dim)
441
+ mlp_hidden_dim = int(dim * mlp_ratio)
442
+ self.mlp = Mlp(in_features=dim,
443
+ hidden_features=mlp_hidden_dim,
444
+ act_layer=act_layer,
445
+ drop=drop)
446
+
447
+ self.H = None
448
+ self.W = None
449
+
450
+ def forward(self, x, mask_matrix):
451
+ """ Forward function.
452
+
453
+ Args:
454
+ x: Input feature, tensor size (B, H*W, C).
455
+ H, W: Spatial resolution of the input feature.
456
+ mask_matrix: Attention mask for cyclic shift.
457
+ """
458
+ B, L, C = x.shape
459
+ H, W = self.H, self.W
460
+ assert L == H * W, "input feature has wrong size"
461
+
462
+ shortcut = x
463
+ x = self.norm1(x)
464
+ x = x.view(B, H, W, C)
465
+
466
+ # pad feature maps to multiples of window size
467
+ pad_l = pad_t = 0
468
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
469
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
470
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
471
+ _, Hp, Wp, _ = x.shape
472
+
473
+ # cyclic shift
474
+ if self.shift_size > 0:
475
+ shifted_x = torch.roll(x,
476
+ shifts=(-self.shift_size, -self.shift_size),
477
+ dims=(1, 2))
478
+ attn_mask = mask_matrix
479
+ else:
480
+ shifted_x = x
481
+ attn_mask = None
482
+
483
+ # partition windows
484
+ x_windows = window_partition(
485
+ shifted_x, self.window_size) # nW*B, window_size, window_size, C
486
+ x_windows = x_windows.view(-1, self.window_size * self.window_size,
487
+ C) # nW*B, window_size*window_size, C
488
+
489
+ # W-MSA/SW-MSA
490
+ attn_windows = self.attn(
491
+ x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
492
+
493
+ # merge windows
494
+ attn_windows = attn_windows.view(-1, self.window_size,
495
+ self.window_size, C)
496
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp,
497
+ Wp) # B H' W' C
498
+
499
+ # reverse cyclic shift
500
+ if self.shift_size > 0:
501
+ x = torch.roll(shifted_x,
502
+ shifts=(self.shift_size, self.shift_size),
503
+ dims=(1, 2))
504
+ else:
505
+ x = shifted_x
506
+
507
+ if pad_r > 0 or pad_b > 0:
508
+ x = x[:, :H, :W, :].contiguous()
509
+
510
+ x = x.view(B, H * W, C)
511
+
512
+ # FFN
513
+ x = shortcut + self.drop_path(x)
514
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
515
+
516
+ return x
517
+
518
+
519
+ class PatchMerging(nn.Module):
520
+ """ Patch Merging Layer
521
+
522
+ Args:
523
+ dim (int): Number of input channels.
524
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
525
+ """
526
+
527
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
528
+ super().__init__()
529
+ self.dim = dim
530
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
531
+ self.norm = norm_layer(4 * dim)
532
+
533
+ def forward(self, x, H, W):
534
+ """ Forward function.
535
+
536
+ Args:
537
+ x: Input feature, tensor size (B, H*W, C).
538
+ H, W: Spatial resolution of the input feature.
539
+ """
540
+ B, L, C = x.shape
541
+ assert L == H * W, "input feature has wrong size"
542
+
543
+ x = x.view(B, H, W, C)
544
+
545
+ # padding
546
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
547
+ if pad_input:
548
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
549
+
550
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
551
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
552
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
553
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
554
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
555
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
556
+
557
+ x = self.norm(x)
558
+ x = self.reduction(x)
559
+
560
+ return x
561
+
562
+
563
+ class BasicLayer(nn.Module):
564
+ """ A basic Swin Transformer layer for one stage.
565
+
566
+ Args:
567
+ dim (int): Number of feature channels
568
+ depth (int): Depths of this stage.
569
+ num_heads (int): Number of attention head.
570
+ window_size (int): Local window size. Default: 7.
571
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
572
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
573
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
574
+ drop (float, optional): Dropout rate. Default: 0.0
575
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
576
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
577
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
578
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
579
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
580
+ """
581
+
582
+ def __init__(self,
583
+ dim,
584
+ depth,
585
+ num_heads,
586
+ window_size=7,
587
+ mlp_ratio=4.,
588
+ qkv_bias=True,
589
+ qk_scale=None,
590
+ drop=0.,
591
+ attn_drop=0.,
592
+ drop_path=0.,
593
+ norm_layer=nn.LayerNorm,
594
+ downsample=None,
595
+ use_checkpoint=False):
596
+ super().__init__()
597
+ self.window_size = window_size
598
+ self.shift_size = window_size // 2
599
+ self.depth = depth
600
+ self.use_checkpoint = use_checkpoint
601
+
602
+ # build blocks
603
+ self.blocks = nn.ModuleList([
604
+ SwinTransformerBlock(dim=dim,
605
+ num_heads=num_heads,
606
+ window_size=window_size,
607
+ shift_size=0 if
608
+ (i % 2 == 0) else window_size // 2,
609
+ mlp_ratio=mlp_ratio,
610
+ qkv_bias=qkv_bias,
611
+ qk_scale=qk_scale,
612
+ drop=drop,
613
+ attn_drop=attn_drop,
614
+ drop_path=drop_path[i] if isinstance(
615
+ drop_path, list) else drop_path,
616
+ norm_layer=norm_layer) for i in range(depth)
617
+ ])
618
+
619
+ # patch merging layer
620
+ if downsample is not None:
621
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
622
+ else:
623
+ self.downsample = None
624
+
625
+ def forward(self, x, H, W):
626
+ """ Forward function.
627
+
628
+ Args:
629
+ x: Input feature, tensor size (B, H*W, C).
630
+ H, W: Spatial resolution of the input feature.
631
+ """
632
+
633
+ # calculate attention mask for SW-MSA
634
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
635
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
636
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
637
+ h_slices = (slice(0, -self.window_size),
638
+ slice(-self.window_size,
639
+ -self.shift_size), slice(-self.shift_size, None))
640
+ w_slices = (slice(0, -self.window_size),
641
+ slice(-self.window_size,
642
+ -self.shift_size), slice(-self.shift_size, None))
643
+ cnt = 0
644
+ for h in h_slices:
645
+ for w in w_slices:
646
+ img_mask[:, h, w, :] = cnt
647
+ cnt += 1
648
+
649
+ mask_windows = window_partition(
650
+ img_mask, self.window_size) # nW, window_size, window_size, 1
651
+ mask_windows = mask_windows.view(-1,
652
+ self.window_size * self.window_size)
653
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
654
+ attn_mask = attn_mask.masked_fill(attn_mask != 0,
655
+ float(-100.0)).masked_fill(
656
+ attn_mask == 0, float(0.0))
657
+
658
+ for blk in self.blocks:
659
+ blk.H, blk.W = H, W
660
+ if self.use_checkpoint:
661
+ x = checkpoint.checkpoint(blk, x, attn_mask)
662
+ else:
663
+ x = blk(x, attn_mask)
664
+ if self.downsample is not None:
665
+ x_down = self.downsample(x, H, W)
666
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
667
+ return x, H, W, x_down, Wh, Ww
668
+ else:
669
+ return x, H, W, x, H, W
670
+
671
+
672
+ class PatchEmbed(nn.Module):
673
+ """ Image to Patch Embedding
674
+
675
+ Args:
676
+ patch_size (int): Patch token size. Default: 4.
677
+ in_chans (int): Number of input image channels. Default: 3.
678
+ embed_dim (int): Number of linear projection output channels. Default: 96.
679
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
680
+ """
681
+
682
+ def __init__(self,
683
+ patch_size=4,
684
+ in_chans=3,
685
+ embed_dim=96,
686
+ norm_layer=None):
687
+ super().__init__()
688
+ patch_size = to_2tuple(patch_size)
689
+ self.patch_size = patch_size
690
+
691
+ self.in_chans = in_chans
692
+ self.embed_dim = embed_dim
693
+
694
+ self.proj = nn.Conv2d(in_chans,
695
+ embed_dim,
696
+ kernel_size=patch_size,
697
+ stride=patch_size)
698
+ if norm_layer is not None:
699
+ self.norm = norm_layer(embed_dim)
700
+ else:
701
+ self.norm = None
702
+
703
+ def forward(self, x):
704
+ """Forward function."""
705
+ # padding
706
+ _, _, H, W = x.size()
707
+ if W % self.patch_size[1] != 0:
708
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
709
+ if H % self.patch_size[0] != 0:
710
+ x = F.pad(x,
711
+ (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
712
+
713
+ x = self.proj(x) # B C Wh Ww
714
+ if self.norm is not None:
715
+ Wh, Ww = x.size(2), x.size(3)
716
+ x = x.flatten(2).transpose(1, 2)
717
+ x = self.norm(x)
718
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
719
+
720
+ return x
721
+
722
+
723
+ class SwinTransformer(nn.Module):
724
+ """ Swin Transformer backbone.
725
+ A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
726
+ https://arxiv.org/pdf/2103.14030
727
+
728
+ Args:
729
+ pretrain_img_size (int): Input image size for training the pretrained model,
730
+ used in absolute position embedding. Default 224.
731
+ patch_size (int | tuple(int)): Patch size. Default: 4.
732
+ in_chans (int): Number of input image channels. Default: 3.
733
+ embed_dim (int): Number of linear projection output channels. Default: 96.
734
+ depths (tuple[int]): Depths of each Swin Transformer stage.
735
+ num_heads (tuple[int]): Number of attention head of each stage.
736
+ window_size (int): Window size. Default: 7.
737
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
738
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
739
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
740
+ drop_rate (float): Dropout rate.
741
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
742
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
743
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
744
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
745
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
746
+ out_indices (Sequence[int]): Output from which stages.
747
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
748
+ -1 means not freezing any parameters.
749
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
750
+ """
751
+
752
+ def __init__(self,
753
+ pretrain_img_size=224,
754
+ patch_size=4,
755
+ in_chans=3,
756
+ embed_dim=96,
757
+ depths=[2, 2, 6, 2],
758
+ num_heads=[3, 6, 12, 24],
759
+ window_size=7,
760
+ mlp_ratio=4.,
761
+ qkv_bias=True,
762
+ qk_scale=None,
763
+ drop_rate=0.,
764
+ attn_drop_rate=0.,
765
+ drop_path_rate=0.2,
766
+ norm_layer=nn.LayerNorm,
767
+ ape=False,
768
+ patch_norm=True,
769
+ out_indices=(0, 1, 2, 3),
770
+ frozen_stages=-1,
771
+ use_checkpoint=False):
772
+ super().__init__()
773
+
774
+ self.pretrain_img_size = pretrain_img_size
775
+ self.num_layers = len(depths)
776
+ self.embed_dim = embed_dim
777
+ self.ape = ape
778
+ self.patch_norm = patch_norm
779
+ self.out_indices = out_indices
780
+ self.frozen_stages = frozen_stages
781
+
782
+ # split image into non-overlapping patches
783
+ self.patch_embed = PatchEmbed(
784
+ patch_size=patch_size,
785
+ in_chans=in_chans,
786
+ embed_dim=embed_dim,
787
+ norm_layer=norm_layer if self.patch_norm else None)
788
+
789
+ # absolute position embedding
790
+ if self.ape:
791
+ pretrain_img_size = to_2tuple(pretrain_img_size)
792
+ patch_size = to_2tuple(patch_size)
793
+ patches_resolution = [
794
+ pretrain_img_size[0] // patch_size[0],
795
+ pretrain_img_size[1] // patch_size[1]
796
+ ]
797
+
798
+ self.absolute_pos_embed = nn.Parameter(
799
+ torch.zeros(1, embed_dim, patches_resolution[0],
800
+ patches_resolution[1]))
801
+ trunc_normal_(self.absolute_pos_embed, std=.02)
802
+
803
+ self.pos_drop = nn.Dropout(p=drop_rate)
804
+
805
+ # stochastic depth
806
+ dpr = [
807
+ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
808
+ ] # stochastic depth decay rule
809
+
810
+ # build layers
811
+ self.layers = nn.ModuleList()
812
+ for i_layer in range(self.num_layers):
813
+ layer = BasicLayer(
814
+ dim=int(embed_dim * 2**i_layer),
815
+ depth=depths[i_layer],
816
+ num_heads=num_heads[i_layer],
817
+ window_size=window_size,
818
+ mlp_ratio=mlp_ratio,
819
+ qkv_bias=qkv_bias,
820
+ qk_scale=qk_scale,
821
+ drop=drop_rate,
822
+ attn_drop=attn_drop_rate,
823
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
824
+ norm_layer=norm_layer,
825
+ downsample=PatchMerging if
826
+ (i_layer < self.num_layers - 1) else None,
827
+ use_checkpoint=use_checkpoint)
828
+ self.layers.append(layer)
829
+
830
+ num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
831
+ self.num_features = num_features
832
+
833
+ # add a norm layer for each output
834
+ for i_layer in out_indices:
835
+ layer = norm_layer(num_features[i_layer])
836
+ layer_name = f'norm{i_layer}'
837
+ self.add_module(layer_name, layer)
838
+
839
+ self._freeze_stages()
840
+
841
+ def _freeze_stages(self):
842
+ if self.frozen_stages >= 0:
843
+ self.patch_embed.eval()
844
+ for param in self.patch_embed.parameters():
845
+ param.requires_grad = False
846
+
847
+ if self.frozen_stages >= 1 and self.ape:
848
+ self.absolute_pos_embed.requires_grad = False
849
+
850
+ if self.frozen_stages >= 2:
851
+ self.pos_drop.eval()
852
+ for i in range(0, self.frozen_stages - 1):
853
+ m = self.layers[i]
854
+ m.eval()
855
+ for param in m.parameters():
856
+ param.requires_grad = False
857
+
858
+ def init_weights(self, pretrained=None):
859
+ """Initialize the weights in backbone.
860
+
861
+ Args:
862
+ pretrained (str, optional): Path to pre-trained weights.
863
+ Defaults to None.
864
+ """
865
+
866
+ def _init_weights(m):
867
+ if isinstance(m, nn.Linear):
868
+ trunc_normal_(m.weight, std=.02)
869
+ if isinstance(m, nn.Linear) and m.bias is not None:
870
+ nn.init.constant_(m.bias, 0)
871
+ elif isinstance(m, nn.LayerNorm):
872
+ nn.init.constant_(m.bias, 0)
873
+ nn.init.constant_(m.weight, 1.0)
874
+
875
+ if isinstance(pretrained, str):
876
+ self.apply(_init_weights)
877
+ load_checkpoint(self, pretrained, strict=False, logger=None)
878
+ elif pretrained is None:
879
+ self.apply(_init_weights)
880
+ else:
881
+ raise TypeError('pretrained must be a str or None')
882
+
883
+ def forward(self, x):
884
+ x = self.patch_embed(x)
885
+
886
+ Wh, Ww = x.size(2), x.size(3)
887
+ if self.ape:
888
+ # interpolate the position embedding to the corresponding size
889
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed,
890
+ size=(Wh, Ww),
891
+ mode='bicubic')
892
+ x = (x + absolute_pos_embed) # B C Wh Ww
893
+
894
+ outs = [x.contiguous()]
895
+ x = x.flatten(2).transpose(1, 2)
896
+ x = self.pos_drop(x)
897
+ for i in range(self.num_layers):
898
+ layer = self.layers[i]
899
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
900
+
901
+ if i in self.out_indices:
902
+ norm_layer = getattr(self, f'norm{i}')
903
+ x_out = norm_layer(x_out)
904
+
905
+ out = x_out.view(-1, H, W,
906
+ self.num_features[i]).permute(0, 3, 1,
907
+ 2).contiguous()
908
+ outs.append(out)
909
+
910
+ return tuple(outs)
911
+
912
+ def train(self, mode=True):
913
+ """Convert the model into training mode while keep layers freezed."""
914
+ super(SwinTransformer, self).train(mode)
915
+ self._freeze_stages()
916
+
917
+
918
+ class PositionEmbeddingSine:
919
+
920
+ def __init__(self,
921
+ num_pos_feats=64,
922
+ temperature=10000,
923
+ normalize=False,
924
+ scale=None):
925
+ super().__init__()
926
+ self.num_pos_feats = num_pos_feats
927
+ self.temperature = temperature
928
+ self.normalize = normalize
929
+ if scale is not None and normalize is False:
930
+ raise ValueError("normalize should be True if scale is passed")
931
+ if scale is None:
932
+ scale = 2 * math.pi
933
+ self.scale = scale
934
+ self.dim_t = torch.arange(0,
935
+ self.num_pos_feats,
936
+ dtype=torch_dtype,
937
+ device=torch_device)
938
+
939
+ def __call__(self, b, h, w):
940
+ mask = torch.zeros([b, h, w], dtype=torch.bool, device=torch_device)
941
+ assert mask is not None
942
+ not_mask = ~mask
943
+ y_embed = not_mask.cumsum(dim=1, dtype=torch_dtype)
944
+ x_embed = not_mask.cumsum(dim=2, dtype=torch_dtype)
945
+ if self.normalize:
946
+ eps = 1e-6
947
+ y_embed = ((y_embed - 0.5) / (y_embed[:, -1:, :] + eps) *
948
+ self.scale).to(device=torch_device, dtype=torch_dtype)
949
+ x_embed = ((x_embed - 0.5) / (x_embed[:, :, -1:] + eps) *
950
+ self.scale).to(device=torch_device, dtype=torch_dtype)
951
+
952
+ dim_t = self.temperature**(2 * (self.dim_t // 2) / self.num_pos_feats)
953
+
954
+ pos_x = x_embed[:, :, :, None] / dim_t
955
+ pos_y = y_embed[:, :, :, None] / dim_t
956
+ pos_x = torch.stack(
957
+ (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
958
+ dim=4).flatten(3)
959
+ pos_y = torch.stack(
960
+ (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
961
+ dim=4).flatten(3)
962
+ return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
963
+
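# Editor's note (hedged sketch): PositionEmbeddingSine maps a (b, h, w) grid to a
# dense sine/cosine encoding of shape (b, 2 * num_pos_feats, h, w); with
# num_pos_feats=d_model // 2, as used by MCLM/MCRM below, the channel count equals
# d_model, e.g.
#     pe = PositionEmbeddingSine(num_pos_feats=64, normalize=True)
#     pos = pe(1, 16, 16)    # -> (1, 128, 16, 16): y-terms first, then x-terms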
964
+
965
+ class MCLM(nn.Module):
966
+
967
+ def __init__(self, d_model, num_heads, pool_ratios=[1, 4, 8]):
968
+ super(MCLM, self).__init__()
969
+ self.attention = nn.ModuleList([
970
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
971
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
972
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
973
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
974
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
975
+ ])
976
+
977
+ self.linear1 = nn.Linear(d_model, d_model * 2)
978
+ self.linear2 = nn.Linear(d_model * 2, d_model)
979
+ self.linear3 = nn.Linear(d_model, d_model * 2)
980
+ self.linear4 = nn.Linear(d_model * 2, d_model)
981
+ self.norm1 = nn.LayerNorm(d_model)
982
+ self.norm2 = nn.LayerNorm(d_model)
983
+ self.dropout = nn.Dropout(0.1)
984
+ self.dropout1 = nn.Dropout(0.1)
985
+ self.dropout2 = nn.Dropout(0.1)
986
+ self.activation = get_activation_fn('relu')
987
+ self.pool_ratios = pool_ratios
988
+ self.p_poses = []
989
+ self.g_pos = None
990
+ self.positional_encoding = PositionEmbeddingSine(
991
+ num_pos_feats=d_model // 2, normalize=True)
992
+
993
+ def forward(self, l, g):
994
+ """
995
+ l: 4,c,h,w
996
+ g: 1,c,h,w
997
+ """
998
+ b, c, h, w = l.size()
999
+ # 4,c,h,w -> 1,c,2h,2w
1000
+ concated_locs = rearrange(l,
1001
+ '(hg wg b) c h w -> b c (hg h) (wg w)',
1002
+ hg=2,
1003
+ wg=2)
1004
+
1005
+ pools = []
1006
+ for pool_ratio in self.pool_ratios:
1007
+ # b,c,h,w
1008
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1009
+ pool = F.adaptive_avg_pool2d(concated_locs, tgt_hw)
1010
+ pools.append(rearrange(pool, 'b c h w -> (h w) b c'))
1011
+ if self.g_pos is None:
1012
+ pos_emb = self.positional_encoding(pool.shape[0],
1013
+ pool.shape[2],
1014
+ pool.shape[3])
1015
+ pos_emb = rearrange(pos_emb, 'b c h w -> (h w) b c')
1016
+ self.p_poses.append(pos_emb)
1017
+ pools = torch.cat(pools, 0)
1018
+ if self.g_pos is None:
1019
+ self.p_poses = torch.cat(self.p_poses, dim=0)
1020
+ pos_emb = self.positional_encoding(g.shape[0], g.shape[2],
1021
+ g.shape[3])
1022
+ self.g_pos = rearrange(pos_emb, 'b c h w -> (h w) b c')
1023
+
1024
+ # attention between glb (q) & multi-scale pooled concatenated locs (k, v)
1025
+ g_hw_b_c = rearrange(g, 'b c h w -> (h w) b c')
1026
+ g_hw_b_c = g_hw_b_c + self.dropout1(self.attention[0](
1027
+ g_hw_b_c + self.g_pos, pools + self.p_poses, pools)[0])
1028
+ g_hw_b_c = self.norm1(g_hw_b_c)
1029
+ g_hw_b_c = g_hw_b_c + self.dropout2(
1030
+ self.linear2(
1031
+ self.dropout(self.activation(self.linear1(g_hw_b_c)).clone())))
1032
+ g_hw_b_c = self.norm2(g_hw_b_c)
1033
+
1034
+ # attention between original locs (q) & refreshed glb (k, v)
1035
+ l_hw_b_c = rearrange(l, "b c h w -> (h w) b c")
1036
+ _g_hw_b_c = rearrange(g_hw_b_c, '(h w) b c -> h w b c', h=h, w=w)
1037
+ _g_hw_b_c = rearrange(_g_hw_b_c,
1038
+ "(ng h) (nw w) b c -> (h w) (ng nw b) c",
1039
+ ng=2,
1040
+ nw=2)
1041
+ outputs_re = []
1042
+ for i, (_l, _g) in enumerate(
1043
+ zip(l_hw_b_c.chunk(4, dim=1), _g_hw_b_c.chunk(4, dim=1))):
1044
+ outputs_re.append(self.attention[i + 1](_l, _g,
1045
+ _g)[0]) # (h w) 1 c
1046
+ outputs_re = torch.cat(outputs_re, 1) # (h w) 4 c
1047
+
1048
+ l_hw_b_c = l_hw_b_c + self.dropout1(outputs_re)
1049
+ l_hw_b_c = self.norm1(l_hw_b_c)
1050
+ l_hw_b_c = l_hw_b_c + self.dropout2(
1051
+ self.linear4(
1052
+ self.dropout(self.activation(self.linear3(l_hw_b_c)).clone())))
1053
+ l_hw_b_c = self.norm2(l_hw_b_c)
1054
+
1055
+ l = torch.cat((l_hw_b_c, g_hw_b_c), 1) # hw,b(5),c
1056
+ return rearrange(l, "(h w) b c -> b c h w", h=h, w=w) ## (5,c,h*w)
1057
+
1058
+
1059
+ class inf_MCLM(nn.Module):
1060
+
1061
+ def __init__(self, d_model, num_heads, pool_ratios=[1, 4, 8]):
1062
+ super(inf_MCLM, self).__init__()
1063
+ self.attention = nn.ModuleList([
1064
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1065
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1066
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1067
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1068
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1069
+ ])
1070
+
1071
+ self.linear1 = nn.Linear(d_model, d_model * 2)
1072
+ self.linear2 = nn.Linear(d_model * 2, d_model)
1073
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1074
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1075
+ self.norm1 = nn.LayerNorm(d_model)
1076
+ self.norm2 = nn.LayerNorm(d_model)
1077
+ self.dropout = nn.Dropout(0.1)
1078
+ self.dropout1 = nn.Dropout(0.1)
1079
+ self.dropout2 = nn.Dropout(0.1)
1080
+ self.activation = get_activation_fn('relu')
1081
+ self.pool_ratios = pool_ratios
1082
+ self.p_poses = []
1083
+ self.g_pos = None
1084
+ self.positional_encoding = PositionEmbeddingSine(
1085
+ num_pos_feats=d_model // 2, normalize=True)
1086
+
1087
+ def forward(self, l, g):
1088
+ """
1089
+ l: 4,c,h,w
1090
+ g: 1,c,h,w
1091
+ """
1092
+ b, c, h, w = l.size()
1093
+ # 4,c,h,w -> 1,c,2h,2w
1094
+ concated_locs = rearrange(l,
1095
+ '(hg wg b) c h w -> b c (hg h) (wg w)',
1096
+ hg=2,
1097
+ wg=2)
1098
+ self.p_poses = []
1099
+ pools = []
1100
+ for pool_ratio in self.pool_ratios:
1101
+ # b,c,h,w
1102
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1103
+ pool = F.adaptive_avg_pool2d(concated_locs, tgt_hw)
1104
+ pools.append(rearrange(pool, 'b c h w -> (h w) b c'))
1105
+ # if self.g_pos is None:
1106
+ pos_emb = self.positional_encoding(pool.shape[0], pool.shape[2],
1107
+ pool.shape[3])
1108
+ pos_emb = rearrange(pos_emb, 'b c h w -> (h w) b c')
1109
+ self.p_poses.append(pos_emb)
1110
+ pools = torch.cat(pools, 0)
1111
+ # if self.g_pos is None:
1112
+ self.p_poses = torch.cat(self.p_poses, dim=0)
1113
+ pos_emb = self.positional_encoding(g.shape[0], g.shape[2], g.shape[3])
1114
+ self.g_pos = rearrange(pos_emb, 'b c h w -> (h w) b c')
1115
+
1116
+ # attention between glb (q) & multi-scale pooled concatenated locs (k, v)
1117
+ g_hw_b_c = rearrange(g, 'b c h w -> (h w) b c')
1118
+ g_hw_b_c = g_hw_b_c + self.dropout1(self.attention[0](
1119
+ g_hw_b_c + self.g_pos, pools + self.p_poses, pools)[0])
1120
+ g_hw_b_c = self.norm1(g_hw_b_c)
1121
+ g_hw_b_c = g_hw_b_c + self.dropout2(
1122
+ self.linear2(
1123
+ self.dropout(self.activation(self.linear1(g_hw_b_c)).clone())))
1124
+ g_hw_b_c = self.norm2(g_hw_b_c)
1125
+
1126
+ # attention between original locs (q) & refreshed glb (k, v)
1127
+ l_hw_b_c = rearrange(l, "b c h w -> (h w) b c")
1128
+ _g_hw_b_c = rearrange(g_hw_b_c, '(h w) b c -> h w b c', h=h, w=w)
1129
+ _g_hw_b_c = rearrange(_g_hw_b_c,
1130
+ "(ng h) (nw w) b c -> (h w) (ng nw b) c",
1131
+ ng=2,
1132
+ nw=2)
1133
+ outputs_re = []
1134
+ for i, (_l, _g) in enumerate(
1135
+ zip(l_hw_b_c.chunk(4, dim=1), _g_hw_b_c.chunk(4, dim=1))):
1136
+ outputs_re.append(self.attention[i + 1](_l, _g,
1137
+ _g)[0]) # (h w) 1 c
1138
+ outputs_re = torch.cat(outputs_re, 1) # (h w) 4 c
1139
+
1140
+ l_hw_b_c = l_hw_b_c + self.dropout1(outputs_re)
1141
+ l_hw_b_c = self.norm1(l_hw_b_c)
1142
+ l_hw_b_c = l_hw_b_c + self.dropout2(
1143
+ self.linear4(
1144
+ self.dropout(self.activation(self.linear3(l_hw_b_c)).clone())))
1145
+ l_hw_b_c = self.norm2(l_hw_b_c)
1146
+
1147
+ l = torch.cat((l_hw_b_c, g_hw_b_c), 1) # hw,b(5),c
1148
+ return rearrange(l, "(h w) b c -> b c h w", h=h, w=w) ## (5,c,h*w)
1149
+
1150
+
1151
+ class MCRM(nn.Module):
1152
+
1153
+ def __init__(self, d_model, num_heads, pool_ratios=[4, 8, 16], h=None):
1154
+ super(MCRM, self).__init__()
1155
+ self.attention = nn.ModuleList([
1156
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1157
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1158
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1159
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1160
+ ])
1161
+
1162
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1163
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1164
+ self.norm1 = nn.LayerNorm(d_model)
1165
+ self.norm2 = nn.LayerNorm(d_model)
1166
+ self.dropout = nn.Dropout(0.1)
1167
+ self.dropout1 = nn.Dropout(0.1)
1168
+ self.dropout2 = nn.Dropout(0.1)
1169
+ self.sigmoid = nn.Sigmoid()
1170
+ self.activation = get_activation_fn('relu')
1171
+ self.sal_conv = nn.Conv2d(d_model, 1, 1)
1172
+ self.pool_ratios = pool_ratios
1173
+ self.positional_encoding = PositionEmbeddingSine(
1174
+ num_pos_feats=d_model // 2, normalize=True)
1175
+
1176
+ def forward(self, x):
1177
+ b, c, h, w = x.size()
1178
+ loc, glb = x.split([4, 1], dim=0) # 4,c,h,w; 1,c,h,w
1179
+ # b(4),c,h,w
1180
+ patched_glb = rearrange(glb,
1181
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1182
+ hg=2,
1183
+ wg=2)
1184
+
1185
+ # generate token attention map
1186
+ token_attention_map = self.sigmoid(self.sal_conv(glb))
1187
+ token_attention_map = F.interpolate(token_attention_map,
1188
+ size=patches2image(loc).shape[-2:],
1189
+ mode='nearest')
1190
+ loc = loc * rearrange(token_attention_map,
1191
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1192
+ hg=2,
1193
+ wg=2)
1194
+ pools = []
1195
+ for pool_ratio in self.pool_ratios:
1196
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1197
+ pool = F.adaptive_avg_pool2d(patched_glb, tgt_hw)
1198
+ pools.append(rearrange(pool,
1199
+ 'nl c h w -> nl c (h w)')) # nl(4),c,hw
1200
+ # nl(4),c,nphw -> nl(4),nphw,1,c
1201
+ pools = rearrange(torch.cat(pools, 2), "nl c nphw -> nl nphw 1 c")
1202
+ loc_ = rearrange(loc, 'nl c h w -> nl (h w) 1 c')
1203
+ outputs = []
1204
+ for i, q in enumerate(
1205
+ loc_.unbind(dim=0)): # traverse all local patches
1206
+ # np*hw,1,c
1207
+ v = pools[i]
1208
+ k = v
1209
+ outputs.append(self.attention[i](q, k, v)[0])
1210
+ outputs = torch.cat(outputs, 1)
1211
+ src = loc.view(4, c, -1).permute(2, 0, 1) + self.dropout1(outputs)
1212
+ src = self.norm1(src)
1213
+ src = src + self.dropout2(
1214
+ self.linear4(
1215
+ self.dropout(self.activation(self.linear3(src)).clone())))
1216
+ src = self.norm2(src)
1217
+
1218
+ src = src.permute(1, 2, 0).reshape(4, c, h, w) # refreshed loc
1219
+ glb = glb + F.interpolate(patches2image(src),
1220
+ size=glb.shape[-2:],
1221
+ mode='nearest') # refreshed glb
1222
+ return torch.cat((src, glb), 0), token_attention_map
1223
+
1224
+
1225
+ class inf_MCRM(nn.Module):
1226
+
1227
+ def __init__(self, d_model, num_heads, pool_ratios=[4, 8, 16], h=None):
1228
+ super(inf_MCRM, self).__init__()
1229
+ self.attention = nn.ModuleList([
1230
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1231
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1232
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1233
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1234
+ ])
1235
+
1236
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1237
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1238
+ self.norm1 = nn.LayerNorm(d_model)
1239
+ self.norm2 = nn.LayerNorm(d_model)
1240
+ self.dropout = nn.Dropout(0.1)
1241
+ self.dropout1 = nn.Dropout(0.1)
1242
+ self.dropout2 = nn.Dropout(0.1)
1243
+ self.sigmoid = nn.Sigmoid()
1244
+ self.activation = get_activation_fn('relu')
1245
+ self.sal_conv = nn.Conv2d(d_model, 1, 1)
1246
+ self.pool_ratios = pool_ratios
1247
+ self.positional_encoding = PositionEmbeddingSine(
1248
+ num_pos_feats=d_model // 2, normalize=True)
1249
+
1250
+ def forward(self, x):
1251
+ b, c, h, w = x.size()
1252
+ loc, glb = x.split([4, 1], dim=0) # 4,c,h,w; 1,c,h,w
1253
+ # b(4),c,h,w
1254
+ patched_glb = rearrange(glb,
1255
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1256
+ hg=2,
1257
+ wg=2)
1258
+
1259
+ # generate token attention map
1260
+ token_attention_map = self.sigmoid(self.sal_conv(glb))
1261
+ token_attention_map = F.interpolate(token_attention_map,
1262
+ size=patches2image(loc).shape[-2:],
1263
+ mode='nearest')
1264
+ loc = loc * rearrange(token_attention_map,
1265
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1266
+ hg=2,
1267
+ wg=2)
1268
+ pools = []
1269
+ for pool_ratio in self.pool_ratios:
1270
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1271
+ pool = F.adaptive_avg_pool2d(patched_glb, tgt_hw)
1272
+ pools.append(rearrange(pool,
1273
+ 'nl c h w -> nl c (h w)')) # nl(4),c,hw
1274
+ # nl(4),c,nphw -> nl(4),nphw,1,c
1275
+ pools = rearrange(torch.cat(pools, 2), "nl c nphw -> nl nphw 1 c")
1276
+ loc_ = rearrange(loc, 'nl c h w -> nl (h w) 1 c')
1277
+ outputs = []
1278
+ for i, q in enumerate(
1279
+ loc_.unbind(dim=0)): # traverse all local patches
1280
+ # np*hw,1,c
1281
+ v = pools[i]
1282
+ k = v
1283
+ outputs.append(self.attention[i](q, k, v)[0])
1284
+ outputs = torch.cat(outputs, 1)
1285
+ src = loc.view(4, c, -1).permute(2, 0, 1) + self.dropout1(outputs)
1286
+ src = self.norm1(src)
1287
+ src = src + self.dropout2(
1288
+ self.linear4(
1289
+ self.dropout(self.activation(self.linear3(src)).clone())))
1290
+ src = self.norm2(src)
1291
+
1292
+ src = src.permute(1, 2, 0).reshape(4, c, h, w) # refreshed loc
1293
+ glb = glb + F.interpolate(patches2image(src),
1294
+ size=glb.shape[-2:],
1295
+ mode='nearest') # refreshed glb
1296
+ return torch.cat((src, glb), 0)
1297
+
1298
+
1299
+ # model for single-scale training
1300
+ class MVANet(nn.Module):
1301
+
1302
+ def __init__(self):
1303
+ super().__init__()
1304
+ self.backbone = SwinB(pretrained=True)
1305
+ emb_dim = 128
1306
+ self.sideout5 = nn.Sequential(
1307
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1308
+ self.sideout4 = nn.Sequential(
1309
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1310
+ self.sideout3 = nn.Sequential(
1311
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1312
+ self.sideout2 = nn.Sequential(
1313
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1314
+ self.sideout1 = nn.Sequential(
1315
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1316
+
1317
+ self.output5 = make_cbr(1024, emb_dim)
1318
+ self.output4 = make_cbr(512, emb_dim)
1319
+ self.output3 = make_cbr(256, emb_dim)
1320
+ self.output2 = make_cbr(128, emb_dim)
1321
+ self.output1 = make_cbr(128, emb_dim)
1322
+
1323
+ self.multifieldcrossatt = MCLM(emb_dim, 1, [1, 4, 8])
1324
+ self.conv1 = make_cbr(emb_dim, emb_dim)
1325
+ self.conv2 = make_cbr(emb_dim, emb_dim)
1326
+ self.conv3 = make_cbr(emb_dim, emb_dim)
1327
+ self.conv4 = make_cbr(emb_dim, emb_dim)
1328
+ self.dec_blk1 = MCRM(emb_dim, 1, [2, 4, 8])
1329
+ self.dec_blk2 = MCRM(emb_dim, 1, [2, 4, 8])
1330
+ self.dec_blk3 = MCRM(emb_dim, 1, [2, 4, 8])
1331
+ self.dec_blk4 = MCRM(emb_dim, 1, [2, 4, 8])
1332
+
1333
+ self.insmask_head = nn.Sequential(
1334
+ nn.Conv2d(emb_dim, 384, kernel_size=3, padding=1),
1335
+ nn.BatchNorm2d(384), nn.PReLU(),
1336
+ nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.BatchNorm2d(384),
1337
+ nn.PReLU(), nn.Conv2d(384, emb_dim, kernel_size=3, padding=1))
1338
+
1339
+ self.shallow = nn.Sequential(
1340
+ nn.Conv2d(3, emb_dim, kernel_size=3, padding=1))
1341
+ self.upsample1 = make_cbg(emb_dim, emb_dim)
1342
+ self.upsample2 = make_cbg(emb_dim, emb_dim)
1343
+ self.output = nn.Sequential(
1344
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1345
+
1346
+ for m in self.modules():
1347
+ if isinstance(m, nn.ReLU) or isinstance(m, nn.Dropout):
1348
+ m.inplace = True
1349
+
1350
+ def forward(self, x):
1351
+ x = x.to(dtype=torch_dtype, device=torch_device)
1352
+ shallow = self.shallow(x)
1353
+ glb = rescale_to(x, scale_factor=0.5, interpolation='bilinear')
1354
+ loc = image2patches(x)
1355
+ input = torch.cat((loc, glb), dim=0)
1356
+ feature = self.backbone(input)
1357
+ e5 = self.output5(feature[4]) # (5,128,16,16)
1358
+ e4 = self.output4(feature[3]) # (5,128,32,32)
1359
+ e3 = self.output3(feature[2]) # (5,128,64,64)
1360
+ e2 = self.output2(feature[1]) # (5,128,128,128)
1361
+ e1 = self.output1(feature[0]) # (5,128,128,128)
1362
+ loc_e5, glb_e5 = e5.split([4, 1], dim=0)
1363
+ e5 = self.multifieldcrossatt(loc_e5, glb_e5) # (4,128,16,16)
1364
+
1365
+ e4, tokenattmap4 = self.dec_blk4(e4 + resize_as(e5, e4))
1366
+ e4 = self.conv4(e4)
1367
+ e3, tokenattmap3 = self.dec_blk3(e3 + resize_as(e4, e3))
1368
+ e3 = self.conv3(e3)
1369
+ e2, tokenattmap2 = self.dec_blk2(e2 + resize_as(e3, e2))
1370
+ e2 = self.conv2(e2)
1371
+ e1, tokenattmap1 = self.dec_blk1(e1 + resize_as(e2, e1))
1372
+ e1 = self.conv1(e1)
1373
+ loc_e1, glb_e1 = e1.split([4, 1], dim=0)
1374
+ output1_cat = patches2image(loc_e1) # (1,128,256,256)
1375
+ # add glb feat in
1376
+ output1_cat = output1_cat + resize_as(glb_e1, output1_cat)
1377
+ # merge
1378
+ final_output = self.insmask_head(output1_cat) # (1,128,256,256)
1379
+ # shallow feature merge
1380
+ final_output = final_output + resize_as(shallow, final_output)
1381
+ final_output = self.upsample1(rescale_to(final_output))
1382
+ final_output = rescale_to(final_output +
1383
+ resize_as(shallow, final_output))
1384
+ final_output = self.upsample2(final_output)
1385
+ final_output = self.output(final_output)
1386
+ ####
1387
+ sideout5 = self.sideout5(e5).to(dtype=torch_dtype, device=torch_device)
1388
+ sideout4 = self.sideout4(e4)
1389
+ sideout3 = self.sideout3(e3)
1390
+ sideout2 = self.sideout2(e2)
1391
+ sideout1 = self.sideout1(e1)
1392
+ #######glb_sideouts ######
1393
+ glb5 = self.sideout5(glb_e5)
1394
+ glb4 = sideout4[-1, :, :, :].unsqueeze(0)
1395
+ glb3 = sideout3[-1, :, :, :].unsqueeze(0)
1396
+ glb2 = sideout2[-1, :, :, :].unsqueeze(0)
1397
+ glb1 = sideout1[-1, :, :, :].unsqueeze(0)
1398
+ ####### concat 4 to 1 #######
1399
+ sideout1 = patches2image(sideout1[:-1]).to(dtype=torch_dtype,
1400
+ device=torch_device)
1401
+ sideout2 = patches2image(sideout2[:-1]).to(
1402
+ dtype=torch_dtype,
1403
+ device=torch_device) ####(5,c,h,w) -> (1 c 2h,2w)
1404
+ sideout3 = patches2image(sideout3[:-1]).to(dtype=torch_dtype,
1405
+ device=torch_device)
1406
+ sideout4 = patches2image(sideout4[:-1]).to(dtype=torch_dtype,
1407
+ device=torch_device)
1408
+ sideout5 = patches2image(sideout5[:-1]).to(dtype=torch_dtype,
1409
+ device=torch_device)
1410
+ if self.training:
1411
+ return sideout5, sideout4, sideout3, sideout2, sideout1, final_output, glb5, glb4, glb3, glb2, glb1, tokenattmap4, tokenattmap3, tokenattmap2, tokenattmap1
1412
+ else:
1413
+ return final_output
1414
+
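# Editor's note (hedged sketch): for a 1024x1024 input -- consistent with the
# shape comments above -- the forward pass builds a batch of five half-resolution
# views before calling the backbone:
#     glb = rescale_to(x, scale_factor=0.5, interpolation='bilinear')  # (1, 3, 512, 512) global view
#     loc = image2patches(x)                                           # (4, 3, 512, 512) four quadrants
#     torch.cat((loc, glb), dim=0)                                     # (5, 3, 512, 512) backbone input
# MCLM/MCRM then exchange information between the local and global views before
# patches2image stitches the quadrants back together.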
1415
+
1416
+ # model for multi-scale testing
1417
+ class inf_MVANet(nn.Module):
1418
+
1419
+ def __init__(self):
1420
+ super().__init__()
1421
+ # self.backbone = SwinB(pretrained=True)
1422
+ self.backbone = SwinB(pretrained=False)
1423
+
1424
+ emb_dim = 128
1425
+ self.output5 = make_cbr(1024, emb_dim)
1426
+ self.output4 = make_cbr(512, emb_dim)
1427
+ self.output3 = make_cbr(256, emb_dim)
1428
+ self.output2 = make_cbr(128, emb_dim)
1429
+ self.output1 = make_cbr(128, emb_dim)
1430
+
1431
+ self.multifieldcrossatt = inf_MCLM(emb_dim, 1, [1, 4, 8])
1432
+ self.conv1 = make_cbr(emb_dim, emb_dim)
1433
+ self.conv2 = make_cbr(emb_dim, emb_dim)
1434
+ self.conv3 = make_cbr(emb_dim, emb_dim)
1435
+ self.conv4 = make_cbr(emb_dim, emb_dim)
1436
+ self.dec_blk1 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1437
+ self.dec_blk2 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1438
+ self.dec_blk3 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1439
+ self.dec_blk4 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1440
+
1441
+ self.insmask_head = nn.Sequential(
1442
+ nn.Conv2d(emb_dim, 384, kernel_size=3, padding=1),
1443
+ nn.BatchNorm2d(384), nn.PReLU(),
1444
+ nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.BatchNorm2d(384),
1445
+ nn.PReLU(), nn.Conv2d(384, emb_dim, kernel_size=3, padding=1))
1446
+
1447
+ self.shallow = nn.Sequential(
1448
+ nn.Conv2d(3, emb_dim, kernel_size=3, padding=1))
1449
+ self.upsample1 = make_cbg(emb_dim, emb_dim)
1450
+ self.upsample2 = make_cbg(emb_dim, emb_dim)
1451
+ self.output = nn.Sequential(
1452
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1453
+
1454
+ for m in self.modules():
1455
+ if isinstance(m, nn.ReLU) or isinstance(m, nn.Dropout):
1456
+ m.inplace = True
1457
+
1458
+ def forward(self, x):
1459
+ shallow = self.shallow(x)
1460
+ glb = rescale_to(x, scale_factor=0.5, interpolation='bilinear')
1461
+ loc = image2patches(x)
1462
+ input = torch.cat((loc, glb), dim=0)
1463
+ feature = self.backbone(input)
1464
+ e5 = self.output5(feature[4])
1465
+ e4 = self.output4(feature[3])
1466
+ e3 = self.output3(feature[2])
1467
+ e2 = self.output2(feature[1])
1468
+ e1 = self.output1(feature[0])
1469
+ loc_e5, glb_e5 = e5.split([4, 1], dim=0)
1470
+ e5_cat = self.multifieldcrossatt(loc_e5, glb_e5)
1471
+
1472
+ e4 = self.conv4(self.dec_blk4(e4 + resize_as(e5_cat, e4)))
1473
+ e3 = self.conv3(self.dec_blk3(e3 + resize_as(e4, e3)))
1474
+ e2 = self.conv2(self.dec_blk2(e2 + resize_as(e3, e2)))
1475
+ e1 = self.conv1(self.dec_blk1(e1 + resize_as(e2, e1)))
1476
+ loc_e1, glb_e1 = e1.split([4, 1], dim=0)
1477
+ # after decoder, concat loc features to a whole one, and merge
1478
+ output1_cat = patches2image(loc_e1)
1479
+ # add glb feat in
1480
+ output1_cat = output1_cat + resize_as(glb_e1, output1_cat)
1481
+ # merge
1482
+ final_output = self.insmask_head(output1_cat)
1483
+ # shallow feature merge
1484
+ final_output = final_output + resize_as(shallow, final_output)
1485
+ final_output = self.upsample1(rescale_to(final_output))
1486
+ final_output = rescale_to(final_output +
1487
+ resize_as(shallow, final_output))
1488
+ final_output = self.upsample2(final_output)
1489
+ final_output = self.output(final_output)
1490
+ return final_output
1491
+
1492
+
1493
+ class load_MVANet_Model:
1494
+
1495
+ def __init__(self):
1496
+ pass
1497
+
1498
+ @classmethod
1499
+ def INPUT_TYPES(s):
1500
+ return {
1501
+ "required": {},
1502
+ }
1503
+
1504
+ RETURN_TYPES = ("MVANet_Model", )
1505
+ FUNCTION = "test"
1506
+ CATEGORY = "MVANet"
1507
+
1508
+ def test(self):
1509
+ return (load_model(get_model_path()), )
1510
+
1511
+
1512
+ class run_MVANet_inference:
1513
+
1514
+ def __init__(self):
1515
+ pass
1516
+
1517
+ @classmethod
1518
+ def INPUT_TYPES(s):
1519
+ return {
1520
+ "required": {
1521
+ "image": ("IMAGE", ),
1522
+ "MVANet_Model": ("MVANet_Model", ),
1523
+ },
1524
+ }
1525
+
1526
+ RETURN_TYPES = ("MASK", )
1527
+ FUNCTION = "test"
1528
+ CATEGORY = "MVANet"
1529
+
1530
+ def test(
1531
+ self,
1532
+ image,
1533
+ MVANet_Model,
1534
+ ):
1535
+ ret = do_infer_tensor2tensor(img=image, net=MVANet_Model)
1536
+
1537
+ return (ret, )
1538
+
1539
+
1540
+ NODE_CLASS_MAPPINGS = {
1541
+ "load_MVANet_Model": load_MVANet_Model,
1542
+ "run_MVANet_inference": run_MVANet_inference
1543
+ }
1544
+
1545
+ NODE_DISPLAY_NAME_MAPPINGS = {
1546
+ "load_MVANet_Model": "load MVANet Model",
1547
+ "run_MVANet_inference": "run_MVANet_inference"
1548
+ }
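A minimal driver sketch for the two nodes above when called outside the ComfyUI graph (assumptions: this module is importable as MVANet_inference, the checkpoint located by get_model_path()/load_model() exists, and the input follows ComfyUI's IMAGE convention of a B x H x W x C float tensor in [0, 1]):

import torch
import MVANet_inference as mva    # hypothetical import name for this file

(net,) = mva.load_MVANet_Model().test()    # build inf_MVANet and load weights
image = torch.rand(1, 1024, 1024, 3)       # ComfyUI-style IMAGE tensor
(mask,) = mva.run_MVANet_inference().test(image=image, MVANet_Model=net)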
ComfyUI_MVANet/MVANet_inference.run.sh ADDED
@@ -0,0 +1,3 @@
1
+ #!/bin/sh
2
+ . "${HOME}/dbnew.sh"
3
+ python3 './MVANet_inference.py'
ComfyUI_MVANet/README.org ADDED
@@ -0,0 +1,1694 @@
1
+ * COMMENT Sample
2
+
3
+ ** Shell script to download
4
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./download.sh
5
+ #+end_src
6
+
7
+ ** MVANet_inference import
8
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.import.py
9
+ #+end_src
10
+
11
+ ** MVANet_inference function
12
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
13
+ #+end_src
14
+
15
+ ** MVANet_inference class
16
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.class.py
17
+ #+end_src
18
+
19
+ ** MVANet_inference execute
20
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.execute.py
21
+ #+end_src
22
+
23
+ ** MVANet_inference unify
24
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./MVANet_inference.unify.sh
25
+ #+end_src
26
+
27
+ * Download the code:
28
+
29
+ ** Function to download
30
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./download.sh
31
+ get_repo(){
32
+ DIR_REPO="${HOME}/GITHUB/$('echo' "${1}" | 'sed' 's/^git@github.com://g ; s@^https://github.com/@@g ; s@.git$@@g' )"
33
+ DIR_BASE="$('dirname' '--' "${DIR_REPO}")"
34
+ mkdir -pv -- "${DIR_BASE}"
35
+ cd "${DIR_BASE}"
36
+ git clone "${1}"
37
+ cd "${DIR_REPO}"
38
+ git pull
39
+ git submodule update --recursive --init
40
+ }
41
+ #+end_src
42
+
43
+ ** Download
44
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./download.sh
45
+ get_repo 'https://github.com/qianyu-dlut/MVANet.git'
46
+ #+end_src
47
+
48
+ * Dependencies
49
+ #+begin_src conf :tangle ./requirements.txt
50
+ timm
51
+ einops
52
+ wget
53
+ #+end_src
54
+
55
+ * Python inference
56
+
57
+ ** Important configs
58
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.import.py
59
+ import os
60
+ import sys
61
+
62
+ HOME_DIR = os.environ.get('HOME', '/root')
63
+ MVANET_SOURCE_DIR = HOME_DIR + '/GITHUB/qianyu-dlut/MVANet'
64
+ finetuned_MVANet_model_path = MVANET_SOURCE_DIR + '/model/Model_80.pth'
65
+ pretrained_SwinB_model_path = MVANET_SOURCE_DIR + '/model/swin_base_patch4_window12_384_22kto1k.pth'
66
+ #+end_src
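These paths assume the finetuned Model_80.pth and the pretrained SwinB checkpoint have already been placed under the cloned repository; a sketch of the expected layout (the checkpoint files themselves must be obtained from the MVANet release, which is not covered here):

#+begin_src sh
mkdir -pv -- "${HOME}/GITHUB/qianyu-dlut/MVANet/model"
# place Model_80.pth and swin_base_patch4_window12_384_22kto1k.pth in that directory
#+end_src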
67
+
68
+ ** MVANet_inference import
69
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.import.py
70
+ import math
71
+ import numpy as np
72
+ import cv2
73
+ import wget
74
+
75
+ import torch
76
+ import torch.nn as nn
77
+ import torch.nn.functional as F
78
+ import torch.utils.checkpoint as checkpoint
79
+ from torch.autograd import Variable
80
+ from torch import nn
81
+ from torchvision import transforms
82
+
83
+ from einops import rearrange
84
+
85
+ from timm.models import load_checkpoint
86
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
87
+
88
+ torch_device = 'cuda'
89
+ torch_dtype = torch.float16
90
+ #+end_src
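The two settings above pin inference to a CUDA device in half precision. If CUDA may be unavailable, a fallback along these lines could be used instead (a sketch, not part of the original configuration; float16 on CPU is generally impractical):

#+begin_src python
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch_dtype = torch.float16 if torch_device == 'cuda' else torch.float32
#+end_src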
91
+
92
+ ** COMMENT Load image using CV
93
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
94
+ def load_image(input_image_path):
95
+ img = cv2.imread(input_image_path, cv2.IMREAD_COLOR)
96
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
97
+ return img
98
+
99
+
100
+ def load_image_torch(input_image_path):
101
+ img = cv2.imread(input_image_path, cv2.IMREAD_COLOR)
102
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
103
+ img = torch.from_numpy(img)
104
+ img = img.to(dtype=torch.float32)
105
+ img /= 255.0
106
+ img = img.unsqueeze(0)
107
+ return img
108
+
109
+
110
+ def save_mask(output_image_path, mask):
111
+ cv2.imwrite(output_image_path, mask)
112
+
113
+
114
+ def save_mask_torch(output_image_path, mask):
115
+ mask = mask.detach().cpu()
116
+ mask *= 255.0
117
+ mask = mask.clamp(0, 255)
118
+ print(mask.shape)
119
+ mask = mask.squeeze(0)
120
+ mask = mask.to(dtype=torch.uint8)
121
+ print(mask.shape)
122
+ mask = mask.numpy()
123
+ print(mask.shape)
124
+ cv2.imwrite(output_image_path, mask)
125
+ #+end_src
126
+
127
+ ** MVANet_inference function
128
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
129
+ def check_mkdir(dir_name):
130
+ if not os.path.isdir(dir_name):
131
+ os.makedirs(dir_name)
132
+
133
+
134
+ def SwinT(pretrained=True):
135
+ model = SwinTransformer(embed_dim=96,
136
+ depths=[2, 2, 6, 2],
137
+ num_heads=[3, 6, 12, 24],
138
+ window_size=7)
139
+ if pretrained is True:
140
+ model.load_state_dict(torch.load(
141
+ 'data/backbone_ckpt/swin_tiny_patch4_window7_224.pth',
142
+ map_location='cpu')['model'],
143
+ strict=False)
144
+
145
+ return model
146
+
147
+
148
+ def SwinS(pretrained=True):
149
+ model = SwinTransformer(embed_dim=96,
150
+ depths=[2, 2, 18, 2],
151
+ num_heads=[3, 6, 12, 24],
152
+ window_size=7)
153
+ if pretrained is True:
154
+ model.load_state_dict(torch.load(
155
+ 'data/backbone_ckpt/swin_small_patch4_window7_224.pth',
156
+ map_location='cpu')['model'],
157
+ strict=False)
158
+
159
+ return model
160
+
161
+
162
+ def SwinB(pretrained=True):
163
+ model = SwinTransformer(embed_dim=128,
164
+ depths=[2, 2, 18, 2],
165
+ num_heads=[4, 8, 16, 32],
166
+ window_size=12)
167
+ if pretrained is True:
168
+ import os
169
+ model.load_state_dict(torch.load(pretrained_SwinB_model_path,
170
+ map_location='cpu')['model'],
171
+ strict=False)
172
+ return model
173
+
174
+
175
+ def SwinL(pretrained=True):
176
+ model = SwinTransformer(embed_dim=192,
177
+ depths=[2, 2, 18, 2],
178
+ num_heads=[6, 12, 24, 48],
179
+ window_size=12)
180
+ if pretrained is True:
181
+ model.load_state_dict(torch.load(
182
+ 'data/backbone_ckpt/swin_large_patch4_window12_384_22kto1k.pth',
183
+ map_location='cpu')['model'],
184
+ strict=False)
185
+
186
+ return model
187
+
188
+
189
+ def get_activation_fn(activation):
190
+ """Return an activation function given a string"""
191
+ if activation == "relu":
192
+ return F.relu
193
+ if activation == "gelu":
194
+ return F.gelu
195
+ if activation == "glu":
196
+ return F.glu
197
+ raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
198
+
199
+
200
+ def make_cbr(in_dim, out_dim):
201
+ return nn.Sequential(nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1),
202
+ nn.BatchNorm2d(out_dim), nn.PReLU())
203
+
204
+
205
+ def make_cbg(in_dim, out_dim):
206
+ return nn.Sequential(nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1),
207
+ nn.BatchNorm2d(out_dim), nn.GELU())
208
+
209
+
210
+ def rescale_to(x, scale_factor: float = 2, interpolation='nearest'):
211
+ return F.interpolate(x, scale_factor=scale_factor, mode=interpolation)
212
+
213
+
214
+ def resize_as(x, y, interpolation='bilinear'):
215
+ return F.interpolate(x, size=y.shape[-2:], mode=interpolation)
216
+
217
+
218
+ def image2patches(x):
219
+ """b c (hg h) (wg w) -> (hg wg b) c h w"""
220
+ x = rearrange(x, 'b c (hg h) (wg w) -> (hg wg b) c h w', hg=2, wg=2)
221
+ return x
222
+
223
+
224
+ def patches2image(x):
225
+ """(hg wg b) c h w -> b c (hg h) (wg w)"""
226
+ x = rearrange(x, '(hg wg b) c h w -> b c (hg h) (wg w)', hg=2, wg=2)
227
+ return x
228
+
229
+
230
+ def window_partition(x, window_size):
231
+ """
232
+ Args:
233
+ x: (B, H, W, C)
234
+ window_size (int): window size
235
+
236
+ Returns:
237
+ windows: (num_windows*B, window_size, window_size, C)
238
+ """
239
+ B, H, W, C = x.shape
240
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size,
241
+ C)
242
+ windows = x.permute(0, 1, 3, 2, 4,
243
+ 5).contiguous().view(-1, window_size, window_size, C)
244
+ return windows
245
+
246
+
247
+ def window_reverse(windows, window_size, H, W):
248
+ """
249
+ Args:
250
+ windows: (num_windows*B, window_size, window_size, C)
251
+ window_size (int): Window size
252
+ H (int): Height of image
253
+ W (int): Width of image
254
+
255
+ Returns:
256
+ x: (B, H, W, C)
257
+ """
258
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
259
+ x = windows.view(B, H // window_size, W // window_size, window_size,
260
+ window_size, -1)
261
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
262
+ return x
263
+ #+end_src
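As a quick sanity check of the two window helpers above, partitioning and reversing are exact inverses whenever H and W are multiples of the window size (a standalone sketch):

#+begin_src python
x = torch.rand(2, 14, 14, 96)           # B, H, W, C
windows = window_partition(x, 7)        # -> (8, 7, 7, 96): 2x2 windows per image
y = window_reverse(windows, 7, 14, 14)  # -> (2, 14, 14, 96)
assert torch.equal(x, y)
#+end_src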
264
+
265
+ ** MVANet_inference class
266
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.class.py
267
+ class Mlp(nn.Module):
268
+ """ Multilayer perceptron."""
269
+
270
+ def __init__(self,
271
+ in_features,
272
+ hidden_features=None,
273
+ out_features=None,
274
+ act_layer=nn.GELU,
275
+ drop=0.):
276
+ super().__init__()
277
+ out_features = out_features or in_features
278
+ hidden_features = hidden_features or in_features
279
+ self.fc1 = nn.Linear(in_features, hidden_features)
280
+ self.act = act_layer()
281
+ self.fc2 = nn.Linear(hidden_features, out_features)
282
+ self.drop = nn.Dropout(drop)
283
+
284
+ def forward(self, x):
285
+ x = self.fc1(x)
286
+ x = self.act(x)
287
+ x = self.drop(x)
288
+ x = self.fc2(x)
289
+ x = self.drop(x)
290
+ return x
291
+
292
+
293
+ class WindowAttention(nn.Module):
294
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
295
+ It supports both of shifted and non-shifted window.
296
+
297
+ Args:
298
+ dim (int): Number of input channels.
299
+ window_size (tuple[int]): The height and width of the window.
300
+ num_heads (int): Number of attention heads.
301
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
302
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
303
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
304
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
305
+ """
306
+
307
+ def __init__(self,
308
+ dim,
309
+ window_size,
310
+ num_heads,
311
+ qkv_bias=True,
312
+ qk_scale=None,
313
+ attn_drop=0.,
314
+ proj_drop=0.):
315
+
316
+ super().__init__()
317
+ self.dim = dim
318
+ self.window_size = window_size # Wh, Ww
319
+ self.num_heads = num_heads
320
+ head_dim = dim // num_heads
321
+ self.scale = qk_scale or head_dim**-0.5
322
+
323
+ # define a parameter table of relative position bias
324
+ self.relative_position_bias_table = nn.Parameter(
325
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
326
+ num_heads)) # 2*Wh-1 * 2*Ww-1, nH
327
+
328
+ # get pair-wise relative position index for each token inside the window
329
+ coords_h = torch.arange(self.window_size[0])
330
+ coords_w = torch.arange(self.window_size[1])
331
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
332
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
333
+ relative_coords = coords_flatten[:, :,
334
+ None] - coords_flatten[:,
335
+ None, :] # 2, Wh*Ww, Wh*Ww
336
+ relative_coords = relative_coords.permute(
337
+ 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
338
+ relative_coords[:, :,
339
+ 0] += self.window_size[0] - 1 # shift to start from 0
340
+ relative_coords[:, :, 1] += self.window_size[1] - 1
341
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
342
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
343
+ self.register_buffer("relative_position_index",
344
+ relative_position_index)
345
+
346
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
347
+ self.attn_drop = nn.Dropout(attn_drop)
348
+ self.proj = nn.Linear(dim, dim)
349
+ self.proj_drop = nn.Dropout(proj_drop)
350
+
351
+ trunc_normal_(self.relative_position_bias_table, std=.02)
352
+ self.softmax = nn.Softmax(dim=-1)
353
+
354
+ def forward(self, x, mask=None):
355
+ """ Forward function.
356
+
357
+ Args:
358
+ x: input features with shape of (num_windows*B, N, C)
359
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
360
+ """
361
+ x = x.to(dtype=torch_dtype, device=torch_device)
362
+ B_, N, C = x.shape
363
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
364
+ C // self.num_heads).permute(2, 0, 3, 1, 4)
365
+ q, k, v = qkv[0], qkv[1], qkv[
366
+ 2] # make torchscript happy (cannot use tensor as tuple)
367
+
368
+ q = q * self.scale
369
+ attn = (q @ k.transpose(-2, -1))
370
+
371
+ relative_position_bias = self.relative_position_bias_table[
372
+ self.relative_position_index.view(-1)].view(
373
+ self.window_size[0] * self.window_size[1],
374
+ self.window_size[0] * self.window_size[1],
375
+ -1) # Wh*Ww,Wh*Ww,nH
376
+ relative_position_bias = relative_position_bias.permute(
377
+ 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
378
+ attn = attn + relative_position_bias.unsqueeze(0)
379
+
380
+ if mask is not None:
381
+ nW = mask.shape[0]
382
+ attn = attn.view(B_ // nW, nW, self.num_heads, N,
383
+ N) + mask.unsqueeze(1).unsqueeze(0)
384
+ attn = attn.view(-1, self.num_heads, N, N)
385
+ attn = self.softmax(attn)
386
+ else:
387
+ attn = self.softmax(attn)
388
+
389
+ attn = self.attn_drop(attn)
390
+ attn = attn.to(dtype=torch_dtype, device=torch_device)
391
+ v = v.to(dtype=torch_dtype, device=torch_device)
392
+
393
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
394
+ x = self.proj(x)
395
+ x = self.proj_drop(x)
396
+ return x
397
+
398
+
399
+ class SwinTransformerBlock(nn.Module):
400
+ """ Swin Transformer Block.
401
+
402
+ Args:
403
+ dim (int): Number of input channels.
404
+ num_heads (int): Number of attention heads.
405
+ window_size (int): Window size.
406
+ shift_size (int): Shift size for SW-MSA.
407
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
408
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
409
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
410
+ drop (float, optional): Dropout rate. Default: 0.0
411
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
412
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
413
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
414
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
415
+ """
416
+
417
+ def __init__(self,
418
+ dim,
419
+ num_heads,
420
+ window_size=7,
421
+ shift_size=0,
422
+ mlp_ratio=4.,
423
+ qkv_bias=True,
424
+ qk_scale=None,
425
+ drop=0.,
426
+ attn_drop=0.,
427
+ drop_path=0.,
428
+ act_layer=nn.GELU,
429
+ norm_layer=nn.LayerNorm):
430
+ super().__init__()
431
+ self.dim = dim
432
+ self.num_heads = num_heads
433
+ self.window_size = window_size
434
+ self.shift_size = shift_size
435
+ self.mlp_ratio = mlp_ratio
436
+ assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
437
+
438
+ self.norm1 = norm_layer(dim)
439
+ self.attn = WindowAttention(dim,
440
+ window_size=to_2tuple(self.window_size),
441
+ num_heads=num_heads,
442
+ qkv_bias=qkv_bias,
443
+ qk_scale=qk_scale,
444
+ attn_drop=attn_drop,
445
+ proj_drop=drop)
446
+
447
+ self.drop_path = DropPath(
448
+ drop_path) if drop_path > 0. else nn.Identity()
449
+ self.norm2 = norm_layer(dim)
450
+ mlp_hidden_dim = int(dim * mlp_ratio)
451
+ self.mlp = Mlp(in_features=dim,
452
+ hidden_features=mlp_hidden_dim,
453
+ act_layer=act_layer,
454
+ drop=drop)
455
+
456
+ self.H = None
457
+ self.W = None
458
+
459
+ def forward(self, x, mask_matrix):
460
+ """ Forward function.
461
+
462
+ Args:
463
+ x: Input feature, tensor size (B, H*W, C).
464
+ H, W: Spatial resolution of the input feature.
465
+ mask_matrix: Attention mask for cyclic shift.
466
+ """
467
+ B, L, C = x.shape
468
+ H, W = self.H, self.W
469
+ assert L == H * W, "input feature has wrong size"
470
+
471
+ shortcut = x
472
+ x = self.norm1(x)
473
+ x = x.view(B, H, W, C)
474
+
475
+ # pad feature maps to multiples of window size
476
+ pad_l = pad_t = 0
477
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
478
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
479
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
480
+ _, Hp, Wp, _ = x.shape
481
+
482
+ # cyclic shift
483
+ if self.shift_size > 0:
484
+ shifted_x = torch.roll(x,
485
+ shifts=(-self.shift_size, -self.shift_size),
486
+ dims=(1, 2))
487
+ attn_mask = mask_matrix
488
+ else:
489
+ shifted_x = x
490
+ attn_mask = None
491
+
492
+ # partition windows
493
+ x_windows = window_partition(
494
+ shifted_x, self.window_size) # nW*B, window_size, window_size, C
495
+ x_windows = x_windows.view(-1, self.window_size * self.window_size,
496
+ C) # nW*B, window_size*window_size, C
497
+
498
+ # W-MSA/SW-MSA
499
+ attn_windows = self.attn(
500
+ x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
501
+
502
+ # merge windows
503
+ attn_windows = attn_windows.view(-1, self.window_size,
504
+ self.window_size, C)
505
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp,
506
+ Wp) # B H' W' C
507
+
508
+ # reverse cyclic shift
509
+ if self.shift_size > 0:
510
+ x = torch.roll(shifted_x,
511
+ shifts=(self.shift_size, self.shift_size),
512
+ dims=(1, 2))
513
+ else:
514
+ x = shifted_x
515
+
516
+ if pad_r > 0 or pad_b > 0:
517
+ x = x[:, :H, :W, :].contiguous()
518
+
519
+ x = x.view(B, H * W, C)
520
+
521
+ # FFN
522
+ x = shortcut + self.drop_path(x)
523
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
524
+
525
+ return x
526
+
527
+
528
+ class PatchMerging(nn.Module):
529
+ """ Patch Merging Layer
530
+
531
+ Args:
532
+ dim (int): Number of input channels.
533
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
534
+ """
535
+
536
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
537
+ super().__init__()
538
+ self.dim = dim
539
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
540
+ self.norm = norm_layer(4 * dim)
541
+
542
+ def forward(self, x, H, W):
543
+ """ Forward function.
544
+
545
+ Args:
546
+ x: Input feature, tensor size (B, H*W, C).
547
+ H, W: Spatial resolution of the input feature.
548
+ """
549
+ B, L, C = x.shape
550
+ assert L == H * W, "input feature has wrong size"
551
+
552
+ x = x.view(B, H, W, C)
553
+
554
+ # padding
555
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
556
+ if pad_input:
557
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
558
+
559
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
560
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
561
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
562
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
563
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
564
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
565
+
566
+ x = self.norm(x)
567
+ x = self.reduction(x)
568
+
569
+ return x
570
+
571
+
572
+ class BasicLayer(nn.Module):
573
+ """ A basic Swin Transformer layer for one stage.
574
+
575
+ Args:
576
+ dim (int): Number of feature channels
577
+ depth (int): Depths of this stage.
578
+ num_heads (int): Number of attention head.
579
+ window_size (int): Local window size. Default: 7.
580
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
581
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
582
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
583
+ drop (float, optional): Dropout rate. Default: 0.0
584
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
585
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
586
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
587
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
588
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
589
+ """
590
+
591
+ def __init__(self,
592
+ dim,
593
+ depth,
594
+ num_heads,
595
+ window_size=7,
596
+ mlp_ratio=4.,
597
+ qkv_bias=True,
598
+ qk_scale=None,
599
+ drop=0.,
600
+ attn_drop=0.,
601
+ drop_path=0.,
602
+ norm_layer=nn.LayerNorm,
603
+ downsample=None,
604
+ use_checkpoint=False):
605
+ super().__init__()
606
+ self.window_size = window_size
607
+ self.shift_size = window_size // 2
608
+ self.depth = depth
609
+ self.use_checkpoint = use_checkpoint
610
+
611
+ # build blocks
612
+ self.blocks = nn.ModuleList([
613
+ SwinTransformerBlock(dim=dim,
614
+ num_heads=num_heads,
615
+ window_size=window_size,
616
+ shift_size=0 if
617
+ (i % 2 == 0) else window_size // 2,
618
+ mlp_ratio=mlp_ratio,
619
+ qkv_bias=qkv_bias,
620
+ qk_scale=qk_scale,
621
+ drop=drop,
622
+ attn_drop=attn_drop,
623
+ drop_path=drop_path[i] if isinstance(
624
+ drop_path, list) else drop_path,
625
+ norm_layer=norm_layer) for i in range(depth)
626
+ ])
627
+
628
+ # patch merging layer
629
+ if downsample is not None:
630
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
631
+ else:
632
+ self.downsample = None
633
+
634
+ def forward(self, x, H, W):
635
+ """ Forward function.
636
+
637
+ Args:
638
+ x: Input feature, tensor size (B, H*W, C).
639
+ H, W: Spatial resolution of the input feature.
640
+ """
641
+
642
+ # calculate attention mask for SW-MSA
643
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
644
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
645
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
646
+ h_slices = (slice(0, -self.window_size),
647
+ slice(-self.window_size,
648
+ -self.shift_size), slice(-self.shift_size, None))
649
+ w_slices = (slice(0, -self.window_size),
650
+ slice(-self.window_size,
651
+ -self.shift_size), slice(-self.shift_size, None))
652
+ cnt = 0
653
+ for h in h_slices:
654
+ for w in w_slices:
655
+ img_mask[:, h, w, :] = cnt
656
+ cnt += 1
657
+
658
+ mask_windows = window_partition(
659
+ img_mask, self.window_size) # nW, window_size, window_size, 1
660
+ mask_windows = mask_windows.view(-1,
661
+ self.window_size * self.window_size)
662
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
663
+ attn_mask = attn_mask.masked_fill(attn_mask != 0,
664
+ float(-100.0)).masked_fill(
665
+ attn_mask == 0, float(0.0))
666
+
667
+ for blk in self.blocks:
668
+ blk.H, blk.W = H, W
669
+ if self.use_checkpoint:
670
+ x = checkpoint.checkpoint(blk, x, attn_mask)
671
+ else:
672
+ x = blk(x, attn_mask)
673
+ if self.downsample is not None:
674
+ x_down = self.downsample(x, H, W)
675
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
676
+ return x, H, W, x_down, Wh, Ww
677
+ else:
678
+ return x, H, W, x, H, W
679
+
680
+
681
+ class PatchEmbed(nn.Module):
682
+ """ Image to Patch Embedding
683
+
684
+ Args:
685
+ patch_size (int): Patch token size. Default: 4.
686
+ in_chans (int): Number of input image channels. Default: 3.
687
+ embed_dim (int): Number of linear projection output channels. Default: 96.
688
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
689
+ """
690
+
691
+ def __init__(self,
692
+ patch_size=4,
693
+ in_chans=3,
694
+ embed_dim=96,
695
+ norm_layer=None):
696
+ super().__init__()
697
+ patch_size = to_2tuple(patch_size)
698
+ self.patch_size = patch_size
699
+
700
+ self.in_chans = in_chans
701
+ self.embed_dim = embed_dim
702
+
703
+ self.proj = nn.Conv2d(in_chans,
704
+ embed_dim,
705
+ kernel_size=patch_size,
706
+ stride=patch_size)
707
+ if norm_layer is not None:
708
+ self.norm = norm_layer(embed_dim)
709
+ else:
710
+ self.norm = None
711
+
712
+ def forward(self, x):
713
+ """Forward function."""
714
+ # padding
715
+ _, _, H, W = x.size()
716
+ if W % self.patch_size[1] != 0:
717
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
718
+ if H % self.patch_size[0] != 0:
719
+ x = F.pad(x,
720
+ (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
721
+
722
+ x = self.proj(x) # B C Wh Ww
723
+ if self.norm is not None:
724
+ Wh, Ww = x.size(2), x.size(3)
725
+ x = x.flatten(2).transpose(1, 2)
726
+ x = self.norm(x)
727
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
728
+
729
+ return x
730
+
731
+
732
+ class SwinTransformer(nn.Module):
733
+ """ Swin Transformer backbone.
734
+ A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
735
+ https://arxiv.org/pdf/2103.14030
736
+
737
+ Args:
738
+ pretrain_img_size (int): Input image size for training the pretrained model,
739
+ used in absolute postion embedding. Default 224.
740
+ patch_size (int | tuple(int)): Patch size. Default: 4.
741
+ in_chans (int): Number of input image channels. Default: 3.
742
+ embed_dim (int): Number of linear projection output channels. Default: 96.
743
+ depths (tuple[int]): Depths of each Swin Transformer stage.
744
+ num_heads (tuple[int]): Number of attention head of each stage.
745
+ window_size (int): Window size. Default: 7.
746
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
747
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
748
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
749
+ drop_rate (float): Dropout rate.
750
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
751
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
752
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
753
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
754
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
755
+ out_indices (Sequence[int]): Output from which stages.
756
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
757
+ -1 means not freezing any parameters.
758
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
759
+ """
760
+
761
+ def __init__(self,
762
+ pretrain_img_size=224,
763
+ patch_size=4,
764
+ in_chans=3,
765
+ embed_dim=96,
766
+ depths=[2, 2, 6, 2],
767
+ num_heads=[3, 6, 12, 24],
768
+ window_size=7,
769
+ mlp_ratio=4.,
770
+ qkv_bias=True,
771
+ qk_scale=None,
772
+ drop_rate=0.,
773
+ attn_drop_rate=0.,
774
+ drop_path_rate=0.2,
775
+ norm_layer=nn.LayerNorm,
776
+ ape=False,
777
+ patch_norm=True,
778
+ out_indices=(0, 1, 2, 3),
779
+ frozen_stages=-1,
780
+ use_checkpoint=False):
781
+ super().__init__()
782
+
783
+ self.pretrain_img_size = pretrain_img_size
784
+ self.num_layers = len(depths)
785
+ self.embed_dim = embed_dim
786
+ self.ape = ape
787
+ self.patch_norm = patch_norm
788
+ self.out_indices = out_indices
789
+ self.frozen_stages = frozen_stages
790
+
791
+ # split image into non-overlapping patches
792
+ self.patch_embed = PatchEmbed(
793
+ patch_size=patch_size,
794
+ in_chans=in_chans,
795
+ embed_dim=embed_dim,
796
+ norm_layer=norm_layer if self.patch_norm else None)
797
+
798
+ # absolute position embedding
799
+ if self.ape:
800
+ pretrain_img_size = to_2tuple(pretrain_img_size)
801
+ patch_size = to_2tuple(patch_size)
802
+ patches_resolution = [
803
+ pretrain_img_size[0] // patch_size[0],
804
+ pretrain_img_size[1] // patch_size[1]
805
+ ]
806
+
807
+ self.absolute_pos_embed = nn.Parameter(
808
+ torch.zeros(1, embed_dim, patches_resolution[0],
809
+ patches_resolution[1]))
810
+ trunc_normal_(self.absolute_pos_embed, std=.02)
811
+
812
+ self.pos_drop = nn.Dropout(p=drop_rate)
813
+
814
+ # stochastic depth
815
+ dpr = [
816
+ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
817
+ ] # stochastic depth decay rule
818
+
819
+ # build layers
820
+ self.layers = nn.ModuleList()
821
+ for i_layer in range(self.num_layers):
822
+ layer = BasicLayer(
823
+ dim=int(embed_dim * 2**i_layer),
824
+ depth=depths[i_layer],
825
+ num_heads=num_heads[i_layer],
826
+ window_size=window_size,
827
+ mlp_ratio=mlp_ratio,
828
+ qkv_bias=qkv_bias,
829
+ qk_scale=qk_scale,
830
+ drop=drop_rate,
831
+ attn_drop=attn_drop_rate,
832
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
833
+ norm_layer=norm_layer,
834
+ downsample=PatchMerging if
835
+ (i_layer < self.num_layers - 1) else None,
836
+ use_checkpoint=use_checkpoint)
837
+ self.layers.append(layer)
838
+
839
+ num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
840
+ self.num_features = num_features
841
+
842
+ # add a norm layer for each output
843
+ for i_layer in out_indices:
844
+ layer = norm_layer(num_features[i_layer])
845
+ layer_name = f'norm{i_layer}'
846
+ self.add_module(layer_name, layer)
847
+
848
+ self._freeze_stages()
849
+
850
+ def _freeze_stages(self):
851
+ if self.frozen_stages >= 0:
852
+ self.patch_embed.eval()
853
+ for param in self.patch_embed.parameters():
854
+ param.requires_grad = False
855
+
856
+ if self.frozen_stages >= 1 and self.ape:
857
+ self.absolute_pos_embed.requires_grad = False
858
+
859
+ if self.frozen_stages >= 2:
860
+ self.pos_drop.eval()
861
+ for i in range(0, self.frozen_stages - 1):
862
+ m = self.layers[i]
863
+ m.eval()
864
+ for param in m.parameters():
865
+ param.requires_grad = False
866
+
867
+ def init_weights(self, pretrained=None):
868
+ """Initialize the weights in backbone.
869
+
870
+ Args:
871
+ pretrained (str, optional): Path to pre-trained weights.
872
+ Defaults to None.
873
+ """
874
+
875
+ def _init_weights(m):
876
+ if isinstance(m, nn.Linear):
877
+ trunc_normal_(m.weight, std=.02)
878
+ if isinstance(m, nn.Linear) and m.bias is not None:
879
+ nn.init.constant_(m.bias, 0)
880
+ elif isinstance(m, nn.LayerNorm):
881
+ nn.init.constant_(m.bias, 0)
882
+ nn.init.constant_(m.weight, 1.0)
883
+
884
+ if isinstance(pretrained, str):
885
+ self.apply(_init_weights)
886
+ load_checkpoint(self, pretrained, strict=False, logger=None)
887
+ elif pretrained is None:
888
+ self.apply(_init_weights)
889
+ else:
890
+ raise TypeError('pretrained must be a str or None')
891
+
892
+ def forward(self, x):
893
+ x = self.patch_embed(x)
894
+
895
+ Wh, Ww = x.size(2), x.size(3)
896
+ if self.ape:
897
+ # interpolate the position embedding to the corresponding size
898
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed,
899
+ size=(Wh, Ww),
900
+ mode='bicubic')
901
+ x = (x + absolute_pos_embed) # B Wh*Ww C
902
+
903
+ outs = [x.contiguous()]
904
+ x = x.flatten(2).transpose(1, 2)
905
+ x = self.pos_drop(x)
906
+ for i in range(self.num_layers):
907
+ layer = self.layers[i]
908
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
909
+
910
+ if i in self.out_indices:
911
+ norm_layer = getattr(self, f'norm{i}')
912
+ x_out = norm_layer(x_out)
913
+
914
+ out = x_out.view(-1, H, W,
915
+ self.num_features[i]).permute(0, 3, 1,
916
+ 2).contiguous()
917
+ outs.append(out)
918
+
919
+ return tuple(outs)
920
+
921
+ def train(self, mode=True):
922
+ """Convert the model into training mode while keeping frozen stages frozen."""
923
+ super(SwinTransformer, self).train(mode)
924
+ self._freeze_stages()
925
+
926
+
927
+ class PositionEmbeddingSine:
928
+
929
+ def __init__(self,
930
+ num_pos_feats=64,
931
+ temperature=10000,
932
+ normalize=False,
933
+ scale=None):
934
+ super().__init__()
935
+ self.num_pos_feats = num_pos_feats
936
+ self.temperature = temperature
937
+ self.normalize = normalize
938
+ if scale is not None and normalize is False:
939
+ raise ValueError("normalize should be True if scale is passed")
940
+ if scale is None:
941
+ scale = 2 * math.pi
942
+ self.scale = scale
943
+ self.dim_t = torch.arange(0,
944
+ self.num_pos_feats,
945
+ dtype=torch_dtype,
946
+ device=torch_device)
947
+
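+ # Added commentary (not in the original MVANet source): this is the
+ # DETR-style sine embedding. For a normalized position p and channel
+ # pair index i,
+ #     PE(p, 2i)   = sin(p / temperature^(2i / num_pos_feats))
+ #     PE(p, 2i+1) = cos(p / temperature^(2i / num_pos_feats))
+ # so low channels oscillate quickly with position and high channels slowly.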
948
+ def __call__(self, b, h, w):
949
+ mask = torch.zeros([b, h, w], dtype=torch.bool, device=torch_device)
950
+ assert mask is not None
951
+ not_mask = ~mask
952
+ y_embed = not_mask.cumsum(dim=1, dtype=torch_dtype)
953
+ x_embed = not_mask.cumsum(dim=2, dtype=torch_dtype)
954
+ if self.normalize:
955
+ eps = 1e-6
956
+ y_embed = ((y_embed - 0.5) / (y_embed[:, -1:, :] + eps) *
957
+ self.scale).to(device=torch_device, dtype=torch_dtype)
958
+ x_embed = ((x_embed - 0.5) / (x_embed[:, :, -1:] + eps) *
959
+ self.scale).to(device=torch_device, dtype=torch_dtype)
960
+
961
+ dim_t = self.temperature**(2 * (self.dim_t // 2) / self.num_pos_feats)
962
+
963
+ pos_x = x_embed[:, :, :, None] / dim_t
964
+ pos_y = y_embed[:, :, :, None] / dim_t
965
+ pos_x = torch.stack(
966
+ (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
967
+ dim=4).flatten(3)
968
+ pos_y = torch.stack(
969
+ (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
970
+ dim=4).flatten(3)
971
+ return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
972
+
973
+
974
+ class MCLM(nn.Module):
975
+
976
+ def __init__(self, d_model, num_heads, pool_ratios=[1, 4, 8]):
977
+ super(MCLM, self).__init__()
978
+ self.attention = nn.ModuleList([
979
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
980
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
981
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
982
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
983
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
984
+ ])
985
+
986
+ self.linear1 = nn.Linear(d_model, d_model * 2)
987
+ self.linear2 = nn.Linear(d_model * 2, d_model)
988
+ self.linear3 = nn.Linear(d_model, d_model * 2)
989
+ self.linear4 = nn.Linear(d_model * 2, d_model)
990
+ self.norm1 = nn.LayerNorm(d_model)
991
+ self.norm2 = nn.LayerNorm(d_model)
992
+ self.dropout = nn.Dropout(0.1)
993
+ self.dropout1 = nn.Dropout(0.1)
994
+ self.dropout2 = nn.Dropout(0.1)
995
+ self.activation = get_activation_fn('relu')
996
+ self.pool_ratios = pool_ratios
997
+ self.p_poses = []
998
+ self.g_pos = None
999
+ self.positional_encoding = PositionEmbeddingSine(
1000
+ num_pos_feats=d_model // 2, normalize=True)
1001
+
1002
+ def forward(self, l, g):
1003
+ """
1004
+ l: 4,c,h,w
1005
+ g: 1,c,h,w
1006
+ """
1007
+ b, c, h, w = l.size()
1008
+ # 4,c,h,w -> 1,c,2h,2w
1009
+ concated_locs = rearrange(l,
1010
+ '(hg wg b) c h w -> b c (hg h) (wg w)',
1011
+ hg=2,
1012
+ wg=2)
1013
+
1014
+ pools = []
1015
+ for pool_ratio in self.pool_ratios:
1016
+ # b,c,h,w
1017
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1018
+ pool = F.adaptive_avg_pool2d(concated_locs, tgt_hw)
1019
+ pools.append(rearrange(pool, 'b c h w -> (h w) b c'))
1020
+ if self.g_pos is None:
1021
+ pos_emb = self.positional_encoding(pool.shape[0],
1022
+ pool.shape[2],
1023
+ pool.shape[3])
1024
+ pos_emb = rearrange(pos_emb, 'b c h w -> (h w) b c')
1025
+ self.p_poses.append(pos_emb)
1026
+ pools = torch.cat(pools, 0)
1027
+ if self.g_pos is None:
1028
+ self.p_poses = torch.cat(self.p_poses, dim=0)
1029
+ pos_emb = self.positional_encoding(g.shape[0], g.shape[2],
1030
+ g.shape[3])
1031
+ self.g_pos = rearrange(pos_emb, 'b c h w -> (h w) b c')
1032
+
1033
+ # attention between the global feature (q) & the multi-scale pooled local patches (k, v)
1034
+ g_hw_b_c = rearrange(g, 'b c h w -> (h w) b c')
1035
+ g_hw_b_c = g_hw_b_c + self.dropout1(self.attention[0](
1036
+ g_hw_b_c + self.g_pos, pools + self.p_poses, pools)[0])
1037
+ g_hw_b_c = self.norm1(g_hw_b_c)
1038
+ g_hw_b_c = g_hw_b_c + self.dropout2(
1039
+ self.linear2(
1040
+ self.dropout(self.activation(self.linear1(g_hw_b_c)).clone())))
1041
+ g_hw_b_c = self.norm2(g_hw_b_c)
1042
+
1043
+ # attention between the original local patches (q) & the refreshed global feature (k, v)
1044
+ l_hw_b_c = rearrange(l, "b c h w -> (h w) b c")
1045
+ _g_hw_b_c = rearrange(g_hw_b_c, '(h w) b c -> h w b c', h=h, w=w)
1046
+ _g_hw_b_c = rearrange(_g_hw_b_c,
1047
+ "(ng h) (nw w) b c -> (h w) (ng nw b) c",
1048
+ ng=2,
1049
+ nw=2)
1050
+ outputs_re = []
1051
+ for i, (_l, _g) in enumerate(
1052
+ zip(l_hw_b_c.chunk(4, dim=1), _g_hw_b_c.chunk(4, dim=1))):
1053
+ outputs_re.append(self.attention[i + 1](_l, _g,
1054
+ _g)[0]) # (h w) 1 c
1055
+ outputs_re = torch.cat(outputs_re, 1) # (h w) 4 c
1056
+
1057
+ l_hw_b_c = l_hw_b_c + self.dropout1(outputs_re)
1058
+ l_hw_b_c = self.norm1(l_hw_b_c)
1059
+ l_hw_b_c = l_hw_b_c + self.dropout2(
1060
+ self.linear4(
1061
+ self.dropout(self.activation(self.linear3(l_hw_b_c)).clone())))
1062
+ l_hw_b_c = self.norm2(l_hw_b_c)
1063
+
1064
+ l = torch.cat((l_hw_b_c, g_hw_b_c), 1) # hw,b(5),c
1065
+ return rearrange(l, "(h w) b c -> b c h w", h=h, w=w) ## (5,c,h*w)
1066
+
1067
+
1068
+ class inf_MCLM(nn.Module):
1069
+
1070
+ def __init__(self, d_model, num_heads, pool_ratios=[1, 4, 8]):
1071
+ super(inf_MCLM, self).__init__()
1072
+ self.attention = nn.ModuleList([
1073
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1074
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1075
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1076
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1077
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1078
+ ])
1079
+
1080
+ self.linear1 = nn.Linear(d_model, d_model * 2)
1081
+ self.linear2 = nn.Linear(d_model * 2, d_model)
1082
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1083
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1084
+ self.norm1 = nn.LayerNorm(d_model)
1085
+ self.norm2 = nn.LayerNorm(d_model)
1086
+ self.dropout = nn.Dropout(0.1)
1087
+ self.dropout1 = nn.Dropout(0.1)
1088
+ self.dropout2 = nn.Dropout(0.1)
1089
+ self.activation = get_activation_fn('relu')
1090
+ self.pool_ratios = pool_ratios
1091
+ self.p_poses = []
1092
+ self.g_pos = None
1093
+ self.positional_encoding = PositionEmbeddingSine(
1094
+ num_pos_feats=d_model // 2, normalize=True)
1095
+
1096
+ def forward(self, l, g):
1097
+ """
1098
+ l: 4,c,h,w
1099
+ g: 1,c,h,w
1100
+ """
1101
+ b, c, h, w = l.size()
1102
+ # 4,c,h,w -> 1,c,2h,2w
1103
+ concated_locs = rearrange(l,
1104
+ '(hg wg b) c h w -> b c (hg h) (wg w)',
1105
+ hg=2,
1106
+ wg=2)
1107
+ self.p_poses = []
1108
+ pools = []
1109
+ for pool_ratio in self.pool_ratios:
1110
+ # b,c,h,w
1111
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1112
+ pool = F.adaptive_avg_pool2d(concated_locs, tgt_hw)
1113
+ pools.append(rearrange(pool, 'b c h w -> (h w) b c'))
1114
+ # if self.g_pos is None:
1115
+ pos_emb = self.positional_encoding(pool.shape[0], pool.shape[2],
1116
+ pool.shape[3])
1117
+ pos_emb = rearrange(pos_emb, 'b c h w -> (h w) b c')
1118
+ self.p_poses.append(pos_emb)
1119
+ pools = torch.cat(pools, 0)
1120
+ # if self.g_pos is None:
1121
+ self.p_poses = torch.cat(self.p_poses, dim=0)
1122
+ pos_emb = self.positional_encoding(g.shape[0], g.shape[2], g.shape[3])
1123
+ self.g_pos = rearrange(pos_emb, 'b c h w -> (h w) b c')
1124
+
1125
+ # attention between the global feature (q) & the multi-scale pooled local patches (k, v)
1126
+ g_hw_b_c = rearrange(g, 'b c h w -> (h w) b c')
1127
+ g_hw_b_c = g_hw_b_c + self.dropout1(self.attention[0](
1128
+ g_hw_b_c + self.g_pos, pools + self.p_poses, pools)[0])
1129
+ g_hw_b_c = self.norm1(g_hw_b_c)
1130
+ g_hw_b_c = g_hw_b_c + self.dropout2(
1131
+ self.linear2(
1132
+ self.dropout(self.activation(self.linear1(g_hw_b_c)).clone())))
1133
+ g_hw_b_c = self.norm2(g_hw_b_c)
1134
+
1135
+ # attention between the original local patches (q) & the refreshed global feature (k, v)
1136
+ l_hw_b_c = rearrange(l, "b c h w -> (h w) b c")
1137
+ _g_hw_b_c = rearrange(g_hw_b_c, '(h w) b c -> h w b c', h=h, w=w)
1138
+ _g_hw_b_c = rearrange(_g_hw_b_c,
1139
+ "(ng h) (nw w) b c -> (h w) (ng nw b) c",
1140
+ ng=2,
1141
+ nw=2)
1142
+ outputs_re = []
1143
+ for i, (_l, _g) in enumerate(
1144
+ zip(l_hw_b_c.chunk(4, dim=1), _g_hw_b_c.chunk(4, dim=1))):
1145
+ outputs_re.append(self.attention[i + 1](_l, _g,
1146
+ _g)[0]) # (h w) 1 c
1147
+ outputs_re = torch.cat(outputs_re, 1) # (h w) 4 c
1148
+
1149
+ l_hw_b_c = l_hw_b_c + self.dropout1(outputs_re)
1150
+ l_hw_b_c = self.norm1(l_hw_b_c)
1151
+ l_hw_b_c = l_hw_b_c + self.dropout2(
1152
+ self.linear4(
1153
+ self.dropout(self.activation(self.linear3(l_hw_b_c)).clone())))
1154
+ l_hw_b_c = self.norm2(l_hw_b_c)
1155
+
1156
+ l = torch.cat((l_hw_b_c, g_hw_b_c), 1) # hw,b(5),c
1157
+ return rearrange(l, "(h w) b c -> b c h w", h=h, w=w) ## (5,c,h*w)
1158
+
1159
+
1160
+ class MCRM(nn.Module):
1161
+
1162
+ def __init__(self, d_model, num_heads, pool_ratios=[4, 8, 16], h=None):
1163
+ super(MCRM, self).__init__()
1164
+ self.attention = nn.ModuleList([
1165
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1166
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1167
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1168
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1169
+ ])
1170
+
1171
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1172
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1173
+ self.norm1 = nn.LayerNorm(d_model)
1174
+ self.norm2 = nn.LayerNorm(d_model)
1175
+ self.dropout = nn.Dropout(0.1)
1176
+ self.dropout1 = nn.Dropout(0.1)
1177
+ self.dropout2 = nn.Dropout(0.1)
1178
+ self.sigmoid = nn.Sigmoid()
1179
+ self.activation = get_activation_fn('relu')
1180
+ self.sal_conv = nn.Conv2d(d_model, 1, 1)
1181
+ self.pool_ratios = pool_ratios
1182
+ self.positional_encoding = PositionEmbeddingSine(
1183
+ num_pos_feats=d_model // 2, normalize=True)
1184
+
1185
+ def forward(self, x):
1186
+ b, c, h, w = x.size()
1187
+ loc, glb = x.split([4, 1], dim=0) # 4,c,h,w; 1,c,h,w
1188
+ # b(4),c,h,w
1189
+ patched_glb = rearrange(glb,
1190
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1191
+ hg=2,
1192
+ wg=2)
1193
+
1194
+ # generate token attention map
1195
+ token_attention_map = self.sigmoid(self.sal_conv(glb))
1196
+ token_attention_map = F.interpolate(token_attention_map,
1197
+ size=patches2image(loc).shape[-2:],
1198
+ mode='nearest')
1199
+ loc = loc * rearrange(token_attention_map,
1200
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1201
+ hg=2,
1202
+ wg=2)
1203
+ pools = []
1204
+ for pool_ratio in self.pool_ratios:
1205
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1206
+ pool = F.adaptive_avg_pool2d(patched_glb, tgt_hw)
1207
+ pools.append(rearrange(pool,
1208
+ 'nl c h w -> nl c (h w)')) # nl(4),c,hw
1209
+ # nl(4),c,nphw -> nl(4),nphw,1,c
1210
+ pools = rearrange(torch.cat(pools, 2), "nl c nphw -> nl nphw 1 c")
1211
+ loc_ = rearrange(loc, 'nl c h w -> nl (h w) 1 c')
1212
+ outputs = []
1213
+ for i, q in enumerate(
1214
+ loc_.unbind(dim=0)): # traverse all local patches
1215
+ # np*hw,1,c
1216
+ v = pools[i]
1217
+ k = v
1218
+ outputs.append(self.attention[i](q, k, v)[0])
1219
+ outputs = torch.cat(outputs, 1)
1220
+ src = loc.view(4, c, -1).permute(2, 0, 1) + self.dropout1(outputs)
1221
+ src = self.norm1(src)
1222
+ src = src + self.dropout2(
1223
+ self.linear4(
1224
+ self.dropout(self.activation(self.linear3(src)).clone())))
1225
+ src = self.norm2(src)
1226
+
1227
+ src = src.permute(1, 2, 0).reshape(4, c, h, w) # refreshed local patches
1228
+ glb = glb + F.interpolate(patches2image(src),
1229
+ size=glb.shape[-2:],
1230
+ mode='nearest') # refreshed global feature
1231
+ return torch.cat((src, glb), 0), token_attention_map
1232
+
1233
+
1234
+ class inf_MCRM(nn.Module):
1235
+
1236
+ def __init__(self, d_model, num_heads, pool_ratios=[4, 8, 16], h=None):
1237
+ super(inf_MCRM, self).__init__()
1238
+ self.attention = nn.ModuleList([
1239
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1240
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1241
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1242
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1243
+ ])
1244
+
1245
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1246
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1247
+ self.norm1 = nn.LayerNorm(d_model)
1248
+ self.norm2 = nn.LayerNorm(d_model)
1249
+ self.dropout = nn.Dropout(0.1)
1250
+ self.dropout1 = nn.Dropout(0.1)
1251
+ self.dropout2 = nn.Dropout(0.1)
1252
+ self.sigmoid = nn.Sigmoid()
1253
+ self.activation = get_activation_fn('relu')
1254
+ self.sal_conv = nn.Conv2d(d_model, 1, 1)
1255
+ self.pool_ratios = pool_ratios
1256
+ self.positional_encoding = PositionEmbeddingSine(
1257
+ num_pos_feats=d_model // 2, normalize=True)
1258
+
1259
+ def forward(self, x):
1260
+ b, c, h, w = x.size()
1261
+ loc, glb = x.split([4, 1], dim=0) # 4,c,h,w; 1,c,h,w
1262
+ # b(4),c,h,w
1263
+ patched_glb = rearrange(glb,
1264
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1265
+ hg=2,
1266
+ wg=2)
1267
+
1268
+ # generate token attention map
1269
+ token_attention_map = self.sigmoid(self.sal_conv(glb))
1270
+ token_attention_map = F.interpolate(token_attention_map,
1271
+ size=patches2image(loc).shape[-2:],
1272
+ mode='nearest')
1273
+ loc = loc * rearrange(token_attention_map,
1274
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1275
+ hg=2,
1276
+ wg=2)
1277
+ pools = []
1278
+ for pool_ratio in self.pool_ratios:
1279
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1280
+ pool = F.adaptive_avg_pool2d(patched_glb, tgt_hw)
1281
+ pools.append(rearrange(pool,
1282
+ 'nl c h w -> nl c (h w)')) # nl(4),c,hw
1283
+ # nl(4),c,nphw -> nl(4),nphw,1,c
1284
+ pools = rearrange(torch.cat(pools, 2), "nl c nphw -> nl nphw 1 c")
1285
+ loc_ = rearrange(loc, 'nl c h w -> nl (h w) 1 c')
1286
+ outputs = []
1287
+ for i, q in enumerate(
1288
+ loc_.unbind(dim=0)): # traverse all local patches
1289
+ # np*hw,1,c
1290
+ v = pools[i]
1291
+ k = v
1292
+ outputs.append(self.attention[i](q, k, v)[0])
1293
+ outputs = torch.cat(outputs, 1)
1294
+ src = loc.view(4, c, -1).permute(2, 0, 1) + self.dropout1(outputs)
1295
+ src = self.norm1(src)
1296
+ src = src + self.dropout2(
1297
+ self.linear4(
1298
+ self.dropout(self.activation(self.linear3(src)).clone())))
1299
+ src = self.norm2(src)
1300
+
1301
+ src = src.permute(1, 2, 0).reshape(4, c, h, w) # refreshed local patches
1302
+ glb = glb + F.interpolate(patches2image(src),
1303
+ size=glb.shape[-2:],
1304
+ mode='nearest') # refreshed global feature
1305
+ return torch.cat((src, glb), 0)
1306
+
1307
+
1308
+ # model for single-scale training
1309
+ class MVANet(nn.Module):
1310
+
1311
+ def __init__(self):
1312
+ super().__init__()
1313
+ self.backbone = SwinB(pretrained=True)
1314
+ emb_dim = 128
1315
+ self.sideout5 = nn.Sequential(
1316
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1317
+ self.sideout4 = nn.Sequential(
1318
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1319
+ self.sideout3 = nn.Sequential(
1320
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1321
+ self.sideout2 = nn.Sequential(
1322
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1323
+ self.sideout1 = nn.Sequential(
1324
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1325
+
1326
+ self.output5 = make_cbr(1024, emb_dim)
1327
+ self.output4 = make_cbr(512, emb_dim)
1328
+ self.output3 = make_cbr(256, emb_dim)
1329
+ self.output2 = make_cbr(128, emb_dim)
1330
+ self.output1 = make_cbr(128, emb_dim)
1331
+
1332
+ self.multifieldcrossatt = MCLM(emb_dim, 1, [1, 4, 8])
1333
+ self.conv1 = make_cbr(emb_dim, emb_dim)
1334
+ self.conv2 = make_cbr(emb_dim, emb_dim)
1335
+ self.conv3 = make_cbr(emb_dim, emb_dim)
1336
+ self.conv4 = make_cbr(emb_dim, emb_dim)
1337
+ self.dec_blk1 = MCRM(emb_dim, 1, [2, 4, 8])
1338
+ self.dec_blk2 = MCRM(emb_dim, 1, [2, 4, 8])
1339
+ self.dec_blk3 = MCRM(emb_dim, 1, [2, 4, 8])
1340
+ self.dec_blk4 = MCRM(emb_dim, 1, [2, 4, 8])
1341
+
1342
+ self.insmask_head = nn.Sequential(
1343
+ nn.Conv2d(emb_dim, 384, kernel_size=3, padding=1),
1344
+ nn.BatchNorm2d(384), nn.PReLU(),
1345
+ nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.BatchNorm2d(384),
1346
+ nn.PReLU(), nn.Conv2d(384, emb_dim, kernel_size=3, padding=1))
1347
+
1348
+ self.shallow = nn.Sequential(
1349
+ nn.Conv2d(3, emb_dim, kernel_size=3, padding=1))
1350
+ self.upsample1 = make_cbg(emb_dim, emb_dim)
1351
+ self.upsample2 = make_cbg(emb_dim, emb_dim)
1352
+ self.output = nn.Sequential(
1353
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1354
+
1355
+ for m in self.modules():
1356
+ if isinstance(m, nn.ReLU) or isinstance(m, nn.Dropout):
1357
+ m.inplace = True
1358
+
1359
+ def forward(self, x):
1360
+ x = x.to(dtype=torch_dtype, device=torch_device)
1361
+ shallow = self.shallow(x)
1362
+ glb = rescale_to(x, scale_factor=0.5, interpolation='bilinear')
1363
+ loc = image2patches(x)
1364
+ input = torch.cat((loc, glb), dim=0)
1365
+ feature = self.backbone(input)
1366
+ e5 = self.output5(feature[4]) # (5,128,16,16)
1367
+ e4 = self.output4(feature[3]) # (5,128,32,32)
1368
+ e3 = self.output3(feature[2]) # (5,128,64,64)
1369
+ e2 = self.output2(feature[1]) # (5,128,128,128)
1370
+ e1 = self.output1(feature[0]) # (5,128,128,128)
1371
+ loc_e5, glb_e5 = e5.split([4, 1], dim=0)
1372
+ e5 = self.multifieldcrossatt(loc_e5, glb_e5) # (4,128,16,16)
1373
+
1374
+ e4, tokenattmap4 = self.dec_blk4(e4 + resize_as(e5, e4))
1375
+ e4 = self.conv4(e4)
1376
+ e3, tokenattmap3 = self.dec_blk3(e3 + resize_as(e4, e3))
1377
+ e3 = self.conv3(e3)
1378
+ e2, tokenattmap2 = self.dec_blk2(e2 + resize_as(e3, e2))
1379
+ e2 = self.conv2(e2)
1380
+ e1, tokenattmap1 = self.dec_blk1(e1 + resize_as(e2, e1))
1381
+ e1 = self.conv1(e1)
1382
+ loc_e1, glb_e1 = e1.split([4, 1], dim=0)
1383
+ output1_cat = patches2image(loc_e1) # (1,128,256,256)
1384
+ # add glb feat in
1385
+ output1_cat = output1_cat + resize_as(glb_e1, output1_cat)
1386
+ # merge
1387
+ final_output = self.insmask_head(output1_cat) # (1,128,256,256)
1388
+ # shallow feature merge
1389
+ final_output = final_output + resize_as(shallow, final_output)
1390
+ final_output = self.upsample1(rescale_to(final_output))
1391
+ final_output = rescale_to(final_output +
1392
+ resize_as(shallow, final_output))
1393
+ final_output = self.upsample2(final_output)
1394
+ final_output = self.output(final_output)
1395
+ ####
1396
+ sideout5 = self.sideout5(e5).to(dtype=torch_dtype, device=torch_device)
1397
+ sideout4 = self.sideout4(e4)
1398
+ sideout3 = self.sideout3(e3)
1399
+ sideout2 = self.sideout2(e2)
1400
+ sideout1 = self.sideout1(e1)
1401
+ ####### global sideouts #######
1402
+ glb5 = self.sideout5(glb_e5)
1403
+ glb4 = sideout4[-1, :, :, :].unsqueeze(0)
1404
+ glb3 = sideout3[-1, :, :, :].unsqueeze(0)
1405
+ glb2 = sideout2[-1, :, :, :].unsqueeze(0)
1406
+ glb1 = sideout1[-1, :, :, :].unsqueeze(0)
1407
+ ####### concat 4 to 1 #######
1408
+ sideout1 = patches2image(sideout1[:-1]).to(dtype=torch_dtype,
1409
+ device=torch_device)
1410
+ sideout2 = patches2image(sideout2[:-1]).to(
1411
+ dtype=torch_dtype,
1412
+ device=torch_device) ####(5,c,h,w) -> (1 c 2h,2w)
1413
+ sideout3 = patches2image(sideout3[:-1]).to(dtype=torch_dtype,
1414
+ device=torch_device)
1415
+ sideout4 = patches2image(sideout4[:-1]).to(dtype=torch_dtype,
1416
+ device=torch_device)
1417
+ sideout5 = patches2image(sideout5[:-1]).to(dtype=torch_dtype,
1418
+ device=torch_device)
1419
+ if self.training:
1420
+ return sideout5, sideout4, sideout3, sideout2, sideout1, final_output, glb5, glb4, glb3, glb2, glb1, tokenattmap4, tokenattmap3, tokenattmap2, tokenattmap1
1421
+ else:
1422
+ return final_output
1423
+
1424
+
1425
+ # model for multi-scale testing
1426
+ class inf_MVANet(nn.Module):
1427
+
1428
+ def __init__(self):
1429
+ super().__init__()
1430
+ # self.backbone = SwinB(pretrained=True)
1431
+ self.backbone = SwinB(pretrained=False)
1432
+
1433
+ emb_dim = 128
1434
+ self.output5 = make_cbr(1024, emb_dim)
1435
+ self.output4 = make_cbr(512, emb_dim)
1436
+ self.output3 = make_cbr(256, emb_dim)
1437
+ self.output2 = make_cbr(128, emb_dim)
1438
+ self.output1 = make_cbr(128, emb_dim)
1439
+
1440
+ self.multifieldcrossatt = inf_MCLM(emb_dim, 1, [1, 4, 8])
1441
+ self.conv1 = make_cbr(emb_dim, emb_dim)
1442
+ self.conv2 = make_cbr(emb_dim, emb_dim)
1443
+ self.conv3 = make_cbr(emb_dim, emb_dim)
1444
+ self.conv4 = make_cbr(emb_dim, emb_dim)
1445
+ self.dec_blk1 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1446
+ self.dec_blk2 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1447
+ self.dec_blk3 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1448
+ self.dec_blk4 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1449
+
1450
+ self.insmask_head = nn.Sequential(
1451
+ nn.Conv2d(emb_dim, 384, kernel_size=3, padding=1),
1452
+ nn.BatchNorm2d(384), nn.PReLU(),
1453
+ nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.BatchNorm2d(384),
1454
+ nn.PReLU(), nn.Conv2d(384, emb_dim, kernel_size=3, padding=1))
1455
+
1456
+ self.shallow = nn.Sequential(
1457
+ nn.Conv2d(3, emb_dim, kernel_size=3, padding=1))
1458
+ self.upsample1 = make_cbg(emb_dim, emb_dim)
1459
+ self.upsample2 = make_cbg(emb_dim, emb_dim)
1460
+ self.output = nn.Sequential(
1461
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1462
+
1463
+ for m in self.modules():
1464
+ if isinstance(m, nn.ReLU) or isinstance(m, nn.Dropout):
1465
+ m.inplace = True
1466
+
1467
+ def forward(self, x):
1468
+ shallow = self.shallow(x)
1469
+ glb = rescale_to(x, scale_factor=0.5, interpolation='bilinear')
1470
+ loc = image2patches(x)
1471
+ input = torch.cat((loc, glb), dim=0)
1472
+ feature = self.backbone(input)
1473
+ e5 = self.output5(feature[4])
1474
+ e4 = self.output4(feature[3])
1475
+ e3 = self.output3(feature[2])
1476
+ e2 = self.output2(feature[1])
1477
+ e1 = self.output1(feature[0])
1478
+ loc_e5, glb_e5 = e5.split([4, 1], dim=0)
1479
+ e5_cat = self.multifieldcrossatt(loc_e5, glb_e5)
1480
+
1481
+ e4 = self.conv4(self.dec_blk4(e4 + resize_as(e5_cat, e4)))
1482
+ e3 = self.conv3(self.dec_blk3(e3 + resize_as(e4, e3)))
1483
+ e2 = self.conv2(self.dec_blk2(e2 + resize_as(e3, e2)))
1484
+ e1 = self.conv1(self.dec_blk1(e1 + resize_as(e2, e1)))
1485
+ loc_e1, glb_e1 = e1.split([4, 1], dim=0)
1486
+ # after the decoder, stitch the local patch features back into one full map and merge
1487
+ output1_cat = patches2image(loc_e1)
1488
+ # add glb feat in
1489
+ output1_cat = output1_cat + resize_as(glb_e1, output1_cat)
1490
+ # merge
1491
+ final_output = self.insmask_head(output1_cat)
1492
+ # shallow feature merge
1493
+ final_output = final_output + resize_as(shallow, final_output)
1494
+ final_output = self.upsample1(rescale_to(final_output))
1495
+ final_output = rescale_to(final_output +
1496
+ resize_as(shallow, final_output))
1497
+ final_output = self.upsample2(final_output)
1498
+ final_output = self.output(final_output)
1499
+ return final_output
1500
+ #+end_src
1501
+
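+ 
+ A quick sanity check of the tiling scheme the model assumes (a sketch only,
+ not tangled into the generated sources): =image2patches= splits a 1024x1024
+ input into four 512x512 local crops, =rescale_to(x, 0.5)= yields one 512x512
+ global view, and the backbone always receives the two stacked as a batch of 5.
+ #+begin_src python
+ # Sketch: verify the 4-local + 1-global batch layout the decoder blocks assume.
+ import torch
+ from einops import rearrange
+ 
+ x = torch.randn(1, 3, 1024, 1024)  # one RGB image
+ loc = rearrange(x, 'b c (hg h) (wg w) -> (hg wg b) c h w', hg=2, wg=2)
+ glb = torch.nn.functional.interpolate(x, scale_factor=0.5, mode='bilinear')
+ batch = torch.cat((loc, glb), dim=0)
+ print(loc.shape, glb.shape, batch.shape)  # (4,3,512,512) (1,3,512,512) (5,3,512,512)
+ #+end_src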
1502
+ ** Function to load model
1503
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
1504
+ def mkdir_safe(out_path):
1505
+ if type(out_path) == str:
1506
+ if len(out_path) > 0:
1507
+ if not os.path.exists(out_path):
1508
+ os.mkdir(out_path)
1509
+
1510
+
1511
+ def get_model_path():
1512
+ import folder_paths
1513
+ from folder_paths import models_dir
1514
+
1515
+ path_file_model = models_dir
1516
+ mkdir_safe(out_path=path_file_model)
1517
+
1518
+ path_file_model = os.path.join(path_file_model, 'MVANet')
1519
+ mkdir_safe(out_path=path_file_model)
1520
+
1521
+ path_file_model = os.path.join(path_file_model, 'Model_80.pth')
1522
+
1523
+ return path_file_model
1524
+
1525
+
1526
+ def download_model(path):
1527
+ if not os.path.exists(path):
1528
+ wget.download(
1529
+ 'https://huggingface.co/aravindhv10/Self-Correction-Human-Parsing/resolve/main/checkpoints/Model_80.pth',
1530
+ out=path)
1531
+
1532
+
1533
+ def load_model(model_checkpoint_path):
1534
+ download_model(path=model_checkpoint_path)
1535
+ torch.cuda.set_device(0)
1536
+
1537
+ net = inf_MVANet().to(dtype=torch_dtype, device=torch_device)
1538
+
1539
+ pretrained_dict = torch.load(model_checkpoint_path,
1540
+ map_location=torch_device)
1541
+
1542
+ model_dict = net.state_dict()
1543
+ pretrained_dict = {
1544
+ k: v
1545
+ for k, v in pretrained_dict.items() if k in model_dict
1546
+ }
1547
+ model_dict.update(pretrained_dict)
1548
+ net.load_state_dict(model_dict)
1549
+ net = net.to(dtype=torch_dtype, device=torch_device)
1550
+ net.eval()
1551
+ return net
1552
+ #+end_src
1553
+
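+ 
+ For reference, a minimal way to exercise the loader (a sketch; it assumes a
+ ComfyUI runtime, since =get_model_path()= imports =folder_paths= to resolve
+ the =models/MVANet/Model_80.pth= location and downloads the checkpoint if it
+ is missing):
+ #+begin_src python
+ # Sketch: load the network once, keep it in eval mode, and reuse it.
+ net = load_model(get_model_path())
+ print(sum(p.numel() for p in net.parameters()), 'parameters loaded')
+ #+end_src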
1554
+ ** Function for modular inference CV
1555
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
1556
+ def do_infer_tensor2tensor(img, net):
1557
+
1558
+ img_transform = transforms.Compose(
1559
+ [transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
1560
+
1561
+ h_, w_ = img.shape[1], img.shape[2]
1562
+
1563
+ with torch.no_grad():
1564
+
1565
+ img = rearrange(img, 'B H W C -> B C H W')
1566
+
1567
+ img_resize = torch.nn.functional.interpolate(input=img,
1568
+ size=(1024, 1024),
1569
+ mode='bicubic',
1570
+ antialias=True)
1571
+
1572
+ img_var = img_transform(img_resize)
1573
+ img_var = Variable(img_var)
1574
+ img_var = img_var.to(dtype=torch_dtype, device=torch_device)
1575
+
1576
+ mask = []
1577
+
1578
+ mask.append(net(img_var))
1579
+
1580
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
1581
+ prediction = prediction.sigmoid()
1582
+
1583
+ prediction = torch.nn.functional.interpolate(input=prediction,
1584
+ size=(h_, w_),
1585
+ mode='bicubic',
1586
+ antialias=True)
1587
+
1588
+ prediction = prediction.squeeze(0)
1589
+ prediction = prediction.clamp(0, 1)
1590
+ prediction = prediction.detach()
1591
+ prediction = prediction.to(dtype=torch.float32, device='cpu')
1592
+
1593
+ return prediction
1594
+ #+end_src
1595
+
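+ 
+ A usage sketch (assumptions flagged inline): ComfyUI supplies images as float
+ tensors shaped =B H W C= with values in [0, 1], which is the layout this
+ function rearranges to =B C H W= and resizes to 1024x1024 before running the
+ network.
+ #+begin_src python
+ # Sketch: run the network on a dummy ComfyUI-style image tensor.
+ dummy = torch.rand(1, 768, 512, 3)  # B H W C, values in [0, 1] (assumed layout)
+ net = load_model(get_model_path())
+ mask = do_infer_tensor2tensor(img=dummy, net=net)
+ print(mask.shape)  # expected (1, 768, 512): a single-channel mask on the CPU
+ #+end_src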
1596
+ ** Comfyui wrapper classes
1597
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.class.py
1598
+ class load_MVANet_Model:
1599
+
1600
+ def __init__(self):
1601
+ pass
1602
+
1603
+ @classmethod
1604
+ def INPUT_TYPES(s):
1605
+ return {
1606
+ "required": {},
1607
+ }
1608
+
1609
+ RETURN_TYPES = ("MVANet_Model", )
1610
+ FUNCTION = "test"
1611
+ CATEGORY = "MVANet"
1612
+
1613
+ def test(self):
1614
+ return (load_model(get_model_path()), )
1615
+
1616
+
1617
+ class run_MVANet_inference:
1618
+
1619
+ def __init__(self):
1620
+ pass
1621
+
1622
+ @classmethod
1623
+ def INPUT_TYPES(s):
1624
+ return {
1625
+ "required": {
1626
+ "image": ("IMAGE", ),
1627
+ "MVANet_Model": ("MVANet_Model", ),
1628
+ },
1629
+ }
1630
+
1631
+ RETURN_TYPES = ("MASK", )
1632
+ FUNCTION = "test"
1633
+ CATEGORY = "MVANet"
1634
+
1635
+ def test(
1636
+ self,
1637
+ image,
1638
+ MVANet_Model,
1639
+ ):
1640
+ ret = do_infer_tensor2tensor(img=image, net=MVANet_Model)
1641
+
1642
+ return (ret, )
1643
+ #+end_src
1644
+
1645
+ ** MVANet_inference execute
1646
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.execute.py
1647
+ NODE_CLASS_MAPPINGS = {
1648
+ "load_MVANet_Model": load_MVANet_Model,
1649
+ "run_MVANet_inference": run_MVANet_inference
1650
+ }
1651
+
1652
+ NODE_DISPLAY_NAME_MAPPINGS = {
1653
+ "load_MVANet_Model": "load MVANet Model",
1654
+ "run_MVANet_inference": "run_MVANet_inference"
1655
+ }
1656
+ #+end_src
1657
+
1658
+ ** MVANet_inference unify
1659
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./MVANet_inference.unify.sh
1660
+ . "${HOME}/dbnew.sh"
1661
+
1662
+ (
1663
+ echo '#!/usr/bin/python3'
1664
+ cat \
1665
+ './MVANet_inference.import.py' \
1666
+ './MVANet_inference.function.py' \
1667
+ './MVANet_inference.class.py' \
1668
+ './MVANet_inference.execute.py' \
1669
+ | expand | yapf3 \
1670
+ | grep -v '#!/usr/bin/python3' \
1671
+ ;
1672
+ ) > './MVANet_inference.py' \
1673
+ ;
1674
+
1675
+ cp './MVANet_inference.py' '__init__.py'
1676
+ #+end_src
1677
+
1678
+ * WORK SPACE
1679
+
1680
+ ** elisp
1681
+ #+begin_src elisp
1682
+ (save-buffer)
1683
+ (org-babel-tangle)
1684
+ (shell-command "./MVANet_inference.unify.sh")
1685
+ #+end_src
1686
+
1687
+ #+RESULTS:
1688
+ : 0
1689
+
1690
+ ** sh
1691
+ #+begin_src sh :shebang #!/bin/sh :results output
1692
+ realpath .
1693
+ cd /home/asd/GITHUB/aravind-h-v/dreambooth_experiments/MVANet
1694
+ #+end_src
ComfyUI_MVANet/__init__.py ADDED
@@ -0,0 +1,1548 @@
1
+ #!/usr/bin/python3
2
+ import os
3
+ import sys
4
+
5
+ HOME_DIR = os.environ.get('HOME', '/root')
6
+ MVANET_SOURCE_DIR = HOME_DIR + '/GITHUB/qianyu-dlut/MVANet'
7
+ finetuned_MVANet_model_path = MVANET_SOURCE_DIR + '/model/Model_80.pth'
8
+ pretrained_SwinB_model_path = MVANET_SOURCE_DIR + '/model/swin_base_patch4_window12_384_22kto1k.pth'
9
+
10
+ import math
11
+ import numpy as np
12
+ import cv2
13
+ import wget
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ import torch.utils.checkpoint as checkpoint
19
+ from torch.autograd import Variable
20
+ from torch import nn
21
+ from torchvision import transforms
22
+
23
+ from einops import rearrange
24
+
25
+ from timm.models import load_checkpoint
26
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
27
+
28
+ torch_device = 'cuda'
29
+ torch_dtype = torch.float16
30
+
31
+
32
+ def check_mkdir(dir_name):
33
+ if not os.path.isdir(dir_name):
34
+ os.makedirs(dir_name)
35
+
36
+
37
+ def SwinT(pretrained=True):
38
+ model = SwinTransformer(embed_dim=96,
39
+ depths=[2, 2, 6, 2],
40
+ num_heads=[3, 6, 12, 24],
41
+ window_size=7)
42
+ if pretrained is True:
43
+ model.load_state_dict(torch.load(
44
+ 'data/backbone_ckpt/swin_tiny_patch4_window7_224.pth',
45
+ map_location='cpu')['model'],
46
+ strict=False)
47
+
48
+ return model
49
+
50
+
51
+ def SwinS(pretrained=True):
52
+ model = SwinTransformer(embed_dim=96,
53
+ depths=[2, 2, 18, 2],
54
+ num_heads=[3, 6, 12, 24],
55
+ window_size=7)
56
+ if pretrained is True:
57
+ model.load_state_dict(torch.load(
58
+ 'data/backbone_ckpt/swin_small_patch4_window7_224.pth',
59
+ map_location='cpu')['model'],
60
+ strict=False)
61
+
62
+ return model
63
+
64
+
65
+ def SwinB(pretrained=True):
66
+ model = SwinTransformer(embed_dim=128,
67
+ depths=[2, 2, 18, 2],
68
+ num_heads=[4, 8, 16, 32],
69
+ window_size=12)
70
+ if pretrained is True:
71
+ import os
72
+ model.load_state_dict(torch.load(pretrained_SwinB_model_path,
73
+ map_location='cpu')['model'],
74
+ strict=False)
75
+ return model
76
+
77
+
78
+ def SwinL(pretrained=True):
79
+ model = SwinTransformer(embed_dim=192,
80
+ depths=[2, 2, 18, 2],
81
+ num_heads=[6, 12, 24, 48],
82
+ window_size=12)
83
+ if pretrained is True:
84
+ model.load_state_dict(torch.load(
85
+ 'data/backbone_ckpt/swin_large_patch4_window12_384_22kto1k.pth',
86
+ map_location='cpu')['model'],
87
+ strict=False)
88
+
89
+ return model
90
+
91
+
92
+ def get_activation_fn(activation):
93
+ """Return an activation function given a string"""
94
+ if activation == "relu":
95
+ return F.relu
96
+ if activation == "gelu":
97
+ return F.gelu
98
+ if activation == "glu":
99
+ return F.glu
100
+ raise RuntimeError(F"activation should be relu/gelu/glu, not {activation}.")
101
+
102
+
103
+ def make_cbr(in_dim, out_dim):
104
+ return nn.Sequential(nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1),
105
+ nn.BatchNorm2d(out_dim), nn.PReLU())
106
+
107
+
108
+ def make_cbg(in_dim, out_dim):
109
+ return nn.Sequential(nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1),
110
+ nn.BatchNorm2d(out_dim), nn.GELU())
111
+
112
+
113
+ def rescale_to(x, scale_factor: float = 2, interpolation='nearest'):
114
+ return F.interpolate(x, scale_factor=scale_factor, mode=interpolation)
115
+
116
+
117
+ def resize_as(x, y, interpolation='bilinear'):
118
+ return F.interpolate(x, size=y.shape[-2:], mode=interpolation)
119
+
120
+
121
+ def image2patches(x):
122
+ """b c (hg h) (wg w) -> (hg wg b) c h w"""
123
+ x = rearrange(x, 'b c (hg h) (wg w) -> (hg wg b) c h w', hg=2, wg=2)
124
+ return x
125
+
126
+
127
+ def patches2image(x):
128
+ """(hg wg b) c h w -> b c (hg h) (wg w)"""
129
+ x = rearrange(x, '(hg wg b) c h w -> b c (hg h) (wg w)', hg=2, wg=2)
130
+ return x
131
+
132
+
133
+ def window_partition(x, window_size):
134
+ """
135
+ Args:
136
+ x: (B, H, W, C)
137
+ window_size (int): window size
138
+
139
+ Returns:
140
+ windows: (num_windows*B, window_size, window_size, C)
141
+ """
142
+ B, H, W, C = x.shape
143
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size,
144
+ C)
145
+ windows = x.permute(0, 1, 3, 2, 4,
146
+ 5).contiguous().view(-1, window_size, window_size, C)
147
+ return windows
148
+
149
+
150
+ def window_reverse(windows, window_size, H, W):
151
+ """
152
+ Args:
153
+ windows: (num_windows*B, window_size, window_size, C)
154
+ window_size (int): Window size
155
+ H (int): Height of image
156
+ W (int): Width of image
157
+
158
+ Returns:
159
+ x: (B, H, W, C)
160
+ """
161
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
162
+ x = windows.view(B, H // window_size, W // window_size, window_size,
163
+ window_size, -1)
164
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
165
+ return x
166
+
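+ # Added commentary: window_partition and window_reverse are exact inverses
+ # whenever H and W are multiples of window_size. For example, a (B, 56, 56, C)
+ # feature map with window_size=7 becomes (B*64, 7, 7, C) windows, and
+ # window_reverse maps those windows back to (B, 56, 56, C).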
167
+
168
+ def mkdir_safe(out_path):
169
+ if type(out_path) == str:
170
+ if len(out_path) > 0:
171
+ if not os.path.exists(out_path):
172
+ os.mkdir(out_path)
173
+
174
+
175
+ def get_model_path():
176
+ import folder_paths
177
+ from folder_paths import models_dir
178
+
179
+ path_file_model = models_dir
180
+ mkdir_safe(out_path=path_file_model)
181
+
182
+ path_file_model = os.path.join(path_file_model, 'MVANet')
183
+ mkdir_safe(out_path=path_file_model)
184
+
185
+ path_file_model = os.path.join(path_file_model, 'Model_80.pth')
186
+
187
+ return path_file_model
188
+
189
+
190
+ def download_model(path):
191
+ if not os.path.exists(path):
192
+ wget.download(
193
+ 'https://huggingface.co/aravindhv10/Self-Correction-Human-Parsing/resolve/main/checkpoints/Model_80.pth',
194
+ out=path)
195
+
196
+
197
+ def load_model(model_checkpoint_path):
198
+ download_model(path=model_checkpoint_path)
199
+ torch.cuda.set_device(0)
200
+
201
+ net = inf_MVANet().to(dtype=torch_dtype, device=torch_device)
202
+
203
+ pretrained_dict = torch.load(model_checkpoint_path,
204
+ map_location=torch_device)
205
+
206
+ model_dict = net.state_dict()
207
+ pretrained_dict = {
208
+ k: v
209
+ for k, v in pretrained_dict.items() if k in model_dict
210
+ }
211
+ model_dict.update(pretrained_dict)
212
+ net.load_state_dict(model_dict)
213
+ net = net.to(dtype=torch_dtype, device=torch_device)
214
+ net.eval()
215
+ return net
216
+
217
+
218
+ def do_infer_tensor2tensor(img, net):
219
+
220
+ img_transform = transforms.Compose(
221
+ [transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
222
+
223
+ h_, w_ = img.shape[1], img.shape[2]
224
+
225
+ with torch.no_grad():
226
+
227
+ img = rearrange(img, 'B H W C -> B C H W')
228
+
229
+ img_resize = torch.nn.functional.interpolate(input=img,
230
+ size=(1024, 1024),
231
+ mode='bicubic',
232
+ antialias=True)
233
+
234
+ img_var = img_transform(img_resize)
235
+ img_var = Variable(img_var)
236
+ img_var = img_var.to(dtype=torch_dtype, device=torch_device)
237
+
238
+ mask = []
239
+
240
+ mask.append(net(img_var))
241
+
242
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
243
+ prediction = prediction.sigmoid()
244
+
245
+ prediction = torch.nn.functional.interpolate(input=prediction,
246
+ size=(h_, w_),
247
+ mode='bicubic',
248
+ antialias=True)
249
+
250
+ prediction = prediction.squeeze(0)
251
+ prediction = prediction.clamp(0, 1)
252
+ prediction = prediction.detach()
253
+ prediction = prediction.to(dtype=torch.float32, device='cpu')
254
+
255
+ return prediction
256
+
257
+
258
+ class Mlp(nn.Module):
259
+ """ Multilayer perceptron."""
260
+
261
+ def __init__(self,
262
+ in_features,
263
+ hidden_features=None,
264
+ out_features=None,
265
+ act_layer=nn.GELU,
266
+ drop=0.):
267
+ super().__init__()
268
+ out_features = out_features or in_features
269
+ hidden_features = hidden_features or in_features
270
+ self.fc1 = nn.Linear(in_features, hidden_features)
271
+ self.act = act_layer()
272
+ self.fc2 = nn.Linear(hidden_features, out_features)
273
+ self.drop = nn.Dropout(drop)
274
+
275
+ def forward(self, x):
276
+ x = self.fc1(x)
277
+ x = self.act(x)
278
+ x = self.drop(x)
279
+ x = self.fc2(x)
280
+ x = self.drop(x)
281
+ return x
282
+
283
+
284
+ class WindowAttention(nn.Module):
285
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
286
+ It supports both of shifted and non-shifted window.
287
+
288
+ Args:
289
+ dim (int): Number of input channels.
290
+ window_size (tuple[int]): The height and width of the window.
291
+ num_heads (int): Number of attention heads.
292
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
293
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
294
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
295
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
296
+ """
297
+
298
+ def __init__(self,
299
+ dim,
300
+ window_size,
301
+ num_heads,
302
+ qkv_bias=True,
303
+ qk_scale=None,
304
+ attn_drop=0.,
305
+ proj_drop=0.):
306
+
307
+ super().__init__()
308
+ self.dim = dim
309
+ self.window_size = window_size # Wh, Ww
310
+ self.num_heads = num_heads
311
+ head_dim = dim // num_heads
312
+ self.scale = qk_scale or head_dim**-0.5
313
+
314
+ # define a parameter table of relative position bias
315
+ self.relative_position_bias_table = nn.Parameter(
316
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
317
+ num_heads)) # 2*Wh-1 * 2*Ww-1, nH
318
+
319
+ # get pair-wise relative position index for each token inside the window
320
+ coords_h = torch.arange(self.window_size[0])
321
+ coords_w = torch.arange(self.window_size[1])
322
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
323
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
324
+ relative_coords = coords_flatten[:, :,
325
+ None] - coords_flatten[:,
326
+ None, :] # 2, Wh*Ww, Wh*Ww
327
+ relative_coords = relative_coords.permute(
328
+ 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
329
+ relative_coords[:, :,
330
+ 0] += self.window_size[0] - 1 # shift to start from 0
331
+ relative_coords[:, :, 1] += self.window_size[1] - 1
332
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
333
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
334
+ self.register_buffer("relative_position_index",
335
+ relative_position_index)
336
+
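+ # Added commentary: relative_position_index maps every (query, key) pair
+ # inside a window to a row of relative_position_bias_table, so token pairs
+ # with the same spatial offset share one learned bias value per head.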
337
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
338
+ self.attn_drop = nn.Dropout(attn_drop)
339
+ self.proj = nn.Linear(dim, dim)
340
+ self.proj_drop = nn.Dropout(proj_drop)
341
+
342
+ trunc_normal_(self.relative_position_bias_table, std=.02)
343
+ self.softmax = nn.Softmax(dim=-1)
344
+
345
+ def forward(self, x, mask=None):
346
+ """ Forward function.
347
+
348
+ Args:
349
+ x: input features with shape of (num_windows*B, N, C)
350
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
351
+ """
352
+ x = x.to(dtype=torch_dtype, device=torch_device)
353
+ B_, N, C = x.shape
354
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
355
+ C // self.num_heads).permute(2, 0, 3, 1, 4)
356
+ q, k, v = qkv[0], qkv[1], qkv[
357
+ 2] # make torchscript happy (cannot use tensor as tuple)
358
+
359
+ q = q * self.scale
360
+ attn = (q @ k.transpose(-2, -1))
361
+
362
+ relative_position_bias = self.relative_position_bias_table[
363
+ self.relative_position_index.view(-1)].view(
364
+ self.window_size[0] * self.window_size[1],
365
+ self.window_size[0] * self.window_size[1],
366
+ -1) # Wh*Ww,Wh*Ww,nH
367
+ relative_position_bias = relative_position_bias.permute(
368
+ 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
369
+ attn = attn + relative_position_bias.unsqueeze(0)
370
+
371
+ if mask is not None:
372
+ nW = mask.shape[0]
373
+ attn = attn.view(B_ // nW, nW, self.num_heads, N,
374
+ N) + mask.unsqueeze(1).unsqueeze(0)
375
+ attn = attn.view(-1, self.num_heads, N, N)
376
+ attn = self.softmax(attn)
377
+ else:
378
+ attn = self.softmax(attn)
379
+
380
+ attn = self.attn_drop(attn)
381
+ attn = attn.to(dtype=torch_dtype, device=torch_device)
382
+ v = v.to(dtype=torch_dtype, device=torch_device)
383
+
384
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
385
+ x = self.proj(x)
386
+ x = self.proj_drop(x)
387
+ return x
388
+
389
+
390
+ class SwinTransformerBlock(nn.Module):
391
+ """ Swin Transformer Block.
392
+
393
+ Args:
394
+ dim (int): Number of input channels.
395
+ num_heads (int): Number of attention heads.
396
+ window_size (int): Window size.
397
+ shift_size (int): Shift size for SW-MSA.
398
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
399
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
400
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
401
+ drop (float, optional): Dropout rate. Default: 0.0
402
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
403
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
404
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
405
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
406
+ """
407
+
408
+ def __init__(self,
409
+ dim,
410
+ num_heads,
411
+ window_size=7,
412
+ shift_size=0,
413
+ mlp_ratio=4.,
414
+ qkv_bias=True,
415
+ qk_scale=None,
416
+ drop=0.,
417
+ attn_drop=0.,
418
+ drop_path=0.,
419
+ act_layer=nn.GELU,
420
+ norm_layer=nn.LayerNorm):
421
+ super().__init__()
422
+ self.dim = dim
423
+ self.num_heads = num_heads
424
+ self.window_size = window_size
425
+ self.shift_size = shift_size
426
+ self.mlp_ratio = mlp_ratio
427
+ assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
428
+
429
+ self.norm1 = norm_layer(dim)
430
+ self.attn = WindowAttention(dim,
431
+ window_size=to_2tuple(self.window_size),
432
+ num_heads=num_heads,
433
+ qkv_bias=qkv_bias,
434
+ qk_scale=qk_scale,
435
+ attn_drop=attn_drop,
436
+ proj_drop=drop)
437
+
438
+ self.drop_path = DropPath(
439
+ drop_path) if drop_path > 0. else nn.Identity()
440
+ self.norm2 = norm_layer(dim)
441
+ mlp_hidden_dim = int(dim * mlp_ratio)
442
+ self.mlp = Mlp(in_features=dim,
443
+ hidden_features=mlp_hidden_dim,
444
+ act_layer=act_layer,
445
+ drop=drop)
446
+
447
+ self.H = None
448
+ self.W = None
449
+
450
+ def forward(self, x, mask_matrix):
451
+ """ Forward function.
452
+
453
+ Args:
454
+ x: Input feature, tensor size (B, H*W, C).
455
+ H, W: Spatial resolution of the input feature.
456
+ mask_matrix: Attention mask for cyclic shift.
457
+ """
458
+ B, L, C = x.shape
459
+ H, W = self.H, self.W
460
+ assert L == H * W, "input feature has wrong size"
461
+
462
+ shortcut = x
463
+ x = self.norm1(x)
464
+ x = x.view(B, H, W, C)
465
+
466
+ # pad feature maps to multiples of window size
467
+ pad_l = pad_t = 0
468
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
469
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
470
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
471
+ _, Hp, Wp, _ = x.shape
472
+
473
+ # cyclic shift
474
+ if self.shift_size > 0:
475
+ shifted_x = torch.roll(x,
476
+ shifts=(-self.shift_size, -self.shift_size),
477
+ dims=(1, 2))
478
+ attn_mask = mask_matrix
479
+ else:
480
+ shifted_x = x
481
+ attn_mask = None
482
+
483
+ # partition windows
484
+ x_windows = window_partition(
485
+ shifted_x, self.window_size) # nW*B, window_size, window_size, C
486
+ x_windows = x_windows.view(-1, self.window_size * self.window_size,
487
+ C) # nW*B, window_size*window_size, C
488
+
489
+ # W-MSA/SW-MSA
490
+ attn_windows = self.attn(
491
+ x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
492
+
493
+ # merge windows
494
+ attn_windows = attn_windows.view(-1, self.window_size,
495
+ self.window_size, C)
496
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp,
497
+ Wp) # B H' W' C
498
+
499
+ # reverse cyclic shift
500
+ if self.shift_size > 0:
501
+ x = torch.roll(shifted_x,
502
+ shifts=(self.shift_size, self.shift_size),
503
+ dims=(1, 2))
504
+ else:
505
+ x = shifted_x
506
+
507
+ if pad_r > 0 or pad_b > 0:
508
+ x = x[:, :H, :W, :].contiguous()
509
+
510
+ x = x.view(B, H * W, C)
511
+
512
+ # FFN
513
+ x = shortcut + self.drop_path(x)
514
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
515
+
516
+ return x
517
+
518
+
519
+ class PatchMerging(nn.Module):
520
+ """ Patch Merging Layer
521
+
522
+ Args:
523
+ dim (int): Number of input channels.
524
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
525
+ """
526
+
527
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
528
+ super().__init__()
529
+ self.dim = dim
530
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
531
+ self.norm = norm_layer(4 * dim)
532
+
533
+ def forward(self, x, H, W):
534
+ """ Forward function.
535
+
536
+ Args:
537
+ x: Input feature, tensor size (B, H*W, C).
538
+ H, W: Spatial resolution of the input feature.
539
+ """
540
+ B, L, C = x.shape
541
+ assert L == H * W, "input feature has wrong size"
542
+
543
+ x = x.view(B, H, W, C)
544
+
545
+ # padding
546
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
547
+ if pad_input:
548
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
549
+
550
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
551
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
552
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
553
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
554
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
555
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
556
+
557
+ x = self.norm(x)
558
+ x = self.reduction(x)
559
+
560
+ return x
561
+
562
+
563
+ class BasicLayer(nn.Module):
564
+ """ A basic Swin Transformer layer for one stage.
565
+
566
+ Args:
567
+ dim (int): Number of feature channels
568
+ depth (int): Depths of this stage.
569
+ num_heads (int): Number of attention head.
570
+ window_size (int): Local window size. Default: 7.
571
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
572
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
573
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
574
+ drop (float, optional): Dropout rate. Default: 0.0
575
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
576
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
577
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
578
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
579
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
580
+ """
581
+
582
+ def __init__(self,
583
+ dim,
584
+ depth,
585
+ num_heads,
586
+ window_size=7,
587
+ mlp_ratio=4.,
588
+ qkv_bias=True,
589
+ qk_scale=None,
590
+ drop=0.,
591
+ attn_drop=0.,
592
+ drop_path=0.,
593
+ norm_layer=nn.LayerNorm,
594
+ downsample=None,
595
+ use_checkpoint=False):
596
+ super().__init__()
597
+ self.window_size = window_size
598
+ self.shift_size = window_size // 2
599
+ self.depth = depth
600
+ self.use_checkpoint = use_checkpoint
601
+
602
+ # build blocks
603
+ self.blocks = nn.ModuleList([
604
+ SwinTransformerBlock(dim=dim,
605
+ num_heads=num_heads,
606
+ window_size=window_size,
607
+ shift_size=0 if
608
+ (i % 2 == 0) else window_size // 2,
609
+ mlp_ratio=mlp_ratio,
610
+ qkv_bias=qkv_bias,
611
+ qk_scale=qk_scale,
612
+ drop=drop,
613
+ attn_drop=attn_drop,
614
+ drop_path=drop_path[i] if isinstance(
615
+ drop_path, list) else drop_path,
616
+ norm_layer=norm_layer) for i in range(depth)
617
+ ])
618
+
619
+ # patch merging layer
620
+ if downsample is not None:
621
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
622
+ else:
623
+ self.downsample = None
624
+
625
+ def forward(self, x, H, W):
626
+ """ Forward function.
627
+
628
+ Args:
629
+ x: Input feature, tensor size (B, H*W, C).
630
+ H, W: Spatial resolution of the input feature.
631
+ """
632
+
633
+ # calculate attention mask for SW-MSA
634
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
635
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
636
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
637
+ h_slices = (slice(0, -self.window_size),
638
+ slice(-self.window_size,
639
+ -self.shift_size), slice(-self.shift_size, None))
640
+ w_slices = (slice(0, -self.window_size),
641
+ slice(-self.window_size,
642
+ -self.shift_size), slice(-self.shift_size, None))
643
+ cnt = 0
644
+ for h in h_slices:
645
+ for w in w_slices:
646
+ img_mask[:, h, w, :] = cnt
647
+ cnt += 1
648
+
649
+ mask_windows = window_partition(
650
+ img_mask, self.window_size) # nW, window_size, window_size, 1
651
+ mask_windows = mask_windows.view(-1,
652
+ self.window_size * self.window_size)
653
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
654
+ attn_mask = attn_mask.masked_fill(attn_mask != 0,
655
+ float(-100.0)).masked_fill(
656
+ attn_mask == 0, float(0.0))
657
+
658
+ for blk in self.blocks:
659
+ blk.H, blk.W = H, W
660
+ if self.use_checkpoint:
661
+ x = checkpoint.checkpoint(blk, x, attn_mask)
662
+ else:
663
+ x = blk(x, attn_mask)
664
+ if self.downsample is not None:
665
+ x_down = self.downsample(x, H, W)
666
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
667
+ return x, H, W, x_down, Wh, Ww
668
+ else:
669
+ return x, H, W, x, H, W
670
+
671
+
672
+ class PatchEmbed(nn.Module):
673
+ """ Image to Patch Embedding
674
+
675
+ Args:
676
+ patch_size (int): Patch token size. Default: 4.
677
+ in_chans (int): Number of input image channels. Default: 3.
678
+ embed_dim (int): Number of linear projection output channels. Default: 96.
679
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
680
+ """
681
+
682
+ def __init__(self,
683
+ patch_size=4,
684
+ in_chans=3,
685
+ embed_dim=96,
686
+ norm_layer=None):
687
+ super().__init__()
688
+ patch_size = to_2tuple(patch_size)
689
+ self.patch_size = patch_size
690
+
691
+ self.in_chans = in_chans
692
+ self.embed_dim = embed_dim
693
+
694
+ self.proj = nn.Conv2d(in_chans,
695
+ embed_dim,
696
+ kernel_size=patch_size,
697
+ stride=patch_size)
698
+ if norm_layer is not None:
699
+ self.norm = norm_layer(embed_dim)
700
+ else:
701
+ self.norm = None
702
+
703
+ def forward(self, x):
704
+ """Forward function."""
705
+ # padding
706
+ _, _, H, W = x.size()
707
+ if W % self.patch_size[1] != 0:
708
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
709
+ if H % self.patch_size[0] != 0:
710
+ x = F.pad(x,
711
+ (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
712
+
713
+ x = self.proj(x) # B C Wh Ww
714
+ if self.norm is not None:
715
+ Wh, Ww = x.size(2), x.size(3)
716
+ x = x.flatten(2).transpose(1, 2)
717
+ x = self.norm(x)
718
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
719
+
720
+ return x
721
+
722
+
723
+ class SwinTransformer(nn.Module):
724
+ """ Swin Transformer backbone.
725
+ A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
726
+ https://arxiv.org/pdf/2103.14030
727
+
728
+ Args:
729
+ pretrain_img_size (int): Input image size for training the pretrained model,
730
+ used in absolute postion embedding. Default 224.
731
+ patch_size (int | tuple(int)): Patch size. Default: 4.
732
+ in_chans (int): Number of input image channels. Default: 3.
733
+ embed_dim (int): Number of linear projection output channels. Default: 96.
734
+ depths (tuple[int]): Depths of each Swin Transformer stage.
735
+ num_heads (tuple[int]): Number of attention head of each stage.
736
+ window_size (int): Window size. Default: 7.
737
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
738
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
739
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
740
+ drop_rate (float): Dropout rate.
741
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
742
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
743
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
744
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
745
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
746
+ out_indices (Sequence[int]): Output from which stages.
747
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
748
+ -1 means not freezing any parameters.
749
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
750
+ """
751
+
752
+ def __init__(self,
753
+ pretrain_img_size=224,
754
+ patch_size=4,
755
+ in_chans=3,
756
+ embed_dim=96,
757
+ depths=[2, 2, 6, 2],
758
+ num_heads=[3, 6, 12, 24],
759
+ window_size=7,
760
+ mlp_ratio=4.,
761
+ qkv_bias=True,
762
+ qk_scale=None,
763
+ drop_rate=0.,
764
+ attn_drop_rate=0.,
765
+ drop_path_rate=0.2,
766
+ norm_layer=nn.LayerNorm,
767
+ ape=False,
768
+ patch_norm=True,
769
+ out_indices=(0, 1, 2, 3),
770
+ frozen_stages=-1,
771
+ use_checkpoint=False):
772
+ super().__init__()
773
+
774
+ self.pretrain_img_size = pretrain_img_size
775
+ self.num_layers = len(depths)
776
+ self.embed_dim = embed_dim
777
+ self.ape = ape
778
+ self.patch_norm = patch_norm
779
+ self.out_indices = out_indices
780
+ self.frozen_stages = frozen_stages
781
+
782
+ # split image into non-overlapping patches
783
+ self.patch_embed = PatchEmbed(
784
+ patch_size=patch_size,
785
+ in_chans=in_chans,
786
+ embed_dim=embed_dim,
787
+ norm_layer=norm_layer if self.patch_norm else None)
788
+
789
+ # absolute position embedding
790
+ if self.ape:
791
+ pretrain_img_size = to_2tuple(pretrain_img_size)
792
+ patch_size = to_2tuple(patch_size)
793
+ patches_resolution = [
794
+ pretrain_img_size[0] // patch_size[0],
795
+ pretrain_img_size[1] // patch_size[1]
796
+ ]
797
+
798
+ self.absolute_pos_embed = nn.Parameter(
799
+ torch.zeros(1, embed_dim, patches_resolution[0],
800
+ patches_resolution[1]))
801
+ trunc_normal_(self.absolute_pos_embed, std=.02)
802
+
803
+ self.pos_drop = nn.Dropout(p=drop_rate)
804
+
805
+ # stochastic depth
806
+ dpr = [
807
+ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
808
+ ] # stochastic depth decay rule
809
+
810
+ # build layers
811
+ self.layers = nn.ModuleList()
812
+ for i_layer in range(self.num_layers):
813
+ layer = BasicLayer(
814
+ dim=int(embed_dim * 2**i_layer),
815
+ depth=depths[i_layer],
816
+ num_heads=num_heads[i_layer],
817
+ window_size=window_size,
818
+ mlp_ratio=mlp_ratio,
819
+ qkv_bias=qkv_bias,
820
+ qk_scale=qk_scale,
821
+ drop=drop_rate,
822
+ attn_drop=attn_drop_rate,
823
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
824
+ norm_layer=norm_layer,
825
+ downsample=PatchMerging if
826
+ (i_layer < self.num_layers - 1) else None,
827
+ use_checkpoint=use_checkpoint)
828
+ self.layers.append(layer)
829
+
830
+ num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
831
+ self.num_features = num_features
832
+
833
+ # add a norm layer for each output
834
+ for i_layer in out_indices:
835
+ layer = norm_layer(num_features[i_layer])
836
+ layer_name = f'norm{i_layer}'
837
+ self.add_module(layer_name, layer)
838
+
839
+ self._freeze_stages()
840
+
841
+ def _freeze_stages(self):
842
+ if self.frozen_stages >= 0:
843
+ self.patch_embed.eval()
844
+ for param in self.patch_embed.parameters():
845
+ param.requires_grad = False
846
+
847
+ if self.frozen_stages >= 1 and self.ape:
848
+ self.absolute_pos_embed.requires_grad = False
849
+
850
+ if self.frozen_stages >= 2:
851
+ self.pos_drop.eval()
852
+ for i in range(0, self.frozen_stages - 1):
853
+ m = self.layers[i]
854
+ m.eval()
855
+ for param in m.parameters():
856
+ param.requires_grad = False
857
+
858
+ def init_weights(self, pretrained=None):
859
+ """Initialize the weights in backbone.
860
+
861
+ Args:
862
+ pretrained (str, optional): Path to pre-trained weights.
863
+ Defaults to None.
864
+ """
865
+
866
+ def _init_weights(m):
867
+ if isinstance(m, nn.Linear):
868
+ trunc_normal_(m.weight, std=.02)
869
+ if isinstance(m, nn.Linear) and m.bias is not None:
870
+ nn.init.constant_(m.bias, 0)
871
+ elif isinstance(m, nn.LayerNorm):
872
+ nn.init.constant_(m.bias, 0)
873
+ nn.init.constant_(m.weight, 1.0)
874
+
875
+ if isinstance(pretrained, str):
876
+ self.apply(_init_weights)
877
+ load_checkpoint(self, pretrained, strict=False, logger=None)
878
+ elif pretrained is None:
879
+ self.apply(_init_weights)
880
+ else:
881
+ raise TypeError('pretrained must be a str or None')
882
+
883
+ def forward(self, x):
884
+ x = self.patch_embed(x)
885
+
886
+ Wh, Ww = x.size(2), x.size(3)
887
+ if self.ape:
888
+ # interpolate the position embedding to the corresponding size
889
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed,
890
+ size=(Wh, Ww),
891
+ mode='bicubic')
892
+ x = (x + absolute_pos_embed) # B Wh*Ww C
893
+
894
+ outs = [x.contiguous()]
895
+ x = x.flatten(2).transpose(1, 2)
896
+ x = self.pos_drop(x)
897
+ for i in range(self.num_layers):
898
+ layer = self.layers[i]
899
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
900
+
901
+ if i in self.out_indices:
902
+ norm_layer = getattr(self, f'norm{i}')
903
+ x_out = norm_layer(x_out)
904
+
905
+ out = x_out.view(-1, H, W,
906
+ self.num_features[i]).permute(0, 3, 1,
907
+ 2).contiguous()
908
+ outs.append(out)
909
+
910
+ return tuple(outs)
911
+
912
+ def train(self, mode=True):
913
+ """Convert the model into training mode while keep layers freezed."""
914
+ super(SwinTransformer, self).train(mode)
915
+ self._freeze_stages()
916
+
917
+
918
+ class PositionEmbeddingSine:
919
+
920
+ def __init__(self,
921
+ num_pos_feats=64,
922
+ temperature=10000,
923
+ normalize=False,
924
+ scale=None):
925
+ super().__init__()
926
+ self.num_pos_feats = num_pos_feats
927
+ self.temperature = temperature
928
+ self.normalize = normalize
929
+ if scale is not None and normalize is False:
930
+ raise ValueError("normalize should be True if scale is passed")
931
+ if scale is None:
932
+ scale = 2 * math.pi
933
+ self.scale = scale
934
+ self.dim_t = torch.arange(0,
935
+ self.num_pos_feats,
936
+ dtype=torch_dtype,
937
+ device=torch_device)
938
+
939
+ def __call__(self, b, h, w):
940
+ mask = torch.zeros([b, h, w], dtype=torch.bool, device=torch_device)
941
+ assert mask is not None
942
+ not_mask = ~mask
943
+ y_embed = not_mask.cumsum(dim=1, dtype=torch_dtype)
944
+ x_embed = not_mask.cumsum(dim=2, dtype=torch_dtype)
945
+ if self.normalize:
946
+ eps = 1e-6
947
+ y_embed = ((y_embed - 0.5) / (y_embed[:, -1:, :] + eps) *
948
+ self.scale).to(device=torch_device, dtype=torch_dtype)
949
+ x_embed = ((x_embed - 0.5) / (x_embed[:, :, -1:] + eps) *
950
+ self.scale).to(device=torch_device, dtype=torch_dtype)
951
+
952
+ dim_t = self.temperature**(2 * (self.dim_t // 2) / self.num_pos_feats)
953
+
954
+ pos_x = x_embed[:, :, :, None] / dim_t
955
+ pos_y = y_embed[:, :, :, None] / dim_t
956
+ pos_x = torch.stack(
957
+ (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
958
+ dim=4).flatten(3)
959
+ pos_y = torch.stack(
960
+ (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
961
+ dim=4).flatten(3)
962
+ return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
963
+
964
+
965
+ class MCLM(nn.Module):
966
+
967
+ def __init__(self, d_model, num_heads, pool_ratios=[1, 4, 8]):
968
+ super(MCLM, self).__init__()
969
+ self.attention = nn.ModuleList([
970
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
971
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
972
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
973
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
974
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
975
+ ])
976
+
977
+ self.linear1 = nn.Linear(d_model, d_model * 2)
978
+ self.linear2 = nn.Linear(d_model * 2, d_model)
979
+ self.linear3 = nn.Linear(d_model, d_model * 2)
980
+ self.linear4 = nn.Linear(d_model * 2, d_model)
981
+ self.norm1 = nn.LayerNorm(d_model)
982
+ self.norm2 = nn.LayerNorm(d_model)
983
+ self.dropout = nn.Dropout(0.1)
984
+ self.dropout1 = nn.Dropout(0.1)
985
+ self.dropout2 = nn.Dropout(0.1)
986
+ self.activation = get_activation_fn('relu')
987
+ self.pool_ratios = pool_ratios
988
+ self.p_poses = []
989
+ self.g_pos = None
990
+ self.positional_encoding = PositionEmbeddingSine(
991
+ num_pos_feats=d_model // 2, normalize=True)
992
+
993
+ def forward(self, l, g):
994
+ """
995
+ l: 4,c,h,w
996
+ g: 1,c,h,w
997
+ """
998
+ b, c, h, w = l.size()
999
+ # 4,c,h,w -> 1,c,2h,2w
1000
+ concated_locs = rearrange(l,
1001
+ '(hg wg b) c h w -> b c (hg h) (wg w)',
1002
+ hg=2,
1003
+ wg=2)
1004
+
1005
+ pools = []
1006
+ for pool_ratio in self.pool_ratios:
1007
+ # b,c,h,w
1008
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1009
+ pool = F.adaptive_avg_pool2d(concated_locs, tgt_hw)
1010
+ pools.append(rearrange(pool, 'b c h w -> (h w) b c'))
1011
+ if self.g_pos is None:
1012
+ pos_emb = self.positional_encoding(pool.shape[0],
1013
+ pool.shape[2],
1014
+ pool.shape[3])
1015
+ pos_emb = rearrange(pos_emb, 'b c h w -> (h w) b c')
1016
+ self.p_poses.append(pos_emb)
1017
+ pools = torch.cat(pools, 0)
1018
+ if self.g_pos is None:
1019
+ self.p_poses = torch.cat(self.p_poses, dim=0)
1020
+ pos_emb = self.positional_encoding(g.shape[0], g.shape[2],
1021
+ g.shape[3])
1022
+ self.g_pos = rearrange(pos_emb, 'b c h w -> (h w) b c')
1023
+
1024
+ # attention between glb (q) & multi-scale concatenated locs (k,v)
1025
+ g_hw_b_c = rearrange(g, 'b c h w -> (h w) b c')
1026
+ g_hw_b_c = g_hw_b_c + self.dropout1(self.attention[0](
1027
+ g_hw_b_c + self.g_pos, pools + self.p_poses, pools)[0])
1028
+ g_hw_b_c = self.norm1(g_hw_b_c)
1029
+ g_hw_b_c = g_hw_b_c + self.dropout2(
1030
+ self.linear2(
1031
+ self.dropout(self.activation(self.linear1(g_hw_b_c)).clone())))
1032
+ g_hw_b_c = self.norm2(g_hw_b_c)
1033
+
1034
+ # attention between original locs (q) & refreshed glb (k,v)
1035
+ l_hw_b_c = rearrange(l, "b c h w -> (h w) b c")
1036
+ _g_hw_b_c = rearrange(g_hw_b_c, '(h w) b c -> h w b c', h=h, w=w)
1037
+ _g_hw_b_c = rearrange(_g_hw_b_c,
1038
+ "(ng h) (nw w) b c -> (h w) (ng nw b) c",
1039
+ ng=2,
1040
+ nw=2)
1041
+ outputs_re = []
1042
+ for i, (_l, _g) in enumerate(
1043
+ zip(l_hw_b_c.chunk(4, dim=1), _g_hw_b_c.chunk(4, dim=1))):
1044
+ outputs_re.append(self.attention[i + 1](_l, _g,
1045
+ _g)[0]) # (h w) 1 c
1046
+ outputs_re = torch.cat(outputs_re, 1) # (h w) 4 c
1047
+
1048
+ l_hw_b_c = l_hw_b_c + self.dropout1(outputs_re)
1049
+ l_hw_b_c = self.norm1(l_hw_b_c)
1050
+ l_hw_b_c = l_hw_b_c + self.dropout2(
1051
+ self.linear4(
1052
+ self.dropout(self.activation(self.linear3(l_hw_b_c)).clone())))
1053
+ l_hw_b_c = self.norm2(l_hw_b_c)
1054
+
1055
+ l = torch.cat((l_hw_b_c, g_hw_b_c), 1) # hw,b(5),c
1056
+ return rearrange(l, "(h w) b c -> b c h w", h=h, w=w) ## (5,c,h*w)
1057
+
1058
+
1059
+ class inf_MCLM(nn.Module):
1060
+
1061
+ def __init__(self, d_model, num_heads, pool_ratios=[1, 4, 8]):
1062
+ super(inf_MCLM, self).__init__()
1063
+ self.attention = nn.ModuleList([
1064
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1065
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1066
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1067
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1068
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1069
+ ])
1070
+
1071
+ self.linear1 = nn.Linear(d_model, d_model * 2)
1072
+ self.linear2 = nn.Linear(d_model * 2, d_model)
1073
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1074
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1075
+ self.norm1 = nn.LayerNorm(d_model)
1076
+ self.norm2 = nn.LayerNorm(d_model)
1077
+ self.dropout = nn.Dropout(0.1)
1078
+ self.dropout1 = nn.Dropout(0.1)
1079
+ self.dropout2 = nn.Dropout(0.1)
1080
+ self.activation = get_activation_fn('relu')
1081
+ self.pool_ratios = pool_ratios
1082
+ self.p_poses = []
1083
+ self.g_pos = None
1084
+ self.positional_encoding = PositionEmbeddingSine(
1085
+ num_pos_feats=d_model // 2, normalize=True)
1086
+
1087
+ def forward(self, l, g):
1088
+ """
1089
+ l: 4,c,h,w
1090
+ g: 1,c,h,w
1091
+ """
1092
+ b, c, h, w = l.size()
1093
+ # 4,c,h,w -> 1,c,2h,2w
1094
+ concated_locs = rearrange(l,
1095
+ '(hg wg b) c h w -> b c (hg h) (wg w)',
1096
+ hg=2,
1097
+ wg=2)
1098
+ self.p_poses = []
1099
+ pools = []
1100
+ for pool_ratio in self.pool_ratios:
1101
+ # b,c,h,w
1102
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1103
+ pool = F.adaptive_avg_pool2d(concated_locs, tgt_hw)
1104
+ pools.append(rearrange(pool, 'b c h w -> (h w) b c'))
1105
+ # if self.g_pos is None:
1106
+ pos_emb = self.positional_encoding(pool.shape[0], pool.shape[2],
1107
+ pool.shape[3])
1108
+ pos_emb = rearrange(pos_emb, 'b c h w -> (h w) b c')
1109
+ self.p_poses.append(pos_emb)
1110
+ pools = torch.cat(pools, 0)
1111
+ # if self.g_pos is None:
1112
+ self.p_poses = torch.cat(self.p_poses, dim=0)
1113
+ pos_emb = self.positional_encoding(g.shape[0], g.shape[2], g.shape[3])
1114
+ self.g_pos = rearrange(pos_emb, 'b c h w -> (h w) b c')
1115
+
1116
+ # attention between glb (q) & multi-scale concatenated locs (k,v)
1117
+ g_hw_b_c = rearrange(g, 'b c h w -> (h w) b c')
1118
+ g_hw_b_c = g_hw_b_c + self.dropout1(self.attention[0](
1119
+ g_hw_b_c + self.g_pos, pools + self.p_poses, pools)[0])
1120
+ g_hw_b_c = self.norm1(g_hw_b_c)
1121
+ g_hw_b_c = g_hw_b_c + self.dropout2(
1122
+ self.linear2(
1123
+ self.dropout(self.activation(self.linear1(g_hw_b_c)).clone())))
1124
+ g_hw_b_c = self.norm2(g_hw_b_c)
1125
+
1126
+ # attention between original locs (q) & refreshed glb (k,v)
1127
+ l_hw_b_c = rearrange(l, "b c h w -> (h w) b c")
1128
+ _g_hw_b_c = rearrange(g_hw_b_c, '(h w) b c -> h w b c', h=h, w=w)
1129
+ _g_hw_b_c = rearrange(_g_hw_b_c,
1130
+ "(ng h) (nw w) b c -> (h w) (ng nw b) c",
1131
+ ng=2,
1132
+ nw=2)
1133
+ outputs_re = []
1134
+ for i, (_l, _g) in enumerate(
1135
+ zip(l_hw_b_c.chunk(4, dim=1), _g_hw_b_c.chunk(4, dim=1))):
1136
+ outputs_re.append(self.attention[i + 1](_l, _g,
1137
+ _g)[0]) # (h w) 1 c
1138
+ outputs_re = torch.cat(outputs_re, 1) # (h w) 4 c
1139
+
1140
+ l_hw_b_c = l_hw_b_c + self.dropout1(outputs_re)
1141
+ l_hw_b_c = self.norm1(l_hw_b_c)
1142
+ l_hw_b_c = l_hw_b_c + self.dropout2(
1143
+ self.linear4(
1144
+ self.dropout(self.activation(self.linear3(l_hw_b_c)).clone())))
1145
+ l_hw_b_c = self.norm2(l_hw_b_c)
1146
+
1147
+ l = torch.cat((l_hw_b_c, g_hw_b_c), 1) # hw,b(5),c
1148
+ return rearrange(l, "(h w) b c -> b c h w", h=h, w=w) ## (5,c,h*w)
1149
+
1150
+
1151
+ class MCRM(nn.Module):
1152
+
1153
+ def __init__(self, d_model, num_heads, pool_ratios=[4, 8, 16], h=None):
1154
+ super(MCRM, self).__init__()
1155
+ self.attention = nn.ModuleList([
1156
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1157
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1158
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1159
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1160
+ ])
1161
+
1162
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1163
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1164
+ self.norm1 = nn.LayerNorm(d_model)
1165
+ self.norm2 = nn.LayerNorm(d_model)
1166
+ self.dropout = nn.Dropout(0.1)
1167
+ self.dropout1 = nn.Dropout(0.1)
1168
+ self.dropout2 = nn.Dropout(0.1)
1169
+ self.sigmoid = nn.Sigmoid()
1170
+ self.activation = get_activation_fn('relu')
1171
+ self.sal_conv = nn.Conv2d(d_model, 1, 1)
1172
+ self.pool_ratios = pool_ratios
1173
+ self.positional_encoding = PositionEmbeddingSine(
1174
+ num_pos_feats=d_model // 2, normalize=True)
1175
+
1176
+ def forward(self, x):
1177
+ b, c, h, w = x.size()
1178
+ loc, glb = x.split([4, 1], dim=0) # 4,c,h,w; 1,c,h,w
1179
+ # b(4),c,h,w
1180
+ patched_glb = rearrange(glb,
1181
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1182
+ hg=2,
1183
+ wg=2)
1184
+
1185
+ # generate token attention map
1186
+ token_attention_map = self.sigmoid(self.sal_conv(glb))
1187
+ token_attention_map = F.interpolate(token_attention_map,
1188
+ size=patches2image(loc).shape[-2:],
1189
+ mode='nearest')
1190
+ loc = loc * rearrange(token_attention_map,
1191
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1192
+ hg=2,
1193
+ wg=2)
1194
+ pools = []
1195
+ for pool_ratio in self.pool_ratios:
1196
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1197
+ pool = F.adaptive_avg_pool2d(patched_glb, tgt_hw)
1198
+ pools.append(rearrange(pool,
1199
+ 'nl c h w -> nl c (h w)')) # nl(4),c,hw
1200
+ # nl(4),c,nphw -> nl(4),nphw,1,c
1201
+ pools = rearrange(torch.cat(pools, 2), "nl c nphw -> nl nphw 1 c")
1202
+ loc_ = rearrange(loc, 'nl c h w -> nl (h w) 1 c')
1203
+ outputs = []
1204
+ for i, q in enumerate(
1205
+ loc_.unbind(dim=0)): # traverse all local patches
1206
+ # np*hw,1,c
1207
+ v = pools[i]
1208
+ k = v
1209
+ outputs.append(self.attention[i](q, k, v)[0])
1210
+ outputs = torch.cat(outputs, 1)
1211
+ src = loc.view(4, c, -1).permute(2, 0, 1) + self.dropout1(outputs)
1212
+ src = self.norm1(src)
1213
+ src = src + self.dropout2(
1214
+ self.linear4(
1215
+ self.dropout(self.activation(self.linear3(src)).clone())))
1216
+ src = self.norm2(src)
1217
+
1218
+ src = src.permute(1, 2, 0).reshape(4, c, h, w) # refreshed loc
1219
+ glb = glb + F.interpolate(patches2image(src),
1220
+ size=glb.shape[-2:],
1221
+ mode='nearest') # refreshed glb
1222
+ return torch.cat((src, glb), 0), token_attention_map
1223
+
1224
+
1225
+ class inf_MCRM(nn.Module):
1226
+
1227
+ def __init__(self, d_model, num_heads, pool_ratios=[4, 8, 16], h=None):
1228
+ super(inf_MCRM, self).__init__()
1229
+ self.attention = nn.ModuleList([
1230
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1231
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1232
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1233
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1234
+ ])
1235
+
1236
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1237
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1238
+ self.norm1 = nn.LayerNorm(d_model)
1239
+ self.norm2 = nn.LayerNorm(d_model)
1240
+ self.dropout = nn.Dropout(0.1)
1241
+ self.dropout1 = nn.Dropout(0.1)
1242
+ self.dropout2 = nn.Dropout(0.1)
1243
+ self.sigmoid = nn.Sigmoid()
1244
+ self.activation = get_activation_fn('relu')
1245
+ self.sal_conv = nn.Conv2d(d_model, 1, 1)
1246
+ self.pool_ratios = pool_ratios
1247
+ self.positional_encoding = PositionEmbeddingSine(
1248
+ num_pos_feats=d_model // 2, normalize=True)
1249
+
1250
+ def forward(self, x):
1251
+ b, c, h, w = x.size()
1252
+ loc, glb = x.split([4, 1], dim=0) # 4,c,h,w; 1,c,h,w
1253
+ # b(4),c,h,w
1254
+ patched_glb = rearrange(glb,
1255
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1256
+ hg=2,
1257
+ wg=2)
1258
+
1259
+ # generate token attention map
1260
+ token_attention_map = self.sigmoid(self.sal_conv(glb))
1261
+ token_attention_map = F.interpolate(token_attention_map,
1262
+ size=patches2image(loc).shape[-2:],
1263
+ mode='nearest')
1264
+ loc = loc * rearrange(token_attention_map,
1265
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1266
+ hg=2,
1267
+ wg=2)
1268
+ pools = []
1269
+ for pool_ratio in self.pool_ratios:
1270
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1271
+ pool = F.adaptive_avg_pool2d(patched_glb, tgt_hw)
1272
+ pools.append(rearrange(pool,
1273
+ 'nl c h w -> nl c (h w)')) # nl(4),c,hw
1274
+ # nl(4),c,nphw -> nl(4),nphw,1,c
1275
+ pools = rearrange(torch.cat(pools, 2), "nl c nphw -> nl nphw 1 c")
1276
+ loc_ = rearrange(loc, 'nl c h w -> nl (h w) 1 c')
1277
+ outputs = []
1278
+ for i, q in enumerate(
1279
+ loc_.unbind(dim=0)): # traverse all local patches
1280
+ # np*hw,1,c
1281
+ v = pools[i]
1282
+ k = v
1283
+ outputs.append(self.attention[i](q, k, v)[0])
1284
+ outputs = torch.cat(outputs, 1)
1285
+ src = loc.view(4, c, -1).permute(2, 0, 1) + self.dropout1(outputs)
1286
+ src = self.norm1(src)
1287
+ src = src + self.dropout2(
1288
+ self.linear4(
1289
+ self.dropout(self.activation(self.linear3(src)).clone())))
1290
+ src = self.norm2(src)
1291
+
1292
+ src = src.permute(1, 2, 0).reshape(4, c, h, w) # refreshed loc
1293
+ glb = glb + F.interpolate(patches2image(src),
1294
+ size=glb.shape[-2:],
1295
+ mode='nearest') # refreshed glb
1296
+ return torch.cat((src, glb), 0)
1297
+
1298
+
1299
+ # model for single-scale training
1300
+ class MVANet(nn.Module):
1301
+
1302
+ def __init__(self):
1303
+ super().__init__()
1304
+ self.backbone = SwinB(pretrained=True)
1305
+ emb_dim = 128
1306
+ self.sideout5 = nn.Sequential(
1307
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1308
+ self.sideout4 = nn.Sequential(
1309
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1310
+ self.sideout3 = nn.Sequential(
1311
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1312
+ self.sideout2 = nn.Sequential(
1313
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1314
+ self.sideout1 = nn.Sequential(
1315
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1316
+
1317
+ self.output5 = make_cbr(1024, emb_dim)
1318
+ self.output4 = make_cbr(512, emb_dim)
1319
+ self.output3 = make_cbr(256, emb_dim)
1320
+ self.output2 = make_cbr(128, emb_dim)
1321
+ self.output1 = make_cbr(128, emb_dim)
1322
+
1323
+ self.multifieldcrossatt = MCLM(emb_dim, 1, [1, 4, 8])
1324
+ self.conv1 = make_cbr(emb_dim, emb_dim)
1325
+ self.conv2 = make_cbr(emb_dim, emb_dim)
1326
+ self.conv3 = make_cbr(emb_dim, emb_dim)
1327
+ self.conv4 = make_cbr(emb_dim, emb_dim)
1328
+ self.dec_blk1 = MCRM(emb_dim, 1, [2, 4, 8])
1329
+ self.dec_blk2 = MCRM(emb_dim, 1, [2, 4, 8])
1330
+ self.dec_blk3 = MCRM(emb_dim, 1, [2, 4, 8])
1331
+ self.dec_blk4 = MCRM(emb_dim, 1, [2, 4, 8])
1332
+
1333
+ self.insmask_head = nn.Sequential(
1334
+ nn.Conv2d(emb_dim, 384, kernel_size=3, padding=1),
1335
+ nn.BatchNorm2d(384), nn.PReLU(),
1336
+ nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.BatchNorm2d(384),
1337
+ nn.PReLU(), nn.Conv2d(384, emb_dim, kernel_size=3, padding=1))
1338
+
1339
+ self.shallow = nn.Sequential(
1340
+ nn.Conv2d(3, emb_dim, kernel_size=3, padding=1))
1341
+ self.upsample1 = make_cbg(emb_dim, emb_dim)
1342
+ self.upsample2 = make_cbg(emb_dim, emb_dim)
1343
+ self.output = nn.Sequential(
1344
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1345
+
1346
+ for m in self.modules():
1347
+ if isinstance(m, nn.ReLU) or isinstance(m, nn.Dropout):
1348
+ m.inplace = True
1349
+
1350
+ def forward(self, x):
1351
+ x = x.to(dtype=torch_dtype, device=torch_device)
1352
+ shallow = self.shallow(x)
1353
+ glb = rescale_to(x, scale_factor=0.5, interpolation='bilinear')
1354
+ loc = image2patches(x)
1355
+ input = torch.cat((loc, glb), dim=0)
1356
+ feature = self.backbone(input)
1357
+ e5 = self.output5(feature[4]) # (5,128,16,16)
1358
+ e4 = self.output4(feature[3]) # (5,128,32,32)
1359
+ e3 = self.output3(feature[2]) # (5,128,64,64)
1360
+ e2 = self.output2(feature[1]) # (5,128,128,128)
1361
+ e1 = self.output1(feature[0]) # (5,128,128,128)
1362
+ loc_e5, glb_e5 = e5.split([4, 1], dim=0)
1363
+ e5 = self.multifieldcrossatt(loc_e5, glb_e5) # (4,128,16,16)
1364
+
1365
+ e4, tokenattmap4 = self.dec_blk4(e4 + resize_as(e5, e4))
1366
+ e4 = self.conv4(e4)
1367
+ e3, tokenattmap3 = self.dec_blk3(e3 + resize_as(e4, e3))
1368
+ e3 = self.conv3(e3)
1369
+ e2, tokenattmap2 = self.dec_blk2(e2 + resize_as(e3, e2))
1370
+ e2 = self.conv2(e2)
1371
+ e1, tokenattmap1 = self.dec_blk1(e1 + resize_as(e2, e1))
1372
+ e1 = self.conv1(e1)
1373
+ loc_e1, glb_e1 = e1.split([4, 1], dim=0)
1374
+ output1_cat = patches2image(loc_e1) # (1,128,256,256)
1375
+ # add glb feat in
1376
+ output1_cat = output1_cat + resize_as(glb_e1, output1_cat)
1377
+ # merge
1378
+ final_output = self.insmask_head(output1_cat) # (1,128,256,256)
1379
+ # shallow feature merge
1380
+ final_output = final_output + resize_as(shallow, final_output)
1381
+ final_output = self.upsample1(rescale_to(final_output))
1382
+ final_output = rescale_to(final_output +
1383
+ resize_as(shallow, final_output))
1384
+ final_output = self.upsample2(final_output)
1385
+ final_output = self.output(final_output)
1386
+ ####
1387
+ sideout5 = self.sideout5(e5).to(dtype=torch_dtype, device=torch_device)
1388
+ sideout4 = self.sideout4(e4)
1389
+ sideout3 = self.sideout3(e3)
1390
+ sideout2 = self.sideout2(e2)
1391
+ sideout1 = self.sideout1(e1)
1392
+ #######glb_sideouts ######
1393
+ glb5 = self.sideout5(glb_e5)
1394
+ glb4 = sideout4[-1, :, :, :].unsqueeze(0)
1395
+ glb3 = sideout3[-1, :, :, :].unsqueeze(0)
1396
+ glb2 = sideout2[-1, :, :, :].unsqueeze(0)
1397
+ glb1 = sideout1[-1, :, :, :].unsqueeze(0)
1398
+ ####### concat 4 to 1 #######
1399
+ sideout1 = patches2image(sideout1[:-1]).to(dtype=torch_dtype,
1400
+ device=torch_device)
1401
+ sideout2 = patches2image(sideout2[:-1]).to(
1402
+ dtype=torch_dtype,
1403
+ device=torch_device) ####(5,c,h,w) -> (1 c 2h,2w)
1404
+ sideout3 = patches2image(sideout3[:-1]).to(dtype=torch_dtype,
1405
+ device=torch_device)
1406
+ sideout4 = patches2image(sideout4[:-1]).to(dtype=torch_dtype,
1407
+ device=torch_device)
1408
+ sideout5 = patches2image(sideout5[:-1]).to(dtype=torch_dtype,
1409
+ device=torch_device)
1410
+ if self.training:
1411
+ return sideout5, sideout4, sideout3, sideout2, sideout1, final_output, glb5, glb4, glb3, glb2, glb1, tokenattmap4, tokenattmap3, tokenattmap2, tokenattmap1
1412
+ else:
1413
+ return final_output
1414
+
1415
+
1416
+ # model for multi-scale testing
1417
+ class inf_MVANet(nn.Module):
1418
+
1419
+ def __init__(self):
1420
+ super().__init__()
1421
+ # self.backbone = SwinB(pretrained=True)
1422
+ self.backbone = SwinB(pretrained=False)
1423
+
1424
+ emb_dim = 128
1425
+ self.output5 = make_cbr(1024, emb_dim)
1426
+ self.output4 = make_cbr(512, emb_dim)
1427
+ self.output3 = make_cbr(256, emb_dim)
1428
+ self.output2 = make_cbr(128, emb_dim)
1429
+ self.output1 = make_cbr(128, emb_dim)
1430
+
1431
+ self.multifieldcrossatt = inf_MCLM(emb_dim, 1, [1, 4, 8])
1432
+ self.conv1 = make_cbr(emb_dim, emb_dim)
1433
+ self.conv2 = make_cbr(emb_dim, emb_dim)
1434
+ self.conv3 = make_cbr(emb_dim, emb_dim)
1435
+ self.conv4 = make_cbr(emb_dim, emb_dim)
1436
+ self.dec_blk1 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1437
+ self.dec_blk2 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1438
+ self.dec_blk3 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1439
+ self.dec_blk4 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1440
+
1441
+ self.insmask_head = nn.Sequential(
1442
+ nn.Conv2d(emb_dim, 384, kernel_size=3, padding=1),
1443
+ nn.BatchNorm2d(384), nn.PReLU(),
1444
+ nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.BatchNorm2d(384),
1445
+ nn.PReLU(), nn.Conv2d(384, emb_dim, kernel_size=3, padding=1))
1446
+
1447
+ self.shallow = nn.Sequential(
1448
+ nn.Conv2d(3, emb_dim, kernel_size=3, padding=1))
1449
+ self.upsample1 = make_cbg(emb_dim, emb_dim)
1450
+ self.upsample2 = make_cbg(emb_dim, emb_dim)
1451
+ self.output = nn.Sequential(
1452
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1453
+
1454
+ for m in self.modules():
1455
+ if isinstance(m, nn.ReLU) or isinstance(m, nn.Dropout):
1456
+ m.inplace = True
1457
+
1458
+ def forward(self, x):
1459
+ shallow = self.shallow(x)
1460
+ glb = rescale_to(x, scale_factor=0.5, interpolation='bilinear')
1461
+ loc = image2patches(x)
1462
+ input = torch.cat((loc, glb), dim=0)
1463
+ feature = self.backbone(input)
1464
+ e5 = self.output5(feature[4])
1465
+ e4 = self.output4(feature[3])
1466
+ e3 = self.output3(feature[2])
1467
+ e2 = self.output2(feature[1])
1468
+ e1 = self.output1(feature[0])
1469
+ loc_e5, glb_e5 = e5.split([4, 1], dim=0)
1470
+ e5_cat = self.multifieldcrossatt(loc_e5, glb_e5)
1471
+
1472
+ e4 = self.conv4(self.dec_blk4(e4 + resize_as(e5_cat, e4)))
1473
+ e3 = self.conv3(self.dec_blk3(e3 + resize_as(e4, e3)))
1474
+ e2 = self.conv2(self.dec_blk2(e2 + resize_as(e3, e2)))
1475
+ e1 = self.conv1(self.dec_blk1(e1 + resize_as(e2, e1)))
1476
+ loc_e1, glb_e1 = e1.split([4, 1], dim=0)
1477
+ # after decoder, concat loc features to a whole one, and merge
1478
+ output1_cat = patches2image(loc_e1)
1479
+ # add glb feat in
1480
+ output1_cat = output1_cat + resize_as(glb_e1, output1_cat)
1481
+ # merge
1482
+ final_output = self.insmask_head(output1_cat)
1483
+ # shallow feature merge
1484
+ final_output = final_output + resize_as(shallow, final_output)
1485
+ final_output = self.upsample1(rescale_to(final_output))
1486
+ final_output = rescale_to(final_output +
1487
+ resize_as(shallow, final_output))
1488
+ final_output = self.upsample2(final_output)
1489
+ final_output = self.output(final_output)
1490
+ return final_output
1491
+
1492
+
1493
+ class load_MVANet_Model:
1494
+
1495
+ def __init__(self):
1496
+ pass
1497
+
1498
+ @classmethod
1499
+ def INPUT_TYPES(s):
1500
+ return {
1501
+ "required": {},
1502
+ }
1503
+
1504
+ RETURN_TYPES = ("MVANet_Model", )
1505
+ FUNCTION = "test"
1506
+ CATEGORY = "MVANet"
1507
+
1508
+ def test(self):
1509
+ return (load_model(get_model_path()), )
1510
+
1511
+
1512
+ class run_MVANet_inference:
1513
+
1514
+ def __init__(self):
1515
+ pass
1516
+
1517
+ @classmethod
1518
+ def INPUT_TYPES(s):
1519
+ return {
1520
+ "required": {
1521
+ "image": ("IMAGE", ),
1522
+ "MVANet_Model": ("MVANet_Model", ),
1523
+ },
1524
+ }
1525
+
1526
+ RETURN_TYPES = ("MASK", )
1527
+ FUNCTION = "test"
1528
+ CATEGORY = "MVANet"
1529
+
1530
+ def test(
1531
+ self,
1532
+ image,
1533
+ MVANet_Model,
1534
+ ):
1535
+ ret = do_infer_tensor2tensor(img=image, net=MVANet_Model)
1536
+
1537
+ return (ret, )
1538
+
1539
+
1540
+ NODE_CLASS_MAPPINGS = {
1541
+ "load_MVANet_Model": load_MVANet_Model,
1542
+ "run_MVANet_inference": run_MVANet_inference
1543
+ }
1544
+
1545
+ NODE_DISPLAY_NAME_MAPPINGS = {
1546
+ "load_MVANet_Model": "load MVANet Model",
1547
+ "run_MVANet_inference": "run_MVANet_inference"
1548
+ }
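+
+
+ # A minimal, hypothetical smoke test for the two nodes above, run outside of
+ # ComfyUI (not required by the extension itself). It assumes a checkpoint can
+ # be resolved by get_model_path(), and follows ComfyUI's IMAGE convention of
+ # (batch, height, width, channel) float tensors in [0, 1]; the 1024x1024 size
+ # is only an illustration.
+ if __name__ == '__main__':
+     net = load_MVANet_Model().test()[0]
+     image = torch.rand(1, 1024, 1024, 3)
+     (mask, ) = run_MVANet_inference().test(image=image, MVANet_Model=net)
+     print(mask.shape)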
ComfyUI_MVANet/download.sh ADDED
@@ -0,0 +1,13 @@
1
+ #!/bin/sh
2
+ get_repo(){
3
+ DIR_REPO="${HOME}/GITHUB/$('echo' "${1}" | 'sed' 's/^git@github.com://g ; s@^https://github.com/@@g ; s@.git$@@g' )"
4
+ DIR_BASE="$('dirname' '--' "${DIR_REPO}")"
5
+ mkdir -pv -- "${DIR_BASE}"
6
+ cd "${DIR_BASE}"
7
+ git clone "${1}"
8
+ cd "${DIR_REPO}"
9
+ git pull
10
+ git submodule update --recursive --init
11
+ }
12
+
13
+ get_repo 'https://github.com/qianyu-dlut/MVANet.git'
ComfyUI_MVANet/requirements.txt ADDED
@@ -0,0 +1,3 @@
1
+ timm
2
+ einops
3
+ wget
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Peike Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
MVANet_Inference/README.org ADDED
@@ -0,0 +1,2179 @@
1
+ * COMMENT Sample
2
+
3
+ ** Shell script to download
4
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./download.sh
5
+ #+end_src
6
+
7
+ ** MVANet_inference import
8
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.import.py
9
+ #+end_src
10
+
11
+ ** MVANet_inference function
12
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
13
+ #+end_src
14
+
15
+ ** MVANet_inference class
16
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.class.py
17
+ #+end_src
18
+
19
+ ** MVANet_inference execute
20
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.execute.py
21
+ #+end_src
22
+
23
+ ** MVANet_inference unify
24
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./MVANet_inference.unify.sh
25
+ #+end_src
26
+
27
+ ** MVANet_inference run
28
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./MVANet_inference.run.sh
29
+ #+end_src
30
+
31
+ * Download the code:
32
+
33
+ ** Function to download
34
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./download.sh
35
+ get_repo(){
36
+ DIR_REPO="${HOME}/GITHUB/$('echo' "${1}" | 'sed' 's/^git@github.com://g ; s@^https://github.com/@@g ; s@.git$@@g' )"
37
+ DIR_BASE="$('dirname' '--' "${DIR_REPO}")"
38
+ mkdir -pv -- "${DIR_BASE}"
39
+ cd "${DIR_BASE}"
40
+ git clone "${1}"
41
+ cd "${DIR_REPO}"
42
+ git pull
43
+ git submodule update --recursive --init
44
+ }
45
+ #+end_src
46
+
47
+ ** Download
48
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./download.sh
49
+ get_repo 'https://github.com/qianyu-dlut/MVANet.git'
50
+ #+end_src
51
+
52
+ * Dependencies
53
+ pip3 install mmdet==2.23.0
54
+ pip3 install mmcv==1.4.8
55
+ pip3 install ttach
56
+
57
+ * Python inference
58
+
59
+ ** Important configs
60
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.import.py
61
+ import os
62
+ import sys
63
+
64
+ HOME_DIR = os.environ.get('HOME', '/root')
65
+ MVANET_SOURCE_DIR = HOME_DIR + '/GITHUB/qianyu-dlut/MVANet'
66
+ finetuned_MVANet_model_path = MVANET_SOURCE_DIR + '/model/Model_80.pth'
67
+ pretrained_SwinB_model_path = MVANET_SOURCE_DIR + '/model/swin_base_patch4_window12_384_22kto1k.pth'
68
+ #+end_src
69
+
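+ A quick, optional sanity check (a sketch, not part of the tangled inference
+ script) that the two checkpoint paths configured above actually exist before
+ running anything:
+ #+begin_src python :shebang #!/usr/bin/python3 :results output
+ for ckpt in (finetuned_MVANet_model_path, pretrained_SwinB_model_path):
+     if not os.path.isfile(ckpt):
+         raise FileNotFoundError('missing checkpoint: ' + ckpt)
+ #+end_src
+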
70
+ ** MVANet_inference import
71
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.import.py
72
+ import math
73
+ import numpy as np
74
+ from PIL import Image
75
+ import time
76
+ # import ttach as tta
77
+ import cv2
78
+
79
+ import torch
80
+ import torch.nn as nn
81
+ import torch.nn.functional as F
82
+ import torch.utils.checkpoint as checkpoint
83
+ from torch.autograd import Variable
84
+ from torch import nn
85
+ from torchvision import transforms
86
+
87
+ from einops import rearrange
88
+
89
+ from timm.models import load_checkpoint
90
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
91
+ #+end_src
92
+
93
+ ** Load image using CV
94
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
95
+ def load_image(input_image_path):
96
+ img = cv2.imread(input_image_path, cv2.IMREAD_COLOR)
97
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
98
+ return img
99
+
100
+
101
+ def load_image_torch(input_image_path):
102
+ img = cv2.imread(input_image_path, cv2.IMREAD_COLOR)
103
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
104
+ img = torch.from_numpy(img)
105
+ img = img.to(dtype=torch.float32)
106
+ img /= 255.0
107
+ img = img.unsqueeze(0)
108
+ return img
109
+
110
+
111
+ def save_mask(output_image_path, mask):
112
+ cv2.imwrite(output_image_path, mask)
113
+
114
+
115
+ def save_mask_torch(output_image_path, mask):
116
+ mask = mask.detach().cpu()
117
+ mask *= 255.0
118
+ mask = mask.clamp(0, 255)
119
+ print(mask.shape)
120
+ mask = mask.squeeze(0)
121
+ mask = mask.to(dtype=torch.uint8)
122
+ print(mask.shape)
123
+ mask = mask.numpy()
124
+ print(mask.shape)
125
+ cv2.imwrite(output_image_path, mask)
126
+ #+end_src
127
+
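+ A small usage sketch (not tangled) of the helpers above; =demo/demo.jpg= and
+ =demo_mask.png= are assumed example paths:
+ #+begin_src python :shebang #!/usr/bin/python3 :results output
+ img = load_image_torch('demo/demo.jpg')            # (1, H, W, 3) float32 in [0, 1]
+ mask = torch.ones(1, img.shape[1], img.shape[2])   # placeholder mask in [0, 1]
+ save_mask_torch('demo_mask.png', mask)             # written as an 8-bit grayscale image
+ #+end_src
+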
128
+ ** Device configs
129
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.execute.py
130
+ torch_device = 'cuda'
131
+ torch_dtype = torch.float16
132
+ #+end_src
133
+ Throughout the code, tensors and model weights are moved onto this configuration with =.to(dtype=torch_dtype, device=torch_device)=.
134
+
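+ A minimal sketch (not tangled) of applying this configuration, assuming the
+ =inf_MVANet= class defined later in this document is in scope:
+ #+begin_src python :shebang #!/usr/bin/python3 :results output
+ net = inf_MVANet().to(dtype=torch_dtype, device=torch_device)
+ net.eval()
+ x = torch.zeros(1, 3, 1024, 1024).to(dtype=torch_dtype, device=torch_device)
+ with torch.no_grad():
+     mask = net(x)    # single-channel mask logits, same spatial size as the input
+ #+end_src
+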
135
+ ** MVANet_inference function
136
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
137
+ def check_mkdir(dir_name):
138
+ if not os.path.isdir(dir_name):
139
+ os.makedirs(dir_name)
140
+
141
+
142
+ def SwinT(pretrained=True):
143
+ model = SwinTransformer(embed_dim=96,
144
+ depths=[2, 2, 6, 2],
145
+ num_heads=[3, 6, 12, 24],
146
+ window_size=7)
147
+ if pretrained is True:
148
+ model.load_state_dict(torch.load(
149
+ 'data/backbone_ckpt/swin_tiny_patch4_window7_224.pth',
150
+ map_location='cpu')['model'],
151
+ strict=False)
152
+
153
+ return model
154
+
155
+
156
+ def SwinS(pretrained=True):
157
+ model = SwinTransformer(embed_dim=96,
158
+ depths=[2, 2, 18, 2],
159
+ num_heads=[3, 6, 12, 24],
160
+ window_size=7)
161
+ if pretrained is True:
162
+ model.load_state_dict(torch.load(
163
+ 'data/backbone_ckpt/swin_small_patch4_window7_224.pth',
164
+ map_location='cpu')['model'],
165
+ strict=False)
166
+
167
+ return model
168
+
169
+
170
+ def SwinB(pretrained=True):
171
+ model = SwinTransformer(embed_dim=128,
172
+ depths=[2, 2, 18, 2],
173
+ num_heads=[4, 8, 16, 32],
174
+ window_size=12)
175
+ if pretrained is True:
176
+ import os
177
+ model.load_state_dict(torch.load(pretrained_SwinB_model_path,
178
+ map_location='cpu')['model'],
179
+ strict=False)
180
+ return model
181
+
182
+
183
+ def SwinL(pretrained=True):
184
+ model = SwinTransformer(embed_dim=192,
185
+ depths=[2, 2, 18, 2],
186
+ num_heads=[6, 12, 24, 48],
187
+ window_size=12)
188
+ if pretrained is True:
189
+ model.load_state_dict(torch.load(
190
+ 'data/backbone_ckpt/swin_large_patch4_window12_384_22kto1k.pth',
191
+ map_location='cpu')['model'],
192
+ strict=False)
193
+
194
+ return model
195
+
196
+
197
+ def get_activation_fn(activation):
198
+ """Return an activation function given a string"""
199
+ if activation == "relu":
200
+ return F.relu
201
+ if activation == "gelu":
202
+ return F.gelu
203
+ if activation == "glu":
204
+ return F.glu
205
+ raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
206
+
207
+
208
+ def make_cbr(in_dim, out_dim):
209
+ return nn.Sequential(nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1),
210
+ nn.BatchNorm2d(out_dim), nn.PReLU())
211
+
212
+
213
+ def make_cbg(in_dim, out_dim):
214
+ return nn.Sequential(nn.Conv2d(in_dim, out_dim, kernel_size=3, padding=1),
215
+ nn.BatchNorm2d(out_dim), nn.GELU())
216
+
217
+
218
+ def rescale_to(x, scale_factor: float = 2, interpolation='nearest'):
219
+ return F.interpolate(x, scale_factor=scale_factor, mode=interpolation)
220
+
221
+
222
+ def resize_as(x, y, interpolation='bilinear'):
223
+ return F.interpolate(x, size=y.shape[-2:], mode=interpolation)
224
+
225
+
226
+ def image2patches(x):
227
+ """b c (hg h) (wg w) -> (hg wg b) c h w"""
228
+ x = rearrange(x, 'b c (hg h) (wg w) -> (hg wg b) c h w', hg=2, wg=2)
229
+ return x
230
+
231
+
232
+ def patches2image(x):
233
+ """(hg wg b) c h w -> b c (hg h) (wg w)"""
234
+ x = rearrange(x, '(hg wg b) c h w -> b c (hg h) (wg w)', hg=2, wg=2)
235
+ return x
236
+
237
+
238
+ def window_partition(x, window_size):
239
+ """
240
+ Args:
241
+ x: (B, H, W, C)
242
+ window_size (int): window size
243
+
244
+ Returns:
245
+ windows: (num_windows*B, window_size, window_size, C)
246
+ """
247
+ B, H, W, C = x.shape
248
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size,
249
+ C)
250
+ windows = x.permute(0, 1, 3, 2, 4,
251
+ 5).contiguous().view(-1, window_size, window_size, C)
252
+ return windows
253
+
254
+
255
+ def window_reverse(windows, window_size, H, W):
256
+ """
257
+ Args:
258
+ windows: (num_windows*B, window_size, window_size, C)
259
+ window_size (int): Window size
260
+ H (int): Height of image
261
+ W (int): Width of image
262
+
263
+ Returns:
264
+ x: (B, H, W, C)
265
+ """
266
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
267
+ x = windows.view(B, H // window_size, W // window_size, window_size,
268
+ window_size, -1)
269
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
270
+ return x
271
+ #+end_src
272
+
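+ A small sanity check (a sketch, not tangled) showing that the patch and
+ window helpers above are mutual inverses when the spatial size divides
+ evenly:
+ #+begin_src python :shebang #!/usr/bin/python3 :results output
+ x = torch.arange(2 * 3 * 8 * 8, dtype=torch.float32).reshape(2, 3, 8, 8)
+ p = image2patches(x)                      # (8, 3, 4, 4): a 2x2 grid of patches per image
+ assert torch.equal(patches2image(p), x)   # exact round trip
+
+ y = torch.rand(2, 8, 8, 3)                # (B, H, W, C) layout used inside the Swin blocks
+ w = window_partition(y, 4)                # (8, 4, 4, 3): non-overlapping 4x4 windows
+ assert torch.equal(window_reverse(w, 4, 8, 8), y)
+ #+end_src
+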
273
+ ** MVANet_inference class
274
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.class.py
275
+ class Mlp(nn.Module):
276
+ """ Multilayer perceptron."""
277
+
278
+ def __init__(self,
279
+ in_features,
280
+ hidden_features=None,
281
+ out_features=None,
282
+ act_layer=nn.GELU,
283
+ drop=0.):
284
+ super().__init__()
285
+ out_features = out_features or in_features
286
+ hidden_features = hidden_features or in_features
287
+ self.fc1 = nn.Linear(in_features, hidden_features)
288
+ self.act = act_layer()
289
+ self.fc2 = nn.Linear(hidden_features, out_features)
290
+ self.drop = nn.Dropout(drop)
291
+
292
+ def forward(self, x):
293
+ x = self.fc1(x)
294
+ x = self.act(x)
295
+ x = self.drop(x)
296
+ x = self.fc2(x)
297
+ x = self.drop(x)
298
+ return x
299
+
300
+
301
+ class WindowAttention(nn.Module):
302
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
303
+ It supports both of shifted and non-shifted window.
304
+
305
+ Args:
306
+ dim (int): Number of input channels.
307
+ window_size (tuple[int]): The height and width of the window.
308
+ num_heads (int): Number of attention heads.
309
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
310
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
311
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
312
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
313
+ """
314
+
315
+ def __init__(self,
316
+ dim,
317
+ window_size,
318
+ num_heads,
319
+ qkv_bias=True,
320
+ qk_scale=None,
321
+ attn_drop=0.,
322
+ proj_drop=0.):
323
+
324
+ super().__init__()
325
+ self.dim = dim
326
+ self.window_size = window_size # Wh, Ww
327
+ self.num_heads = num_heads
328
+ head_dim = dim // num_heads
329
+ self.scale = qk_scale or head_dim**-0.5
330
+
331
+ # define a parameter table of relative position bias
332
+ self.relative_position_bias_table = nn.Parameter(
333
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
334
+ num_heads)) # 2*Wh-1 * 2*Ww-1, nH
335
+
336
+ # get pair-wise relative position index for each token inside the window
337
+ coords_h = torch.arange(self.window_size[0])
338
+ coords_w = torch.arange(self.window_size[1])
339
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
340
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
341
+ relative_coords = coords_flatten[:, :,
342
+ None] - coords_flatten[:,
343
+ None, :] # 2, Wh*Ww, Wh*Ww
344
+ relative_coords = relative_coords.permute(
345
+ 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
346
+ relative_coords[:, :,
347
+ 0] += self.window_size[0] - 1 # shift to start from 0
348
+ relative_coords[:, :, 1] += self.window_size[1] - 1
349
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
350
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
351
+ self.register_buffer("relative_position_index",
352
+ relative_position_index)
353
+
354
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
355
+ self.attn_drop = nn.Dropout(attn_drop)
356
+ self.proj = nn.Linear(dim, dim)
357
+ self.proj_drop = nn.Dropout(proj_drop)
358
+
359
+ trunc_normal_(self.relative_position_bias_table, std=.02)
360
+ self.softmax = nn.Softmax(dim=-1)
361
+
362
+ def forward(self, x, mask=None):
363
+ """ Forward function.
364
+
365
+ Args:
366
+ x: input features with shape of (num_windows*B, N, C)
367
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
368
+ """
369
+ x = x.to(dtype=torch_dtype, device=torch_device)
370
+ B_, N, C = x.shape
371
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
372
+ C // self.num_heads).permute(2, 0, 3, 1, 4)
373
+ q, k, v = qkv[0], qkv[1], qkv[
374
+ 2] # make torchscript happy (cannot use tensor as tuple)
375
+
376
+ q = q * self.scale
377
+ attn = (q @ k.transpose(-2, -1))
378
+
379
+ relative_position_bias = self.relative_position_bias_table[
380
+ self.relative_position_index.view(-1)].view(
381
+ self.window_size[0] * self.window_size[1],
382
+ self.window_size[0] * self.window_size[1],
383
+ -1) # Wh*Ww,Wh*Ww,nH
384
+ relative_position_bias = relative_position_bias.permute(
385
+ 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
386
+ attn = attn + relative_position_bias.unsqueeze(0)
387
+
388
+ if mask is not None:
389
+ nW = mask.shape[0]
390
+ attn = attn.view(B_ // nW, nW, self.num_heads, N,
391
+ N) + mask.unsqueeze(1).unsqueeze(0)
392
+ attn = attn.view(-1, self.num_heads, N, N)
393
+ attn = self.softmax(attn)
394
+ else:
395
+ attn = self.softmax(attn)
396
+
397
+ attn = self.attn_drop(attn)
398
+ attn = attn.to(dtype=torch_dtype, device=torch_device)
399
+ v = v.to(dtype=torch_dtype, device=torch_device)
400
+
401
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
402
+ x = self.proj(x)
403
+ x = self.proj_drop(x)
404
+ return x
405
+
406
+
407
+ class SwinTransformerBlock(nn.Module):
408
+ """ Swin Transformer Block.
409
+
410
+ Args:
411
+ dim (int): Number of input channels.
412
+ num_heads (int): Number of attention heads.
413
+ window_size (int): Window size.
414
+ shift_size (int): Shift size for SW-MSA.
415
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
416
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
417
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
418
+ drop (float, optional): Dropout rate. Default: 0.0
419
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
420
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
421
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
422
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
423
+ """
424
+
425
+ def __init__(self,
426
+ dim,
427
+ num_heads,
428
+ window_size=7,
429
+ shift_size=0,
430
+ mlp_ratio=4.,
431
+ qkv_bias=True,
432
+ qk_scale=None,
433
+ drop=0.,
434
+ attn_drop=0.,
435
+ drop_path=0.,
436
+ act_layer=nn.GELU,
437
+ norm_layer=nn.LayerNorm):
438
+ super().__init__()
439
+ self.dim = dim
440
+ self.num_heads = num_heads
441
+ self.window_size = window_size
442
+ self.shift_size = shift_size
443
+ self.mlp_ratio = mlp_ratio
444
+ assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
445
+
446
+ self.norm1 = norm_layer(dim)
447
+ self.attn = WindowAttention(dim,
448
+ window_size=to_2tuple(self.window_size),
449
+ num_heads=num_heads,
450
+ qkv_bias=qkv_bias,
451
+ qk_scale=qk_scale,
452
+ attn_drop=attn_drop,
453
+ proj_drop=drop)
454
+
455
+ self.drop_path = DropPath(
456
+ drop_path) if drop_path > 0. else nn.Identity()
457
+ self.norm2 = norm_layer(dim)
458
+ mlp_hidden_dim = int(dim * mlp_ratio)
459
+ self.mlp = Mlp(in_features=dim,
460
+ hidden_features=mlp_hidden_dim,
461
+ act_layer=act_layer,
462
+ drop=drop)
463
+
464
+ self.H = None
465
+ self.W = None
466
+
467
+ def forward(self, x, mask_matrix):
468
+ """ Forward function.
469
+
470
+ Args:
471
+ x: Input feature, tensor size (B, H*W, C).
472
+ H, W: Spatial resolution of the input feature.
473
+ mask_matrix: Attention mask for cyclic shift.
474
+ """
475
+ B, L, C = x.shape
476
+ H, W = self.H, self.W
477
+ assert L == H * W, "input feature has wrong size"
478
+
479
+ shortcut = x
480
+ x = self.norm1(x)
481
+ x = x.view(B, H, W, C)
482
+
483
+ # pad feature maps to multiples of window size
484
+ pad_l = pad_t = 0
485
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
486
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
487
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
488
+ _, Hp, Wp, _ = x.shape
489
+
490
+ # cyclic shift
491
+ if self.shift_size > 0:
492
+ shifted_x = torch.roll(x,
493
+ shifts=(-self.shift_size, -self.shift_size),
494
+ dims=(1, 2))
495
+ attn_mask = mask_matrix
496
+ else:
497
+ shifted_x = x
498
+ attn_mask = None
499
+
500
+ # partition windows
501
+ x_windows = window_partition(
502
+ shifted_x, self.window_size) # nW*B, window_size, window_size, C
503
+ x_windows = x_windows.view(-1, self.window_size * self.window_size,
504
+ C) # nW*B, window_size*window_size, C
505
+
506
+ # W-MSA/SW-MSA
507
+ attn_windows = self.attn(
508
+ x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
509
+
510
+ # merge windows
511
+ attn_windows = attn_windows.view(-1, self.window_size,
512
+ self.window_size, C)
513
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp,
514
+ Wp) # B H' W' C
515
+
516
+ # reverse cyclic shift
517
+ if self.shift_size > 0:
518
+ x = torch.roll(shifted_x,
519
+ shifts=(self.shift_size, self.shift_size),
520
+ dims=(1, 2))
521
+ else:
522
+ x = shifted_x
523
+
524
+ if pad_r > 0 or pad_b > 0:
525
+ x = x[:, :H, :W, :].contiguous()
526
+
527
+ x = x.view(B, H * W, C)
528
+
529
+ # FFN
530
+ x = shortcut + self.drop_path(x)
531
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
532
+
533
+ return x
534
+
535
+
536
+ class PatchMerging(nn.Module):
537
+ """ Patch Merging Layer
538
+
539
+ Args:
540
+ dim (int): Number of input channels.
541
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
542
+ """
543
+
544
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
545
+ super().__init__()
546
+ self.dim = dim
547
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
548
+ self.norm = norm_layer(4 * dim)
549
+
550
+ def forward(self, x, H, W):
551
+ """ Forward function.
552
+
553
+ Args:
554
+ x: Input feature, tensor size (B, H*W, C).
555
+ H, W: Spatial resolution of the input feature.
556
+ """
557
+ B, L, C = x.shape
558
+ assert L == H * W, "input feature has wrong size"
559
+
560
+ x = x.view(B, H, W, C)
561
+
562
+ # padding
563
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
564
+ if pad_input:
565
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
566
+
567
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
568
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
569
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
570
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
571
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
572
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
573
+
574
+ x = self.norm(x)
575
+ x = self.reduction(x)
576
+
577
+ return x
578
+
579
+
580
+ class BasicLayer(nn.Module):
581
+ """ A basic Swin Transformer layer for one stage.
582
+
583
+ Args:
584
+ dim (int): Number of feature channels
585
+ depth (int): Depth of this stage.
586
+ num_heads (int): Number of attention heads.
587
+ window_size (int): Local window size. Default: 7.
588
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
589
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
590
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
591
+ drop (float, optional): Dropout rate. Default: 0.0
592
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
593
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
594
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
595
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
596
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
597
+ """
598
+
599
+ def __init__(self,
600
+ dim,
601
+ depth,
602
+ num_heads,
603
+ window_size=7,
604
+ mlp_ratio=4.,
605
+ qkv_bias=True,
606
+ qk_scale=None,
607
+ drop=0.,
608
+ attn_drop=0.,
609
+ drop_path=0.,
610
+ norm_layer=nn.LayerNorm,
611
+ downsample=None,
612
+ use_checkpoint=False):
613
+ super().__init__()
614
+ self.window_size = window_size
615
+ self.shift_size = window_size // 2
616
+ self.depth = depth
617
+ self.use_checkpoint = use_checkpoint
618
+
619
+ # build blocks
620
+ self.blocks = nn.ModuleList([
621
+ SwinTransformerBlock(dim=dim,
622
+ num_heads=num_heads,
623
+ window_size=window_size,
624
+ shift_size=0 if
625
+ (i % 2 == 0) else window_size // 2,
626
+ mlp_ratio=mlp_ratio,
627
+ qkv_bias=qkv_bias,
628
+ qk_scale=qk_scale,
629
+ drop=drop,
630
+ attn_drop=attn_drop,
631
+ drop_path=drop_path[i] if isinstance(
632
+ drop_path, list) else drop_path,
633
+ norm_layer=norm_layer) for i in range(depth)
634
+ ])
635
+
636
+ # patch merging layer
637
+ if downsample is not None:
638
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
639
+ else:
640
+ self.downsample = None
641
+
642
+ def forward(self, x, H, W):
643
+ """ Forward function.
644
+
645
+ Args:
646
+ x: Input feature, tensor size (B, H*W, C).
647
+ H, W: Spatial resolution of the input feature.
648
+ """
649
+
650
+ # calculate attention mask for SW-MSA
651
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
652
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
653
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
654
+ h_slices = (slice(0, -self.window_size),
655
+ slice(-self.window_size,
656
+ -self.shift_size), slice(-self.shift_size, None))
657
+ w_slices = (slice(0, -self.window_size),
658
+ slice(-self.window_size,
659
+ -self.shift_size), slice(-self.shift_size, None))
660
+ cnt = 0
661
+ for h in h_slices:
662
+ for w in w_slices:
663
+ img_mask[:, h, w, :] = cnt
664
+ cnt += 1
665
+
666
+ mask_windows = window_partition(
667
+ img_mask, self.window_size) # nW, window_size, window_size, 1
668
+ mask_windows = mask_windows.view(-1,
669
+ self.window_size * self.window_size)
670
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
671
+ attn_mask = attn_mask.masked_fill(attn_mask != 0,
672
+ float(-100.0)).masked_fill(
673
+ attn_mask == 0, float(0.0))
674
+
675
+ for blk in self.blocks:
676
+ blk.H, blk.W = H, W
677
+ if self.use_checkpoint:
678
+ x = checkpoint.checkpoint(blk, x, attn_mask)
679
+ else:
680
+ x = blk(x, attn_mask)
681
+ if self.downsample is not None:
682
+ x_down = self.downsample(x, H, W)
683
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
684
+ return x, H, W, x_down, Wh, Ww
685
+ else:
686
+ return x, H, W, x, H, W
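# The mask built in BasicLayer.forward implements Swin's shifted-window (SW-MSA)
# trick: the feature map is padded to a multiple of window_size, regions are
# labelled via `cnt`, and token pairs coming from different regions get -100 added
# before the softmax so their attention weight is effectively zero. Even-indexed
# blocks use shift_size=0 and odd-indexed blocks use window_size // 2.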
687
+
688
+
689
+ class PatchEmbed(nn.Module):
690
+ """ Image to Patch Embedding
691
+
692
+ Args:
693
+ patch_size (int): Patch token size. Default: 4.
694
+ in_chans (int): Number of input image channels. Default: 3.
695
+ embed_dim (int): Number of linear projection output channels. Default: 96.
696
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
697
+ """
698
+
699
+ def __init__(self,
700
+ patch_size=4,
701
+ in_chans=3,
702
+ embed_dim=96,
703
+ norm_layer=None):
704
+ super().__init__()
705
+ patch_size = to_2tuple(patch_size)
706
+ self.patch_size = patch_size
707
+
708
+ self.in_chans = in_chans
709
+ self.embed_dim = embed_dim
710
+
711
+ self.proj = nn.Conv2d(in_chans,
712
+ embed_dim,
713
+ kernel_size=patch_size,
714
+ stride=patch_size)
715
+ if norm_layer is not None:
716
+ self.norm = norm_layer(embed_dim)
717
+ else:
718
+ self.norm = None
719
+
720
+ def forward(self, x):
721
+ """Forward function."""
722
+ # padding
723
+ _, _, H, W = x.size()
724
+ if W % self.patch_size[1] != 0:
725
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
726
+ if H % self.patch_size[0] != 0:
727
+ x = F.pad(x,
728
+ (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
729
+
730
+ x = self.proj(x) # B C Wh Ww
731
+ if self.norm is not None:
732
+ Wh, Ww = x.size(2), x.size(3)
733
+ x = x.flatten(2).transpose(1, 2)
734
+ x = self.norm(x)
735
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
736
+
737
+ return x
738
+
739
+
740
+ class SwinTransformer(nn.Module):
741
+ """ Swin Transformer backbone.
742
+ A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
743
+ https://arxiv.org/pdf/2103.14030
744
+
745
+ Args:
746
+ pretrain_img_size (int): Input image size for training the pretrained model,
747
+ used in absolute position embedding. Default: 224.
748
+ patch_size (int | tuple(int)): Patch size. Default: 4.
749
+ in_chans (int): Number of input image channels. Default: 3.
750
+ embed_dim (int): Number of linear projection output channels. Default: 96.
751
+ depths (tuple[int]): Depths of each Swin Transformer stage.
752
+ num_heads (tuple[int]): Number of attention heads of each stage.
753
+ window_size (int): Window size. Default: 7.
754
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
755
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
756
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
757
+ drop_rate (float): Dropout rate.
758
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
759
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
760
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
761
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
762
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
763
+ out_indices (Sequence[int]): Output from which stages.
764
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
765
+ -1 means not freezing any parameters.
766
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
767
+ """
768
+
769
+ def __init__(self,
770
+ pretrain_img_size=224,
771
+ patch_size=4,
772
+ in_chans=3,
773
+ embed_dim=96,
774
+ depths=[2, 2, 6, 2],
775
+ num_heads=[3, 6, 12, 24],
776
+ window_size=7,
777
+ mlp_ratio=4.,
778
+ qkv_bias=True,
779
+ qk_scale=None,
780
+ drop_rate=0.,
781
+ attn_drop_rate=0.,
782
+ drop_path_rate=0.2,
783
+ norm_layer=nn.LayerNorm,
784
+ ape=False,
785
+ patch_norm=True,
786
+ out_indices=(0, 1, 2, 3),
787
+ frozen_stages=-1,
788
+ use_checkpoint=False):
789
+ super().__init__()
790
+
791
+ self.pretrain_img_size = pretrain_img_size
792
+ self.num_layers = len(depths)
793
+ self.embed_dim = embed_dim
794
+ self.ape = ape
795
+ self.patch_norm = patch_norm
796
+ self.out_indices = out_indices
797
+ self.frozen_stages = frozen_stages
798
+
799
+ # split image into non-overlapping patches
800
+ self.patch_embed = PatchEmbed(
801
+ patch_size=patch_size,
802
+ in_chans=in_chans,
803
+ embed_dim=embed_dim,
804
+ norm_layer=norm_layer if self.patch_norm else None)
805
+
806
+ # absolute position embedding
807
+ if self.ape:
808
+ pretrain_img_size = to_2tuple(pretrain_img_size)
809
+ patch_size = to_2tuple(patch_size)
810
+ patches_resolution = [
811
+ pretrain_img_size[0] // patch_size[0],
812
+ pretrain_img_size[1] // patch_size[1]
813
+ ]
814
+
815
+ self.absolute_pos_embed = nn.Parameter(
816
+ torch.zeros(1, embed_dim, patches_resolution[0],
817
+ patches_resolution[1]))
818
+ trunc_normal_(self.absolute_pos_embed, std=.02)
819
+
820
+ self.pos_drop = nn.Dropout(p=drop_rate)
821
+
822
+ # stochastic depth
823
+ dpr = [
824
+ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
825
+ ] # stochastic depth decay rule
826
+
827
+ # build layers
828
+ self.layers = nn.ModuleList()
829
+ for i_layer in range(self.num_layers):
830
+ layer = BasicLayer(
831
+ dim=int(embed_dim * 2**i_layer),
832
+ depth=depths[i_layer],
833
+ num_heads=num_heads[i_layer],
834
+ window_size=window_size,
835
+ mlp_ratio=mlp_ratio,
836
+ qkv_bias=qkv_bias,
837
+ qk_scale=qk_scale,
838
+ drop=drop_rate,
839
+ attn_drop=attn_drop_rate,
840
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
841
+ norm_layer=norm_layer,
842
+ downsample=PatchMerging if
843
+ (i_layer < self.num_layers - 1) else None,
844
+ use_checkpoint=use_checkpoint)
845
+ self.layers.append(layer)
846
+
847
+ num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
848
+ self.num_features = num_features
849
+
850
+ # add a norm layer for each output
851
+ for i_layer in out_indices:
852
+ layer = norm_layer(num_features[i_layer])
853
+ layer_name = f'norm{i_layer}'
854
+ self.add_module(layer_name, layer)
855
+
856
+ self._freeze_stages()
857
+
858
+ def _freeze_stages(self):
859
+ if self.frozen_stages >= 0:
860
+ self.patch_embed.eval()
861
+ for param in self.patch_embed.parameters():
862
+ param.requires_grad = False
863
+
864
+ if self.frozen_stages >= 1 and self.ape:
865
+ self.absolute_pos_embed.requires_grad = False
866
+
867
+ if self.frozen_stages >= 2:
868
+ self.pos_drop.eval()
869
+ for i in range(0, self.frozen_stages - 1):
870
+ m = self.layers[i]
871
+ m.eval()
872
+ for param in m.parameters():
873
+ param.requires_grad = False
874
+
875
+ def init_weights(self, pretrained=None):
876
+ """Initialize the weights in backbone.
877
+
878
+ Args:
879
+ pretrained (str, optional): Path to pre-trained weights.
880
+ Defaults to None.
881
+ """
882
+
883
+ def _init_weights(m):
884
+ if isinstance(m, nn.Linear):
885
+ trunc_normal_(m.weight, std=.02)
886
+ if isinstance(m, nn.Linear) and m.bias is not None:
887
+ nn.init.constant_(m.bias, 0)
888
+ elif isinstance(m, nn.LayerNorm):
889
+ nn.init.constant_(m.bias, 0)
890
+ nn.init.constant_(m.weight, 1.0)
891
+
892
+ if isinstance(pretrained, str):
893
+ self.apply(_init_weights)
894
+ load_checkpoint(self, pretrained, strict=False, logger=None)
895
+ elif pretrained is None:
896
+ self.apply(_init_weights)
897
+ else:
898
+ raise TypeError('pretrained must be a str or None')
899
+
900
+ def forward(self, x):
901
+ x = self.patch_embed(x)
902
+
903
+ Wh, Ww = x.size(2), x.size(3)
904
+ if self.ape:
905
+ # interpolate the position embedding to the corresponding size
906
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed,
907
+ size=(Wh, Ww),
908
+ mode='bicubic')
909
+ x = (x + absolute_pos_embed) # B C Wh Ww
910
+
911
+ outs = [x.contiguous()]
912
+ x = x.flatten(2).transpose(1, 2)
913
+ x = self.pos_drop(x)
914
+ for i in range(self.num_layers):
915
+ layer = self.layers[i]
916
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
917
+
918
+ if i in self.out_indices:
919
+ norm_layer = getattr(self, f'norm{i}')
920
+ x_out = norm_layer(x_out)
921
+
922
+ out = x_out.view(-1, H, W,
923
+ self.num_features[i]).permute(0, 3, 1,
924
+ 2).contiguous()
925
+ outs.append(out)
926
+
927
+ return tuple(outs)
928
+
929
+ def train(self, mode=True):
930
+ """Convert the model into training mode while keep layers freezed."""
931
+ super(SwinTransformer, self).train(mode)
932
+ self._freeze_stages()
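# SwinTransformer.forward returns a tuple of len(out_indices) + 1 feature maps:
# outs[0] is the stride-4 patch embedding (with the optional absolute position
# embedding added), followed by one normalised map per stage listed in out_indices,
# each with embed_dim * 2**i channels. For the SwinB configuration used by MVANet
# below (embed_dim presumably 128), that gives 128/128/256/512/1024 channels, which
# is why output1..output5 project exactly those widths down to emb_dim.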
933
+
934
+
935
+ class PositionEmbeddingSine:
936
+
937
+ def __init__(self,
938
+ num_pos_feats=64,
939
+ temperature=10000,
940
+ normalize=False,
941
+ scale=None):
942
+ super().__init__()
943
+ self.num_pos_feats = num_pos_feats
944
+ self.temperature = temperature
945
+ self.normalize = normalize
946
+ if scale is not None and normalize is False:
947
+ raise ValueError("normalize should be True if scale is passed")
948
+ if scale is None:
949
+ scale = 2 * math.pi
950
+ self.scale = scale
951
+ self.dim_t = torch.arange(0,
952
+ self.num_pos_feats,
953
+ dtype=torch_dtype,
954
+ device=torch_device)
955
+
956
+ def __call__(self, b, h, w):
957
+ mask = torch.zeros([b, h, w], dtype=torch.bool, device=torch_device)
958
+ assert mask is not None
959
+ not_mask = ~mask
960
+ y_embed = not_mask.cumsum(dim=1, dtype=torch_dtype)
961
+ x_embed = not_mask.cumsum(dim=2, dtype=torch_dtype)
962
+ if self.normalize:
963
+ eps = 1e-6
964
+ y_embed = ((y_embed - 0.5) / (y_embed[:, -1:, :] + eps) *
965
+ self.scale).to(device=torch_device, dtype=torch_dtype)
966
+ x_embed = ((x_embed - 0.5) / (x_embed[:, :, -1:] + eps) *
967
+ self.scale).to(device=torch_device, dtype=torch_dtype)
968
+
969
+ dim_t = self.temperature**(2 * (self.dim_t // 2) / self.num_pos_feats)
970
+
971
+ pos_x = x_embed[:, :, :, None] / dim_t
972
+ pos_y = y_embed[:, :, :, None] / dim_t
973
+ pos_x = torch.stack(
974
+ (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
975
+ dim=4).flatten(3)
976
+ pos_y = torch.stack(
977
+ (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
978
+ dim=4).flatten(3)
979
+ return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
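# Calling PositionEmbeddingSine(b, h, w) returns a fixed sinusoidal encoding of
# shape (b, 2 * num_pos_feats, h, w); with num_pos_feats = d_model // 2 (as
# instantiated in MCLM/MCRM below) the channel count equals d_model, so the
# encoding can be added directly to the flattened feature tokens.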
980
+
981
+
982
+ class MCLM(nn.Module):
983
+
984
+ def __init__(self, d_model, num_heads, pool_ratios=[1, 4, 8]):
985
+ super(MCLM, self).__init__()
986
+ self.attention = nn.ModuleList([
987
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
988
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
989
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
990
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
991
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
992
+ ])
993
+
994
+ self.linear1 = nn.Linear(d_model, d_model * 2)
995
+ self.linear2 = nn.Linear(d_model * 2, d_model)
996
+ self.linear3 = nn.Linear(d_model, d_model * 2)
997
+ self.linear4 = nn.Linear(d_model * 2, d_model)
998
+ self.norm1 = nn.LayerNorm(d_model)
999
+ self.norm2 = nn.LayerNorm(d_model)
1000
+ self.dropout = nn.Dropout(0.1)
1001
+ self.dropout1 = nn.Dropout(0.1)
1002
+ self.dropout2 = nn.Dropout(0.1)
1003
+ self.activation = get_activation_fn('relu')
1004
+ self.pool_ratios = pool_ratios
1005
+ self.p_poses = []
1006
+ self.g_pos = None
1007
+ self.positional_encoding = PositionEmbeddingSine(
1008
+ num_pos_feats=d_model // 2, normalize=True)
1009
+
1010
+ def forward(self, l, g):
1011
+ """
1012
+ l: 4,c,h,w
1013
+ g: 1,c,h,w
1014
+ """
1015
+ b, c, h, w = l.size()
1016
+ # 4,c,h,w -> 1,c,2h,2w
1017
+ concated_locs = rearrange(l,
1018
+ '(hg wg b) c h w -> b c (hg h) (wg w)',
1019
+ hg=2,
1020
+ wg=2)
1021
+
1022
+ pools = []
1023
+ for pool_ratio in self.pool_ratios:
1024
+ # b,c,h,w
1025
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1026
+ pool = F.adaptive_avg_pool2d(concated_locs, tgt_hw)
1027
+ pools.append(rearrange(pool, 'b c h w -> (h w) b c'))
1028
+ if self.g_pos is None:
1029
+ pos_emb = self.positional_encoding(pool.shape[0],
1030
+ pool.shape[2],
1031
+ pool.shape[3])
1032
+ pos_emb = rearrange(pos_emb, 'b c h w -> (h w) b c')
1033
+ self.p_poses.append(pos_emb)
1034
+ pools = torch.cat(pools, 0)
1035
+ if self.g_pos is None:
1036
+ self.p_poses = torch.cat(self.p_poses, dim=0)
1037
+ pos_emb = self.positional_encoding(g.shape[0], g.shape[2],
1038
+ g.shape[3])
1039
+ self.g_pos = rearrange(pos_emb, 'b c h w -> (h w) b c')
1040
+
1041
+ # attention between glb (q) & multi-scale pooled concated-locs (k, v)
1042
+ g_hw_b_c = rearrange(g, 'b c h w -> (h w) b c')
1043
+ g_hw_b_c = g_hw_b_c + self.dropout1(self.attention[0](
1044
+ g_hw_b_c + self.g_pos, pools + self.p_poses, pools)[0])
1045
+ g_hw_b_c = self.norm1(g_hw_b_c)
1046
+ g_hw_b_c = g_hw_b_c + self.dropout2(
1047
+ self.linear2(
1048
+ self.dropout(self.activation(self.linear1(g_hw_b_c)).clone())))
1049
+ g_hw_b_c = self.norm2(g_hw_b_c)
1050
+
1051
+ # attention between original locs (q) & refreshed glb (k, v)
1052
+ l_hw_b_c = rearrange(l, "b c h w -> (h w) b c")
1053
+ _g_hw_b_c = rearrange(g_hw_b_c, '(h w) b c -> h w b c', h=h, w=w)
1054
+ _g_hw_b_c = rearrange(_g_hw_b_c,
1055
+ "(ng h) (nw w) b c -> (h w) (ng nw b) c",
1056
+ ng=2,
1057
+ nw=2)
1058
+ outputs_re = []
1059
+ for i, (_l, _g) in enumerate(
1060
+ zip(l_hw_b_c.chunk(4, dim=1), _g_hw_b_c.chunk(4, dim=1))):
1061
+ outputs_re.append(self.attention[i + 1](_l, _g,
1062
+ _g)[0]) # (h w) 1 c
1063
+ outputs_re = torch.cat(outputs_re, 1) # (h w) 4 c
1064
+
1065
+ l_hw_b_c = l_hw_b_c + self.dropout1(outputs_re)
1066
+ l_hw_b_c = self.norm1(l_hw_b_c)
1067
+ l_hw_b_c = l_hw_b_c + self.dropout2(
1068
+ self.linear4(
1069
+ self.dropout(self.activation(self.linear3(l_hw_b_c)).clone())))
1070
+ l_hw_b_c = self.norm2(l_hw_b_c)
1071
+
1072
+ l = torch.cat((l_hw_b_c, g_hw_b_c), 1) # hw,b(5),c
1073
+ return rearrange(l, "(h w) b c -> b c h w", h=h, w=w) ## (5,c,h*w)
1074
+
1075
+
1076
+ class inf_MCLM(nn.Module):
1077
+
1078
+ def __init__(self, d_model, num_heads, pool_ratios=[1, 4, 8]):
1079
+ super(inf_MCLM, self).__init__()
1080
+ self.attention = nn.ModuleList([
1081
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1082
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1083
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1084
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1085
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1086
+ ])
1087
+
1088
+ self.linear1 = nn.Linear(d_model, d_model * 2)
1089
+ self.linear2 = nn.Linear(d_model * 2, d_model)
1090
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1091
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1092
+ self.norm1 = nn.LayerNorm(d_model)
1093
+ self.norm2 = nn.LayerNorm(d_model)
1094
+ self.dropout = nn.Dropout(0.1)
1095
+ self.dropout1 = nn.Dropout(0.1)
1096
+ self.dropout2 = nn.Dropout(0.1)
1097
+ self.activation = get_activation_fn('relu')
1098
+ self.pool_ratios = pool_ratios
1099
+ self.p_poses = []
1100
+ self.g_pos = None
1101
+ self.positional_encoding = PositionEmbeddingSine(
1102
+ num_pos_feats=d_model // 2, normalize=True)
1103
+
1104
+ def forward(self, l, g):
1105
+ """
1106
+ l: 4,c,h,w
1107
+ g: 1,c,h,w
1108
+ """
1109
+ b, c, h, w = l.size()
1110
+ # 4,c,h,w -> 1,c,2h,2w
1111
+ concated_locs = rearrange(l,
1112
+ '(hg wg b) c h w -> b c (hg h) (wg w)',
1113
+ hg=2,
1114
+ wg=2)
1115
+ self.p_poses = []
1116
+ pools = []
1117
+ for pool_ratio in self.pool_ratios:
1118
+ # b,c,h,w
1119
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1120
+ pool = F.adaptive_avg_pool2d(concated_locs, tgt_hw)
1121
+ pools.append(rearrange(pool, 'b c h w -> (h w) b c'))
1122
+ # if self.g_pos is None:
1123
+ pos_emb = self.positional_encoding(pool.shape[0], pool.shape[2],
1124
+ pool.shape[3])
1125
+ pos_emb = rearrange(pos_emb, 'b c h w -> (h w) b c')
1126
+ self.p_poses.append(pos_emb)
1127
+ pools = torch.cat(pools, 0)
1128
+ # if self.g_pos is None:
1129
+ self.p_poses = torch.cat(self.p_poses, dim=0)
1130
+ pos_emb = self.positional_encoding(g.shape[0], g.shape[2], g.shape[3])
1131
+ self.g_pos = rearrange(pos_emb, 'b c h w -> (h w) b c')
1132
+
1133
+ # attention between glb (q) & multi-scale pooled concated-locs (k, v)
1134
+ g_hw_b_c = rearrange(g, 'b c h w -> (h w) b c')
1135
+ g_hw_b_c = g_hw_b_c + self.dropout1(self.attention[0](
1136
+ g_hw_b_c + self.g_pos, pools + self.p_poses, pools)[0])
1137
+ g_hw_b_c = self.norm1(g_hw_b_c)
1138
+ g_hw_b_c = g_hw_b_c + self.dropout2(
1139
+ self.linear2(
1140
+ self.dropout(self.activation(self.linear1(g_hw_b_c)).clone())))
1141
+ g_hw_b_c = self.norm2(g_hw_b_c)
1142
+
1143
+ # attention between original locs (q) & refreshed glb (k, v)
1144
+ l_hw_b_c = rearrange(l, "b c h w -> (h w) b c")
1145
+ _g_hw_b_c = rearrange(g_hw_b_c, '(h w) b c -> h w b c', h=h, w=w)
1146
+ _g_hw_b_c = rearrange(_g_hw_b_c,
1147
+ "(ng h) (nw w) b c -> (h w) (ng nw b) c",
1148
+ ng=2,
1149
+ nw=2)
1150
+ outputs_re = []
1151
+ for i, (_l, _g) in enumerate(
1152
+ zip(l_hw_b_c.chunk(4, dim=1), _g_hw_b_c.chunk(4, dim=1))):
1153
+ outputs_re.append(self.attention[i + 1](_l, _g,
1154
+ _g)[0]) # (h w) 1 c
1155
+ outputs_re = torch.cat(outputs_re, 1) # (h w) 4 c
1156
+
1157
+ l_hw_b_c = l_hw_b_c + self.dropout1(outputs_re)
1158
+ l_hw_b_c = self.norm1(l_hw_b_c)
1159
+ l_hw_b_c = l_hw_b_c + self.dropout2(
1160
+ self.linear4(
1161
+ self.dropout(self.activation(self.linear3(l_hw_b_c)).clone())))
1162
+ l_hw_b_c = self.norm2(l_hw_b_c)
1163
+
1164
+ l = torch.cat((l_hw_b_c, g_hw_b_c), 1) # hw,b(5),c
1165
+ return rearrange(l, "(h w) b c -> b c h w", h=h, w=w) ## (5,c,h*w)
1166
+
1167
+
1168
+ class MCRM(nn.Module):
1169
+
1170
+ def __init__(self, d_model, num_heads, pool_ratios=[4, 8, 16], h=None):
1171
+ super(MCRM, self).__init__()
1172
+ self.attention = nn.ModuleList([
1173
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1174
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1175
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1176
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1177
+ ])
1178
+
1179
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1180
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1181
+ self.norm1 = nn.LayerNorm(d_model)
1182
+ self.norm2 = nn.LayerNorm(d_model)
1183
+ self.dropout = nn.Dropout(0.1)
1184
+ self.dropout1 = nn.Dropout(0.1)
1185
+ self.dropout2 = nn.Dropout(0.1)
1186
+ self.sigmoid = nn.Sigmoid()
1187
+ self.activation = get_activation_fn('relu')
1188
+ self.sal_conv = nn.Conv2d(d_model, 1, 1)
1189
+ self.pool_ratios = pool_ratios
1190
+ self.positional_encoding = PositionEmbeddingSine(
1191
+ num_pos_feats=d_model // 2, normalize=True)
1192
+
1193
+ def forward(self, x):
1194
+ b, c, h, w = x.size()
1195
+ loc, glb = x.split([4, 1], dim=0) # 4,c,h,w; 1,c,h,w
1196
+ # b(4),c,h,w
1197
+ patched_glb = rearrange(glb,
1198
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1199
+ hg=2,
1200
+ wg=2)
1201
+
1202
+ # generate token attention map
1203
+ token_attention_map = self.sigmoid(self.sal_conv(glb))
1204
+ token_attention_map = F.interpolate(token_attention_map,
1205
+ size=patches2image(loc).shape[-2:],
1206
+ mode='nearest')
1207
+ loc = loc * rearrange(token_attention_map,
1208
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1209
+ hg=2,
1210
+ wg=2)
1211
+ pools = []
1212
+ for pool_ratio in self.pool_ratios:
1213
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1214
+ pool = F.adaptive_avg_pool2d(patched_glb, tgt_hw)
1215
+ pools.append(rearrange(pool,
1216
+ 'nl c h w -> nl c (h w)')) # nl(4),c,hw
1217
+ # nl(4),c,nphw -> nl(4),nphw,1,c
1218
+ pools = rearrange(torch.cat(pools, 2), "nl c nphw -> nl nphw 1 c")
1219
+ loc_ = rearrange(loc, 'nl c h w -> nl (h w) 1 c')
1220
+ outputs = []
1221
+ for i, q in enumerate(
1222
+ loc_.unbind(dim=0)): # traverse all local patches
1223
+ # np*hw,1,c
1224
+ v = pools[i]
1225
+ k = v
1226
+ outputs.append(self.attention[i](q, k, v)[0])
1227
+ outputs = torch.cat(outputs, 1)
1228
+ src = loc.view(4, c, -1).permute(2, 0, 1) + self.dropout1(outputs)
1229
+ src = self.norm1(src)
1230
+ src = src + self.dropout2(
1231
+ self.linear4(
1232
+ self.dropout(self.activation(self.linear3(src)).clone())))
1233
+ src = self.norm2(src)
1234
+
1235
+ src = src.permute(1, 2, 0).reshape(4, c, h, w) # refreshed loc
1236
+ glb = glb + F.interpolate(patches2image(src),
1237
+ size=glb.shape[-2:],
1238
+ mode='nearest') # refreshed glb
1239
+ return torch.cat((src, glb), 0), token_attention_map
1240
+
1241
+
1242
+ class inf_MCRM(nn.Module):
1243
+
1244
+ def __init__(self, d_model, num_heads, pool_ratios=[4, 8, 16], h=None):
1245
+ super(inf_MCRM, self).__init__()
1246
+ self.attention = nn.ModuleList([
1247
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1248
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1249
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1),
1250
+ nn.MultiheadAttention(d_model, num_heads, dropout=0.1)
1251
+ ])
1252
+
1253
+ self.linear3 = nn.Linear(d_model, d_model * 2)
1254
+ self.linear4 = nn.Linear(d_model * 2, d_model)
1255
+ self.norm1 = nn.LayerNorm(d_model)
1256
+ self.norm2 = nn.LayerNorm(d_model)
1257
+ self.dropout = nn.Dropout(0.1)
1258
+ self.dropout1 = nn.Dropout(0.1)
1259
+ self.dropout2 = nn.Dropout(0.1)
1260
+ self.sigmoid = nn.Sigmoid()
1261
+ self.activation = get_activation_fn('relu')
1262
+ self.sal_conv = nn.Conv2d(d_model, 1, 1)
1263
+ self.pool_ratios = pool_ratios
1264
+ self.positional_encoding = PositionEmbeddingSine(
1265
+ num_pos_feats=d_model // 2, normalize=True)
1266
+
1267
+ def forward(self, x):
1268
+ b, c, h, w = x.size()
1269
+ loc, glb = x.split([4, 1], dim=0) # 4,c,h,w; 1,c,h,w
1270
+ # b(4),c,h,w
1271
+ patched_glb = rearrange(glb,
1272
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1273
+ hg=2,
1274
+ wg=2)
1275
+
1276
+ # generate token attention map
1277
+ token_attention_map = self.sigmoid(self.sal_conv(glb))
1278
+ token_attention_map = F.interpolate(token_attention_map,
1279
+ size=patches2image(loc).shape[-2:],
1280
+ mode='nearest')
1281
+ loc = loc * rearrange(token_attention_map,
1282
+ 'b c (hg h) (wg w) -> (hg wg b) c h w',
1283
+ hg=2,
1284
+ wg=2)
1285
+ pools = []
1286
+ for pool_ratio in self.pool_ratios:
1287
+ tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
1288
+ pool = F.adaptive_avg_pool2d(patched_glb, tgt_hw)
1289
+ pools.append(rearrange(pool,
1290
+ 'nl c h w -> nl c (h w)')) # nl(4),c,hw
1291
+ # nl(4),c,nphw -> nl(4),nphw,1,c
1292
+ pools = rearrange(torch.cat(pools, 2), "nl c nphw -> nl nphw 1 c")
1293
+ loc_ = rearrange(loc, 'nl c h w -> nl (h w) 1 c')
1294
+ outputs = []
1295
+ for i, q in enumerate(
1296
+ loc_.unbind(dim=0)): # traverse all local patches
1297
+ # np*hw,1,c
1298
+ v = pools[i]
1299
+ k = v
1300
+ outputs.append(self.attention[i](q, k, v)[0])
1301
+ outputs = torch.cat(outputs, 1)
1302
+ src = loc.view(4, c, -1).permute(2, 0, 1) + self.dropout1(outputs)
1303
+ src = self.norm1(src)
1304
+ src = src + self.dropout2(
1305
+ self.linear4(
1306
+ self.dropout(self.activation(self.linear3(src)).clone())))
1307
+ src = self.norm2(src)
1308
+
1309
+ src = src.permute(1, 2, 0).reshape(4, c, h, w) # refreshed loc
1310
+ glb = glb + F.interpolate(patches2image(src),
1311
+ size=glb.shape[-2:],
1312
+ mode='nearest') # refreshed glb
1313
+ return torch.cat((src, glb), 0)
1314
+
1315
+
1316
+ # model for single-scale training
1317
+ class MVANet(nn.Module):
1318
+
1319
+ def __init__(self):
1320
+ super().__init__()
1321
+ self.backbone = SwinB(pretrained=True)
1322
+ emb_dim = 128
1323
+ self.sideout5 = nn.Sequential(
1324
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1325
+ self.sideout4 = nn.Sequential(
1326
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1327
+ self.sideout3 = nn.Sequential(
1328
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1329
+ self.sideout2 = nn.Sequential(
1330
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1331
+ self.sideout1 = nn.Sequential(
1332
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1333
+
1334
+ self.output5 = make_cbr(1024, emb_dim)
1335
+ self.output4 = make_cbr(512, emb_dim)
1336
+ self.output3 = make_cbr(256, emb_dim)
1337
+ self.output2 = make_cbr(128, emb_dim)
1338
+ self.output1 = make_cbr(128, emb_dim)
1339
+
1340
+ self.multifieldcrossatt = MCLM(emb_dim, 1, [1, 4, 8])
1341
+ self.conv1 = make_cbr(emb_dim, emb_dim)
1342
+ self.conv2 = make_cbr(emb_dim, emb_dim)
1343
+ self.conv3 = make_cbr(emb_dim, emb_dim)
1344
+ self.conv4 = make_cbr(emb_dim, emb_dim)
1345
+ self.dec_blk1 = MCRM(emb_dim, 1, [2, 4, 8])
1346
+ self.dec_blk2 = MCRM(emb_dim, 1, [2, 4, 8])
1347
+ self.dec_blk3 = MCRM(emb_dim, 1, [2, 4, 8])
1348
+ self.dec_blk4 = MCRM(emb_dim, 1, [2, 4, 8])
1349
+
1350
+ self.insmask_head = nn.Sequential(
1351
+ nn.Conv2d(emb_dim, 384, kernel_size=3, padding=1),
1352
+ nn.BatchNorm2d(384), nn.PReLU(),
1353
+ nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.BatchNorm2d(384),
1354
+ nn.PReLU(), nn.Conv2d(384, emb_dim, kernel_size=3, padding=1))
1355
+
1356
+ self.shallow = nn.Sequential(
1357
+ nn.Conv2d(3, emb_dim, kernel_size=3, padding=1))
1358
+ self.upsample1 = make_cbg(emb_dim, emb_dim)
1359
+ self.upsample2 = make_cbg(emb_dim, emb_dim)
1360
+ self.output = nn.Sequential(
1361
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1362
+
1363
+ for m in self.modules():
1364
+ if isinstance(m, nn.ReLU) or isinstance(m, nn.Dropout):
1365
+ m.inplace = True
1366
+
1367
+ def forward(self, x):
1368
+ x = x.to(dtype=torch_dtype, device=torch_device)
1369
+ shallow = self.shallow(x)
1370
+ glb = rescale_to(x, scale_factor=0.5, interpolation='bilinear')
1371
+ loc = image2patches(x)
1372
+ input = torch.cat((loc, glb), dim=0)
1373
+ feature = self.backbone(input)
1374
+ e5 = self.output5(feature[4]) # (5,128,16,16)
1375
+ e4 = self.output4(feature[3]) # (5,128,32,32)
1376
+ e3 = self.output3(feature[2]) # (5,128,64,64)
1377
+ e2 = self.output2(feature[1]) # (5,128,128,128)
1378
+ e1 = self.output1(feature[0]) # (5,128,128,128)
1379
+ loc_e5, glb_e5 = e5.split([4, 1], dim=0)
1380
+ e5 = self.multifieldcrossatt(loc_e5, glb_e5) # (4,128,16,16)
1381
+
1382
+ e4, tokenattmap4 = self.dec_blk4(e4 + resize_as(e5, e4))
1383
+ e4 = self.conv4(e4)
1384
+ e3, tokenattmap3 = self.dec_blk3(e3 + resize_as(e4, e3))
1385
+ e3 = self.conv3(e3)
1386
+ e2, tokenattmap2 = self.dec_blk2(e2 + resize_as(e3, e2))
1387
+ e2 = self.conv2(e2)
1388
+ e1, tokenattmap1 = self.dec_blk1(e1 + resize_as(e2, e1))
1389
+ e1 = self.conv1(e1)
1390
+ loc_e1, glb_e1 = e1.split([4, 1], dim=0)
1391
+ output1_cat = patches2image(loc_e1) # (1,128,256,256)
1392
+ # add glb feat in
1393
+ output1_cat = output1_cat + resize_as(glb_e1, output1_cat)
1394
+ # merge
1395
+ final_output = self.insmask_head(output1_cat) # (1,128,256,256)
1396
+ # shallow feature merge
1397
+ final_output = final_output + resize_as(shallow, final_output)
1398
+ final_output = self.upsample1(rescale_to(final_output))
1399
+ final_output = rescale_to(final_output +
1400
+ resize_as(shallow, final_output))
1401
+ final_output = self.upsample2(final_output)
1402
+ final_output = self.output(final_output)
1403
+ ####
1404
+ sideout5 = self.sideout5(e5).to(dtype=torch_dtype, device=torch_device)
1405
+ sideout4 = self.sideout4(e4)
1406
+ sideout3 = self.sideout3(e3)
1407
+ sideout2 = self.sideout2(e2)
1408
+ sideout1 = self.sideout1(e1)
1409
+ #######glb_sideouts ######
1410
+ glb5 = self.sideout5(glb_e5)
1411
+ glb4 = sideout4[-1, :, :, :].unsqueeze(0)
1412
+ glb3 = sideout3[-1, :, :, :].unsqueeze(0)
1413
+ glb2 = sideout2[-1, :, :, :].unsqueeze(0)
1414
+ glb1 = sideout1[-1, :, :, :].unsqueeze(0)
1415
+ ####### concat 4 to 1 #######
1416
+ sideout1 = patches2image(sideout1[:-1]).to(dtype=torch_dtype,
1417
+ device=torch_device)
1418
+ sideout2 = patches2image(sideout2[:-1]).to(
1419
+ dtype=torch_dtype,
1420
+ device=torch_device) ####(5,c,h,w) -> (1 c 2h,2w)
1421
+ sideout3 = patches2image(sideout3[:-1]).to(dtype=torch_dtype,
1422
+ device=torch_device)
1423
+ sideout4 = patches2image(sideout4[:-1]).to(dtype=torch_dtype,
1424
+ device=torch_device)
1425
+ sideout5 = patches2image(sideout5[:-1]).to(dtype=torch_dtype,
1426
+ device=torch_device)
1427
+ if self.training:
1428
+ return sideout5, sideout4, sideout3, sideout2, sideout1, final_output, glb5, glb4, glb3, glb2, glb1, tokenattmap4, tokenattmap3, tokenattmap2, tokenattmap1
1429
+ else:
1430
+ return final_output
1431
+
1432
+
1433
+ # model for multi-scale testing
1434
+ class inf_MVANet(nn.Module):
1435
+
1436
+ def __init__(self):
1437
+ super().__init__()
1438
+ # self.backbone = SwinB(pretrained=True)
1439
+ self.backbone = SwinB(pretrained=False)
1440
+
1441
+ emb_dim = 128
1442
+ self.output5 = make_cbr(1024, emb_dim)
1443
+ self.output4 = make_cbr(512, emb_dim)
1444
+ self.output3 = make_cbr(256, emb_dim)
1445
+ self.output2 = make_cbr(128, emb_dim)
1446
+ self.output1 = make_cbr(128, emb_dim)
1447
+
1448
+ self.multifieldcrossatt = inf_MCLM(emb_dim, 1, [1, 4, 8])
1449
+ self.conv1 = make_cbr(emb_dim, emb_dim)
1450
+ self.conv2 = make_cbr(emb_dim, emb_dim)
1451
+ self.conv3 = make_cbr(emb_dim, emb_dim)
1452
+ self.conv4 = make_cbr(emb_dim, emb_dim)
1453
+ self.dec_blk1 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1454
+ self.dec_blk2 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1455
+ self.dec_blk3 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1456
+ self.dec_blk4 = inf_MCRM(emb_dim, 1, [2, 4, 8])
1457
+
1458
+ self.insmask_head = nn.Sequential(
1459
+ nn.Conv2d(emb_dim, 384, kernel_size=3, padding=1),
1460
+ nn.BatchNorm2d(384), nn.PReLU(),
1461
+ nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.BatchNorm2d(384),
1462
+ nn.PReLU(), nn.Conv2d(384, emb_dim, kernel_size=3, padding=1))
1463
+
1464
+ self.shallow = nn.Sequential(
1465
+ nn.Conv2d(3, emb_dim, kernel_size=3, padding=1))
1466
+ self.upsample1 = make_cbg(emb_dim, emb_dim)
1467
+ self.upsample2 = make_cbg(emb_dim, emb_dim)
1468
+ self.output = nn.Sequential(
1469
+ nn.Conv2d(emb_dim, 1, kernel_size=3, padding=1))
1470
+
1471
+ for m in self.modules():
1472
+ if isinstance(m, nn.ReLU) or isinstance(m, nn.Dropout):
1473
+ m.inplace = True
1474
+
1475
+ def forward(self, x):
1476
+ shallow = self.shallow(x)
1477
+ glb = rescale_to(x, scale_factor=0.5, interpolation='bilinear')
1478
+ loc = image2patches(x)
1479
+ input = torch.cat((loc, glb), dim=0)
1480
+ feature = self.backbone(input)
1481
+ e5 = self.output5(feature[4])
1482
+ e4 = self.output4(feature[3])
1483
+ e3 = self.output3(feature[2])
1484
+ e2 = self.output2(feature[1])
1485
+ e1 = self.output1(feature[0])
1486
+ loc_e5, glb_e5 = e5.split([4, 1], dim=0)
1487
+ e5_cat = self.multifieldcrossatt(loc_e5, glb_e5)
1488
+
1489
+ e4 = self.conv4(self.dec_blk4(e4 + resize_as(e5_cat, e4)))
1490
+ e3 = self.conv3(self.dec_blk3(e3 + resize_as(e4, e3)))
1491
+ e2 = self.conv2(self.dec_blk2(e2 + resize_as(e3, e2)))
1492
+ e1 = self.conv1(self.dec_blk1(e1 + resize_as(e2, e1)))
1493
+ loc_e1, glb_e1 = e1.split([4, 1], dim=0)
1494
+ # after decoder, concat loc features to a whole one, and merge
1495
+ output1_cat = patches2image(loc_e1)
1496
+ # add glb feat in
1497
+ output1_cat = output1_cat + resize_as(glb_e1, output1_cat)
1498
+ # merge
1499
+ final_output = self.insmask_head(output1_cat)
1500
+ # shallow feature merge
1501
+ final_output = final_output + resize_as(shallow, final_output)
1502
+ final_output = self.upsample1(rescale_to(final_output))
1503
+ final_output = rescale_to(final_output +
1504
+ resize_as(shallow, final_output))
1505
+ final_output = self.upsample2(final_output)
1506
+ final_output = self.output(final_output)
1507
+ return final_output
1508
+ #+end_src
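Before wiring the model into the loaders below, it may help to see the patch layout it assumes: the batch fed to the backbone is always four non-overlapping quadrants plus one half-scale global view. The helpers image2patches / patches2image are defined earlier in this document; the sketch below is only a hypothetical stand-in using the same einops patterns that appear in MCLM/MCRM, to illustrate that the split and the reassembly are exact inverses.

#+begin_src python :results output
import torch
from einops import rearrange

def image2patches_sketch(x):
    # (B, C, 2h, 2w) -> (4B, C, h, w): four non-overlapping quadrants stacked on batch
    return rearrange(x, 'b c (hg h) (wg w) -> (hg wg b) c h w', hg=2, wg=2)

def patches2image_sketch(x):
    # (4B, C, h, w) -> (B, C, 2h, 2w): exact inverse of the split above
    return rearrange(x, '(hg wg b) c h w -> b c (hg h) (wg w)', hg=2, wg=2)

x = torch.randn(1, 3, 1024, 1024)
patches = image2patches_sketch(x)         # torch.Size([4, 3, 512, 512])
restored = patches2image_sketch(patches)  # torch.Size([1, 3, 1024, 1024])
print(patches.shape, restored.shape, bool(torch.equal(x, restored)))
#+end_src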
1509
+
1510
+ ** Function to load model
1511
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
1512
+ def load_model(model_checkpoint_path):
1513
+ torch.cuda.set_device(0)
1514
+
1515
+ net = inf_MVANet().to(dtype=torch_dtype, device=torch_device)
1516
+
1517
+ pretrained_dict = torch.load(model_checkpoint_path,
1518
+ map_location=torch_device)
1519
+
1520
+ model_dict = net.state_dict()
1521
+ pretrained_dict = {
1522
+ k: v
1523
+ for k, v in pretrained_dict.items() if k in model_dict
1524
+ }
1525
+ model_dict.update(pretrained_dict)
1526
+ net.load_state_dict(model_dict)
1527
+ net = net.to(dtype=torch_dtype, device=torch_device)
1528
+ net.eval()
1529
+ return net
1530
+
1531
+
1532
+ def load_transforms_stripped():
1533
+ img_transform = transforms.Compose([
1534
+ # transforms.ToTensor(),
1535
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
1536
+ ])
1537
+
1538
+ return img_transform
1539
+
1540
+
1541
+ def load_transforms():
1542
+ img_transform = transforms.Compose([
1543
+ # transforms.ToTensor(),
1544
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
1545
+ ])
1546
+
1547
+ depth_transform = transforms.ToTensor()
1548
+ target_transform = transforms.ToTensor()
1549
+ to_pil = transforms.ToPILImage()
1550
+
1551
+ transforms_var = tta.Compose([
1552
+ tta.HorizontalFlip(),
1553
+ tta.Scale(scales=[0.75, 1, 1.25],
1554
+ interpolation='bilinear',
1555
+ align_corners=False),
1556
+ ])
1557
+
1558
+ return (img_transform, depth_transform, target_transform, to_pil,
1559
+ transforms_var)
1560
+ #+end_src
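Because load_model silently drops any checkpoint entry whose key is not present in the freshly built network, a mismatched checkpoint fails quietly. Below is a minimal sketch for auditing what was actually loaded; the checkpoint path is hypothetical, and torch_device is the global defined in the import block of this document.

#+begin_src python :results output
def report_checkpoint_coverage(net, model_checkpoint_path):
    # Compare checkpoint keys with the network's state_dict and report mismatches.
    pretrained_dict = torch.load(model_checkpoint_path, map_location=torch_device)
    model_keys = set(net.state_dict().keys())
    ckpt_keys = set(pretrained_dict.keys())
    print('matching keys :', len(model_keys & ckpt_keys))
    print('missing keys  :', sorted(model_keys - ckpt_keys)[:10])
    print('unexpected    :', sorted(ckpt_keys - model_keys)[:10])

# Hypothetical usage:
# net = load_model('/path/to/finetuned_MVANet.pth')
# report_checkpoint_coverage(net, '/path/to/finetuned_MVANet.pth')
#+end_src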
1561
+
1562
+ ** Function for modular inference CV
1563
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
1564
+ def do_infer_tensor2tensor(img, net):
1565
+
1566
+ img_transform = transforms.Compose(
1567
+ [transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
1568
+
1569
+ h_, w_ = img.shape[1], img.shape[2]
1570
+
1571
+ with torch.no_grad():
1572
+
1573
+ img = rearrange(img, 'B H W C -> B C H W')
1574
+
1575
+ img_resize = torch.nn.functional.interpolate(input=img,
1576
+ size=(1024, 1024),
1577
+ mode='bicubic',
1578
+ antialias=True)
1579
+
1580
+ img_var = img_transform(img_resize)
1581
+ img_var = Variable(img_var)
1582
+ img_var = img_var.to(dtype=torch_dtype, device=torch_device)
1583
+
1584
+ mask = []
1585
+
1586
+ mask.append(net(img_var))
1587
+
1588
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
1589
+ prediction = prediction.sigmoid()
1590
+
1591
+ prediction = torch.nn.functional.interpolate(input=prediction,
1592
+ size=(h_, w_),
1593
+ mode='bicubic',
1594
+ antialias=True)
1595
+
1596
+ prediction = prediction.squeeze(0)
1597
+ prediction = prediction.clamp(0, 1)
1598
+
1599
+ return prediction
1600
+
1601
+
1602
+ def do_infer_modular_cv(input_image_path, output_mask_path, net,
1603
+ all_transforms):
1604
+
1605
+ (img_transform, depth_transform, target_transform, to_pil,
1606
+ transforms_var) = all_transforms
1607
+
1608
+ img = load_image_torch(input_image_path)
1609
+
1610
+ h_, w_ = img.shape[1], img.shape[2]
1611
+
1612
+ with torch.no_grad():
1613
+
1614
+ img = rearrange(img, 'B H W C -> B C H W')
1615
+
1616
+ img_resize = torch.nn.functional.interpolate(input=img,
1617
+ size=(1024, 1024),
1618
+ mode='bicubic',
1619
+ antialias=True)
1620
+
1621
+ img_var = img_transform(img_resize)
1622
+ img_var = Variable(img_var)
1623
+ img_var = img_var.to(dtype=torch_dtype, device=torch_device)
1624
+
1625
+ mask = []
1626
+
1627
+ for transformer in transforms_var:
1628
+ rgb_trans = img_var.to(dtype=torch_dtype, device=torch_device)
1629
+ mask.append(net(rgb_trans))
1630
+
1631
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
1632
+ prediction = prediction.sigmoid()
1633
+
1634
+ prediction = torch.nn.functional.interpolate(input=prediction,
1635
+ size=(h_, w_),
1636
+ mode='bicubic',
1637
+ antialias=True)
1638
+
1639
+ prediction = prediction.squeeze(0)
1640
+ prediction = prediction.clamp(0, 1)
1641
+
1642
+ save_mask_torch(output_image_path=output_mask_path, mask=prediction)
1643
+
1644
+
1645
+ def do_infer_modular_cv_2(input_image_path, output_mask_path, net,
1646
+ all_transforms):
1647
+
1648
+ (img_transform, depth_transform, target_transform, to_pil,
1649
+ transforms_var) = all_transforms
1650
+
1651
+ img = load_image(input_image_path)
1652
+ w_, h_ = img.shape[0], img.shape[1]
1653
+ img_resize = cv2.resize(img, (1024, 1024), cv2.INTER_CUBIC)
1654
+
1655
+ with torch.no_grad():
1656
+
1657
+ # rgb_png_path = input_image_path
1658
+ # img = Image.open(rgb_png_path).convert('RGB')
1659
+ # w_, h_ = img.size
1660
+
1661
+ # img_resize = img.resize([256 * 4, 256 * 4], Image.BILINEAR)
1662
+
1663
+ # img_var = Variable(img_transform(img_resize).unsqueeze(0)).to(
1664
+ # dtype=torch_dtype, device=torch_device)
1665
+
1666
+ img_resize = torch.from_numpy(img_resize)
1667
+ img_resize = img_resize.to(dtype=torch.float32)
1668
+ img_resize /= 255.0
1669
+ img_resize = rearrange(img_resize, 'H W C -> C H W')
1670
+ img_var = img_transform(img_resize)
1671
+ img_var = img_var.unsqueeze(0)
1672
+ img_var = Variable(img_var)
1673
+ img_var = img_var.to(dtype=torch_dtype, device=torch_device)
1674
+
1675
+ mask = []
1676
+
1677
+ for transformer in transforms_var:
1678
+ rgb_trans = transformer.augment_image(img_var)
1679
+ rgb_trans = rgb_trans.to(dtype=torch_dtype, device=torch_device)
1680
+ model_output = net(rgb_trans)
1681
+ deaug_mask = transformer.deaugment_mask(model_output)
1682
+ mask.append(deaug_mask)
1683
+
1684
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
1685
+ prediction = prediction.sigmoid()
1686
+ prediction = to_pil(prediction.data.squeeze(0).cpu())
1687
+ prediction = prediction.resize((w_, h_), Image.BILINEAR)
1688
+ prediction.save(output_mask_path)
1689
+
1690
+
1691
+ def do_infer_modular_cv_3(input_image_path, output_mask_path, net,
1692
+ all_transforms):
1693
+
1694
+ (img_transform, depth_transform, target_transform, to_pil,
1695
+ transforms_var) = all_transforms
1696
+
1697
+ img = load_image(input_image_path)
1698
+ w_, h_ = img.shape[0], img.shape[1]
1699
+
1700
+ with torch.no_grad():
1701
+
1702
+ # rgb_png_path = input_image_path
1703
+ # img = Image.open(rgb_png_path).convert('RGB')
1704
+ # w_, h_ = img.size
1705
+
1706
+ # img_resize = img.resize([256 * 4, 256 * 4], Image.BILINEAR)
1707
+
1708
+ # img_var = Variable(img_transform(img_resize).unsqueeze(0)).to(
1709
+ # dtype=torch_dtype, device=torch_device)
1710
+
1711
+ img_resize = torch.from_numpy(img)
1712
+ img_resize = img_resize.to(dtype=torch.float32)
1713
+ img_resize = rearrange(img_resize, 'H W C -> C H W')
1714
+ img_resize = img_resize.unsqueeze(0)
1715
+
1716
+ img_resize = torch.nn.functional.interpolate(input=img_resize,
1717
+ size=(1024, 1024),
1718
+ mode='bicubic',
1719
+ antialias=True)
1720
+
1721
+ img_resize = img_resize.squeeze(0)
1722
+ img_resize = rearrange(img_resize, 'C H W -> H W C')
1723
+
1724
+ img_resize = img_resize.to(dtype=torch.float32)
1725
+ img_resize /= 255.0
1726
+ img_resize = rearrange(img_resize, 'H W C -> C H W')
1727
+ img_var = img_transform(img_resize)
1728
+ img_var = img_var.unsqueeze(0)
1729
+ img_var = Variable(img_var)
1730
+ img_var = img_var.to(dtype=torch_dtype, device=torch_device)
1731
+
1732
+ mask = []
1733
+
1734
+ for transformer in transforms_var:
1735
+ rgb_trans = transformer.augment_image(img_var)
1736
+ rgb_trans = rgb_trans.to(dtype=torch_dtype, device=torch_device)
1737
+ model_output = net(rgb_trans)
1738
+ deaug_mask = transformer.deaugment_mask(model_output)
1739
+ mask.append(deaug_mask)
1740
+
1741
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
1742
+ prediction = prediction.sigmoid()
1743
+ prediction = to_pil(prediction.data.squeeze(0).cpu())
1744
+ prediction = prediction.resize((w_, h_), Image.BILINEAR)
1745
+ prediction.save(output_mask_path)
1746
+
1747
+
1748
+ def do_infer_modular_cv_4(input_image_path, output_mask_path, net,
1749
+ all_transforms):
1750
+
1751
+ (img_transform, depth_transform, target_transform, to_pil,
1752
+ transforms_var) = all_transforms
1753
+
1754
+ img = load_image(input_image_path)
1755
+ w_, h_ = img.shape[0], img.shape[1]
1756
+
1757
+ with torch.no_grad():
1758
+
1759
+ img_resize = torch.from_numpy(img)
1760
+ img_resize = img_resize.to(dtype=torch.float32)
1761
+ img_resize /= 255.0
1762
+ img_resize = img_resize.unsqueeze(0)
1763
+
1764
+ img_resize = rearrange(img_resize, 'B H W C -> B C H W')
1765
+
1766
+ img_resize = torch.nn.functional.interpolate(input=img_resize,
1767
+ size=(1024, 1024),
1768
+ mode='bicubic',
1769
+ antialias=True)
1770
+
1771
+ img_resize = img_resize.squeeze(0)
1772
+ img_var = img_transform(img_resize)
1773
+ img_var = img_var.unsqueeze(0)
1774
+ img_var = Variable(img_var)
1775
+ img_var = img_var.to(dtype=torch_dtype, device=torch_device)
1776
+
1777
+ mask = []
1778
+
1779
+ for transformer in transforms_var:
1780
+ rgb_trans = transformer.augment_image(img_var)
1781
+ rgb_trans = rgb_trans.to(dtype=torch_dtype, device=torch_device)
1782
+ model_output = net(rgb_trans)
1783
+ deaug_mask = transformer.deaugment_mask(model_output)
1784
+ mask.append(deaug_mask)
1785
+
1786
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
1787
+ prediction = prediction.sigmoid()
1788
+ prediction = to_pil(prediction.data.squeeze(0).cpu())
1789
+ prediction = prediction.resize((w_, h_), Image.BILINEAR)
1790
+ prediction.save(output_mask_path)
1791
+
1792
+
1793
+ def do_infer_modular_cv_5(input_image_path, output_mask_path, net,
1794
+ all_transforms):
1795
+
1796
+ (img_transform, depth_transform, target_transform, to_pil,
1797
+ transforms_var) = all_transforms
1798
+
1799
+ img = load_image(input_image_path)
1800
+ w_, h_ = img.shape[0], img.shape[1]
1801
+
1802
+ with torch.no_grad():
1803
+
1804
+ img_resize = torch.from_numpy(img)
1805
+ img_resize = img_resize.to(dtype=torch.float32)
1806
+ img_resize /= 255.0
1807
+ img_resize = img_resize.unsqueeze(0)
1808
+
1809
+ img_resize = rearrange(img_resize, 'B H W C -> B C H W')
1810
+
1811
+ img_resize = torch.nn.functional.interpolate(input=img_resize,
1812
+ size=(1024, 1024),
1813
+ mode='bicubic',
1814
+ antialias=True)
1815
+
1816
+ img_var = img_transform(img_resize)
1817
+ img_var = Variable(img_var)
1818
+ img_var = img_var.to(dtype=torch_dtype, device=torch_device)
1819
+
1820
+ mask = []
1821
+
1822
+ for transformer in transforms_var:
1823
+ rgb_trans = transformer.augment_image(img_var)
1824
+ rgb_trans = rgb_trans.to(dtype=torch_dtype, device=torch_device)
1825
+ model_output = net(rgb_trans)
1826
+ deaug_mask = transformer.deaugment_mask(model_output)
1827
+ mask.append(deaug_mask)
1828
+
1829
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
1830
+ prediction = prediction.sigmoid()
1831
+ prediction = to_pil(prediction.data.squeeze(0).cpu())
1832
+ prediction = prediction.resize((w_, h_), Image.BILINEAR)
1833
+ prediction.save(output_mask_path)
1834
+
1835
+
1836
+ def do_infer_modular_cv_6(input_image_path, output_mask_path, net,
1837
+ all_transforms):
1838
+
1839
+ (img_transform, depth_transform, target_transform, to_pil,
1840
+ transforms_var) = all_transforms
1841
+
1842
+ img = load_image(input_image_path)
1843
+ w_, h_ = img.shape[0], img.shape[1]
1844
+
1845
+ with torch.no_grad():
1846
+
1847
+ img_resize = torch.from_numpy(img)
1848
+ img_resize = img_resize.to(dtype=torch.float32)
1849
+ img_resize /= 255.0
1850
+ img_resize = img_resize.unsqueeze(0)
1851
+
1852
+ img_resize = rearrange(img_resize, 'B H W C -> B C H W')
1853
+
1854
+ img_resize = torch.nn.functional.interpolate(input=img_resize,
1855
+ size=(1024, 1024),
1856
+ mode='bicubic',
1857
+ antialias=True)
1858
+
1859
+ img_var = img_transform(img_resize)
1860
+ img_var = Variable(img_var)
1861
+ img_var = img_var.to(dtype=torch_dtype, device=torch_device)
1862
+
1863
+ mask = []
1864
+
1865
+ for transformer in transforms_var:
1866
+ rgb_trans = img_var.to(dtype=torch_dtype, device=torch_device)
1867
+ mask.append(net(rgb_trans))
1868
+
1869
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
1870
+ prediction = prediction.sigmoid()
1871
+ prediction = to_pil(prediction.data.squeeze(0).cpu())
1872
+ prediction = prediction.resize((w_, h_), Image.BILINEAR)
1873
+ prediction.save(output_mask_path)
1874
+
1875
+
1876
+ def do_infer_modular_cv_7(input_image_path, output_mask_path, net,
1877
+ all_transforms):
1878
+
1879
+ (img_transform, depth_transform, target_transform, to_pil,
1880
+ transforms_var) = all_transforms
1881
+
1882
+ img = load_image_torch(input_image_path)
1883
+
1884
+ h_, w_ = img.shape[1], img.shape[2]
1885
+
1886
+ with torch.no_grad():
1887
+
1888
+ img = rearrange(img, 'B H W C -> B C H W')
1889
+
1890
+ img_resize = torch.nn.functional.interpolate(input=img,
1891
+ size=(1024, 1024),
1892
+ mode='bicubic',
1893
+ antialias=True)
1894
+
1895
+ img_var = img_transform(img_resize)
1896
+ img_var = Variable(img_var)
1897
+ img_var = img_var.to(dtype=torch_dtype, device=torch_device)
1898
+
1899
+ mask = []
1900
+
1901
+ for transformer in transforms_var:
1902
+ rgb_trans = img_var.to(dtype=torch_dtype, device=torch_device)
1903
+ mask.append(net(rgb_trans))
1904
+
1905
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
1906
+ prediction = prediction.sigmoid()
1907
+ prediction = to_pil(prediction.data.squeeze(0).cpu())
1908
+ prediction = prediction.resize((w_, h_), Image.BILINEAR)
1909
+ prediction.save(output_mask_path)
1910
+ #+end_src
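For reference, a minimal end-to-end call of do_infer_tensor2tensor on an image read with OpenCV. The paths are hypothetical, and the BGR-to-RGB conversion plus the [0, 1] scaling are assumptions about the B H W C layout the function normalises internally (load_image / load_image_torch earlier in this document are the canonical loaders).

#+begin_src python :results output
import cv2
import numpy as np
import torch

bgr = cv2.imread('/path/to/input.jpg', cv2.IMREAD_COLOR)              # hypothetical path
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
img = torch.from_numpy(rgb).unsqueeze(0)                               # (1, H, W, 3) in [0, 1]

net = load_model('/path/to/finetuned_MVANet.pth')                      # hypothetical checkpoint
mask = do_infer_tensor2tensor(img, net)                                # (1, H, W), values in [0, 1]
cv2.imwrite('/path/to/mask.png',
            (mask.squeeze(0).float().cpu().numpy() * 255).astype(np.uint8))
#+end_src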
1911
+
1912
+ ** Function for modular inference
1913
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
1914
+ def do_infer_modular(input_image_path, output_mask_path, net, all_transforms):
1915
+ # net = load_model(finetuned_MVANet_model_path)
1916
+
1917
+ (img_transform, depth_transform, target_transform, to_pil,
1918
+ transforms_var) = all_transforms
1919
+
1920
+ with torch.no_grad():
1921
+ rgb_png_path = input_image_path
1922
+ img = Image.open(rgb_png_path).convert('RGB')
1923
+
1924
+ w_, h_ = img.size
1925
+ # img_resize = img.resize([(w_ // 2) * 2, (h_ // 2) * 2], Image.BILINEAR)
1926
+ img_resize = img.resize([256 * 4, 256 * 4], Image.BILINEAR)
1927
+ # img_resize = img
1928
+ img_var = Variable(img_transform(img_resize).unsqueeze(0)).to(
1929
+ dtype=torch_dtype, device=torch_device)
1930
+ mask = []
1931
+ for transformer in transforms_var:
1932
+ rgb_trans = transformer.augment_image(img_var)
1933
+ rgb_trans = rgb_trans.to(dtype=torch_dtype, device=torch_device)
1934
+ model_output = net(rgb_trans)
1935
+ deaug_mask = transformer.deaugment_mask(model_output)
1936
+ mask.append(deaug_mask)
1937
+
1938
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
1939
+ prediction = prediction.sigmoid()
1940
+ prediction = to_pil(prediction.data.squeeze(0).cpu())
1941
+ prediction = prediction.resize((w_, h_), Image.BILINEAR)
1942
+ prediction.save(output_mask_path)
1943
+ #+end_src
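The do_infer_modular* variants average predictions over test-time augmentation. With the tta.Compose used here (horizontal flip crossed with three scales), each input image goes through six forward passes; a quick way to confirm that, assuming the ttach package is imported as tta as in the import block of this document:

#+begin_src python :results output
import ttach as tta

transforms_var = tta.Compose([
    tta.HorizontalFlip(),
    tta.Scale(scales=[0.75, 1, 1.25], interpolation='bilinear', align_corners=False),
])
# Each element is one augment/deaugment pair: 2 flips x 3 scales = 6 passes per image.
print(len(list(transforms_var)))
#+end_src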
1944
+
1945
+ ** Function for inference
1946
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
1947
+ def do_infer():
1948
+ torch.cuda.set_device(0)
1949
+ args = {'crf_refine': True, 'save_results': True}
1950
+
1951
+ img_transform = transforms.Compose([
1952
+ transforms.ToTensor(),
1953
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
1954
+ ])
1955
+
1956
+ depth_transform = transforms.ToTensor()
1957
+ target_transform = transforms.ToTensor()
1958
+ to_pil = transforms.ToPILImage()
1959
+
1960
+ transforms_var = tta.Compose([
1961
+ tta.HorizontalFlip(),
1962
+ tta.Scale(scales=[0.75, 1, 1.25],
1963
+ interpolation='bilinear',
1964
+ align_corners=False),
1965
+ ])
1966
+
1967
+ net = inf_MVANet().to(dtype=torch_dtype, device=torch_device)
1968
+ pretrained_dict = torch.load(finetuned_MVANet_model_path,
1969
+ map_location=torch_device)
1970
+ model_dict = net.state_dict()
1971
+ pretrained_dict = {
1972
+ k: v
1973
+ for k, v in pretrained_dict.items() if k in model_dict
1974
+ }
1975
+ model_dict.update(pretrained_dict)
1976
+ net.load_state_dict(model_dict)
1977
+ net = net.to(dtype=torch_dtype, device=torch_device)
1978
+ net.eval()
1979
+ with torch.no_grad():
1980
+ rgb_png_path = '/home/asd/DATASETS/SD_BG_SWAP_TEST/comfyui_outputs/4/output_fooocus/bgswap-output.png'
1981
+ img = Image.open(rgb_png_path).convert('RGB')
1982
+ w_, h_ = img.size
1983
+ # img_resize = img.resize([(w_ // 2) * 2, (h_ // 2) * 2], Image.BILINEAR)
1984
+ img_resize = img.resize([256 * 4 , 256 * 4 ], Image.BILINEAR)
1985
+ # img_resize = img
1986
+ img_var = Variable(img_transform(img_resize).unsqueeze(0))
1987
+ img_var = img_var.cuda()
1988
+ mask = []
1989
+ for transformer in transforms_var:
1990
+ rgb_trans = transformer.augment_image(img_var)
1991
+ rgb_trans = rgb_trans.to(dtype=torch_dtype, device=torch_device)
1992
+ model_output = net(rgb_trans)
1993
+ deaug_mask = transformer.deaugment_mask(model_output)
1994
+ mask.append(deaug_mask)
1995
+
1996
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
1997
+ prediction = prediction.sigmoid()
1998
+ prediction = to_pil(prediction.data.squeeze(0).cpu())
1999
+ prediction = prediction.resize((w_, h_), Image.BILINEAR)
2000
+ prediction.save('./tmp.png')
2001
+ #+end_src
2002
+
2003
+ ** MVANet_inference function
2004
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.function.py
2005
+ def main(item):
2006
+ net = inf_MVANet().cuda()
2007
+ pretrained_dict = torch.load(os.path.join(ckpt_path, item + '.pth'),
2008
+ map_location='cuda')
2009
+ model_dict = net.state_dict()
2010
+ pretrained_dict = {
2011
+ k: v
2012
+ for k, v in pretrained_dict.items() if k in model_dict
2013
+ }
2014
+ model_dict.update(pretrained_dict)
2015
+ net.load_state_dict(model_dict)
2016
+ net.eval()
2017
+ with torch.no_grad():
2018
+ for name, root in to_test.items():
2019
+ root1 = os.path.join(root, 'images')
2020
+ img_list = [os.path.splitext(f) for f in os.listdir(root1)]
2021
+ for idx, img_name in enumerate(img_list):
2022
+
2023
+ print('predicting for %s: %d / %d' %
2024
+ (name, idx + 1, len(img_list)))
2025
+ rgb_png_path = os.path.join(root, 'images',
2026
+ img_name[0] + '.png')
2027
+ rgb_jpg_path = os.path.join(root, 'images',
2028
+ img_name[0] + '.jpg')
2029
+ if os.path.exists(rgb_png_path):
2030
+ img = Image.open(rgb_png_path).convert('RGB')
2031
+ else:
2032
+ img = Image.open(rgb_jpg_path).convert('RGB')
2033
+ w_, h_ = img.size
2034
+ img_resize = img.resize([1024, 1024], Image.BILINEAR)
2035
+ img_var = Variable(img_transform(img_resize).unsqueeze(0))
2036
+ img_var = img_var.cuda()
2037
+ mask = []
2038
+ for transformer in transforms_var:
2039
+ rgb_trans = transformer.augment_image(img_var)
2040
+ model_output = net(rgb_trans)
2041
+ deaug_mask = transformer.deaugment_mask(model_output)
2042
+ mask.append(deaug_mask)
2043
+
2044
+ prediction = torch.mean(torch.stack(mask, dim=0), dim=0)
2045
+ prediction = prediction.sigmoid()
2046
+ prediction = to_pil(prediction.data.squeeze(0))
2047
+ prediction = prediction.resize((w_, h_), Image.BILINEAR)
2048
+ if args['save_results']:
2049
+ check_mkdir(os.path.join(ckpt_path, item, name))
2050
+ prediction.save(
2051
+ os.path.join(ckpt_path, item, name,
2052
+ img_name[0] + '.png'))
2053
+ #+end_src
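main() expects several module-level names (ckpt_path, to_test, args, img_transform, transforms_var, to_pil, check_mkdir) that are defined in the import and function blocks earlier in this document. A hypothetical configuration, shown only to make the expected directory layout explicit:

#+begin_src python :results output
# Hypothetical values; the real ones live in the import block of this document.
ckpt_path = './saved_model/MVANet'          # main() loads '<ckpt_path>/<item>.pth'
to_test = {'DIS-VD': '/path/to/DIS-VD'}     # every entry must contain an 'images/' folder
args = {'crf_refine': True, 'save_results': True}
# Predictions are written to '<ckpt_path>/<item>/<dataset_name>/<image_name>.png'.
#+end_src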
2054
+
2055
+ ** MVANet_inference execute
2056
+ #+begin_src python :shebang #!/usr/bin/python3 :results output :tangle ./MVANet_inference.execute.py
2057
+ def do_merge(path_image, path_mask, path_out):
2058
+ image = cv2.imread(path_image, cv2.IMREAD_COLOR)
2059
+ mask = cv2.imread(path_mask, cv2.IMREAD_GRAYSCALE)
2060
+ mask = (mask > 127).astype(dtype=np.uint8) * 255
2061
+ out = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8)
2062
+ out[:, :, 0:3] = image
2063
+ out[:, :, 3] = mask
2064
+ cv2.imwrite(path_out, out)
2065
+
2066
+
2067
+ if __name__ == '__main__':
2068
+
2069
+ # do_infer_modular_cv(
2070
+ # input_image_path=
2071
+ # '/home/asd/DATASETS/SD_BG_SWAP_TEST/comfyui_outputs/4/output_fooocus/bgswap-output.png',
2072
+ # output_mask_path='./tmp.png',
2073
+ # net=load_model(finetuned_MVANet_model_path),
2074
+ # all_transforms=load_transforms(),
2075
+ # )
2076
+
2077
+ # net = load_model(
2078
+ # HOME_DIR + '/dreambooth_experiments/MVANet/MVANet_cloth_segment_14.pth')
2079
+
2080
+ # net = load_model(
2081
+ # HOME_DIR +
2082
+ # '/dreambooth_experiments/MVANet/new_type_crop_with_midshot.pth')
2083
+
2084
+ # net = load_model('/home/asd/MODEL_CHECKPOINTS/MVANet/SKIN_SEGMENTATION/1/Model_4.pth')
2085
+
2086
+ net = load_model('/home/asd/MODEL_CHECKPOINTS/MVANet/SKIN_SEGMENTATION/3/Model_14.pth')
2087
+
2088
+
2089
+ # net = load_model(HOME_DIR +
2090
+ # '/dreambooth_experiments/MVANet/mvanet_normal_crop_2.pth')
2091
+
2092
+ DATA_DIR_BASE = HOME_DIR + '/DATASETS/cloth_segmentation_test_images.dir/cloth_segmentation_test_images/'
2093
+
2094
+ images = (
2095
+ '1370', '1371', '1372', '1373', '1374', '1375', '1376', '1377', '1378',
2096
+ '1379', '1380', '1381', '1382', '1383', '1384', '1385', '1386', '1387',
2097
+ '1388', '1389', '1390', '1391', '1392', '1393', '1394', '1395', '1396',
2098
+ '1397', '1398', '1399', '1400', '1401', '1402', '1403', '1404', '1405',
2099
+ '1406', '1407', '1408', '1409', '1410', '1411', '1412', '1413', '1414',
2100
+ '1415', '1539', '1541', '1542', '1543', '17320', '4129', '4190',
2101
+ '4191', '4192', '4193', '4202', '4203', '4204', '4207', '4208', '4209',
2102
+ '4210', '4213', '4214', '4221', '4222', '4223', '4224', '4225', '4226',
2103
+ '4227', '4228', '4229', '4230', '4231', '4232', '4233', '4234', '4235',
2104
+ '4236', '4237', '4238', '4239', '4240', '4241', '4242', '4251', '4252',
2105
+ '4253', '4254', '4255', '4256', '4257', '4258', '4259', '4260', '4261',
2106
+ '4262', '4263', '4264', '6581', '6642', '6647', '6656', '6660', '6690',
2107
+ '6696', '6724', '6767', '6771', '6788', '6791', '6807', '6821', '6824',
2108
+ '6833', '6847', '6850', '6879', '6941', '7001', '7070', '7083', '7092',
2109
+ '7093', '7119', '7191', '7220', '7252', '7264', '7276', '7278', '7281',
2110
+ '7290', '7301', '7312', '7340', '7398', '7404', '7412', '7429', '7439',
2111
+ '7478', '7491', '7631', '7687', '7699', '7719', '7770', '7784', '7793',
2112
+ '7811', '7829', '7861', '7864', '7868', '7980', '7987', '7990', '8069',
2113
+ '8083', '8100', '8108', '8227', '8323', '8329', '8358', '8383', '8401',
2114
+ '8415', '8488', '8515', '8518', '8560', '8565', '8595', '8639', '8676',
2115
+ '8690', '8691', '8701', '8703', '8723', '8726', '8756', '8783', '8801',
2116
+ '8820', '8826', '8842', '8865', '8874', '8875', '8882', '8911', '8946',
2117
+ '8947', '8969', '8979', '8983')
2118
+
2119
+ masks = [DATA_DIR_BASE + i + '/garment_mask.png' for i in images]
2120
+ out = [DATA_DIR_BASE + i + '/garment_transparent.png' for i in images]
2121
+
2122
+ images = [DATA_DIR_BASE + i + '/original.jpg' for i in images]
2123
+
2124
+ for i in range(len(images)):
2125
+ image = images[i]
2126
+ image = load_image_torch(image)
2127
+ mask = do_infer_tensor2tensor(image, net)
2128
+ save_mask_torch(output_image_path=masks[i], mask=mask)
2129
+ do_merge(path_image=images[i], path_mask=masks[i], path_out=out[i])
2130
+
2131
+ # img = load_image_torch(
2132
+ # '/home/asd/DATASETS/SD_BG_SWAP_TEST/comfyui_outputs/4/output_fooocus/bgswap-output.png'
2133
+ # )
2134
+ # # all_transforms = load_transforms()
2135
+ # masks = do_infer_tensor2tensor(img, net)
2136
+ # save_mask_torch(output_image_path='./tmp.png', mask=masks)
2137
+ #+end_src
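To eyeball the RGBA cut-outs written by do_merge, it is handy to composite them over a flat colour; a small self-contained sketch (file names hypothetical, colours in OpenCV's BGR order):

#+begin_src python :results output
import cv2
import numpy as np

def preview_over_background(path_rgba, path_out, bg=(255, 255, 255)):
    # Alpha-composite an RGBA cut-out over a solid background colour (BGR order).
    rgba = cv2.imread(path_rgba, cv2.IMREAD_UNCHANGED).astype(np.float32)
    alpha = rgba[:, :, 3:4] / 255.0
    background = np.ones_like(rgba[:, :, :3]) * np.array(bg, dtype=np.float32)
    composite = rgba[:, :, :3] * alpha + background * (1.0 - alpha)
    cv2.imwrite(path_out, composite.astype(np.uint8))

# preview_over_background('garment_transparent.png', 'garment_preview.png')
#+end_src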
2138
+
2139
+ ** MVANet_inference unify
2140
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./MVANet_inference.unify.sh
2141
+ . "${HOME}/dbnew.sh"
2142
+
2143
+ (
2144
+ echo '#!/usr/bin/python3'
2145
+ cat \
2146
+ './MVANet_inference.import.py' \
2147
+ './MVANet_inference.function.py' \
2148
+ './MVANet_inference.class.py' \
2149
+ './MVANet_inference.execute.py' \
2150
+ | expand | yapf3 \
2151
+ | grep -v '#!/usr/bin/python3' \
2152
+ ;
2153
+ ) > './MVANet_inference.py' \
2154
+ ;
2155
+ #+end_src
2156
+
2157
+ ** MVANet_inference run
2158
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./MVANet_inference.run.sh
2159
+ . "${HOME}/dbnew.sh"
2160
+ python3 './MVANet_inference.py'
2161
+ #+end_src
2162
+
2163
+ * WORK SPACE
2164
+
2165
+ ** elisp
2166
+ #+begin_src elisp
2167
+ (save-buffer)
2168
+ (org-babel-tangle)
2169
+ (shell-command "./MVANet_inference.unify.sh")
2170
+ #+end_src
2171
+
2172
+ #+RESULTS:
2173
+ : 0
2174
+
2175
+ ** sh
2176
+ #+begin_src sh :shebang #!/bin/sh :results output
2177
+ realpath .
2178
+ cd /home/asd/GITHUB/aravind-h-v/dreambooth_experiments/MVANet
2179
+ #+end_src
README.md ADDED
@@ -0,0 +1,131 @@
1
+ # Self Correction for Human Parsing
2
+ This is a copy of https://github.com/GoGoDuck912/Self-Correction-Human-Parsing
3
+
4
+
5
+ ![Python 3.6](https://img.shields.io/badge/python-3.6-green.svg)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
7
+
8
+ An out-of-box human parsing representation extractor.
9
+
10
+ Our solution ranks 1st for all human parsing tracks (including single, multiple and video) in the third LIP challenge!
11
+
12
+ ![lip-visualization](./demo/lip-visualization.jpg)
13
+
14
+ Features:
15
+ - [x] Out-of-box human parsing extractor for other downstream applications.
16
+ - [x] Pretrained model on three popular single person human parsing datasets.
17
+ - [x] Training and inference code.
18
+ - [x] Simple yet effective extension on multi-person and video human parsing tasks.
19
+
20
+ ## Requirements
21
+
22
+ ```
23
+ conda env create -f environment.yaml
24
+ conda activate schp
25
+ pip install -r requirements.txt
26
+ ```
27
+
28
+ ## Simple Out-of-Box Extractor
29
+
30
+ The easiest way to get started is to use our trained SCHP models on your own images to extract human parsing representations. Here we provide state-of-the-art [trained models](https://drive.google.com/drive/folders/1uOaQCpNtosIjEL2phQKEdiYd0Td18jNo?usp=sharing) on three popular datasets. These three datasets use different label systems, so you can choose the one that best fits your own task.
31
+
32
+ **LIP** ([exp-schp-201908261155-lip.pth](https://drive.google.com/file/d/1k4dllHpu0bdx38J7H28rVVLpU-kOHmnH/view?usp=sharing))
33
+
34
+ * mIoU on LIP validation: **59.36 %**.
35
+
36
+ * LIP is the largest single-person human parsing dataset, with 50000+ images. This dataset focuses more on complicated real-world scenarios. LIP has 20 labels, including 'Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes', 'Dress', 'Coat', 'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', 'Right-arm', 'Left-leg', 'Right-leg', 'Left-shoe', 'Right-shoe'.
37
+
38
+ **ATR** ([exp-schp-201908301523-atr.pth](https://drive.google.com/file/d/1ruJg4lqR_jgQPj-9K0PP-L2vJERYOxLP/view?usp=sharing))
39
+
40
+ * mIoU on ATR test: **82.29%**.
41
+
42
+ * ATR is a large single-person human parsing dataset, with 17000+ images. This dataset focuses more on fashion AI. ATR has 18 labels, including 'Background', 'Hat', 'Hair', 'Sunglasses', 'Upper-clothes', 'Skirt', 'Pants', 'Dress', 'Belt', 'Left-shoe', 'Right-shoe', 'Face', 'Left-leg', 'Right-leg', 'Left-arm', 'Right-arm', 'Bag', 'Scarf'.
43
+
44
+ **Pascal-Person-Part** ([exp-schp-201908270938-pascal-person-part.pth](https://drive.google.com/file/d/1E5YwNKW2VOEayK9mWCS3Kpsxf-3z04ZE/view?usp=sharing))
45
+
46
+ * mIoU on Pascal-Person-Part validation: **71.46** %.
47
+
48
+ * Pascal Person Part is a tiny single-person human parsing dataset, with 3000+ images. This dataset focuses more on body part segmentation. Pascal Person Part has 7 labels, including 'Background', 'Head', 'Torso', 'Upper Arms', 'Lower Arms', 'Upper Legs', 'Lower Legs'.
49
+
50
+ Choose one and have fun on your own task!
51
+
52
+ To extract the human parsing representation, simply put your own images in the `INPUT_PATH` folder, then download a pretrained model and run the following command. The output images, with the same file names, will be saved in `OUTPUT_PATH`.
53
+
54
+ ```
55
+ python simple_extractor.py --dataset [DATASET] --model-restore [CHECKPOINT_PATH] --input-dir [INPUT_PATH] --output-dir [OUTPUT_PATH]
56
+ ```
57
+
58
+ **[Updated]** Here is also a [colab demo example](https://colab.research.google.com/drive/1JOwOPaChoc9GzyBi5FUEYTSaP2qxJl10?usp=sharing) for quick inference provided by [@levindabhi](https://github.com/levindabhi).
59
+
60
+ The `DATASET` argument has three options: 'lip', 'atr' and 'pascal'. Note that each pixel in the output images denotes the predicted label number, and the output images have the same size as the input ones. For better visualization, a palette is attached to the output images. We suggest reading the images with `PIL`.
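+ For example, the predicted label ids can be read back directly from a paletted output PNG (a minimal sketch; the path is a placeholder):
+
+ ```python
+ from PIL import Image
+ import numpy as np
+
+ parsing = Image.open('OUTPUT_PATH/example.png')  # mode 'P': each pixel stores a label id
+ labels = np.array(parsing)                       # (H, W) array of class indices
+ print(np.unique(labels))                         # label numbers present in the image
+ ```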
61
+
62
+ If you need not only the final parsing images but also the feature map representations, add the `--logits` flag to save the output feature maps. These feature maps are the logits before the softmax layer.
63
+
64
+ ## Dataset Preparation
65
+
66
+ Please download the [LIP](http://sysu-hcp.net/lip/) dataset following the below structure.
67
+
68
+ ```commandline
69
+ data/LIP
70
+ |--- train_images # 30462 training single person images
71
+ |--- val_images # 10000 validation single person images
72
+ |--- train_segmentations # 30462 training annotations
73
+ |--- val_segmentations # 10000 validation annotations
74
+ |--- train_id.txt # training image list
75
+ |--- val_id.txt # validation image list
76
+ ```
77
+
78
+ ## Training
79
+
80
+ ```
81
+ python train.py
82
+ ```
83
+ By default, the trained model will be saved in the `./log` directory. Please read the arguments for more details.
84
+
85
+ ## Evaluation
86
+ ```
87
+ python evaluate.py --model-restore [CHECKPOINT_PATH]
88
+ ```
89
+ CHECKPOINT_PATH should be the path of the trained model.
90
+
91
+ ## Extension on Multiple Human Parsing
92
+
93
+ Please read [MultipleHumanParsing.md](./mhp_extension/README.md) for more details.
94
+
95
+ ## Citation
96
+
97
+ Please cite our work if you find this repo useful in your research.
98
+
99
+ ```latex
100
+ @article{li2020self,
101
+ title={Self-Correction for Human Parsing},
102
+ author={Li, Peike and Xu, Yunqiu and Wei, Yunchao and Yang, Yi},
103
+ journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
104
+ year={2020},
105
+ doi={10.1109/TPAMI.2020.3048039}}
106
+ ```
107
+
108
+ ## Visualization
109
+
110
+ * Source Image.
111
+ ![demo](./demo/demo.jpg)
112
+ * LIP Parsing Result.
113
+ ![demo-lip](./demo/demo_lip.png)
114
+ * ATR Parsing Result.
115
+ ![demo-atr](./demo/demo_atr.png)
116
+ * Pascal-Person-Part Parsing Result.
117
+ ![demo-pascal](./demo/demo_pascal.png)
118
+ * Source Image.
119
+ ![demo](./mhp_extension/demo/demo.jpg)
120
+ * Instance Human Mask.
121
+ ![demo-lip](./mhp_extension/demo/demo_instance_human_mask.png)
122
+ * Global Human Parsing Result.
123
+ ![demo-lip](./mhp_extension/demo/demo_global_human_parsing.png)
124
+ * Multiple Human Parsing Result.
125
+ ![demo-lip](./mhp_extension/demo/demo_multiple_human_parsing.png)
126
+
127
+
128
+ ## Related
129
+ Our code adopts the [InplaceSyncBN](https://github.com/mapillary/inplace_abn) to save gpu memory cost.
130
+
131
+ There is also a [PaddlePaddle](https://github.com/PaddlePaddle/PaddleSeg/tree/develop/contrib/ACE2P) Implementation of this project.
checkpoints/AEMatter/AEM_RWA.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a475193549365ff3c892a85d2f4ca90ece2ac8dc4de4a39df250c76ca870d280
3
+ size 205399637
checkpoints/MVANet/garment.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7604ed46e06fbcff3b8f38c8934d253617171d02aecdd028f0f01086d9344893
3
+ size 380785263
checkpoints/MVANet/skin.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c71afcdd9cb1be73e43d84f5ffc2ae12b4964cc13c8460fc0adb6d52a0603cd4
3
+ size 380782803
checkpoints/Model_80.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffec20a382b0a1832786438475e8b912a03be727a0e3197e7ab039153fb3bc46
3
+ size 386621643
checkpoints/StableDiffusion/90c7c97574f8db765509b6a5d2e7b2551b430a10cac03e37d368654eac5e8169cd149644d188be4b5b2f1b9f29e66b64a02535f622f2bf284c319b076224cb2b ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:010be7341cd98a136da775330ba3eb4e87025c6cfd2f5455dc64daee2200ae98
3
+ size 7105348616
checkpoints/StableDiffusion/b970812225cfb95427c13e73b75eef66430e2a525876dddac494d70fe4ed0524cb197043e0ac3dc3026b32a45cd1d6d126ec2fe74a5bc3ef5df21836ca022b30 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1689257e6e1b2e61544b1a41fc114e7d798f68854b3f875cd52070bfe1fbc00
3
+ size 6938072258
checkpoints/StableDiffusion/hash ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 90c7c97574f8db765509b6a5d2e7b2551b430a10cac03e37d368654eac5e8169cd149644d188be4b5b2f1b9f29e66b64a02535f622f2bf284c319b076224cb2b Juggernaut_X_RunDiffusion_Hyper.safetensors
2
+ b970812225cfb95427c13e73b75eef66430e2a525876dddac494d70fe4ed0524cb197043e0ac3dc3026b32a45cd1d6d126ec2fe74a5bc3ef5df21836ca022b30 juggernautXL_versionXInpaint.safetensors
checkpoints/atr.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9d7c91ce3b4e7133df56b599fc817b533e3439c5e8d282a59126d2fda339a2a
3
+ size 267445237
checkpoints/lip.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24fa3254ceeb74c8435458994a64b522fb439a3635b7b86ff470457e0413da00
3
+ size 267449349
checkpoints/pascal.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b03d343c39fb0696f75d45c44c67b2fc23f5d0bf0925a82c0465e415799fa85
3
+ size 267422621
datasets/__init__.py ADDED
File without changes
datasets/datasets.py ADDED
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : datasets.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import os
15
+ import numpy as np
16
+ import random
17
+ import torch
18
+ import cv2
19
+ from torch.utils import data
20
+ from utils.transforms import get_affine_transform
21
+
22
+
23
+ class LIPDataSet(data.Dataset):
24
+ def __init__(self, root, dataset, crop_size=[473, 473], scale_factor=0.25,
25
+ rotation_factor=30, ignore_label=255, transform=None):
26
+ self.root = root
27
+ self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
28
+ self.crop_size = np.asarray(crop_size)
29
+ self.ignore_label = ignore_label
30
+ self.scale_factor = scale_factor
31
+ self.rotation_factor = rotation_factor
32
+ self.flip_prob = 0.5
33
+ self.transform = transform
34
+ self.dataset = dataset
35
+
36
+ list_path = os.path.join(self.root, self.dataset + '_id.txt')
37
+ train_list = [i_id.strip() for i_id in open(list_path)]
38
+
39
+ self.train_list = train_list
40
+ self.number_samples = len(self.train_list)
41
+
42
+ def __len__(self):
43
+ return self.number_samples
44
+
45
+ def _box2cs(self, box):
46
+ x, y, w, h = box[:4]
47
+ return self._xywh2cs(x, y, w, h)
48
+
49
+ def _xywh2cs(self, x, y, w, h):
50
+ center = np.zeros((2), dtype=np.float32)
51
+ center[0] = x + w * 0.5
52
+ center[1] = y + h * 0.5
53
+ if w > self.aspect_ratio * h:
54
+ h = w * 1.0 / self.aspect_ratio
55
+ elif w < self.aspect_ratio * h:
56
+ w = h * self.aspect_ratio
57
+ scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
58
+ return center, scale
59
+
60
+ def __getitem__(self, index):
61
+ train_item = self.train_list[index]
62
+
63
+ im_path = os.path.join(self.root, self.dataset + '_images', train_item + '.jpg')
64
+ parsing_anno_path = os.path.join(self.root, self.dataset + '_segmentations', train_item + '.png')
65
+
66
+ im = cv2.imread(im_path, cv2.IMREAD_COLOR)
67
+ h, w, _ = im.shape
68
+ parsing_anno = np.zeros((h, w), dtype=np.int64)  # np.long was removed in recent NumPy versions
69
+
70
+ # Get person center and scale
71
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
72
+ r = 0
73
+
74
+ if self.dataset != 'test':
75
+ # Get pose annotation
76
+ parsing_anno = cv2.imread(parsing_anno_path, cv2.IMREAD_GRAYSCALE)
77
+ if self.dataset == 'train' or self.dataset == 'trainval':
78
+ sf = self.scale_factor
79
+ rf = self.rotation_factor
80
+ s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
81
+ r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if random.random() <= 0.6 else 0
82
+
83
+ if random.random() <= self.flip_prob:
84
+ im = im[:, ::-1, :]
85
+ parsing_anno = parsing_anno[:, ::-1]
86
+ person_center[0] = im.shape[1] - person_center[0] - 1
87
+ right_idx = [15, 17, 19]
88
+ left_idx = [14, 16, 18]
89
+ for i in range(0, 3):
90
+ right_pos = np.where(parsing_anno == right_idx[i])
91
+ left_pos = np.where(parsing_anno == left_idx[i])
92
+ parsing_anno[right_pos[0], right_pos[1]] = left_idx[i]
93
+ parsing_anno[left_pos[0], left_pos[1]] = right_idx[i]
94
+
95
+ trans = get_affine_transform(person_center, s, r, self.crop_size)
96
+ input = cv2.warpAffine(
97
+ im,
98
+ trans,
99
+ (int(self.crop_size[1]), int(self.crop_size[0])),
100
+ flags=cv2.INTER_LINEAR,
101
+ borderMode=cv2.BORDER_CONSTANT,
102
+ borderValue=(0, 0, 0))
103
+
104
+ if self.transform:
105
+ input = self.transform(input)
106
+
107
+ meta = {
108
+ 'name': train_item,
109
+ 'center': person_center,
110
+ 'height': h,
111
+ 'width': w,
112
+ 'scale': s,
113
+ 'rotation': r
114
+ }
115
+
116
+ if self.dataset == 'val' or self.dataset == 'test':
117
+ return input, meta
118
+ else:
119
+ label_parsing = cv2.warpAffine(
120
+ parsing_anno,
121
+ trans,
122
+ (int(self.crop_size[1]), int(self.crop_size[0])),
123
+ flags=cv2.INTER_NEAREST,
124
+ borderMode=cv2.BORDER_CONSTANT,
125
+ borderValue=(255))
126
+
127
+ label_parsing = torch.from_numpy(label_parsing)
128
+
129
+ return input, label_parsing, meta
130
+
131
+
132
+ class LIPDataValSet(data.Dataset):
133
+ def __init__(self, root, dataset='val', crop_size=[473, 473], transform=None, flip=False):
134
+ self.root = root
135
+ self.crop_size = crop_size
136
+ self.transform = transform
137
+ self.flip = flip
138
+ self.dataset = dataset
139
+ self.root = root
140
+ self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
141
+ self.crop_size = np.asarray(crop_size)
142
+
143
+ list_path = os.path.join(self.root, self.dataset + '_id.txt')
144
+ val_list = [i_id.strip() for i_id in open(list_path)]
145
+
146
+ self.val_list = val_list
147
+ self.number_samples = len(self.val_list)
148
+
149
+ def __len__(self):
150
+ return len(self.val_list)
151
+
152
+ def _box2cs(self, box):
153
+ x, y, w, h = box[:4]
154
+ return self._xywh2cs(x, y, w, h)
155
+
156
+ def _xywh2cs(self, x, y, w, h):
157
+ center = np.zeros((2), dtype=np.float32)
158
+ center[0] = x + w * 0.5
159
+ center[1] = y + h * 0.5
160
+ if w > self.aspect_ratio * h:
161
+ h = w * 1.0 / self.aspect_ratio
162
+ elif w < self.aspect_ratio * h:
163
+ w = h * self.aspect_ratio
164
+ scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
165
+
166
+ return center, scale
167
+
168
+ def __getitem__(self, index):
169
+ val_item = self.val_list[index]
170
+ # Load training image
171
+ im_path = os.path.join(self.root, self.dataset + '_images', val_item + '.jpg')
172
+ im = cv2.imread(im_path, cv2.IMREAD_COLOR)
173
+ h, w, _ = im.shape
174
+ # Get person center and scale
175
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
176
+ r = 0
177
+ trans = get_affine_transform(person_center, s, r, self.crop_size)
178
+ input = cv2.warpAffine(
179
+ im,
180
+ trans,
181
+ (int(self.crop_size[1]), int(self.crop_size[0])),
182
+ flags=cv2.INTER_LINEAR,
183
+ borderMode=cv2.BORDER_CONSTANT,
184
+ borderValue=(0, 0, 0))
185
+ input = self.transform(input)
186
+ flip_input = input.flip(dims=[-1])
187
+ if self.flip:
188
+ batch_input_im = torch.stack([input, flip_input])
189
+ else:
190
+ batch_input_im = input
191
+
192
+ meta = {
193
+ 'name': val_item,
194
+ 'center': person_center,
195
+ 'height': h,
196
+ 'width': w,
197
+ 'scale': s,
198
+ 'rotation': r
199
+ }
200
+
201
+ return batch_input_im, meta
datasets/simple_extractor_dataset.py ADDED
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : dataset.py
8
+ @Time : 8/30/19 9:12 PM
9
+ @Desc : Dataset Definition
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import os
15
+ import cv2
16
+ import numpy as np
17
+
18
+ from torch.utils import data
19
+ from utils.transforms import get_affine_transform
20
+
21
+
22
+ class SimpleFolderDataset(data.Dataset):
23
+ def __init__(self, root, input_size=[512, 512], transform=None):
24
+ self.root = root
25
+ self.input_size = input_size
26
+ self.transform = transform
27
+ self.aspect_ratio = input_size[1] * 1.0 / input_size[0]
28
+ self.input_size = np.asarray(input_size)
29
+
30
+ self.file_list = os.listdir(self.root)
31
+
32
+ def __len__(self):
33
+ return len(self.file_list)
34
+
35
+ def _box2cs(self, box):
36
+ x, y, w, h = box[:4]
37
+ return self._xywh2cs(x, y, w, h)
38
+
39
+ def _xywh2cs(self, x, y, w, h):
40
+ center = np.zeros((2), dtype=np.float32)
41
+ center[0] = x + w * 0.5
42
+ center[1] = y + h * 0.5
43
+ if w > self.aspect_ratio * h:
44
+ h = w * 1.0 / self.aspect_ratio
45
+ elif w < self.aspect_ratio * h:
46
+ w = h * self.aspect_ratio
47
+ scale = np.array([w, h], dtype=np.float32)
48
+ return center, scale
49
+
50
+ def __getitem__(self, index):
51
+ img_name = self.file_list[index]
52
+ img_path = os.path.join(self.root, img_name)
53
+ img = cv2.imread(img_path, cv2.IMREAD_COLOR)
54
+ h, w, _ = img.shape
55
+
56
+ # Get person center and scale
57
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
58
+ r = 0
59
+ trans = get_affine_transform(person_center, s, r, self.input_size)
60
+ input = cv2.warpAffine(
61
+ img,
62
+ trans,
63
+ (int(self.input_size[1]), int(self.input_size[0])),
64
+ flags=cv2.INTER_LINEAR,
65
+ borderMode=cv2.BORDER_CONSTANT,
66
+ borderValue=(0, 0, 0))
67
+
68
+ input = self.transform(input)
69
+ meta = {
70
+ 'name': img_name,
71
+ 'center': person_center,
72
+ 'height': h,
73
+ 'width': w,
74
+ 'scale': s,
75
+ 'rotation': r
76
+ }
77
+
78
+ return input, meta
datasets/target_generation.py ADDED
@@ -0,0 +1,40 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+
5
+ def generate_edge_tensor(label, edge_width=3):
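+ # Marks a pixel as edge (1) wherever its label differs from a neighbouring pixel
+ # (four neighbour directions are checked), ignoring the 255 ignore-label, and then
+ # thickens the edge map by convolving with an edge_width x edge_width ones kernel.
+ # Note that the padding is hard-coded to 1, so the output matches the input size
+ # only for the default edge_width=3.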
6
+ label = label.type(torch.cuda.FloatTensor)
7
+ if len(label.shape) == 2:
8
+ label = label.unsqueeze(0)
9
+ n, h, w = label.shape
10
+ edge = torch.zeros(label.shape, dtype=torch.float).cuda()
11
+ # right
12
+ edge_right = edge[:, 1:h, :]
13
+ edge_right[(label[:, 1:h, :] != label[:, :h - 1, :]) & (label[:, 1:h, :] != 255)
14
+ & (label[:, :h - 1, :] != 255)] = 1
15
+
16
+ # up
17
+ edge_up = edge[:, :, :w - 1]
18
+ edge_up[(label[:, :, :w - 1] != label[:, :, 1:w])
19
+ & (label[:, :, :w - 1] != 255)
20
+ & (label[:, :, 1:w] != 255)] = 1
21
+
22
+ # upright
23
+ edge_upright = edge[:, :h - 1, :w - 1]
24
+ edge_upright[(label[:, :h - 1, :w - 1] != label[:, 1:h, 1:w])
25
+ & (label[:, :h - 1, :w - 1] != 255)
26
+ & (label[:, 1:h, 1:w] != 255)] = 1
27
+
28
+ # bottomright
29
+ edge_bottomright = edge[:, :h - 1, 1:w]
30
+ edge_bottomright[(label[:, :h - 1, 1:w] != label[:, 1:h, :w - 1])
31
+ & (label[:, :h - 1, 1:w] != 255)
32
+ & (label[:, 1:h, :w - 1] != 255)] = 1
33
+
34
+ kernel = torch.ones((1, 1, edge_width, edge_width), dtype=torch.float).cuda()
35
+ with torch.no_grad():
36
+ edge = edge.unsqueeze(1)
37
+ edge = F.conv2d(edge, kernel, stride=1, padding=1)
38
+ edge[edge!=0] = 1
39
+ edge = edge.squeeze()
40
+ return edge
demo/demo.jpg ADDED

Git LFS Details

  • SHA256: 6871c209cc202232323f309bbdec6ef9c2834aedaa3aef3f50293c4e783f0fec
  • Pointer size: 131 Bytes
  • Size of remote file: 310 kB
demo/demo_atr.png ADDED
demo/demo_lip.png ADDED
demo/demo_pascal.png ADDED
demo/lip-visualization.jpg ADDED

Git LFS Details

  • SHA256: d311b9ac4871d4e05a6b29953b13d6431afb269514571992267ef7038953bf1d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.56 MB
environment.yaml ADDED
@@ -0,0 +1,49 @@
1
+ name: schp
2
+ channels:
3
+ - pytorch
4
+ - defaults
5
+ dependencies:
6
+ - _libgcc_mutex=0.1=main
7
+ - blas=1.0=mkl
8
+ - ca-certificates=2020.12.8=h06a4308_0
9
+ - certifi=2020.12.5=py38h06a4308_0
10
+ - cudatoolkit=10.1.243=h6bb024c_0
11
+ - freetype=2.10.4=h5ab3b9f_0
12
+ - intel-openmp=2020.2=254
13
+ - jpeg=9b=h024ee3a_2
14
+ - lcms2=2.11=h396b838_0
15
+ - ld_impl_linux-64=2.33.1=h53a641e_7
16
+ - libedit=3.1.20191231=h14c3975_1
17
+ - libffi=3.3=he6710b0_2
18
+ - libgcc-ng=9.1.0=hdf63c60_0
19
+ - libpng=1.6.37=hbc83047_0
20
+ - libstdcxx-ng=9.1.0=hdf63c60_0
21
+ - libtiff=4.1.0=h2733197_1
22
+ - lz4-c=1.9.2=heb0550a_3
23
+ - mkl=2020.2=256
24
+ - mkl-service=2.3.0=py38he904b0f_0
25
+ - mkl_fft=1.2.0=py38h23d657b_0
26
+ - mkl_random=1.1.1=py38h0573a6f_0
27
+ - ncurses=6.2=he6710b0_1
28
+ - ninja=1.10.2=py38hff7bd54_0
29
+ - numpy=1.19.2=py38h54aff64_0
30
+ - numpy-base=1.19.2=py38hfa32c7d_0
31
+ - olefile=0.46=py_0
32
+ - openssl=1.1.1i=h27cfd23_0
33
+ - pillow=8.0.1=py38he98fc37_0
34
+ - pip=20.3.3=py38h06a4308_0
35
+ - python=3.8.5=h7579374_1
36
+ - readline=8.0=h7b6447c_0
37
+ - setuptools=51.0.0=py38h06a4308_2
38
+ - six=1.15.0=py38h06a4308_0
39
+ - sqlite=3.33.0=h62c20be_0
40
+ - tk=8.6.10=hbc83047_0
41
+ - tqdm=4.55.0=pyhd3eb1b0_0
42
+ - wheel=0.36.2=pyhd3eb1b0_0
43
+ - xz=5.2.5=h7b6447c_0
44
+ - zlib=1.2.11=h7b6447c_3
45
+ - zstd=1.4.5=h9ceee32_0
46
+ - pytorch=1.5.1=py3.8_cuda10.1.243_cudnn7.6.3_0
47
+ - torchvision=0.6.1=py38_cu101
48
+ prefix: /home/peike/opt/anaconda3/envs/schp
49
+
evaluate.py ADDED
@@ -0,0 +1,209 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : evaluate.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import os
15
+ import argparse
16
+ import numpy as np
17
+ import torch
18
+
19
+ from torch.utils import data
20
+ from tqdm import tqdm
21
+ from PIL import Image as PILImage
22
+ import torchvision.transforms as transforms
23
+ import torch.backends.cudnn as cudnn
24
+
25
+ import networks
26
+ from datasets.datasets import LIPDataValSet
27
+ from utils.miou import compute_mean_ioU
28
+ from utils.transforms import BGR2RGB_transform
29
+ from utils.transforms import transform_parsing
30
+
31
+
32
+ def get_arguments():
33
+ """Parse all the arguments provided from the CLI.
34
+
35
+ Returns:
36
+ A list of parsed arguments.
37
+ """
38
+ parser = argparse.ArgumentParser(description="Self Correction for Human Parsing")
39
+
40
+ # Network Structure
41
+ parser.add_argument("--arch", type=str, default='resnet101')
42
+ # Data Preference
43
+ parser.add_argument("--data-dir", type=str, default='./data/LIP')
44
+ parser.add_argument("--batch-size", type=int, default=1)
45
+ parser.add_argument("--input-size", type=str, default='473,473')
46
+ parser.add_argument("--num-classes", type=int, default=20)
47
+ parser.add_argument("--ignore-label", type=int, default=255)
48
+ parser.add_argument("--random-mirror", action="store_true")
49
+ parser.add_argument("--random-scale", action="store_true")
50
+ # Evaluation Preference
51
+ parser.add_argument("--log-dir", type=str, default='./log')
52
+ parser.add_argument("--model-restore", type=str, default='./log/checkpoint.pth.tar')
53
+ parser.add_argument("--gpu", type=str, default='0', help="choose gpu device.")
54
+ parser.add_argument("--save-results", action="store_true", help="whether to save the results.")
55
+ parser.add_argument("--flip", action="store_true", help="random flip during the test.")
56
+ parser.add_argument("--multi-scales", type=str, default='1', help="multiple scales during the test")
57
+ return parser.parse_args()
58
+
59
+
60
+ def get_palette(num_cls):
61
+ """ Returns the color map for visualizing the segmentation mask.
62
+ Args:
63
+ num_cls: Number of classes
64
+ Returns:
65
+ The color map
66
+ """
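+ # The bit-interleaving below reproduces the VOC-style colour map,
+ # e.g. label 0 -> (0, 0, 0), 1 -> (128, 0, 0), 2 -> (0, 128, 0), 3 -> (128, 128, 0).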
67
+ n = num_cls
68
+ palette = [0] * (n * 3)
69
+ for j in range(0, n):
70
+ lab = j
71
+ palette[j * 3 + 0] = 0
72
+ palette[j * 3 + 1] = 0
73
+ palette[j * 3 + 2] = 0
74
+ i = 0
75
+ while lab:
76
+ palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
77
+ palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
78
+ palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
79
+ i += 1
80
+ lab >>= 3
81
+ return palette
82
+
83
+
84
+ def multi_scale_testing(model, batch_input_im, crop_size=[473, 473], flip=True, multi_scales=[1]):
85
+ flipped_idx = (15, 14, 17, 16, 19, 18)
86
+ if len(batch_input_im.shape) > 4:
87
+ batch_input_im = batch_input_im.squeeze()
88
+ if len(batch_input_im.shape) == 3:
89
+ batch_input_im = batch_input_im.unsqueeze(0)
90
+
91
+ interp = torch.nn.Upsample(size=crop_size, mode='bilinear', align_corners=True)
92
+ ms_outputs = []
93
+ for s in multi_scales:
94
+ interp_im = torch.nn.Upsample(scale_factor=s, mode='bilinear', align_corners=True)
95
+ scaled_im = interp_im(batch_input_im)
96
+ parsing_output = model(scaled_im)
97
+ parsing_output = parsing_output[0][-1]
98
+ output = parsing_output[0]
99
+ if flip:
100
+ flipped_output = parsing_output[1]
101
+ flipped_output[14:20, :, :] = flipped_output[flipped_idx, :, :]
102
+ output += flipped_output.flip(dims=[-1])
103
+ output *= 0.5
104
+ output = interp(output.unsqueeze(0))
105
+ ms_outputs.append(output[0])
106
+ ms_fused_parsing_output = torch.stack(ms_outputs)
107
+ ms_fused_parsing_output = ms_fused_parsing_output.mean(0)
108
+ ms_fused_parsing_output = ms_fused_parsing_output.permute(1, 2, 0) # HWC
109
+ parsing = torch.argmax(ms_fused_parsing_output, dim=2)
110
+ parsing = parsing.data.cpu().numpy()
111
+ ms_fused_parsing_output = ms_fused_parsing_output.data.cpu().numpy()
112
+ return parsing, ms_fused_parsing_output
113
+
114
+
115
+ def main():
116
+ """Create the model and start the evaluation process."""
117
+ args = get_arguments()
118
+ multi_scales = [float(i) for i in args.multi_scales.split(',')]
119
+ gpus = [int(i) for i in args.gpu.split(',')]
120
+ assert len(gpus) == 1
121
+ if not args.gpu == 'None':
122
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
123
+
124
+ cudnn.benchmark = True
125
+ cudnn.enabled = True
126
+
127
+ h, w = map(int, args.input_size.split(','))
128
+ input_size = [h, w]
129
+
130
+ model = networks.init_model(args.arch, num_classes=args.num_classes, pretrained=None)
131
+
132
+ IMAGE_MEAN = model.mean
133
+ IMAGE_STD = model.std
134
+ INPUT_SPACE = model.input_space
135
+ print('image mean: {}'.format(IMAGE_MEAN))
136
+ print('image std: {}'.format(IMAGE_STD))
137
+ print('input space:{}'.format(INPUT_SPACE))
138
+ if INPUT_SPACE == 'BGR':
139
+ print('BGR Transformation')
140
+ transform = transforms.Compose([
141
+ transforms.ToTensor(),
142
+ transforms.Normalize(mean=IMAGE_MEAN,
143
+ std=IMAGE_STD),
144
+
145
+ ])
146
+ if INPUT_SPACE == 'RGB':
147
+ print('RGB Transformation')
148
+ transform = transforms.Compose([
149
+ transforms.ToTensor(),
150
+ BGR2RGB_transform(),
151
+ transforms.Normalize(mean=IMAGE_MEAN,
152
+ std=IMAGE_STD),
153
+ ])
154
+
155
+ # Data loader
156
+ lip_test_dataset = LIPDataValSet(args.data_dir, 'val', crop_size=input_size, transform=transform, flip=args.flip)
157
+ num_samples = len(lip_test_dataset)
158
+ print('Total number of testing samples: {}'.format(num_samples))
159
+ testloader = data.DataLoader(lip_test_dataset, batch_size=args.batch_size, shuffle=False, pin_memory=True)
160
+
161
+ # Load model weight
162
+ state_dict = torch.load(args.model_restore)['state_dict']
163
+ from collections import OrderedDict
164
+ new_state_dict = OrderedDict()
165
+ for k, v in state_dict.items():
166
+ name = k[7:] # remove `module.`
167
+ new_state_dict[name] = v
168
+ model.load_state_dict(new_state_dict)
169
+ model.cuda()
170
+ model.eval()
171
+
172
+ sp_results_dir = os.path.join(args.log_dir, 'sp_results')
173
+ if not os.path.exists(sp_results_dir):
174
+ os.makedirs(sp_results_dir)
175
+
176
+ palette = get_palette(20)
177
+ parsing_preds = []
178
+ scales = np.zeros((num_samples, 2), dtype=np.float32)
179
+ centers = np.zeros((num_samples, 2), dtype=np.int32)
180
+ with torch.no_grad():
181
+ for idx, batch in enumerate(tqdm(testloader)):
182
+ image, meta = batch
183
+ if (len(image.shape) > 4):
184
+ image = image.squeeze()
185
+ im_name = meta['name'][0]
186
+ c = meta['center'].numpy()[0]
187
+ s = meta['scale'].numpy()[0]
188
+ w = meta['width'].numpy()[0]
189
+ h = meta['height'].numpy()[0]
190
+ scales[idx, :] = s
191
+ centers[idx, :] = c
192
+ parsing, logits = multi_scale_testing(model, image.cuda(), crop_size=input_size, flip=args.flip,
193
+ multi_scales=multi_scales)
194
+ if args.save_results:
195
+ parsing_result = transform_parsing(parsing, c, s, w, h, input_size)
196
+ parsing_result_path = os.path.join(sp_results_dir, im_name + '.png')
197
+ output_im = PILImage.fromarray(np.asarray(parsing_result, dtype=np.uint8))
198
+ output_im.putpalette(palette)
199
+ output_im.save(parsing_result_path)
200
+
201
+ parsing_preds.append(parsing)
202
+ assert len(parsing_preds) == num_samples
203
+ mIoU = compute_mean_ioU(parsing_preds, scales, centers, args.num_classes, args.data_dir, input_size)
204
+ print(mIoU)
205
+ return
206
+
207
+
208
+ if __name__ == '__main__':
209
+ main()
main.org ADDED
@@ -0,0 +1,663 @@
1
+ * COMMENT WORK SPACE
2
+ cd $HOME/HUGGINGFACE/aravindhv10/Self-Correction-Human-Parsing
3
+
4
+ ** ELISP
5
+ #+begin_src elisp
6
+ (save-buffer)
7
+ (save-some-buffers)
8
+ (org-babel-tangle)
9
+ (shell-command "./work.sh" "output_log_work")
10
+ #+end_src
11
+
12
+ #+RESULTS:
13
+ : 0
14
+
15
+ ** ELISP
16
+ #+begin_src elisp
17
+ (shell-command "git status" "output_log_git_status")
18
+ #+end_src
19
+
20
+ #+RESULTS:
21
+ : 0
22
+
23
+ ** ELISP
24
+ #+begin_src elisp
25
+ (shell-command "./commit_and_push.sh" "output_log_commit_and_push")
26
+ #+end_src
27
+
28
+ #+RESULTS:
29
+ : 0
30
+
31
+ * Commit and push
32
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./commit_and_push.sh
33
+ git commit -m 'Routine updates'
34
+ git push
35
+ #+end_src
36
+
37
+ * Main script to do everything
38
+ #+begin_src sh :shebang #!/bin/sh :results output :tangle ./work.sh
39
+ do_ignore(){
40
+ 'sed' 's@^@/@g' './rm.txt';
41
+ 'cat' './gitignore.txt';
42
+ }
43
+
44
+ do_add(){
45
+ 'sed' 's@^@("git" "lfs" "track" "./@g;s@$@");@g' './git_lfs_track.txt' ;
46
+ 'cat' './git_add.txt' './git_lfs_track.txt' | \
47
+ 'sed' 's@^@("git" "add" "./@g;s@$@");@g' ;
48
+ }
49
+
50
+ do_rm(){
51
+ 'sed' 's@^@("rm" "-vf" "--" "./@g ; s@$@");@g' './rm.txt' ;
52
+ }
53
+
54
+ all_commands(){
55
+ do_add
56
+ do_rm
57
+ }
58
+
59
+ do_all(){
60
+ do_ignore > './.gitignore'
61
+ all_commands | sh
62
+ }
63
+
64
+ do_all
65
+ #+end_src
66
+
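+ For example, the entry checkpoints/atr.pth in ./git_lfs_track.txt is expanded by do_add into the commands ("git" "lfs" "track" "./checkpoints/atr.pth"); and ("git" "add" "./checkpoints/atr.pth");, which all_commands | sh then executes.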
67
+ * List of large files
68
+ #+begin_src conf :tangle ./git_lfs_track.txt
69
+ checkpoints/AEMatter/AEM_RWA.ckpt
70
+ checkpoints/atr.pth
71
+ checkpoints/lip.pth
72
+ checkpoints/Model_80.pth
73
+ checkpoints/MVANet/garment.pth
74
+ checkpoints/MVANet/skin.pth
75
+ checkpoints/pascal.pth
76
+ checkpoints/StableDiffusion/90c7c97574f8db765509b6a5d2e7b2551b430a10cac03e37d368654eac5e8169cd149644d188be4b5b2f1b9f29e66b64a02535f622f2bf284c319b076224cb2b
77
+ checkpoints/StableDiffusion/b970812225cfb95427c13e73b75eef66430e2a525876dddac494d70fe4ed0524cb197043e0ac3dc3026b32a45cd1d6d126ec2fe74a5bc3ef5df21836ca022b30
78
+ demo/demo_atr.png
79
+ demo/demo.jpg
80
+ demo/demo_lip.png
81
+ demo/demo_pascal.png
82
+ demo/lip-visualization.jpg
83
+ #+end_src
84
+
85
+ * List of source files to add
86
+ #+begin_src conf :tangle ./git_add.txt
87
+ checkpoints/StableDiffusion/hash
88
+ ComfyUI_AEMatter/AEMatter.py
89
+ ComfyUI_AEMatter/AEMatter.run.sh
90
+ ComfyUI_AEMatter/__init__.py
91
+ ComfyUI_AEMatter/README.org
92
+ ComfyUI_MVANet/download.sh
93
+ ComfyUI_MVANet/__init__.py
94
+ ComfyUI_MVANet/MVANet_inference.py
95
+ ComfyUI_MVANet/MVANet_inference.run.sh
96
+ ComfyUI_MVANet/README.org
97
+ ComfyUI_MVANet/requirements.txt
98
+ datasets/datasets.py
99
+ datasets/__init__.py
100
+ datasets/simple_extractor_dataset.py
101
+ datasets/target_generation.py
102
+ environment.yaml
103
+ evaluate.py
104
+ .gitattributes
105
+ .gitignore
106
+ LICENSE
107
+ main.org
108
+ mhp_extension/coco_style_annotation_creator/human_to_coco.py
109
+ mhp_extension/coco_style_annotation_creator/pycococreatortools.py
110
+ mhp_extension/coco_style_annotation_creator/test_human2coco_format.py
111
+ mhp_extension/demo.ipynb
112
+ mhp_extension/detectron2/.circleci/config.yml
113
+ mhp_extension/detectron2/.clang-format
114
+ mhp_extension/detectron2/configs/Base-RCNN-C4.yaml
115
+ mhp_extension/detectron2/configs/Base-RCNN-DilatedC5.yaml
116
+ mhp_extension/detectron2/configs/Base-RCNN-FPN.yaml
117
+ mhp_extension/detectron2/configs/Base-RetinaNet.yaml
118
+ mhp_extension/detectron2/configs/Cityscapes/mask_rcnn_R_50_FPN.yaml
119
+ mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml
120
+ mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml
121
+ mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml
122
+ mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml
123
+ mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml
124
+ mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml
125
+ mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml
126
+ mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml
127
+ mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml
128
+ mhp_extension/detectron2/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml
129
+ mhp_extension/detectron2/configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml
130
+ mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml
131
+ mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml
132
+ mhp_extension/detectron2/configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml
133
+ mhp_extension/detectron2/configs/COCO-Detection/rpn_R_50_C4_1x.yaml
134
+ mhp_extension/detectron2/configs/COCO-Detection/rpn_R_50_FPN_1x.yaml
135
+ mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml
136
+ mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml
137
+ mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml
138
+ mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml
139
+ mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml
140
+ mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml
141
+ mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml
142
+ mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
143
+ mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml
144
+ mhp_extension/detectron2/configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml
145
+ mhp_extension/detectron2/configs/COCO-Keypoints/Base-Keypoint-RCNN-FPN.yaml
146
+ mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml
147
+ mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml
148
+ mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml
149
+ mhp_extension/detectron2/configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml
150
+ mhp_extension/detectron2/configs/COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml
151
+ mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml
152
+ mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml
153
+ mhp_extension/detectron2/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml
154
+ mhp_extension/detectron2/configs/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml
155
+ mhp_extension/detectron2/configs/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml
156
+ mhp_extension/detectron2/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml
157
+ mhp_extension/detectron2/configs/Detectron1-Comparisons/README.md
158
+ mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml
159
+ mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
160
+ mhp_extension/detectron2/configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml
161
+ mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml
162
+ mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml
163
+ mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv_parsing.yaml
164
+ mhp_extension/detectron2/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml
165
+ mhp_extension/detectron2/configs/Misc/demo.yaml
166
+ mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml
167
+ mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml
168
+ mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml
169
+ mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml
170
+ mhp_extension/detectron2/configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml
171
+ mhp_extension/detectron2/configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml
172
+ mhp_extension/detectron2/configs/Misc/parsing_finetune_cihp.yaml
173
+ mhp_extension/detectron2/configs/Misc/parsing_inference.yaml
174
+ mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml
175
+ mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml
176
+ mhp_extension/detectron2/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml
177
+ mhp_extension/detectron2/configs/Misc/semantic_R_50_FPN_1x.yaml
178
+ mhp_extension/detectron2/configs/my_Base-RCNN-FPN.yaml
179
+ mhp_extension/detectron2/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml
180
+ mhp_extension/detectron2/configs/PascalVOC-Detection/faster_rcnn_R_50_FPN.yaml
181
+ mhp_extension/detectron2/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_inference_acc_test.yaml
182
+ mhp_extension/detectron2/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_instant_test.yaml
183
+ mhp_extension/detectron2/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml
184
+ mhp_extension/detectron2/configs/quick_schedules/fast_rcnn_R_50_FPN_instant_test.yaml
185
+ mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml
186
+ mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_instant_test.yaml
187
+ mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml
188
+ mhp_extension/detectron2/configs/quick_schedules/keypoint_rcnn_R_50_FPN_training_acc_test.yaml
189
+ mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_GCV_instant_test.yaml
190
+ mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml
191
+ mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_instant_test.yaml
192
+ mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_C4_training_acc_test.yaml
193
+ mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml
194
+ mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml
195
+ mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_instant_test.yaml
196
+ mhp_extension/detectron2/configs/quick_schedules/mask_rcnn_R_50_FPN_training_acc_test.yaml
197
+ mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml
198
+ mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_instant_test.yaml
199
+ mhp_extension/detectron2/configs/quick_schedules/panoptic_fpn_R_50_training_acc_test.yaml
200
+ mhp_extension/detectron2/configs/quick_schedules/README.md
201
+ mhp_extension/detectron2/configs/quick_schedules/retinanet_R_50_FPN_inference_acc_test.yaml
202
+ mhp_extension/detectron2/configs/quick_schedules/retinanet_R_50_FPN_instant_test.yaml
203
+ mhp_extension/detectron2/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml
204
+ mhp_extension/detectron2/configs/quick_schedules/rpn_R_50_FPN_instant_test.yaml
205
+ mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_inference_acc_test.yaml
206
+ mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_instant_test.yaml
207
+ mhp_extension/detectron2/configs/quick_schedules/semantic_R_50_FPN_training_acc_test.yaml
208
+ mhp_extension/detectron2/demo/demo.py
209
+ mhp_extension/detectron2/demo/predictor.py
210
+ mhp_extension/detectron2/demo/README.md
211
+ mhp_extension/detectron2/detectron2/checkpoint/c2_model_loading.py
212
+ mhp_extension/detectron2/detectron2/checkpoint/catalog.py
213
+ mhp_extension/detectron2/detectron2/checkpoint/detection_checkpoint.py
214
+ mhp_extension/detectron2/detectron2/checkpoint/__init__.py
215
+ mhp_extension/detectron2/detectron2/config/compat.py
216
+ mhp_extension/detectron2/detectron2/config/config.py
217
+ mhp_extension/detectron2/detectron2/config/defaults.py
218
+ mhp_extension/detectron2/detectron2/config/__init__.py
219
+ mhp_extension/detectron2/detectron2/data/build.py
220
+ mhp_extension/detectron2/detectron2/data/catalog.py
221
+ mhp_extension/detectron2/detectron2/data/common.py
222
+ mhp_extension/detectron2/detectron2/data/dataset_mapper.py
223
+ mhp_extension/detectron2/detectron2/data/datasets/builtin_meta.py
224
+ mhp_extension/detectron2/detectron2/data/datasets/builtin.py
225
+ mhp_extension/detectron2/detectron2/data/datasets/cityscapes.py
226
+ mhp_extension/detectron2/detectron2/data/datasets/coco.py
227
+ mhp_extension/detectron2/detectron2/data/datasets/__init__.py
228
+ mhp_extension/detectron2/detectron2/data/datasets/lvis.py
229
+ mhp_extension/detectron2/detectron2/data/datasets/lvis_v0_5_categories.py
230
+ mhp_extension/detectron2/detectron2/data/datasets/pascal_voc.py
231
+ mhp_extension/detectron2/detectron2/data/datasets/README.md
232
+ mhp_extension/detectron2/detectron2/data/datasets/register_coco.py
233
+ mhp_extension/detectron2/detectron2/data/detection_utils.py
234
+ mhp_extension/detectron2/detectron2/data/__init__.py
235
+ mhp_extension/detectron2/detectron2/data/samplers/distributed_sampler.py
236
+ mhp_extension/detectron2/detectron2/data/samplers/grouped_batch_sampler.py
237
+ mhp_extension/detectron2/detectron2/data/samplers/__init__.py
238
+ mhp_extension/detectron2/detectron2/data/transforms/__init__.py
239
+ mhp_extension/detectron2/detectron2/data/transforms/transform_gen.py
240
+ mhp_extension/detectron2/detectron2/data/transforms/transform.py
241
+ mhp_extension/detectron2/detectron2/engine/defaults.py
242
+ mhp_extension/detectron2/detectron2/engine/hooks.py
243
+ mhp_extension/detectron2/detectron2/engine/__init__.py
244
+ mhp_extension/detectron2/detectron2/engine/launch.py
245
+ mhp_extension/detectron2/detectron2/engine/train_loop.py
246
+ mhp_extension/detectron2/detectron2/evaluation/cityscapes_evaluation.py
247
+ mhp_extension/detectron2/detectron2/evaluation/coco_evaluation.py
248
+ mhp_extension/detectron2/detectron2/evaluation/evaluator.py
249
+ mhp_extension/detectron2/detectron2/evaluation/__init__.py
250
+ mhp_extension/detectron2/detectron2/evaluation/lvis_evaluation.py
251
+ mhp_extension/detectron2/detectron2/evaluation/panoptic_evaluation.py
252
+ mhp_extension/detectron2/detectron2/evaluation/pascal_voc_evaluation.py
253
+ mhp_extension/detectron2/detectron2/evaluation/rotated_coco_evaluation.py
254
+ mhp_extension/detectron2/detectron2/evaluation/sem_seg_evaluation.py
255
+ mhp_extension/detectron2/detectron2/evaluation/testing.py
256
+ mhp_extension/detectron2/detectron2/export/api.py
257
+ mhp_extension/detectron2/detectron2/export/c10.py
258
+ mhp_extension/detectron2/detectron2/export/caffe2_export.py
259
+ mhp_extension/detectron2/detectron2/export/caffe2_inference.py
260
+ mhp_extension/detectron2/detectron2/export/caffe2_modeling.py
261
+ mhp_extension/detectron2/detectron2/export/__init__.py
262
+ mhp_extension/detectron2/detectron2/export/patcher.py
263
+ mhp_extension/detectron2/detectron2/export/README.md
264
+ mhp_extension/detectron2/detectron2/export/shared.py
265
+ mhp_extension/detectron2/detectron2/__init__.py
266
+ mhp_extension/detectron2/detectron2/layers/batch_norm.py
267
+ mhp_extension/detectron2/detectron2/layers/blocks.py
268
+ mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
269
+ mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
270
+ mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
271
+ mhp_extension/detectron2/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
272
+ mhp_extension/detectron2/detectron2/layers/csrc/cuda_version.cu
273
+ mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv_cuda.cu
274
+ mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu
275
+ mhp_extension/detectron2/detectron2/layers/csrc/deformable/deform_conv.h
276
+ mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
277
+ mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
278
+ mhp_extension/detectron2/detectron2/layers/csrc/nms_rotated/nms_rotated.h
279
+ mhp_extension/detectron2/detectron2/layers/csrc/README.md
280
+ mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp
281
+ mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu
282
+ mhp_extension/detectron2/detectron2/layers/csrc/ROIAlign/ROIAlign.h
283
+ mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
284
+ mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
285
+ mhp_extension/detectron2/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
286
+ mhp_extension/detectron2/detectron2/layers/csrc/vision.cpp
287
+ mhp_extension/detectron2/detectron2/layers/deform_conv.py
288
+ mhp_extension/detectron2/detectron2/layers/__init__.py
289
+ mhp_extension/detectron2/detectron2/layers/mask_ops.py
290
+ mhp_extension/detectron2/detectron2/layers/nms.py
291
+ mhp_extension/detectron2/detectron2/layers/roi_align.py
292
+ mhp_extension/detectron2/detectron2/layers/roi_align_rotated.py
293
+ mhp_extension/detectron2/detectron2/layers/rotated_boxes.py
294
+ mhp_extension/detectron2/detectron2/layers/shape_spec.py
295
+ mhp_extension/detectron2/detectron2/layers/wrappers.py
296
+ mhp_extension/detectron2/detectron2/modeling/anchor_generator.py
297
+ mhp_extension/detectron2/detectron2/modeling/backbone/backbone.py
298
+ mhp_extension/detectron2/detectron2/modeling/backbone/build.py
299
+ mhp_extension/detectron2/detectron2/modeling/backbone/fpn.py
300
+ mhp_extension/detectron2/detectron2/modeling/backbone/__init__.py
301
+ mhp_extension/detectron2/detectron2/modeling/backbone/resnet.py
302
+ mhp_extension/detectron2/detectron2/modeling/box_regression.py
303
+ mhp_extension/detectron2/detectron2/modeling/__init__.py
304
+ mhp_extension/detectron2/detectron2/modeling/matcher.py
305
+ mhp_extension/detectron2/detectron2/modeling/meta_arch/build.py
306
+ mhp_extension/detectron2/detectron2/modeling/meta_arch/__init__.py
307
+ mhp_extension/detectron2/detectron2/modeling/meta_arch/panoptic_fpn.py
308
+ mhp_extension/detectron2/detectron2/modeling/meta_arch/rcnn.py
309
+ mhp_extension/detectron2/detectron2/modeling/meta_arch/retinanet.py
310
+ mhp_extension/detectron2/detectron2/modeling/meta_arch/semantic_seg.py
311
+ mhp_extension/detectron2/detectron2/modeling/poolers.py
312
+ mhp_extension/detectron2/detectron2/modeling/postprocessing.py
313
+ mhp_extension/detectron2/detectron2/modeling/proposal_generator/build.py
314
+ mhp_extension/detectron2/detectron2/modeling/proposal_generator/__init__.py
315
+ mhp_extension/detectron2/detectron2/modeling/proposal_generator/proposal_utils.py
316
+ mhp_extension/detectron2/detectron2/modeling/proposal_generator/rpn_outputs.py
317
+ mhp_extension/detectron2/detectron2/modeling/proposal_generator/rpn.py
318
+ mhp_extension/detectron2/detectron2/modeling/proposal_generator/rrpn.py
319
+ mhp_extension/detectron2/detectron2/modeling/roi_heads/box_head.py
320
+ mhp_extension/detectron2/detectron2/modeling/roi_heads/cascade_rcnn.py
321
+ mhp_extension/detectron2/detectron2/modeling/roi_heads/fast_rcnn.py
322
+ mhp_extension/detectron2/detectron2/modeling/roi_heads/__init__.py
323
+ mhp_extension/detectron2/detectron2/modeling/roi_heads/keypoint_head.py
324
+ mhp_extension/detectron2/detectron2/modeling/roi_heads/mask_head.py
325
+ mhp_extension/detectron2/detectron2/modeling/roi_heads/roi_heads.py
326
+ mhp_extension/detectron2/detectron2/modeling/roi_heads/rotated_fast_rcnn.py
327
+ mhp_extension/detectron2/detectron2/modeling/sampling.py
328
+ mhp_extension/detectron2/detectron2/modeling/test_time_augmentation.py
329
+ mhp_extension/detectron2/detectron2/model_zoo/__init__.py
330
+ mhp_extension/detectron2/detectron2/model_zoo/model_zoo.py
331
+ mhp_extension/detectron2/detectron2/solver/build.py
332
+ mhp_extension/detectron2/detectron2/solver/__init__.py
333
+ mhp_extension/detectron2/detectron2/solver/lr_scheduler.py
334
+ mhp_extension/detectron2/detectron2/structures/boxes.py
335
+ mhp_extension/detectron2/detectron2/structures/image_list.py
336
+ mhp_extension/detectron2/detectron2/structures/__init__.py
337
+ mhp_extension/detectron2/detectron2/structures/instances.py
338
+ mhp_extension/detectron2/detectron2/structures/keypoints.py
339
+ mhp_extension/detectron2/detectron2/structures/masks.py
340
+ mhp_extension/detectron2/detectron2/structures/rotated_boxes.py
341
+ mhp_extension/detectron2/detectron2/utils/analysis.py
342
+ mhp_extension/detectron2/detectron2/utils/collect_env.py
343
+ mhp_extension/detectron2/detectron2/utils/colormap.py
344
+ mhp_extension/detectron2/detectron2/utils/comm.py
345
+ mhp_extension/detectron2/detectron2/utils/env.py
346
+ mhp_extension/detectron2/detectron2/utils/events.py
347
+ mhp_extension/detectron2/detectron2/utils/__init__.py
348
+ mhp_extension/detectron2/detectron2/utils/logger.py
349
+ mhp_extension/detectron2/detectron2/utils/memory.py
350
+ mhp_extension/detectron2/detectron2/utils/README.md
351
+ mhp_extension/detectron2/detectron2/utils/registry.py
352
+ mhp_extension/detectron2/detectron2/utils/serialize.py
353
+ mhp_extension/detectron2/detectron2/utils/video_visualizer.py
354
+ mhp_extension/detectron2/detectron2/utils/visualizer.py
355
+ mhp_extension/detectron2/dev/linter.sh
356
+ mhp_extension/detectron2/dev/packaging/build_all_wheels.sh
357
+ mhp_extension/detectron2/dev/packaging/build_wheel.sh
358
+ mhp_extension/detectron2/dev/packaging/gen_wheel_index.sh
359
+ mhp_extension/detectron2/dev/packaging/pkg_helpers.bash
360
+ mhp_extension/detectron2/dev/packaging/README.md
361
+ mhp_extension/detectron2/dev/parse_results.sh
362
+ mhp_extension/detectron2/dev/README.md
363
+ mhp_extension/detectron2/dev/run_inference_tests.sh
364
+ mhp_extension/detectron2/dev/run_instant_tests.sh
365
+ mhp_extension/detectron2/docker/docker-compose.yml
366
+ mhp_extension/detectron2/docker/Dockerfile
367
+ mhp_extension/detectron2/docker/Dockerfile-circleci
368
+ mhp_extension/detectron2/docker/README.md
369
+ mhp_extension/detectron2/docs/conf.py
370
+ mhp_extension/detectron2/docs/.gitignore
371
+ mhp_extension/detectron2/docs/index.rst
372
+ mhp_extension/detectron2/docs/Makefile
373
+ mhp_extension/detectron2/docs/modules/checkpoint.rst
374
+ mhp_extension/detectron2/docs/modules/config.rst
375
+ mhp_extension/detectron2/docs/modules/data.rst
376
+ mhp_extension/detectron2/docs/modules/engine.rst
377
+ mhp_extension/detectron2/docs/modules/evaluation.rst
378
+ mhp_extension/detectron2/docs/modules/export.rst
379
+ mhp_extension/detectron2/docs/modules/index.rst
380
+ mhp_extension/detectron2/docs/modules/layers.rst
381
+ mhp_extension/detectron2/docs/modules/modeling.rst
382
+ mhp_extension/detectron2/docs/modules/model_zoo.rst
383
+ mhp_extension/detectron2/docs/modules/solver.rst
384
+ mhp_extension/detectron2/docs/modules/structures.rst
385
+ mhp_extension/detectron2/docs/modules/utils.rst
386
+ mhp_extension/detectron2/docs/notes/benchmarks.md
387
+ mhp_extension/detectron2/docs/notes/changelog.md
388
+ mhp_extension/detectron2/docs/notes/compatibility.md
389
+ mhp_extension/detectron2/docs/notes/contributing.md
390
+ mhp_extension/detectron2/docs/notes/index.rst
391
+ mhp_extension/detectron2/docs/README.md
392
+ mhp_extension/detectron2/docs/tutorials/builtin_datasets.md
393
+ mhp_extension/detectron2/docs/tutorials/configs.md
394
+ mhp_extension/detectron2/docs/tutorials/data_loading.md
395
+ mhp_extension/detectron2/docs/tutorials/datasets.md
396
+ mhp_extension/detectron2/docs/tutorials/deployment.md
397
+ mhp_extension/detectron2/docs/tutorials/evaluation.md
398
+ mhp_extension/detectron2/docs/tutorials/extend.md
399
+ mhp_extension/detectron2/docs/tutorials/getting_started.md
400
+ mhp_extension/detectron2/docs/tutorials/index.rst
401
+ mhp_extension/detectron2/docs/tutorials/install.md
402
+ mhp_extension/detectron2/docs/tutorials/models.md
403
+ mhp_extension/detectron2/docs/tutorials/README.md
404
+ mhp_extension/detectron2/docs/tutorials/training.md
405
+ mhp_extension/detectron2/docs/tutorials/write-models.md
406
+ mhp_extension/detectron2/.flake8
407
+ mhp_extension/detectron2/GETTING_STARTED.md
408
+ mhp_extension/detectron2/.gitignore
409
+ mhp_extension/detectron2/INSTALL.md
410
+ mhp_extension/detectron2/LICENSE
411
+ mhp_extension/detectron2/MODEL_ZOO.md
412
+ mhp_extension/detectron2/projects/DensePose/apply_net.py
413
+ mhp_extension/detectron2/projects/DensePose/configs/Base-DensePose-RCNN-FPN.yaml
414
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml
415
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml
416
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml
417
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml
418
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x.yaml
419
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml
420
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml
421
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml
422
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml
423
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml
424
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml
425
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml
426
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml
427
+ mhp_extension/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml
428
+ mhp_extension/detectron2/projects/DensePose/configs/evolution/Base-RCNN-FPN-MC.yaml
429
+ mhp_extension/detectron2/projects/DensePose/configs/evolution/faster_rcnn_R_50_FPN_1x_MC.yaml
430
+ mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_DL_instant_test.yaml
431
+ mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml
432
+ mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml
433
+ mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml
434
+ mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_TTA_inference_acc_test.yaml
435
+ mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC1_instant_test.yaml
436
+ mhp_extension/detectron2/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC2_instant_test.yaml
437
+ mhp_extension/detectron2/projects/DensePose/densepose/config.py
438
+ mhp_extension/detectron2/projects/DensePose/densepose/data/build.py
439
+ mhp_extension/detectron2/projects/DensePose/densepose/data/dataset_mapper.py
440
+ mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/builtin.py
441
+ mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/coco.py
442
+ mhp_extension/detectron2/projects/DensePose/densepose/data/datasets/__init__.py
443
+ mhp_extension/detectron2/projects/DensePose/densepose/data/__init__.py
444
+ mhp_extension/detectron2/projects/DensePose/densepose/data/structures.py
445
+ mhp_extension/detectron2/projects/DensePose/densepose/densepose_coco_evaluation.py
446
+ mhp_extension/detectron2/projects/DensePose/densepose/densepose_head.py
447
+ mhp_extension/detectron2/projects/DensePose/densepose/evaluator.py
448
+ mhp_extension/detectron2/projects/DensePose/densepose/__init__.py
449
+ mhp_extension/detectron2/projects/DensePose/densepose/modeling/test_time_augmentation.py
450
+ mhp_extension/detectron2/projects/DensePose/densepose/roi_head.py
451
+ mhp_extension/detectron2/projects/DensePose/densepose/utils/dbhelper.py
452
+ mhp_extension/detectron2/projects/DensePose/densepose/utils/logger.py
453
+ mhp_extension/detectron2/projects/DensePose/densepose/utils/transform.py
454
+ mhp_extension/detectron2/projects/DensePose/densepose/vis/base.py
455
+ mhp_extension/detectron2/projects/DensePose/densepose/vis/bounding_box.py
456
+ mhp_extension/detectron2/projects/DensePose/densepose/vis/densepose.py
457
+ mhp_extension/detectron2/projects/DensePose/densepose/vis/extractor.py
458
+ mhp_extension/detectron2/projects/DensePose/dev/README.md
459
+ mhp_extension/detectron2/projects/DensePose/dev/run_inference_tests.sh
460
+ mhp_extension/detectron2/projects/DensePose/dev/run_instant_tests.sh
461
+ mhp_extension/detectron2/projects/DensePose/doc/GETTING_STARTED.md
462
+ mhp_extension/detectron2/projects/DensePose/doc/MODEL_ZOO.md
463
+ mhp_extension/detectron2/projects/DensePose/doc/TOOL_APPLY_NET.md
464
+ mhp_extension/detectron2/projects/DensePose/doc/TOOL_QUERY_DB.md
465
+ mhp_extension/detectron2/projects/DensePose/query_db.py
466
+ mhp_extension/detectron2/projects/DensePose/README.md
467
+ mhp_extension/detectron2/projects/DensePose/tests/common.py
468
+ mhp_extension/detectron2/projects/DensePose/tests/test_model_e2e.py
469
+ mhp_extension/detectron2/projects/DensePose/tests/test_setup.py
470
+ mhp_extension/detectron2/projects/DensePose/tests/test_structures.py
471
+ mhp_extension/detectron2/projects/DensePose/train_net.py
472
+ mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/Base-PointRend-RCNN-FPN.yaml
473
+ mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes.yaml
474
+ mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml
475
+ mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml
476
+ mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_parsing.yaml
477
+ mhp_extension/detectron2/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_X_101_32x8d_FPN_3x_parsing.yaml
478
+ mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/Base-PointRend-Semantic-FPN.yaml
479
+ mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml
480
+ mhp_extension/detectron2/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_50_FPN_1x_coco.yaml
481
+ mhp_extension/detectron2/projects/PointRend/finetune_net.py
482
+ mhp_extension/detectron2/projects/PointRend/logs/hadoop.kylin.libdfs.log
483
+ mhp_extension/detectron2/projects/PointRend/point_rend/coarse_mask_head.py
484
+ mhp_extension/detectron2/projects/PointRend/point_rend/color_augmentation.py
485
+ mhp_extension/detectron2/projects/PointRend/point_rend/config.py
486
+ mhp_extension/detectron2/projects/PointRend/point_rend/dataset_mapper.py
487
+ mhp_extension/detectron2/projects/PointRend/point_rend/__init__.py
488
+ mhp_extension/detectron2/projects/PointRend/point_rend/point_features.py
489
+ mhp_extension/detectron2/projects/PointRend/point_rend/point_head.py
490
+ mhp_extension/detectron2/projects/PointRend/point_rend/roi_heads.py
491
+ mhp_extension/detectron2/projects/PointRend/point_rend/semantic_seg.py
492
+ mhp_extension/detectron2/projects/PointRend/README.md
493
+ mhp_extension/detectron2/projects/PointRend/run.sh
494
+ mhp_extension/detectron2/projects/PointRend/train_net.py
495
+ mhp_extension/detectron2/projects/README.md
496
+ mhp_extension/detectron2/projects/TensorMask/configs/Base-TensorMask.yaml
497
+ mhp_extension/detectron2/projects/TensorMask/configs/tensormask_R_50_FPN_1x.yaml
498
+ mhp_extension/detectron2/projects/TensorMask/configs/tensormask_R_50_FPN_6x.yaml
499
+ mhp_extension/detectron2/projects/TensorMask/README.md
500
+ mhp_extension/detectron2/projects/TensorMask/setup.py
501
+ mhp_extension/detectron2/projects/TensorMask/tensormask/arch.py
502
+ mhp_extension/detectron2/projects/TensorMask/tensormask/config.py
503
+ mhp_extension/detectron2/projects/TensorMask/tensormask/__init__.py
504
+ mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat_cuda.cu
505
+ mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat.h
506
+ mhp_extension/detectron2/projects/TensorMask/tensormask/layers/csrc/vision.cpp
507
+ mhp_extension/detectron2/projects/TensorMask/tensormask/layers/__init__.py
508
+ mhp_extension/detectron2/projects/TensorMask/tensormask/layers/swap_align2nat.py
509
+ mhp_extension/detectron2/projects/TensorMask/tests/__init__.py
510
+ mhp_extension/detectron2/projects/TensorMask/tests/test_swap_align2nat.py
511
+ mhp_extension/detectron2/projects/TensorMask/train_net.py
512
+ mhp_extension/detectron2/projects/TridentNet/configs/Base-TridentNet-Fast-C4.yaml
513
+ mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_101_C4_3x.yaml
514
+ mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_50_C4_1x.yaml
515
+ mhp_extension/detectron2/projects/TridentNet/configs/tridentnet_fast_R_50_C4_3x.yaml
516
+ mhp_extension/detectron2/projects/TridentNet/README.md
517
+ mhp_extension/detectron2/projects/TridentNet/train_net.py
518
+ mhp_extension/detectron2/projects/TridentNet/tridentnet/config.py
519
+ mhp_extension/detectron2/projects/TridentNet/tridentnet/__init__.py
520
+ mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_backbone.py
521
+ mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_conv.py
522
+ mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_rcnn.py
523
+ mhp_extension/detectron2/projects/TridentNet/tridentnet/trident_rpn.py
524
+ mhp_extension/detectron2/README.md
525
+ mhp_extension/detectron2/setup.cfg
526
+ mhp_extension/detectron2/setup.py
527
+ mhp_extension/detectron2/tests/data/__init__.py
528
+ mhp_extension/detectron2/tests/data/test_coco.py
529
+ mhp_extension/detectron2/tests/data/test_detection_utils.py
530
+ mhp_extension/detectron2/tests/data/test_rotation_transform.py
531
+ mhp_extension/detectron2/tests/data/test_sampler.py
532
+ mhp_extension/detectron2/tests/data/test_transforms.py
533
+ mhp_extension/detectron2/tests/__init__.py
534
+ mhp_extension/detectron2/tests/layers/__init__.py
535
+ mhp_extension/detectron2/tests/layers/test_mask_ops.py
536
+ mhp_extension/detectron2/tests/layers/test_nms_rotated.py
537
+ mhp_extension/detectron2/tests/layers/test_roi_align.py
538
+ mhp_extension/detectron2/tests/layers/test_roi_align_rotated.py
539
+ mhp_extension/detectron2/tests/modeling/__init__.py
540
+ mhp_extension/detectron2/tests/modeling/test_anchor_generator.py
541
+ mhp_extension/detectron2/tests/modeling/test_box2box_transform.py
542
+ mhp_extension/detectron2/tests/modeling/test_fast_rcnn.py
543
+ mhp_extension/detectron2/tests/modeling/test_model_e2e.py
544
+ mhp_extension/detectron2/tests/modeling/test_roi_heads.py
545
+ mhp_extension/detectron2/tests/modeling/test_roi_pooler.py
546
+ mhp_extension/detectron2/tests/modeling/test_rpn.py
547
+ mhp_extension/detectron2/tests/README.md
548
+ mhp_extension/detectron2/tests/structures/__init__.py
549
+ mhp_extension/detectron2/tests/structures/test_boxes.py
550
+ mhp_extension/detectron2/tests/structures/test_imagelist.py
551
+ mhp_extension/detectron2/tests/structures/test_instances.py
552
+ mhp_extension/detectron2/tests/structures/test_rotated_boxes.py
553
+ mhp_extension/detectron2/tests/test_checkpoint.py
554
+ mhp_extension/detectron2/tests/test_config.py
555
+ mhp_extension/detectron2/tests/test_export_caffe2.py
556
+ mhp_extension/detectron2/tests/test_model_analysis.py
557
+ mhp_extension/detectron2/tests/test_model_zoo.py
558
+ mhp_extension/detectron2/tests/test_visualizer.py
559
+ mhp_extension/detectron2/tools/analyze_model.py
560
+ mhp_extension/detectron2/tools/benchmark.py
561
+ mhp_extension/detectron2/tools/convert-torchvision-to-d2.py
562
+ mhp_extension/detectron2/tools/deploy/caffe2_converter.py
563
+ mhp_extension/detectron2/tools/deploy/caffe2_mask_rcnn.cpp
564
+ mhp_extension/detectron2/tools/deploy/README.md
565
+ mhp_extension/detectron2/tools/deploy/torchscript_traced_mask_rcnn.cpp
566
+ mhp_extension/detectron2/tools/finetune_net.py
567
+ mhp_extension/detectron2/tools/inference.sh
568
+ mhp_extension/detectron2/tools/plain_train_net.py
569
+ mhp_extension/detectron2/tools/README.md
570
+ mhp_extension/detectron2/tools/run.sh
571
+ mhp_extension/detectron2/tools/train_net.py
572
+ mhp_extension/detectron2/tools/visualize_data.py
573
+ mhp_extension/detectron2/tools/visualize_json_results.py
574
+ mhp_extension/global_local_parsing/global_local_datasets.py
575
+ mhp_extension/global_local_parsing/global_local_evaluate.py
576
+ mhp_extension/global_local_parsing/global_local_train.py
577
+ mhp_extension/global_local_parsing/make_id_list.py
578
+ mhp_extension/logits_fusion.py
579
+ mhp_extension/make_crop_and_mask_w_mask_nms.py
580
+ mhp_extension/README.md
581
+ mhp_extension/scripts/make_coco_style_annotation.sh
582
+ mhp_extension/scripts/make_crop.sh
583
+ mhp_extension/scripts/parsing_fusion.sh
584
+ modules/bn.py
585
+ modules/deeplab.py
586
+ modules/dense.py
587
+ modules/functions.py
588
+ modules/__init__.py
589
+ modules/misc.py
590
+ modules/residual.py
591
+ modules/src/checks.h
592
+ modules/src/inplace_abn.cpp
593
+ modules/src/inplace_abn_cpu.cpp
594
+ modules/src/inplace_abn_cuda.cu
595
+ modules/src/inplace_abn_cuda_half.cu
596
+ modules/src/inplace_abn.h
597
+ modules/src/utils/checks.h
598
+ modules/src/utils/common.h
599
+ modules/src/utils/cuda.cuh
600
+ networks/AugmentCE2P.py
601
+ networks/backbone/mobilenetv2.py
602
+ networks/backbone/resnet.py
603
+ networks/backbone/resnext.py
604
+ networks/context_encoding/aspp.py
605
+ networks/context_encoding/ocnet.py
606
+ networks/context_encoding/psp.py
607
+ networks/__init__.py
608
+ README.md
609
+ requirements.txt
610
+ simple_extractor.py
611
+ training_code/MVANet/README.org
612
+ train.py
613
+ utils/consistency_loss.py
614
+ utils/criterion.py
615
+ utils/encoding.py
616
+ utils/__init__.py
617
+ utils/kl_loss.py
618
+ utils/lovasz_softmax.py
619
+ utils/miou.py
620
+ utils/schp.py
621
+ utils/soft_dice_loss.py
622
+ utils/transforms.py
623
+ utils/warmup_scheduler.py
624
+ MVANet_Inference/README.org
625
+ #+end_src
626
+
627
+ * List of files to remove
628
+ #+begin_src conf :tangle ./rm.txt
629
+ ComfyUI_MVANet/__pycache__/__init__.cpython-310.pyc
630
+ ComfyUI_MVANet/#README.org#
631
+ ComfyUI_MVANet/.#README.org
632
+ ComfyUI_MVANet/README.org~
633
+ ComfyUI_MVANet/.README.org.~undo-tree~
634
+ #main.org#
635
+ .#main.org
636
+ main.org~
637
+ .main.org.~undo-tree~
638
+ .README.md.~undo-tree~
639
+ ComfyUI_MVANet/.#README.org
640
+ ComfyUI_AEMatter/__pycache__/__init__.cpython-310.pyc
641
+ ComfyUI_AEMatter/AEMatter.class.py
642
+ ComfyUI_AEMatter/AEMatter.execute.py
643
+ ComfyUI_AEMatter/AEMatter.function.py
644
+ ComfyUI_AEMatter/AEMatter.import.py
645
+ ComfyUI_MVANet/MVANet_inference.class.py
646
+ ComfyUI_MVANet/MVANet_inference.execute.py
647
+ ComfyUI_MVANet/MVANet_inference.function.py
648
+ ComfyUI_MVANet/MVANet_inference.import.py
649
+ ComfyUI_MVANet/MVANet_inference.unify.sh
650
+ ComfyUI_AEMatter/AEMatter.unify.sh
651
+ git_add.txt
652
+ git_lfs_track.txt
653
+ gitignore.txt
654
+ rm.txt
655
+ work.sh
656
+ #+end_src
657
+
658
+ * List of patterns to ignore
659
+ #+begin_src conf :tangle ./gitignore.txt
660
+ log/
661
+ pretrain_model/
662
+ commit_and_push.sh
663
+ #+end_src
mhp_extension/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # Self Correction for Human Parsing
2
+
3
+ We propose a simple yet effective multiple human parsing framework by extending our self-correction network.
4
+
5
+ We provide an example usage Jupyter notebook in [demo.ipynb](./demo.ipynb).
6
+
7
+ ## Requirements
8
+
9
+ Please see [INSTALL.md](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md) for further requirements.
10
+
11
+ ## Citation
12
+
13
+ Please cite our work if you find this repo useful in your research.
14
+
15
+ ```latex
16
+ @article{li2019self,
17
+ title={Self-Correction for Human Parsing},
18
+ author={Li, Peike and Xu, Yunqiu and Wei, Yunchao and Yang, Yi},
19
+ journal={arXiv preprint arXiv:1910.09777},
20
+ year={2019}
21
+ }
22
+ ```
23
+
24
+ ## Visualization
25
+
26
+ * Source Image.
27
+ ![demo](./demo/demo.jpg)
28
+ * Instance Human Mask.
29
+ ![demo-lip](./demo/demo_instance_human_mask.png)
30
+ * Global Human Parsing Result.
31
+ ![demo-lip](./demo/demo_global_human_parsing.png)
32
+ * Multiple Human Parsing Result.
33
+ ![demo-lip](./demo/demo_multiple_human_parsing.png)
34
+
35
+ ## Related
36
+
37
+ Our implementation is based on [Detectron2](https://github.com/facebookresearch/detectron2).
38
+
mhp_extension/coco_style_annotation_creator/__pycache__/pycococreatortools.cpython-37.pyc ADDED
Binary file (3.6 kB). View file
 
mhp_extension/coco_style_annotation_creator/human_to_coco.py ADDED
@@ -0,0 +1,166 @@
1
+ import argparse
2
+ import datetime
3
+ import json
4
+ import os
5
+ from PIL import Image
6
+ import numpy as np
7
+
8
+ import pycococreatortools
9
+
10
+
11
+ def get_arguments():
12
+ parser = argparse.ArgumentParser(description="transform mask annotation to coco annotation")
13
+ parser.add_argument("--dataset", type=str, default='CIHP', help="name of dataset (CIHP, MHPv2 or VIP)")
14
+ parser.add_argument("--json_save_dir", type=str, default='../data/msrcnn_finetune_annotations',
15
+ help="path to save coco-style annotation json file")
16
+ parser.add_argument("--use_val", type=bool, default=False,
17
+ help="use train+val set for finetuning or not")
18
+ parser.add_argument("--train_img_dir", type=str, default='../data/instance-level_human_parsing/Training/Images',
19
+ help="train image path")
20
+ parser.add_argument("--train_anno_dir", type=str,
21
+ default='../data/instance-level_human_parsing/Training/Human_ids',
22
+ help="train human mask path")
23
+ parser.add_argument("--val_img_dir", type=str, default='../data/instance-level_human_parsing/Validation/Images',
24
+ help="val image path")
25
+ parser.add_argument("--val_anno_dir", type=str,
26
+ default='../data/instance-level_human_parsing/Validation/Human_ids',
27
+ help="val human mask path")
28
+ return parser.parse_args()
29
+
30
+
31
+ def main(args):
32
+ INFO = {
33
+ "description": args.split_name + " Dataset",
34
+ "url": "",
35
+ "version": "",
36
+ "year": 2019,
37
+ "contributor": "xyq",
38
+ "date_created": datetime.datetime.utcnow().isoformat(' ')
39
+ }
40
+
41
+ LICENSES = [
42
+ {
43
+ "id": 1,
44
+ "name": "",
45
+ "url": ""
46
+ }
47
+ ]
48
+
49
+ CATEGORIES = [
50
+ {
51
+ 'id': 1,
52
+ 'name': 'person',
53
+ 'supercategory': 'person',
54
+ },
55
+ ]
56
+
57
+ coco_output = {
58
+ "info": INFO,
59
+ "licenses": LICENSES,
60
+ "categories": CATEGORIES,
61
+ "images": [],
62
+ "annotations": []
63
+ }
64
+
65
+ image_id = 1
66
+ segmentation_id = 1
67
+
68
+ for image_name in os.listdir(args.train_img_dir):
69
+ image = Image.open(os.path.join(args.train_img_dir, image_name))
70
+ image_info = pycococreatortools.create_image_info(
71
+ image_id, image_name, image.size
72
+ )
73
+ coco_output["images"].append(image_info)
74
+
75
+ human_mask_name = os.path.splitext(image_name)[0] + '.png'
76
+ human_mask = np.asarray(Image.open(os.path.join(args.train_anno_dir, human_mask_name)))
77
+ human_gt_labels = np.unique(human_mask)
78
+
79
+ for i in range(1, len(human_gt_labels)):
80
+ category_info = {'id': 1, 'is_crowd': 0}
81
+ binary_mask = np.uint8(human_mask == i)
82
+ annotation_info = pycococreatortools.create_annotation_info(
83
+ segmentation_id, image_id, category_info, binary_mask,
84
+ image.size, tolerance=10
85
+ )
86
+ if annotation_info is not None:
87
+ coco_output["annotations"].append(annotation_info)
88
+
89
+ segmentation_id += 1
90
+ image_id += 1
91
+
92
+ if not os.path.exists(args.json_save_dir):
93
+ os.makedirs(args.json_save_dir)
94
+ if not args.use_val:
95
+ with open('{}/{}_train.json'.format(args.json_save_dir, args.dataset), 'w') as output_json_file:
96
+ json.dump(coco_output, output_json_file)
97
+ else:
98
+ for image_name in os.listdir(args.val_img_dir):
99
+ image = Image.open(os.path.join(args.val_img_dir, image_name))
100
+ image_info = pycococreatortools.create_image_info(
101
+ image_id, image_name, image.size
102
+ )
103
+ coco_output["images"].append(image_info)
104
+
105
+ human_mask_name = os.path.splitext(image_name)[0] + '.png'
106
+ human_mask = np.asarray(Image.open(os.path.join(args.val_anno_dir, human_mask_name)))
107
+ human_gt_labels = np.unique(human_mask)
108
+
109
+ for i in range(1, len(human_gt_labels)):
110
+ category_info = {'id': 1, 'is_crowd': 0}
111
+ binary_mask = np.uint8(human_mask == i)
112
+ annotation_info = pycococreatortools.create_annotation_info(
113
+ segmentation_id, image_id, category_info, binary_mask,
114
+ image.size, tolerance=10
115
+ )
116
+ if annotation_info is not None:
117
+ coco_output["annotations"].append(annotation_info)
118
+
119
+ segmentation_id += 1
120
+ image_id += 1
121
+
122
+ with open('{}/{}_trainval.json'.format(args.json_save_dir, args.dataset), 'w') as output_json_file:
123
+ json.dump(coco_output, output_json_file)
124
+
125
+ coco_output_val = {
126
+ "info": INFO,
127
+ "licenses": LICENSES,
128
+ "categories": CATEGORIES,
129
+ "images": [],
130
+ "annotations": []
131
+ }
132
+
133
+ image_id_val = 1
134
+ segmentation_id_val = 1
135
+
136
+ for image_name in os.listdir(args.val_img_dir):
137
+ image = Image.open(os.path.join(args.val_img_dir, image_name))
138
+ image_info = pycococreatortools.create_image_info(
139
+ image_id_val, image_name, image.size
140
+ )
141
+ coco_output_val["images"].append(image_info)
142
+
143
+ human_mask_name = os.path.splitext(image_name)[0] + '.png'
144
+ human_mask = np.asarray(Image.open(os.path.join(args.val_anno_dir, human_mask_name)))
145
+ human_gt_labels = np.unique(human_mask)
146
+
147
+ for i in range(1, len(human_gt_labels)):
148
+ category_info = {'id': 1, 'is_crowd': 0}
149
+ binary_mask = np.uint8(human_mask == i)
150
+ annotation_info = pycococreatortools.create_annotation_info(
151
+ segmentation_id_val, image_id_val, category_info, binary_mask,
152
+ image.size, tolerance=10
153
+ )
154
+ if annotation_info is not None:
155
+ coco_output_val["annotations"].append(annotation_info)
156
+
157
+ segmentation_id_val += 1
158
+ image_id_val += 1
159
+
160
+ with open('{}/{}_val.json'.format(args.json_save_dir, args.dataset), 'w') as output_json_file_val:
161
+ json.dump(coco_output_val, output_json_file_val)
162
+
163
+
164
+ if __name__ == "__main__":
165
+ args = get_arguments()
166
+ main(args)
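The script above only writes the COCO-style JSON; a quick way to check the output is to load it back with the `pycocotools` COCO API. The sketch below is illustrative only: the annotation path is the hypothetical result of the default `--json_save_dir` and `--dataset` values, and it assumes `pycocotools` is installed.

```python
# Hedged sanity check for the generated annotations (path is hypothetical,
# derived from the default --json_save_dir and --dataset arguments above).
from pycocotools.coco import COCO

ann_file = "../data/msrcnn_finetune_annotations/CIHP_train.json"
coco = COCO(ann_file)  # indexes images, annotations and the single 'person' category

img_ids = coco.getImgIds()
print("images:", len(img_ids), "annotations:", len(coco.getAnnIds()))

# Rebuild the binary mask of one person instance from the first image.
first_img = coco.loadImgs(img_ids[:1])[0]
ann_ids = coco.getAnnIds(imgIds=first_img["id"], catIds=[1])
if ann_ids:
    ann = coco.loadAnns(ann_ids[:1])[0]
    instance_mask = coco.annToMask(ann)  # HxW uint8 array, 1 inside the instance
    print(first_img["file_name"], instance_mask.shape, int(instance_mask.sum()))
```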
mhp_extension/coco_style_annotation_creator/pycococreatortools.py ADDED
@@ -0,0 +1,114 @@
1
+ import re
2
+ import datetime
3
+ import numpy as np
4
+ from itertools import groupby
5
+ from skimage import measure
6
+ from PIL import Image
7
+ from pycocotools import mask
8
+
9
+ convert = lambda text: int(text) if text.isdigit() else text.lower()
10
+ natrual_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
11
+
12
+
13
+ def resize_binary_mask(array, new_size):
14
+ image = Image.fromarray(array.astype(np.uint8) * 255)
15
+ image = image.resize(new_size)
16
+ return np.asarray(image).astype(np.bool_)
17
+
18
+
19
+ def close_contour(contour):
20
+ if not np.array_equal(contour[0], contour[-1]):
21
+ contour = np.vstack((contour, contour[0]))
22
+ return contour
23
+
24
+
25
+ def binary_mask_to_rle(binary_mask):
26
+ rle = {'counts': [], 'size': list(binary_mask.shape)}
27
+ counts = rle.get('counts')
28
+ for i, (value, elements) in enumerate(groupby(binary_mask.ravel(order='F'))):
29
+ if i == 0 and value == 1:
30
+ counts.append(0)
31
+ counts.append(len(list(elements)))
32
+
33
+ return rle
34
+
35
+
36
+ def binary_mask_to_polygon(binary_mask, tolerance=0):
37
+ """Converts a binary mask to COCO polygon representation
38
+ Args:
39
+ binary_mask: a 2D binary numpy array where '1's represent the object
40
+ tolerance: Maximum distance from original points of polygon to approximated
41
+ polygonal chain. If tolerance is 0, the original coordinate array is returned.
42
+ """
43
+ polygons = []
44
+ # pad mask to close contours of shapes which start and end at an edge
45
+ padded_binary_mask = np.pad(binary_mask, pad_width=1, mode='constant', constant_values=0)
46
+ contours = measure.find_contours(padded_binary_mask, 0.5)
47
+ contours = np.subtract(contours, 1)
48
+ for contour in contours:
49
+ contour = close_contour(contour)
50
+ contour = measure.approximate_polygon(contour, tolerance)
51
+ if len(contour) < 3:
52
+ continue
53
+ contour = np.flip(contour, axis=1)
54
+ segmentation = contour.ravel().tolist()
55
+ # after padding and subtracting 1 we may get -0.5 points in our segmentation
56
+ segmentation = [0 if i < 0 else i for i in segmentation]
57
+ polygons.append(segmentation)
58
+
59
+ return polygons
60
+
61
+
62
+ def create_image_info(image_id, file_name, image_size,
63
+ date_captured=datetime.datetime.utcnow().isoformat(' '),
64
+ license_id=1, coco_url="", flickr_url=""):
65
+ image_info = {
66
+ "id": image_id,
67
+ "file_name": file_name,
68
+ "width": image_size[0],
69
+ "height": image_size[1],
70
+ "date_captured": date_captured,
71
+ "license": license_id,
72
+ "coco_url": coco_url,
73
+ "flickr_url": flickr_url
74
+ }
75
+
76
+ return image_info
77
+
78
+
79
+ def create_annotation_info(annotation_id, image_id, category_info, binary_mask,
80
+ image_size=None, tolerance=2, bounding_box=None):
81
+ if image_size is not None:
82
+ binary_mask = resize_binary_mask(binary_mask, image_size)
83
+
84
+ binary_mask_encoded = mask.encode(np.asfortranarray(binary_mask.astype(np.uint8)))
85
+
86
+ area = mask.area(binary_mask_encoded)
87
+ if area < 1:
88
+ return None
89
+
90
+ if bounding_box is None:
91
+ bounding_box = mask.toBbox(binary_mask_encoded)
92
+
93
+ if category_info["is_crowd"]:
94
+ is_crowd = 1
95
+ segmentation = binary_mask_to_rle(binary_mask)
96
+ else:
97
+ is_crowd = 0
98
+ segmentation = binary_mask_to_polygon(binary_mask, tolerance)
99
+ if not segmentation:
100
+ return None
101
+
102
+ annotation_info = {
103
+ "id": annotation_id,
104
+ "image_id": image_id,
105
+ "category_id": category_info["id"],
106
+ "iscrowd": is_crowd,
107
+ "area": area.tolist(),
108
+ "bbox": bounding_box.tolist(),
109
+ "segmentation": segmentation,
110
+ "width": binary_mask.shape[1],
111
+ "height": binary_mask.shape[0],
112
+ }
113
+
114
+ return annotation_info
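For orientation, here is how the two helpers above combine on a toy mask, mirroring their use in `human_to_coco.py`. This is a minimal sketch rather than repository code: it assumes `pycococreatortools.py` is importable from the working directory and that `numpy`, `Pillow`, `scikit-image` and `pycocotools` are available.

```python
# Toy end-to-end use of create_image_info / create_annotation_info.
# The image name and sizes are made up for illustration.
import numpy as np
from pycococreatortools import create_annotation_info, create_image_info

binary_mask = np.zeros((8, 8), dtype=np.uint8)
binary_mask[2:6, 3:7] = 1  # one square "person" instance

image_info = create_image_info(image_id=1, file_name="toy.jpg", image_size=(8, 8))
annotation_info = create_annotation_info(
    annotation_id=1,
    image_id=1,
    category_info={"id": 1, "is_crowd": 0},
    binary_mask=binary_mask,
    image_size=(8, 8),  # mask is resized to (width, height) before encoding
    tolerance=2,        # polygon simplification distance, as in the docstring
)

print(image_info["width"], image_info["height"])
if annotation_info is not None:  # None when the mask is empty or degenerate
    print(annotation_info["bbox"], annotation_info["area"])
```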
mhp_extension/coco_style_annotation_creator/test_human2coco_format.py ADDED
@@ -0,0 +1,74 @@
1
+ import argparse
2
+ import datetime
3
+ import json
4
+ import os
5
+ from PIL import Image
6
+
7
+ import pycococreatortools
8
+
9
+
10
+ def get_arguments():
11
+ parser = argparse.ArgumentParser(description="transform mask annotation to coco annotation")
12
+ parser.add_argument("--dataset", type=str, default='CIHP', help="name of dataset (CIHP, MHPv2 or VIP)")
13
+ parser.add_argument("--json_save_dir", type=str, default='../data/CIHP/annotations',
14
+ help="path to save coco-style annotation json file")
15
+ parser.add_argument("--test_img_dir", type=str, default='../data/CIHP/Testing/Images',
16
+ help="test image path")
17
+ return parser.parse_args()
18
+
19
+ args = get_arguments()
20
+
21
+ INFO = {
22
+ "description": args.dataset + "Dataset",
23
+ "url": "",
24
+ "version": "",
25
+ "year": 2020,
26
+ "contributor": "yunqiuxu",
27
+ "date_created": datetime.datetime.utcnow().isoformat(' ')
28
+ }
29
+
30
+ LICENSES = [
31
+ {
32
+ "id": 1,
33
+ "name": "",
34
+ "url": ""
35
+ }
36
+ ]
37
+
38
+ CATEGORIES = [
39
+ {
40
+ 'id': 1,
41
+ 'name': 'person',
42
+ 'supercategory': 'person',
43
+ },
44
+ ]
45
+
46
+
47
+ def main(args):
48
+ coco_output = {
49
+ "info": INFO,
50
+ "licenses": LICENSES,
51
+ "categories": CATEGORIES,
52
+ "images": [],
53
+ "annotations": []
54
+ }
55
+
56
+ image_id = 1
57
+
58
+ for image_name in os.listdir(args.test_img_dir):
59
+ image = Image.open(os.path.join(args.test_img_dir, image_name))
60
+ image_info = pycococreatortools.create_image_info(
61
+ image_id, image_name, image.size
62
+ )
63
+ coco_output["images"].append(image_info)
64
+ image_id += 1
65
+
66
+ if not os.path.exists(os.path.join(args.json_save_dir)):
67
+ os.mkdir(os.path.join(args.json_save_dir))
68
+
69
+ with open('{}/{}.json'.format(args.json_save_dir, args.dataset), 'w') as output_json_file:
70
+ json.dump(coco_output, output_json_file)
71
+
72
+
73
+ if __name__ == "__main__":
74
+ main(args)
mhp_extension/demo.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mhp_extension/detectron2/.circleci/config.yml ADDED
@@ -0,0 +1,179 @@
1
+ # Python CircleCI 2.0 configuration file
2
+ #
3
+ # Check https://circleci.com/docs/2.0/language-python/ for more details
4
+ #
5
+ version: 2
6
+
7
+ # -------------------------------------------------------------------------------------
8
+ # Environments to run the jobs in
9
+ # -------------------------------------------------------------------------------------
10
+ cpu: &cpu
11
+ docker:
12
+ - image: circleci/python:3.6.8-stretch
13
+ resource_class: medium
14
+
15
+ gpu: &gpu
16
+ machine:
17
+ image: ubuntu-1604:201903-01
18
+ docker_layer_caching: true
19
+ resource_class: gpu.small
20
+
21
+ # -------------------------------------------------------------------------------------
22
+ # Re-usable commands
23
+ # -------------------------------------------------------------------------------------
24
+ install_python: &install_python
25
+ - run:
26
+ name: Install Python
27
+ working_directory: ~/
28
+ command: |
29
+ pyenv install 3.6.1
30
+ pyenv global 3.6.1
31
+
32
+ setup_venv: &setup_venv
33
+ - run:
34
+ name: Setup Virtual Env
35
+ working_directory: ~/
36
+ command: |
37
+ python -m venv ~/venv
38
+ echo ". ~/venv/bin/activate" >> $BASH_ENV
39
+ . ~/venv/bin/activate
40
+ python --version
41
+ which python
42
+ which pip
43
+ pip install --upgrade pip
44
+
45
+ install_dep: &install_dep
46
+ - run:
47
+ name: Install Dependencies
48
+ command: |
49
+ pip install --progress-bar off -U 'git+https://github.com/facebookresearch/fvcore'
50
+ pip install --progress-bar off cython opencv-python
51
+ pip install --progress-bar off 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
52
+ pip install --progress-bar off torch torchvision
53
+
54
+ install_detectron2: &install_detectron2
55
+ - run:
56
+ name: Install Detectron2
57
+ command: |
58
+ gcc --version
59
+ pip install -U --progress-bar off -e .[dev]
60
+ python -m detectron2.utils.collect_env
61
+
62
+ install_nvidia_driver: &install_nvidia_driver
63
+ - run:
64
+ name: Install nvidia driver
65
+ working_directory: ~/
66
+ command: |
67
+ wget -q 'https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-430.40.run'
68
+ sudo /bin/bash ./NVIDIA-Linux-x86_64-430.40.run -s --no-drm
69
+ nvidia-smi
70
+
71
+ run_unittests: &run_unittests
72
+ - run:
73
+ name: Run Unit Tests
74
+ command: |
75
+ python -m unittest discover -v -s tests
76
+
77
+ # -------------------------------------------------------------------------------------
78
+ # Jobs to run
79
+ # -------------------------------------------------------------------------------------
80
+ jobs:
81
+ cpu_tests:
82
+ <<: *cpu
83
+
84
+ working_directory: ~/detectron2
85
+
86
+ steps:
87
+ - checkout
88
+ - <<: *setup_venv
89
+
90
+ # Cache the venv directory that contains dependencies
91
+ - restore_cache:
92
+ keys:
93
+ - cache-key-{{ .Branch }}-ID-20200425
94
+
95
+ - <<: *install_dep
96
+
97
+ - save_cache:
98
+ paths:
99
+ - ~/venv
100
+ key: cache-key-{{ .Branch }}-ID-20200425
101
+
102
+ - <<: *install_detectron2
103
+
104
+ - run:
105
+ name: isort
106
+ command: |
107
+ isort -c -sp .
108
+ - run:
109
+ name: black
110
+ command: |
111
+ black --check -l 100 .
112
+ - run:
113
+ name: flake8
114
+ command: |
115
+ flake8 .
116
+
117
+ - <<: *run_unittests
118
+
119
+ gpu_tests:
120
+ <<: *gpu
121
+
122
+ working_directory: ~/detectron2
123
+
124
+ steps:
125
+ - checkout
126
+ - <<: *install_nvidia_driver
127
+
128
+ - run:
129
+ name: Install nvidia-docker
130
+ working_directory: ~/
131
+ command: |
132
+ curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
133
+ distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
134
+ curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \
135
+ sudo tee /etc/apt/sources.list.d/nvidia-docker.list
136
+ sudo apt-get update && sudo apt-get install -y nvidia-docker2
137
+ # reload the docker daemon configuration
138
+ sudo pkill -SIGHUP dockerd
139
+
140
+ - run:
141
+ name: Launch docker
142
+ working_directory: ~/detectron2/docker
143
+ command: |
144
+ nvidia-docker build -t detectron2:v0 -f Dockerfile-circleci .
145
+ nvidia-docker run -itd --name d2 detectron2:v0
146
+ docker exec -it d2 nvidia-smi
147
+
148
+ - run:
149
+ name: Build Detectron2
150
+ command: |
151
+ docker exec -it d2 pip install 'git+https://github.com/facebookresearch/fvcore'
152
+ docker cp ~/detectron2 d2:/detectron2
153
+ # This will build d2 for the target GPU arch only
154
+ docker exec -it d2 pip install -e /detectron2
155
+ docker exec -it d2 python3 -m detectron2.utils.collect_env
156
+ docker exec -it d2 python3 -c 'import torch; assert(torch.cuda.is_available())'
157
+
158
+ - run:
159
+ name: Run Unit Tests
160
+ command: |
161
+ docker exec -e CIRCLECI=true -it d2 python3 -m unittest discover -v -s /detectron2/tests
162
+
163
+ workflows:
164
+ version: 2
165
+ regular_test:
166
+ jobs:
167
+ - cpu_tests
168
+ - gpu_tests
169
+
170
+ #nightly_test:
171
+ #jobs:
172
+ #- gpu_tests
173
+ #triggers:
174
+ #- schedule:
175
+ #cron: "0 0 * * *"
176
+ #filters:
177
+ #branches:
178
+ #only:
179
+ #- master
mhp_extension/detectron2/.clang-format ADDED
@@ -0,0 +1,85 @@
1
+ AccessModifierOffset: -1
2
+ AlignAfterOpenBracket: AlwaysBreak
3
+ AlignConsecutiveAssignments: false
4
+ AlignConsecutiveDeclarations: false
5
+ AlignEscapedNewlinesLeft: true
6
+ AlignOperands: false
7
+ AlignTrailingComments: false
8
+ AllowAllParametersOfDeclarationOnNextLine: false
9
+ AllowShortBlocksOnASingleLine: false
10
+ AllowShortCaseLabelsOnASingleLine: false
11
+ AllowShortFunctionsOnASingleLine: Empty
12
+ AllowShortIfStatementsOnASingleLine: false
13
+ AllowShortLoopsOnASingleLine: false
14
+ AlwaysBreakAfterReturnType: None
15
+ AlwaysBreakBeforeMultilineStrings: true
16
+ AlwaysBreakTemplateDeclarations: true
17
+ BinPackArguments: false
18
+ BinPackParameters: false
19
+ BraceWrapping:
20
+ AfterClass: false
21
+ AfterControlStatement: false
22
+ AfterEnum: false
23
+ AfterFunction: false
24
+ AfterNamespace: false
25
+ AfterObjCDeclaration: false
26
+ AfterStruct: false
27
+ AfterUnion: false
28
+ BeforeCatch: false
29
+ BeforeElse: false
30
+ IndentBraces: false
31
+ BreakBeforeBinaryOperators: None
32
+ BreakBeforeBraces: Attach
33
+ BreakBeforeTernaryOperators: true
34
+ BreakConstructorInitializersBeforeComma: false
35
+ BreakAfterJavaFieldAnnotations: false
36
+ BreakStringLiterals: false
37
+ ColumnLimit: 80
38
+ CommentPragmas: '^ IWYU pragma:'
39
+ ConstructorInitializerAllOnOneLineOrOnePerLine: true
40
+ ConstructorInitializerIndentWidth: 4
41
+ ContinuationIndentWidth: 4
42
+ Cpp11BracedListStyle: true
43
+ DerivePointerAlignment: false
44
+ DisableFormat: false
45
+ ForEachMacros: [ FOR_EACH, FOR_EACH_ENUMERATE, FOR_EACH_KV, FOR_EACH_R, FOR_EACH_RANGE, ]
46
+ IncludeCategories:
47
+ - Regex: '^<.*\.h(pp)?>'
48
+ Priority: 1
49
+ - Regex: '^<.*'
50
+ Priority: 2
51
+ - Regex: '.*'
52
+ Priority: 3
53
+ IndentCaseLabels: true
54
+ IndentWidth: 2
55
+ IndentWrappedFunctionNames: false
56
+ KeepEmptyLinesAtTheStartOfBlocks: false
57
+ MacroBlockBegin: ''
58
+ MacroBlockEnd: ''
59
+ MaxEmptyLinesToKeep: 1
60
+ NamespaceIndentation: None
61
+ ObjCBlockIndentWidth: 2
62
+ ObjCSpaceAfterProperty: false
63
+ ObjCSpaceBeforeProtocolList: false
64
+ PenaltyBreakBeforeFirstCallParameter: 1
65
+ PenaltyBreakComment: 300
66
+ PenaltyBreakFirstLessLess: 120
67
+ PenaltyBreakString: 1000
68
+ PenaltyExcessCharacter: 1000000
69
+ PenaltyReturnTypeOnItsOwnLine: 200
70
+ PointerAlignment: Left
71
+ ReflowComments: true
72
+ SortIncludes: true
73
+ SpaceAfterCStyleCast: false
74
+ SpaceBeforeAssignmentOperators: true
75
+ SpaceBeforeParens: ControlStatements
76
+ SpaceInEmptyParentheses: false
77
+ SpacesBeforeTrailingComments: 1
78
+ SpacesInAngles: false
79
+ SpacesInContainerLiterals: true
80
+ SpacesInCStyleCastParentheses: false
81
+ SpacesInParentheses: false
82
+ SpacesInSquareBrackets: false
83
+ Standard: Cpp11
84
+ TabWidth: 8
85
+ UseTab: Never
mhp_extension/detectron2/.flake8 ADDED
@@ -0,0 +1,9 @@
1
+ # This is an example .flake8 config, used when developing *Black* itself.
2
+ # Keep in sync with setup.cfg which is used for source packages.
3
+
4
+ [flake8]
5
+ ignore = W503, E203, E221, C901, C408, E741
6
+ max-line-length = 100
7
+ max-complexity = 18
8
+ select = B,C,E,F,W,T4,B9
9
+ exclude = build,__init__.py
mhp_extension/detectron2/.gitignore ADDED
@@ -0,0 +1,46 @@
1
+ # output dir
2
+ output
3
+ instant_test_output
4
+ inference_test_output
5
+
6
+
7
+ *.jpg
8
+ *.png
9
+ *.txt
10
+ *.json
11
+ *.diff
12
+
13
+ # compilation and distribution
14
+ __pycache__
15
+ _ext
16
+ *.pyc
17
+ *.so
18
+ detectron2.egg-info/
19
+ build/
20
+ dist/
21
+ wheels/
22
+
23
+ # pytorch/python/numpy formats
24
+ *.pth
25
+ *.pkl
26
+ *.npy
27
+
28
+ # ipython/jupyter notebooks
29
+ *.ipynb
30
+ **/.ipynb_checkpoints/
31
+
32
+ # Editor temporaries
33
+ *.swn
34
+ *.swo
35
+ *.swp
36
+ *~
37
+
38
+ # editor settings
39
+ .idea
40
+ .vscode
41
+
42
+ # project dirs
43
+ /detectron2/model_zoo/configs
44
+ /datasets
45
+ /projects/*/datasets
46
+ /models
mhp_extension/detectron2/GETTING_STARTED.md ADDED
@@ -0,0 +1,79 @@
1
+ ## Getting Started with Detectron2
2
+
3
+ This document provides a brief intro of the usage of builtin command-line tools in detectron2.
4
+
5
+ For a tutorial that involves actual coding with the API,
6
+ see our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
7
+ which covers how to run inference with an
8
+ existing model, and how to train a builtin model on a custom dataset.
9
+
10
+ For more advanced tutorials, refer to our [documentation](https://detectron2.readthedocs.io/tutorials/extend.html).
11
+
12
+
13
+ ### Inference Demo with Pre-trained Models
14
+
15
+ 1. Pick a model and its config file from
16
+ [model zoo](MODEL_ZOO.md),
17
+ for example, `mask_rcnn_R_50_FPN_3x.yaml`.
18
+ 2. We provide `demo.py` that is able to run builtin standard models. Run it with:
19
+ ```
20
+ cd demo/
21
+ python demo.py --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
22
+ --input input1.jpg input2.jpg \
23
+ [--other-options]
24
+ --opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
25
+ ```
26
+ The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation.
27
+ This command will run the inference and show visualizations in an OpenCV window.
28
+
29
+ For details of the command line arguments, see `demo.py -h` or look at its source code
30
+ to understand its behavior. Some common arguments are:
31
+ * To run __on your webcam__, replace `--input files` with `--webcam`.
32
+ * To run __on a video__, replace `--input files` with `--video-input video.mp4`.
33
+ * To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`.
34
+ * To save outputs to a directory (for images) or a file (for webcam or video), use `--output`.
35
+
36
+
37
+ ### Training & Evaluation in Command Line
38
+
39
+ We provide a script in "tools/{,plain_}train_net.py", that is made to train
40
+ all the configs provided in detectron2.
41
+ You may want to use it as a reference to write your own training script.
42
+
43
+ To train a model with "train_net.py", first
44
+ setup the corresponding datasets following
45
+ [datasets/README.md](./datasets/README.md),
46
+ then run:
47
+ ```
48
+ cd tools/
49
+ ./train_net.py --num-gpus 8 \
50
+ --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
51
+ ```
52
+
53
+ The configs are made for 8-GPU training.
54
+ To train on 1 GPU, you may need to [change some parameters](https://arxiv.org/abs/1706.02677), e.g.:
55
+ ```
56
+ ./train_net.py \
57
+ --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
58
+ --num-gpus 1 SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025
59
+ ```
60
+
61
+ For most models, CPU training is not supported.
62
+
63
+ To evaluate a model's performance, use
64
+ ```
65
+ ./train_net.py \
66
+ --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
67
+ --eval-only MODEL.WEIGHTS /path/to/checkpoint_file
68
+ ```
69
+ For more options, see `./train_net.py -h`.
70
+
71
+ ### Use Detectron2 APIs in Your Code
72
+
73
+ See our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
74
+ to learn how to use detectron2 APIs to:
75
+ 1. run inference with an existing model
76
+ 2. train a builtin model on a custom dataset
77
+
78
+ See [detectron2/projects](https://github.com/facebookresearch/detectron2/tree/master/projects)
79
+ for more ways to build your project on detectron2.
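The Colab notebook linked above is the canonical API walkthrough; as a rough orientation, single-image inference with the detectron2 API usually looks like the following minimal sketch (the config name and image path reuse the examples from the demo command above and are not files shipped in this repository):

```python
# Minimal single-image inference sketch with the detectron2 API.
import cv2
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(
    "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
    "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # confidence threshold for detections
cfg.MODEL.DEVICE = "cpu"                     # or "cuda" when a GPU is available

predictor = DefaultPredictor(cfg)
image = cv2.imread("input1.jpg")             # BGR array, as DefaultPredictor expects
outputs = predictor(image)
print(outputs["instances"].pred_classes)
print(outputs["instances"].pred_boxes)
```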
mhp_extension/detectron2/INSTALL.md ADDED
@@ -0,0 +1,184 @@
1
+ ## Installation
2
+
3
+ Our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
4
+ has step-by-step instructions that install detectron2.
5
+ The [Dockerfile](docker)
6
+ also installs detectron2 with a few simple commands.
7
+
8
+ ### Requirements
9
+ - Linux or macOS with Python ≥ 3.6
10
+ - PyTorch ≥ 1.4
11
+ - [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
12
+ You can install them together at [pytorch.org](https://pytorch.org) to make sure of this.
13
+ - OpenCV, optional, needed by demo and visualization
14
+ - pycocotools: `pip install cython; pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'`
15
+
16
+
17
+ ### Build Detectron2 from Source
18
+
19
+ gcc & g++ ≥ 5 are required. [ninja](https://ninja-build.org/) is recommended for faster build.
20
+ After having them, run:
21
+ ```
22
+ python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
23
+ # (add --user if you don't have permission)
24
+
25
+ # Or, to install it from a local clone:
26
+ git clone https://github.com/facebookresearch/detectron2.git
27
+ python -m pip install -e detectron2
28
+
29
+ # Or if you are on macOS
30
+ # CC=clang CXX=clang++ python -m pip install -e .
31
+ ```
32
+
33
+ To __rebuild__ detectron2 that's built from a local clone, use `rm -rf build/ **/*.so` to clean the
34
+ old build first. You often need to rebuild detectron2 after reinstalling PyTorch.
35
+
36
+ ### Install Pre-Built Detectron2 (Linux only)
37
+ ```
38
+ # for CUDA 10.1:
39
+ python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/index.html
40
+ ```
41
+ You can replace cu101 with "cu{100,92}" or "cpu".
42
+
43
+ Note that:
44
+ 1. Such installation has to be used with certain version of official PyTorch release.
45
+ See [releases](https://github.com/facebookresearch/detectron2/releases) for requirements.
46
+ It will not work with a different version of PyTorch or a non-official build of PyTorch.
47
+ 2. Such installation is out-of-date w.r.t. master branch of detectron2. It may not be
48
+ compatible with the master branch of a research project that uses detectron2 (e.g. those in
49
+ [projects](projects) or [meshrcnn](https://github.com/facebookresearch/meshrcnn/)).
50
+
51
+ ### Common Installation Issues
52
+
53
+ If you run into issues using the pre-built detectron2, please uninstall it and try building it from source.
54
+
55
+ Click each issue for its solutions:
56
+
57
+ <details>
58
+ <summary>
59
+ Undefined torch/aten/caffe2 symbols, or segmentation fault immediately when running the library.
60
+ </summary>
61
+ <br/>
62
+
63
+ This usually happens when detectron2 or torchvision is not
64
+ compiled with the version of PyTorch you're running.
65
+
66
+ Pre-built torchvision or detectron2 has to work with the corresponding official release of pytorch.
67
+ If the error comes from a pre-built torchvision, uninstall torchvision and pytorch and reinstall them
68
+ following [pytorch.org](http://pytorch.org). So the versions will match.
69
+
70
+ If the error comes from a pre-built detectron2, check [release notes](https://github.com/facebookresearch/detectron2/releases)
71
+ to see the corresponding pytorch version required for each pre-built detectron2.
72
+
73
+ If the error comes from detectron2 or torchvision that you built manually from source,
74
+ remove files you built (`build/`, `**/*.so`) and rebuild it so it can pick up the version of pytorch currently in your environment.
75
+
76
+ If you cannot resolve this problem, please include the output of `gdb -ex "r" -ex "bt" -ex "quit" --args python -m detectron2.utils.collect_env`
77
+ in your issue.
78
+ </details>
79
+
80
+ <details>
81
+ <summary>
82
+ Undefined C++ symbols (e.g. `GLIBCXX`) or C++ symbols not found.
83
+ </summary>
84
+ <br/>
85
+ Usually it's because the library is compiled with a newer C++ compiler but run with an old C++ runtime.
86
+
87
+ This often happens with old anaconda.
88
+ Try `conda update libgcc`. Then rebuild detectron2.
89
+
90
+ The fundamental solution is to run the code with proper C++ runtime.
91
+ One way is to use `LD_PRELOAD=/path/to/libstdc++.so`.
92
+
93
+ </details>
94
+
95
+ <details>
96
+ <summary>
97
+ "Not compiled with GPU support" or "Detectron2 CUDA Compiler: not available".
98
+ </summary>
99
+ <br/>
100
+ CUDA is not found when building detectron2.
101
+ You should make sure
102
+
103
+ ```
104
+ python -c 'import torch; from torch.utils.cpp_extension import CUDA_HOME; print(torch.cuda.is_available(), CUDA_HOME)'
105
+ ```
106
+
107
+ print valid outputs at the time you build detectron2.
108
+
109
+ Most models can run inference (but not training) without GPU support. To use CPUs, set `MODEL.DEVICE='cpu'` in the config.
110
+ </details>
111
+
112
+ <details>
113
+ <summary>
114
+ "invalid device function" or "no kernel image is available for execution".
115
+ </summary>
116
+ <br/>
117
+ Two possibilities:
118
+
119
+ * You build detectron2 with one version of CUDA but run it with a different version.
120
+
121
+ To check whether it is the case,
122
+ use `python -m detectron2.utils.collect_env` to find out inconsistent CUDA versions.
123
+ In the output of this command, you should expect "Detectron2 CUDA Compiler", "CUDA_HOME", "PyTorch built with - CUDA"
124
+ to contain cuda libraries of the same version.
125
+
126
+ When they are inconsistent,
127
+ you need to either install a different build of PyTorch (or build by yourself)
128
+ to match your local CUDA installation, or install a different version of CUDA to match PyTorch.
129
+
130
+ * Detectron2 or PyTorch/torchvision is not built for the correct GPU architecture (compute compatibility).
131
+
132
+ The GPU architecture for PyTorch/detectron2/torchvision is available in the "architecture flags" in
133
+ `python -m detectron2.utils.collect_env`.
134
+
135
+ The GPU architecture flags of detectron2/torchvision by default matches the GPU model detected
136
+ during compilation. This means the compiled code may not work on a different GPU model.
137
+ To overwrite the GPU architecture for detectron2/torchvision, use `TORCH_CUDA_ARCH_LIST` environment variable during compilation.
138
+
139
+ For example, `export TORCH_CUDA_ARCH_LIST=6.0,7.0` makes it compile for both P100s and V100s.
140
+ Visit [developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus) to find out
141
+ the correct compute compatibility number for your device.
142
+
143
+ </details>
144
+
145
+ <details>
146
+ <summary>
147
+ Undefined CUDA symbols; cannot open libcudart.so; other nvcc failures.
148
+ </summary>
149
+ <br/>
150
+ The version of NVCC you use to build detectron2 or torchvision does
151
+ not match the version of CUDA you are running with.
152
+ This often happens when using anaconda's CUDA runtime.
153
+
154
+ Use `python -m detectron2.utils.collect_env` to find out inconsistent CUDA versions.
155
+ In the output of this command, you should expect "Detectron2 CUDA Compiler", "CUDA_HOME", "PyTorch built with - CUDA"
156
+ to contain cuda libraries of the same version.
157
+
158
+ When they are inconsistent,
159
+ you need to either install a different build of PyTorch (or build by yourself)
160
+ to match your local CUDA installation, or install a different version of CUDA to match PyTorch.
161
+ </details>
162
+
163
+
164
+ <details>
165
+ <summary>
166
+ "ImportError: cannot import name '_C'".
167
+ </summary>
168
+ <br/>
169
+ Please build and install detectron2 following the instructions above.
170
+
171
+ If you are running code from detectron2's root directory, `cd` to a different one.
172
+ Otherwise you may not import the code that you installed.
173
+ </details>
174
+
175
+ <details>
176
+ <summary>
177
+ ONNX conversion segfault after some "TraceWarning".
178
+ </summary>
179
+ <br/>
180
+ The ONNX package was compiled with a compiler that is too old.
181
+
182
+ Please build and install ONNX from its source code using a compiler
183
+ whose version is closer to what's used by PyTorch (available in `torch.__config__.show()`).
184
+ </details>
mhp_extension/detectron2/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2019 - present, Facebook, Inc
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.