Upload 115 files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +1 -0
- LDMAE/.DS_Store +0 -0
- LDMAE/configs/accelerator/4gpu.yaml +17 -0
- LDMAE/configs/accelerator/8gpu.yaml +17 -0
- LDMAE/configs/celeba_hq/lightningdit_b_vmae_f8d16_cfg.yaml +82 -0
- LDMAE/configs/imagenet/lightningdit_b_vmae_f8d16_cfg.yaml +80 -0
- LDMAE/datasets/__init__.py +0 -0
- LDMAE/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
- LDMAE/datasets/__pycache__/__init__.cpython-38.pyc +0 -0
- LDMAE/datasets/__pycache__/img_latent_dataset.cpython-310.pyc +0 -0
- LDMAE/datasets/__pycache__/img_latent_dataset.cpython-38.pyc +0 -0
- LDMAE/datasets/img_latent_dataset.py +94 -0
- LDMAE/evaluate_tokenizer.py +262 -0
- LDMAE/extract_features.py +235 -0
- LDMAE/inference.py +368 -0
- LDMAE/models/__init__.py +0 -0
- LDMAE/models/__pycache__/__init__.cpython-310.pyc +0 -0
- LDMAE/models/__pycache__/__init__.cpython-38.pyc +0 -0
- LDMAE/models/__pycache__/lightningdit.cpython-310.pyc +0 -0
- LDMAE/models/__pycache__/lightningdit.cpython-38.pyc +0 -0
- LDMAE/models/__pycache__/pos_embed.cpython-310.pyc +0 -0
- LDMAE/models/__pycache__/pos_embed.cpython-38.pyc +0 -0
- LDMAE/models/__pycache__/rmsnorm.cpython-310.pyc +0 -0
- LDMAE/models/__pycache__/rmsnorm.cpython-38.pyc +0 -0
- LDMAE/models/__pycache__/swiglu_ffn.cpython-310.pyc +0 -0
- LDMAE/models/__pycache__/swiglu_ffn.cpython-38.pyc +0 -0
- LDMAE/models/lightningdit.py +531 -0
- LDMAE/models/lpips.py +184 -0
- LDMAE/models/pos_embed.py +135 -0
- LDMAE/models/rmsnorm.py +495 -0
- LDMAE/models/swiglu_ffn.py +74 -0
- LDMAE/pretrain_weight/aef8d16.pth +3 -0
- LDMAE/pretrain_weight/daef8d16.pth +3 -0
- LDMAE/pretrain_weight/sdv3f8d16.pth +3 -0
- LDMAE/pretrain_weight/vaef8d16.pth +3 -0
- LDMAE/pretrain_weight/vmaef8d16.pth +3 -0
- LDMAE/requirements.txt +16 -0
- LDMAE/run_extract_feature.sh +22 -0
- LDMAE/run_fast_inference.sh +20 -0
- LDMAE/run_inference.sh +20 -0
- LDMAE/run_robustness_test.sh +81 -0
- LDMAE/run_train.sh +22 -0
- LDMAE/tokenizer/__init__.py +0 -0
- LDMAE/tokenizer/__pycache__/__init__.cpython-310.pyc +0 -0
- LDMAE/tokenizer/__pycache__/__init__.cpython-38.pyc +0 -0
- LDMAE/tokenizer/__pycache__/autoencoder.cpython-310.pyc +0 -0
- LDMAE/tokenizer/__pycache__/models_mae.cpython-310.pyc +0 -0
- LDMAE/tokenizer/__pycache__/sdvae.cpython-310.pyc +0 -0
- LDMAE/tokenizer/__pycache__/vavae.cpython-310.pyc +0 -0
- LDMAE/tokenizer/__pycache__/vavae.cpython-38.pyc +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
figure/thumbnail.png filter=lfs diff=lfs merge=lfs -text
|
LDMAE/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
LDMAE/configs/accelerator/4gpu.yaml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compute_environment: LOCAL_MACHINE
|
| 2 |
+
debug: false
|
| 3 |
+
distributed_type: MULTI_GPU
|
| 4 |
+
downcast_bf16: 'no'
|
| 5 |
+
enable_cpu_affinity: false
|
| 6 |
+
gpu_ids: all
|
| 7 |
+
machine_rank: 0
|
| 8 |
+
main_training_function: main
|
| 9 |
+
mixed_precision: bf16
|
| 10 |
+
num_machines: 1
|
| 11 |
+
num_processes: 4
|
| 12 |
+
rdzv_backend: static
|
| 13 |
+
same_network: true
|
| 14 |
+
tpu_env: []
|
| 15 |
+
tpu_use_cluster: false
|
| 16 |
+
tpu_use_sudo: false
|
| 17 |
+
use_cpu: false
|
LDMAE/configs/accelerator/8gpu.yaml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
compute_environment: LOCAL_MACHINE
|
| 2 |
+
debug: false
|
| 3 |
+
distributed_type: MULTI_GPU
|
| 4 |
+
downcast_bf16: 'no'
|
| 5 |
+
enable_cpu_affinity: false
|
| 6 |
+
gpu_ids: all
|
| 7 |
+
machine_rank: 0
|
| 8 |
+
main_training_function: main
|
| 9 |
+
mixed_precision: bf16
|
| 10 |
+
num_machines: 1
|
| 11 |
+
num_processes: 8
|
| 12 |
+
rdzv_backend: static
|
| 13 |
+
same_network: true
|
| 14 |
+
tpu_env: []
|
| 15 |
+
tpu_use_cluster: false
|
| 16 |
+
tpu_use_sudo: false
|
| 17 |
+
use_cpu: false
|
LDMAE/configs/celeba_hq/lightningdit_b_vmae_f8d16_cfg.yaml
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# we recommend to read config_details.yaml first.
|
| 2 |
+
|
| 3 |
+
ckpt_path: 'output/celeba_hq/lightningdit_b_vmae_f8d16/checkpoints/0060000.pt' # <---- download our pre-trained lightningdit or your own checkpoint
|
| 4 |
+
|
| 5 |
+
data:
|
| 6 |
+
name: celebahq
|
| 7 |
+
origin_path: "/data/dataset/celeba_hq/celeba_hq_256"
|
| 8 |
+
data_path: '/data/dataset/celeba_hq/vmae_feature_celebahq_train_256' # <---- path to your data. it is generated by extract_features.py.
|
| 9 |
+
# if you just want to inference, download our latent_stats.pt and give its path here is ok.
|
| 10 |
+
fid_reference_file: 'tools/fid_statistics/ALL_celebahq256.npz' # <---- path to your fid_reference_file.npz. download it from ADM
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# recommend to use default settings
|
| 14 |
+
image_size: 256
|
| 15 |
+
num_classes: 1
|
| 16 |
+
num_workers: 8
|
| 17 |
+
latent_norm: true
|
| 18 |
+
latent_multiplier: 1.0
|
| 19 |
+
sample: true # <------------------------------ check this. you should comment out this when you don't want to use it.
|
| 20 |
+
|
| 21 |
+
# recommend to use default settings (we will release code with SD-VAE later)
|
| 22 |
+
vae:
|
| 23 |
+
model_name: 'vmae'
|
| 24 |
+
downsample_ratio: 8
|
| 25 |
+
weight_path: 'pretrain_weight/vmae_f8d16.pth'
|
| 26 |
+
|
| 27 |
+
# recommend to use default settings
|
| 28 |
+
model:
|
| 29 |
+
model_type: LightningDiT-B/1
|
| 30 |
+
use_qknorm: false # no qk normalizing in celeba
|
| 31 |
+
use_swiglu: true
|
| 32 |
+
use_rope: true
|
| 33 |
+
use_rmsnorm: true
|
| 34 |
+
wo_shift: false
|
| 35 |
+
in_chans: 16
|
| 36 |
+
|
| 37 |
+
# recommend to use default settings
|
| 38 |
+
train:
|
| 39 |
+
max_steps: 60000
|
| 40 |
+
global_batch_size: 1024 # 256 ok
|
| 41 |
+
global_seed: 1
|
| 42 |
+
output_dir: 'output'
|
| 43 |
+
exp_name: 'celeba_hq/lightningdit_b_vmae_f8d16' # <---- experiment name, set as you like
|
| 44 |
+
ckpt: null
|
| 45 |
+
log_every: 100
|
| 46 |
+
ckpt_every: 20000
|
| 47 |
+
use_checkpoint: false
|
| 48 |
+
gradient_accumulation_steps: 1
|
| 49 |
+
optimizer:
|
| 50 |
+
lr: 0.0002
|
| 51 |
+
beta2: 0.95
|
| 52 |
+
# max_grad_norm: 1.0
|
| 53 |
+
# recommend to use default settings
|
| 54 |
+
transport:
|
| 55 |
+
path_type: Linear
|
| 56 |
+
prediction: velocity
|
| 57 |
+
loss_weight: null
|
| 58 |
+
sample_eps: null
|
| 59 |
+
train_eps: null
|
| 60 |
+
use_cosine_loss: false
|
| 61 |
+
use_lognorm: true
|
| 62 |
+
|
| 63 |
+
# recommend to use default settings
|
| 64 |
+
sample:
|
| 65 |
+
mode: ODE
|
| 66 |
+
sampling_method: euler
|
| 67 |
+
atol: 0.000001
|
| 68 |
+
rtol: 0.001
|
| 69 |
+
reverse: false
|
| 70 |
+
likelihood: false
|
| 71 |
+
num_sampling_steps: 250
|
| 72 |
+
cfg_scale: 0 # <---- cfg scale, for 800 epoch performance with FID=1.35 cfg_scale=6.7
|
| 73 |
+
# for 64 epoch performance with FID=2.11 cfg_scale=10.0
|
| 74 |
+
# you may find we use a large cfg_scale, this is because of 2 reasons:
|
| 75 |
+
# we find a high-dimensional latent space requires a larger cfg_scale to get good performance than the f8d4 SD-VAE
|
| 76 |
+
# we enable cfg interval, which reduces the negative effects of large cfg on high-noise parts. This means larger cfg can be utilized
|
| 77 |
+
|
| 78 |
+
# recommend to use default settings
|
| 79 |
+
per_proc_batch_size: 128
|
| 80 |
+
fid_num: 50000
|
| 81 |
+
cfg_interval_start: 0.10
|
| 82 |
+
timestep_shift: 0.3
|
LDMAE/configs/imagenet/lightningdit_b_vmae_f8d16_cfg.yaml
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# we recommend to read config_details.yaml first.
|
| 2 |
+
|
| 3 |
+
ckpt_path: 'output/imagenet/lightningdit_b_vmae_f8d16/checkpoints/0100000.pt' # <---- download our pre-trained lightningdit or your own checkpoint
|
| 4 |
+
|
| 5 |
+
data:
|
| 6 |
+
origin_path: '/data/dataset/imagenet/1K_dataset'
|
| 7 |
+
data_path: '/data/dataset/imagenet/vmae_feature_imagenet_train_256' # <---- path to your data. it is generated by extract_features.py.
|
| 8 |
+
# if you just want to inference, download our latent_stats.pt and give its path here is ok.
|
| 9 |
+
fid_reference_file: 'tools/fid_statistics/VIRTUAL_imagenet256_labeled.npz' # <---- path to your fid_reference_file.npz. download it from ADM
|
| 10 |
+
|
| 11 |
+
# recommend to use default settings
|
| 12 |
+
image_size: 256
|
| 13 |
+
num_classes: 1000
|
| 14 |
+
num_workers: 8
|
| 15 |
+
latent_norm: true
|
| 16 |
+
latent_multiplier: 1.0
|
| 17 |
+
sample: true # <------------------------------ check this. you should comment out this when you don't want to use it.
|
| 18 |
+
|
| 19 |
+
# recommend to use default settings (we will release code with SD-VAE later)
|
| 20 |
+
vae:
|
| 21 |
+
model_name: 'vmae_f8d16'
|
| 22 |
+
downsample_ratio: 8
|
| 23 |
+
weight_path: 'pretrain_weight/vmaef8d16.pth'
|
| 24 |
+
|
| 25 |
+
# recommend to use default settings
|
| 26 |
+
model:
|
| 27 |
+
model_type: LightningDiT-B/1
|
| 28 |
+
use_qknorm: true
|
| 29 |
+
use_swiglu: true
|
| 30 |
+
use_rope: true
|
| 31 |
+
use_rmsnorm: true
|
| 32 |
+
wo_shift: false
|
| 33 |
+
in_chans: 16
|
| 34 |
+
|
| 35 |
+
# recommend to use default settings
|
| 36 |
+
train:
|
| 37 |
+
max_steps: 100000
|
| 38 |
+
global_batch_size: 256 # 256 ok
|
| 39 |
+
global_seed: 0
|
| 40 |
+
output_dir: 'output'
|
| 41 |
+
exp_name: 'imagenet/lightningdit_b_vmae_f8d16' # <---- experiment name, set as you like
|
| 42 |
+
ckpt: null
|
| 43 |
+
log_every: 100
|
| 44 |
+
ckpt_every: 20000
|
| 45 |
+
use_checkpoint: false
|
| 46 |
+
gradient_accumulation_steps: 1
|
| 47 |
+
optimizer:
|
| 48 |
+
lr: 0.0002
|
| 49 |
+
beta2: 0.95
|
| 50 |
+
# max_grad_norm: 1.0
|
| 51 |
+
# recommend to use default settings
|
| 52 |
+
transport:
|
| 53 |
+
path_type: Linear
|
| 54 |
+
prediction: velocity
|
| 55 |
+
loss_weight: null
|
| 56 |
+
sample_eps: null
|
| 57 |
+
train_eps: null
|
| 58 |
+
use_cosine_loss: false
|
| 59 |
+
use_lognorm: true
|
| 60 |
+
|
| 61 |
+
# recommend to use default settings
|
| 62 |
+
sample:
|
| 63 |
+
mode: ODE
|
| 64 |
+
sampling_method: euler
|
| 65 |
+
atol: 0.000001
|
| 66 |
+
rtol: 0.001
|
| 67 |
+
reverse: false
|
| 68 |
+
likelihood: false
|
| 69 |
+
num_sampling_steps: 250
|
| 70 |
+
cfg_scale: 10.0 # <---- cfg scale, for 800 epoch performance with FID=1.35 cfg_scale=6.7
|
| 71 |
+
# for 64 epoch performance with FID=2.11 cfg_scale=10.0
|
| 72 |
+
# you may find we use a large cfg_scale, this is because of 2 reasons:
|
| 73 |
+
# we find a high-dimensional latent space requires a larger cfg_scale to get good performance than the f8d4 SD-VAE
|
| 74 |
+
# we enable cfg interval, which reduces the negative effects of large cfg on high-noise parts. This means larger cfg can be utilized
|
| 75 |
+
|
| 76 |
+
# recommend to use default settings
|
| 77 |
+
per_proc_batch_size: 256
|
| 78 |
+
fid_num: 50000
|
| 79 |
+
cfg_interval_start: 0.10
|
| 80 |
+
timestep_shift: 0.3
|
LDMAE/datasets/__init__.py
ADDED
|
File without changes
|
LDMAE/datasets/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (173 Bytes). View file
|
|
|
LDMAE/datasets/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (171 Bytes). View file
|
|
|
LDMAE/datasets/__pycache__/img_latent_dataset.cpython-310.pyc
ADDED
|
Binary file (3.41 kB). View file
|
|
|
LDMAE/datasets/__pycache__/img_latent_dataset.cpython-38.pyc
ADDED
|
Binary file (3.33 kB). View file
|
|
|
LDMAE/datasets/img_latent_dataset.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ImageNet Latent Dataset with safetensors.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import numpy as np
|
| 7 |
+
from glob import glob
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
from torch.utils.data import Dataset
|
| 12 |
+
|
| 13 |
+
from safetensors import safe_open
|
| 14 |
+
from tokenizer.util.misc import DiagonalGaussianDistribution
|
| 15 |
+
|
| 16 |
+
class ImgLatentDataset(Dataset):
    """Dataset of pre-extracted image latents stored in ``.safetensors`` shards.

    Each shard is expected to hold the tensors ``latents``, ``latents_flip``
    (latents of the horizontally flipped image, read in ``__getitem__``) and
    ``labels``, stacked along dim 0. A global sample index is translated to a
    (shard file, index-in-shard) pair via ``img_to_file_map``.

    Args:
        data_dir: directory holding the ``*.safetensors`` shards.
        latent_norm: if True, standardize latents with per-channel mean/std
            computed over up to 10k random samples (cached on disk as
            ``latents_stats.pt``).
        latent_multiplier: scalar multiplied into the returned latent.
        sample: if True, treat the stored tensor as Gaussian-distribution
            parameters and draw a sample via ``DiagonalGaussianDistribution``
            instead of using the tensor directly.
    """

    def __init__(self, data_dir, latent_norm=True, latent_multiplier=1.0, sample=False):
        self.data_dir = data_dir
        self.latent_norm = latent_norm
        self.latent_multiplier = latent_multiplier
        self.sample = sample

        # Sorted so the global-index -> shard mapping is stable across runs.
        self.files = sorted(glob(os.path.join(data_dir, "*.safetensors")))
        self.img_to_file_map = self.get_img_to_safefile_map()

        if latent_norm:
            # Per-channel statistics, loaded from cache or computed once.
            self._latent_mean, self._latent_std = self.get_latent_stats()

    def get_img_to_safefile_map(self):
        """Build the mapping: global index -> {'safe_file', 'idx_in_file'}.

        Only the 'labels' slice header is inspected per shard (get_slice does
        not load the full tensor), so this scan stays cheap for large shards.
        """
        img_to_file = {}
        for safe_file in self.files:
            with safe_open(safe_file, framework="pt", device="cpu") as f:
                labels = f.get_slice('labels')
                labels_shape = labels.get_shape()
                num_imgs = labels_shape[0]
                cur_len = len(img_to_file)
                for i in range(num_imgs):
                    img_to_file[cur_len+i] = {
                        'safe_file': safe_file,
                        'idx_in_file': i
                    }
        return img_to_file

    def get_latent_stats(self):
        """Return (mean, std) tensors, computing and caching them on first use.

        The cache file lives next to the shards as ``latents_stats.pt``.
        """
        latent_stats_cache_file = os.path.join(self.data_dir, "latents_stats.pt")
        if not os.path.exists(latent_stats_cache_file):
            latent_stats = self.compute_latent_stats()
            torch.save(latent_stats, latent_stats_cache_file)
        else:
            latent_stats = torch.load(latent_stats_cache_file)
        return latent_stats['mean'], latent_stats['std']

    def compute_latent_stats(self):
        """Estimate per-channel latent mean/std from up to 10k random samples.

        NOTE(review): np.random is used without an explicit seed, so the
        cached statistics differ between first-time runs — confirm acceptable.
        """
        num_samples = min(10000, len(self.img_to_file_map))
        random_indices = np.random.choice(len(self.img_to_file_map), num_samples, replace=False)
        latents = []
        for idx in tqdm(random_indices):
            img_info = self.img_to_file_map[idx]
            safe_file, img_idx = img_info['safe_file'], img_info['idx_in_file']
            with safe_open(safe_file, framework="pt", device="cpu") as f:
                features = f.get_slice('latents')
                # Slice keeps a leading batch dim of size 1.
                feature = features[img_idx:img_idx+1]
            if self.sample:
                # Stored tensor holds distribution parameters; sample from it.
                feature = DiagonalGaussianDistribution(feature).sample()
            latents.append(feature)
        latents = torch.cat(latents, dim=0)
        # Reduce over batch and spatial dims, keep channels -> shape (1, C, 1, 1),
        # which broadcasts against (1, C, H, W) latents in __getitem__.
        mean = latents.mean(dim=[0, 2, 3], keepdim=True)
        std = latents.std(dim=[0, 2, 3], keepdim=True)
        latent_stats = {'mean': mean, 'std': std}
        print(latent_stats)
        return latent_stats

    def __len__(self):
        # Total number of samples across all shards.
        return len(self.img_to_file_map.keys())

    def __getitem__(self, idx):
        """Return (feature, label) for global index *idx*.

        With probability 0.5 serves the horizontally-flipped latent
        ('latents_flip') as cheap augmentation, then optionally samples from
        the stored distribution, normalizes, and scales the latent.
        """
        img_info = self.img_to_file_map[idx]
        safe_file, img_idx = img_info['safe_file'], img_info['idx_in_file']
        with safe_open(safe_file, framework="pt", device="cpu") as f:
            # 50/50 choice between original and flipped latents.
            tensor_key = "latents" if np.random.uniform(0, 1) > 0.5 else "latents_flip"
            features = f.get_slice(tensor_key)
            labels = f.get_slice('labels')
            feature = features[img_idx:img_idx+1]
            label = labels[img_idx:img_idx+1]
        if self.sample:
            feature = DiagonalGaussianDistribution(feature).sample()
        if self.latent_norm:
            feature = (feature - self._latent_mean) / self._latent_std
        # Applied regardless of latent_norm (separate config knob).
        feature = feature * self.latent_multiplier

        # remove the first batch dimension (=1) kept by get_slice()
        feature = feature.squeeze(0)
        label = label.squeeze(0)
        return feature, label
|
LDMAE/evaluate_tokenizer.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluate tokenizer performance by computing reconstruction metrics.
|
| 3 |
+
|
| 4 |
+
Metrics include:
|
| 5 |
+
- rFID (Reconstruction FID)
|
| 6 |
+
- PSNR (Peak Signal-to-Noise Ratio)
|
| 7 |
+
- LPIPS (Learned Perceptual Image Patch Similarity)
|
| 8 |
+
- SSIM (Structural Similarity Index)
|
| 9 |
+
|
| 10 |
+
by Jingfeng Yao
|
| 11 |
+
from HUST-VL
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import torch, yaml
|
| 16 |
+
import numpy as np
|
| 17 |
+
from tqdm import tqdm
|
| 18 |
+
from PIL import Image
|
| 19 |
+
import torch.distributed as dist
|
| 20 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 21 |
+
from omegaconf import OmegaConf
|
| 22 |
+
from torch.utils.data import DataLoader, DistributedSampler
|
| 23 |
+
from tools.calculate_fid import calculate_fid_given_paths
|
| 24 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 25 |
+
from torchmetrics import StructuralSimilarityIndexMeasure
|
| 26 |
+
from models.lpips import LPIPS
|
| 27 |
+
from torchvision.datasets import ImageFolder
|
| 28 |
+
from torchvision import transforms
|
| 29 |
+
from diffusers.models import AutoencoderKL
|
| 30 |
+
from tokenizer.sdvae import Diffusers_AutoencoderKL
|
| 31 |
+
from tokenizer import models_mae
|
| 32 |
+
|
| 33 |
+
def load_config(config_path):
    """Read a YAML configuration file and return its parsed contents."""
    with open(config_path, "r") as fh:
        return yaml.safe_load(fh)
|
| 37 |
+
|
| 38 |
+
def print_with_prefix(content, prefix='Tokenizer Evaluation', rank=0):
    """Print *content* tagged with a blue ``[prefix]``, but only on rank 0."""
    if rank != 0:
        return
    print(f"\033[34m[{prefix}]\033[0m {content}")
|
| 41 |
+
|
| 42 |
+
def save_image(image, filename):
    """Persist an HWC uint8 array to *filename* via PIL."""
    pil_image = Image.fromarray(image)
    pil_image.save(filename)
|
| 44 |
+
|
| 45 |
+
def evaluate_tokenizer(args, config_path, model_type, data_path, output_path):
    """Distributed reconstruction benchmark for a tokenizer/VAE.

    Encodes a validation set, optionally perturbs the latents with Gaussian
    noise scaled by ``args.epsilon`` (latent-robustness experiment), decodes
    them back, and reports rFID / PSNR / LPIPS / SSIM.

    Note: the ``model_type`` parameter is immediately overwritten from the
    config below, so the value passed by the caller is effectively ignored.
    """
    # Initialize distributed training (one process per GPU, NCCL backend).
    dist.init_process_group(backend='nccl')
    # NOTE(review): this is the *global* rank; using it as a CUDA device index
    # assumes a single-node launch — confirm before multi-node use.
    local_rank = torch.distributed.get_rank()
    torch.cuda.set_device(local_rank)
    device = torch.device(f'cuda:{local_rank}')
    train_config = load_config(config_path)
    # Model family is the prefix of vae.model_name, e.g. 'vmae_f8d16' -> 'vmae'.
    model_type = train_config['vae']['model_name'].split("_")[0]

    if local_rank == 0:
        print_with_prefix(f"Loading model... {model_type.upper()} {args.epsilon}")

    # Build the tokenizer: either the MAE-based 'vmae' model or a
    # diffusers-style AutoencoderKL variant, per the config.
    if train_config['vae']['model_name'].split("_")[0] == 'vmae':
        chkpt = train_config['vae']['weight_path']
        arch = 'mae_for_ldmae_f8d16_prev'
        model = getattr(models_mae, arch)(ldmae_mode=True, no_cls=True, kl_loss_weight=True, smooth_output=True, img_size=train_config['data']['image_size'])
        checkpoint = torch.load(chkpt, map_location='cpu')
        model = model.to(device).eval()
        # strict=False: checkpoint may omit/add keys; mismatches reported below.
        msg = model.load_state_dict(checkpoint['model'], strict=False)
    elif train_config['vae']['model_name'].split("_")[0] in ['ae','dae','vae','sdv3']:
        # f8, 16-latent-channel AutoencoderKL configuration shared by these variants.
        model = Diffusers_AutoencoderKL(
            img_size=train_config['data']['image_size'],
            sample_size=128,
            in_channels=3,
            out_channels=3,
            layers_per_block=2,
            latent_channels=16,
            norm_num_groups=32,
            act_fn="silu",
            block_out_channels=(128, 256, 512, 512),
            force_upcast=False,
            use_quant_conv=False,
            use_post_quant_conv=False,
            down_block_types=(
                "DownEncoderBlock2D",
                "DownEncoderBlock2D",
                "DownEncoderBlock2D",
                "DownEncoderBlock2D",
            ),
            up_block_types=(
                "UpDecoderBlock2D",
                "UpDecoderBlock2D",
                "UpDecoderBlock2D",
                "UpDecoderBlock2D",
            ),
        ).to(device).eval()
        chkpt_dir = train_config['vae']['weight_path']
        checkpoint = torch.load(chkpt_dir, map_location='cpu')
        msg = model.load_state_dict(checkpoint['model'], strict=False)
    else:
        # Unknown model family. NOTE(review): a bare `raise` with no active
        # exception surfaces as RuntimeError — an explicit ValueError with a
        # message would be clearer.
        raise
    # Report missing/unexpected state-dict keys from strict=False loading.
    print(msg)
    # Image preprocessing: ToTensor -> [0,1], Normalize(0.5, 0.5) -> [-1, 1].
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize(256),
        transforms.CenterCrop(256),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    # Create dataset and dataloader; DistributedSampler shards the val set
    # across ranks without overlap.
    dataset = ImageFolder(root=data_path, transform=transform)
    distributed_sampler = DistributedSampler(dataset, num_replicas=dist.get_world_size(), rank=local_rank)
    val_dataloader = DataLoader(
        dataset,
        batch_size=8,
        shuffle=False,
        num_workers=4,
        sampler=distributed_sampler
    )

    # Latent statistics produced by the feature-extraction step; '_sample'
    # suffix matches the directory naming used when data.sample is set.
    if 'sample' in train_config['data']:
        train_config['data']['data_path'] += '_sample'
    latent_stats_cache_file = os.path.join(train_config['data']['data_path'], 'latents_stats.pt')
    latent_stats = torch.load(latent_stats_cache_file)
    latent_mean, latent_std = latent_stats['mean'], latent_stats['std']

    latent_mean = latent_mean.clone().detach().to(device)
    latent_std = latent_std.clone().detach().to(device)


    # Setup output directories; reconstructions are grouped per
    # (model family, epsilon) pair so different runs don't collide.
    folder_name = f"{model_type}_{args.epsilon}"

    save_dir = os.path.join(output_path, folder_name, 'decoded_images')
    ref_path = os.path.join(output_path, 'ref_images')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(ref_path, exist_ok=True)

    if local_rank == 0:
        print_with_prefix(f"Output dir: {save_dir}")
        print_with_prefix(f"Reference dir: {ref_path}")

    # Save reference images if needed (skipped when a full set of 50k PNGs
    # already exists from a previous run).
    ref_png_files = [f for f in os.listdir(ref_path) if f.endswith('.png')]
    if len(ref_png_files) < 50000:
        total_samples = 0
        for batch in val_dataloader:
            images = batch[0].to(device)
            for j in range(images.size(0)):
                # Map [-1, 1] -> ~[0, 255] and store as uint8 HWC.
                img = torch.clamp(127.5 * images[j] + 128.0, 0, 255).cpu().permute(1, 2, 0).numpy().astype(np.uint8)
                Image.fromarray(img).save(os.path.join(ref_path, f"ref_image_rank_{local_rank}_{total_samples}.png"))
                total_samples += 1
                if total_samples % 100 == 0 and local_rank == 0:
                    print_with_prefix(f"Rank {local_rank}, Saved {total_samples} reference images")
    dist.barrier()

    # Initialize metrics (LPIPS network and SSIM over the [-1, 1] range).
    lpips_values = []
    ssim_values = []
    lpips = LPIPS().to(device).eval()
    ssim_metric = StructuralSimilarityIndexMeasure(data_range=(-1.0, 1.0)).to(device)

    # Generate reconstructions and compute metrics.
    if local_rank == 0:
        print_with_prefix("Generating reconstructions...")
    all_indices = 0
    # NOTE(review): if save_dir is already populated this loop is skipped and
    # lpips_values/ssim_values stay empty, making avg_lpips/avg_ssim NaN —
    # confirm reruns are expected to only recompute FID/PSNR.
    if len(os.listdir(save_dir)) < 50000:
        for batch in val_dataloader:
            images = batch[0].to(device)
            latents = encode_images(model, images)
            # Latent-robustness perturbation: noise scaled per-channel by the
            # dataset latent std, with overall magnitude args.epsilon.
            epsilon = args.epsilon * torch.randn_like(latents)
            latents = latents + epsilon * latent_std

            with torch.no_grad():
                decoded_images_tensor = model.decode(latents).sample
                decoded_images = torch.clamp(127.5 * decoded_images_tensor + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()

                # Compute metrics on the raw (unclamped) decoder output vs the
                # normalized inputs.
                lpips_values.append(lpips(decoded_images_tensor, images).mean())
                ssim_values.append(ssim_metric(decoded_images_tensor, images))

                # Save reconstructions
                for i, img in enumerate(decoded_images):
                    save_image(img, os.path.join(save_dir, f"decoded_image_rank_{local_rank}_{all_indices + i}.png"))
                    if (all_indices + i) % 100 == 0 and local_rank == 0:
                        print_with_prefix(f"Rank {local_rank}, Processed {all_indices + i} images")
                all_indices += len(decoded_images)
    dist.barrier()

    # Aggregate metrics across GPUs (element-wise average of per-batch values).
    lpips_values = torch.tensor(lpips_values).to(device)
    ssim_values = torch.tensor(ssim_values).to(device)
    dist.all_reduce(lpips_values, op=dist.ReduceOp.AVG)
    dist.all_reduce(ssim_values, op=dist.ReduceOp.AVG)

    avg_lpips = lpips_values.mean().item()
    avg_ssim = ssim_values.mean().item()

    if local_rank == 0:
        # Calculate FID between reference and reconstructed image folders.
        print_with_prefix("Computing rFID...")
        fid = calculate_fid_given_paths([ref_path, save_dir], batch_size=50, dims=2048, device=device, num_workers=16)

        # Calculate PSNR (pairs matched by sorted filename order).
        print_with_prefix("Computing PSNR...")
        psnr_values = calculate_psnr_between_folders(ref_path, save_dir)
        avg_psnr = sum(psnr_values) / len(psnr_values)

        # Print final results
        print_with_prefix(f"Final Metrics:")
        print_with_prefix(f"rFID: {fid:.3f}")
        print_with_prefix(f"PSNR: {avg_psnr:.3f}")
        print_with_prefix(f"LPIPS: {avg_lpips:.3f}")
        print_with_prefix(f"SSIM: {avg_ssim:.3f}")
    dist.barrier()
    dist.destroy_process_group()
|
| 212 |
+
|
| 213 |
+
def encode_images(model, images):
    """Encode a batch of images and return the deterministic (mode) latents as float32."""
    with torch.no_grad():
        latent_dist = model.encode(images).latent_dist
        return latent_dist.mode().to(torch.float32)
|
| 217 |
+
|
| 218 |
+
def decode_to_images(model, z):
    """Decode latents to uint8 HWC numpy images with values in [0, 255]."""
    with torch.no_grad():
        decoded = model.decode(z)
        scaled = torch.clamp(127.5 * decoded + 128.0, 0, 255)
        return scaled.permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()
|
| 223 |
+
|
| 224 |
+
def calculate_psnr(original, processed):
    """PSNR in dB between two tensors, assuming a 0-255 intensity scale.

    Returns +inf when the inputs are identical (zero MSE).
    """
    squared_error = (original - processed) ** 2
    mse = torch.mean(squared_error)
    return 20 * torch.log10(255.0 / torch.sqrt(mse)).item()
|
| 227 |
+
|
| 228 |
+
def load_image(image_path):
    """Read an image file as a float32 CHW tensor with 0-255 values."""
    rgb = Image.open(image_path).convert('RGB')
    chw = np.array(rgb).transpose(2, 0, 1)
    return torch.tensor(chw, dtype=torch.float32)
|
| 231 |
+
|
| 232 |
+
def calculate_psnr_for_pair(original_path, processed_path):
    """PSNR between the two images stored at the given paths."""
    original = load_image(original_path)
    processed = load_image(processed_path)
    return calculate_psnr(original, processed)
|
| 234 |
+
|
| 235 |
+
def calculate_psnr_between_folders(original_folder, processed_folder):
    """Compute PSNR for image pairs matched by sorted filename order.

    Returns an empty list (after printing a warning) when the two folders
    hold a different number of files. Results arrive in completion order,
    not filename order.
    """
    originals = sorted(os.listdir(original_folder))
    processed = sorted(os.listdir(processed_folder))

    if len(originals) != len(processed):
        print("Warning: Mismatched number of images in folders")
        return []

    with ThreadPoolExecutor() as pool:
        jobs = []
        for orig_name, proc_name in zip(originals, processed):
            jobs.append(pool.submit(
                calculate_psnr_for_pair,
                os.path.join(original_folder, orig_name),
                os.path.join(processed_folder, proc_name),
            ))
        return [job.result() for job in as_completed(jobs)]
|
| 251 |
+
|
| 252 |
+
if __name__ == "__main__":
    import argparse

    # CLI entry point: parse options and run the distributed tokenizer evaluation.
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_path', type=str, default='tokenizer/configs/vavae_f16d32.yaml')
    parser.add_argument('--model_type', type=str, default='vavae')
    parser.add_argument('--data_path', type=str, default='/data/dataset/imagenet/1K_dataset/val')
    parser.add_argument('--output_path', type=str, default='./rfid')
    # NOTE(review): --seed is accepted for CLI compatibility but is never read
    # by evaluate_tokenizer in this file.
    parser.add_argument('--seed', type=int, default=42)
    # Fixed typo in the user-facing help text ("pertubation" -> "perturbation").
    parser.add_argument('--epsilon', type=float, default=0, help="Noise perturbation ratio for latent robustness experiment.")
    args = parser.parse_args()
    evaluate_tokenizer(args, config_path=args.config_path, model_type=args.model_type, data_path=args.data_path, output_path=args.output_path)
|
LDMAE/extract_features.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
| 3 |
+
torch.backends.cudnn.allow_tf32 = True
|
| 4 |
+
import torch.distributed as dist
|
| 5 |
+
from torch.utils.data import DataLoader
|
| 6 |
+
from torch.utils.data.distributed import DistributedSampler
|
| 7 |
+
from torchvision.datasets import ImageFolder
|
| 8 |
+
import argparse
|
| 9 |
+
import os, yaml
|
| 10 |
+
from safetensors.torch import save_file
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
from datasets.img_latent_dataset import ImgLatentDataset
|
| 13 |
+
from tokenizer import models_mae
|
| 14 |
+
from tokenizer.sdvae import Diffusers_AutoencoderKL
|
| 15 |
+
|
| 16 |
+
def load_config(config_path):
    """Parse the YAML file at *config_path* and return its contents."""
    with open(config_path, "r") as handle:
        return yaml.safe_load(handle)
|
| 20 |
+
|
| 21 |
+
def main(args, train_config):
    """
    Run a tokenizer on the full dataset and save the features.

    Encodes every image of the configured split twice (once as-is, once
    horizontally flipped) into latents and writes them to sharded
    .safetensors files of up to 10000 images each, then lets rank 0
    compute the latent statistics via ImgLatentDataset.
    """
    assert torch.cuda.is_available(), "Extract features currently requires at least one GPU."

    # Setup DDP; fall back to single-process mode when no process group can
    # be created (e.g. when the script is launched without torchrun).
    try:
        dist.init_process_group("nccl")
        rank = dist.get_rank()
        device = rank % torch.cuda.device_count()
        world_size = dist.get_world_size()
        seed = args.seed + rank
        if rank == 0:
            print(f"Starting rank={rank}, seed={seed}, world_size={world_size}.")
    except Exception:  # was a bare `except:`; keep KeyboardInterrupt etc. propagating
        print("Failed to initialize DDP. Running in local mode.")
        rank = 0
        device = 0
        world_size = 1
        seed = args.seed
    torch.manual_seed(seed)
    torch.cuda.set_device(device)
    model_name = train_config['vae']['model_name'].split("_")[0]
    output_path = os.path.dirname(train_config['data']['origin_path'])
    dataset_name = train_config['data']['name']

    # Setup feature folders:
    output_dir = os.path.join(output_path, f'{model_name}_feature_{dataset_name}_{args.data_split}_{args.image_size}')
    if 'sample' in train_config['data']:
        output_dir += '_sample'
    if rank == 0:
        os.makedirs(output_dir, exist_ok=True)
        print(model_name)

    # Create model:
    if model_name == 'vmae':
        arch = 'mae_for_ldmae_f8d16_prev'
        chkpt = train_config['vae']['weight_path']
        tokenizer = getattr(models_mae, arch)(ldmae_mode=True, no_cls=True, kl_loss_weight=True, smooth_output=True, img_size=args.image_size)
        checkpoint = torch.load(chkpt, map_location='cpu')
        tokenizer = tokenizer.to(device).eval()
        msg = tokenizer.load_state_dict(checkpoint['model'], strict=False)
        if rank == 0:
            print(model_name, msg)
    elif model_name in ['ae', 'dae', 'vae', 'sdv3']:
        tokenizer = Diffusers_AutoencoderKL(
            img_size=args.image_size,
            sample_size=128,
            in_channels=3,
            out_channels=3,
            layers_per_block=2,
            latent_channels=16,
            norm_num_groups=32,
            act_fn="silu",
            block_out_channels=(128, 256, 512, 512),
            force_upcast=False,
            use_quant_conv=False,
            use_post_quant_conv=False,
            down_block_types=(
                "DownEncoderBlock2D",
                "DownEncoderBlock2D",
                "DownEncoderBlock2D",
                "DownEncoderBlock2D",
            ),
            up_block_types=(
                "UpDecoderBlock2D",
                "UpDecoderBlock2D",
                "UpDecoderBlock2D",
                "UpDecoderBlock2D",
            ),
        ).to(device).eval()
        chkpt = train_config['vae']['weight_path']
        checkpoint = torch.load(chkpt, map_location='cpu')
        msg = tokenizer.load_state_dict(checkpoint['model'], strict=False)
        if rank == 0:
            print(model_name, msg)
    else:
        # Was `raise("")`, which only raised "exceptions must derive from
        # BaseException"; raise a meaningful error instead.
        raise ValueError(f"Unsupported tokenizer model_name: {model_name!r}")

    print(f"{device} GPU - Model loaded")
    # Setup data: two loaders over the same split, one without and one with
    # horizontal flip, iterated in lockstep so flipped latents pair up.
    data_path = train_config['data']['origin_path']
    datasets = [
        ImageFolder(os.path.join(data_path, args.data_split), transform=tokenizer.img_transform(p_hflip=0.0, img_size=args.image_size)),
        ImageFolder(os.path.join(data_path, args.data_split), transform=tokenizer.img_transform(p_hflip=1.0, img_size=args.image_size))
    ]
    samplers = [
        DistributedSampler(
            dataset,
            num_replicas=world_size,
            rank=rank,
            shuffle=False,
            seed=args.seed
        ) for dataset in datasets
    ]  # Maybe gray scale files are dropped. Need to be fixed.
    loaders = [
        DataLoader(
            dataset,
            batch_size=args.batch_size,
            shuffle=False,
            sampler=sampler,
            num_workers=args.num_workers,
            pin_memory=True,
            drop_last=False
        ) for dataset, sampler in zip(datasets, samplers)
    ]
    total_data_in_loop = len(loaders[0].dataset)
    if rank == 0:
        print(f"Total data in one loop: {total_data_in_loop}")

    run_images = 0
    saved_files = 0
    latents = []
    latents_flip = []
    labels = []
    for batch_idx, batch_data in enumerate(zip(*loaders)):
        run_images += batch_data[0][0].shape[0]
        if run_images % 100 == 0 and rank == 0:
            print(f'{datetime.now()} processing {run_images} of {total_data_in_loop} images')

        for loader_idx, data in enumerate(batch_data):
            x = data[0].to(device)
            y = data[1]  # (N,)
            with torch.no_grad():
                if 'sample' in train_config['data']:
                    z = tokenizer._encode(x)
                else:
                    z = tokenizer.encode(x).latent_dist.mode().detach().cpu()  # (N, C, H, W)

            if batch_idx == 0 and rank == 0:
                print('latent shape', z.shape, 'dtype', z.dtype)

            if loader_idx == 0:
                latents.append(z)
                labels.append(y)
            else:
                latents_flip.append(z)

        # Flush a shard once ~10000 images have been accumulated.
        if len(latents) == 10000 // args.batch_size:
            _save_shard(latents, latents_flip, labels, output_dir, rank, saved_files)
            latents = []
            latents_flip = []
            labels = []
            saved_files += 1

    # save remainder latents that are fewer than 10000 images
    if len(latents) > 0:
        _save_shard(latents, latents_flip, labels, output_dir, rank, saved_files)

    # Calculate latents stats on rank 0 only; barriers keep ranks in step.
    # Guarded so the single-process fallback (no process group) still works.
    if dist.is_initialized():
        dist.barrier()
    if rank == 0:
        dataset = ImgLatentDataset(output_dir, latent_norm=True, sample=train_config['data']['sample'] if 'sample' in train_config['data'] else False,)
    if dist.is_initialized():
        dist.barrier()
        dist.destroy_process_group()


def _save_shard(latents, latents_flip, labels, output_dir, rank, shard_idx):
    """Concatenate accumulated latent batches and write one .safetensors shard."""
    latents = torch.cat(latents, dim=0)
    latents_flip = torch.cat(latents_flip, dim=0)
    labels = torch.cat(labels, dim=0)
    save_dict = {
        'latents': latents,
        'latents_flip': latents_flip,
        'labels': labels
    }
    for key in save_dict:
        if rank == 0:
            print(key, save_dict[key].shape)
    # safetensors requires contiguous CPU tensors.
    save_dict = {key: tensor.contiguous().cpu() for key, tensor in save_dict.items()}
    save_filename = os.path.join(output_dir, f'latents_rank{rank:02d}_shard{shard_idx:03d}.safetensors')
    save_file(
        save_dict,
        save_filename,
        metadata={'total_size': f'{latents.shape[0]}', 'dtype': f'{latents.dtype}', 'device': f'{latents.device}'}
    )
    if rank == 0:
        print(f'Saved {save_filename}')
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
if __name__ == "__main__":
    # CLI entry point: read options, load the YAML config, run extraction.
    cli = argparse.ArgumentParser()
    cli.add_argument("--data_split", type=str, default='train')
    cli.add_argument("--output_path", type=str, default="/data/dataset/imagenet/")
    cli.add_argument("--image_size", type=int, default=256)
    cli.add_argument("--batch_size", type=int, default=64)
    cli.add_argument("--seed", type=int, default=42)
    cli.add_argument("--num_workers", type=int, default=8)
    cli.add_argument('--config', type=str, default='configs/debug.yaml')
    parsed = cli.parse_args()

    main(parsed, load_config(parsed.config))
|
LDMAE/inference.py
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sampling Scripts of LightningDiT.
|
| 3 |
+
|
| 4 |
+
by Maple (Jingfeng Yao) from HUST-VL
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os, math, json, pickle, logging, argparse, yaml, torch, numpy as np
|
| 8 |
+
from time import time, strftime
|
| 9 |
+
from glob import glob
|
| 10 |
+
from copy import deepcopy
|
| 11 |
+
from collections import OrderedDict
|
| 12 |
+
from PIL import Image
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
import torch.distributed as dist
|
| 15 |
+
from accelerate import Accelerator
|
| 16 |
+
from torch.utils.data import DataLoader
|
| 17 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 18 |
+
from torch.utils.tensorboard import SummaryWriter
|
| 19 |
+
import torchvision
|
| 20 |
+
# local imports
|
| 21 |
+
from tokenizer.vavae import VA_VAE
|
| 22 |
+
from tokenizer.sdvae import Diffusers_AutoencoderKL
|
| 23 |
+
from tokenizer import models_mae
|
| 24 |
+
import threading
|
| 25 |
+
|
| 26 |
+
from models.lightningdit import LightningDiT_models
|
| 27 |
+
from transport import create_transport, Sampler
|
| 28 |
+
from datasets.img_latent_dataset import ImgLatentDataset
|
| 29 |
+
from torchvision.utils import save_image
|
| 30 |
+
|
| 31 |
+
# sample function
|
| 32 |
+
def save_images_async(images, indices, save_dir):
    """Save each image in *images* to ``save_dir`` as ``<index>.png``.

    The body itself is synchronous; it is intended to be handed off to a
    worker thread by the caller.
    """
    for img, idx in zip(images, indices):
        # Convert numpy.ndarray to torch.Tensor before saving.
        if isinstance(img, np.ndarray):
            img = torch.from_numpy(img).permute(2, 0, 1).float() / 255.0 # [H, W, C] → [C, H, W]
        save_image(img, f"{save_dir}/{idx:06d}.png")
|
| 39 |
+
|
| 40 |
+
def do_sample(train_config, accelerator, ckpt_path=None, cfg_scale=None, model=None, vae=None, demo_sample_mode=False):
    """
    Run sampling.

    Loads the DiT checkpoint (EMA weights when present) and, unless
    ``vae`` is given, a tokenizer for decoding, then either generates an
    8-image demo grid (``demo_sample_mode``) or fills ``sample_folder_dir``
    with individually numbered .png samples for FID evaluation.

    Returns the sample folder path (``None`` in demo mode).
    """
    folder_name = f"{train_config['model']['model_type'].replace('/', '-')}-ckpt-{ckpt_path.split('/')[-1].split('.')[0]}-{train_config['sample']['sampling_method']}-{train_config['sample']['num_sampling_steps']}".lower()
    if cfg_scale is None:
        cfg_scale = train_config['sample']['cfg_scale']
    cfg_interval_start = train_config['sample']['cfg_interval_start'] if 'cfg_interval_start' in train_config['sample'] else 0
    timestep_shift = train_config['sample']['timestep_shift'] if 'timestep_shift' in train_config['sample'] else 0
    if cfg_scale > 1.0:
        folder_name += f"-interval{cfg_interval_start:.2f}" + f"-cfg{cfg_scale:.2f}"
    folder_name += f"-shift{timestep_shift:.2f}"

    if demo_sample_mode:
        cfg_interval_start = 0
        timestep_shift = 0

    sample_folder_dir = os.path.join(train_config['train']['output_dir'], train_config['train']['exp_name'], folder_name)
    if accelerator.process_index == 0:
        if not demo_sample_mode:
            print_with_prefix('Sample_folder_dir=', sample_folder_dir)
            print_with_prefix('ckpt_path=', ckpt_path)
            print_with_prefix('cfg_scale=', cfg_scale)
            print_with_prefix('cfg_interval_start=', cfg_interval_start)
            print_with_prefix('timestep_shift=', timestep_shift)
    if not demo_sample_mode:
        if not os.path.exists(sample_folder_dir):
            if accelerator.process_index == 0:
                os.makedirs(sample_folder_dir, exist_ok=True)
        else:
            # Skip sampling entirely when a previous run already produced
            # enough images in this folder.
            png_files = [f for f in os.listdir(sample_folder_dir) if f.endswith('.png')]
            png_count = len(png_files)
            if png_count > train_config['sample']['fid_num']:
                if accelerator.process_index == 0:
                    print_with_prefix(f"Found {png_count} PNG files in {sample_folder_dir}, skip sampling.")
                return sample_folder_dir

    torch.backends.cuda.matmul.allow_tf32 = True  # True: fast but may lead to some small numerical differences
    assert torch.cuda.is_available(), "Sampling with DDP requires at least one GPU. sample.py supports CPU-only usage"
    torch.set_grad_enabled(False)

    # Setup accelerator:
    device = accelerator.device

    # Setup DDP:
    seed = train_config['train']['global_seed'] * accelerator.num_processes + accelerator.process_index
    torch.manual_seed(seed)
    print_with_prefix(f"Starting rank={accelerator.local_process_index}, seed={seed}, world_size={accelerator.num_processes}.")
    rank = accelerator.local_process_index

    # Load model:
    if 'downsample_ratio' in train_config['vae']:
        downsample_ratio = train_config['vae']['downsample_ratio']
    else:
        downsample_ratio = 16
    latent_size = train_config['data']['image_size'] // downsample_ratio

    checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    if "ema" in checkpoint:  # supports checkpoints from train.py
        checkpoint = checkpoint["ema"]
    model.load_state_dict(checkpoint)
    model.eval()  # important!
    model.to(device)

    transport = create_transport(
        train_config['transport']['path_type'],
        train_config['transport']['prediction'],
        train_config['transport']['loss_weight'],
        train_config['transport']['train_eps'],
        train_config['transport']['sample_eps'],
        use_cosine_loss=train_config['transport']['use_cosine_loss'] if 'use_cosine_loss' in train_config['transport'] else False,
        use_lognorm=train_config['transport']['use_lognorm'] if 'use_lognorm' in train_config['transport'] else False,
    )  # default: velocity;
    sampler = Sampler(transport)
    mode = train_config['sample']['mode']
    if mode == "ODE":
        sample_fn = sampler.sample_ode(
            sampling_method=train_config['sample']['sampling_method'],
            num_steps=train_config['sample']['num_sampling_steps'],
            atol=train_config['sample']['atol'],
            rtol=train_config['sample']['rtol'],
            reverse=train_config['sample']['reverse'],
            timestep_shift=timestep_shift,
        )
    else:
        raise NotImplementedError(f"Sampling mode {mode} is not supported.")

    if vae is None:
        if train_config['vae']['model_name'].split("_")[0] == 'vmae':
            chkpt = train_config['vae']['weight_path']
            arch = 'mae_for_ldmae_f8d16_prev'
            vae = getattr(models_mae, arch)(ldmae_mode=True, no_cls=True, kl_loss_weight=True, smooth_output=True, img_size=train_config['data']['image_size'])
            checkpoint = torch.load(chkpt, map_location='cpu')
            vae = vae.to(device).eval()
            msg = vae.load_state_dict(checkpoint['model'], strict=False)
        elif train_config['vae']['model_name'].split("_")[0] in ['ae', 'dae', 'vae', 'sdv3']:
            vae = Diffusers_AutoencoderKL(
                img_size=train_config['data']['image_size'],
                sample_size=128,
                in_channels=3,
                out_channels=3,
                layers_per_block=2,
                latent_channels=16,
                norm_num_groups=32,
                act_fn="silu",
                block_out_channels=(128, 256, 512, 512),
                force_upcast=False,
                use_quant_conv=False,
                use_post_quant_conv=False,
                down_block_types=(
                    "DownEncoderBlock2D",
                    "DownEncoderBlock2D",
                    "DownEncoderBlock2D",
                    "DownEncoderBlock2D",
                ),
                up_block_types=(
                    "UpDecoderBlock2D",
                    "UpDecoderBlock2D",
                    "UpDecoderBlock2D",
                    "UpDecoderBlock2D",
                ),
            ).to(device).eval()
            chkpt_dir = train_config['vae']['weight_path']
            checkpoint = torch.load(chkpt_dir, map_location='cpu')
            msg = vae.load_state_dict(checkpoint['model'], strict=False)
        else:
            # Was a bare `raise` with no active exception (RuntimeError);
            # raise a meaningful error instead.
            raise ValueError(f"Unsupported vae model_name: {train_config['vae']['model_name']!r}")
    if accelerator.process_index == 0:
        print_with_prefix(f'Model Loaded')

    using_cfg = cfg_scale > 1.0
    if using_cfg:
        if accelerator.process_index == 0:
            print_with_prefix('Using cfg:', using_cfg)

    if rank == 0:
        os.makedirs(sample_folder_dir, exist_ok=True)
    if accelerator.process_index == 0 and not demo_sample_mode:
        print_with_prefix(f"Saving .png samples at {sample_folder_dir}")
    accelerator.wait_for_everyone()

    # Figure out how many samples we need to generate on each GPU and how many iterations we need to run:
    n = train_config['sample']['per_proc_batch_size']
    global_batch_size = n * accelerator.num_processes
    # To make things evenly-divisible, we'll sample a bit more than we need and then discard the extra samples:
    num_samples = len([name for name in os.listdir(sample_folder_dir) if (os.path.isfile(os.path.join(sample_folder_dir, name)) and ".png" in name)])
    total_samples = int(math.ceil(train_config['sample']['fid_num'] / global_batch_size) * global_batch_size)
    if rank == 0:
        if accelerator.process_index == 0:
            print_with_prefix(f"Total number of images that will be sampled: {total_samples}")
    assert total_samples % accelerator.num_processes == 0, "total_samples must be divisible by world_size"
    samples_needed_this_gpu = int(total_samples // accelerator.num_processes)
    assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size"
    iterations = int(samples_needed_this_gpu // n)
    done_iterations = int(int(num_samples // accelerator.num_processes) // n)
    pbar = range(iterations)
    if not demo_sample_mode:
        pbar = tqdm(pbar) if rank == 0 else pbar
    total = 0

    if accelerator.process_index == 0:
        print_with_prefix("Using latent normalization")
    if 'sample' in train_config['data']:
        train_config['data']['data_path'] += '_sample'
    dataset = ImgLatentDataset(
        data_dir=train_config['data']['data_path'],
        latent_norm=train_config['data']['latent_norm'] if 'latent_norm' in train_config['data'] else False,
        latent_multiplier=train_config['data']['latent_multiplier'] if 'latent_multiplier' in train_config['data'] else 0.18215,
        sample=train_config['data']['sample'] if 'sample' in train_config['data'] else False,
    )
    latent_mean, latent_std = dataset.get_latent_stats()
    latent_multiplier = train_config['data']['latent_multiplier'] if 'latent_multiplier' in train_config['data'] else 0.18215
    # move to device
    latent_mean = latent_mean.clone().detach().to(device)
    latent_std = latent_std.clone().detach().to(device)

    if demo_sample_mode:
        if accelerator.process_index == 0:
            images = []
            if using_cfg:
                for label in tqdm([975, 3, 207, 387, 388, 88, 979, 279], desc="Generating Demo Samples"):
                    z = torch.randn(1, model.in_channels, latent_size, latent_size, device=device)
                    y = torch.tensor([label], device=device)
                    z = torch.cat([z, z], 0)
                    y_null = torch.tensor([1000] * 1, device=device)
                    y = torch.cat([y, y_null], 0)
                    model_kwargs = dict(y=y, cfg_scale=cfg_scale, cfg_interval=False, cfg_interval_start=cfg_interval_start)
                    model_fn = model.forward_with_cfg
                    samples = sample_fn(z, model_fn, **model_kwargs)[-1]
                    samples = (samples * latent_std) / latent_multiplier + latent_mean
                    samples = vae.decode_to_images(samples)
                    images.append(samples)
            else:
                for label in tqdm([0] * 8, desc="Generating Demo Samples"):
                    z = torch.randn(1, model.in_channels, latent_size, latent_size, device=device)
                    y = torch.tensor([label], device=device)
                    model_kwargs = dict(y=y)
                    model_fn = model.forward
                    samples = sample_fn(z, model_fn, **model_kwargs)[-1]
                    samples = (samples * latent_std) / latent_multiplier + latent_mean
                    samples = vae.decode_to_images(samples)
                    images.append(samples)

            # Combine 8 images into a 2x4 grid
            os.makedirs('demo_images', exist_ok=True)
            # Stack all images into a large numpy array
            all_images = np.stack([img[0] for img in images])  # Take first image from each batch
            # Rearrange into 2x4 grid
            h, w = all_images.shape[1:3]
            grid = np.zeros((2 * h, 4 * w, 3), dtype=np.uint8)
            for idx, image in enumerate(all_images):
                i, j = divmod(idx, 4)  # Calculate position in 2x4 grid
                grid[i*h:(i+1)*h, j*w:(j+1)*w] = image

            # Save the combined image
            exp_name = train_config['train']['exp_name']
            ckpt_iter = train_config['ckpt_path'].split("/")[-1][:-3]
            Image.fromarray(grid).save(f'demo_images/{exp_name}_cfg{cfg_scale}_{ckpt_iter}_demo_samples.png')
        return None
    else:
        for i in pbar:
            # Sample inputs:
            z = torch.randn(n, model.in_channels, latent_size, latent_size, device=device)
            # Fix: the config key was checked as the typo 'trunaction' while the
            # value was read from 'truncation', so truncation either never
            # triggered or raised a KeyError. Check the correct key.
            if 'truncation' in train_config['sample']:
                truncation_bound = train_config['sample']['truncation']
                # Resample out-of-bound noise values (bounded retries).
                for _ in range(100):
                    invalid_mask = torch.abs(z) > truncation_bound
                    if not invalid_mask.any():
                        break
                    z[invalid_mask] = torch.randn_like(z[invalid_mask])
            y = torch.randint(0, train_config['data']['num_classes'], (n,), device=device)

            # Setup classifier-free guidance:
            if using_cfg:
                z = torch.cat([z, z], 0)
                y_null = torch.tensor([1000] * n, device=device)
                y = torch.cat([y, y_null], 0)
                model_kwargs = dict(y=y, cfg_scale=cfg_scale, cfg_interval=True, cfg_interval_start=cfg_interval_start)
                model_fn = model.forward_with_cfg
            else:
                model_kwargs = dict(y=y)
                model_fn = model.forward

            samples = sample_fn(z, model_fn, **model_kwargs)[-1]
            if using_cfg:
                samples, _ = samples.chunk(2, dim=0)  # Remove null class samples

            samples = (samples * latent_std) / latent_multiplier + latent_mean
            samples = vae.decode_to_images(samples)

            # Save samples to disk as individual .png files.
            # (renamed from `i` to avoid shadowing the outer loop variable)
            for sample_idx, sample in enumerate(samples):
                index = sample_idx * accelerator.num_processes + accelerator.process_index + total
                Image.fromarray(sample).save(f"{sample_folder_dir}/{index:06d}.png")
            total += global_batch_size
        accelerator.wait_for_everyone()

    return sample_folder_dir
|
| 302 |
+
|
| 303 |
+
# some utils
|
| 304 |
+
def print_with_prefix(*messages):
    """Print *messages* behind a blue, timestamped ``[LightningDiT-Sampling]`` tag."""
    body = ' '.join(str(m) for m in messages)
    stamp = strftime('%Y-%m-%d %H:%M:%S')
    print(f"\033[34m[LightningDiT-Sampling {stamp}]\033[0m: {body}")
|
| 308 |
+
|
| 309 |
+
def load_config(config_path):
    """Read a YAML config file and return it as a plain Python object."""
    with open(config_path, "r") as fp:
        return yaml.safe_load(fp)
|
| 313 |
+
|
| 314 |
+
if __name__ == "__main__":

    # read config
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default='configs/lightningdit_b_ldmvae_f16d16.yaml')
    parser.add_argument('--demo', action='store_true', default=False)
    args = parser.parse_args()
    accelerator = Accelerator()
    train_config = load_config(args.config)

    # get ckpt_dir
    assert 'ckpt_path' in train_config, "ckpt_path must be specified in config"
    if accelerator.process_index == 0:
        print_with_prefix('Using ckpt:', train_config['ckpt_path'])
    ckpt_dir = train_config['ckpt_path']

    # Latent grid size = image size / tokenizer downsample ratio (16 by default).
    if 'downsample_ratio' in train_config['vae']:
        latent_size = train_config['data']['image_size'] // train_config['vae']['downsample_ratio']
    else:
        latent_size = train_config['data']['image_size'] // 16

    # get model
    # Build the DiT; optional architecture flags default to False / 4 channels
    # when absent from the config.
    model = LightningDiT_models[train_config['model']['model_type']](
        input_size=latent_size,
        num_classes=train_config['data']['num_classes'],
        use_qknorm=train_config['model']['use_qknorm'],
        use_swiglu=train_config['model']['use_swiglu'] if 'use_swiglu' in train_config['model'] else False,
        use_rope=train_config['model']['use_rope'] if 'use_rope' in train_config['model'] else False,
        use_rmsnorm=train_config['model']['use_rmsnorm'] if 'use_rmsnorm' in train_config['model'] else False,
        wo_shift=train_config['model']['wo_shift'] if 'wo_shift' in train_config['model'] else False,
        in_channels=train_config['model']['in_chans'] if 'in_chans' in train_config['model'] else 4,
        learn_sigma=train_config['model']['learn_sigma'] if 'learn_sigma' in train_config['model'] else False,
        class_dropout_prob=0 if train_config['data']['num_classes'] == 1 else 0.1,
    )

    # naive sample
    sample_folder_dir = do_sample(train_config, accelerator, ckpt_path=ckpt_dir, model=model, demo_sample_mode=args.demo)

    if not args.demo:
        # calculate FID
        # Important: FID is only for reference, please use ADM evaluation for paper reporting
        if accelerator.process_index == 0:
            from tools.calculate_fid import calculate_fid_given_paths
            print_with_prefix('Calculating FID with {} number of samples'.format(train_config['sample']['fid_num']))
            assert 'fid_reference_file' in train_config['data'], "fid_reference_file must be specified in config"
            fid_reference_file = train_config['data']['fid_reference_file']
            fid = calculate_fid_given_paths(
                [fid_reference_file, sample_folder_dir],
                batch_size=50,
                dims=2048,
                device='cuda',
                num_workers=8,
                sp_len = train_config['sample']['fid_num']
            )
            print_with_prefix('fid=',fid)
|
LDMAE/models/__init__.py
ADDED
|
File without changes
|
LDMAE/models/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (171 Bytes). View file
|
|
|
LDMAE/models/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (169 Bytes). View file
|
|
|
LDMAE/models/__pycache__/lightningdit.cpython-310.pyc
ADDED
|
Binary file (15.9 kB). View file
|
|
|
LDMAE/models/__pycache__/lightningdit.cpython-38.pyc
ADDED
|
Binary file (16 kB). View file
|
|
|
LDMAE/models/__pycache__/pos_embed.cpython-310.pyc
ADDED
|
Binary file (4.77 kB). View file
|
|
|
LDMAE/models/__pycache__/pos_embed.cpython-38.pyc
ADDED
|
Binary file (4.76 kB). View file
|
|
|
LDMAE/models/__pycache__/rmsnorm.cpython-310.pyc
ADDED
|
Binary file (16.4 kB). View file
|
|
|
LDMAE/models/__pycache__/rmsnorm.cpython-38.pyc
ADDED
|
Binary file (16.5 kB). View file
|
|
|
LDMAE/models/__pycache__/swiglu_ffn.cpython-310.pyc
ADDED
|
Binary file (2.16 kB). View file
|
|
|
LDMAE/models/__pycache__/swiglu_ffn.cpython-38.pyc
ADDED
|
Binary file (2.07 kB). View file
|
|
|
LDMAE/models/lightningdit.py
ADDED
|
@@ -0,0 +1,531 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Lightning DiT's codes are built from original DiT & SiT.
|
| 3 |
+
(https://github.com/facebookresearch/DiT; https://github.com/willisma/SiT)
|
| 4 |
+
It demonstrates that a advanced DiT together with advanced diffusion skills
|
| 5 |
+
could also achieve a very promising result with 1.35 FID on ImageNet 256 generation.
|
| 6 |
+
|
| 7 |
+
Enjoy everyone, DiT strikes back!
|
| 8 |
+
|
| 9 |
+
by Maple (Jingfeng Yao) from HUST-VL
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import math
|
| 14 |
+
import numpy as np
|
| 15 |
+
|
| 16 |
+
import torch
|
| 17 |
+
import torch.nn as nn
|
| 18 |
+
import torch.nn.functional as F
|
| 19 |
+
from torch.utils.checkpoint import checkpoint
|
| 20 |
+
|
| 21 |
+
from timm.models.vision_transformer import PatchEmbed, Mlp
|
| 22 |
+
from models.swiglu_ffn import SwiGLUFFN
|
| 23 |
+
from models.pos_embed import VisionRotaryEmbeddingFast
|
| 24 |
+
from models.rmsnorm import RMSNorm
|
| 25 |
+
|
| 26 |
+
@torch.compile
def modulate(x, shift, scale):
    # AdaLN modulation: rescale (and optionally shift) per-sample.
    # `shift`/`scale` are (N, D); unsqueeze(1) broadcasts across the token dim of x (N, T, D).
    if shift is None:
        # Shift-free AdaLN variant (`wo_shift=True` in the blocks): scale-only.
        return x * (1 + scale.unsqueeze(1))
    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
|
| 31 |
+
|
| 32 |
+
class Attention(nn.Module):
    """Multi-head self-attention for LightningDiT blocks.

    Supports optional QK-normalization (LayerNorm or RMSNorm), rotary
    position embeddings applied to q/k, and either the fused SDPA kernel
    or a manual softmax attention path.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        attn_drop: float = 0.,
        proj_drop: float = 0.,
        norm_layer: nn.Module = nn.LayerNorm,
        fused_attn: bool = True,
        use_rmsnorm: bool = False,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'

        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.fused_attn = fused_attn

        # RMSNorm overrides whatever norm_layer was passed in.
        # (Ternary keeps the RMSNorm lookup lazy, as in the original.)
        norm_layer = RMSNorm if use_rmsnorm else norm_layer

        # Submodules are created in the same order as the reference
        # implementation so parameter initialization is reproducible.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor, rope=None) -> torch.Tensor:
        """Apply self-attention over x (B, N, C); `rope` optionally rotates q/k."""
        batch, tokens, channels = x.shape
        qkv = (
            self.qkv(x)
            .reshape(batch, tokens, 3, self.num_heads, self.head_dim)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = qkv.unbind(0)
        q = self.q_norm(q)
        k = self.k_norm(k)

        if rope is not None:
            q, k = rope(q), rope(k)

        if self.fused_attn:
            drop_p = self.attn_drop.p if self.training else 0.
            out = F.scaled_dot_product_attention(q, k, v, dropout_p=drop_p)
        else:
            scores = (q * self.scale) @ k.transpose(-2, -1)
            weights = self.attn_drop(scores.softmax(dim=-1))
            out = weights @ v

        out = out.transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(out))
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class TimestepEmbedder(nn.Module):
    """Maps scalar diffusion timesteps to `hidden_size`-dim vectors.

    A fixed sinusoidal frequency embedding followed by a small two-layer
    SiLU MLP, exactly as in the original DiT.
    """

    def __init__(self, hidden_size: int, frequency_embedding_size: int = 256) -> None:
        super().__init__()
        self.frequency_embedding_size = frequency_embedding_size
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )

    @staticmethod
    def timestep_embedding(t: torch.Tensor, dim: int, max_period: int = 10000) -> torch.Tensor:
        """Create sinusoidal timestep embeddings.

        Args:
            t: 1-D tensor of N (possibly fractional) timestep indices.
            dim: output embedding dimension.
            max_period: controls the minimum embedded frequency.

        Returns:
            An (N, dim) tensor of positional embeddings (cos halves first).
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        exponents = torch.arange(start=0, end=half, dtype=torch.float32) / half
        freqs = torch.exp(-math.log(max_period) * exponents).to(device=t.device)

        angles = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)

        if dim % 2:
            # Odd dim: pad one zero column so the width matches `dim`.
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)

        return embedding

    @torch.compile
    def forward(self, t: torch.Tensor) -> torch.Tensor:
        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
class LabelEmbedder(nn.Module):
    """Embeds class labels, with label dropout for classifier-free guidance.

    When `dropout_prob > 0` an extra "null" row (index `num_classes`) is
    appended to the embedding table, and dropped labels are mapped to it.
    """

    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """Replace a subset of labels with the null-class index.

        Positions where `force_drop_ids == 1` are dropped when it is given;
        otherwise each label is dropped independently with `dropout_prob`.
        """
        if force_drop_ids is not None:
            drop_ids = force_drop_ids == 1
        else:
            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        return torch.where(drop_ids, self.num_classes, labels)

    @torch.compile
    def forward(self, labels, train, force_drop_ids=None):
        should_drop = (train and self.dropout_prob > 0) or (force_drop_ids is not None)
        if should_drop:
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)
|
| 170 |
+
|
| 171 |
+
class LightningDiTBlock(nn.Module):
    """A LightningDiT transformer block: AdaLN-modulated attention + MLP.

    Optional features (see the LightningDiT paper; not all are enabled in
    the final model): rotary position embeddings, QK-norm, RMSNorm,
    SwiGLU MLP, and a shift-free AdaLN variant.
    """

    def __init__(
        self,
        hidden_size,
        num_heads,
        mlp_ratio=4.0,
        use_qknorm=False,
        use_swiglu=False,
        use_rmsnorm=False,
        wo_shift=False,
        **block_kwargs
    ):
        super().__init__()

        # Pre-attention / pre-MLP norms: affine-free LayerNorm or RMSNorm.
        if use_rmsnorm:
            self.norm1 = RMSNorm(hidden_size)
            self.norm2 = RMSNorm(hidden_size)
        else:
            self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
            self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)

        self.attn = Attention(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            qk_norm=use_qknorm,
            use_rmsnorm=use_rmsnorm,
            **block_kwargs
        )

        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        if use_swiglu:
            # Local SwiGLU implementation: the xformers one is not
            # compatible with torch.compile at the moment.
            self.mlp = SwiGLUFFN(hidden_size, int(2 / 3 * mlp_hidden_dim))
        else:
            self.mlp = Mlp(
                in_features=hidden_size,
                hidden_features=mlp_hidden_dim,
                act_layer=lambda: nn.GELU(approximate="tanh"),
                drop=0,
            )

        # AdaLN emits (shift, scale, gate) per branch, or just (scale, gate)
        # in the shift-free variant: 6 vs 4 chunks of hidden_size.
        n_mod = 4 if wo_shift else 6
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, n_mod * hidden_size, bias=True),
        )
        self.wo_shift = wo_shift

    @torch.compile
    def forward(self, x, c, feat_rope=None):
        mod = self.adaLN_modulation(c)
        if self.wo_shift:
            scale_msa, gate_msa, scale_mlp, gate_mlp = mod.chunk(4, dim=1)
            shift_msa = shift_mlp = None
        else:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)

        x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa), rope=feat_rope)
        x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
        return x
|
| 251 |
+
|
| 252 |
+
class FinalLayer(nn.Module):
    """Output head of LightningDiT.

    AdaLN-modulated norm followed by a linear projection back to
    patch_size**2 * out_channels values per token.
    """

    def __init__(self, hidden_size, patch_size, out_channels, use_rmsnorm=False):
        super().__init__()
        self.norm_final = (
            RMSNorm(hidden_size)
            if use_rmsnorm
            else nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        )
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True),
        )

    @torch.compile
    def forward(self, x, c):
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        return self.linear(modulate(self.norm_final(x), shift, scale))
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
class LightningDiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    Patchifies a (latent) image, conditions on timestep + class label via
    AdaLN, runs `depth` LightningDiTBlocks, and projects back to patches.
    """
    def __init__(
        self,
        input_size=32,           # spatial size of the (latent) input
        patch_size=2,            # patch size used for tokenization
        in_channels=32,          # channels of the latent input
        hidden_size=1152,
        depth=28,                # number of transformer blocks
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,  # label dropout for classifier-free guidance
        num_classes=1000,
        learn_sigma=False,       # if True, head also predicts per-channel sigma
        use_qknorm=False,
        use_swiglu=False,
        use_rope=False,
        use_rmsnorm=False,
        wo_shift=False,
        use_checkpoint=False,    # gradient checkpointing of the blocks
    ):
        super().__init__()
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        # With learn_sigma, the head emits (mean, sigma): twice the channels.
        self.out_channels = in_channels if not learn_sigma else in_channels * 2
        self.patch_size = patch_size
        self.num_heads = num_heads
        self.use_rope = use_rope
        self.use_rmsnorm = use_rmsnorm
        self.depth = depth
        self.hidden_size = hidden_size
        self.use_checkpoint = use_checkpoint
        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        num_patches = self.x_embedder.num_patches
        # Will use fixed sin-cos embedding (frozen; filled in initialize_weights):
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)

        # use rotary position encoding, borrow from EVA
        if self.use_rope:
            half_head_dim = hidden_size // num_heads // 2
            hw_seq_len = input_size // patch_size
            self.feat_rope = VisionRotaryEmbeddingFast(
                dim=half_head_dim,
                pt_seq_len=hw_seq_len,
            )
        else:
            self.feat_rope = None

        self.blocks = nn.ModuleList([
            LightningDiTBlock(hidden_size,
                              num_heads,
                              mlp_ratio=mlp_ratio,
                              use_qknorm=use_qknorm,
                              use_swiglu=use_swiglu,
                              use_rmsnorm=use_rmsnorm,
                              wo_shift=wo_shift,
                              ) for _ in range(depth)
        ])
        self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels, use_rmsnorm=use_rmsnorm)
        self.initialize_weights()

    def initialize_weights(self):
        """Apply DiT's init scheme: xavier linears, zeroed AdaLN and output head."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize (and freeze) pos_embed by sin-cos embedding:
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize label embedding table:
        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in LightningDiT blocks
        # (each residual branch starts as an identity mapping):
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    def unpatchify(self, x):
        """
        Reassemble token predictions into an image-shaped tensor.
        x: (N, T, patch_size**2 * C)
        imgs: (N, C, H, W)  (docstring above said NHWC; the einsum below
        actually produces NCHW — note for reviewers.)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        h = w = int(x.shape[1] ** 0.5)  # token grid is assumed square
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        # h == w is asserted above, so `h * p` serves for both spatial dims.
        imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
        return imgs

    def forward(self, x, t=None, y=None):
        """
        Forward pass of LightningDiT.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        y: (N,) tensor of class labels
        use_checkpoint: boolean to toggle checkpointing

        NOTE(review): despite the None defaults, t and y are effectively
        required — both are passed straight into their embedders and would
        fail if omitted. Confirm whether the defaults can be removed.
        """

        use_checkpoint = self.use_checkpoint

        x = self.x_embedder(x) + self.pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(t)  # (N, D)
        y = self.y_embedder(y, self.training)  # (N, D)
        c = t + y  # (N, D) combined conditioning vector

        for block in self.blocks:
            if use_checkpoint:
                x = checkpoint(block, x, c, self.feat_rope, use_reentrant=True)
            else:
                x = block(x, c, self.feat_rope)

        x = self.final_layer(x, c)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)

        if self.learn_sigma:
            # Drop the predicted sigma channels; only the mean is returned.
            x, _ = x.chunk(2, dim=1)
        return x

    def forward_with_cfg(self, x, t, y, cfg_scale, cfg_interval=None, cfg_interval_start=None):
        """
        Forward pass of LightningDiT, but also batches the unconditional forward pass for classifier-free guidance.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        model_out = self.forward(combined, t, y)
        # For exact reproducibility reasons, we apply classifier-free guidance on only
        # three channels by default. The standard approach to cfg applies it to all channels.
        # This can be done by uncommenting the following line and commenting-out the line following that.
        # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
        # NOTE(review): with in_channels=32 latents this guides only the first
        # 3 channels — this convention is inherited from pixel-space DiT;
        # confirm it is intended for latent-space models.
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)

        # Optionally disable guidance below a timestep threshold.
        if cfg_interval is True:
            timestep = t[0]
            if timestep < cfg_interval_start:
                half_eps = cond_eps

        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)
|
| 443 |
+
|
| 444 |
+
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """Build a fixed 2-D sin-cos positional embedding.

    Args:
        embed_dim: embedding dimension (must be even).
        grid_size: grid height and width.
        cls_token: whether the model prepends extra tokens.
        extra_tokens: number of leading tokens to pad with zeros.

    Returns:
        [grid_size*grid_size, embed_dim] array, or
        [extra_tokens + grid_size*grid_size, embed_dim] when padded.
    """
    axis = np.arange(grid_size, dtype=np.float32)
    grid = np.stack(np.meshgrid(axis, axis), axis=0)  # here w goes first
    grid = grid.reshape([2, 1, grid_size, grid_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Concatenate 1-D embeddings of the grid's h and w coordinates."""
    assert embed_dim % 2 == 0
    # use half of dimensions to encode grid_h, the other half for grid_w
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
    return np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """1-D sin-cos embedding over log-spaced frequencies.

    Args:
        embed_dim: output dimension per position (must be even).
        pos: array of positions, any shape; flattened to (M,).

    Returns:
        (M, embed_dim) array: sin halves first, then cos halves.
    """
    assert embed_dim % 2 == 0
    omega = 1. / 10000 ** (np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2.))  # (D/2,)
    angles = np.einsum('m,d->md', pos.reshape(-1), omega)  # (M, D/2), outer product
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
#################################################################################
#                                 LightningDiT Configs                          #
#################################################################################
# Factory naming: LightningDiT_<size>_<patch_size>.
# "1p0B" / "1p6B" denote roughly 1.0B- / 1.6B-parameter variants.

def LightningDiT_XL_1(**kwargs):
    return LightningDiT(depth=28, hidden_size=1152, patch_size=1, num_heads=16, **kwargs)

def LightningDiT_XL_2(**kwargs):
    return LightningDiT(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)

def LightningDiT_L_2(**kwargs):
    return LightningDiT(depth=24, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)

def LightningDiT_B_1(**kwargs):
    return LightningDiT(depth=12, hidden_size=768, patch_size=1, num_heads=12, **kwargs)

def LightningDiT_B_2(**kwargs):
    return LightningDiT(depth=12, hidden_size=768, patch_size=2, num_heads=12, **kwargs)

def LightningDiT_1p0B_1(**kwargs):
    return LightningDiT(depth=24, hidden_size=1536, patch_size=1, num_heads=24, **kwargs)

def LightningDiT_1p0B_2(**kwargs):
    return LightningDiT(depth=24, hidden_size=1536, patch_size=2, num_heads=24, **kwargs)

def LightningDiT_1p6B_1(**kwargs):
    return LightningDiT(depth=28, hidden_size=1792, patch_size=1, num_heads=28, **kwargs)

def LightningDiT_1p6B_2(**kwargs):
    return LightningDiT(depth=28, hidden_size=1792, patch_size=2, num_heads=28, **kwargs)

# Registry keyed by the "<name>/<patch>" strings referenced from the
# `model_type` field of the YAML configs.
LightningDiT_models = {
    'LightningDiT-B/1': LightningDiT_B_1, 'LightningDiT-B/2': LightningDiT_B_2,
    'LightningDiT-L/2': LightningDiT_L_2,
    'LightningDiT-XL/1': LightningDiT_XL_1, 'LightningDiT-XL/2': LightningDiT_XL_2,
    'LightningDiT-1p0B/1': LightningDiT_1p0B_1, 'LightningDiT-1p0B/2': LightningDiT_1p0B_2,
    'LightningDiT-1p6B/1': LightningDiT_1p6B_1, 'LightningDiT-1p6B/2': LightningDiT_1p6B_2,
}
|
LDMAE/models/lpips.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import hashlib
|
| 3 |
+
import requests
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from torchvision import models
|
| 6 |
+
from collections import namedtuple
|
| 7 |
+
import os
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
|
| 10 |
+
# Download location of the pretrained LPIPS (VGG16) "lin" weights.
URL_MAP = {"vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"}

# Local filename used to cache the checkpoint.
CKPT_MAP = {"vgg_lpips": "vgg.pth"}

# Expected MD5 of the checkpoint, used to validate the download.
MD5_MAP = {"vgg_lpips": "d507d7349b931f0638a25a48a722f98a"}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def download(url, local_path, chunk_size=1024):
    """Stream `url` to `local_path` with a progress bar.

    Parent directories are created as needed; the download is streamed in
    `chunk_size`-byte pieces.
    """
    os.makedirs(os.path.split(local_path)[0], exist_ok=True)
    with requests.get(url, stream=True) as r:
        total_size = int(r.headers.get("content-length", 0))
        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar, \
                open(local_path, "wb") as f:
            for data in r.iter_content(chunk_size=chunk_size):
                if data:
                    f.write(data)
                    pbar.update(chunk_size)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def md5_hash(path):
    """Return the hex MD5 digest of the file at `path`."""
    with open(path, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def get_ckpt_path(name, root, check=False):
    """Return the local checkpoint path for `name`, downloading if needed.

    Args:
        name: key into URL_MAP / CKPT_MAP / MD5_MAP.
        root: directory in which the checkpoint is cached.
        check: if True, also re-download when the cached file's MD5 mismatches.
    """
    assert name in URL_MAP
    path = os.path.join(root, CKPT_MAP[name])
    missing = not os.path.exists(path)
    stale = check and md5_hash(path) != MD5_MAP[name] if not missing else False
    if missing or stale:
        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
        download(URL_MAP[name], path)
        md5 = md5_hash(path)
        assert md5 == MD5_MAP[name], md5
    return path
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class LPIPS(nn.Module):
    """Learned Perceptual Image Patch Similarity (LPIPS) metric.

    Compares two images via channel-normalized VGG16 feature differences,
    weighted by pretrained 1x1 "lin" layers. All parameters are frozen —
    this module is a fixed metric, not a trainable network.
    """

    # Cache directory for the pretrained LPIPS "lin" weights; shared by
    # load_from_pretrained and from_pretrained.
    _CKPT_ROOT = "movqgan/modules/losses/lpips"

    def __init__(self, use_dropout=True):
        super().__init__()
        self.scaling_layer = ScalingLayer()
        self.chns = [64, 128, 256, 512, 512]  # vgg16 feature widths per stage
        self.net = vgg16(pretrained=True, requires_grad=False)
        self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
        self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
        self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
        self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
        self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
        self.load_from_pretrained()
        # Freeze everything: LPIPS is used only for evaluation/loss.
        for param in self.parameters():
            param.requires_grad = False

    def load_from_pretrained(self, name="vgg_lpips"):
        """Load the pretrained lin-layer weights into this instance."""
        ckpt = get_ckpt_path(name, self._CKPT_ROOT)
        self.load_state_dict(
            torch.load(ckpt, map_location=torch.device("cpu")), strict=False
        )
        print("loaded pretrained LPIPS loss from {}".format(ckpt))

    @classmethod
    def from_pretrained(cls, name="vgg_lpips"):
        """Construct an LPIPS instance with pretrained weights.

        Only the "vgg_lpips" variant is supported.
        """
        if name != "vgg_lpips":
            raise NotImplementedError
        model = cls()
        # Bug fix: get_ckpt_path requires a cache-root argument; the original
        # call omitted it and always raised a TypeError.
        ckpt = get_ckpt_path(name, cls._CKPT_ROOT)
        model.load_state_dict(
            torch.load(ckpt, map_location=torch.device("cpu")), strict=False
        )
        return model

    def forward(self, input, target):
        """Return the LPIPS distance between `input` and `target` (NCHW).

        Both inputs pass through the scaling layer, then VGG16; per-stage
        normalized feature differences are squared, projected by the lin
        layers, spatially averaged, and summed across stages.
        """
        in0_input, in1_input = self.scaling_layer(input), self.scaling_layer(target)
        outs0, outs1 = self.net(in0_input), self.net(in1_input)
        lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
        val = 0
        for kk in range(len(self.chns)):
            feat0 = normalize_tensor(outs0[kk])
            feat1 = normalize_tensor(outs1[kk])
            diff = (feat0 - feat1) ** 2
            val = val + spatial_average(lins[kk].model(diff), keepdim=True)
        return val
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class ScalingLayer(nn.Module):
    """Remap ImageNet-normalized inputs to the range the LPIPS VGG expects."""

    def __init__(self):
        super().__init__()
        # fixed per-channel statistics, shaped (1, 3, 1, 1) for broadcasting
        shift = torch.Tensor([-0.030, -0.088, -0.188]).view(1, 3, 1, 1)
        scale = torch.Tensor([0.458, 0.448, 0.450]).view(1, 3, 1, 1)
        self.register_buffer("shift", shift)
        self.register_buffer("scale", scale)

    def forward(self, inp):
        # convert imagenet normalized data to [-1, 1]
        return (inp - self.shift) / self.scale
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class NetLinLayer(nn.Module):
    """A single linear layer which does a 1x1 conv"""

    def __init__(self, chn_in, chn_out=1, use_dropout=False):
        super().__init__()
        modules = []
        if use_dropout:
            modules.append(nn.Dropout())
        # bias-free 1x1 convolution: a learned per-channel weighting
        modules.append(nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False))
        self.model = nn.Sequential(*modules)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
class vgg16(torch.nn.Module):
    """Frozen VGG16 feature extractor returning five intermediate activations."""

    # feature-module index ranges producing relu1_2 ... relu5_3
    _STAGES = ((0, 4), (4, 9), (9, 16), (16, 23), (23, 30))

    def __init__(self, requires_grad=False, pretrained=True):
        super(vgg16, self).__init__()
        features = models.vgg16(pretrained=pretrained).features
        self.N_slices = 5
        # slice1..slice5, each a Sequential over a contiguous run of VGG layers;
        # submodule names keep the original torchvision indices so pretrained
        # state_dict keys still line up
        for idx, (start, stop) in enumerate(self._STAGES, start=1):
            stage = torch.nn.Sequential()
            for j in range(start, stop):
                stage.add_module(str(j), features[j])
            setattr(self, "slice%d" % idx, stage)
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        """Run X through the five stages and return all tap activations."""
        taps = []
        h = X
        for idx in range(1, self.N_slices + 1):
            h = getattr(self, "slice%d" % idx)(h)
            taps.append(h)
        vgg_outputs = namedtuple(
            "VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"]
        )
        return vgg_outputs(*taps)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def normalize_tensor(x, eps=1e-10):
    """L2-normalize x along the channel dimension (dim=1).

    eps guards against division by zero for all-zero channels.
    """
    channel_norm = x.pow(2).sum(dim=1, keepdim=True).sqrt()
    return x / (channel_norm + eps)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def spatial_average(x, keepdim=True):
    """Average an NCHW tensor over its spatial dimensions (H, W)."""
    return x.mean(dim=[2, 3], keepdim=keepdim)
|
LDMAE/models/pos_embed.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --------------------------------------------------------
|
| 2 |
+
# EVA-02: A Visual Representation for Neon Genesis
|
| 3 |
+
# Github source: https://github.com/baaivision/EVA/EVA02
|
| 4 |
+
# Copyright (c) 2023 Beijing Academy of Artificial Intelligence (BAAI)
|
| 5 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 6 |
+
# By Yuxin Fang
|
| 7 |
+
#
|
| 8 |
+
# Based on https://github.com/lucidrains/rotary-embedding-torch
|
| 9 |
+
# --------------------------------------------------------'
|
| 10 |
+
|
| 11 |
+
from math import pi
|
| 12 |
+
|
| 13 |
+
import torch
|
| 14 |
+
from torch import nn
|
| 15 |
+
|
| 16 |
+
from einops import rearrange, repeat
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def broadcat(tensors, dim=-1):
    """Concatenate `tensors` along `dim` after broadcasting all other axes.

    Every non-concat axis must contain at most two distinct sizes (so one of
    them can be 1 and expand to the other); the concat axis keeps each
    tensor's own length.
    """
    ranks = set(t.dim() for t in tensors)
    assert len(ranks) == 1, 'tensors must all have the same number of dimensions'
    rank = ranks.pop()
    dim = dim + rank if dim < 0 else dim
    # sizes of each axis collected across all tensors
    axis_sizes = list(zip(*(t.shape for t in tensors)))
    other_axes = [(i, sizes) for i, sizes in enumerate(axis_sizes) if i != dim]
    assert all(len(set(sizes)) <= 2 for _, sizes in other_axes), 'invalid dimensions for broadcastable concatentation'
    target = [max(sizes) for sizes in axis_sizes]
    expanded = []
    for t in tensors:
        shape = list(target)
        shape[dim] = t.shape[dim]  # the concat axis keeps its own length
        expanded.append(t.expand(*shape))
    return torch.cat(expanded, dim=dim)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def rotate_half(x):
    """Rotate adjacent feature pairs: (x0, x1, x2, x3, ...) -> (-x1, x0, -x3, x2, ...).

    This is the pairwise 90-degree rotation used by rotary embeddings; the
    last dimension must have even length.
    """
    pairs = x.reshape(*x.shape[:-1], -1, 2)
    even, odd = pairs.unbind(dim=-1)
    rotated = torch.stack((-odd, even), dim=-1)
    return rotated.reshape(x.shape)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class VisionRotaryEmbedding(nn.Module):
    """2D rotary position embedding applied to a slice of the feature dim.

    Precomputes cos/sin tables for an (ft_seq_len, ft_seq_len) token grid,
    with positions rescaled so a fine-tune grid matches the pretrain grid.
    """

    def __init__(
        self,
        dim,
        pt_seq_len,
        ft_seq_len=None,
        custom_freqs=None,
        freqs_for='lang',
        theta=10000,
        max_freq=10,
        num_freqs=1,
    ):
        super().__init__()
        # choose the base frequency spectrum
        if custom_freqs:
            freqs = custom_freqs
        elif freqs_for == 'lang':
            freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
        elif freqs_for == 'pixel':
            freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
        elif freqs_for == 'constant':
            freqs = torch.ones(num_freqs).float()
        else:
            raise ValueError(f'unknown modality {freqs_for}')

        if ft_seq_len is None:
            ft_seq_len = pt_seq_len
        # positions rescaled onto the pretraining coordinate range
        t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len

        # position x frequency angles, each duplicated for the (cos, sin)
        # pair layout rotate_half expects
        freqs_h = torch.repeat_interleave(
            torch.einsum('..., f -> ... f', t, freqs), 2, dim=-1
        )
        freqs_w = freqs_h.clone()

        # combine row and column angles over the 2D grid
        freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim=-1)

        self.register_buffer("freqs_cos", freqs.cos())
        self.register_buffer("freqs_sin", freqs.sin())

    def forward(self, t, start_index=0):
        """Rotate t[..., start_index:start_index+rot_dim]; pass the rest through."""
        rot_dim = self.freqs_cos.shape[-1]
        end_index = start_index + rot_dim
        assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
        left = t[..., :start_index]
        mid = t[..., start_index:end_index]
        right = t[..., end_index:]
        mid = (mid * self.freqs_cos) + (rotate_half(mid) * self.freqs_sin)
        return torch.cat((left, mid, right), dim=-1)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class VisionRotaryEmbeddingFast(nn.Module):
    """Rotary embedding with the 2D token grid pre-flattened for fast apply."""

    def __init__(
        self,
        dim,
        pt_seq_len=16,
        ft_seq_len=None,
        custom_freqs=None,
        freqs_for='lang',
        theta=10000,
        max_freq=10,
        num_freqs=1,
    ):
        super().__init__()
        # choose the base frequency spectrum
        if custom_freqs:
            freqs = custom_freqs
        elif freqs_for == 'lang':
            freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
        elif freqs_for == 'pixel':
            freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
        elif freqs_for == 'constant':
            freqs = torch.ones(num_freqs).float()
        else:
            raise ValueError(f'unknown modality {freqs_for}')

        if ft_seq_len is None:
            ft_seq_len = pt_seq_len
        # positions rescaled onto the pretraining coordinate range
        t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len

        # per-position angles, duplicated for the (cos, sin) pair layout,
        # combined over rows and columns of the 2D grid
        angles = torch.repeat_interleave(
            torch.einsum('..., f -> ... f', t, freqs), 2, dim=-1
        )
        angles = broadcat((angles[:, None, :], angles[None, :, :]), dim=-1)

        # flatten the (h, w) grid into a single token axis
        self.register_buffer("freqs_cos", angles.cos().view(-1, angles.shape[-1]))
        self.register_buffer("freqs_sin", angles.sin().view(-1, angles.shape[-1]))

    def forward(self, t):
        """Rotate every feature of t by its precomputed per-token angle."""
        return t * self.freqs_cos + rotate_half(t) * self.freqs_sin
|
LDMAE/models/rmsnorm.py
ADDED
|
@@ -0,0 +1,495 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
|
| 3 |
+
|
| 4 |
+
import math
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import Optional, Tuple
|
| 7 |
+
|
| 8 |
+
import fairscale.nn.model_parallel.initialize as fs_init
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
from fairscale.nn.model_parallel.layers import (
|
| 12 |
+
ColumnParallelLinear,
|
| 13 |
+
ParallelEmbedding,
|
| 14 |
+
RowParallelLinear,
|
| 15 |
+
)
|
| 16 |
+
from torch import nn
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class ModelArgs:
    """Transformer hyperparameters consumed by Attention/FeedForward/Transformer."""
    dim: int = 4096  # model (embedding) width
    n_layers: int = 32
    n_heads: int = 32  # query heads
    n_kv_heads: Optional[int] = None  # grouped-query attention; None means n_heads
    vocab_size: int = -1  # defined later by tokenizer
    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
    ffn_dim_multiplier: Optional[float] = None  # optional scaling of the FFN width
    norm_eps: float = 1e-5  # RMSNorm epsilon

    # bounds used to pre-allocate the attention KV caches
    max_batch_size: int = 32
    max_seq_len: int = 2048
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class RMSNorm(torch.nn.Module):
    """Root-mean-square layer normalization (no mean subtraction, no bias).

    Normalizes over the last dimension by the RMS of the activations and
    applies a learnable per-channel gain.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        """
        Args:
            dim (int): Size of the last (normalized) dimension.
            eps (float, optional): Stability term added under the rsqrt.
        """
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """Return x / sqrt(mean(x^2) + eps), computed via rsqrt."""
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        return x * torch.rsqrt(mean_sq + self.eps)

    def forward(self, x):
        """Normalize x in float32 for stability, cast back, apply the gain."""
        normalized = self._norm(x.float()).type_as(x)
        return normalized * self.weight
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    """Precompute the complex rotary frequencies e^(i * position * freq).

    Args:
        dim (int): Head dimension; dim // 2 distinct frequencies are produced.
        end (int): Number of positions to precompute.
        theta (float, optional): Base of the geometric frequency spectrum.
            Defaults to 10000.0.

    Returns:
        torch.Tensor: Complex64 tensor of shape (end, dim // 2), where entry
        (t, k) is the unit complex number at angle t * theta^(-2k/dim).
    """
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freqs = 1.0 / (theta ** exponents)
    positions = torch.arange(end, device=inv_freqs.device)  # type: ignore
    angles = torch.outer(positions, inv_freqs).float()  # type: ignore
    # unit-magnitude complex numbers at each angle (complex64)
    return torch.polar(torch.ones_like(angles), angles)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """Reshape a (seqlen, head_dim) frequency tensor to broadcast against x.

    The result has size 1 on every axis of x except the sequence axis (1)
    and the last axis, so element-wise multiplication broadcasts over batch
    and heads.

    Args:
        freqs_cis (torch.Tensor): Tensor of shape (x.shape[1], x.shape[-1]).
        x (torch.Tensor): Target tensor supplying the broadcast layout.

    Returns:
        torch.Tensor: View of freqs_cis with singleton broadcast axes.

    Raises:
        AssertionError: If freqs_cis does not match (x.shape[1], x.shape[-1])
            or x has fewer than 2 dimensions.
    """
    ndim = x.ndim
    assert 0 <= 1 < ndim
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    shape = [1] * ndim
    shape[1] = x.shape[1]
    shape[-1] = x.shape[-1]
    return freqs_cis.view(*shape)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate query/key features by the precomputed complex frequencies.

    The last dimension of each tensor is viewed as adjacent (real, imag)
    pairs, multiplied by `freqs_cis` (broadcast over batch and heads), and
    flattened back to real values.

    Args:
        xq (torch.Tensor): Query tensor, last dim of even length.
        xk (torch.Tensor): Key tensor, last dim of even length.
        freqs_cis (torch.Tensor): Output of precompute_freqs_cis for this
            position range.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Rotated (xq, xk), cast back to
        the input dtypes.
    """
    def to_complex(t: torch.Tensor) -> torch.Tensor:
        # pair up the last dim and reinterpret as complex numbers
        return torch.view_as_complex(t.float().reshape(*t.shape[:-1], -1, 2))

    q_c = to_complex(xq)
    k_c = to_complex(xk)
    # reshape once against the query layout; keys broadcast the same way
    freqs_cis = reshape_for_broadcast(freqs_cis, q_c)
    q_rot = torch.view_as_real(q_c * freqs_cis).flatten(3)
    k_rot = torch.view_as_real(k_c * freqs_cis).flatten(3)
    return q_rot.type_as(xq), k_rot.type_as(xk)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
    # unpack to validate the expected (bs, seqlen, n_kv_heads, head_dim) layout
    bs, slen, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    # each KV head is duplicated n_rep times consecutively along the head axis
    return torch.repeat_interleave(x, repeats=n_rep, dim=2)
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
class Attention(nn.Module):
    """Multi-head attention module."""
    def __init__(self, args: ModelArgs):
        """
        Initialize the Attention module.

        Args:
            args (ModelArgs): Model configuration parameters.

        Attributes:
            n_kv_heads (int): Number of key and value heads.
            n_local_heads (int): Number of local query heads.
            n_local_kv_heads (int): Number of local key and value heads.
            n_rep (int): Number of repetitions for local heads.
            head_dim (int): Dimension size of each attention head.
            wq (ColumnParallelLinear): Linear transformation for queries.
            wk (ColumnParallelLinear): Linear transformation for keys.
            wv (ColumnParallelLinear): Linear transformation for values.
            wo (RowParallelLinear): Linear transformation for output.
            cache_k (torch.Tensor): Cached keys for attention.
            cache_v (torch.Tensor): Cached values for attention.

        """
        super().__init__()
        # grouped-query attention: fall back to one KV head per query head
        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
        model_parallel_size = fs_init.get_model_parallel_world_size()
        # heads are sharded evenly across model-parallel ranks
        self.n_local_heads = args.n_heads // model_parallel_size
        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
        # how many query heads share each KV head
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = args.dim // args.n_heads

        self.wq = ColumnParallelLinear(
            args.dim,
            args.n_heads * self.head_dim,
            bias=False,
            gather_output=False,
            init_method=lambda x: x,
        )
        self.wk = ColumnParallelLinear(
            args.dim,
            self.n_kv_heads * self.head_dim,
            bias=False,
            gather_output=False,
            init_method=lambda x: x,
        )
        self.wv = ColumnParallelLinear(
            args.dim,
            self.n_kv_heads * self.head_dim,
            bias=False,
            gather_output=False,
            init_method=lambda x: x,
        )
        self.wo = RowParallelLinear(
            args.n_heads * self.head_dim,
            args.dim,
            bias=False,
            input_is_parallel=True,
            init_method=lambda x: x,
        )

        # NOTE(review): the KV caches are plain CUDA tensors assigned as
        # attributes (not registered buffers), so they are excluded from
        # state_dict; construction assumes a GPU is available.
        self.cache_k = torch.zeros(
            (
                args.max_batch_size,
                args.max_seq_len,
                self.n_local_kv_heads,
                self.head_dim,
            )
        ).cuda()
        self.cache_v = torch.zeros(
            (
                args.max_batch_size,
                args.max_seq_len,
                self.n_local_kv_heads,
                self.head_dim,
            )
        ).cuda()

    def forward(
        self,
        x: torch.Tensor,
        start_pos: int,
        freqs_cis: torch.Tensor,
        mask: Optional[torch.Tensor],
    ):
        """
        Forward pass of the attention module.

        Args:
            x (torch.Tensor): Input tensor.
            start_pos (int): Starting position for caching.
            freqs_cis (torch.Tensor): Precomputed frequency tensor.
            mask (torch.Tensor, optional): Attention mask tensor.

        Returns:
            torch.Tensor: Output tensor after attention.

        """
        bsz, seqlen, _ = x.shape
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

        # rotary position embedding applied to queries and keys only
        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

        # keep caches on the same device/dtype as the activations
        self.cache_k = self.cache_k.to(xq)
        self.cache_v = self.cache_v.to(xq)

        # write the new positions into the cache, then read back the full prefix
        self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk
        self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv

        keys = self.cache_k[:bsz, : start_pos + seqlen]
        values = self.cache_v[:bsz, : start_pos + seqlen]

        # repeat k/v heads if n_kv_heads < n_heads
        keys = repeat_kv(keys, self.n_rep)  # (bs, cache_len + seqlen, n_local_heads, head_dim)
        values = repeat_kv(values, self.n_rep)  # (bs, cache_len + seqlen, n_local_heads, head_dim)

        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
        keys = keys.transpose(1, 2)  # (bs, n_local_heads, cache_len + seqlen, head_dim)
        values = values.transpose(1, 2)  # (bs, n_local_heads, cache_len + seqlen, head_dim)
        scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores + mask  # (bs, n_local_heads, seqlen, cache_len + seqlen)
        # softmax in float32 for numerical stability, then cast back
        scores = F.softmax(scores.float(), dim=-1).type_as(xq)
        output = torch.matmul(scores, values)  # (bs, n_local_heads, seqlen, head_dim)
        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
        return self.wo(output)
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
class FeedForward(nn.Module):
    """SwiGLU feed-forward block: w2(silu(w1(x)) * w3(x)) with parallel linears."""

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        multiple_of: int,
        ffn_dim_multiplier: Optional[float],
    ):
        """
        Args:
            dim (int): Input/output dimension.
            hidden_dim (int): Nominal hidden dimension before SwiGLU scaling.
            multiple_of (int): The effective hidden size is rounded up to a
                multiple of this value.
            ffn_dim_multiplier (float, optional): Extra scaling factor applied
                to the hidden size. Defaults to None (no scaling).
        """
        super().__init__()
        # SwiGLU uses 2/3 of the nominal hidden size
        width = int(2 * hidden_dim / 3)
        # custom dim factor multiplier
        if ffn_dim_multiplier is not None:
            width = int(ffn_dim_multiplier * width)
        # round up to the next multiple of `multiple_of`
        width = multiple_of * ((width + multiple_of - 1) // multiple_of)

        self.w1 = ColumnParallelLinear(
            dim, width, bias=False, gather_output=False, init_method=lambda x: x
        )
        self.w2 = RowParallelLinear(
            width, dim, bias=False, input_is_parallel=True, init_method=lambda x: x
        )
        self.w3 = ColumnParallelLinear(
            dim, width, bias=False, gather_output=False, init_method=lambda x: x
        )

    def forward(self, x):
        """Apply the gated SwiGLU transformation."""
        gate = F.silu(self.w1(x))
        return self.w2(gate * self.w3(x))
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: attention and SwiGLU FFN, each with a residual."""

    def __init__(self, layer_id: int, args: ModelArgs):
        """
        Args:
            layer_id (int): Identifier for the layer.
            args (ModelArgs): Model configuration parameters.
        """
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=4 * args.dim,
            multiple_of=args.multiple_of,
            ffn_dim_multiplier=args.ffn_dim_multiplier,
        )
        self.layer_id = layer_id
        # separate pre-norms for the attention and FFN sub-layers
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(
        self,
        x: torch.Tensor,
        start_pos: int,
        freqs_cis: torch.Tensor,
        mask: Optional[torch.Tensor],
    ):
        """
        Run one transformer layer.

        Args:
            x (torch.Tensor): Input tensor.
            start_pos (int): Starting position for attention caching.
            freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
            mask (torch.Tensor, optional): Attention mask. Defaults to None.

        Returns:
            torch.Tensor: Layer output with both residual connections applied.
        """
        attn_out = self.attention(self.attention_norm(x), start_pos, freqs_cis, mask)
        h = x + attn_out
        return h + self.feed_forward(self.ffn_norm(h))
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
class Transformer(nn.Module):
    def __init__(self, params: ModelArgs):
        """
        Initialize a Transformer model.

        Args:
            params (ModelArgs): Model configuration parameters.

        Attributes:
            params (ModelArgs): Model configuration parameters.
            vocab_size (int): Vocabulary size.
            n_layers (int): Number of layers in the model.
            tok_embeddings (ParallelEmbedding): Token embeddings.
            layers (torch.nn.ModuleList): List of Transformer blocks.
            norm (RMSNorm): Layer normalization for the model output.
            output (ColumnParallelLinear): Linear layer for final output.
            freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.

        """
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = ParallelEmbedding(
            params.vocab_size, params.dim, init_method=lambda x: x
        )

        self.layers = torch.nn.ModuleList()
        for layer_id in range(params.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))

        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        self.output = ColumnParallelLinear(
            params.dim, params.vocab_size, bias=False, init_method=lambda x: x
        )

        # plain tensor attribute (not a buffer): moved to the right device
        # lazily in forward
        self.freqs_cis = precompute_freqs_cis(
            # Note that self.params.max_seq_len is multiplied by 2 because the token limit for the Llama 2 generation of models is 4096.
            # Adding this multiplier instead of using 4096 directly allows for dynamism of token lengths while training or fine-tuning.
            self.params.dim // self.params.n_heads, self.params.max_seq_len * 2
        )

    # inference-only: no autograd state is recorded inside this forward
    @torch.inference_mode()
    def forward(self, tokens: torch.Tensor, start_pos: int):
        """
        Perform a forward pass through the Transformer model.

        Args:
            tokens (torch.Tensor): Input token indices.
            start_pos (int): Starting position for attention caching.

        Returns:
            torch.Tensor: Output logits after applying the Transformer model.

        """
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        self.freqs_cis = self.freqs_cis.to(h.device)
        # slice the frequencies for exactly the positions being decoded
        freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]

        # single-token decoding (seqlen == 1) needs no causal mask
        mask = None
        if seqlen > 1:
            mask = torch.full(
                (seqlen, seqlen), float("-inf"), device=tokens.device
            )

            mask = torch.triu(mask, diagonal=1)

            # When performing key-value caching, we compute the attention scores
            # only for the new sequence. Thus, the matrix of scores is of size
            # (seqlen, cache_len + seqlen), and the only masked entries are (i, j) for
            # j > cache_len + i, since row i corresponds to token cache_len + i.
            mask = torch.hstack([
                torch.zeros((seqlen, start_pos), device=tokens.device),
                mask
            ]).type_as(h)

        for layer in self.layers:
            h = layer(h, start_pos, freqs_cis, mask)
        h = self.norm(h)
        # project to vocabulary logits in float32
        output = self.output(h).float()
        return output
|
LDMAE/models/swiglu_ffn.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import torch
|
| 8 |
+
from typing import Callable, Optional
|
| 9 |
+
import warnings
|
| 10 |
+
|
| 11 |
+
from torch import Tensor, nn
|
| 12 |
+
import torch.nn.functional as F
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class SwiGLUFFN(nn.Module):
    """Feed-forward block with a SwiGLU gate.

    A single linear layer (``w12``) produces both the gate and the value
    halves in one matmul; the output is ``w3(silu(gate) * value)``.
    ``act_layer`` and ``drop`` are accepted for interface compatibility
    but are unused.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        # Fall back to the input width when a size is omitted (or falsy).
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        # Gate and value projections fused into one 2x-wide linear layer.
        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

    @torch.compile
    def forward(self, x: Tensor) -> Tensor:
        gate, value = self.w12(x).chunk(2, dim=-1)
        return self.w3(F.silu(gate) * value)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Allow opting out of xFormers via the XFORMERS_DISABLED env var; fall back
# to the pure-PyTorch SwiGLUFFN when xFormers is disabled or not installed.
XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if not XFORMERS_ENABLED:
        raise ImportError
    from xformers.ops import SwiGLU

    XFORMERS_AVAILABLE = True
except ImportError:
    SwiGLU = SwiGLUFFN
    XFORMERS_AVAILABLE = False
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class SwiGLUFFNFused(SwiGLU):
    """SwiGLU FFN whose hidden width is rescaled by 2/3 and rounded up to a
    multiple of 8 before delegating to ``SwiGLU`` (xFormers op when available,
    otherwise the pure-PyTorch fallback). ``act_layer`` and ``drop`` are
    accepted for interface compatibility but unused.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        # 2/3 rescaling — presumably to keep the parameter count comparable
        # to a plain 2-matmul MLP (SwiGLU uses three projections) — then
        # round up to the next multiple of 8 for alignment.
        scaled = int(hidden_features * 2 / 3)
        hidden_features = -(-scaled // 8) * 8
        super().__init__(
            in_features=in_features,
            hidden_features=hidden_features,
            out_features=out_features,
            bias=bias,
        )
|
LDMAE/pretrain_weight/aef8d16.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97ad3672641653fcd74106cd050dc8f5042089b8edc06e30cbcde642be239aa6
|
| 3 |
+
size 1006144522
|
LDMAE/pretrain_weight/daef8d16.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cbf39522a2f602df6f271b8b0ea0a73a7c23687fd54f98c5b60ef85289b15168
|
| 3 |
+
size 1006144522
|
LDMAE/pretrain_weight/sdv3f8d16.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b367d60d708cb371261c005a44bd68f8d17dd211f8c771fb2b3802e51df2f8c
|
| 3 |
+
size 1098238157
|
LDMAE/pretrain_weight/vaef8d16.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f0e12f87137cdb19bd8f461dc1c8d7c572628d79e03f070ebdcc7a802e610c6
|
| 3 |
+
size 1006144522
|
LDMAE/pretrain_weight/vmaef8d16.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:441e7360993a03978e729dafc77432a372f066bf46a3c9610c7c33c4a0f09fc1
|
| 3 |
+
size 147225897
|
LDMAE/requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.0.0
|
| 2 |
+
torchvision
|
| 3 |
+
accelerate
|
| 4 |
+
transformers
|
| 5 |
+
Pillow
|
| 6 |
+
numpy
|
| 7 |
+
scipy
|
| 8 |
+
tqdm
|
| 9 |
+
matplotlib
|
| 10 |
+
tensorboard
|
| 11 |
+
omegaconf
|
| 12 |
+
einops
|
| 13 |
+
timm
|
| 14 |
+
opencv-python
|
| 15 |
+
scikit-learn
|
| 16 |
+
lpips
|
LDMAE/run_extract_feature.sh
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Distributed latent-feature extraction via Hugging Face Accelerate.
# Usage: bash run_extract_feature.sh <path/to/config.yaml>
CONFIG_PATH=$1

# Distributed topology; every value can be overridden from the environment.
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
# NOTE(review): env WORLD_SIZE is read here as the *node* count, then
# redefined below as the total process count — confirm the cluster launcher
# exports WORLD_SIZE with that meaning.
NNODES=${WORLD_SIZE:-1}
NODE_RANK=${RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
MASTER_PORT=${MASTER_PORT:-1235}
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
PRECISION=${PRECISION:-bf16}

echo $CONFIG_PATH

# One process per GPU across all nodes; the config path is forwarded to
# extract_features.py.
accelerate launch \
    --config-file configs/accelerator/8gpu.yaml \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    --machine_rank $NODE_RANK \
    --num_processes $(($GPUS_PER_NODE*$NNODES)) \
    --num_machines $NNODES \
    --mixed_precision $PRECISION \
    extract_features.py \
    --config $CONFIG_PATH \
|
LDMAE/run_fast_inference.sh
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Single-GPU "fast" inference via Hugging Face Accelerate.
# Usage: bash run_fast_inference.sh <path/to/config.yaml>
CONFIG_PATH=$1

# Fixed single-node, single-GPU topology (no accelerator config file used).
GPUS_PER_NODE=1
NNODES=1
NODE_RANK=0
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
MASTER_PORT=${MASTER_PORT:-1236}
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
PRECISION=${PRECISION:-bf16}

# --demo presumably switches inference.py into a lightweight demo mode —
# confirm against inference.py's argument parser.
accelerate launch \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    --machine_rank $NODE_RANK \
    --num_processes $(($GPUS_PER_NODE*$NNODES)) \
    --num_machines $NNODES \
    --mixed_precision $PRECISION \
    inference.py \
    --config $CONFIG_PATH \
    --demo
|
LDMAE/run_inference.sh
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multi-GPU inference via Hugging Face Accelerate.
# Usage: bash run_inference.sh <path/to/config.yaml>
CONFIG_PATH=$1

# Distributed topology; every value can be overridden from the environment.
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
# NOTE(review): env WORLD_SIZE is read as the node count, then redefined
# below as the total process count — confirm launcher semantics.
NNODES=${WORLD_SIZE:-1}
NODE_RANK=${RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
MASTER_PORT=${MASTER_PORT:-1237}
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
PRECISION=${PRECISION:-bf16}

# One process per GPU across all nodes; inference.py receives the config path.
accelerate launch \
    --config-file configs/accelerator/8gpu.yaml \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    --machine_rank $NODE_RANK \
    --num_processes $(($GPUS_PER_NODE*$NNODES)) \
    --num_machines $NNODES \
    --mixed_precision $PRECISION \
    inference.py \
    --config $CONFIG_PATH
|
LDMAE/run_robustness_test.sh
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Robustness evaluation for the VMAE tokenizer: one clean reconstruction run,
# then reconstructions under increasing input perturbation strength (epsilon).
# Every run launches evaluate_tokenizer.py through Hugging Face Accelerate
# with the same distributed settings.

# Distributed topology; every value can be overridden from the environment.
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
# NOTE(review): env WORLD_SIZE is read as the node count, then redefined
# below as the total process count — confirm launcher semantics.
NNODES=${WORLD_SIZE:-1}
NODE_RANK=${RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
MASTER_PORT=${MASTER_PORT:-1241}
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
PRECISION=${PRECISION:-bf16}

CONFIG=configs/imagenet/lightningdit_b_vmae_f8d16_cfg.yaml

# Launch evaluate_tokenizer.py with the shared distributed settings,
# forwarding any extra arguments (e.g. --epsilon 0.1).
run_eval() {
    accelerate launch \
        --config-file configs/accelerator/8gpu.yaml \
        --main_process_ip $MASTER_ADDR \
        --main_process_port $MASTER_PORT \
        --machine_rank $NODE_RANK \
        --num_processes $(($GPUS_PER_NODE*$NNODES)) \
        --num_machines $NNODES \
        --mixed_precision $PRECISION \
        evaluate_tokenizer.py \
        --config $CONFIG \
        "$@"
}

# Clean VMAE reconstruction baseline.
run_eval --robust_exp True

# Perturbed reconstructions at increasing epsilon.
# BUG FIX: the epsilon=0.2 run previously invoked evaluate_tokenizer_mae.py,
# which does not exist in this repository; all runs now use
# evaluate_tokenizer.py like the other epsilon values.
for EPS in 0.01 0.05 0.1 0.2 0.3; do
    run_eval --epsilon $EPS
done
|
LDMAE/run_train.sh
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Distributed training via Hugging Face Accelerate (the script name
# train_accum.py suggests gradient accumulation — confirm in that file).
# Usage: bash run_train.sh <path/to/config.yaml>
CONFIG_PATH=$1

# Distributed topology; every value can be overridden from the environment.
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
# NOTE(review): env WORLD_SIZE is read as the node count, then redefined
# below as the total process count — confirm launcher semantics.
NNODES=${WORLD_SIZE:-1}
NODE_RANK=${RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
MASTER_PORT=${MASTER_PORT:-1235}
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
PRECISION=${PRECISION:-bf16}

echo $CONFIG_PATH

# One process per GPU across all nodes; the config path is forwarded to
# train_accum.py.
accelerate launch \
    --config-file configs/accelerator/8gpu.yaml \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    --machine_rank $NODE_RANK \
    --num_processes $(($GPUS_PER_NODE*$NNODES)) \
    --num_machines $NNODES \
    --mixed_precision $PRECISION \
    train_accum.py \
    --config $CONFIG_PATH
|
LDMAE/tokenizer/__init__.py
ADDED
|
File without changes
|
LDMAE/tokenizer/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (174 Bytes). View file
|
|
|
LDMAE/tokenizer/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (146 Bytes). View file
|
|
|
LDMAE/tokenizer/__pycache__/autoencoder.cpython-310.pyc
ADDED
|
Binary file (12.5 kB). View file
|
|
|
LDMAE/tokenizer/__pycache__/models_mae.cpython-310.pyc
ADDED
|
Binary file (28.8 kB). View file
|
|
|
LDMAE/tokenizer/__pycache__/sdvae.cpython-310.pyc
ADDED
|
Binary file (3.5 kB). View file
|
|
|
LDMAE/tokenizer/__pycache__/vavae.cpython-310.pyc
ADDED
|
Binary file (4.48 kB). View file
|
|
|
LDMAE/tokenizer/__pycache__/vavae.cpython-38.pyc
ADDED
|
Binary file (4.35 kB). View file
|
|
|