Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +1 -0
- GVP/Baseline/DeltaFM/assets/deltafm.png +3 -0
- GVP/Baseline/DeltaFM/interference_vectors/imnet256_interference_vector.pt +3 -0
- GVP/Baseline/W_No.log +0 -0
- GVP/Baseline/classify_image_graph_def.pb +3 -0
- GVP/Baseline/compare_samples.sh +21 -0
- GVP/Baseline/compare_sampling.log +0 -0
- GVP/Baseline/download.py +41 -0
- GVP/Baseline/environment.yml +16 -0
- GVP/Baseline/evaluate_samples.sh +65 -0
- GVP/Baseline/evaluator.py +690 -0
- GVP/Baseline/gvp_sampling.log +51 -0
- GVP/Baseline/models.py +647 -0
- GVP/Baseline/nohup.out +180 -0
- GVP/Baseline/pic_npz.py +168 -0
- GVP/Baseline/run.sh +15 -0
- GVP/Baseline/sample_compare_ddp_rectified.py +274 -0
- GVP/Baseline/sample_ddp.py +233 -0
- GVP/Baseline/sample_rectified_noise.py +380 -0
- GVP/Baseline/samples.sh +16 -0
- GVP/Baseline/samples_ddp.sh +14 -0
- GVP/Baseline/transport/__pycache__/ot_plan.cpython-311.pyc +0 -0
- GVP/Baseline/transport/__pycache__/path.cpython-310.pyc +0 -0
- GVP/Baseline/transport/__pycache__/path.cpython-311.pyc +0 -0
- GVP/Baseline/transport/__pycache__/path.cpython-312.pyc +0 -0
- GVP/Baseline/transport/__pycache__/path.cpython-38.pyc +0 -0
- GVP/Baseline/transport/__pycache__/transport.cpython-310.pyc +0 -0
- GVP/Baseline/transport/__pycache__/transport.cpython-311.pyc +0 -0
- GVP/Baseline/transport/__pycache__/transport.cpython-312.pyc +0 -0
- GVP/Baseline/transport/__pycache__/transport.cpython-38.pyc +0 -0
- GVP/Baseline/transport/__pycache__/utils.cpython-310.pyc +0 -0
- GVP/Baseline/transport/__pycache__/utils.cpython-311.pyc +0 -0
- GVP/Baseline/transport/__pycache__/utils.cpython-312.pyc +0 -0
- GVP/Baseline/transport/__pycache__/utils.cpython-38.pyc +0 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0020000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0040000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0060000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0080000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0100000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0120000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0140000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0160000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0180000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0200000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0220000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0240000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0260000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0280000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0300000.pt +3 -0
- VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0320000.pt +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
GVP/Baseline/DeltaFM/assets/deltafm.png filter=lfs diff=lfs merge=lfs -text
|
GVP/Baseline/DeltaFM/assets/deltafm.png
ADDED
|
Git LFS Details
|
GVP/Baseline/DeltaFM/interference_vectors/imnet256_interference_vector.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2f34c2a05a7abe7c22e32d6cc06e65f8435e9271f7b4aa0cc7044dce1b7727b
|
| 3 |
+
size 17629
|
GVP/Baseline/W_No.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
GVP/Baseline/classify_image_graph_def.pb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:009d6814d1bc560d4e7b236e170e9b2d5ca6f4b57bd8037f6db05776204415c6
|
| 3 |
+
size 95673916
|
GVP/Baseline/compare_samples.sh
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
|
| 3 |
+
CUDA_VISIBLE_DEVICES=0,1,2,3 nohup torchrun \
|
| 4 |
+
--nnodes=1 \
|
| 5 |
+
--nproc_per_node=4 \
|
| 6 |
+
--rdzv_endpoint=localhost:29166 \
|
| 7 |
+
sample_compare_ddp_rectified.py SDE \
|
| 8 |
+
--model SiT-XL/2 \
|
| 9 |
+
--sample-dir compare_samples \
|
| 10 |
+
--num-fid-samples 50000 \
|
| 11 |
+
--num-classes 1000 \
|
| 12 |
+
--global-seed 1 \
|
| 13 |
+
--cfg-scale 1.0 \
|
| 14 |
+
--num-sampling-steps 250 \
|
| 15 |
+
--depth 6 \
|
| 16 |
+
--use-sitf2 True \
|
| 17 |
+
--sitf2-threshold 0.5 \
|
| 18 |
+
--ckpt /gemini/space/gzy_new/models/xiangzai_Back/GVP_check/base.pt \
|
| 19 |
+
--sitf2-ckpt /gemini/space/gzy_new/models/Baseline/results_256_gvp_disp/depth-mu-6-007-SiT-XL-2-GVP-velocity-None-OT-Contrastive0.05/checkpoints/0300000.pt \
|
| 20 |
+
> compare_sampling.log 2>&1 &
|
| 21 |
+
|
GVP/Baseline/compare_sampling.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
GVP/Baseline/download.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This source code is licensed under the license found in the
|
| 2 |
+
# LICENSE file in the root directory of this source tree.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Functions for downloading pre-trained SiT models
|
| 6 |
+
"""
|
| 7 |
+
from torchvision.datasets.utils import download_url
|
| 8 |
+
import torch
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
pretrained_models = {'SiT-XL-2-256x256.pt'}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def find_model(model_name):
|
| 16 |
+
"""
|
| 17 |
+
Finds a pre-trained SiT model, downloading it if necessary. Alternatively, loads a model from a local path.
|
| 18 |
+
"""
|
| 19 |
+
if model_name in pretrained_models:
|
| 20 |
+
return download_model(model_name)
|
| 21 |
+
else:
|
| 22 |
+
assert os.path.isfile(model_name), f'Could not find SiT checkpoint at {model_name}'
|
| 23 |
+
checkpoint = torch.load(model_name, map_location=lambda storage, loc: storage, weights_only=False)
|
| 24 |
+
if "ema" in checkpoint: # supports checkpoints from train.py
|
| 25 |
+
checkpoint = checkpoint["ema"]
|
| 26 |
+
return checkpoint
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def download_model(model_name):
|
| 30 |
+
"""
|
| 31 |
+
Downloads a pre-trained SiT model from the web.
|
| 32 |
+
"""
|
| 33 |
+
assert model_name in pretrained_models
|
| 34 |
+
local_path = f'pretrained_models/{model_name}'
|
| 35 |
+
if not os.path.isfile(local_path):
|
| 36 |
+
os.makedirs('pretrained_models', exist_ok=True)
|
| 37 |
+
web_path = f'https://www.dl.dropboxusercontent.com/scl/fi/as9oeomcbub47de5g4be0/SiT-XL-2-256.pt?rlkey=uxzxmpicu46coq3msb17b9ofa&dl=0'
|
| 38 |
+
download_url(web_path, 'pretrained_models', filename=model_name)
|
| 39 |
+
model = torch.load(local_path, map_location=lambda storage, loc: storage, weights_only=False)
|
| 40 |
+
return model
|
| 41 |
+
|
GVP/Baseline/environment.yml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: RN
|
| 2 |
+
channels:
|
| 3 |
+
- pytorch
|
| 4 |
+
- nvidia
|
| 5 |
+
dependencies:
|
| 6 |
+
- python >= 3.8
|
| 7 |
+
- pytorch >= 1.13
|
| 8 |
+
- torchvision
|
| 9 |
+
- pytorch-cuda >=11.7
|
| 10 |
+
- pip
|
| 11 |
+
- pip:
|
| 12 |
+
- timm
|
| 13 |
+
- diffusers
|
| 14 |
+
- accelerate
|
| 15 |
+
- torchdiffeq
|
| 16 |
+
- wandb
|
GVP/Baseline/evaluate_samples.sh
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# Execute all evaluation tasks in parallel
|
| 4 |
+
# Each command runs in the background using &
|
| 5 |
+
|
| 6 |
+
echo "Starting all evaluation tasks in parallel..."
|
| 7 |
+
|
| 8 |
+
# Reference batch path
|
| 9 |
+
REF_BATCH="/gemini/space/zhaozy/zhy/dataset/VIRTUAL_imagenet256_labeled.npz"
|
| 10 |
+
|
| 11 |
+
# Base directory for sample files
|
| 12 |
+
SAMPLE_DIR="/gemini/space/zhaozy/zhy/gzy_new/Noise_Matching/Rectified-Noise/last_samples_depth_2_gvp_0.5"
|
| 13 |
+
|
| 14 |
+
# Change to the project root directory
|
| 15 |
+
cd /gemini/space/zhaozy/zhy/gzy_new/Noise_Matching
|
| 16 |
+
|
| 17 |
+
# Evaluate threshold 0.0 on GPU 0
|
| 18 |
+
CUDA_VISIBLE_DEVICES=0 nohup python evaluator.py \
|
| 19 |
+
--ref_batch ${REF_BATCH} \
|
| 20 |
+
--sample_batch ${SAMPLE_DIR}/depth-mu-2-threshold-0.0-0550000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04.npz \
|
| 21 |
+
> eval_threshold_0.0.log 2>&1 &
|
| 22 |
+
|
| 23 |
+
# Evaluate threshold 0.15 on GPU 1
|
| 24 |
+
CUDA_VISIBLE_DEVICES=1 nohup python evaluator.py \
|
| 25 |
+
--ref_batch ${REF_BATCH} \
|
| 26 |
+
--sample_batch ${SAMPLE_DIR}/depth-mu-2-threshold-0.15-0550000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04.npz \
|
| 27 |
+
> eval_threshold_0.15.log 2>&1 &
|
| 28 |
+
|
| 29 |
+
# Evaluate threshold 0.25 on GPU 2
|
| 30 |
+
CUDA_VISIBLE_DEVICES=2 nohup python evaluator.py \
|
| 31 |
+
--ref_batch ${REF_BATCH} \
|
| 32 |
+
--sample_batch ${SAMPLE_DIR}/depth-mu-2-threshold-0.25-0550000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04.npz \
|
| 33 |
+
> eval_threshold_0.25.log 2>&1 &
|
| 34 |
+
|
| 35 |
+
# Evaluate threshold 0.5 on GPU 3
|
| 36 |
+
CUDA_VISIBLE_DEVICES=3 nohup python evaluator.py \
|
| 37 |
+
--ref_batch ${REF_BATCH} \
|
| 38 |
+
--sample_batch ${SAMPLE_DIR}/depth-mu-2-threshold-0.5-0550000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04.npz \
|
| 39 |
+
> eval_threshold_0.5.log 2>&1 &
|
| 40 |
+
|
| 41 |
+
# Evaluate threshold 0.75 on GPU 4
|
| 42 |
+
CUDA_VISIBLE_DEVICES=0 nohup python evaluator.py \
|
| 43 |
+
--ref_batch ${REF_BATCH} \
|
| 44 |
+
--sample_batch ${SAMPLE_DIR}/depth-mu-2-threshold-0.75-0550000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04.npz \
|
| 45 |
+
> eval_threshold_0.75.log 2>&1 &
|
| 46 |
+
|
| 47 |
+
# Evaluate threshold 1.0 on GPU 5
|
| 48 |
+
CUDA_VISIBLE_DEVICES=1 nohup python evaluator.py \
|
| 49 |
+
--ref_batch ${REF_BATCH} \
|
| 50 |
+
--sample_batch ${SAMPLE_DIR}/depth-mu-2-threshold-1.0-0550000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04.npz \
|
| 51 |
+
> eval_threshold_1.0.log 2>&1 &
|
| 52 |
+
|
| 53 |
+
# Wait for all background jobs to complete
|
| 54 |
+
echo "All evaluation tasks started. Waiting for completion..."
|
| 55 |
+
wait
|
| 56 |
+
|
| 57 |
+
echo "All evaluation tasks completed!"
|
| 58 |
+
echo ""
|
| 59 |
+
echo "Results saved in:"
|
| 60 |
+
echo " - eval_threshold_0.0.log"
|
| 61 |
+
echo " - eval_threshold_0.15.log"
|
| 62 |
+
echo " - eval_threshold_0.25.log"
|
| 63 |
+
echo " - eval_threshold_0.5.log"
|
| 64 |
+
echo " - eval_threshold_0.75.log"
|
| 65 |
+
echo " - eval_threshold_1.0.log"
|
GVP/Baseline/evaluator.py
ADDED
|
@@ -0,0 +1,690 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import io
|
| 3 |
+
import os
|
| 4 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
| 5 |
+
import random
|
| 6 |
+
import warnings
|
| 7 |
+
import zipfile
|
| 8 |
+
from abc import ABC, abstractmethod
|
| 9 |
+
from contextlib import contextmanager
|
| 10 |
+
from functools import partial
|
| 11 |
+
from multiprocessing import cpu_count
|
| 12 |
+
from multiprocessing.pool import ThreadPool
|
| 13 |
+
from typing import Iterable, Optional, Tuple, Union
|
| 14 |
+
|
| 15 |
+
import numpy as np
|
| 16 |
+
import requests
|
| 17 |
+
import tensorflow.compat.v1 as tf
|
| 18 |
+
from scipy import linalg
|
| 19 |
+
from tqdm.auto import tqdm
|
| 20 |
+
from datetime import timedelta
|
| 21 |
+
import torch
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
INCEPTION_V3_URL = "https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/classify_image_graph_def.pb"
|
| 26 |
+
INCEPTION_V3_PATH = "classify_image_graph_def.pb"
|
| 27 |
+
|
| 28 |
+
FID_POOL_NAME = "pool_3:0"
|
| 29 |
+
FID_SPATIAL_NAME = "mixed_6/conv:0"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def main():
|
| 33 |
+
parser = argparse.ArgumentParser()
|
| 34 |
+
parser.add_argument("--ref_batch", default='/gemini/space/gzy_new/models/reference/VIRTUAL_imagenet256_labeled.npz',help="path to reference batch npz file")
|
| 35 |
+
parser.add_argument("--sample_batch", default='/gemini/space/gzy_new/models/Baseline/GVP_samples/depth-mu-6-0300000-base-cfg-1.0-12-SDE-250-Euler-sigma-Mean-0.04.npz', help="path to sample batch npz file")
|
| 36 |
+
args = parser.parse_args()
|
| 37 |
+
|
| 38 |
+
config = tf.ConfigProto(
|
| 39 |
+
allow_soft_placement=True # allows DecodeJpeg to run on CPU in Inception graph
|
| 40 |
+
)
|
| 41 |
+
config.gpu_options.allow_growth = True
|
| 42 |
+
evaluator = Evaluator(tf.Session(config=config))
|
| 43 |
+
|
| 44 |
+
print("warming up TensorFlow...")
|
| 45 |
+
# This will cause TF to print a bunch of verbose stuff now rather
|
| 46 |
+
# than after the next print(), to help prevent confusion.
|
| 47 |
+
evaluator.warmup()
|
| 48 |
+
|
| 49 |
+
print("computing reference batch activations...")
|
| 50 |
+
ref_acts = evaluator.read_activations(args.ref_batch)
|
| 51 |
+
print("computing/reading reference batch statistics...")
|
| 52 |
+
ref_stats, ref_stats_spatial = evaluator.read_statistics(args.ref_batch, ref_acts)
|
| 53 |
+
|
| 54 |
+
print("computing sample batch activations...")
|
| 55 |
+
sample_acts = evaluator.read_activations(args.sample_batch)
|
| 56 |
+
print("computing/reading sample batch statistics...")
|
| 57 |
+
sample_stats, sample_stats_spatial = evaluator.read_statistics(args.sample_batch, sample_acts)
|
| 58 |
+
|
| 59 |
+
print("Computing evaluations...")
|
| 60 |
+
print("Inception Score:", evaluator.compute_inception_score(sample_acts[0]))
|
| 61 |
+
print("FID:", sample_stats.frechet_distance(ref_stats))
|
| 62 |
+
print("sFID:", sample_stats_spatial.frechet_distance(ref_stats_spatial))
|
| 63 |
+
prec, recall = evaluator.compute_prec_recall(ref_acts[0], sample_acts[0])
|
| 64 |
+
print("Precision:", prec)
|
| 65 |
+
print("Recall:", recall)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class InvalidFIDException(Exception):
|
| 69 |
+
pass
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class FIDStatistics:
|
| 73 |
+
def __init__(self, mu: np.ndarray, sigma: np.ndarray):
|
| 74 |
+
self.mu = mu
|
| 75 |
+
self.sigma = sigma
|
| 76 |
+
|
| 77 |
+
def frechet_distance(self, other, eps=1e-6):
|
| 78 |
+
"""
|
| 79 |
+
Compute the Frechet distance between two sets of statistics.
|
| 80 |
+
"""
|
| 81 |
+
# https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L132
|
| 82 |
+
mu1, sigma1 = self.mu, self.sigma
|
| 83 |
+
mu2, sigma2 = other.mu, other.sigma
|
| 84 |
+
|
| 85 |
+
mu1 = np.atleast_1d(mu1)
|
| 86 |
+
mu2 = np.atleast_1d(mu2)
|
| 87 |
+
|
| 88 |
+
sigma1 = np.atleast_2d(sigma1)
|
| 89 |
+
sigma2 = np.atleast_2d(sigma2)
|
| 90 |
+
|
| 91 |
+
assert (
|
| 92 |
+
mu1.shape == mu2.shape
|
| 93 |
+
), f"Training and test mean vectors have different lengths: {mu1.shape}, {mu2.shape}"
|
| 94 |
+
assert (
|
| 95 |
+
sigma1.shape == sigma2.shape
|
| 96 |
+
), f"Training and test covariances have different dimensions: {sigma1.shape}, {sigma2.shape}"
|
| 97 |
+
|
| 98 |
+
diff = mu1 - mu2
|
| 99 |
+
|
| 100 |
+
# product might be almost singular
|
| 101 |
+
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
|
| 102 |
+
if not np.isfinite(covmean).all():
|
| 103 |
+
msg = (
|
| 104 |
+
"fid calculation produces singular product; adding %s to diagonal of cov estimates"
|
| 105 |
+
% eps
|
| 106 |
+
)
|
| 107 |
+
warnings.warn(msg)
|
| 108 |
+
offset = np.eye(sigma1.shape[0]) * eps
|
| 109 |
+
covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
|
| 110 |
+
|
| 111 |
+
# numerical error might give slight imaginary component
|
| 112 |
+
#虚部报错部分
|
| 113 |
+
if np.iscomplexobj(covmean):
|
| 114 |
+
if not np.allclose(np.diagonal(covmean).imag, 0, atol=1):
|
| 115 |
+
m = np.max(np.abs(covmean.imag))
|
| 116 |
+
print(f"Real component: {covmean.real}")
|
| 117 |
+
raise ValueError("Imaginary component {}".format(m))
|
| 118 |
+
covmean = covmean.real
|
| 119 |
+
|
| 120 |
+
tr_covmean = np.trace(covmean)
|
| 121 |
+
|
| 122 |
+
return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
class Evaluator:
|
| 126 |
+
def __init__(
|
| 127 |
+
self,
|
| 128 |
+
session,
|
| 129 |
+
batch_size=64,
|
| 130 |
+
softmax_batch_size=512,
|
| 131 |
+
):
|
| 132 |
+
self.sess = session
|
| 133 |
+
self.batch_size = batch_size
|
| 134 |
+
self.softmax_batch_size = softmax_batch_size
|
| 135 |
+
self.manifold_estimator = ManifoldEstimator(session)
|
| 136 |
+
with self.sess.graph.as_default():
|
| 137 |
+
self.image_input = tf.placeholder(tf.float32, shape=[None, None, None, 3])
|
| 138 |
+
self.softmax_input = tf.placeholder(tf.float32, shape=[None, 2048])
|
| 139 |
+
self.pool_features, self.spatial_features = _create_feature_graph(self.image_input)
|
| 140 |
+
self.softmax = _create_softmax_graph(self.softmax_input)
|
| 141 |
+
|
| 142 |
+
def warmup(self):
|
| 143 |
+
self.compute_activations(np.zeros([1, 8, 64, 64, 3]))
|
| 144 |
+
|
| 145 |
+
def read_activations(self, npz_path: Union[str, np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
|
| 146 |
+
if isinstance(npz_path, str):
|
| 147 |
+
# If npz_path is a string, treat it as a file path and read the .npz file
|
| 148 |
+
with open_npz_array(npz_path, "arr_0") as reader:
|
| 149 |
+
return self.compute_activations(reader.read_batches(self.batch_size))
|
| 150 |
+
elif isinstance(npz_path, np.ndarray):
|
| 151 |
+
# If npz_path is a numpy array, split it into batches manually
|
| 152 |
+
print("--------line 140-----------")
|
| 153 |
+
batches = np.array_split(npz_path, range(self.batch_size, npz_path.shape[0], self.batch_size))
|
| 154 |
+
print("--------line 143-----------")
|
| 155 |
+
return self.compute_activations(batches)
|
| 156 |
+
else:
|
| 157 |
+
raise ValueError("npz_path must be either a file path (str) or a numpy array (np.ndarray)")
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def compute_activations(self, batches: Iterable[np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
|
| 161 |
+
"""
|
| 162 |
+
Compute image features for downstream evals.
|
| 163 |
+
|
| 164 |
+
:param batches: a iterator over NHWC numpy arrays in [0, 255].
|
| 165 |
+
:return: a tuple of numpy arrays of shape [N x X], where X is a feature
|
| 166 |
+
dimension. The tuple is (pool_3, spatial).
|
| 167 |
+
"""
|
| 168 |
+
preds = []
|
| 169 |
+
spatial_preds = []
|
| 170 |
+
for batch in tqdm(batches):
|
| 171 |
+
# print("--------line 164-----------")
|
| 172 |
+
|
| 173 |
+
# # 识别当前进程信息
|
| 174 |
+
# if 'RANK' in os.environ:
|
| 175 |
+
# rank = int(os.environ['RANK'])
|
| 176 |
+
# local_rank = int(os.environ.get('LOCAL_RANK', rank % torch.cuda.device_count()))
|
| 177 |
+
# print(f"Distributed training - Global Rank: {rank}, Local Rank: {local_rank}")
|
| 178 |
+
# print(f"Current GPU device: {torch.cuda.current_device()}" if torch.cuda.is_available() else "No CUDA")
|
| 179 |
+
# else:
|
| 180 |
+
# print("Single process mode")
|
| 181 |
+
|
| 182 |
+
# print(f"Process PID: {os.getpid()}")
|
| 183 |
+
|
| 184 |
+
batch = batch.astype(np.float32)
|
| 185 |
+
pred, spatial_pred = self.sess.run(
|
| 186 |
+
[self.pool_features, self.spatial_features], {self.image_input: batch}
|
| 187 |
+
)
|
| 188 |
+
# print("--------line 169-----------")
|
| 189 |
+
preds.append(pred.reshape([pred.shape[0], -1]))
|
| 190 |
+
spatial_preds.append(spatial_pred.reshape([spatial_pred.shape[0], -1]))
|
| 191 |
+
return (
|
| 192 |
+
np.concatenate(preds, axis=0),
|
| 193 |
+
np.concatenate(spatial_preds, axis=0),
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
def read_statistics(
|
| 197 |
+
self, npz_path: Union[str, np.ndarray], activations: Tuple[np.ndarray, np.ndarray]
|
| 198 |
+
) -> Tuple[FIDStatistics, FIDStatistics]:
|
| 199 |
+
if isinstance(npz_path, str):
|
| 200 |
+
obj = np.load(npz_path)
|
| 201 |
+
if "mu" in list(obj.keys()):
|
| 202 |
+
return FIDStatistics(obj["mu"], obj["sigma"]), FIDStatistics(
|
| 203 |
+
obj["mu_s"], obj["sigma_s"]
|
| 204 |
+
)
|
| 205 |
+
elif isinstance(npz_path, np.ndarray):
|
| 206 |
+
obj = npz_path
|
| 207 |
+
else:
|
| 208 |
+
raise ValueError("npz_path must be either a file path (str) or a numpy array (np.ndarray)")
|
| 209 |
+
return tuple(self.compute_statistics(x) for x in activations)
|
| 210 |
+
|
| 211 |
+
def compute_statistics(self, activations: np.ndarray) -> FIDStatistics:
|
| 212 |
+
mu = np.mean(activations, axis=0)
|
| 213 |
+
sigma = np.cov(activations, rowvar=False)
|
| 214 |
+
return FIDStatistics(mu, sigma)
|
| 215 |
+
|
| 216 |
+
def compute_inception_score(self, activations: np.ndarray, split_size: int = 5000) -> float:
|
| 217 |
+
softmax_out = []
|
| 218 |
+
for i in range(0, len(activations), self.softmax_batch_size):
|
| 219 |
+
acts = activations[i : i + self.softmax_batch_size]
|
| 220 |
+
softmax_out.append(self.sess.run(self.softmax, feed_dict={self.softmax_input: acts}))
|
| 221 |
+
preds = np.concatenate(softmax_out, axis=0)
|
| 222 |
+
# https://github.com/openai/improved-gan/blob/4f5d1ec5c16a7eceb206f42bfc652693601e1d5c/inception_score/model.py#L46
|
| 223 |
+
scores = []
|
| 224 |
+
for i in range(0, len(preds), split_size):
|
| 225 |
+
part = preds[i : i + split_size]
|
| 226 |
+
kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0)))
|
| 227 |
+
kl = np.mean(np.sum(kl, 1))
|
| 228 |
+
scores.append(np.exp(kl))
|
| 229 |
+
return float(np.mean(scores))
|
| 230 |
+
|
| 231 |
+
def compute_prec_recall(
|
| 232 |
+
self, activations_ref: np.ndarray, activations_sample: np.ndarray
|
| 233 |
+
) -> Tuple[float, float]:
|
| 234 |
+
radii_1 = self.manifold_estimator.manifold_radii(activations_ref)
|
| 235 |
+
radii_2 = self.manifold_estimator.manifold_radii(activations_sample)
|
| 236 |
+
pr = self.manifold_estimator.evaluate_pr(
|
| 237 |
+
activations_ref, radii_1, activations_sample, radii_2
|
| 238 |
+
)
|
| 239 |
+
return (float(pr[0][0]), float(pr[1][0]))
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
class ManifoldEstimator:
|
| 243 |
+
"""
|
| 244 |
+
A helper for comparing manifolds of feature vectors.
|
| 245 |
+
|
| 246 |
+
Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L57
|
| 247 |
+
"""
|
| 248 |
+
|
| 249 |
+
def __init__(
|
| 250 |
+
self,
|
| 251 |
+
session,
|
| 252 |
+
row_batch_size=10000,
|
| 253 |
+
col_batch_size=10000,
|
| 254 |
+
nhood_sizes=(3,),
|
| 255 |
+
clamp_to_percentile=None,
|
| 256 |
+
eps=1e-5,
|
| 257 |
+
):
|
| 258 |
+
"""
|
| 259 |
+
Estimate the manifold of given feature vectors.
|
| 260 |
+
|
| 261 |
+
:param session: the TensorFlow session.
|
| 262 |
+
:param row_batch_size: row batch size to compute pairwise distances
|
| 263 |
+
(parameter to trade-off between memory usage and performance).
|
| 264 |
+
:param col_batch_size: column batch size to compute pairwise distances.
|
| 265 |
+
:param nhood_sizes: number of neighbors used to estimate the manifold.
|
| 266 |
+
:param clamp_to_percentile: prune hyperspheres that have radius larger than
|
| 267 |
+
the given percentile.
|
| 268 |
+
:param eps: small number for numerical stability.
|
| 269 |
+
"""
|
| 270 |
+
self.distance_block = DistanceBlock(session)
|
| 271 |
+
self.row_batch_size = row_batch_size
|
| 272 |
+
self.col_batch_size = col_batch_size
|
| 273 |
+
self.nhood_sizes = nhood_sizes
|
| 274 |
+
self.num_nhoods = len(nhood_sizes)
|
| 275 |
+
self.clamp_to_percentile = clamp_to_percentile
|
| 276 |
+
self.eps = eps
|
| 277 |
+
|
| 278 |
+
def warmup(self):
|
| 279 |
+
feats, radii = (
|
| 280 |
+
np.zeros([1, 2048], dtype=np.float32),
|
| 281 |
+
np.zeros([1, 1], dtype=np.float32),
|
| 282 |
+
)
|
| 283 |
+
self.evaluate_pr(feats, radii, feats, radii)
|
| 284 |
+
|
| 285 |
+
def manifold_radii(self, features: np.ndarray) -> np.ndarray:
|
| 286 |
+
num_images = len(features)
|
| 287 |
+
|
| 288 |
+
# Estimate manifold of features by calculating distances to k-NN of each sample.
|
| 289 |
+
radii = np.zeros([num_images, self.num_nhoods], dtype=np.float32)
|
| 290 |
+
distance_batch = np.zeros([self.row_batch_size, num_images], dtype=np.float32)
|
| 291 |
+
seq = np.arange(max(self.nhood_sizes) + 1, dtype=np.int32)
|
| 292 |
+
|
| 293 |
+
for begin1 in range(0, num_images, self.row_batch_size):
|
| 294 |
+
end1 = min(begin1 + self.row_batch_size, num_images)
|
| 295 |
+
row_batch = features[begin1:end1]
|
| 296 |
+
|
| 297 |
+
for begin2 in range(0, num_images, self.col_batch_size):
|
| 298 |
+
end2 = min(begin2 + self.col_batch_size, num_images)
|
| 299 |
+
col_batch = features[begin2:end2]
|
| 300 |
+
|
| 301 |
+
# Compute distances between batches.
|
| 302 |
+
distance_batch[
|
| 303 |
+
0 : end1 - begin1, begin2:end2
|
| 304 |
+
] = self.distance_block.pairwise_distances(row_batch, col_batch)
|
| 305 |
+
|
| 306 |
+
# Find the k-nearest neighbor from the current batch.
|
| 307 |
+
radii[begin1:end1, :] = np.concatenate(
|
| 308 |
+
[
|
| 309 |
+
x[:, self.nhood_sizes]
|
| 310 |
+
for x in _numpy_partition(distance_batch[0 : end1 - begin1, :], seq, axis=1)
|
| 311 |
+
],
|
| 312 |
+
axis=0,
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
if self.clamp_to_percentile is not None:
|
| 316 |
+
max_distances = np.percentile(radii, self.clamp_to_percentile, axis=0)
|
| 317 |
+
radii[radii > max_distances] = 0
|
| 318 |
+
return radii
|
| 319 |
+
|
| 320 |
+
def evaluate(self, features: np.ndarray, radii: np.ndarray, eval_features: np.ndarray):
|
| 321 |
+
"""
|
| 322 |
+
Evaluate if new feature vectors are at the manifold.
|
| 323 |
+
"""
|
| 324 |
+
num_eval_images = eval_features.shape[0]
|
| 325 |
+
num_ref_images = radii.shape[0]
|
| 326 |
+
distance_batch = np.zeros([self.row_batch_size, num_ref_images], dtype=np.float32)
|
| 327 |
+
batch_predictions = np.zeros([num_eval_images, self.num_nhoods], dtype=np.int32)
|
| 328 |
+
max_realism_score = np.zeros([num_eval_images], dtype=np.float32)
|
| 329 |
+
nearest_indices = np.zeros([num_eval_images], dtype=np.int32)
|
| 330 |
+
|
| 331 |
+
for begin1 in range(0, num_eval_images, self.row_batch_size):
|
| 332 |
+
end1 = min(begin1 + self.row_batch_size, num_eval_images)
|
| 333 |
+
feature_batch = eval_features[begin1:end1]
|
| 334 |
+
|
| 335 |
+
for begin2 in range(0, num_ref_images, self.col_batch_size):
|
| 336 |
+
end2 = min(begin2 + self.col_batch_size, num_ref_images)
|
| 337 |
+
ref_batch = features[begin2:end2]
|
| 338 |
+
|
| 339 |
+
distance_batch[
|
| 340 |
+
0 : end1 - begin1, begin2:end2
|
| 341 |
+
] = self.distance_block.pairwise_distances(feature_batch, ref_batch)
|
| 342 |
+
|
| 343 |
+
# From the minibatch of new feature vectors, determine if they are in the estimated manifold.
|
| 344 |
+
# If a feature vector is inside a hypersphere of some reference sample, then
|
| 345 |
+
# the new sample lies at the estimated manifold.
|
| 346 |
+
# The radii of the hyperspheres are determined from distances of neighborhood size k.
|
| 347 |
+
samples_in_manifold = distance_batch[0 : end1 - begin1, :, None] <= radii
|
| 348 |
+
batch_predictions[begin1:end1] = np.any(samples_in_manifold, axis=1).astype(np.int32)
|
| 349 |
+
|
| 350 |
+
max_realism_score[begin1:end1] = np.max(
|
| 351 |
+
radii[:, 0] / (distance_batch[0 : end1 - begin1, :] + self.eps), axis=1
|
| 352 |
+
)
|
| 353 |
+
nearest_indices[begin1:end1] = np.argmin(distance_batch[0 : end1 - begin1, :], axis=1)
|
| 354 |
+
|
| 355 |
+
return {
|
| 356 |
+
"fraction": float(np.mean(batch_predictions)),
|
| 357 |
+
"batch_predictions": batch_predictions,
|
| 358 |
+
"max_realisim_score": max_realism_score,
|
| 359 |
+
"nearest_indices": nearest_indices,
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
def evaluate_pr(
    self,
    features_1: np.ndarray,
    radii_1: np.ndarray,
    features_2: np.ndarray,
    radii_2: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Evaluate precision and recall efficiently.

    :param features_1: [N1 x D] feature vectors for reference batch.
    :param radii_1: [N1 x K1] radii for reference vectors.
    :param features_2: [N2 x D] feature vectors for the other batch.
    :param radii_2: [N2 x K2] radii for other vectors.
    :return: a tuple of arrays for (precision, recall):
             - precision: an np.ndarray of length K1
             - recall: an np.ndarray of length K2
    """
    # Fix: np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin `bool` is the documented replacement and what np.bool aliased.
    features_1_status = np.zeros([len(features_1), radii_2.shape[1]], dtype=bool)
    features_2_status = np.zeros([len(features_2), radii_1.shape[1]], dtype=bool)
    # Tile the pairwise work so only row_batch_size x col_batch_size
    # distance blocks are ever materialized at once.
    for begin_1 in range(0, len(features_1), self.row_batch_size):
        end_1 = begin_1 + self.row_batch_size
        batch_1 = features_1[begin_1:end_1]
        for begin_2 in range(0, len(features_2), self.col_batch_size):
            end_2 = begin_2 + self.col_batch_size
            batch_2 = features_2[begin_2:end_2]
            # less_thans returns, per radius column, which rows of each
            # batch fall inside some hypersphere of the other batch.
            batch_1_in, batch_2_in = self.distance_block.less_thans(
                batch_1, radii_1[begin_1:end_1], batch_2, radii_2[begin_2:end_2]
            )
            # OR-accumulate: a row is "in" if any tile said so.
            features_1_status[begin_1:end_1] |= batch_1_in
            features_2_status[begin_2:end_2] |= batch_2_in
    return (
        np.mean(features_2_status.astype(np.float64), axis=0),
        np.mean(features_1_status.astype(np.float64), axis=0),
    )
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
class DistanceBlock:
    """
    Calculate pairwise distances between vectors.

    Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L34
    """

    def __init__(self, session):
        # TF1-style session whose graph hosts the distance computation.
        self.session = session

        # Initialize TF graph to calculate pairwise distances.
        with session.graph.as_default():
            self._features_batch1 = tf.placeholder(tf.float32, shape=[None, None])
            self._features_batch2 = tf.placeholder(tf.float32, shape=[None, None])
            # Try the computation in float16 first (faster, less memory);
            # if any resulting entry is non-finite, recompute in float32.
            distance_block_16 = _batch_pairwise_distances(
                tf.cast(self._features_batch1, tf.float16),
                tf.cast(self._features_batch2, tf.float16),
            )
            self.distance_block = tf.cond(
                tf.reduce_all(tf.math.is_finite(distance_block_16)),
                lambda: tf.cast(distance_block_16, tf.float32),
                lambda: _batch_pairwise_distances(self._features_batch1, self._features_batch2),
            )

            # Extra logic for less thans.
            # radii placeholders: [N1 x K1] and [N2 x K2] per-row thresholds.
            self._radii1 = tf.placeholder(tf.float32, shape=[None, None])
            self._radii2 = tf.placeholder(tf.float32, shape=[None, None])
            dist32 = tf.cast(self.distance_block, tf.float32)[..., None]
            # [N1 x K2]: batch-1 row falls within some batch-2 radius column.
            self._batch_1_in = tf.math.reduce_any(dist32 <= self._radii2, axis=1)
            # [N2 x K1]: batch-2 row falls within some batch-1 radius column.
            self._batch_2_in = tf.math.reduce_any(dist32 <= self._radii1[:, None], axis=0)

    def pairwise_distances(self, U, V):
        """
        Evaluate pairwise distances between two batches of feature vectors.
        """
        return self.session.run(
            self.distance_block,
            feed_dict={self._features_batch1: U, self._features_batch2: V},
        )

    def less_thans(self, batch_1, radii_1, batch_2, radii_2):
        # Run both membership tests in a single session call; returns the
        # pair of boolean arrays built in __init__ (_batch_1_in, _batch_2_in).
        return self.session.run(
            [self._batch_1_in, self._batch_2_in],
            feed_dict={
                self._features_batch1: batch_1,
                self._features_batch2: batch_2,
                self._radii1: radii_1,
                self._radii2: radii_2,
            },
        )
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
def _batch_pairwise_distances(U, V):
    """
    Compute pairwise distances between two batches of feature vectors.

    Returns D with D[i, j] = ||U[i] - V[j]||^2 — note these are *squared*
    Euclidean distances, clamped at zero to absorb negative rounding error.
    """
    with tf.variable_scope("pairwise_dist_block"):
        # Squared norms of each row in U and V.
        norm_u = tf.reduce_sum(tf.square(U), 1)
        norm_v = tf.reduce_sum(tf.square(V), 1)

        # norm_u as a column and norm_v as a row vectors.
        norm_u = tf.reshape(norm_u, [-1, 1])
        norm_v = tf.reshape(norm_v, [1, -1])

        # Pairwise squared Euclidean distances via ||u||^2 - 2 u.v + ||v||^2.
        D = tf.maximum(norm_u - 2 * tf.matmul(U, V, False, True) + norm_v, 0.0)

        return D
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
class NpzArrayReader(ABC):
    """Abstract reader that serves an .npy array in row batches."""

    @abstractmethod
    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
        pass

    @abstractmethod
    def remaining(self) -> int:
        pass

    def read_batches(self, batch_size: int) -> Iterable[np.ndarray]:
        """Return a length-aware iterator over every remaining batch."""

        def _produce():
            while (chunk := self.read_batch(batch_size)) is not None:
                yield chunk

        left = self.remaining()
        # Ceiling division: a partial remainder still counts as one batch.
        total = -(-left // batch_size)
        return BatchIterator(_produce, total)
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
class BatchIterator:
    """Iterable with a known length, backed by a generator factory."""

    def __init__(self, gen_fn, length):
        # gen_fn: zero-argument callable returning a fresh iterator per call.
        self.gen_fn = gen_fn
        # length: number of items gen_fn's iterator is expected to yield.
        self.length = length

    def __len__(self):
        return self.length

    def __iter__(self):
        return self.gen_fn()
|
| 502 |
+
|
| 503 |
+
|
| 504 |
+
class StreamingNpzArrayReader(NpzArrayReader):
    """Reads rows lazily from an open .npy file object, batch by batch."""

    def __init__(self, arr_f, shape, dtype):
        self.arr_f = arr_f  # file-like, positioned at the raw array data
        self.shape = shape  # full array shape, rows first
        self.dtype = dtype  # numpy dtype of the stored values
        self.idx = 0        # rows consumed so far

    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
        """Return up to batch_size rows, or None once exhausted."""
        total_rows = self.shape[0]
        if self.idx >= total_rows:
            return None

        take = min(batch_size, total_rows - self.idx)
        self.idx += take

        # Zero-sized dtypes carry no bytes; hand back a container directly.
        if self.dtype.itemsize == 0:
            return np.ndarray([take, *self.shape[1:]], dtype=self.dtype)

        n_items = take * np.prod(self.shape[1:])
        n_bytes = int(n_items * self.dtype.itemsize)
        raw = _read_bytes(self.arr_f, n_bytes, "array data")
        return np.frombuffer(raw, dtype=self.dtype).reshape([take, *self.shape[1:]])

    def remaining(self) -> int:
        return max(0, self.shape[0] - self.idx)
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
class MemoryNpzArrayReader(NpzArrayReader):
    """Serves batches from an array loaded fully into memory."""

    def __init__(self, arr):
        self.arr = arr
        self.idx = 0  # rows handed out so far

    @classmethod
    def load(cls, path: str, arr_name: str):
        """Read array arr_name out of the npz file at path."""
        with open(path, "rb") as f:
            arr = np.load(f)[arr_name]
        return cls(arr)

    def read_batch(self, batch_size: int) -> Optional[np.ndarray]:
        if self.idx >= self.arr.shape[0]:
            return None

        start = self.idx
        self.idx = start + batch_size
        return self.arr[start : start + batch_size]

    def remaining(self) -> int:
        return max(0, self.arr.shape[0] - self.idx)
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
@contextmanager
def open_npz_array(path: str, arr_name: str) -> NpzArrayReader:
    """
    Context manager yielding an NpzArrayReader for one array of an npz file.

    Streams the raw data when the header allows it; falls back to loading
    the whole array in memory for unrecognized format versions, Fortran
    order, or object dtypes (which cannot be read as a flat byte stream).
    """
    with _open_npy_file(path, arr_name) as arr_f:
        version = np.lib.format.read_magic(arr_f)
        header_readers = {
            (1, 0): np.lib.format.read_array_header_1_0,
            (2, 0): np.lib.format.read_array_header_2_0,
        }
        read_header = header_readers.get(version)
        if read_header is None:
            yield MemoryNpzArrayReader.load(path, arr_name)
            return
        shape, fortran, dtype = read_header(arr_f)
        if fortran or dtype.hasobject:
            yield MemoryNpzArrayReader.load(path, arr_name)
        else:
            yield StreamingNpzArrayReader(arr_f, shape, dtype)
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
def _read_bytes(fp, size, error_template="ran out of data"):
|
| 572 |
+
"""
|
| 573 |
+
Copied from: https://github.com/numpy/numpy/blob/fb215c76967739268de71aa4bda55dd1b062bc2e/numpy/lib/format.py#L788-L886
|
| 574 |
+
|
| 575 |
+
Read from file-like object until size bytes are read.
|
| 576 |
+
Raises ValueError if not EOF is encountered before size bytes are read.
|
| 577 |
+
Non-blocking objects only supported if they derive from io objects.
|
| 578 |
+
Required as e.g. ZipExtFile in python 2.6 can return less data than
|
| 579 |
+
requested.
|
| 580 |
+
"""
|
| 581 |
+
data = bytes()
|
| 582 |
+
while True:
|
| 583 |
+
# io files (default in python3) return None or raise on
|
| 584 |
+
# would-block, python2 file will truncate, probably nothing can be
|
| 585 |
+
# done about that. note that regular files can't be non-blocking
|
| 586 |
+
try:
|
| 587 |
+
r = fp.read(size - len(data))
|
| 588 |
+
data += r
|
| 589 |
+
if len(r) == 0 or len(data) == size:
|
| 590 |
+
break
|
| 591 |
+
except io.BlockingIOError:
|
| 592 |
+
pass
|
| 593 |
+
if len(data) != size:
|
| 594 |
+
msg = "EOF: reading %s, expected %d bytes got %d"
|
| 595 |
+
raise ValueError(msg % (error_template, size, len(data)))
|
| 596 |
+
else:
|
| 597 |
+
return data
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
@contextmanager
|
| 601 |
+
def _open_npy_file(path: str, arr_name: str):
|
| 602 |
+
with open(path, "rb") as f:
|
| 603 |
+
with zipfile.ZipFile(f, "r") as zip_f:
|
| 604 |
+
if f"{arr_name}.npy" not in zip_f.namelist():
|
| 605 |
+
raise ValueError(f"missing {arr_name} in npz file")
|
| 606 |
+
with zip_f.open(f"{arr_name}.npy", "r") as arr_f:
|
| 607 |
+
yield arr_f
|
| 608 |
+
|
| 609 |
+
|
| 610 |
+
def _download_inception_model():
    """Download the InceptionV3 frozen graph to INCEPTION_V3_PATH if missing."""
    if os.path.exists(INCEPTION_V3_PATH):
        return
    print("downloading InceptionV3 model...")
    with requests.get(INCEPTION_V3_URL, stream=True) as r:
        r.raise_for_status()
        # Stream into a temp file and rename at the end so an interrupted
        # download never leaves a truncated file at the final path.
        tmp_path = INCEPTION_V3_PATH + ".tmp"
        with open(tmp_path, "wb") as f:
            for chunk in tqdm(r.iter_content(chunk_size=8192)):
                f.write(chunk)
        os.rename(tmp_path, INCEPTION_V3_PATH)
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
def _create_feature_graph(input_batch):
    """
    Import the InceptionV3 graph and return (pool3, spatial) feature tensors
    computed from input_batch.

    A random name prefix keeps repeated imports into the same graph from
    colliding; spatial features are truncated to their first 7 channels.
    """
    _download_inception_model()
    prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
    with open(INCEPTION_V3_PATH, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    pool3, spatial = tf.import_graph_def(
        graph_def,
        # Fix: dropped the stray f-prefix on a literal with no placeholders.
        input_map={"ExpandDims:0": input_batch},
        return_elements=[FID_POOL_NAME, FID_SPATIAL_NAME],
        name=prefix,
    )
    # Rewrite static shapes so the batch dimension is flexible.
    _update_shapes(pool3)
    spatial = spatial[..., :7]
    return pool3, spatial
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
def _create_softmax_graph(input_batch):
    """
    Import the InceptionV3 classifier weight matrix and return a softmax
    tensor over input_batch (pooled features) using those weights.
    """
    _download_inception_model()
    prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}"
    with open(INCEPTION_V3_PATH, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    (matmul,) = tf.import_graph_def(
        # Fix: dropped the stray f-prefix on a literal with no placeholders.
        graph_def, return_elements=["softmax/logits/MatMul"], name=prefix
    )
    # Reuse only the imported weight matrix; build a fresh matmul + softmax.
    w = matmul.inputs[1]
    logits = tf.matmul(input_batch, w)
    return tf.nn.softmax(logits)
|
| 652 |
+
|
| 653 |
+
|
| 654 |
+
def _update_shapes(pool3):
    """
    Rewrite static shapes in pool3's graph so any leading dimension pinned
    to 1 becomes None, allowing arbitrary batch sizes at run time.
    """
    # https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L50-L63
    ops = pool3.graph.get_operations()
    for op in ops:
        for o in op.outputs:
            shape = o.get_shape()
            if shape._dims is not None: # pylint: disable=protected-access
                # shape = [s.value for s in shape] TF 1.x
                shape = [s for s in shape] # TF 2.x
                new_shape = []
                for j, s in enumerate(shape):
                    # Un-pin only the leading (batch) dimension.
                    if s == 1 and j == 0:
                        new_shape.append(None)
                    else:
                        new_shape.append(s)
                # HACK: writes the private shape cache directly; there is no
                # public API for overriding an imported tensor's static shape.
                o.__dict__["_shape_val"] = tf.TensorShape(new_shape)
    return pool3
|
| 671 |
+
|
| 672 |
+
|
| 673 |
+
def _numpy_partition(arr, kth, **kwargs):
|
| 674 |
+
num_workers = min(cpu_count(), len(arr))
|
| 675 |
+
chunk_size = len(arr) // num_workers
|
| 676 |
+
extra = len(arr) % num_workers
|
| 677 |
+
|
| 678 |
+
start_idx = 0
|
| 679 |
+
batches = []
|
| 680 |
+
for i in range(num_workers):
|
| 681 |
+
size = chunk_size + (1 if i < extra else 0)
|
| 682 |
+
batches.append(arr[start_idx : start_idx + size])
|
| 683 |
+
start_idx += size
|
| 684 |
+
|
| 685 |
+
with ThreadPool(num_workers) as pool:
|
| 686 |
+
return list(pool.map(partial(np.partition, kth=kth, **kwargs), batches))
|
| 687 |
+
|
| 688 |
+
|
| 689 |
+
if __name__ == "__main__":
    # CLI entry point: run the evaluator defined above.
    main()
|
GVP/Baseline/gvp_sampling.log
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 0 |
0%| | 0/1042 [00:00<?, ?it/s]
|
| 1 |
0%| | 1/1042 [00:59<17:12:21, 59.50s/it]
|
| 2 |
0%| | 2/1042 [01:58<17:01:08, 58.91s/it]
|
| 3 |
0%| | 3/1042 [02:57<17:02:36, 59.05s/it]
|
| 4 |
0%| | 4/1042 [03:56<17:00:31, 58.99s/it]
|
| 5 |
0%| | 5/1042 [04:52<16:42:43, 58.02s/it]
|
| 6 |
1%| | 6/1042 [05:51<16:47:28, 58.35s/it]
|
| 7 |
1%| | 7/1042 [06:50<16:53:25, 58.75s/it]
|
| 8 |
1%| | 8/1042 [07:49<16:49:35, 58.58s/it]
|
| 9 |
1%| | 9/1042 [08:48<16:51:57, 58.78s/it]
|
| 10 |
1%| | 10/1042 [09:46<16:49:39, 58.70s/it]
|
| 11 |
1%| | 11/1042 [10:45<16:48:10, 58.67s/it]
|
| 12 |
1%| | 12/1042 [11:45<16:53:10, 59.02s/it]
|
| 13 |
1%| | 13/1042 [12:44<16:52:36, 59.04s/it]
|
| 14 |
1%|▏ | 14/1042 [13:43<16:50:11, 58.96s/it]W0407 16:34:53.638000 2760 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 2845 closing signal SIGTERM
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0407 16:17:44.645000 2760 site-packages/torch/distributed/run.py:793]
|
| 2 |
+
W0407 16:17:44.645000 2760 site-packages/torch/distributed/run.py:793] *****************************************
|
| 3 |
+
W0407 16:17:44.645000 2760 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0407 16:17:44.645000 2760 site-packages/torch/distributed/run.py:793] *****************************************
|
| 5 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 6 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 7 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 8 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 9 |
+
Starting rank=0, seed=0, world_size=4.
|
| 10 |
+
Starting rank=2, seed=2, world_size=4.
|
| 11 |
+
Starting rank=1, seed=1, world_size=4.
|
| 12 |
+
Starting rank=3, seed=3, world_size=4.
|
| 13 |
+
[rank1]:[W407 16:20:17.131912166 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 14 |
+
[rank3]:[W407 16:20:17.153628536 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 15 |
+
Saving .png samples at baseline_gvp_/SiT-XL-2-base-cfg-1.0-12-SDE-250-Euler-sigma-Mean-0.04
|
| 16 |
+
[rank0]:[W407 16:20:17.306737681 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 17 |
+
[rank2]:[W407 16:20:18.780347929 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
|
| 18 |
+
Total number of images that will be sampled: 50016
|
| 19 |
+
|
| 20 |
0%| | 0/1042 [00:00<?, ?it/s]
|
| 21 |
0%| | 1/1042 [00:59<17:12:21, 59.50s/it]
|
| 22 |
0%| | 2/1042 [01:58<17:01:08, 58.91s/it]
|
| 23 |
0%| | 3/1042 [02:57<17:02:36, 59.05s/it]
|
| 24 |
0%| | 4/1042 [03:56<17:00:31, 58.99s/it]
|
| 25 |
0%| | 5/1042 [04:52<16:42:43, 58.02s/it]
|
| 26 |
1%| | 6/1042 [05:51<16:47:28, 58.35s/it]
|
| 27 |
1%| | 7/1042 [06:50<16:53:25, 58.75s/it]
|
| 28 |
1%| | 8/1042 [07:49<16:49:35, 58.58s/it]
|
| 29 |
1%| | 9/1042 [08:48<16:51:57, 58.78s/it]
|
| 30 |
1%| | 10/1042 [09:46<16:49:39, 58.70s/it]
|
| 31 |
1%| | 11/1042 [10:45<16:48:10, 58.67s/it]
|
| 32 |
1%| | 12/1042 [11:45<16:53:10, 59.02s/it]
|
| 33 |
1%| | 13/1042 [12:44<16:52:36, 59.04s/it]
|
| 34 |
1%|▏ | 14/1042 [13:43<16:50:11, 58.96s/it]W0407 16:34:53.638000 2760 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 2845 closing signal SIGTERM
|
| 35 |
+
W0407 16:34:53.639000 2760 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 2847 closing signal SIGTERM
|
| 36 |
+
W0407 16:34:53.639000 2760 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 2848 closing signal SIGTERM
|
| 37 |
+
E0407 16:34:53.854000 2760 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: -9) local_rank: 1 (pid: 2846) of binary: /root/miniconda3/envs/SiT/bin/python3.10
|
| 38 |
+
Traceback (most recent call last):
|
| 39 |
+
File "/root/miniconda3/envs/SiT/bin/torchrun", line 6, in <module>
|
| 40 |
+
sys.exit(main())
|
| 41 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 42 |
+
return f(*args, **kwargs)
|
| 43 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main
|
| 44 |
+
run(args)
|
| 45 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run
|
| 46 |
+
elastic_launch(
|
| 47 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
|
| 48 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 49 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
|
| 50 |
+
raise ChildFailedError(
|
| 51 |
+
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
| 52 |
+
==========================================================
|
| 53 |
+
sample_ddp.py FAILED
|
| 54 |
+
----------------------------------------------------------
|
| 55 |
+
Failures:
|
| 56 |
+
<NO_OTHER_FAILURES>
|
| 57 |
+
----------------------------------------------------------
|
| 58 |
+
Root Cause (first observed failure):
|
| 59 |
+
[0]:
|
| 60 |
+
time : 2026-04-07_16:34:53
|
| 61 |
+
host : 280c8972fe62c4ab251b3c74bd05a546-taskrole1-0
|
| 62 |
+
rank : 1 (local_rank: 1)
|
| 63 |
+
exitcode : -9 (pid: 2846)
|
| 64 |
+
error_file: <N/A>
|
| 65 |
+
traceback : Signal 9 (SIGKILL) received by PID 2846
|
| 66 |
+
==========================================================
|
GVP/Baseline/models.py
ADDED
|
@@ -0,0 +1,647 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This source code is licensed under the license found in the
|
| 2 |
+
# LICENSE file in the root directory of this source tree.
|
| 3 |
+
# --------------------------------------------------------
|
| 4 |
+
# References:
|
| 5 |
+
# GLIDE: https://github.com/openai/glide-text2im
|
| 6 |
+
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
|
| 7 |
+
# --------------------------------------------------------
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
import numpy as np
|
| 12 |
+
import math
|
| 13 |
+
from timm.models.vision_transformer import PatchEmbed, Attention, Mlp
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def modulate(x, shift, scale):
    """Apply adaLN modulation: scale then shift x.

    shift and scale are per-sample (N, D); unsqueezing inserts the token
    axis so they broadcast over x's middle dimension.
    """
    scaled = x * (1 + scale.unsqueeze(1))
    return scaled + shift.unsqueeze(1)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#################################################################################
|
| 21 |
+
# Embedding Layers for Timesteps and Class Labels #
|
| 22 |
+
#################################################################################
|
| 23 |
+
|
| 24 |
+
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        exponents = torch.arange(start=0, end=half, dtype=torch.float32) / half
        freqs = torch.exp(-math.log(max_period) * exponents).to(device=t.device)
        phases = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(phases), torch.sin(phases)], dim=-1)
        if dim % 2:
            # Odd target dims get one zero column of padding.
            pad = torch.zeros_like(embedding[:, :1])
            embedding = torch.cat([embedding, pad], dim=-1)
        return embedding

    def forward(self, t):
        freq_emb = self.timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(freq_emb)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class LabelEmbedder(nn.Module):
    """
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
    """

    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        # Reserve one extra embedding row as the "null" label whenever
        # dropout is enabled (classifier-free guidance).
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.
        """
        if force_drop_ids is not None:
            drop_ids = force_drop_ids == 1
        else:
            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        # Dropped entries are remapped to the dedicated null-label index.
        return torch.where(drop_ids, self.num_classes, labels)

    def forward(self, labels, train, force_drop_ids=None):
        dropout_active = self.dropout_prob > 0
        if (train and dropout_active) or (force_drop_ids is not None):
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
#################################################################################
|
| 95 |
+
# Core SiT Model #
|
| 96 |
+
#################################################################################
|
| 97 |
+
|
| 98 |
+
class SiTBlock(nn.Module):
    """
    A SiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=int(hidden_size * mlp_ratio),
            act_layer=approx_gelu,
            drop=0,
        )
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        # The conditioning vector c yields six per-sample modulation signals:
        # shift/scale/gate for the attention branch and for the MLP branch.
        mods = self.adaLN_modulation(c).chunk(6, dim=1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mods
        attn_out = self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
        x = x + gate_msa.unsqueeze(1) * attn_out
        mlp_out = self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
        return x + gate_mlp.unsqueeze(1) * mlp_out
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class FinalLayer(nn.Module):
    """
    The final layer of SiT.
    """

    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        # Condition the final norm on c, then project tokens to patch pixels.
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        modulated = modulate(self.norm_final(x), shift, scale)
        return self.linear(modulated)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
class SiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.
    """
    def __init__(
        self,
        input_size=32,
        patch_size=2,
        in_channels=4,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=1000,
        learn_sigma=True,
    ):
        """
        Args:
            input_size: spatial size of the (latent) input.
            patch_size: side length of each square patch.
            in_channels: number of input channels.
            hidden_size: transformer embedding dimension.
            depth: number of SiT blocks.
            num_heads: attention heads per block.
            mlp_ratio: MLP hidden width as a multiple of hidden_size.
            class_dropout_prob: label-dropout probability (classifier-free guidance).
            num_classes: number of class labels.
            learn_sigma: if True, the head predicts both mean and variance,
                doubling the output channels.
        """
        super().__init__()
        # BUGFIX: `learn_sigma` was assigned from the argument and then
        # immediately overwritten with True, silently ignoring the caller's
        # choice. Honor the argument and size the output head accordingly.
        # The default (True) preserves the previous behavior and checkpoint
        # compatibility.
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads

        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        num_patches = self.x_embedder.num_patches
        # Will use fixed (frozen) sin-cos positional embedding:
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)

        self.blocks = nn.ModuleList([
            SiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
        ])
        self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
        self.initialize_weights()

    def initialize_weights(self):
        """Initialize weights: xavier linears, sin-cos pos-embed, zeroed adaLN/output head."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize (and freeze) pos_embed by sin-cos embedding:
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize label embedding table:
        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in SiT blocks so each block starts
        # as an identity mapping:
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, C, H, W)   # note: channel-first, not (N, H, W, C)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        h = w = int(x.shape[1] ** 0.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
        return imgs

    def forward(self, x, t, y, return_act=False):
        """
        Forward pass of SiT.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        y: (N,) tensor of class labels
        return_act: if True, also return per-block activations
        """
        act = []
        x = self.x_embedder(x) + self.pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(t)                   # (N, D)
        y = self.y_embedder(y, self.training)    # (N, D)
        c = t + y                                # (N, D)
        for block in self.blocks:
            x = block(x, c)                      # (N, T, D)
            if return_act:
                act.append(x)
        x = self.final_layer(x, c)               # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)                   # (N, out_channels, H, W)
        if self.learn_sigma:
            # Drop the predicted variance channels; keep only the mean.
            x, _ = x.chunk(2, dim=1)
        if return_act:
            return x, act
        return x

    def forward_with_cfg(self, x, t, y, cfg_scale):
        """
        Forward pass of SiT, but also batches the unconditional forward pass
        for classifier-free guidance.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        model_out = self.forward(combined, t, y)
        # For exact reproducibility reasons, we apply classifier-free guidance on only
        # three channels by default. The standard approach to cfg applies it to all channels.
        # This can be done by uncommenting the following line and commenting-out the line following that.
        # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
#################################################################################
|
| 276 |
+
# Sine/Cosine Positional Embedding Functions #
|
| 277 |
+
#################################################################################
|
| 278 |
+
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
|
| 279 |
+
|
| 280 |
+
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    coords_h = np.arange(grid_size, dtype=np.float32)
    coords_w = np.arange(grid_size, dtype=np.float32)
    # meshgrid with w first, matching the reference MAE implementation.
    mesh = np.stack(np.meshgrid(coords_w, coords_h), axis=0)
    mesh = mesh.reshape([2, 1, grid_size, grid_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
    if cls_token and extra_tokens > 0:
        # Prepend zero rows for the extra (e.g. class) tokens.
        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
    return pos_embed
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Build a 2-D sin-cos embedding: first half of the channels encodes the
    H coordinates (grid[0]), second half the W coordinates (grid[1])."""
    assert embed_dim % 2 == 0
    half = embed_dim // 2
    per_axis = [
        get_1d_sincos_pos_embed_from_grid(half, axis_positions)  # (H*W, D/2)
        for axis_positions in (grid[0], grid[1])
    ]
    return np.concatenate(per_axis, axis=1)  # (H*W, D)
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2
    # Geometric frequency ladder: 1 / 10000^(2i/D) for i in [0, D/2).
    freqs = 1.0 / 10000 ** (np.arange(half_dim, dtype=np.float64) / half_dim)  # (D/2,)
    angles = np.outer(np.reshape(pos, -1), freqs)  # (M, D/2) outer product
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
#################################################################################
|
| 331 |
+
# SiT Configs #
|
| 332 |
+
#################################################################################
|
| 333 |
+
|
| 334 |
+
# Named constructors for the standard SiT sizes. Naming scheme: SiT-<size>/<patch>.
# XL: depth 28 / width 1152 / 16 heads; L: 24 / 1024 / 16;
# B: 12 / 768 / 12; S: 12 / 384 / 6.

def SiT_XL_2(**kwargs):
    """SiT-XL with patch size 2."""
    return SiT(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)

def SiT_XL_4(**kwargs):
    """SiT-XL with patch size 4."""
    return SiT(depth=28, hidden_size=1152, patch_size=4, num_heads=16, **kwargs)

def SiT_XL_8(**kwargs):
    """SiT-XL with patch size 8."""
    return SiT(depth=28, hidden_size=1152, patch_size=8, num_heads=16, **kwargs)

def SiT_L_2(**kwargs):
    """SiT-L with patch size 2."""
    return SiT(depth=24, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)

def SiT_L_4(**kwargs):
    """SiT-L with patch size 4."""
    return SiT(depth=24, hidden_size=1024, patch_size=4, num_heads=16, **kwargs)

def SiT_L_8(**kwargs):
    """SiT-L with patch size 8."""
    return SiT(depth=24, hidden_size=1024, patch_size=8, num_heads=16, **kwargs)

def SiT_B_2(**kwargs):
    """SiT-B with patch size 2."""
    return SiT(depth=12, hidden_size=768, patch_size=2, num_heads=12, **kwargs)

def SiT_B_4(**kwargs):
    """SiT-B with patch size 4."""
    return SiT(depth=12, hidden_size=768, patch_size=4, num_heads=12, **kwargs)

def SiT_B_8(**kwargs):
    """SiT-B with patch size 8."""
    return SiT(depth=12, hidden_size=768, patch_size=8, num_heads=12, **kwargs)

def SiT_S_2(**kwargs):
    """SiT-S with patch size 2."""
    return SiT(depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs)

def SiT_S_4(**kwargs):
    """SiT-S with patch size 4."""
    return SiT(depth=12, hidden_size=384, patch_size=4, num_heads=6, **kwargs)

def SiT_S_8(**kwargs):
    """SiT-S with patch size 8."""
    return SiT(depth=12, hidden_size=384, patch_size=8, num_heads=6, **kwargs)


# Registry mapping model-name strings (as used on the command line) to constructors.
SiT_models = {
    'SiT-XL/2': SiT_XL_2,
    'SiT-XL/4': SiT_XL_4,
    'SiT-XL/8': SiT_XL_8,
    'SiT-L/2': SiT_L_2,
    'SiT-L/4': SiT_L_4,
    'SiT-L/8': SiT_L_8,
    'SiT-B/2': SiT_B_2,
    'SiT-B/4': SiT_B_4,
    'SiT-B/8': SiT_B_8,
    'SiT-S/2': SiT_S_2,
    'SiT-S/4': SiT_S_4,
    'SiT-S/8': SiT_S_8,
}
|
| 377 |
+
|
| 378 |
+
#################################################################################
|
| 379 |
+
# SiTF1, SiTF2, CombinedModel #
|
| 380 |
+
#################################################################################
|
| 381 |
+
|
| 382 |
+
class SiTF1(nn.Module):
    """
    SiTF1 Model: a SiT-style backbone whose forward pass returns BOTH the final
    patch-token sequence and a decoded image-space prediction (x_now).
    """
    def __init__(
        self,
        input_size=32,
        patch_size=2,
        in_channels=4,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=1000,
        learn_sigma=True,
        final_layer=None,
    ):
        """
        Args mirror SiT. `final_layer` is accepted for interface compatibility
        but is currently ignored: a fresh FinalLayer is always constructed.
        """
        super().__init__()
        self.input_size = input_size
        # Fix: `patch_size` was previously assigned twice; keep a single assignment.
        self.patch_size = patch_size
        self.hidden_size = hidden_size
        self.in_channels = in_channels
        self.out_channels = in_channels * 2
        self.num_heads = num_heads
        self.learn_sigma = learn_sigma
        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        num_patches = self.x_embedder.num_patches
        # Fixed (frozen) sin-cos positional embedding:
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)
        self.blocks = nn.ModuleList([
            SiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
        ])
        # NOTE: the `final_layer` argument is not used here (see docstring).
        self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
        self.initialize_weights()

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, C, H, W)   # note: channel-first, not (N, H, W, C)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        h = w = int(x.shape[1] ** 0.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
        return imgs

    def initialize_weights(self):
        """Initialize weights like SiT, except the final layer is NOT zeroed here."""
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)
        # Frozen sin-cos positional embedding:
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.x_embedder.proj.bias, 0)
        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        # Zero adaLN modulation in every block so blocks start as identity.
        # Unlike SiT.initialize_weights, the final layer is left at its default
        # initialization here.
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

    def forward(self, x, t, y):
        """
        Returns:
            x: final patch tokens, shape (N, T, D)
            x_now: decoded mean image prediction, shape (N, in_channels, H, W)
        """
        x = self.x_embedder(x) + self.pos_embed
        t = self.t_embedder(t)
        y = self.y_embedder(y, self.training)
        c = t + y
        for block in self.blocks:
            x = block(x, c)
        x_now = self.final_layer(x, c)       # (N, T, patch_size ** 2 * out_channels)
        x_now = self.unpatchify(x_now)       # (N, out_channels, H, W)
        # Keep only the mean channels; drop the variance half.
        x_now, _ = x_now.chunk(2, dim=1)
        return x, x_now                      # patch tokens (N, T, D), image (N, C, H, W)

    def forward_with_cfg(self, x, t, y, cfg_scale):
        """
        Forward pass with classifier-free guidance for SiTF1.
        Applies guidance consistently to both patch tokens and image output (x_now).
        """
        # Take the first half (conditional inputs) and duplicate it so that
        # it can be paired with conditional and unconditional labels in `y`.
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        patch_tokens, x_now = self.forward(combined, t, y)

        # Apply CFG on the image output channels (first 3 channels by default)
        eps, rest = x_now[:, :3, ...], x_now[:, 3:, ...]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        x_now = torch.cat([eps, rest], dim=1)

        # Apply same guidance logic to patch tokens so downstream modules see
        # a consistent guided representation.
        cond_tok, uncond_tok = torch.split(patch_tokens, len(patch_tokens) // 2, dim=0)
        half_tok = uncond_tok + cfg_scale * (cond_tok - uncond_tok)
        patch_tokens = torch.cat([half_tok, half_tok], dim=0)

        return patch_tokens, x_now
|
| 491 |
+
|
| 492 |
+
|
| 493 |
+
class SiTF2(nn.Module):
    """
    SiTF2: a shallow transformer head operating on a concatenated token
    sequence of length 2*num_patches.

    When `learn_sigma` is True the final layer predicts a mean and a
    log-variance, and the output is re-sampled stochastically:
    `mean + std * noise` if `learn_mu` is True, else `std * noise` alone.
    """
    def __init__(
        self,
        input_size=32,
        hidden_size=1152,
        out_channels=8,
        patch_size=2,
        num_heads=16,
        mlp_ratio=4.0,
        depth=4,
        learn_sigma=True,
        final_layer=None,
        num_classes=1000,
        class_dropout_prob=0.1,
        learn_mu=False,
    ):
        super().__init__()
        self.learn_sigma = learn_sigma
        self.learn_mu = learn_mu
        self.out_channels = out_channels
        # Input latents are hard-coded to 4 channels -- TODO confirm against callers.
        self.in_channels = 4
        self.patch_size = patch_size
        self.num_heads = num_heads
        self.blocks = nn.ModuleList([
            SiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
        ])
        self.x_embedder = PatchEmbed(input_size, patch_size, self.in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        num_patches = self.x_embedder.num_patches
        self.num_patches = num_patches  # Save original num_patches for unpatchify
        # pos_embed needs to support 2*num_patches for concatenated input
        self.pos_embed = nn.Parameter(torch.zeros(1, 2 * num_patches, hidden_size), requires_grad=False)
        # Initialize pos_embed with sin-cos embedding
        pos_embed = get_2d_sincos_pos_embed(hidden_size, int(num_patches ** 0.5))
        # Repeat the pos_embed for both halves (or could use different embeddings)
        pos_embed_full = np.concatenate([pos_embed, pos_embed], axis=0)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed_full).float().unsqueeze(0))

        if final_layer is not None:
            self.final_layer = final_layer
        else:
            self.final_layer = FinalLayer(hidden_size, patch_size, out_channels)
        # NOTE(review): when depth != 0, every parameter of the final layer is
        # zeroed -- including an externally supplied `final_layer`. Confirm this
        # is intended for provided layers as well.
        if depth !=0:
            for p in self.final_layer.parameters():
                if p is not None:
                    torch.nn.init.constant_(p, 0)

    def unpatchify(self, x, patch_size, out_channels):
        """Convert patch tokens back to an image; if the sequence holds
        2*num_patches tokens, only the first half is decoded.

        x: (N, T, patch_size**2 * out_channels) -> imgs: (N, C, H, W)
        """
        c = out_channels
        p = patch_size
        # x.shape[1] might be 2*num_patches when using concatenated input
        # Use original num_patches to calculate h and w
        h = w = int(self.num_patches ** 0.5)
        # If input has 2*num_patches, we need to handle it
        if x.shape[1] == 2 * self.num_patches:
            # Take only the first half (or average, or other strategy)
            # For now, we'll take the first half
            x = x[:, :self.num_patches, :]
        assert h * w == x.shape[1], f"Expected {h * w} patches, got {x.shape[1]}"
        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
        return imgs

    def forward(self, x, c, t, return_act=False):
        """
        x: (N, 2*T, D) pre-embedded token sequence; c: (N, D) conditioning.
        t is accepted but not used in this method.
        Returns the (possibly re-sampled) image, plus block activations if
        return_act is True.
        """
        act = []
        for block in self.blocks:
            x = block(x, c)
            if return_act:
                act.append(x)
        x = self.final_layer(x, c)
        x = self.unpatchify(x, self.patch_size, self.out_channels)
        if self.learn_sigma:
            # Split predicted mean and log-variance, then re-sample.
            mean_pred, log_var_pred = x.chunk(2, dim=1)
            variance_pred = torch.exp(log_var_pred)
            std_dev_pred = torch.sqrt(variance_pred)
            noise = torch.randn_like(mean_pred)
            #uniform_noise = torch.rand_like(mean_pred)
            #uniform_noise = uniform_noise.clamp(min=1e-5, max=1-1e-5)
            #gumbel_noise = -torch.log(-torch.log(uniform_noise))

            if self.learn_mu==True:
                resampled_x = mean_pred + std_dev_pred * noise
            else:
                # Mean is discarded: output is pure scaled noise.
                resampled_x = std_dev_pred * noise
            x = resampled_x
        else:
            x, _ = x.chunk(2, dim=1)
        if return_act:
            return x, act
        return x

    def forward_noise(self, x, c):
        """Same as forward() but without the timestep argument or activation
        collection; used for noise-only sampling paths."""
        for block in self.blocks:
            x = block(x, c)
        x = self.final_layer(x, c)
        x = self.unpatchify(x, self.patch_size, self.out_channels)
        if self.learn_sigma:
            mean_pred, log_var_pred = x.chunk(2, dim=1)
            variance_pred = torch.exp(log_var_pred)
            std_dev_pred = torch.sqrt(variance_pred)
            noise = torch.randn_like(mean_pred)
            if self.learn_mu==True:
                resampled_x = mean_pred + std_dev_pred * noise
            else:
                resampled_x = std_dev_pred * noise
            x = resampled_x
        else:
            x, _ = x.chunk(2, dim=1)
        return x
|
| 607 |
+
|
| 608 |
+
# Two design axes here: condition on the ideal prediction vs. the real input,
# and combine them by concatenation vs. by summation.
|
| 609 |
+
class CombinedModel(nn.Module):
    """
    CombinedModel: chains a SiTF1 backbone with a SiTF2 head.

    SiTF1 produces patch tokens and an image prediction x_now; an interpolation
    of x_now with the input is re-embedded and concatenated with the tokens,
    then fed to SiTF2 under SiTF1's own time/label conditioning.
    """
    def __init__(self, sitf1: SiTF1, sitf2: SiTF2):
        super().__init__()
        self.sitf1 = sitf1
        self.sitf2 = sitf2
        # Own a separate patch embedder (in addition to sitf1's) for re-embedding
        # the interpolated image; channel count is fixed at 4.
        input_size=self.sitf1.input_size
        patch_size=self.sitf1.patch_size
        hidden_size=self.sitf1.hidden_size
        self.x_embedder = PatchEmbed(input_size, patch_size, 4, hidden_size, bias=True)
        num_patches = self.x_embedder.num_patches
        # pos_embed needs to support 2*num_patches for concatenated input
        self.pos_embed = nn.Parameter(torch.zeros(1, 2 * num_patches, hidden_size), requires_grad=False)
        # Initialize pos_embed with sin-cos embedding
        pos_embed = get_2d_sincos_pos_embed(hidden_size, int(num_patches ** 0.5))
        # Repeat the pos_embed for both halves (or could use different embeddings)
        pos_embed_full = np.concatenate([pos_embed, pos_embed], axis=0)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed_full).float().unsqueeze(0))

    def forward(self, x, t, y, return_act=False):
        """
        x: (N, C, H, W) input; t: (N,) timesteps; y: (N,) class labels.
        Returns SiTF2's output (plus activations if return_act is True).
        """
        patch_tokens,x_now = self.sitf1(x, t, y)
        # t shape is (N,), need to broadcast to (N, 1, 1, 1) for broadcasting with image (N, C, H, W)
        t_broadcast = t.view(-1, 1, 1, 1)  # (N, 1, 1, 1)
        # NOTE(review): this computes (1 - t)*x_now + x, but the surrounding
        # design suggests the interpolation (1 - t)*x_now + t*x may have been
        # intended -- confirm which is correct before changing, since trained
        # checkpoints depend on the current behavior.
        x_interpolated = (1 - t_broadcast) * x_now + x
        # Convert interpolated input (image format) back to patch token format (without pos_embed, will add later)
        x_now_patches = self.x_embedder(x_interpolated)
        # Concatenate patch_tokens and x_now_patches along the sequence dimension
        concatenated_input = torch.cat([patch_tokens, x_now_patches], dim=1)  # (N, 2*T, D)
        # Add position embedding for the concatenated input
        # Use the same pos_embed for both halves (or could use different embeddings)
        concatenated_input = concatenated_input + self.pos_embed
        # Reuse SiTF1's embedders so conditioning is shared between the two stages.
        t_emb = self.sitf1.t_embedder(t)
        y_emb = self.sitf1.y_embedder(y, self.training)
        c = t_emb + y_emb
        return self.sitf2(concatenated_input, c, t, return_act=return_act)
|
GVP/Baseline/nohup.out
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 0 |
0%| | 0/1042 [00:00<?, ?it/s]
|
| 1 |
0%| | 1/1042 [00:13<4:01:42, 13.93s/it]
|
| 2 |
0%| | 2/1042 [00:26<3:44:31, 12.95s/it]
|
| 3 |
0%| | 3/1042 [00:39<3:43:49, 12.93s/it]W0317 10:30:53.664000 11774 site-packages/torch/distributed/elastic/agent/server/api.py:704] Received Signals.SIGINT death signal, shutting down workers
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
0%| | 3/1042 [00:39<3:46:17, 13.07s/it]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0317 10:27:10.803000 11774 site-packages/torch/distributed/run.py:793]
|
| 2 |
+
W0317 10:27:10.803000 11774 site-packages/torch/distributed/run.py:793] *****************************************
|
| 3 |
+
W0317 10:27:10.803000 11774 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0317 10:27:10.803000 11774 site-packages/torch/distributed/run.py:793] *****************************************
|
| 5 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 6 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 7 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 8 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 9 |
+
Starting rank=0, seed=0, world_size=4.
|
| 10 |
+
Starting rank=1, seed=1, world_size=4.
|
| 11 |
+
Starting rank=3, seed=3, world_size=4.
|
| 12 |
+
Starting rank=2, seed=2, world_size=4.
|
| 13 |
+
Saving .png samples at GVP_samples/depth-mu-6-0300000-base-cfg-1.0-12-SDE-100-Euler-sigma-Mean-0.04
|
| 14 |
+
Total number of images that will be sampled: 50016
|
| 15 |
+
|
| 16 |
0%| | 0/1042 [00:00<?, ?it/s]
|
| 17 |
0%| | 1/1042 [00:13<4:01:42, 13.93s/it]
|
| 18 |
0%| | 2/1042 [00:26<3:44:31, 12.95s/it]
|
| 19 |
0%| | 3/1042 [00:39<3:43:49, 12.93s/it]W0317 10:30:53.664000 11774 site-packages/torch/distributed/elastic/agent/server/api.py:704] Received Signals.SIGINT death signal, shutting down workers
|
| 20 |
+
W0317 10:30:53.667000 11774 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 11854 closing signal SIGINT
|
| 21 |
+
W0317 10:30:53.668000 11774 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 11855 closing signal SIGINT
|
| 22 |
+
W0317 10:30:53.668000 11774 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 11856 closing signal SIGINT
|
| 23 |
+
|
| 24 |
0%| | 3/1042 [00:39<3:46:17, 13.07s/it]
|
| 25 |
+
W0317 10:30:53.668000 11774 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 11857 closing signal SIGINT
|
| 26 |
+
[rank3]: Traceback (most recent call last):
|
| 27 |
+
[rank3]: File "/gemini/space/gzy_new/models/Baseline/sample_rectified_noise.py", line 380, in <module>
|
| 28 |
+
[rank3]: main(mode, args)
|
| 29 |
+
[rank3]: File "/gemini/space/gzy_new/models/Baseline/sample_rectified_noise.py", line 312, in main
|
| 30 |
+
[rank3]: samples = sample_fn(z, combined_sampling_model, **model_kwargs)[-1]
|
| 31 |
+
[rank3]: File "/gemini/space/gzy_new/models/Baseline/transport/transport.py", line 388, in _sample
|
| 32 |
+
[rank3]: xs = _sde.sample(init, model, **model_kwargs)
|
| 33 |
+
[rank3]: File "/gemini/space/gzy_new/models/Baseline/transport/integrators.py", line 72, in sample
|
| 34 |
+
[rank3]: x, mean_x = sampler(x, mean_x, ti, model, **model_kwargs)
|
| 35 |
+
[rank3]: File "/gemini/space/gzy_new/models/Baseline/transport/integrators.py", line 30, in __Euler_Maruyama_step
|
| 36 |
+
[rank3]: w_cur = th.randn(x.size()).to(x)
|
| 37 |
+
[rank3]: KeyboardInterrupt
|
| 38 |
+
[rank0]: Traceback (most recent call last):
|
| 39 |
+
[rank0]: File "/gemini/space/gzy_new/models/Baseline/sample_rectified_noise.py", line 380, in <module>
|
| 40 |
+
[rank0]: main(mode, args)
|
| 41 |
+
[rank0]: File "/gemini/space/gzy_new/models/Baseline/sample_rectified_noise.py", line 312, in main
|
| 42 |
+
[rank0]: samples = sample_fn(z, combined_sampling_model, **model_kwargs)[-1]
|
| 43 |
+
[rank0]: File "/gemini/space/gzy_new/models/Baseline/transport/transport.py", line 388, in _sample
|
| 44 |
+
[rank0]: xs = _sde.sample(init, model, **model_kwargs)
|
| 45 |
+
[rank0]: File "/gemini/space/gzy_new/models/Baseline/transport/integrators.py", line 72, in sample
|
| 46 |
+
[rank0]: x, mean_x = sampler(x, mean_x, ti, model, **model_kwargs)
|
| 47 |
+
[rank0]: File "/gemini/space/gzy_new/models/Baseline/transport/integrators.py", line 33, in __Euler_Maruyama_step
|
| 48 |
+
[rank0]: drift = self.drift(x, t, model, **model_kwargs)
|
| 49 |
+
[rank0]: File "/gemini/space/gzy_new/models/Baseline/transport/transport.py", line 299, in <lambda>
|
| 50 |
+
[rank0]: self.drift(x, t, model, **kwargs) + diffusion_fn(x, t) * self.score(x, t, model, **kwargs)
|
| 51 |
+
[rank0]: File "/gemini/space/gzy_new/models/Baseline/transport/transport.py", line 247, in body_fn
|
| 52 |
+
[rank0]: model_output = drift_fn(x, t, model, **model_kwargs)
|
| 53 |
+
[rank0]: File "/gemini/space/gzy_new/models/Baseline/transport/transport.py", line 236, in velocity_ode
|
| 54 |
+
[rank0]: model_output = model(x, t, **model_kwargs)
|
| 55 |
+
[rank0]: File "/gemini/space/gzy_new/models/Baseline/sample_rectified_noise.py", line 194, in combined_sampling_model
|
| 56 |
+
[rank0]: sit_out = base_model.forward(x, t, y)
|
| 57 |
+
[rank0]: File "/gemini/space/gzy_new/models/Baseline/models.py", line 245, in forward
|
| 58 |
+
[rank0]: x = block(x, c) # (N, T, D)
|
| 59 |
+
[rank0]: File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
|
| 60 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 61 |
+
[rank0]: File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
|
| 62 |
+
[rank0]: return forward_call(*args, **kwargs)
|
| 63 |
+
[rank0]: File "/gemini/space/gzy_new/models/Baseline/models.py", line 117, in forward
|
| 64 |
+
[rank0]: x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
|
| 65 |
+
[rank0]: KeyboardInterrupt
|
| 66 |
+
[rank1]: Traceback (most recent call last):
|
| 67 |
+
[rank1]: File "/gemini/space/gzy_new/models/Baseline/sample_rectified_noise.py", line 380, in <module>
|
| 68 |
+
[rank1]: main(mode, args)
|
| 69 |
+
[rank1]: File "/gemini/space/gzy_new/models/Baseline/sample_rectified_noise.py", line 312, in main
|
| 70 |
+
[rank1]: samples = sample_fn(z, combined_sampling_model, **model_kwargs)[-1]
|
| 71 |
+
[rank1]: File "/gemini/space/gzy_new/models/Baseline/transport/transport.py", line 388, in _sample
|
| 72 |
+
[rank1]: xs = _sde.sample(init, model, **model_kwargs)
|
| 73 |
+
[rank1]: File "/gemini/space/gzy_new/models/Baseline/transport/integrators.py", line 72, in sample
|
| 74 |
+
[rank1]: x, mean_x = sampler(x, mean_x, ti, model, **model_kwargs)
|
| 75 |
+
[rank1]: File "/gemini/space/gzy_new/models/Baseline/transport/integrators.py", line 33, in __Euler_Maruyama_step
|
| 76 |
+
[rank1]: drift = self.drift(x, t, model, **model_kwargs)
|
| 77 |
+
[rank1]: File "/gemini/space/gzy_new/models/Baseline/transport/transport.py", line 299, in <lambda>
|
| 78 |
+
[rank1]: self.drift(x, t, model, **kwargs) + diffusion_fn(x, t) * self.score(x, t, model, **kwargs)
|
| 79 |
+
[rank1]: File "/gemini/space/gzy_new/models/Baseline/transport/transport.py", line 264, in <lambda>
|
| 80 |
+
[rank1]: score_fn = lambda x, t, model, **kwargs: self.path_sampler.get_score_from_velocity(model(x, t, **kwargs), x, t)
|
| 81 |
+
[rank1]: File "/gemini/space/gzy_new/models/Baseline/sample_rectified_noise.py", line 194, in combined_sampling_model
|
| 82 |
+
[rank1]: sit_out = base_model.forward(x, t, y)
|
| 83 |
+
[rank1]: File "/gemini/space/gzy_new/models/Baseline/models.py", line 241, in forward
|
| 84 |
+
[rank1]: t = self.t_embedder(t) # (N, D)
|
| 85 |
+
[rank1]: File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
|
| 86 |
+
[rank1]: return self._call_impl(*args, **kwargs)
|
| 87 |
+
[rank1]: File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
|
| 88 |
+
[rank1]: return forward_call(*args, **kwargs)
|
| 89 |
+
[rank1]: File "/gemini/space/gzy_new/models/Baseline/models.py", line 59, in forward
|
| 90 |
+
[rank1]: t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
|
| 91 |
+
[rank1]: File "/gemini/space/gzy_new/models/Baseline/models.py", line 49, in timestep_embedding
|
| 92 |
+
[rank1]: freqs = torch.exp(
|
| 93 |
+
[rank1]: KeyboardInterrupt
|
| 94 |
+
[rank2]: Traceback (most recent call last):
|
| 95 |
+
[rank2]: File "/gemini/space/gzy_new/models/Baseline/sample_rectified_noise.py", line 380, in <module>
|
| 96 |
+
[rank2]: main(mode, args)
|
| 97 |
+
[rank2]: File "/gemini/space/gzy_new/models/Baseline/sample_rectified_noise.py", line 312, in main
|
| 98 |
+
[rank2]: samples = sample_fn(z, combined_sampling_model, **model_kwargs)[-1]
|
| 99 |
+
[rank2]: File "/gemini/space/gzy_new/models/Baseline/transport/transport.py", line 388, in _sample
|
| 100 |
+
[rank2]: xs = _sde.sample(init, model, **model_kwargs)
|
| 101 |
+
[rank2]: File "/gemini/space/gzy_new/models/Baseline/transport/integrators.py", line 72, in sample
|
| 102 |
+
[rank2]: x, mean_x = sampler(x, mean_x, ti, model, **model_kwargs)
|
| 103 |
+
[rank2]: File "/gemini/space/gzy_new/models/Baseline/transport/integrators.py", line 33, in __Euler_Maruyama_step
|
| 104 |
+
[rank2]: drift = self.drift(x, t, model, **model_kwargs)
|
| 105 |
+
[rank2]: File "/gemini/space/gzy_new/models/Baseline/transport/transport.py", line 299, in <lambda>
|
| 106 |
+
[rank2]: self.drift(x, t, model, **kwargs) + diffusion_fn(x, t) * self.score(x, t, model, **kwargs)
|
| 107 |
+
[rank2]: File "/gemini/space/gzy_new/models/Baseline/transport/transport.py", line 264, in <lambda>
|
| 108 |
+
[rank2]: score_fn = lambda x, t, model, **kwargs: self.path_sampler.get_score_from_velocity(model(x, t, **kwargs), x, t)
|
| 109 |
+
[rank2]: File "/gemini/space/gzy_new/models/Baseline/sample_rectified_noise.py", line 194, in combined_sampling_model
|
| 110 |
+
[rank2]: sit_out = base_model.forward(x, t, y)
|
| 111 |
+
[rank2]: File "/gemini/space/gzy_new/models/Baseline/models.py", line 241, in forward
|
| 112 |
+
[rank2]: t = self.t_embedder(t) # (N, D)
|
| 113 |
+
[rank2]: File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
|
| 114 |
+
[rank2]: return self._call_impl(*args, **kwargs)
|
| 115 |
+
[rank2]: File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
|
| 116 |
+
[rank2]: return forward_call(*args, **kwargs)
|
| 117 |
+
[rank2]: File "/gemini/space/gzy_new/models/Baseline/models.py", line 59, in forward
|
| 118 |
+
[rank2]: t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
|
| 119 |
+
[rank2]: File "/gemini/space/gzy_new/models/Baseline/models.py", line 49, in timestep_embedding
|
| 120 |
+
[rank2]: freqs = torch.exp(
|
| 121 |
+
[rank2]: KeyboardInterrupt
|
| 122 |
+
W0317 10:30:53.820000 11774 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 11854 closing signal SIGTERM
|
| 123 |
+
W0317 10:30:53.895000 11774 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 11855 closing signal SIGTERM
|
| 124 |
+
W0317 10:30:53.895000 11774 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 11856 closing signal SIGTERM
|
| 125 |
+
W0317 10:30:53.895000 11774 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 11857 closing signal SIGTERM
|
| 126 |
+
Traceback (most recent call last):
|
| 127 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 696, in run
|
| 128 |
+
result = self._invoke_run(role)
|
| 129 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 855, in _invoke_run
|
| 130 |
+
time.sleep(monitor_interval)
|
| 131 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
|
| 132 |
+
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
|
| 133 |
+
torch.distributed.elastic.multiprocessing.api.SignalException: Process 11774 got signal: 2
|
| 134 |
+
|
| 135 |
+
During handling of the above exception, another exception occurred:
|
| 136 |
+
|
| 137 |
+
Traceback (most recent call last):
|
| 138 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 705, in run
|
| 139 |
+
self._shutdown(e.sigval)
|
| 140 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 365, in _shutdown
|
| 141 |
+
self._pcontext.close(death_sig)
|
| 142 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 572, in close
|
| 143 |
+
self._close(death_sig=death_sig, timeout=timeout)
|
| 144 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 909, in _close
|
| 145 |
+
handler.proc.wait(time_to_wait)
|
| 146 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/subprocess.py", line 1209, in wait
|
| 147 |
+
return self._wait(timeout=timeout)
|
| 148 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/subprocess.py", line 1953, in _wait
|
| 149 |
+
time.sleep(delay)
|
| 150 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
|
| 151 |
+
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
|
| 152 |
+
torch.distributed.elastic.multiprocessing.api.SignalException: Process 11774 got signal: 2
|
| 153 |
+
|
| 154 |
+
During handling of the above exception, another exception occurred:
|
| 155 |
+
|
| 156 |
+
Traceback (most recent call last):
|
| 157 |
+
File "/root/miniconda3/envs/SiT/bin/torchrun", line 6, in <module>
|
| 158 |
+
sys.exit(main())
|
| 159 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 160 |
+
return f(*args, **kwargs)
|
| 161 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main
|
| 162 |
+
run(args)
|
| 163 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run
|
| 164 |
+
elastic_launch(
|
| 165 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
|
| 166 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 167 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 260, in launch_agent
|
| 168 |
+
result = agent.run()
|
| 169 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 170 |
+
result = f(*args, **kwargs)
|
| 171 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 710, in run
|
| 172 |
+
self._shutdown()
|
| 173 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 365, in _shutdown
|
| 174 |
+
self._pcontext.close(death_sig)
|
| 175 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 572, in close
|
| 176 |
+
self._close(death_sig=death_sig, timeout=timeout)
|
| 177 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 909, in _close
|
| 178 |
+
handler.proc.wait(time_to_wait)
|
| 179 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/subprocess.py", line 1209, in wait
|
| 180 |
+
return self._wait(timeout=timeout)
|
| 181 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/subprocess.py", line 1953, in _wait
|
| 182 |
+
time.sleep(delay)
|
| 183 |
+
File "/root/miniconda3/envs/SiT/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
|
| 184 |
+
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
|
| 185 |
+
torch.distributed.elastic.multiprocessing.api.SignalException: Process 11774 got signal: 2
|
GVP/Baseline/pic_npz.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
将文件夹下所有PNG或JPG文件读取并生成对应NPZ文件
|
| 4 |
+
基于 sample_ddp_new.py 中的 create_npz_from_sample_folder 函数改进
|
| 5 |
+
支持自动检测图片数量,支持PNG和JPG格式,输出到父级目录
|
| 6 |
+
支持从 metadata.jsonl 文件读取图片路径
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import argparse
|
| 11 |
+
import numpy as np
|
| 12 |
+
from PIL import Image
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
import glob
|
| 15 |
+
import json
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def create_npz_from_metadata(*args, **kwargs):
|
| 19 |
+
"""
|
| 20 |
+
占位函数:已废弃 metadata.jsonl 功能,保留空壳避免旧脚本导入时报错。
|
| 21 |
+
"""
|
| 22 |
+
raise RuntimeError("metadata.jsonl 功能已移除,请仅使用 --image_folder 方式生成 npz。")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def main():
|
| 27 |
+
"""
|
| 28 |
+
主函数:解析命令行参数并执行图片到npz的转换
|
| 29 |
+
"""
|
| 30 |
+
parser = argparse.ArgumentParser(
|
| 31 |
+
description="将文件夹下所有PNG或JPG文件转换为NPZ格式",
|
| 32 |
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 33 |
+
epilog="""
|
| 34 |
+
使用示例:
|
| 35 |
+
python pic_npz.py /path/to/image/folder
|
| 36 |
+
python pic_npz.py /path/to/image/folder --output-dir /custom/output/path
|
| 37 |
+
"""
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
parser.add_argument(
|
| 41 |
+
"--image_folder",
|
| 42 |
+
type=str,
|
| 43 |
+
default="/gemini/space/gzy_new/models/Baseline/GVP_samples/depth-mu-6-0300000-base-cfg-1.0-12-SDE-250-Euler-sigma-Mean-0.04",
|
| 44 |
+
help="包含PNG或JPG图片文件的文件夹路径"
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
parser.add_argument(
|
| 48 |
+
"--output-dir",
|
| 49 |
+
type=str,
|
| 50 |
+
default=None,
|
| 51 |
+
help="自定义输出目录(默认为输入文件夹的父级目录或 metadata.jsonl 所在目录)"
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
args = parser.parse_args()
|
| 55 |
+
|
| 56 |
+
try:
|
| 57 |
+
# 仅使用图片文件夹,不再支持 metadata.jsonl
|
| 58 |
+
image_folder_path = os.path.abspath(args.image_folder)
|
| 59 |
+
|
| 60 |
+
if args.output_dir:
|
| 61 |
+
# 如果指定了输出目录,修改生成逻辑
|
| 62 |
+
folder_name = os.path.basename(image_folder_path.rstrip('/'))
|
| 63 |
+
custom_output_path = os.path.join(args.output_dir, f"{folder_name}.npz")
|
| 64 |
+
|
| 65 |
+
# 创建输出目录(如果不存在)
|
| 66 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
| 67 |
+
|
| 68 |
+
# 使用自定义输出路径版本
|
| 69 |
+
npz_path = create_npz_from_image_folder_custom(image_folder_path, custom_output_path)
|
| 70 |
+
else:
|
| 71 |
+
npz_path = create_npz_from_image_folder(image_folder_path)
|
| 72 |
+
|
| 73 |
+
print(f"转换完成!NPZ文件已保存至: {npz_path}")
|
| 74 |
+
|
| 75 |
+
except Exception as e:
|
| 76 |
+
print(f"错误: {e}")
|
| 77 |
+
return 1
|
| 78 |
+
|
| 79 |
+
return 0
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def create_npz_from_image_folder_custom(image_folder_path, output_path):
|
| 83 |
+
"""
|
| 84 |
+
从包含图片的文件夹构建单个 .npz 文件(自定义输出路径版本)
|
| 85 |
+
|
| 86 |
+
Args:
|
| 87 |
+
image_folder_path (str): 包含图片文件的文件夹路径
|
| 88 |
+
output_path (str): 输出npz文件的完整路径
|
| 89 |
+
|
| 90 |
+
Returns:
|
| 91 |
+
str: 生成的 npz 文件路径
|
| 92 |
+
"""
|
| 93 |
+
# 确保路径存在
|
| 94 |
+
if not os.path.exists(image_folder_path):
|
| 95 |
+
raise ValueError(f"文件夹路径不存在: {image_folder_path}")
|
| 96 |
+
|
| 97 |
+
# 获取所有支持的图片文件
|
| 98 |
+
supported_extensions = ['*.png', '*.PNG', '*.jpg', '*.JPG', '*.jpeg', '*.JPEG']
|
| 99 |
+
image_files = []
|
| 100 |
+
|
| 101 |
+
for extension in supported_extensions:
|
| 102 |
+
pattern = os.path.join(image_folder_path, extension)
|
| 103 |
+
image_files.extend(glob.glob(pattern))
|
| 104 |
+
|
| 105 |
+
# 按文件名排序确保一致性
|
| 106 |
+
image_files.sort()
|
| 107 |
+
|
| 108 |
+
if len(image_files) == 0:
|
| 109 |
+
raise ValueError(f"在文件夹 {image_folder_path} 中未找到任何PNG或JPG图片文件")
|
| 110 |
+
|
| 111 |
+
print(f"找到 {len(image_files)} 张图片文件")
|
| 112 |
+
|
| 113 |
+
# 读取所有图片
|
| 114 |
+
samples = []
|
| 115 |
+
for img_path in tqdm(image_files, desc="读取图片并转换为numpy数组"):
|
| 116 |
+
try:
|
| 117 |
+
# 打开图片并转换为RGB格式(确保一致性)
|
| 118 |
+
with Image.open(img_path) as img:
|
| 119 |
+
# 转换为RGB,确保所有图片都是3通道
|
| 120 |
+
if img.mode != 'RGB':
|
| 121 |
+
img = img.convert('RGB')
|
| 122 |
+
|
| 123 |
+
# 将图片resize到512x512
|
| 124 |
+
img = img.resize((512, 512), Image.LANCZOS)
|
| 125 |
+
|
| 126 |
+
sample_np = np.asarray(img).astype(np.uint8)
|
| 127 |
+
|
| 128 |
+
# 确保图片是3通道
|
| 129 |
+
if len(sample_np.shape) != 3 or sample_np.shape[2] != 3:
|
| 130 |
+
print(f"警告: 跳过非3通道图片 {img_path}, 形状: {sample_np.shape}")
|
| 131 |
+
continue
|
| 132 |
+
|
| 133 |
+
samples.append(sample_np)
|
| 134 |
+
|
| 135 |
+
except Exception as e:
|
| 136 |
+
print(f"警告: 无法读取图片 {img_path}: {e}")
|
| 137 |
+
continue
|
| 138 |
+
|
| 139 |
+
if len(samples) == 0:
|
| 140 |
+
raise ValueError("没有成功读取任何有效的图片文件")
|
| 141 |
+
|
| 142 |
+
# 转换为numpy数组
|
| 143 |
+
samples = np.stack(samples)
|
| 144 |
+
print(f"成功读取 {len(samples)} 张图片,形状: {samples.shape}")
|
| 145 |
+
|
| 146 |
+
# 验证数据形状
|
| 147 |
+
assert len(samples.shape) == 4, f"期望4维数组,得到形状: {samples.shape}"
|
| 148 |
+
assert samples.shape[3] == 3, f"期望3通道图片,得到: {samples.shape[3]}通道"
|
| 149 |
+
|
| 150 |
+
# 保存为npz文件
|
| 151 |
+
np.savez(output_path, arr_0=samples)
|
| 152 |
+
print(f"已保存 .npz 文件到 {output_path} [形状={samples.shape}]")
|
| 153 |
+
|
| 154 |
+
return output_path
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def create_npz_from_image_folder(image_folder_path):
|
| 158 |
+
"""
|
| 159 |
+
从图片文件夹构建 .npz,输出到该文件夹的父目录,文件名为 <文件夹名>.npz
|
| 160 |
+
"""
|
| 161 |
+
parent_dir = os.path.dirname(os.path.abspath(image_folder_path))
|
| 162 |
+
folder_name = os.path.basename(os.path.abspath(image_folder_path).rstrip("/"))
|
| 163 |
+
output_path = os.path.join(parent_dir, f"{folder_name}.npz")
|
| 164 |
+
return create_npz_from_image_folder_custom(image_folder_path, output_path)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
if __name__ == "__main__":
|
| 168 |
+
exit(main())
|
GVP/Baseline/run.sh
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
nohup torchrun \
|
| 2 |
+
--nnodes=1 \
|
| 3 |
+
--nproc_per_node=4 \
|
| 4 |
+
--rdzv_endpoint=localhost:29739 \
|
| 5 |
+
train_rectified_noise.py \
|
| 6 |
+
--depth 6 \
|
| 7 |
+
--results-dir results_256_gvp_disp \
|
| 8 |
+
--data-path /gemini/space/gzy_new/Imagenet256/train \
|
| 9 |
+
--ckpt /gemini/space/gzy_new/models/xiangzai_Back/GVP_check/base.pt \
|
| 10 |
+
--num-classes 1000 \
|
| 11 |
+
--path-type GVP \
|
| 12 |
+
--prediction velocity \
|
| 13 |
+
--use-ot \
|
| 14 |
+
--use-contrastive \
|
| 15 |
+
> w_training1.log 2>&1 &
|
GVP/Baseline/sample_compare_ddp_rectified.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import math
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
import torch.distributed as dist
|
| 9 |
+
from diffusers.models import AutoencoderKL
|
| 10 |
+
from PIL import Image
|
| 11 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 12 |
+
from tqdm import tqdm
|
| 13 |
+
|
| 14 |
+
from download import find_model
|
| 15 |
+
from models import SiT_models
|
| 16 |
+
from train_utils import parse_ode_args, parse_sde_args, parse_transport_args
|
| 17 |
+
from transport import Sampler, create_transport
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def fix_state_dict_for_ddp(state_dict):
|
| 21 |
+
if isinstance(state_dict, dict) and ("model" in state_dict or "ema" in state_dict):
|
| 22 |
+
if "ema" in state_dict:
|
| 23 |
+
state_dict = state_dict["ema"]
|
| 24 |
+
elif "model" in state_dict:
|
| 25 |
+
state_dict = state_dict["model"]
|
| 26 |
+
fixed_state_dict = {}
|
| 27 |
+
for key, value in state_dict.items():
|
| 28 |
+
fixed_state_dict[key if key.startswith("module.") else f"module.{key}"] = value
|
| 29 |
+
return fixed_state_dict
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def save_png_batch(samples, out_dir, rank, total_offset):
|
| 33 |
+
for i, sample in enumerate(samples):
|
| 34 |
+
index = i * dist.get_world_size() + rank + total_offset
|
| 35 |
+
Image.fromarray(sample).save(f"{out_dir}/{index:06d}.png")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def create_npz_from_sample_folder(sample_dir, num=50_000):
|
| 39 |
+
samples = []
|
| 40 |
+
for i in tqdm(range(num), desc=f"Building .npz from {os.path.basename(sample_dir)}"):
|
| 41 |
+
sample_pil = Image.open(f"{sample_dir}/{i:06d}.png")
|
| 42 |
+
samples.append(np.asarray(sample_pil).astype(np.uint8))
|
| 43 |
+
samples = np.stack(samples)
|
| 44 |
+
npz_path = f"{sample_dir}.npz"
|
| 45 |
+
np.savez(npz_path, arr_0=samples)
|
| 46 |
+
print(f"Saved .npz to {npz_path} [shape={samples.shape}]")
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def main(mode, args):
|
| 50 |
+
torch.backends.cuda.matmul.allow_tf32 = args.tf32
|
| 51 |
+
assert torch.cuda.is_available(), "This script requires at least one GPU."
|
| 52 |
+
torch.set_grad_enabled(False)
|
| 53 |
+
|
| 54 |
+
dist.init_process_group("nccl")
|
| 55 |
+
rank = dist.get_rank()
|
| 56 |
+
device = rank % torch.cuda.device_count()
|
| 57 |
+
seed = args.global_seed * dist.get_world_size() + rank
|
| 58 |
+
torch.manual_seed(seed)
|
| 59 |
+
torch.cuda.set_device(device)
|
| 60 |
+
|
| 61 |
+
latent_size = args.image_size // 8
|
| 62 |
+
assert args.cfg_scale >= 1.0
|
| 63 |
+
using_cfg = args.cfg_scale > 1.0
|
| 64 |
+
|
| 65 |
+
# ---------------- Base model (sample_ddp style) ----------------
|
| 66 |
+
base_model = SiT_models[args.model](
|
| 67 |
+
input_size=latent_size, num_classes=args.num_classes, learn_sigma=False
|
| 68 |
+
).to(device)
|
| 69 |
+
base_state = find_model(args.ckpt)
|
| 70 |
+
if isinstance(base_state, dict) and "model" in base_state:
|
| 71 |
+
base_state = base_state["model"]
|
| 72 |
+
base_model.load_state_dict(base_state, strict=False)
|
| 73 |
+
base_model.eval()
|
| 74 |
+
|
| 75 |
+
# ---------------- Rectified model (sample_rectified style) ----------------
|
| 76 |
+
from models import CombinedModel, SiTF1, SiTF2
|
| 77 |
+
|
| 78 |
+
model_name = args.model
|
| 79 |
+
if "XL" in model_name:
|
| 80 |
+
hidden_size, depth, num_heads = 1152, 28, 16
|
| 81 |
+
elif "L" in model_name:
|
| 82 |
+
hidden_size, depth, num_heads = 1024, 24, 16
|
| 83 |
+
elif "B" in model_name:
|
| 84 |
+
hidden_size, depth, num_heads = 768, 12, 12
|
| 85 |
+
elif "S" in model_name:
|
| 86 |
+
hidden_size, depth, num_heads = 384, 12, 6
|
| 87 |
+
else:
|
| 88 |
+
hidden_size, depth, num_heads = 768, 12, 12
|
| 89 |
+
patch_size = int(model_name.split("/")[-1])
|
| 90 |
+
|
| 91 |
+
sitf1 = SiTF1(
|
| 92 |
+
input_size=latent_size,
|
| 93 |
+
patch_size=patch_size,
|
| 94 |
+
in_channels=4,
|
| 95 |
+
hidden_size=hidden_size,
|
| 96 |
+
depth=depth,
|
| 97 |
+
num_heads=num_heads,
|
| 98 |
+
mlp_ratio=4.0,
|
| 99 |
+
class_dropout_prob=0.1,
|
| 100 |
+
num_classes=args.num_classes,
|
| 101 |
+
learn_sigma=False,
|
| 102 |
+
).to(device)
|
| 103 |
+
sitf1.load_state_dict(base_state, strict=False)
|
| 104 |
+
sitf1.eval()
|
| 105 |
+
|
| 106 |
+
sitf2 = SiTF2(
|
| 107 |
+
input_size=latent_size,
|
| 108 |
+
hidden_size=hidden_size,
|
| 109 |
+
out_channels=8,
|
| 110 |
+
patch_size=patch_size,
|
| 111 |
+
num_heads=num_heads,
|
| 112 |
+
mlp_ratio=4.0,
|
| 113 |
+
depth=args.depth,
|
| 114 |
+
learn_sigma=True,
|
| 115 |
+
num_classes=args.num_classes,
|
| 116 |
+
learn_mu=args.learn_mu,
|
| 117 |
+
).to(device)
|
| 118 |
+
sitf2 = DDP(sitf2, device_ids=[device])
|
| 119 |
+
sitf2_state = fix_state_dict_for_ddp(find_model(args.sitf2_ckpt))
|
| 120 |
+
sitf2.load_state_dict(sitf2_state, strict=False)
|
| 121 |
+
sitf2.eval()
|
| 122 |
+
rectified_model = CombinedModel(sitf1, sitf2).to(device)
|
| 123 |
+
rectified_model.eval()
|
| 124 |
+
|
| 125 |
+
def model_base(x, t, y=None, **kwargs):
|
| 126 |
+
if using_cfg and "cfg_scale" in kwargs:
|
| 127 |
+
return base_model.forward_with_cfg(x, t, y, kwargs["cfg_scale"])
|
| 128 |
+
return base_model.forward(x, t, y)
|
| 129 |
+
|
| 130 |
+
def model_rectified(x, t, y=None, **kwargs):
|
| 131 |
+
if using_cfg and "cfg_scale" in kwargs:
|
| 132 |
+
sit_out = base_model.forward_with_cfg(x, t, y, kwargs["cfg_scale"])
|
| 133 |
+
else:
|
| 134 |
+
sit_out = base_model.forward(x, t, y)
|
| 135 |
+
if not args.use_sitf2:
|
| 136 |
+
return sit_out
|
| 137 |
+
out = rectified_model.forward(x, t, y)
|
| 138 |
+
if args.use_sitf2_before_t05:
|
| 139 |
+
mask = (t < args.sitf2_threshold).float()
|
| 140 |
+
while len(mask.shape) < len(out.shape):
|
| 141 |
+
mask = mask.unsqueeze(-1)
|
| 142 |
+
out = out * mask.expand_as(out)
|
| 143 |
+
return sit_out + out
|
| 144 |
+
|
| 145 |
+
transport = create_transport(
|
| 146 |
+
args.path_type, args.prediction, args.loss_weight, args.train_eps, args.sample_eps
|
| 147 |
+
)
|
| 148 |
+
sampler = Sampler(transport)
|
| 149 |
+
if mode == "ODE":
|
| 150 |
+
sample_fn = sampler.sample_ode(
|
| 151 |
+
sampling_method=args.sampling_method,
|
| 152 |
+
num_steps=args.num_sampling_steps,
|
| 153 |
+
atol=args.atol,
|
| 154 |
+
rtol=args.rtol,
|
| 155 |
+
reverse=args.reverse,
|
| 156 |
+
)
|
| 157 |
+
else:
|
| 158 |
+
sample_fn = sampler.sample_sde(
|
| 159 |
+
sampling_method=args.sampling_method,
|
| 160 |
+
diffusion_form=args.diffusion_form,
|
| 161 |
+
diffusion_norm=args.diffusion_norm,
|
| 162 |
+
last_step=args.last_step,
|
| 163 |
+
last_step_size=args.last_step_size,
|
| 164 |
+
num_steps=args.num_sampling_steps,
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)
|
| 168 |
+
|
| 169 |
+
exp_name = f"compare-{args.model.replace('/', '-')}-cfg-{args.cfg_scale}-{mode}-{args.num_sampling_steps}"
|
| 170 |
+
root_out = os.path.join(args.sample_dir, exp_name)
|
| 171 |
+
out_base = os.path.join(root_out, "base")
|
| 172 |
+
out_rect = os.path.join(root_out, "rectified")
|
| 173 |
+
out_pair = os.path.join(root_out, "pair")
|
| 174 |
+
if rank == 0:
|
| 175 |
+
os.makedirs(out_base, exist_ok=True)
|
| 176 |
+
os.makedirs(out_rect, exist_ok=True)
|
| 177 |
+
os.makedirs(out_pair, exist_ok=True)
|
| 178 |
+
dist.barrier()
|
| 179 |
+
|
| 180 |
+
n = args.per_proc_batch_size
|
| 181 |
+
global_batch = n * dist.get_world_size()
|
| 182 |
+
total_samples = int(math.ceil(args.num_fid_samples / global_batch) * global_batch)
|
| 183 |
+
iters = total_samples // global_batch
|
| 184 |
+
pbar = tqdm(range(iters)) if rank == 0 else range(iters)
|
| 185 |
+
total = 0
|
| 186 |
+
|
| 187 |
+
for _ in pbar:
|
| 188 |
+
z = torch.randn(n, base_model.in_channels, latent_size, latent_size, device=device)
|
| 189 |
+
y = torch.randint(0, args.num_classes, (n,), device=device)
|
| 190 |
+
if using_cfg:
|
| 191 |
+
z_in = torch.cat([z, z], 0)
|
| 192 |
+
y_null = torch.full((n,), args.num_classes, device=device, dtype=y.dtype)
|
| 193 |
+
y_in = torch.cat([y, y_null], 0)
|
| 194 |
+
model_kwargs = dict(y=y_in, cfg_scale=args.cfg_scale)
|
| 195 |
+
else:
|
| 196 |
+
z_in = z
|
| 197 |
+
y_in = y
|
| 198 |
+
model_kwargs = dict(y=y_in)
|
| 199 |
+
|
| 200 |
+
# Ensure SDE process noise is identical between base and rectified:
|
| 201 |
+
# save RNG state -> run base -> restore RNG state -> run rectified.
|
| 202 |
+
cpu_rng_state = torch.get_rng_state()
|
| 203 |
+
cuda_rng_state = torch.cuda.get_rng_state(device)
|
| 204 |
+
|
| 205 |
+
x_base = sample_fn(z_in, model_base, **model_kwargs)[-1]
|
| 206 |
+
|
| 207 |
+
torch.set_rng_state(cpu_rng_state)
|
| 208 |
+
torch.cuda.set_rng_state(cuda_rng_state, device=device)
|
| 209 |
+
x_rect = sample_fn(z_in, model_rectified, **model_kwargs)[-1]
|
| 210 |
+
if using_cfg:
|
| 211 |
+
x_base, _ = x_base.chunk(2, dim=0)
|
| 212 |
+
x_rect, _ = x_rect.chunk(2, dim=0)
|
| 213 |
+
|
| 214 |
+
img_base = vae.decode(x_base / 0.18215).sample
|
| 215 |
+
img_rect = vae.decode(x_rect / 0.18215).sample
|
| 216 |
+
img_base = torch.clamp(127.5 * img_base + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()
|
| 217 |
+
img_rect = torch.clamp(127.5 * img_rect + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()
|
| 218 |
+
|
| 219 |
+
save_png_batch(img_base, out_base, rank, total)
|
| 220 |
+
save_png_batch(img_rect, out_rect, rank, total)
|
| 221 |
+
|
| 222 |
+
for i in range(img_base.shape[0]):
|
| 223 |
+
index = i * dist.get_world_size() + rank + total
|
| 224 |
+
pair = np.concatenate([img_base[i], img_rect[i]], axis=1)
|
| 225 |
+
Image.fromarray(pair).save(f"{out_pair}/{index:06d}.png")
|
| 226 |
+
|
| 227 |
+
total += global_batch
|
| 228 |
+
dist.barrier()
|
| 229 |
+
|
| 230 |
+
dist.barrier()
|
| 231 |
+
if rank == 0:
|
| 232 |
+
create_npz_from_sample_folder(out_base, args.num_fid_samples)
|
| 233 |
+
create_npz_from_sample_folder(out_rect, args.num_fid_samples)
|
| 234 |
+
print(f"Done. Output root: {root_out}")
|
| 235 |
+
dist.barrier()
|
| 236 |
+
dist.destroy_process_group()
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
if __name__ == "__main__":
|
| 240 |
+
parser = argparse.ArgumentParser()
|
| 241 |
+
if len(sys.argv) < 2:
|
| 242 |
+
print("Usage: program.py <mode> [options]")
|
| 243 |
+
sys.exit(1)
|
| 244 |
+
|
| 245 |
+
mode = sys.argv[1]
|
| 246 |
+
assert mode in ["ODE", "SDE"], "Invalid mode. Please choose 'ODE' or 'SDE'"
|
| 247 |
+
|
| 248 |
+
parser.add_argument("--model", type=str, choices=list(SiT_models.keys()), default="SiT-XL/2")
|
| 249 |
+
parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="ema")
|
| 250 |
+
parser.add_argument("--sample-dir", type=str, default="compare_samples")
|
| 251 |
+
parser.add_argument("--per-proc-batch-size", type=int, default=12)
|
| 252 |
+
parser.add_argument("--num-fid-samples", type=int, default=50_000)
|
| 253 |
+
parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
|
| 254 |
+
parser.add_argument("--num-classes", type=int, default=1000)
|
| 255 |
+
parser.add_argument("--cfg-scale", type=float, default=1.0)
|
| 256 |
+
parser.add_argument("--num-sampling-steps", type=int, default=250)
|
| 257 |
+
parser.add_argument("--global-seed", type=int, default=1)
|
| 258 |
+
parser.add_argument("--tf32", action=argparse.BooleanOptionalAction, default=True)
|
| 259 |
+
parser.add_argument("--ckpt", type=str, required=True, help="Base SiT checkpoint")
|
| 260 |
+
parser.add_argument("--sitf2-ckpt", type=str, required=True, help="SiTF2 checkpoint")
|
| 261 |
+
parser.add_argument("--learn-mu", action=argparse.BooleanOptionalAction, default=True)
|
| 262 |
+
parser.add_argument("--depth", type=int, default=1, help="SiTF2 depth")
|
| 263 |
+
parser.add_argument("--use-sitf2", action=argparse.BooleanOptionalAction, default=True)
|
| 264 |
+
parser.add_argument("--use-sitf2-before-t05", action=argparse.BooleanOptionalAction, default=False)
|
| 265 |
+
parser.add_argument("--sitf2-threshold", type=float, default=0.5)
|
| 266 |
+
|
| 267 |
+
parse_transport_args(parser)
|
| 268 |
+
if mode == "ODE":
|
| 269 |
+
parse_ode_args(parser)
|
| 270 |
+
else:
|
| 271 |
+
parse_sde_args(parser)
|
| 272 |
+
|
| 273 |
+
args = parser.parse_known_args()[0]
|
| 274 |
+
main(mode, args)
|
GVP/Baseline/sample_ddp.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This source code is licensed under the license found in the
|
| 2 |
+
# LICENSE file in the root directory of this source tree.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Samples a large number of images from a pre-trained SiT model using DDP.
|
| 6 |
+
Subsequently saves a .npz file that can be used to compute FID and other
|
| 7 |
+
evaluation metrics via the ADM repo: https://github.com/openai/guided-diffusion/tree/main/evaluations
|
| 8 |
+
|
| 9 |
+
For a simple single-GPU/CPU sampling script, see sample.py.
|
| 10 |
+
"""
|
| 11 |
+
import torch
|
| 12 |
+
import torch.distributed as dist
|
| 13 |
+
from models import SiT_models
|
| 14 |
+
from download import find_model
|
| 15 |
+
from transport import create_transport, Sampler
|
| 16 |
+
from diffusers.models import AutoencoderKL
|
| 17 |
+
from train_utils import parse_ode_args, parse_sde_args, parse_transport_args
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
import os
|
| 20 |
+
from PIL import Image
|
| 21 |
+
import numpy as np
|
| 22 |
+
import math
|
| 23 |
+
import argparse
|
| 24 |
+
import sys
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def create_npz_from_sample_folder(sample_dir, num=50_000):
    """
    Build a single .npz file from a folder of .png samples.

    Expects images named ``000000.png`` .. ``{num-1:06d}.png`` inside
    ``sample_dir``. Writes ``{sample_dir}.npz`` containing one array
    ``arr_0`` of shape (num, H, W, 3), dtype uint8 — the layout expected
    by the ADM FID evaluation scripts.

    Args:
        sample_dir: Directory containing the numbered .png samples.
        num: Number of samples to pack (default 50,000 for FID).

    Returns:
        Path of the written .npz file.
    """
    samples = []
    for i in tqdm(range(num), desc="Building .npz file from samples"):
        sample_pil = Image.open(f"{sample_dir}/{i:06d}.png")
        # Force 3 channels so a grayscale or RGBA PNG cannot break np.stack
        # or trip the channel assertion below.
        sample_np = np.asarray(sample_pil.convert("RGB")).astype(np.uint8)
        samples.append(sample_np)
    samples = np.stack(samples)
    assert samples.shape == (num, samples.shape[1], samples.shape[2], 3)
    npz_path = f"{sample_dir}.npz"
    np.savez(npz_path, arr_0=samples)
    print(f"Saved .npz file to {npz_path} [shape={samples.shape}].")
    return npz_path
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def main(mode, args):
    """
    Run DDP sampling.

    Launched under torchrun: every rank samples latents with the SiT model,
    decodes them with the Stable Diffusion VAE, and writes individual .png
    files into a shared folder; rank 0 finally packs them into one .npz
    for FID evaluation.

    Args:
        mode: "ODE" or "SDE" — selects the transport sampler family.
        args: Parsed argparse namespace (see the __main__ block).
    """
    torch.backends.cuda.matmul.allow_tf32 = args.tf32  # True: fast but may lead to some small numerical differences
    assert torch.cuda.is_available(), "Sampling with DDP requires at least one GPU. sample.py supports CPU-only usage"
    torch.set_grad_enabled(False)

    # Setup DDP:
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()
    # Distinct per-rank seed so each rank draws different noise and classes.
    seed = args.global_seed * dist.get_world_size() + rank
    torch.manual_seed(seed)
    torch.cuda.set_device(device)
    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")

    # Auto-download path only supports the published SiT-XL/2 256x256 model.
    if args.ckpt is None:
        assert args.model == "SiT-XL/2", "Only SiT-XL/2 models are available for auto-download."
        assert args.image_size in [256, 512]
        assert args.num_classes == 1000
        assert args.image_size == 256, "512x512 models are not yet available for auto-download."  # remove this line when 512x512 models are available
        learn_sigma = args.image_size == 256
    else:
        learn_sigma = False

    # Load model (latents are 8x downsampled relative to pixel space):
    latent_size = args.image_size // 8
    model = SiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes,
        learn_sigma=learn_sigma,
    ).to(device)
    # Auto-download a pre-trained model or load a custom SiT checkpoint from train.py:
    ckpt_path = args.ckpt or f"SiT-XL-2-{args.image_size}x{args.image_size}.pt"
    state_dict = find_model(ckpt_path)
    model.load_state_dict(state_dict)
    model.eval()  # important!

    # Build the transport problem and the corresponding sampler.
    transport = create_transport(
        args.path_type,
        args.prediction,
        args.loss_weight,
        args.train_eps,
        args.sample_eps
    )
    sampler = Sampler(transport)
    if mode == "ODE":
        if args.likelihood:
            # Likelihood evaluation requires an unguided model.
            assert args.cfg_scale == 1, "Likelihood is incompatible with guidance"
            sample_fn = sampler.sample_ode_likelihood(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
            )
        else:
            sample_fn = sampler.sample_ode(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
                reverse=args.reverse
            )
    elif mode == "SDE":
        sample_fn = sampler.sample_sde(
            sampling_method=args.sampling_method,
            diffusion_form=args.diffusion_form,
            diffusion_norm=args.diffusion_norm,
            last_step=args.last_step,
            last_step_size=args.last_step_size,
            num_steps=args.num_sampling_steps,
        )
    # VAE decoder used to map latents back to pixel space.
    vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)
    assert args.cfg_scale >= 1.0, "In almost all cases, cfg_scale be >= 1.0"
    # cfg_scale == 1.0 means no guidance; only > 1.0 doubles the batch.
    using_cfg = args.cfg_scale > 1.0

    # Create folder to save samples (name encodes the sampling configuration):
    model_string_name = args.model.replace("/", "-")
    ckpt_string_name = os.path.basename(args.ckpt).replace(".pt", "") if args.ckpt else "pretrained"
    if mode == "ODE":
        folder_name = f"{model_string_name}-{ckpt_string_name}-" \
                      f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                      f"{mode}-{args.num_sampling_steps}-{args.sampling_method}"
    elif mode == "SDE":
        folder_name = f"{model_string_name}-{ckpt_string_name}-" \
                      f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                      f"{mode}-{args.num_sampling_steps}-{args.sampling_method}-"\
                      f"{args.diffusion_form}-{args.last_step}-{args.last_step_size}"
    sample_folder_dir = f"{args.sample_dir}/{folder_name}"
    if rank == 0:
        os.makedirs(sample_folder_dir, exist_ok=True)
        print(f"Saving .png samples at {sample_folder_dir}")
    dist.barrier()

    # Figure out how many samples we need to generate on each GPU and how many iterations we need to run:
    n = args.per_proc_batch_size
    global_batch_size = n * dist.get_world_size()
    # To make things evenly-divisible, we'll sample a bit more than we need and then discard the extra samples:
    num_samples = len([name for name in os.listdir(sample_folder_dir) if (os.path.isfile(os.path.join(sample_folder_dir, name)) and ".png" in name)])
    total_samples = int(math.ceil(args.num_fid_samples / global_batch_size) * global_batch_size)
    if rank == 0:
        print(f"Total number of images that will be sampled: {total_samples}")
    assert total_samples % dist.get_world_size() == 0, "total_samples must be divisible by world_size"
    samples_needed_this_gpu = int(total_samples // dist.get_world_size())
    assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size"
    iterations = int(samples_needed_this_gpu // n)
    # NOTE(review): done_iterations is computed from existing .png files but
    # never used — resuming a partial run will regenerate from scratch.
    done_iterations = int( int(num_samples // dist.get_world_size()) // n)
    pbar = range(iterations)
    pbar = tqdm(pbar) if rank == 0 else pbar
    total = 0

    for i in pbar:
        # Sample inputs:
        z = torch.randn(n, model.in_channels, latent_size, latent_size, device=device)
        y = torch.randint(0, args.num_classes, (n,), device=device)

        # Setup classifier-free guidance: duplicate the noise and append the
        # null class (index 1000) so conditional/unconditional run in one batch.
        if using_cfg:
            z = torch.cat([z, z], 0)
            y_null = torch.tensor([1000] * n, device=device)
            y = torch.cat([y, y_null], 0)
            model_kwargs = dict(y=y, cfg_scale=args.cfg_scale)
            model_fn = model.forward_with_cfg
        else:
            model_kwargs = dict(y=y)
            model_fn = model.forward

        # sample_fn returns the full trajectory; keep only the final state.
        samples = sample_fn(z, model_fn, **model_kwargs)[-1]
        if using_cfg:
            samples, _ = samples.chunk(2, dim=0)  # Remove null class samples

        # 0.18215 is the SD VAE latent scaling factor; then map to uint8 pixels.
        samples = vae.decode(samples / 0.18215).sample
        samples = torch.clamp(127.5 * samples + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()

        # Save samples to disk as individual .png files
        # NOTE(review): the inner `i` shadows the outer loop variable; harmless
        # here because the outer `i` is not read afterwards, but fragile.
        for i, sample in enumerate(samples):
            # Interleaved global index so ranks never collide on filenames.
            index = i * dist.get_world_size() + rank + total
            Image.fromarray(sample).save(f"{sample_folder_dir}/{index:06d}.png")
        total += global_batch_size
        dist.barrier()

    # Make sure all processes have finished saving their samples before attempting to convert to .npz
    dist.barrier()
    if rank == 0:
        create_npz_from_sample_folder(sample_folder_dir, args.num_fid_samples)
        print("Done.")
    dist.barrier()
    dist.destroy_process_group()
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    # The first positional token is the sampler family; it is consumed
    # manually (not via argparse) because it decides which extra argument
    # groups get registered below.
    if len(sys.argv) < 2:
        print("Usage: program.py <mode> [options]")
        sys.exit(1)

    mode = sys.argv[1]

    # Guard against the user passing a flag where the mode is expected.
    assert mode[:2] != "--", "Usage: program.py <mode> [options]"
    assert mode in ["ODE", "SDE"], "Invalid mode. Please choose 'ODE' or 'SDE'"

    parser.add_argument("--model", type=str, choices=list(SiT_models.keys()), default="SiT-XL/2")
    parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="ema")
    parser.add_argument("--sample-dir", type=str, default="samples")
    parser.add_argument("--per-proc-batch-size", type=int, default=12)
    parser.add_argument("--num-fid-samples", type=int, default=50_000)
    parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument("--cfg-scale", type=float, default=1.0)
    parser.add_argument("--num-sampling-steps", type=int, default=250)
    parser.add_argument("--global-seed", type=int, default=1)
    parser.add_argument("--tf32", action=argparse.BooleanOptionalAction, default=True,
                        help="By default, use TF32 matmuls. This massively accelerates sampling on Ampere GPUs.")
    parser.add_argument("--ckpt", type=str, default=None,
                        help="Optional path to a SiT checkpoint (default: auto-download a pre-trained SiT-XL/2 model).")

    # Shared transport flags plus mode-specific solver flags.
    parse_transport_args(parser)
    if mode == "ODE":
        parse_ode_args(parser)
        # Further processing for ODE
    elif mode == "SDE":
        parse_sde_args(parser)
        # Further processing for SDE

    # parse_known_args tolerates the leading positional <mode> token.
    args = parser.parse_known_args()[0]
    main(mode, args)
|
GVP/Baseline/sample_rectified_noise.py
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.distributed as dist
|
| 3 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 4 |
+
from models import SiT_models
|
| 5 |
+
from download import find_model
|
| 6 |
+
from transport import create_transport, Sampler
|
| 7 |
+
from diffusers.models import AutoencoderKL
|
| 8 |
+
from train_utils import parse_ode_args, parse_sde_args, parse_transport_args
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
import os
|
| 11 |
+
from PIL import Image
|
| 12 |
+
import numpy as np
|
| 13 |
+
import math
|
| 14 |
+
import argparse
|
| 15 |
+
import sys
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def create_npz_from_sample_folder(sample_dir, num=50_000):
    """
    Build a single .npz file from a folder of .png samples.

    Expects images named ``000000.png`` .. ``{num-1:06d}.png`` inside
    ``sample_dir``. Writes ``{sample_dir}.npz`` containing one array
    ``arr_0`` of shape (num, H, W, 3), dtype uint8 — the layout expected
    by the ADM FID evaluation scripts.

    Args:
        sample_dir: Directory containing the numbered .png samples.
        num: Number of samples to pack (default 50,000 for FID).

    Returns:
        Path of the written .npz file.
    """
    samples = []
    for i in tqdm(range(num), desc="Building .npz file from samples"):
        sample_pil = Image.open(f"{sample_dir}/{i:06d}.png")
        # Force 3 channels so a grayscale or RGBA PNG cannot break np.stack
        # or trip the channel assertion below.
        sample_np = np.asarray(sample_pil.convert("RGB")).astype(np.uint8)
        samples.append(sample_np)
    samples = np.stack(samples)
    assert samples.shape == (num, samples.shape[1], samples.shape[2], 3)
    npz_path = f"{sample_dir}.npz"
    np.savez(npz_path, arr_0=samples)
    print(f"Saved .npz file to {npz_path} [shape={samples.shape}].")
    return npz_path
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def fix_state_dict_for_ddp(state_dict):
    """
    Adapt a checkpoint state dict to DistributedDataParallel key naming.

    If given a full training checkpoint (a dict holding "model"/"ema"/"opt"
    entries), the EMA weights are preferred, then the raw model weights;
    when neither is present the mapping is used as-is. Every key is then
    prefixed with "module." unless it already carries that prefix, matching
    the parameter names a DDP-wrapped module expects.
    """
    is_full_checkpoint = isinstance(state_dict, dict) and any(
        section in state_dict for section in ("model", "ema", "opt")
    )
    if is_full_checkpoint:
        # Prefer EMA weights, then raw model weights; if only other sections
        # (e.g. "opt") exist, fall through with the original mapping.
        for preferred in ("ema", "model"):
            if preferred in state_dict:
                state_dict = state_dict[preferred]
                break
    # Prefix keys for the DDP wrapper, leaving already-prefixed keys alone.
    return {
        key if key.startswith("module.") else f"module.{key}": value
        for key, value in state_dict.items()
    }
|
| 61 |
+
|
| 62 |
+
def main(mode, args):
    """
    Run DDP sampling with the rectified-noise (SiTF1 + SiTF2) setup.

    Loads a base SiT model (SiTF1 weights) plus an auxiliary SiTF2 network,
    combines their outputs inside ``combined_sampling_model``, samples
    latents under torchrun, decodes them with the SD VAE, and writes .png
    files; rank 0 finally packs them into one .npz for FID.

    Fixes relative to the previous revision:
      * ``sitf2_ckpt_string_name`` guarded on ``args.ckpt`` instead of
        ``args.sitf2_ckpt``.
      * ``sitf2.eval()`` was called twice.

    Args:
        mode: "ODE" or "SDE" — selects the transport sampler family.
        args: Parsed argparse namespace (see the __main__ block).
    """
    torch.backends.cuda.matmul.allow_tf32 = args.tf32  # True: fast but may lead to some small numerical differences
    assert torch.cuda.is_available(), "Sampling with DDP requires at least one GPU. sample.py supports CPU-only usage"
    torch.set_grad_enabled(False)
    learn_mu = args.learn_mu
    sitf2_depth = args.depth  # Save SiTF2 depth before it gets overwritten

    # Setup DDP:
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()
    # Distinct per-rank seed so each rank draws different noise and classes.
    seed = args.global_seed * dist.get_world_size() + rank
    torch.manual_seed(seed)
    torch.cuda.set_device(device)
    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")

    if args.ckpt is None:
        assert args.model == "SiT-XL/2", "Only SiT-XL/2 models are available for auto-download."
        assert args.image_size in [256, 512]
        assert args.num_classes == 1000
        assert args.image_size == 256, "512x512 models are not yet available for auto-download."  # remove this line when 512x512 models are available
        learn_sigma = args.image_size == 256
    else:
        learn_sigma = False

    # Load SiTF1 and SiTF2 models and create CombinedModel
    from models import SiTF1, SiTF2, CombinedModel
    latent_size = args.image_size // 8

    # Derive the transformer configuration from the model-size tag.
    model_name = args.model
    if 'XL' in model_name:
        hidden_size, depth, num_heads = 1152, 28, 16
    elif 'L' in model_name:
        hidden_size, depth, num_heads = 1024, 24, 16
    elif 'B' in model_name:
        hidden_size, depth, num_heads = 768, 12, 12
    elif 'S' in model_name:
        hidden_size, depth, num_heads = 384, 12, 6
    else:
        # Default fallback
        hidden_size, depth, num_heads = 768, 12, 12

    # Extract patch size from model name like 'SiT-XL/2' -> patch_size = 2
    patch_size = int(model_name.split('/')[-1])

    # Load SiTF1
    sitf1 = SiTF1(
        input_size=latent_size,
        patch_size=patch_size,
        in_channels=4,
        hidden_size=hidden_size,
        depth=depth,
        num_heads=num_heads,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=args.num_classes,
        learn_sigma=False
    ).to(device)
    sitf1_state_raw = find_model(args.ckpt)
    # find_model may return either a full checkpoint dict or a plain
    # state_dict (e.g. EMA weights); normalize to a state_dict here and
    # reuse it for both sitf1 and base_model below.
    if isinstance(sitf1_state_raw, dict) and "model" in sitf1_state_raw:
        sitf1_state = sitf1_state_raw["model"]
    else:
        sitf1_state = sitf1_state_raw
    sitf1.load_state_dict(sitf1_state)
    sitf1.eval()

    # Load SiTF2 with the same architecture parameters as SiTF1 for
    # compatibility, except for depth which comes from the command line.
    sitf2 = SiTF2(
        input_size=latent_size,
        hidden_size=hidden_size,
        out_channels=8,
        patch_size=patch_size,
        num_heads=num_heads,
        mlp_ratio=4.0,
        depth=sitf2_depth,
        learn_sigma=True,
        num_classes=args.num_classes,
        learn_mu=learn_mu
    ).to(device)
    # SiTF2 checkpoints were saved from a DDP-wrapped model, so wrap before
    # loading and rewrite keys to carry the "module." prefix.
    sitf2 = DDP(sitf2, device_ids=[device])
    sitf2_state = find_model(args.sitf2_ckpt)
    sitf2_state_fixed = fix_state_dict_for_ddp(sitf2_state)
    try:
        sitf2.load_state_dict(sitf2_state_fixed)
    except Exception as e:
        print(f"Error loading state dict: {e}")
        # Try loading with strict=False as fallback
        sitf2.load_state_dict(sitf2_state_fixed, strict=False)
    sitf2.eval()

    # CombinedModel couples SiTF1 and SiTF2 outputs for the correction term.
    combined_model = CombinedModel(sitf1, sitf2).to(device)
    combined_model.eval()

    # Use the SiT_models factory to create the base model so its
    # configuration matches training; learn_sigma=False matches sitf1.
    base_model = SiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes,
        learn_sigma=False,  # Match sitf1's learn_sigma=False
    ).to(device)
    # Load the checkpoint (same as sitf1) - use the exact same state_dict
    base_model.load_state_dict(sitf1_state)
    base_model.eval()

    # Determine if CFG will be used (needed for combined_sampling_model)
    assert args.cfg_scale >= 1.0, "In almost all cases, cfg_scale be >= 1.0"
    using_cfg = args.cfg_scale > 1.0

    # There are repeated calculations in the middle, which will cause Flops
    # to double. A simplified version will be released later.
    def combined_sampling_model(x, t, y=None, **kwargs):
        """Velocity function handed to the sampler: base SiT output plus an
        optional SiTF2 correction, gated by --use-sitf2 flags."""
        with torch.no_grad():
            # Handle CFG the same way as sample_ddp.py.
            if using_cfg and 'cfg_scale' in kwargs:
                sit_out = base_model.forward_with_cfg(x, t, y, kwargs['cfg_scale'])
            else:
                sit_out = base_model.forward(x, t, y)
            # NOTE(review): the correction is only added when BOTH
            # --use-sitf2 and --use-sitf2-before-t05 are set; --use-sitf2
            # alone falls through to the plain base-model output.
            if args.use_sitf2:
                if args.use_sitf2_before_t05:
                    # Per-sample gate: 1.0 where t < threshold, else 0.0.
                    mask = (t < args.sitf2_threshold).float()
                    combined_out = combined_model.forward(x, t, y)
                    # Broadcast the (batch,) mask over the spatial dims of
                    # combined_out (batch, channels, height, width).
                    while len(mask.shape) < len(combined_out.shape):
                        mask = mask.unsqueeze(-1)
                    mask = mask.expand_as(combined_out)
                    combined_out = combined_out * mask
                    return sit_out + combined_out
                else:
                    # Default behavior: only use base model output
                    return sit_out
            else:
                # Default behavior: only use base model output
                return sit_out

    transport = create_transport(
        args.path_type,
        args.prediction,
        args.loss_weight,
        args.train_eps,
        args.sample_eps
    )
    sampler = Sampler(transport)
    if mode == "ODE":
        if args.likelihood:
            assert args.cfg_scale == 1, "Likelihood is incompatible with guidance"
            sample_fn = sampler.sample_ode_likelihood(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
            )
        else:
            sample_fn = sampler.sample_ode(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
                reverse=args.reverse
            )
    elif mode == "SDE":
        sample_fn = sampler.sample_sde(
            sampling_method=args.sampling_method,
            diffusion_form=args.diffusion_form,
            diffusion_norm=args.diffusion_norm,
            last_step=args.last_step,
            last_step_size=args.last_step_size,
            num_steps=args.num_sampling_steps,
        )
    vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)

    # Create folder to save samples (name encodes the sampling configuration):
    model_string_name = args.model.replace("/", "-")
    ckpt_string_name = os.path.basename(args.ckpt).replace(".pt", "") if args.ckpt else "pretrained"
    # Fixed: previously guarded on args.ckpt, mislabeling the SiTF2 ckpt.
    sitf2_ckpt_string_name = os.path.basename(args.sitf2_ckpt).replace(".pt", "") if args.sitf2_ckpt else "pretrained"
    if mode == "ODE":
        folder_name = f"{sitf2_ckpt_string_name}-{ckpt_string_name}-" \
                      f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                      f"{mode}-{args.num_sampling_steps}-{args.sampling_method}"
    elif mode == "SDE":
        # Add threshold info to folder name if use_sitf2_before_t05 is enabled
        threshold_suffix = f"-threshold-{args.sitf2_threshold}" if args.use_sitf2_before_t05 else ""
        if learn_mu:
            folder_name = f"depth-mu-{sitf2_depth}{threshold_suffix}-{sitf2_ckpt_string_name}-{ckpt_string_name}-" \
                          f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                          f"{mode}-{args.num_sampling_steps}-{args.sampling_method}-"\
                          f"{args.diffusion_form}-{args.last_step}-{args.last_step_size}"
        else:
            folder_name = f"depth-sigma-{sitf2_depth}{threshold_suffix}-{sitf2_ckpt_string_name}-{ckpt_string_name}-" \
                          f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                          f"{mode}-{args.num_sampling_steps}-{args.sampling_method}-"\
                          f"{args.diffusion_form}-{args.last_step}-{args.last_step_size}"
    sample_folder_dir = f"{args.sample_dir}/{folder_name}"
    if rank == 0:
        os.makedirs(sample_folder_dir, exist_ok=True)
        print(f"Saving .png samples at {sample_folder_dir}")
    dist.barrier()

    # Figure out how many samples we need to generate on each GPU and how many iterations we need to run:
    n = args.per_proc_batch_size
    global_batch_size = n * dist.get_world_size()
    # To make things evenly-divisible, we'll sample a bit more than we need and then discard the extra samples:
    num_samples = len([name for name in os.listdir(sample_folder_dir) if (os.path.isfile(os.path.join(sample_folder_dir, name)) and ".png" in name)])
    total_samples = int(math.ceil(args.num_fid_samples / global_batch_size) * global_batch_size)
    if rank == 0:
        print(f"Total number of images that will be sampled: {total_samples}")
    assert total_samples % dist.get_world_size() == 0, "total_samples must be divisible by world_size"
    samples_needed_this_gpu = int(total_samples // dist.get_world_size())
    assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size"
    iterations = int(samples_needed_this_gpu // n)
    # NOTE(review): done_iterations is computed but never used — a partial
    # run is regenerated from scratch.
    done_iterations = int( int(num_samples // dist.get_world_size()) // n)
    pbar = range(iterations)
    pbar = tqdm(pbar) if rank == 0 else pbar
    total = 0

    for i in pbar:
        # Sample inputs:
        z = torch.randn(n, base_model.in_channels, latent_size, latent_size, device=device)
        y = torch.randint(0, args.num_classes, (n,), device=device)
        # Setup classifier-free guidance: duplicate the noise and append the
        # null class (index 1000) so conditional/unconditional run together.
        if using_cfg:
            z = torch.cat([z, z], 0)
            y_null = torch.tensor([1000] * n, device=device)
            y = torch.cat([y, y_null], 0)
            model_kwargs = dict(y=y, cfg_scale=args.cfg_scale)
        else:
            model_kwargs = dict(y=y)
        # sample_fn returns the trajectory; keep only the final state.
        samples = sample_fn(z, combined_sampling_model, **model_kwargs)[-1]
        if using_cfg:
            samples, _ = samples.chunk(2, dim=0)  # Remove null class samples
        # 0.18215 is the SD VAE latent scaling factor; then map to uint8.
        samples = vae.decode(samples / 0.18215).sample
        samples = torch.clamp(127.5 * samples + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()
        # Save samples to disk as individual .png files, with an interleaved
        # global index so ranks never collide on filenames.
        for j, sample in enumerate(samples):
            index = j * dist.get_world_size() + rank + total
            Image.fromarray(sample).save(f"{sample_folder_dir}/{index:06d}.png")
        total += global_batch_size
        dist.barrier()

    # Make sure all processes have finished saving their samples before attempting to convert to .npz
    dist.barrier()
    if rank == 0:
        create_npz_from_sample_folder(sample_folder_dir, args.num_fid_samples)
        print("Done.")
    dist.barrier()
    dist.destroy_process_group()
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    # The first positional token is the sampler family; it is consumed
    # manually (not via argparse) because it decides which extra argument
    # groups get registered below.
    if len(sys.argv) < 2:
        print("Usage: program.py <mode> [options]")
        sys.exit(1)

    mode = sys.argv[1]

    # Guard against the user passing a flag where the mode is expected.
    assert mode[:2] != "--", "Usage: program.py <mode> [options]"
    assert mode in ["ODE", "SDE"], "Invalid mode. Please choose 'ODE' or 'SDE'"

    parser.add_argument("--model", type=str, choices=list(SiT_models.keys()), default="SiT-XL/2")
    parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="ema")
    parser.add_argument("--sample-dir", type=str, default="samples")
    parser.add_argument("--per-proc-batch-size", type=int, default=12)
    parser.add_argument("--num-fid-samples", type=int, default=50_000)
    parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument("--cfg-scale", type=float, default=1.0)
    parser.add_argument("--num-sampling-steps", type=int, default=250)
    parser.add_argument("--global-seed", type=int, default=0)
    parser.add_argument("--tf32", action=argparse.BooleanOptionalAction, default=True,
                        help="By default, use TF32 matmuls. This massively accelerates sampling on Ampere GPUs.")
    parser.add_argument("--ckpt", type=str, default=None,
                        help="Optional path to a SiT checkpoint.")
    parser.add_argument("--sitf2-ckpt", type=str, required=True, help="Path to SiTF2 checkpoint")
    parser.add_argument("--learn-mu", action=argparse.BooleanOptionalAction, default=True,
                        help="Whether to learn mu parameter")
    parser.add_argument("--depth", type=int, default=1,
                        help="Depth parameter for SiTF2 model")
    # NOTE(review): the help text below is copy-pasted from
    # --use-sitf2-before-t05 and does not describe what --use-sitf2 does.
    parser.add_argument("--use-sitf2", action=argparse.BooleanOptionalAction, default=True,
                        help="Only use SiTF2 output when t < threshold, otherwise use only SiT")
    parser.add_argument("--use-sitf2-before-t05", action=argparse.BooleanOptionalAction, default=False,
                        help="Only use SiTF2 output when t < threshold, otherwise use only SiT")
    parser.add_argument("--sitf2-threshold", type=float, default=0.5,
                        help="Time threshold for using SiTF2 output (default: 0.5). Only effective when --use-sitf2-before-t05 is True")
    # Shared transport flags plus mode-specific solver flags.
    parse_transport_args(parser)
    if mode == "ODE":
        parse_ode_args(parser)
        # Further processing for ODE
    elif mode == "SDE":
        parse_sde_args(parser)
        # Further processing for SDE

    # parse_known_args tolerates the leading positional <mode> token.
    args = parser.parse_known_args()[0]
    main(mode, args)
|
GVP/Baseline/samples.sh
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CUDA_VISIBLE_DEVICES=0,1,2,3 nohup torchrun \
|
| 2 |
+
--nnodes=1 \
|
| 3 |
+
--nproc_per_node=4 \
|
| 4 |
+
--rdzv_endpoint=localhost:29110 \
|
| 5 |
+
sample_rectified_noise.py SDE \
|
| 6 |
+
--depth 6 \
|
| 7 |
+
--sample-dir GVP_samples \
|
| 8 |
+
--model SiT-XL/2 \
|
| 9 |
+
--num-fid-samples 50000 \
|
| 10 |
+
--num-classes 1000 \
|
| 11 |
+
--global-seed 0 \
|
| 12 |
+
--use-sitf2 True \
|
| 13 |
+
--sitf2-threshold 1 \
|
| 14 |
+
--ckpt /gemini/space/gzy_new/models/xiangzai_Back/GVP_check/base.pt \
|
| 15 |
+
--sitf2-ckpt /gemini/space/gzy_new/models/Baseline/results_256_gvp_disp/depth-mu-6-007-SiT-XL-2-GVP-velocity-None-OT-Contrastive0.05/checkpoints/0300000.pt \
|
| 16 |
+
> W_No.log 2>&1 &
|
GVP/Baseline/samples_ddp.sh
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CUDA_VISIBLE_DEVICES=0,1,2,3 nohup torchrun \
|
| 2 |
+
--nnodes=1 \
|
| 3 |
+
--nproc_per_node=4 \
|
| 4 |
+
--rdzv_endpoint=localhost:29111 \
|
| 5 |
+
sample_ddp.py SDE \
|
| 6 |
+
--sample-dir baseline_gvp_ \
|
| 7 |
+
--model SiT-XL/2 \
|
| 8 |
+
--num-fid-samples 50000 \
|
| 9 |
+
--num-classes 1000 \
|
| 10 |
+
--global-seed 0 \
|
| 11 |
+
--path-type GVP \
|
| 12 |
+
--prediction velocity \
|
| 13 |
+
--ckpt /gemini/space/gzy_new/models/xiangzai_Back/GVP_check/base.pt \
|
| 14 |
+
> gvp_sampling.log 2>&1 &
|
GVP/Baseline/transport/__pycache__/ot_plan.cpython-311.pyc
ADDED
|
Binary file (5.71 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/path.cpython-310.pyc
ADDED
|
Binary file (7.9 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/path.cpython-311.pyc
ADDED
|
Binary file (12.1 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/path.cpython-312.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/path.cpython-38.pyc
ADDED
|
Binary file (7.93 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/transport.cpython-310.pyc
ADDED
|
Binary file (14.1 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/transport.cpython-311.pyc
ADDED
|
Binary file (23.8 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/transport.cpython-312.pyc
ADDED
|
Binary file (22.8 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/transport.cpython-38.pyc
ADDED
|
Binary file (13.2 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (1.24 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (2.17 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/utils.cpython-312.pyc
ADDED
|
Binary file (1.9 kB). View file
|
|
|
GVP/Baseline/transport/__pycache__/utils.cpython-38.pyc
ADDED
|
Binary file (1.26 kB). View file
|
|
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0020000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de6ee3c5ee036be85b216daddafb56f16df64e9c3f7d3060b31f5cb301a345b1
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0040000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30fe96a8ac31b2744debe3861b4c92e631462eb27401c26f88042b6e65ac287d
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0060000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f82060d1339e53cc05c4863d39dc083795f05ad282cabc830889d8b6f72911b
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0080000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:606a5b7dceacc351bf68867736eed793e3e02c29a6e27aa0e8cf6e2ef3382c06
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0100000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ecb79b4530e3503370cefb16c38706a8b7b823e221f30f621c902e2e174aaec
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0120000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5d03dae6043ff1543a412b1c44c02a21aa299014e8b3b3209e21cfd73b2c5d0f
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0140000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c630ada8db890d9fc41eaf8d30b3bb1d95b1e61d380d8f7e97f639ec3b307233
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0160000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9aa31b63d4138dec6b8d4c55a58fadf60096ac8f1d4ec83651ea724e7d842e17
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0180000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4b9c5725fd5492e8e4979b190b04809ba03514ff57da6b5ce92e3e99df9c0db
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0200000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49d27a75f329c0170b0b6bbc0028881a40b6d8cb2a34495fba8ba73e49a395a4
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0220000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c59a8b6bca3ade4b6547951082b5dc67ad805861c611a13bccf1995cf8f0ca7d
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0240000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b47c2b4f5baef28f5f1033b57e5ad7cb2c1b52246fcaa0ff4214781bfa305604
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0260000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf7f7746b191c29efd3ff33ba7cdb83f67d242ed03a14228478943a3cb484907
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0280000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f254fdaea4c2c6875493b4a94846b5a8a54cf90210d610d297a7bbceadd84101
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0300000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f198d183997904bfef6ae3222f59022cbb86c40ae625d2196d659bd7c1c3c121
|
| 3 |
+
size 1193384322
|
VP/depth-mu-4-001-SiT-XL-2-VP-velocity-None-OT-Contrastive0.05/checkpoints/0320000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e4aca9e6138967eea8540f58f7a7b1d09b749ea4b471e62a0052b8c04e9a8d4
|
| 3 |
+
size 1193384322
|