## config.yaml
## Neural Sinkhorn Gradient Flow (NSGF++) Configuration
## Based on arXiv:2401.14069

# ============================================================
# 2D Synthetic Experiments (Section 5.1, Appendix E.1)
# ============================================================
experiment_2d:
  # Datasets: 8gaussians, moons, scurve, checkerboard, 8gaussians_moons
  dataset: "8gaussians"
  source: "gaussian"  # source distribution: standard Gaussian N(0, I)

  # MLP architecture (Appendix E.1: 3 hidden layers, 256 hidden units)
  model:
    input_dim: 2
    hidden_dim: 256
    num_hidden_layers: 3
    time_emb_dim: 64
    activation: "silu"

  # Sinkhorn gradient flow parameters
  sinkhorn:
    epsilon: 0.1     # regularization coefficient ε
    blur: 0.5        # GeomLoss blur parameter (blur^p ~ ε)
    scaling: 0.80    # GeomLoss multiscale scaling
    eta: 1.0         # gradient flow step size η
    num_steps: 10    # T: number of gradient flow time steps
    batch_size: 256  # n: minibatch size for the Sinkhorn flow

  # Trajectory pool
  pool:
    num_batches: 200  # number of batches used to build the pool
    experience_replay: true

  # Velocity-field matching training
  training:
    num_iterations: 20000
    batch_size: 256
    learning_rate: 0.001
    optimizer: "adam"
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0

  # Inference / sampling
  inference:
    num_euler_steps: 10  # 10 or 100 Euler steps (uniform schedule)
    num_samples: 1024    # samples for evaluation

  # Evaluation
  evaluation:
    num_test_samples: 1024  # W2 computed against 1024 test samples
    metric: "w2"            # 2-Wasserstein distance

# ============================================================
# Image Benchmark Experiments (Section 5.2, Appendix E.2)
# ============================================================
experiment_mnist:
  dataset: "mnist"
  image_size: 28
  in_channels: 1

  # UNet architecture (Appendix E.2, Dhariwal & Nichol 2021)
  unet:
    model_channels: 32     # base channels
    num_res_blocks: 1      # depth = 1
    channel_mult: [1, 2, 2]
    num_heads: 1
    num_head_channels: -1  # use num_heads instead
    attention_resolutions: [16]
    dropout: 0.0
    use_scale_shift_norm: true  # AdaGN

  # Sinkhorn gradient flow (Phase 1)
  sinkhorn:
    blur: 0.5
    scaling: 0.80
    eta: 1.0
    num_steps: 5  # T <= 5 for the NSGF phase
    batch_size: 256

  # Trajectory pool (Appendix E.2: 256 samples/batch * 1500 batches * 5 steps < 20 GB)
  pool:
    num_batches: 1500
    storage_limit_gb: 20

  # Velocity-field matching training (NSGF model)
  nsgf_training:
    num_iterations: 100000
    batch_size: 128
    learning_rate: 0.0001
    optimizer: "adam"
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0

  # Neural Straight Flow (Phase 2)
  nsf_training:
    num_iterations: 100000
    batch_size: 128
    learning_rate: 0.0001
    optimizer: "adam"
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0

  # Phase-transition time predictor (CNN)
  time_predictor:
    conv_channels: [32, 64, 128, 256]
    kernel_size: 3
    stride: 1
    padding: 1
    pool_size: 2
    num_iterations: 40000
    learning_rate: 0.0001
    batch_size: 128

  # Inference
  inference:
    nsgf_steps: 5  # 5-step Euler in the NSGF phase
    nsf_steps: 55  # remaining steps for the straight flow
    total_nfe: 60  # total NFE = nsgf_steps + nsf_steps

  # Evaluation (Appendix E.2: FID between 10K generated samples and the test set)
  evaluation:
    num_generated: 10000
    metrics: ["fid"]

experiment_cifar10:
  dataset: "cifar10"
  image_size: 32
  in_channels: 3

  # UNet architecture (Appendix E.2)
  unet:
    model_channels: 128  # base channels
    num_res_blocks: 2    # depth = 2
    channel_mult: [1, 2, 2, 2]
    num_heads: 4
    num_head_channels: 64
    attention_resolutions: [16]
    dropout: 0.0
    use_scale_shift_norm: true
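  # ------------------------------------------------------------------
  # Illustration (not from the paper): how a sinkhorn block like the one
  # below is typically consumed. A minimal sketch of one Sinkhorn
  # gradient-flow step with geomloss.SamplesLoss; the names x (current
  # particles), y (data batch), and eta are hypothetical stand-ins for
  # whatever the training script defines:
  #
  #   import torch
  #   from geomloss import SamplesLoss
  #
  #   loss_fn = SamplesLoss("sinkhorn", p=2, blur=1.0, scaling=0.85,
  #                         backend="tensorized")  # blur^p ~ ε
  #   x = x.detach().requires_grad_(True)  # (batch_size, 3*32*32)
  #   L = loss_fn(x, y)                    # Sinkhorn divergence S_ε
  #   [g] = torch.autograd.grad(L, [x])
  #   x = x - eta * len(x) * g             # one step of size η; repeat num_steps times
  #
  # The len(x) factor rescales the gradient because each particle of the
  # uniform empirical measure carries weight 1/n, following the GeomLoss
  # gradient-flow examples.
  # ------------------------------------------------------------------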
  # Sinkhorn gradient flow (Phase 1)
  # NOTE: batch_size reduced from the paper's 128 to 32 to fit a T4 (16 GB VRAM).
  # Sinkhorn on 3072-dim flattened vectors (3x32x32) with the tensorized backend
  # uses O(N^2 * D) memory: 128 samples OOMs on a T4, while 32 fits comfortably.
  # Compensate by increasing the number of pool batches
  # (32 * 10000 = 320K ≈ 128 * 2500).
  sinkhorn:
    blur: 1.0
    scaling: 0.85
    eta: 1.0
    num_steps: 5
    batch_size: 32

  # Trajectory pool — adjusted for the smaller Sinkhorn batch:
  # 32 samples/batch * 10000 batches * 5 steps = 1.6M entries (same order as the paper)
  pool:
    num_batches: 10000
    storage_limit_gb: 45

  # Velocity-field matching training (NSGF model)
  nsgf_training:
    num_iterations: 200000
    batch_size: 128
    learning_rate: 0.0001
    optimizer: "adam"
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0

  # Neural Straight Flow (Phase 2)
  nsf_training:
    num_iterations: 200000
    batch_size: 128
    learning_rate: 0.0001
    optimizer: "adam"
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0

  # Phase-transition time predictor (same CNN architecture as MNIST)
  time_predictor:
    conv_channels: [32, 64, 128, 256]
    kernel_size: 3
    stride: 1
    padding: 1
    pool_size: 2
    num_iterations: 40000
    learning_rate: 0.0001
    batch_size: 128

  # Inference
  inference:
    nsgf_steps: 5
    nsf_steps: 54
    total_nfe: 59  # paper reports NFE = 59 for CIFAR-10

  # Evaluation
  evaluation:
    num_generated: 10000
    metrics: ["fid", "is"]  # paper target: FID = 5.55, IS = 8.86
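# ------------------------------------------------------------------
# Illustration (not from the paper): how the two inference blocks above
# combine into a sampler. A minimal sketch assuming velocity fields
# v_nsgf and v_nsf and a phase-transition time predictor t_pred; all
# three names, their call signatures, and the uniform step schedules are
# assumptions of this sketch, not the paper's verbatim procedure:
#
#   import torch
#
#   @torch.no_grad()
#   def sample(v_nsgf, v_nsf, t_pred, x, nsgf_steps=5, nsf_steps=54):
#       # Phase 1: Euler steps along the learned Sinkhorn-flow field.
#       for k in range(nsgf_steps):
#           t = torch.full((x.shape[0],), k / nsgf_steps, device=x.device)
#           x = x + v_nsgf(x, t) / nsgf_steps
#       # Phase transition: per-sample straight-flow start time s in [0, 1].
#       s = t_pred(x)  # shape (B,)
#       # Phase 2: Euler steps along the Neural Straight Flow from s to 1.
#       h = (1.0 - s) / nsf_steps
#       for _ in range(nsf_steps):
#           x = x + v_nsf(x, s) * h.view(-1, 1, 1, 1)
#           s = s + h
#       return x  # NFE = nsgf_steps + nsf_steps, matching total_nfe above
# ------------------------------------------------------------------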