## config.yaml
## Neural Sinkhorn Gradient Flow (NSGF++) Configuration
## Based on arXiv:2401.14069

# ============================================================
# 2D Synthetic Experiments (Section 5.1, Appendix E.1)
# ============================================================
experiment_2d:
  # Datasets: 8gaussians, moons, scurve, checkerboard, 8gaussians_moons
  dataset: "8gaussians"
  source: "gaussian"  # source distribution: standard Gaussian N(0, I)

  # MLP Architecture (Appendix E.1: 3 hidden layers, 256 hidden units)
  model:
    input_dim: 2
    hidden_dim: 256
    num_hidden_layers: 3
    time_emb_dim: 64
    activation: "silu"
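
  # A minimal sketch of the velocity-field network these settings describe
  # (illustrative PyTorch, not the repo's actual module; the time embedding
  # here is a learned linear layer for brevity):
  #
  #   import torch, torch.nn as nn
  #   class VelocityMLP(nn.Module):
  #       def __init__(self, d=2, h=256, layers=3, t_dim=64):
  #           super().__init__()
  #           self.t_emb = nn.Sequential(nn.Linear(1, t_dim), nn.SiLU())
  #           blocks = [nn.Linear(d + t_dim, h), nn.SiLU()]
  #           for _ in range(layers - 1):
  #               blocks += [nn.Linear(h, h), nn.SiLU()]
  #           blocks.append(nn.Linear(h, d))
  #           self.net = nn.Sequential(*blocks)
  #       def forward(self, x, t):           # x: (B, 2), t: (B, 1)
  #           return self.net(torch.cat([x, self.t_emb(t)], dim=-1))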

  # Sinkhorn gradient flow parameters
  sinkhorn:
    epsilon: 0.1           # regularization coefficient ε
    blur: 0.5              # GeomLoss blur parameter (blur^p ~ ε)
    scaling: 0.80          # GeomLoss multiscale scaling
    eta: 1.0               # gradient flow step size η
    num_steps: 10          # T: number of gradient flow time steps
    batch_size: 256        # n: minibatch size for Sinkhorn flow
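
  # How these knobs would typically drive one Sinkhorn descent step via
  # GeomLoss (illustrative sketch; the n*grad correction is the usual
  # convention for uniform 1/n sample weights, but the repo's update may differ):
  #
  #   import torch
  #   from geomloss import SamplesLoss
  #   ot_loss = SamplesLoss("sinkhorn", p=2, blur=0.5, scaling=0.80)
  #   def flow_step(x, y, eta=1.0):          # x: current batch, y: target batch
  #       x = x.detach().requires_grad_(True)
  #       g, = torch.autograd.grad(ot_loss(x, y), [x])
  #       v = -len(x) * g                    # velocity; len(x)*g undoes 1/n weights
  #       return x.detach() + eta * v, v     # next particles, velocity to store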

  # Trajectory pool
  pool:
    num_batches: 200       # number of batches to build pool
    experience_replay: true
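
  # Size check for this pool (assuming float32 and one (x, v, t) record per
  # sample per step): 200 batches * 256 samples * 10 steps = 512,000 records;
  # at 2 floats each for x and v plus 1 for t, that is ~10 MB, so the whole
  # pool easily fits in RAM.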

  # Velocity field matching training
  training:
    num_iterations: 20000
    batch_size: 256
    learning_rate: 0.001
    optimizer: "adam"
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0
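
  # The objective these settings drive is plain velocity regression on pool
  # samples (hedged sketch continuing the ones above; pool.sample is a
  # hypothetical API):
  #
  #   opt = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999))
  #   for _ in range(20000):
  #       x_t, v_t, t = pool.sample(256)     # (position, velocity, time) minibatch
  #       loss = ((model(x_t, t) - v_t) ** 2).mean()
  #       opt.zero_grad(); loss.backward(); opt.step()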

  # Inference / Sampling
  inference:
    num_euler_steps: 10    # 10 or 100 Euler steps (uniform schedule)
    num_samples: 1024      # samples for evaluation
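
  # The uniform-schedule Euler sampler this implies (sketch; assumes the
  # learned field is parameterized on t in [0, 1]):
  #
  #   @torch.no_grad()
  #   def sample(model, n=1024, steps=10, d=2):
  #       x = torch.randn(n, d)              # draw from the Gaussian source
  #       for k in range(steps):
  #           t = torch.full((n, 1), k / steps)
  #           x = x + model(x, t) / steps    # x_{k+1} = x_k + v(x_k, t_k) / K
  #       return x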

  # Evaluation
  evaluation:
    num_test_samples: 1024  # W2 computed against 1024 test samples
    metric: "w2"           # 2-Wasserstein distance
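
  # One common way to compute the reported W2 with POT (sketch; the repo may
  # use a different implementation):
  #
  #   import numpy as np, ot
  #   def w2(x, y):                          # x, y: (n, 2) sample arrays
  #       M = ot.dist(x, y)                  # pairwise squared Euclidean costs
  #       return np.sqrt(ot.emd2(ot.unif(len(x)), ot.unif(len(y)), M))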

# ============================================================
# Image Benchmark Experiments (Section 5.2, Appendix E.2)
# ============================================================
experiment_mnist:
  dataset: "mnist"
  image_size: 28
  in_channels: 1

  # UNet Architecture (Appendix E.2, Dhariwal & Nichol 2021)
  unet:
    model_channels: 32          # base channels
    num_res_blocks: 1           # depth = 1
    channel_mult: [1, 2, 2]
    num_heads: 1
    num_head_channels: -1       # use num_heads instead
    attention_resolutions: [16]
    dropout: 0.0
    use_scale_shift_norm: true  # AdaGN

  # Sinkhorn gradient flow (Phase 1)
  sinkhorn:
    blur: 0.5
    scaling: 0.80
    eta: 1.0
    num_steps: 5               # T <= 5 for NSGF phase
    batch_size: 256

  # Trajectory pool (Appendix E.2: 256 samples/batch * 1500 batches * 5 steps < 20 GB)
  pool:
    num_batches: 1500
    storage_limit_gb: 20
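
  # Storage check (assuming float32 (x, v) pairs of 784-dim flattened images):
  # 256 * 1500 * 5 = 1.92M entries; 1.92M * 784 * 4 B ≈ 6 GB per tensor,
  # ≈ 12 GB for x and v together, under the 20 GB limit above.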

  # Velocity field matching training (NSGF model)
  nsgf_training:
    num_iterations: 100000
    batch_size: 128
    learning_rate: 0.0001
    optimizer: "adam"
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0

  # Neural Straight Flow (Phase 2)
  nsf_training:
    num_iterations: 100000
    batch_size: 128
    learning_rate: 0.0001
    optimizer: "adam"
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0

  # Phase-transition time predictor (CNN)
  time_predictor:
    conv_channels: [32, 64, 128, 256]
    kernel_size: 3
    stride: 1
    padding: 1
    pool_size: 2
    num_iterations: 40000
    learning_rate: 0.0001
    batch_size: 128
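
  # Shape sketch for this predictor on 28x28 inputs (illustrative; assumes one
  # stride-2 pool after each conv stage and a scalar head):
  # 28 -> 14 -> 7 -> 3 -> 1 spatial resolution across the four stages,
  # then flatten the 256 channels -> linear -> predicted transition time in [0, 1].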

  # Inference
  inference:
    nsgf_steps: 5              # 5-step Euler in NSGF phase
    nsf_steps: 55              # remaining steps for straight flow
    total_nfe: 60              # total NFE = nsgf_steps + nsf_steps
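
  # The two-phase sampler these settings imply, as a hedged sketch (per-sample
  # transition times and the NSF time reparameterization are simplifications):
  #
  #   @torch.no_grad()
  #   def sample(nsgf, nsf, predictor, x, T1=5, T2=55):
  #       b = len(x)
  #       for k in range(T1):                # phase 1: NSGF field, 5 Euler steps
  #           x = x + nsgf(x, torch.full((b,), k / T1)) / T1
  #       s = predictor(x).view(b, 1, 1, 1)  # predicted transition time per sample
  #       for k in range(T2):                # phase 2: straight flow, 55 steps
  #           t = (s + (1 - s) * k / T2).view(b)
  #           x = x + nsf(x, t) * (1 - s) / T2
  #       return x                           # NFE = T1 + T2 = 60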

  # Evaluation (Appendix E.2: FID between 10K generated samples and the test set)
  evaluation:
    num_generated: 10000
    metrics: ["fid"]

experiment_cifar10:
  dataset: "cifar10"
  image_size: 32
  in_channels: 3

  # UNet Architecture (Appendix E.2)
  unet:
    model_channels: 128         # base channels
    num_res_blocks: 2           # depth = 2
    channel_mult: [1, 2, 2, 2]
    num_heads: 4
    num_head_channels: 64
    attention_resolutions: [16]
    dropout: 0.0
    use_scale_shift_norm: true

  # Sinkhorn gradient flow (Phase 1)
  # NOTE: batch_size reduced from paper's 128 to 32 for T4 16GB VRAM.
  # Sinkhorn on 3072-dim flattened vectors (3x32x32) with tensorized backend
  # uses O(N^2 * D) memory. 128 samples OOMs on T4; 32 fits comfortably.
  # Compensate by increasing pool batches (32 * 10000 = 320K = 128 * 2500).
  sinkhorn:
    blur: 1.0
    scaling: 0.85
    eta: 1.0
    num_steps: 5
    batch_size: 32
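
  # Rough memory check for the note above (float32, taking the O(N^2 * D)
  # estimate at face value): N=128 -> 128^2 * 3072 * 4 B ≈ 192 MiB per
  # intermediate tensor, several of which stay alive in the Sinkhorn autograd
  # graph alongside the UNet; N=32 -> ≈ 12 MiB, which fits easily.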

  # Trajectory pool — adjusted for smaller Sinkhorn batch
  # 32 samples/batch * 10000 batches * 5 steps = 1.6M entries (same order as paper)
  pool:
    num_batches: 10000
    storage_limit_gb: 45
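
  # Storage check (assuming float32 (x, v) pairs of 3072-dim flattened images):
  # 1.6M * 3072 * 4 B ≈ 19.7 GB per tensor, ≈ 39 GB for x and v together,
  # hence the 45 GB limit above.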

  # Velocity field matching training (NSGF model)
  nsgf_training:
    num_iterations: 200000
    batch_size: 128
    learning_rate: 0.0001
    optimizer: "adam"
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0

  # Neural Straight Flow (Phase 2)
  nsf_training:
    num_iterations: 200000
    batch_size: 128
    learning_rate: 0.0001
    optimizer: "adam"
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.0

  # Phase-transition time predictor (same CNN architecture)
  time_predictor:
    conv_channels: [32, 64, 128, 256]
    kernel_size: 3
    stride: 1
    padding: 1
    pool_size: 2
    num_iterations: 40000
    learning_rate: 0.0001
    batch_size: 128

  # Inference
  inference:
    nsgf_steps: 5
    nsf_steps: 54
    total_nfe: 59              # paper reports NFE=59 for CIFAR-10

  # Evaluation
  evaluation:
    num_generated: 10000
    metrics: ["fid", "is"]
    # Paper target: FID=5.55, IS=8.86