---
# JiT + CFM training config (consumed by train_cfm_jit.py --config).
# NOTE(review): the original header described a minimal model ("1 patch,
# depth 1, narrow hidden dim"), but the values below define patch_size 2,
# depth 6, hidden_size 512. Comments were updated to describe the actual
# values — confirm against the intended experiment.

# CFM noise level.
sigma: 0.0

# Must match training data; in_channels = dim[0], input_size = dim[1] = dim[2].
dim: [3, 32, 32]

# Optimizer / sampling settings.
lr: 1.0e-4
weight_decay: 0.0
inference_steps: 50
vis_batch_size: 4

# JiT (jit_model_unconditional.JiT) architecture.
# 2x2 patches on a 32x32 input -> 16x16 = 256 tokens; hidden_size (512)
# must be divisible by num_heads (2).
input_size: 32
patch_size: 2
hidden_size: 512
depth: 6
num_heads: 2
mlp_ratio: 2.0
attn_drop: 0.0
proj_drop: 0.0
bottleneck_dim: 128
in_context_len: 32
in_context_start: 0