---
# JiT + CFM training config (consumed by train_cfm_jit.py --config).
# NOTE(review): the original header described a minimal model ("1 patch,
# depth 1, narrow hidden dim"), but the values below define patch_size 2,
# depth 6, hidden_size 512. Comments were updated to describe the actual
# values — confirm against the intended experiment.

# CFM noise level.
sigma: 0.0

# Must match training data; in_channels = dim[0], input_size = dim[1] = dim[2].
dim: [3, 32, 32]

# Optimizer / sampling settings.
lr: 1.0e-4
weight_decay: 0.0
inference_steps: 50
vis_batch_size: 4

# JiT (jit_model_unconditional.JiT) architecture.
# 2x2 patches on a 32x32 input -> 16x16 = 256 tokens; hidden_size (512)
# must be divisible by num_heads (2).
input_size: 32
patch_size: 2
hidden_size: 512
depth: 6
num_heads: 2
mlp_ratio: 2.0
attn_drop: 0.0
proj_drop: 0.0
bottleneck_dim: 128
in_context_len: 32
in_context_start: 0