# flow-matching / config.yml
# Uploaded by sabertoaster via upload-large-folder tool (commit bc1c44c, verified)
# flow_matching/src/config.yml

# Global settings
out_dir: output/two_stage_encoding
seed: 3315
overwrite: false
device: cuda
batch_size: 16

# Stage 1: Mean Anchor Generation (MultiSubjectConvLinearEncoder)
stage1:
  epochs: 10
  # NOTE: written as 3.0e-4 (not 3e-4) so every YAML loader parses it as a float;
  # PyYAML's 1.1 resolver loads exponent-only numerals without a dot as strings.
  lr: 3.0e-4
  weight_decay: 0.1
  model:
    embed_dim: 192
    encoder_kernel_size: 45
    decoder_kernel_size: 0
    hidden_model: null
    global_pool: avg
    encoder_causal: false
    encoder_positive: false
    encoder_blockwise: false
    pool_num_heads: 3
    with_shared_decoder: true
    with_subject_decoders: true
    # Note: num_subjects will be inferred from data
    # Transformer/Conv configuration for hidden_model if needed
    transformer:
      num_heads: 3
      depth: 6
      mlp_ratio: 4.0
    conv1dnext:
      depth: 6
      kernel_size: 11
      causal: false

# Stage 2: Neural Vector Field (Flow Matching)
stage2:
  epochs: 15
  lr: 3.0e-4
  weight_decay: 0.01
  n_timesteps: 25
  # CFM and training regularization
  cfm:
    solver: euler
    kld_weight: 3.0
    kld_target_std: 1.0
    detach_ut: false
    time_dist_shift: 1.0
  # DiT-style velocity model
  velocity_net:
    hidden_dim: 256
    modality_dims: [1000]
    n_blocks: 4
    n_heads: 8
    dropout: 0.05
    modality_dropout: 0.0
    max_seq_len: 2048
    temporal_attn_layers: 2
  # Source variational encoder
  source_ve:
    depth: 4
    num_heads: 8
    num_queries: 16
    dropout: 0.1
    use_variational: true
    init_logvar: 1.0
    fixed_std: null
  # CSFM transport + sampler settings
  transport:
    path_type: Linear
    prediction: velocity
    loss_weight: null
    time_dist_type: uniform
    time_dist_shift: 1.0

# Dataset Configuration (from default_feature_encoding.yaml)
subjects: [1, 2, 3, 5]
features:
  internvl3_8b:
    model: InternVL3_8B
    layers:
      layers.20: language_model.model.layers.20.post_attention_layernorm
  internvl3_14b:
    model: InternVL3_14B
    layers:
      layers.20: language_model.model.layers.20.post_attention_layernorm
      layers.30: language_model.model.layers.30.post_attention_layernorm
  qwen-2-5-omni-3b:
    model: qwen2-5_3B
    layers:
      layers.10: model.layers.10.post_attention_layernorm
      layers.15: model.layers.15.post_attention_layernorm
      layers.20: model.layers.20.post_attention_layernorm
      norm: model.norm
  qwen-2-5-omni-7b:
    model: qwen-2-5-omni-7b
    layers:
      layers.5: model.layers.5.post_attention_layernorm
      layers.10: model.layers.10.post_attention_layernorm
      layers.15: model.layers.15.post_attention_layernorm
      layers.20: model.layers.20.post_attention_layernorm
      norm: model.norm
  whisper:
    model: whisper
    layers:
      layers.12: layers.12.fc2
      layers.25: layers.25.fc2
      layers.31: layers.31.fc2
      norm: layer_norm
  llama_3.2_1b:
    model: Llama-3.2-1B
    layers:
      layers.7: model.layers.7
      layers.11: model.layers.11
      layers.15: model.layers.15
  llama_3.2_3b:
    model: Llama-3.2-3B
    layers:
      layers.7: model.layers.7
      layers.11: model.layers.11
      layers.15: model.layers.15
      layers.19: model.layers.19
      layers.23: model.layers.23
  vjepa2:
    model: vjepa2_avg_feat
    layers:
      layers.5: encoder.layer.5.norm1_avg
      layers.15: encoder.layer.15.norm1_avg
      layers.25: encoder.layer.25.norm1_avg
      layers.35: encoder.layer.35.norm1_avg
      norm: encoder.layernorm_avg

# Feature keys follow the pattern <feature_name>/<layer_key> from `features` above.
include_features:
  - llama_3.2_3b/layers.11
  - whisper/layers.12
  - qwen-2-5-omni-3b/layers.20
  - internvl3_14b/layers.30
  - vjepa2/norm

datasets:
  train:
    filter:
      # seasons: [1]
      seasons: [1, 2, 3, 4, 5]
      movies: ["bourne", "wolf"]
    sample_length: 64
    num_samples: 2000
    shuffle: true
    seed: 42
  val_s6:
    filter:
      seasons: [6]
      movies: []
    sample_length: null
    num_samples: null
    shuffle: false
  val_figures:
    filter:
      seasons: []
      movies: ["figures"]
    sample_length: null
    num_samples: null
    shuffle: false
  # val_life:
  #   filter:
  #     seasons: []
  #     movies: ["life"]
  #   sample_length: null
  #   num_samples: null
  #   shuffle: false

val_set_name: val_figures
datasets_root: null