# flow_matching/src/config.yml
---
# Global settings
out_dir: output/two_stage_encoding
seed: 3315
overwrite: false
device: cuda
batch_size: 16

# Stage 1: Mean Anchor Generation (MultiSubjectConvLinearEncoder)
stage1:
  epochs: 10
  # 3.0e-4 (not 3e-4): PyYAML's YAML-1.1 float resolver needs a '.' before the
  # exponent; '3e-4' would load as a string under PyYAML.
  lr: 3.0e-4
  weight_decay: 0.1
  model:
    embed_dim: 192
    encoder_kernel_size: 45
    decoder_kernel_size: 0
    hidden_model: null
    global_pool: avg
    encoder_causal: false
    encoder_positive: false
    encoder_blockwise: false
    pool_num_heads: 3
    with_shared_decoder: true
    with_subject_decoders: true
    # Note: num_subjects will be inferred from data
    # Transformer/Conv configuration for hidden_model if needed
    transformer:
      num_heads: 3
      depth: 6
      mlp_ratio: 4.0
    conv1dnext:
      depth: 6
      kernel_size: 11
      causal: false

# Stage 2: Neural Vector Field (Flow Matching)
stage2:
  epochs: 15
  lr: 3.0e-4
  weight_decay: 0.01
  n_timesteps: 25
  # CFM and training regularization
  cfm:
    solver: euler
    kld_weight: 3.0
    kld_target_std: 1.0
    detach_ut: false
    time_dist_shift: 1.0
  # DiT-style velocity model
  velocity_net:
    hidden_dim: 256
    modality_dims: [1000]
    n_blocks: 4
    n_heads: 8
    dropout: 0.05
    modality_dropout: 0.0
    max_seq_len: 2048
    temporal_attn_layers: 2
  # Source variational encoder
  source_ve:
    depth: 4
    num_heads: 8
    num_queries: 16
    dropout: 0.1
    use_variational: true
    init_logvar: 1.0
    fixed_std: null
  # CSFM transport + sampler settings
  transport:
    path_type: Linear
    prediction: velocity
    loss_weight: null
    time_dist_type: uniform
    time_dist_shift: 1.0

# Dataset Configuration (from default_feature_encoding.yaml)
subjects: [1, 2, 3, 5]
# Per-backbone feature extraction points: each entry maps a short layer alias
# to the module path inside the named model checkpoint.
features:
  internvl3_8b:
    model: InternVL3_8B
    layers:
      layers.20: language_model.model.layers.20.post_attention_layernorm
  internvl3_14b:
    model: InternVL3_14B
    layers:
      layers.20: language_model.model.layers.20.post_attention_layernorm
      layers.30: language_model.model.layers.30.post_attention_layernorm
  qwen-2-5-omni-3b:
    model: qwen2-5_3B
    layers:
      layers.10: model.layers.10.post_attention_layernorm
      layers.15: model.layers.15.post_attention_layernorm
      layers.20: model.layers.20.post_attention_layernorm
      norm: model.norm
  qwen-2-5-omni-7b:
    model: qwen-2-5-omni-7b
    layers:
      layers.5: model.layers.5.post_attention_layernorm
      layers.10: model.layers.10.post_attention_layernorm
      layers.15: model.layers.15.post_attention_layernorm
      layers.20: model.layers.20.post_attention_layernorm
      norm: model.norm
  whisper:
    model: whisper
    layers:
      layers.12: layers.12.fc2
      layers.25: layers.25.fc2
      layers.31: layers.31.fc2
      norm: layer_norm
  llama_3.2_1b:
    model: Llama-3.2-1B
    layers:
      layers.7: model.layers.7
      layers.11: model.layers.11
      layers.15: model.layers.15
  llama_3.2_3b:
    model: Llama-3.2-3B
    layers:
      layers.7: model.layers.7
      layers.11: model.layers.11
      layers.15: model.layers.15
      layers.19: model.layers.19
      layers.23: model.layers.23
  vjepa2:
    model: vjepa2_avg_feat
    layers:
      layers.5: encoder.layer.5.norm1_avg
      layers.15: encoder.layer.15.norm1_avg
      layers.25: encoder.layer.25.norm1_avg
      layers.35: encoder.layer.35.norm1_avg
      norm: encoder.layernorm_avg

# Subset of declared features actually used, as "<feature>/<layer alias>".
include_features:
  - llama_3.2_3b/layers.11
  - whisper/layers.12
  - qwen-2-5-omni-3b/layers.20
  - internvl3_14b/layers.30
  - vjepa2/norm

datasets:
  train:
    filter:
      # seasons: [1]
      seasons: [1, 2, 3, 4, 5]
      movies: ["bourne", "wolf"]
    # NOTE(review): sampling keys placed at the dataset level (parallel to the
    # commented-out val_life stanza below) — confirm against the loader schema.
    sample_length: 64
    num_samples: 2000
    shuffle: true
    seed: 42
  val_s6:
    filter:
      seasons: [6]
      movies: []
    sample_length: null
    num_samples: null
    shuffle: false
  val_figures:
    filter:
      seasons: []
      movies: ["figures"]
    sample_length: null
    num_samples: null
    shuffle: false
  # val_life:
  #   filter:
  #     seasons: []
  #     movies: ["life"]
  #   sample_length: null
  #   num_samples: null
  #   shuffle: false

val_set_name: val_figures
datasets_root: null