---
# Two-stage encoding experiment configuration.
# NOTE(review): this file was recovered from a pipe-wrapped ("| ... |") dump in
# which all indentation was lost; nesting below is reconstructed from key
# semantics — confirm against the config loader.

# Global run settings.
out_dir: output/two_stage_encoding
seed: 3315
overwrite: false   # refuse to clobber an existing out_dir
device: cuda
batch_size: 16
# Stage 1: encoder/decoder training.
stage1:
  epochs: 10
  # Written with a decimal point so strict YAML 1.1 loaders (e.g. PyYAML)
  # resolve a float rather than the string "3e-4".
  lr: 3.0e-4
  weight_decay: 0.1

  model:
    embed_dim: 192
    encoder_kernel_size: 45
    decoder_kernel_size: 0
    hidden_model: null   # null disables the hidden model; options configured below
    global_pool: avg
    encoder_causal: false
    encoder_positive: false
    encoder_blockwise: false
    pool_num_heads: 3
    with_shared_decoder: true
    with_subject_decoders: true

    # Hidden-model architecture options (presumably selected via
    # `hidden_model` above). NOTE(review): original indentation was lost;
    # these are assumed to nest under `model:` — confirm against the loader.
    transformer:
      num_heads: 3
      depth: 6
      mlp_ratio: 4.0
    conv1dnext:
      depth: 6
      kernel_size: 11
      causal: false
# Stage 2: flow-matching decoder training.
# NOTE(review): `cfm`, `velocity_net`, `source_ve`, and `transport` are assumed
# to nest under `stage2:` (indentation lost in the recovered dump) — confirm.
stage2:
  epochs: 15
  lr: 3.0e-4         # decimal point added so strict loaders parse a float
  weight_decay: 0.01
  n_timesteps: 25

  # Conditional flow matching.
  cfm:
    solver: euler
    kld_weight: 3.0
    kld_target_std: 1.0
    detach_ut: false
    time_dist_shift: 1.0

  velocity_net:
    hidden_dim: 256
    modality_dims: [1000]
    n_blocks: 4
    n_heads: 8
    dropout: 0.05
    modality_dropout: 0.0
    max_seq_len: 2048
    temporal_attn_layers: 2

  source_ve:
    depth: 4
    num_heads: 8
    num_queries: 16
    dropout: 0.1
    use_variational: true
    init_logvar: 1.0
    fixed_std: null    # null => learned/variational std rather than fixed — TODO confirm

  transport:
    path_type: Linear
    prediction: velocity
    loss_weight: null
    time_dist_type: uniform
    time_dist_shift: 1.0
# Subject IDs included in this run (note: 4 is absent — presumably intentional).
subjects: [1, 2, 3, 5]
|
|
# Pretrained feature extractors. Each entry names a model checkpoint and maps
# short layer aliases (used elsewhere, e.g. in `include_features`) to module
# paths inside that model.
features:
  internvl3_8b:
    model: InternVL3_8B
    layers:
      layers.20: language_model.model.layers.20.post_attention_layernorm

  internvl3_14b:
    model: InternVL3_14B
    layers:
      layers.20: language_model.model.layers.20.post_attention_layernorm
      layers.30: language_model.model.layers.30.post_attention_layernorm

  qwen-2-5-omni-3b:
    model: qwen2-5_3B
    layers:
      layers.10: model.layers.10.post_attention_layernorm
      layers.15: model.layers.15.post_attention_layernorm
      layers.20: model.layers.20.post_attention_layernorm
      norm: model.norm

  qwen-2-5-omni-7b:
    model: qwen-2-5-omni-7b
    layers:
      layers.5: model.layers.5.post_attention_layernorm
      layers.10: model.layers.10.post_attention_layernorm
      layers.15: model.layers.15.post_attention_layernorm
      layers.20: model.layers.20.post_attention_layernorm
      norm: model.norm

  whisper:
    model: whisper
    layers:
      layers.12: layers.12.fc2
      layers.25: layers.25.fc2
      layers.31: layers.31.fc2
      norm: layer_norm

  llama_3.2_1b:
    model: Llama-3.2-1B
    layers:
      layers.7: model.layers.7
      layers.11: model.layers.11
      layers.15: model.layers.15

  llama_3.2_3b:
    model: Llama-3.2-3B
    layers:
      layers.7: model.layers.7
      layers.11: model.layers.11
      layers.15: model.layers.15
      layers.19: model.layers.19
      layers.23: model.layers.23

  vjepa2:
    model: vjepa2_avg_feat
    layers:
      layers.5: encoder.layer.5.norm1_avg
      layers.15: encoder.layer.15.norm1_avg
      layers.25: encoder.layer.25.norm1_avg
      layers.35: encoder.layer.35.norm1_avg
      norm: encoder.layernorm_avg
|
|
# Subset of the features above ("<feature-key>/<layer-alias>") actually used
# for this run.
include_features:
  - llama_3.2_3b/layers.11
  - whisper/layers.12
  - qwen-2-5-omni-3b/layers.20
  - internvl3_14b/layers.30
  - vjepa2/norm
|
|
# Dataset splits. Each split selects stimuli via `filter` and carries its own
# sampling options. NOTE(review): indentation was lost in the recovered dump;
# the sampling keys are assumed to be siblings of `filter` — confirm against
# the dataset builder.
datasets:
  train:
    filter:
      seasons: [1, 2, 3, 4, 5]
      movies: ["bourne", "wolf"]
    sample_length: 64
    num_samples: 2000
    shuffle: true      # canonical lowercase boolean (was `True`)
    seed: 42

  val_s6:
    filter:
      seasons: [6]
      movies: []
    sample_length: null   # null presumably means full-length sequences — TODO confirm
    num_samples: null
    shuffle: false

  val_figures:
    filter:
      seasons: []
      movies: ["figures"]
    sample_length: null
    num_samples: null
    shuffle: false

# Which entry of `datasets` serves as the validation set.
val_set_name: val_figures
datasets_root: null   # null => loader's default root — TODO confirm