---
# Two-stage encoding experiment configuration.
# NOTE(review): this file was recovered from a pipe-wrapped ("| ... |") dump in
# which all indentation was lost; nesting below is reconstructed from key
# semantics — confirm against the config loader.

# Global run settings.
out_dir: output/two_stage_encoding
seed: 3315
overwrite: false   # refuse to clobber an existing out_dir
device: cuda
batch_size: 16
# Stage 1: encoder/decoder training.
stage1:
  epochs: 10
  # Written with a decimal point so strict YAML 1.1 loaders (e.g. PyYAML)
  # resolve a float rather than the string "3e-4".
  lr: 3.0e-4
  weight_decay: 0.1

  model:
    embed_dim: 192
    encoder_kernel_size: 45
    decoder_kernel_size: 0
    hidden_model: null   # null disables the hidden model; options configured below
    global_pool: avg
    encoder_causal: false
    encoder_positive: false
    encoder_blockwise: false
    pool_num_heads: 3
    with_shared_decoder: true
    with_subject_decoders: true

    # Hidden-model architecture options (presumably selected via
    # `hidden_model` above). NOTE(review): original indentation was lost;
    # these are assumed to nest under `model:` — confirm against the loader.
    transformer:
      num_heads: 3
      depth: 6
      mlp_ratio: 4.0
    conv1dnext:
      depth: 6
      kernel_size: 11
      causal: false
# Stage 2: flow-matching decoder training.
# NOTE(review): `cfm`, `velocity_net`, `source_ve`, and `transport` are assumed
# to nest under `stage2:` (indentation lost in the recovered dump) — confirm.
stage2:
  epochs: 15
  lr: 3.0e-4         # decimal point added so strict loaders parse a float
  weight_decay: 0.01
  n_timesteps: 25

  # Conditional flow matching.
  cfm:
    solver: euler
    kld_weight: 3.0
    kld_target_std: 1.0
    detach_ut: false
    time_dist_shift: 1.0

  velocity_net:
    hidden_dim: 256
    modality_dims: [1000]
    n_blocks: 4
    n_heads: 8
    dropout: 0.05
    modality_dropout: 0.0
    max_seq_len: 2048
    temporal_attn_layers: 2

  source_ve:
    depth: 4
    num_heads: 8
    num_queries: 16
    dropout: 0.1
    use_variational: true
    init_logvar: 1.0
    fixed_std: null    # null => learned/variational std rather than fixed — TODO confirm

  transport:
    path_type: Linear
    prediction: velocity
    loss_weight: null
    time_dist_type: uniform
    time_dist_shift: 1.0
# Subject IDs included in this run (note: 4 is absent — presumably intentional).
subjects: [1, 2, 3, 5]
|
|
# Pretrained feature extractors. Each entry names a model checkpoint and maps
# short layer aliases (used elsewhere, e.g. in `include_features`) to module
# paths inside that model.
features:
  internvl3_8b:
    model: InternVL3_8B
    layers:
      layers.20: language_model.model.layers.20.post_attention_layernorm

  internvl3_14b:
    model: InternVL3_14B
    layers:
      layers.20: language_model.model.layers.20.post_attention_layernorm
      layers.30: language_model.model.layers.30.post_attention_layernorm

  qwen-2-5-omni-3b:
    model: qwen2-5_3B
    layers:
      layers.10: model.layers.10.post_attention_layernorm
      layers.15: model.layers.15.post_attention_layernorm
      layers.20: model.layers.20.post_attention_layernorm
      norm: model.norm

  qwen-2-5-omni-7b:
    model: qwen-2-5-omni-7b
    layers:
      layers.5: model.layers.5.post_attention_layernorm
      layers.10: model.layers.10.post_attention_layernorm
      layers.15: model.layers.15.post_attention_layernorm
      layers.20: model.layers.20.post_attention_layernorm
      norm: model.norm

  whisper:
    model: whisper
    layers:
      layers.12: layers.12.fc2
      layers.25: layers.25.fc2
      layers.31: layers.31.fc2
      norm: layer_norm

  llama_3.2_1b:
    model: Llama-3.2-1B
    layers:
      layers.7: model.layers.7
      layers.11: model.layers.11
      layers.15: model.layers.15

  llama_3.2_3b:
    model: Llama-3.2-3B
    layers:
      layers.7: model.layers.7
      layers.11: model.layers.11
      layers.15: model.layers.15
      layers.19: model.layers.19
      layers.23: model.layers.23

  vjepa2:
    model: vjepa2_avg_feat
    layers:
      layers.5: encoder.layer.5.norm1_avg
      layers.15: encoder.layer.15.norm1_avg
      layers.25: encoder.layer.25.norm1_avg
      layers.35: encoder.layer.35.norm1_avg
      norm: encoder.layernorm_avg
|
|
# Subset of the features above ("<feature-key>/<layer-alias>") actually used
# for this run.
include_features:
  - llama_3.2_3b/layers.11
  - whisper/layers.12
  - qwen-2-5-omni-3b/layers.20
  - internvl3_14b/layers.30
  - vjepa2/norm
|
|
# Dataset splits. Each split selects stimuli via `filter` and carries its own
# sampling options. NOTE(review): indentation was lost in the recovered dump;
# the sampling keys are assumed to be siblings of `filter` — confirm against
# the dataset builder.
datasets:
  train:
    filter:
      seasons: [1, 2, 3, 4, 5]
      movies: ["bourne", "wolf"]
    sample_length: 64
    num_samples: 2000
    shuffle: true      # canonical lowercase boolean (was `True`)
    seed: 42

  val_s6:
    filter:
      seasons: [6]
      movies: []
    sample_length: null   # null presumably means full-length sequences — TODO confirm
    num_samples: null
    shuffle: false

  val_figures:
    filter:
      seasons: []
      movies: ["figures"]
    sample_length: null
    num_samples: null
    shuffle: false

# Which entry of `datasets` serves as the validation set.
val_set_name: val_figures
datasets_root: null   # null => loader's default root — TODO confirm