# Add files using upload-large-folder tool
#
# 679ebf3 verified 13 days ago
#
# 15.9 kB

	run_name: multitask_train
	model:
	model_name: molmo
	llm:
	d_model: 2560
	n_heads: 32
	n_kv_heads: 8
	head_dim: 128
	qkv_bias: false
	clip_qkv: null
	n_layers: 36
	mlp_ratio: 4
	mlp_hidden_size: 19456
	activation_type: swiglu
	block_type: sequential
	rope: true
	rope_full_precision: true
	rope_theta: 1000000.0
	rope_type: default
	rope_factor: null
	rope_high_freq_factor: null
	rope_low_freq_factor: null
	rope_original_max_position_embeddings: null
	attention_type: sdpa
	float32_attention: true
	attention_dropout: 0.0
	attention_layer_norm: true
	attention_layer_norm_type: qwen3
	residual_dropout: 0.1
	response_residual_dropout: 0.0
	layer_norm_type: rms
	layer_norm_with_affine: true
	layer_norm_eps: 1.0e-06
	attention_layer_norm_with_affine: true
	max_sequence_length: 4096
	max_position_embeddings: null
	include_bias: false
	bias_for_layer_norm: null
	norm_after: false
	moe_num_experts: 8
	moe_top_k: 2
	moe_mlp_impl: sparse
	moe_log_expert_assignment: false
	moe_shared_expert: false
	moe_lbl_in_fp32: false
	moe_interleave: false
	moe_loss_weight: 0.1
	moe_zloss_weight: null
	moe_dropless: true
	moe_capacity_factor: 1.25
	embedding_dropout: 0.0
	scale_logits: false
	vocab_size: 151936
	additional_vocab_size: 128
	weight_tying: true
	embedding_size: 151936
	use_position_ids: true
	tokenizer:
	identifier: Qwen/Qwen3-4B
	tokenizer_dir: null
	init_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen3-4b.pt
	init_incremental: null
	new_embedding_init_range: 0.02
	initializer_range: 0.02
	normalize_input_embeds: false
	activation_checkpoint: whole_layer
	compile: blocks
	fix_pad_tokenizer: false
	init_std: 0.02
	init_fn: normal
	init_cutoff_factor: null
	vision_backbone:
	vit:
	image_model_type: siglip
	image_default_input_size:
	- 378
	- 378
	image_patch_size: 14
	image_pos_patch_size: 14
	image_emb_dim: 1152
	image_num_heads: 16
	image_num_key_value_heads: 16
	image_num_layers: 27
	image_head_dim: 72
	image_mlp_dim: 4304
	image_mlp_activations: gelu_pytorch_tanh
	image_dropout_rate: 0.0
	image_num_pos: 729
	image_norm_eps: 1.0e-06
	attention_dropout: 0.0
	residual_dropout: 0.0
	initializer_range: 0.02
	float32_attention: true
	attention_type: sdpa
	activation_checkpointing: true
	init_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
	resize_mode: siglip
	pad_value: 0.0
	normalize: siglip
	image_pooling_2d: attention_meanq
	pooling_attention_mask: false
	image_projector: mlp
	image_padding_embed: null
	vit_layers:
	- -3
	- -9
	skip_unused_layers: true
	image_feature_dropout: 0.0
	connector_activation_checkpointing: true
	compile_vit: blocks
	compile_connector: dynamic
	normalize_on_gpu: false
	data_formatter:
	prompt_templates: uber_model
	message_format: role
	system_prompt: demo_or_style
	always_start_with_space: false
	default_inference_len: 65
	select_answer: best
	debug: false
	image_last: false
	format_message_list: null
	p_one_message: 0.0
	eval_system_prompt_mapping: null
	timestamp_mode: 50-percent-seconds
	p_choice_content_in_mc: 1.0
	mm_preprocessor:
	crop_mode: overlap-and-resize-c2
	use_col_tokens: true
	max_crops: 8
	pooling_w: 2
	pooling_h: 2
	overlap_margins:
	- 4
	- 4
	max_images: null
	max_multi_image_crops: 4
	max_answer_len: null
	last_message_loss_only: false
	loss_token_weighting: root_subsegments
	max_text_tokens: null
	image_padding_mask: false
	legacy_image_mask: false
	bi_directional_attn: null
	seed: 6198
	epoch: null
	dry_run: false
	ft_llm: true
	ft_vit: true
	ft_connector: true
	ft_embedding: lm_head
	optimizer:
	name: adamw
	learning_rate: 0.0001
	weight_decay: 0.01
	betas:
	- 0.9
	- 0.95
	eps: 1.0e-05
	connector_learning_rate: 5.0e-06
	vit_learning_rate: 5.0e-06
	llm_learning_rate: 1.0e-05
	frame_selector_learning_rate: 0.0001
	temporal_token_scorer_learning_rate: 0.0001
	connector_weight_decay: 0.0
	vit_weight_decay: 0.0
	llm_weight_decay: 0.0
	frame_selector_weight_decay: 0.01
	temporal_token_scorer_weight_decay: 0.01
	connector_betas:
	- 0.9
	- 0.95
	vit_betas:
	- 0.9
	- 0.95
	llm_betas:
	- 0.9
	- 0.95
	frame_selector_betas:
	- 0.9
	- 0.95
	temporal_token_scorer_betas:
	- 0.9
	- 0.95
	connector_eps: 1.0e-06
	vit_eps: 1.0e-06
	llm_eps: 1.0e-06
	frame_selector_eps: 1.0e-06
	temporal_token_scorer_eps: 1.0e-06
	metrics_log_interval: -1
	scheduler:
	name: multimodal
	units: steps
	t_warmup: 100
	t_max: null
	alpha_f: 0.1
	connector_t_warmup: 200
	vit_t_warmup: 200
	llm_t_warmup: 200
	frame_selector_t_warmup: 200
	temporal_token_scorer_t_warmup: 200
	grad_clip_warmup_steps: null
	grad_clip_warmup_factor: null
	warmup_min_lr: 0.0
	data:
	dataset: null
	mixture: null
	root_size_mixture:
	- rate: 0.15
	mixture:
	pixmo_ask_model_anything: null
	pixmo_cap: 50000.0
	pixmo_cap_qa_as_user_qa: null
	pixmo_pointing_explanations: null
	- rate: 0.5
	mixture:
	coco_2014_vqa_multi: null
	text_vqa: null
	okvqa: null
	chart_qa_weighted: null
	doc_qa: null
	info_qa: null
	ai2_diagram_v2_mix_transparent: null
	a_okvqa_mc: null
	a_okvqa_da: null
	android_control: null
	science_qa_img: null
	tabwmp_da: null
	st_qa: null
	tally_qa: null
	pixmo_clocks: 250000.0
	dv_qa: 10000.0
	figure_qa: 10000.0
	plot_qa: 20000.0
	cosyn_chart_exp: null
	cosyn_chemical_exp: null
	cosyn_diagram_exp: null
	cosyn_document: null
	cosyn_math_exp: null
	cosyn_music_exp: null
	cosyn_table_exp: null
	- rate: 0.35
	mixture:
	pixmo_points_train: null
	pixmo_count_train: null
	pixmo_points_high_freq_train: null
	cosyn_point: null
	kwargs_mixture: null
	split: train
	seed: 50189
	pad: to_max
	sequence_length: 2304
	max_text_seq_len: null
	shuffle: true
	start_index: 0
	packing: null
	num_workers: 2
	drop_last: true
	pin_memory: true
	prefetch_factor: null
	persistent_workers: false
	timeout: 0
	restore_dataloader: true
	fast_forward_batches: null
	evaluators: []
	eval_interval: 2000
	inf_evaluators:
	- label: chart_qa_exp
	data:
	dataset: chart_qa_exp
	mixture: null
	root_size_mixture: null
	kwargs_mixture: null
	split: validation
	seed: 691203
	pad: to_max
	sequence_length: 1792
	max_text_seq_len: null
	shuffle: true
	start_index: 0
	packing: null
	num_workers: 2
	drop_last: true
	pin_memory: true
	prefetch_factor: null
	persistent_workers: true
	timeout: 0
	evaluator:
	n_to_log: 0
	num_wandb_examples: 32
	save_predictions: null
	save_tokens: false
	vqa_eval: relaxed_correctness,scifi_relaxed_correctness,em
	pointing_eval: false
	count_eval: false
	point_count_eval: false
	android_eval: false
	clock_eval: false
	clock_bench_eval: false
	math_vista_eval: false
	temp_compass_eval: ''
	temp_compass_disable_api: false
	video_mme_eval: ''
	mme_videoocr_mc_eval: false
	mlvu_gen_eval: false
	long_video_bench_eval: false
	plm_fgqa_eval: false
	long_video_bench_caption_eval: false
	vinoground_eval: false
	vixmo_caption_eval: false
	dream1k_caption_eval: false
	refexp_eval: false
	coco_caption_eval: false
	qv_highlights_eval: false
	tomato: false
	temporal_bench: false
	max_new_tokens: 256
	device_batch_size: 4
	subset_num_batches: null
	max_examples: 2048
	console_log_interval: 20
	include_image: false
	- label: info_qa
	data:
	dataset: info_qa
	mixture: null
	root_size_mixture: null
	kwargs_mixture: null
	split: validation
	seed: 691203
	pad: to_max
	sequence_length: 1792
	max_text_seq_len: null
	shuffle: true
	start_index: 0
	packing: null
	num_workers: 2
	drop_last: true
	pin_memory: true
	prefetch_factor: null
	persistent_workers: true
	timeout: 0
	evaluator:
	n_to_log: 0
	num_wandb_examples: 32
	save_predictions: null
	save_tokens: false
	vqa_eval: ansl,em
	pointing_eval: false
	count_eval: false
	point_count_eval: false
	android_eval: false
	clock_eval: false
	clock_bench_eval: false
	math_vista_eval: false
	temp_compass_eval: ''
	temp_compass_disable_api: false
	video_mme_eval: ''
	mme_videoocr_mc_eval: false
	mlvu_gen_eval: false
	long_video_bench_eval: false
	plm_fgqa_eval: false
	long_video_bench_caption_eval: false
	vinoground_eval: false
	vixmo_caption_eval: false
	dream1k_caption_eval: false
	refexp_eval: false
	coco_caption_eval: false
	qv_highlights_eval: false
	tomato: false
	temporal_bench: false
	max_new_tokens: 12
	device_batch_size: 4
	subset_num_batches: null
	max_examples: 2048
	console_log_interval: 20
	include_image: false
	- label: doc_qa
	data:
	dataset: doc_qa
	mixture: null
	root_size_mixture: null
	kwargs_mixture: null
	split: validation
	seed: 691203
	pad: to_max
	sequence_length: 1792
	max_text_seq_len: null
	shuffle: true
	start_index: 0
	packing: null
	num_workers: 2
	drop_last: true
	pin_memory: true
	prefetch_factor: null
	persistent_workers: true
	timeout: 0
	evaluator:
	n_to_log: 0
	num_wandb_examples: 32
	save_predictions: null
	save_tokens: false
	vqa_eval: ansl,em
	pointing_eval: false
	count_eval: false
	point_count_eval: false
	android_eval: false
	clock_eval: false
	clock_bench_eval: false
	math_vista_eval: false
	temp_compass_eval: ''
	temp_compass_disable_api: false
	video_mme_eval: ''
	mme_videoocr_mc_eval: false
	mlvu_gen_eval: false
	long_video_bench_eval: false
	plm_fgqa_eval: false
	long_video_bench_caption_eval: false
	vinoground_eval: false
	vixmo_caption_eval: false
	dream1k_caption_eval: false
	refexp_eval: false
	coco_caption_eval: false
	qv_highlights_eval: false
	tomato: false
	temporal_bench: false
	max_new_tokens: 12
	device_batch_size: 4
	subset_num_batches: null
	max_examples: 2048
	console_log_interval: 20
	include_image: false
	- label: ai2_diagram
	data:
	dataset: ai2_diagram_v2_mix_transparent
	mixture: null
	root_size_mixture: null
	kwargs_mixture: null
	split: validation
	seed: 691203
	pad: to_max
	sequence_length: 1792
	max_text_seq_len: null
	shuffle: true
	start_index: 0
	packing: null
	num_workers: 2
	drop_last: true
	pin_memory: true
	prefetch_factor: null
	persistent_workers: true
	timeout: 0
	evaluator:
	n_to_log: 0
	num_wandb_examples: 32
	save_predictions: null
	save_tokens: false
	vqa_eval: mc_ai2d_opaque,mc_ai2d_transparent
	pointing_eval: false
	count_eval: false
	point_count_eval: false
	android_eval: false
	clock_eval: false
	clock_bench_eval: false
	math_vista_eval: false
	temp_compass_eval: ''
	temp_compass_disable_api: false
	video_mme_eval: ''
	mme_videoocr_mc_eval: false
	mlvu_gen_eval: false
	long_video_bench_eval: false
	plm_fgqa_eval: false
	long_video_bench_caption_eval: false
	vinoground_eval: false
	vixmo_caption_eval: false
	dream1k_caption_eval: false
	refexp_eval: false
	coco_caption_eval: false
	qv_highlights_eval: false
	tomato: false
	temporal_bench: false
	max_new_tokens: 32
	device_batch_size: 4
	subset_num_batches: null
	max_examples: 2048
	console_log_interval: 20
	include_image: false
	- label: coco_2014_vqa
	data:
	dataset: coco_2014_vqa
	mixture: null
	root_size_mixture: null
	kwargs_mixture: null
	split: validation
	seed: 691203
	pad: to_max
	sequence_length: 1792
	max_text_seq_len: null
	shuffle: true
	start_index: 0
	packing: null
	num_workers: 2
	drop_last: true
	pin_memory: true
	prefetch_factor: null
	persistent_workers: true
	timeout: 0
	evaluator:
	n_to_log: 0
	num_wandb_examples: 32
	save_predictions: null
	save_tokens: false
	vqa_eval: vqa_score
	pointing_eval: false
	count_eval: false
	point_count_eval: false
	android_eval: false
	clock_eval: false
	clock_bench_eval: false
	math_vista_eval: false
	temp_compass_eval: ''
	temp_compass_disable_api: false
	video_mme_eval: ''
	mme_videoocr_mc_eval: false
	mlvu_gen_eval: false
	long_video_bench_eval: false
	plm_fgqa_eval: false
	long_video_bench_caption_eval: false
	vinoground_eval: false
	vixmo_caption_eval: false
	dream1k_caption_eval: false
	refexp_eval: false
	coco_caption_eval: false
	qv_highlights_eval: false
	tomato: false
	temporal_bench: false
	max_new_tokens: 12
	device_batch_size: 4
	subset_num_batches: null
	max_examples: 2048
	console_log_interval: 20
	include_image: false
	inf_eval_interval: 2000
	eval_on_last_step: true
	eval_on_load: false
	save_folder: /weka/oe-training-default/sanghol/molmo/models/uber-v1/uber3.4-synthetic-siglip2-qwen3_4b
	checkpointer_config:
	save_thread_count: null
	load_thread_count: null
	pre_download: false
	work_dir: null
	throttle_uploads: false
	canceled_check_interval: 50
	save_interval: 1000
	save_at: null
	save_final_optim: true
	save_num_checkpoints_to_keep: 1
	save_final_unsharded_checkpoint: false
	save_interval_ephemeral: null
	save_overwrite: true
	load_path: null
	reset_optimizer_state: false
	reset_trainer_state: false
	initial_model_checkpoint: /weka/oe-training-default/chrisk/molmo/models/dense-cap-v1/captioner-siglip2-qwen3_4b/step22347
	allow_resume: true
	max_duration: 30000
	global_train_batch_size: 256
	device_train_microbatch_size: 4
	max_grad_norm: 1.0
	multi_component_grad_norm: true
	batch_divisor: global_batch
	max_grad_norm_ratio: null
	precision: amp_bf16
	wandb:
	project: molmo2-dev
	entity: prior-ai2
	group: uber-v1
	name: uber3.4-synthetic-siglip2-qwen3_4b
	tags:
	- watching
	log_artifacts: false
	rank_zero_only: true
	log_interval: 20
	allow_resume: false
	beaker_log_interval: 50
	speed_monitor:
	window_size: 20
	gpu_flops_available: null
	console_log_interval: 20
	gen1_gc_interval: 1
	compile:
	mode: default
	fullgraph: false
	dynamic: false
	backend: inductor
	activation_checkpointing: true
	fsdp:
	fsdp2: true
	precision: float
	use_orig_params: true
	wrapping_strategy: by_block_and_size
	sharding_strategy: FULL_SHARD
	hybrid_sharding_num_model_replicas: null
	softmax_auxiliary_loss: true
	softmax_auxiliary_loss_scale: 0.0001
	saliency_score_loss_wt: null
	frame_score_loss_wt: null
	frame_score_loss_type: mse
	frame_score_loss_target: 0.7
	time_limit: null
	extra_steps_after_cancel: 10
	python_profiling: false
	torch_profiling: false
	stop_at: 30000
	stop_after: null
	fused_loss: false
	compile_loss: true
	runtime_data:
	args: /gantry-runtime/launch_scripts/train_multitask_model.py 3.4-synthetic /weka/oe-training-default/chrisk/molmo/models/dense-cap-v1/captioner-siglip2-qwen3_4b
	--save_overwrite --save_interval=1000 --wandb.group=uber-v1 --wandb.name=uber3.4-synthetic-siglip2-qwen3_4b
	--save_folder=/weka/oe-training-default/sanghol/molmo/models/uber-v1/uber3.4-synthetic-siglip2-qwen3_4b
	hostname: jupiter-cs-aus-112.reviz.ai2.in
	date: 09/17/2025, 10:56
	world_size: 16
	resuming_from: /weka/oe-training-default/sanghol/molmo/models/uber-v1/uber3.4-synthetic-siglip2-qwen3_4b/step27000
	beaker_experiment_id: 01K567APCW88M30XRRXSACWXPR
	beaker_experiment_url: https://beaker.org/ex/01K567APCW88M30XRRXSACWXPR
	wandb_id: 5307t37m
	wandb_url: https://wandb.ai/prior-ai2/molmo2-dev/runs/5307t37m