# Upload hydra_cfg.yaml with huggingface_hub

# bf5d0ee verified 23 days ago

# 7.88 kB

	RC_augmentation: false
	_dataset_cfg_lookup:
	dlb_cmp_gm12878:
	eval_split: validation
	hf_path: jzshared/dlb_cmp_gm12878
	label_key: label_ut
	mask_key: mask_ut
	num_workers: 0
	path: data/dlb_cmp_gm12878
	pin_memory: true
	reference_id: hg38
	sequence_format: string
	sequence_key: sequence
	shuffle: true
	test_split: test
	train_split: train
	type: cmp_seq
	dlb_cmp_h1hesc:
	eval_split: validation
	hf_path: jzshared/dlb_cmp_h1hesc
	label_key: label_ut
	mask_key: mask_ut
	num_workers: 0
	path: data/dlb_cmp_h1hesc
	pin_memory: true
	reference_id: hg38
	sequence_format: string
	sequence_key: sequence
	shuffle: true
	test_split: test
	train_split: train
	type: cmp_seq
	dlb_cmp_hct116:
	eval_split: validation
	hf_path: jzshared/dlb_cmp_hct116
	label_key: label_ut
	mask_key: mask_ut
	num_workers: 0
	path: data/dlb_cmp_hct116
	pin_memory: true
	reference_id: hg38
	sequence_format: string
	sequence_key: sequence
	shuffle: true
	test_split: test
	train_split: train
	type: cmp_seq
	dlb_cmp_hff:
	eval_split: validation
	hf_path: jzshared/dlb_cmp_hff
	label_key: label_ut
	mask_key: mask_ut
	num_workers: 0
	path: data/dlb_cmp_hff
	pin_memory: true
	reference_id: hg38
	sequence_format: string
	sequence_key: sequence
	shuffle: true
	test_split: test
	train_split: train
	type: cmp_seq
	dlb_cmp_imr90:
	eval_split: validation
	hf_path: jzshared/dlb_cmp_imr90
	label_key: label_ut
	mask_key: mask_ut
	num_workers: 0
	path: data/dlb_cmp_imr90
	pin_memory: true
	reference_id: hg38
	sequence_format: string
	sequence_key: sequence
	shuffle: true
	test_split: test
	train_split: train
	type: cmp_seq
	euks_refseq_region_12.8k:
	hf_path: jzshared/euks_refseq_all_12p8k_merged_10m_20260302
	path: data/euks_refseq_all_12p8k_merged_10m_20260302
	type: refseq
	gencode128k_basic:
	hf_path: jzshared/gencode128k_basic
	path: data/gencode128k_basic
	type: refseq
	gencode128k_debug:
	hf_path: jzshared/gencode128k_debug
	path: data/gencode128k_debug
	type: refseq
	gencode_human_12.8k:
	hf_path: jzshared/gencode_human_12.8k
	path: data/gencode_human_12.8k
	type: refseq
	gencode_human_128k:
	hf_path: jzshared/gencode_human_128k
	path: data/gencode_human_128k
	type: refseq
	hg38_128k:
	hf_path: jzshared/hg38_cds_anchored_128000
	path: data/hg38_cds_anchored_128000
	type: refseq
	hg38_12k:
	hf_path: jzshared/hg38_12800
	path: data/hg38_cds_anchored_len12800_mincds150_1000000samples
	type: refseq
	hg38_cds_4m:
	hf_path: null
	path: data/hg38_cds_dataset_4m_filtered
	type: refseq
	orca32m_cmp_seq:
	eval_split: validation
	hf_path: jzshared/orca32m_cmp
	label_key: label_ut
	mask_key: mask_ut
	num_workers: 0
	path: data/orca32m_cmp_seq
	pin_memory: true
	reference_id: hg38
	sequence_format: string
	sequence_key: sequence
	shuffle: true
	test_split: test
	train_split: train
	type: cmp_seq
	_unimportant_cfg:
	fields:
	- gpus
	- debug
	- wandb
	- env
	- uid
	- local_rank
	- is_distributed
	- master_port
	- device_type
	- cluster
	- world_size
	- train_dataset
	- eval_datasets
	- user_cfg
	- rank
	- device
	- hf_access_token
	- hf_private
	- hf_repo
	- hf_user
	- hf_token
	- save_every
	- eval_steps
	- save_steps
	- upload_to_hf
	- logging
	- log_every
	- use_wandb
	- project_root
	- version
	postfix:
	- _path
	- _file
	- _dir
	- _alias
	- _prefix
	prefix:
	- _
	add_special_tokens: true
	alias: Gencode-MxDNA
	arch: hnet
	batch_size: 8
	bidirectional_strategy: mean
	cluster: mila
	cmd: python src/scripts/rebuttal/train_mlm.py exp=rebuttal/mlm data=gencode_human_12.8k
	model=hnet/mamba_64m max_len=12800 batch_size=8 eval_batch_size=1 grad_acc_steps=4
	train_steps=7650 eval_steps=125 save_steps=750 log_every=2 num_valid_samples=3000
	upload_to_hf=true wandb.project=DNAFM_v2 tokenizer=mxdna alias=Gencode-MxDNA use_wandb=true
	hf_repo=jzshared/Gencode-MxDNA
	config_path: null
	data: gencode_human_12.8k
	data_alias: ${.data}_${max_len}
	dataset: ${_dataset_cfg_lookup[${data}]}
	dataset_sequence_key: sequence
	device: cuda
	device_type: GPU
	dirs:
	data_cache: ${project_root}/data_cache/
	data_storage: ${project_root}/data/
	hydra: ${project_root}/temp/hydra/
	output: ${project_root}/output/${data_alias}/${alias}/
	temp: ${project_root}/temp/working_dir/${uid}/
	wandb_cache: ${oc.env:WANDB_CACHE_DIR,${project_root}/temp/wandb_cache/}
	epochs: 200
	eval_batch_size: 1
	eval_steps: 125
	grad_acc_steps: 4
	hf_private: false
	hf_repo: jzshared/Gencode-MxDNA
	hf_user: jzshared
	is_distributed: true
	local_rank: 0
	log_every: 2
	logging:
	level: info
	log_wandb_metric_to_stdout: true
	lr: 0.001
	mask_replace_prob: 0.8
	master_port: '46807'
	max_data_samples: null
	max_grad_norm: 2.0
	max_len: 12800
	max_length: ${max_len}
	max_routing_tokens: 0
	max_train_steps: ${train_steps}
	min_routing_tokens: 8
	mixed_precision: bf16
	mlm_probability: 0.15
	mode: Formal
	model:
	arch: hnet
	name: hnet_mamba_64m
	model_alias: ${oc.select:model.name,UnknownModel}
	model_cfg:
	arch_layout:
	- m4
	- - m15
	- m4
	attn_cfg:
	num_heads:
	- 8
	- 12
	rotary_emb_dim:
	- 16
	- 24
	window_size:
	- 511
	- -1
	d_intermediate:
	- 0
	- 2048
	d_model:
	- 512
	- 768
	max_routing_tokens: ${max_routing_tokens}
	min_routing_tokens: ${min_routing_tokens}
	n_gpt: 1.0
	r_hi: ${r_hi}
	r_low: ${r_low}
	ssm_cfg:
	chunk_size: 256
	d_conv: 4
	d_state: 64
	expand: 2
	head_dim: 64
	tie_embeddings: true
	vocab_size: 9
	mxdna_tokenizer_vocab_path: src/scripts/rebuttal/assets/mxdna_1mertokenizer/vocab.txt
	name: hnet_base
	num_test_samples: 0
	num_train_samples: 0
	num_valid_samples: 3000
	project_root: ${hydra:runtime.cwd}
	r_hi: 0.3
	r_low: 0.0
	random_replace_prob: 0.1
	random_truncate: false
	rank: 0
	reference_loss: null
	save_steps: 750
	seed: 0
	source: ${dataset.type}
	tokenizer: mxdna
	tokenizer_cache_dir: ${dirs.data_cache}/hf_tokenizers
	tokenizer_max_length: null
	tokenizer_name_or_path: null
	tokenizer_name_or_path_resolved: /gpfs/scratch/guoh/DNAFM/src/scripts/rebuttal/assets/mxdna_1mertokenizer/vocab.txt
	tokenizer_pad_to_multiple_of: null
	tokenizer_trust_remote_code: false
	tokenizer_use_fast: true
	tokenizer_vocab_size: 9
	train_steps: 7650
	training:
	adam_beta1: 0.9
	adam_beta2: 0.95
	bf16: true
	dataloader_drop_last: true
	dataloader_num_workers: 1
	disable_tqdm: false
	do_train: true
	eval_steps: ${eval_steps}
	eval_strategy: steps
	gradient_accumulation_steps: ${grad_acc_steps}
	gradient_checkpointing: false
	group_by_length: false
	hnet_initializer_range: 0.02
	hnet_lr_multiplier: null
	label_names:
	- labels
	learning_rate: ${lr}
	logging_steps: ${log_every}
	lr_scheduler_type: linear
	max_grad_norm: ${max_grad_norm}
	max_train_steps: ${max_train_steps}
	num_train_epochs: ${epochs}
	output_dir: ${dirs.output}
	overrides: {}
	per_device_eval_batch_size: ${eval_batch_size}
	per_device_train_batch_size: ${batch_size}
	remove_unused_columns: false
	report_to: null
	resume_from_checkpoint: null
	save_steps: ${save_steps}
	save_strategy: steps
	use_lr_multiplier: true
	warmup_steps: 500
	weight_decay: 0.1
	training_alias: mlm_${tokenizer}_lr${lr}_${train_steps}steps_ms${max_train_steps}_maxlen${max_len}
	uid: ywrwxmjk
	upload_to_hf: true
	use_routing_ceiling: false
	use_routing_floor: true
	use_wandb: true
	valid_test_downsample: null
	version: NA
	wandb:
	dir: ${dirs.wandb_cache}
	entity: ${oc.select:env.vars.wandb_entity,${oc.env:WANDB_ENTITY,null}}
	id: ywrwxmjk
	mode: online
	name: Gencode-MxDNA
	project: DNAFM_v2
	step_metric: null
	tags: []
	url: https://wandb.ai/jzshared/DNAFM_v2/runs/ywrwxmjk
	world_size: 8