---
# Run-configuration snapshot for the "Gencode-MxDNA" MLM training run; the
# launch command that produced it is recorded under `cmd`.
#
# Values written as ${...} are interpolations that are left unresolved in this
# file (presumably OmegaConf/Hydra, given the `oc.env`, `oc.select` and
# `hydra:` resolvers -- confirm against the loader).
#
# Keys are kept in sorted order at every level, as in the original dump.

RC_augmentation: false

# Registry of dataset definitions keyed by dataset name. The top-level
# `dataset` key selects one entry via ${_dataset_cfg_lookup[${data}]}.
_dataset_cfg_lookup:
  dlb_cmp_gm12878:
    eval_split: validation
    hf_path: jzshared/dlb_cmp_gm12878
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_gm12878
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
  dlb_cmp_h1hesc:
    eval_split: validation
    hf_path: jzshared/dlb_cmp_h1hesc
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_h1hesc
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
  dlb_cmp_hct116:
    eval_split: validation
    hf_path: jzshared/dlb_cmp_hct116
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_hct116
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
  dlb_cmp_hff:
    eval_split: validation
    hf_path: jzshared/dlb_cmp_hff
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_hff
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
  dlb_cmp_imr90:
    eval_split: validation
    hf_path: jzshared/dlb_cmp_imr90
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_imr90
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
  euks_refseq_region_12.8k:
    hf_path: jzshared/euks_refseq_all_12p8k_merged_10m_20260302
    path: data/euks_refseq_all_12p8k_merged_10m_20260302
    type: refseq
  gencode128k_basic:
    hf_path: jzshared/gencode128k_basic
    path: data/gencode128k_basic
    type: refseq
  gencode128k_debug:
    hf_path: jzshared/gencode128k_debug
    path: data/gencode128k_debug
    type: refseq
  gencode_human_12.8k:
    hf_path: jzshared/gencode_human_12.8k
    path: data/gencode_human_12.8k
    type: refseq
  gencode_human_128k:
    hf_path: jzshared/gencode_human_128k
    path: data/gencode_human_128k
    type: refseq
  hg38_128k:
    hf_path: jzshared/hg38_cds_anchored_128000
    path: data/hg38_cds_anchored_128000
    type: refseq
  hg38_12k:
    hf_path: jzshared/hg38_12800
    path: data/hg38_cds_anchored_len12800_mincds150_1000000samples
    type: refseq
  hg38_cds_4m:
    # The only entry with no `hf_path`; it carries a local `path` only.
    hf_path: null
    path: data/hg38_cds_dataset_4m_filtered
    type: refseq
  orca32m_cmp_seq:
    eval_split: validation
    hf_path: jzshared/orca32m_cmp
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/orca32m_cmp_seq
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq

# Key selectors: exact names (`fields`) plus name suffixes (`postfix`) and
# prefixes (`prefix`).
# NOTE(review): presumably these mark keys to ignore when comparing or
# identifying runs -- confirm against the code that reads `_unimportant_cfg`.
_unimportant_cfg:
  fields:
    - gpus
    - debug
    - wandb
    - env
    - uid
    - local_rank
    - is_distributed
    - master_port
    - device_type
    - cluster
    - world_size
    - train_dataset
    - eval_datasets
    - user_cfg
    - rank
    - device
    - hf_access_token
    - hf_private
    - hf_repo
    - hf_user
    - hf_token
    - save_every
    - eval_steps
    - save_steps
    - upload_to_hf
    - logging
    - log_every
    - use_wandb
    - project_root
    - version
  postfix:
    - _path
    - _file
    - _dir
    - _alias
    - _prefix
  prefix:
    - _

add_special_tokens: true
alias: Gencode-MxDNA
arch: hnet
batch_size: 8
bidirectional_strategy: mean
cluster: mila
# Launch command, stored as one space-separated string. The folded `>-`
# scalar joins the lines below with single spaces and adds no trailing
# newline.
cmd: >-
  python src/scripts/rebuttal/train_mlm.py
  exp=rebuttal/mlm
  data=gencode_human_12.8k
  model=hnet/mamba_64m
  max_len=12800
  batch_size=8
  eval_batch_size=1
  grad_acc_steps=4
  train_steps=7650
  eval_steps=125
  save_steps=750
  log_every=2
  num_valid_samples=3000
  upload_to_hf=true
  wandb.project=DNAFM_v2
  tokenizer=mxdna
  alias=Gencode-MxDNA
  use_wandb=true
  hf_repo=jzshared/Gencode-MxDNA
config_path: null
data: gencode_human_12.8k
data_alias: ${.data}_${max_len}
dataset: ${_dataset_cfg_lookup[${data}]}
dataset_sequence_key: sequence
device: cuda
device_type: GPU

# Working directories, all derived from `project_root`.
dirs:
  data_cache: ${project_root}/data_cache/
  data_storage: ${project_root}/data/
  hydra: ${project_root}/temp/hydra/
  output: ${project_root}/output/${data_alias}/${alias}/
  temp: ${project_root}/temp/working_dir/${uid}/
  # Uses the WANDB_CACHE_DIR environment variable when set, otherwise the
  # project-local default given after the comma.
  wandb_cache: ${oc.env:WANDB_CACHE_DIR,${project_root}/temp/wandb_cache/}

epochs: 200
eval_batch_size: 1
eval_steps: 125
grad_acc_steps: 4
hf_private: false
hf_repo: jzshared/Gencode-MxDNA
hf_user: jzshared
is_distributed: true
local_rank: 0
log_every: 2

logging:
  level: info
  log_wandb_metric_to_stdout: true

lr: 0.001
mask_replace_prob: 0.8
# Quoted on purpose: the port is stored as a string, not an integer.
master_port: '46807'
max_data_samples: null
max_grad_norm: 2.0
max_len: 12800
max_length: ${max_len}
max_routing_tokens: 0
max_train_steps: ${train_steps}
min_routing_tokens: 8
mixed_precision: bf16
mlm_probability: 0.15
mode: Formal

model:
  arch: hnet
  name: hnet_mamba_64m

model_alias: ${oc.select:model.name,UnknownModel}

model_cfg:
  # NOTE(review): nesting reconstructed as [m4, [m15], m4] from a flattened
  # dump; the flattened text would also permit [m4, [m15, m4]]. Confirm
  # against the model definition before relying on it.
  arch_layout:
    - m4
    - - m15
    - m4
  attn_cfg:
    num_heads:
      - 8
      - 12
    rotary_emb_dim:
      - 16
      - 24
    window_size:
      - 511
      - -1
  d_intermediate:
    - 0
    - 2048
  d_model:
    - 512
    - 768
  max_routing_tokens: ${max_routing_tokens}
  min_routing_tokens: ${min_routing_tokens}
  n_gpt: 1.0
  r_hi: ${r_hi}
  r_low: ${r_low}
  ssm_cfg:
    chunk_size: 256
    d_conv: 4
    d_state: 64
    expand: 2
    head_dim: 64
  tie_embeddings: true
  vocab_size: 9

mxdna_tokenizer_vocab_path: src/scripts/rebuttal/assets/mxdna_1mertokenizer/vocab.txt
name: hnet_base
num_test_samples: 0
num_train_samples: 0
num_valid_samples: 3000
project_root: ${hydra:runtime.cwd}
r_hi: 0.3
r_low: 0.0
random_replace_prob: 0.1
random_truncate: false
rank: 0
reference_loss: null
save_steps: 750
seed: 0
source: ${dataset.type}
tokenizer: mxdna
tokenizer_cache_dir: ${dirs.data_cache}/hf_tokenizers
tokenizer_max_length: null
tokenizer_name_or_path: null
# Absolute path recorded at run time; it will not exist on other machines.
tokenizer_name_or_path_resolved: /gpfs/scratch/guoh/DNAFM/src/scripts/rebuttal/assets/mxdna_1mertokenizer/vocab.txt
tokenizer_pad_to_multiple_of: null
tokenizer_trust_remote_code: false
tokenizer_use_fast: true
tokenizer_vocab_size: 9
train_steps: 7650

# Trainer arguments. Most entries forward the top-level knobs above through
# interpolation rather than repeating literal values.
training:
  adam_beta1: 0.9
  adam_beta2: 0.95
  bf16: true
  dataloader_drop_last: true
  dataloader_num_workers: 1
  disable_tqdm: false
  do_train: true
  eval_steps: ${eval_steps}
  eval_strategy: steps
  gradient_accumulation_steps: ${grad_acc_steps}
  gradient_checkpointing: false
  group_by_length: false
  hnet_initializer_range: 0.02
  hnet_lr_multiplier: null
  label_names:
    - labels
  learning_rate: ${lr}
  logging_steps: ${log_every}
  lr_scheduler_type: linear
  max_grad_norm: ${max_grad_norm}
  max_train_steps: ${max_train_steps}
  num_train_epochs: ${epochs}
  output_dir: ${dirs.output}
  overrides: {}
  per_device_eval_batch_size: ${eval_batch_size}
  per_device_train_batch_size: ${batch_size}
  remove_unused_columns: false
  report_to: null
  resume_from_checkpoint: null
  save_steps: ${save_steps}
  save_strategy: steps
  use_lr_multiplier: true
  warmup_steps: 500
  weight_decay: 0.1

training_alias: mlm_${tokenizer}_lr${lr}_${train_steps}steps_ms${max_train_steps}_maxlen${max_len}
uid: ywrwxmjk
upload_to_hf: true
use_routing_ceiling: false
use_routing_floor: true
use_wandb: true
valid_test_downsample: null
version: NA

# Weights & Biases run metadata. `id` matches the top-level `uid`.
wandb:
  dir: ${dirs.wandb_cache}
  entity: ${oc.select:env.vars.wandb_entity,${oc.env:WANDB_ENTITY,null}}
  id: ywrwxmjk
  mode: online
  name: Gencode-MxDNA
  project: DNAFM_v2
  step_metric: null
  tags: []
  url: https://wandb.ai/jzshared/DNAFM_v2/runs/ywrwxmjk

world_size: 8