| RC_augmentation: false |
| _dataset_cfg_lookup: |
| dlb_cmp_gm12878: |
| eval_split: validation |
| hf_path: jzshared/dlb_cmp_gm12878 |
| label_key: label_ut |
| mask_key: mask_ut |
| num_workers: 0 |
| path: data/dlb_cmp_gm12878 |
| pin_memory: true |
| reference_id: hg38 |
| sequence_format: string |
| sequence_key: sequence |
| shuffle: true |
| test_split: test |
| train_split: train |
| type: cmp_seq |
| dlb_cmp_h1hesc: |
| eval_split: validation |
| hf_path: jzshared/dlb_cmp_h1hesc |
| label_key: label_ut |
| mask_key: mask_ut |
| num_workers: 0 |
| path: data/dlb_cmp_h1hesc |
| pin_memory: true |
| reference_id: hg38 |
| sequence_format: string |
| sequence_key: sequence |
| shuffle: true |
| test_split: test |
| train_split: train |
| type: cmp_seq |
| dlb_cmp_hct116: |
| eval_split: validation |
| hf_path: jzshared/dlb_cmp_hct116 |
| label_key: label_ut |
| mask_key: mask_ut |
| num_workers: 0 |
| path: data/dlb_cmp_hct116 |
| pin_memory: true |
| reference_id: hg38 |
| sequence_format: string |
| sequence_key: sequence |
| shuffle: true |
| test_split: test |
| train_split: train |
| type: cmp_seq |
| dlb_cmp_hff: |
| eval_split: validation |
| hf_path: jzshared/dlb_cmp_hff |
| label_key: label_ut |
| mask_key: mask_ut |
| num_workers: 0 |
| path: data/dlb_cmp_hff |
| pin_memory: true |
| reference_id: hg38 |
| sequence_format: string |
| sequence_key: sequence |
| shuffle: true |
| test_split: test |
| train_split: train |
| type: cmp_seq |
| dlb_cmp_imr90: |
| eval_split: validation |
| hf_path: jzshared/dlb_cmp_imr90 |
| label_key: label_ut |
| mask_key: mask_ut |
| num_workers: 0 |
| path: data/dlb_cmp_imr90 |
| pin_memory: true |
| reference_id: hg38 |
| sequence_format: string |
| sequence_key: sequence |
| shuffle: true |
| test_split: test |
| train_split: train |
| type: cmp_seq |
| euks_refseq_region_12.8k: |
| hf_path: jzshared/euks_refseq_all_12p8k_merged_10m_20260302 |
| path: data/euks_refseq_all_12p8k_merged_10m_20260302 |
| type: refseq |
| gencode128k_basic: |
| hf_path: jzshared/gencode128k_basic |
| path: data/gencode128k_basic |
| type: refseq |
| gencode128k_debug: |
| hf_path: jzshared/gencode128k_debug |
| path: data/gencode128k_debug |
| type: refseq |
| gencode_human_12.8k: |
| hf_path: jzshared/gencode_human_12.8k |
| path: data/gencode_human_12.8k |
| type: refseq |
| gencode_human_128k: |
| hf_path: jzshared/gencode_human_128k |
| path: data/gencode_human_128k |
| type: refseq |
| hg38_128k: |
| hf_path: jzshared/hg38_cds_anchored_128000 |
| path: data/hg38_cds_anchored_128000 |
| type: refseq |
| hg38_12k: |
| hf_path: jzshared/hg38_12800 |
| path: data/hg38_cds_anchored_len12800_mincds150_1000000samples |
| type: refseq |
| hg38_cds_4m: |
| hf_path: null |
| path: data/hg38_cds_dataset_4m_filtered |
| type: refseq |
| orca32m_cmp_seq: |
| eval_split: validation |
| hf_path: jzshared/orca32m_cmp |
| label_key: label_ut |
| mask_key: mask_ut |
| num_workers: 0 |
| path: data/orca32m_cmp_seq |
| pin_memory: true |
| reference_id: hg38 |
| sequence_format: string |
| sequence_key: sequence |
| shuffle: true |
| test_split: test |
| train_split: train |
| type: cmp_seq |
| _unimportant_cfg: |
| fields: |
| - gpus |
| - debug |
| - wandb |
| - env |
| - uid |
| - local_rank |
| - is_distributed |
| - master_port |
| - device_type |
| - cluster |
| - world_size |
| - train_dataset |
| - eval_datasets |
| - user_cfg |
| - rank |
| - device |
| - hf_access_token |
| - hf_private |
| - hf_repo |
| - hf_user |
| - hf_token |
| - save_every |
| - eval_steps |
| - save_steps |
| - upload_to_hf |
| - logging |
| - log_every |
| - use_wandb |
| - project_root |
| - version |
| postfix: |
| - _path |
| - _file |
| - _dir |
| - _alias |
| - _prefix |
| prefix: |
| - _ |
| add_special_tokens: true |
| alias: Gencode-MxDNA |
| arch: hnet |
| batch_size: 8 |
| bidirectional_strategy: mean |
| cluster: mila |
| cmd: python src/scripts/rebuttal/train_mlm.py exp=rebuttal/mlm data=gencode_human_12.8k |
| model=hnet/mamba_64m max_len=12800 batch_size=8 eval_batch_size=1 grad_acc_steps=4 |
| train_steps=7650 eval_steps=125 save_steps=750 log_every=2 num_valid_samples=3000 |
| upload_to_hf=true wandb.project=DNAFM_v2 tokenizer=mxdna alias=Gencode-MxDNA use_wandb=true |
| hf_repo=jzshared/Gencode-MxDNA |
| config_path: null |
| data: gencode_human_12.8k |
| data_alias: ${.data}_${max_len} |
| dataset: ${_dataset_cfg_lookup[${data}]} |
| dataset_sequence_key: sequence |
| device: cuda |
| device_type: GPU |
| dirs: |
| data_cache: ${project_root}/data_cache/ |
| data_storage: ${project_root}/data/ |
| hydra: ${project_root}/temp/hydra/ |
| output: ${project_root}/output/${data_alias}/${alias}/ |
| temp: ${project_root}/temp/working_dir/${uid}/ |
| wandb_cache: ${oc.env:WANDB_CACHE_DIR,${project_root}/temp/wandb_cache/} |
| epochs: 200 |
| eval_batch_size: 1 |
| eval_steps: 125 |
| grad_acc_steps: 4 |
| hf_private: false |
| hf_repo: jzshared/Gencode-MxDNA |
| hf_user: jzshared |
| is_distributed: true |
| local_rank: 0 |
| log_every: 2 |
| logging: |
| level: info |
| log_wandb_metric_to_stdout: true |
| lr: 0.001 |
| mask_replace_prob: 0.8 |
| master_port: '46807' |
| max_data_samples: null |
| max_grad_norm: 2.0 |
| max_len: 12800 |
| max_length: ${max_len} |
| max_routing_tokens: 0 |
| max_train_steps: ${train_steps} |
| min_routing_tokens: 8 |
| mixed_precision: bf16 |
| mlm_probability: 0.15 |
| mode: Formal |
| model: |
| arch: hnet |
| name: hnet_mamba_64m |
| model_alias: ${oc.select:model.name,UnknownModel} |
| model_cfg: |
| arch_layout: |
| - m4 |
| - - m15 |
| - m4 |
| attn_cfg: |
| num_heads: |
| - 8 |
| - 12 |
| rotary_emb_dim: |
| - 16 |
| - 24 |
| window_size: |
| - 511 |
| - -1 |
| d_intermediate: |
| - 0 |
| - 2048 |
| d_model: |
| - 512 |
| - 768 |
| max_routing_tokens: ${max_routing_tokens} |
| min_routing_tokens: ${min_routing_tokens} |
| n_gpt: 1.0 |
| r_hi: ${r_hi} |
| r_low: ${r_low} |
| ssm_cfg: |
| chunk_size: 256 |
| d_conv: 4 |
| d_state: 64 |
| expand: 2 |
| head_dim: 64 |
| tie_embeddings: true |
| vocab_size: 9 |
| mxdna_tokenizer_vocab_path: src/scripts/rebuttal/assets/mxdna_1mertokenizer/vocab.txt |
| name: hnet_base |
| num_test_samples: 0 |
| num_train_samples: 0 |
| num_valid_samples: 3000 |
| project_root: ${hydra:runtime.cwd} |
| r_hi: 0.3 |
| r_low: 0.0 |
| random_replace_prob: 0.1 |
| random_truncate: false |
| rank: 0 |
| reference_loss: null |
| save_steps: 750 |
| seed: 0 |
| source: ${dataset.type} |
| tokenizer: mxdna |
| tokenizer_cache_dir: ${dirs.data_cache}/hf_tokenizers |
| tokenizer_max_length: null |
| tokenizer_name_or_path: null |
| tokenizer_name_or_path_resolved: /gpfs/scratch/guoh/DNAFM/src/scripts/rebuttal/assets/mxdna_1mertokenizer/vocab.txt |
| tokenizer_pad_to_multiple_of: null |
| tokenizer_trust_remote_code: false |
| tokenizer_use_fast: true |
| tokenizer_vocab_size: 9 |
| train_steps: 7650 |
| training: |
| adam_beta1: 0.9 |
| adam_beta2: 0.95 |
| bf16: true |
| dataloader_drop_last: true |
| dataloader_num_workers: 1 |
| disable_tqdm: false |
| do_train: true |
| eval_steps: ${eval_steps} |
| eval_strategy: steps |
| gradient_accumulation_steps: ${grad_acc_steps} |
| gradient_checkpointing: false |
| group_by_length: false |
| hnet_initializer_range: 0.02 |
| hnet_lr_multiplier: null |
| label_names: |
| - labels |
| learning_rate: ${lr} |
| logging_steps: ${log_every} |
| lr_scheduler_type: linear |
| max_grad_norm: ${max_grad_norm} |
| max_train_steps: ${max_train_steps} |
| num_train_epochs: ${epochs} |
| output_dir: ${dirs.output} |
| overrides: {} |
| per_device_eval_batch_size: ${eval_batch_size} |
| per_device_train_batch_size: ${batch_size} |
| remove_unused_columns: false |
| report_to: null |
| resume_from_checkpoint: null |
| save_steps: ${save_steps} |
| save_strategy: steps |
| use_lr_multiplier: true |
| warmup_steps: 500 |
| weight_decay: 0.1 |
| training_alias: mlm_${tokenizer}_lr${lr}_${train_steps}steps_ms${max_train_steps}_maxlen${max_len} |
| uid: ywrwxmjk |
| upload_to_hf: true |
| use_routing_ceiling: false |
| use_routing_floor: true |
| use_wandb: true |
| valid_test_downsample: null |
| version: NA |
| wandb: |
| dir: ${dirs.wandb_cache} |
| entity: ${oc.select:env.vars.wandb_entity,${oc.env:WANDB_ENTITY,null}} |
| id: ywrwxmjk |
| mode: online |
| name: Gencode-MxDNA |
| project: DNAFM_v2 |
| step_metric: null |
| tags: [] |
| url: https://wandb.ai/jzshared/DNAFM_v2/runs/ywrwxmjk |
| world_size: 8 |
|
|