andyjzhao committed on
Commit
bf5d0ee
·
verified ·
1 Parent(s): 4dcb67b

Upload hydra_cfg.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. hydra_cfg.yaml +332 -0
hydra_cfg.yaml ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ RC_augmentation: false
2
+ _dataset_cfg_lookup:
3
+ dlb_cmp_gm12878:
4
+ eval_split: validation
5
+ hf_path: jzshared/dlb_cmp_gm12878
6
+ label_key: label_ut
7
+ mask_key: mask_ut
8
+ num_workers: 0
9
+ path: data/dlb_cmp_gm12878
10
+ pin_memory: true
11
+ reference_id: hg38
12
+ sequence_format: string
13
+ sequence_key: sequence
14
+ shuffle: true
15
+ test_split: test
16
+ train_split: train
17
+ type: cmp_seq
18
+ dlb_cmp_h1hesc:
19
+ eval_split: validation
20
+ hf_path: jzshared/dlb_cmp_h1hesc
21
+ label_key: label_ut
22
+ mask_key: mask_ut
23
+ num_workers: 0
24
+ path: data/dlb_cmp_h1hesc
25
+ pin_memory: true
26
+ reference_id: hg38
27
+ sequence_format: string
28
+ sequence_key: sequence
29
+ shuffle: true
30
+ test_split: test
31
+ train_split: train
32
+ type: cmp_seq
33
+ dlb_cmp_hct116:
34
+ eval_split: validation
35
+ hf_path: jzshared/dlb_cmp_hct116
36
+ label_key: label_ut
37
+ mask_key: mask_ut
38
+ num_workers: 0
39
+ path: data/dlb_cmp_hct116
40
+ pin_memory: true
41
+ reference_id: hg38
42
+ sequence_format: string
43
+ sequence_key: sequence
44
+ shuffle: true
45
+ test_split: test
46
+ train_split: train
47
+ type: cmp_seq
48
+ dlb_cmp_hff:
49
+ eval_split: validation
50
+ hf_path: jzshared/dlb_cmp_hff
51
+ label_key: label_ut
52
+ mask_key: mask_ut
53
+ num_workers: 0
54
+ path: data/dlb_cmp_hff
55
+ pin_memory: true
56
+ reference_id: hg38
57
+ sequence_format: string
58
+ sequence_key: sequence
59
+ shuffle: true
60
+ test_split: test
61
+ train_split: train
62
+ type: cmp_seq
63
+ dlb_cmp_imr90:
64
+ eval_split: validation
65
+ hf_path: jzshared/dlb_cmp_imr90
66
+ label_key: label_ut
67
+ mask_key: mask_ut
68
+ num_workers: 0
69
+ path: data/dlb_cmp_imr90
70
+ pin_memory: true
71
+ reference_id: hg38
72
+ sequence_format: string
73
+ sequence_key: sequence
74
+ shuffle: true
75
+ test_split: test
76
+ train_split: train
77
+ type: cmp_seq
78
+ euks_refseq_region_12.8k:
79
+ hf_path: jzshared/euks_refseq_all_12p8k_merged_10m_20260302
80
+ path: data/euks_refseq_all_12p8k_merged_10m_20260302
81
+ type: refseq
82
+ gencode128k_basic:
83
+ hf_path: jzshared/gencode128k_basic
84
+ path: data/gencode128k_basic
85
+ type: refseq
86
+ gencode128k_debug:
87
+ hf_path: jzshared/gencode128k_debug
88
+ path: data/gencode128k_debug
89
+ type: refseq
90
+ gencode_human_12.8k:
91
+ hf_path: jzshared/gencode_human_12.8k
92
+ path: data/gencode_human_12.8k
93
+ type: refseq
94
+ gencode_human_128k:
95
+ hf_path: jzshared/gencode_human_128k
96
+ path: data/gencode_human_128k
97
+ type: refseq
98
+ hg38_128k:
99
+ hf_path: jzshared/hg38_cds_anchored_128000
100
+ path: data/hg38_cds_anchored_128000
101
+ type: refseq
102
+ hg38_12k:
103
+ hf_path: jzshared/hg38_12800
104
+ path: data/hg38_cds_anchored_len12800_mincds150_1000000samples
105
+ type: refseq
106
+ hg38_cds_4m:
107
+ hf_path: null
108
+ path: data/hg38_cds_dataset_4m_filtered
109
+ type: refseq
110
+ orca32m_cmp_seq:
111
+ eval_split: validation
112
+ hf_path: jzshared/orca32m_cmp
113
+ label_key: label_ut
114
+ mask_key: mask_ut
115
+ num_workers: 0
116
+ path: data/orca32m_cmp_seq
117
+ pin_memory: true
118
+ reference_id: hg38
119
+ sequence_format: string
120
+ sequence_key: sequence
121
+ shuffle: true
122
+ test_split: test
123
+ train_split: train
124
+ type: cmp_seq
125
+ _unimportant_cfg:
126
+ fields:
127
+ - gpus
128
+ - debug
129
+ - wandb
130
+ - env
131
+ - uid
132
+ - local_rank
133
+ - is_distributed
134
+ - master_port
135
+ - device_type
136
+ - cluster
137
+ - world_size
138
+ - train_dataset
139
+ - eval_datasets
140
+ - user_cfg
141
+ - rank
142
+ - device
143
+ - hf_access_token
144
+ - hf_private
145
+ - hf_repo
146
+ - hf_user
147
+ - hf_token
148
+ - save_every
149
+ - eval_steps
150
+ - save_steps
151
+ - upload_to_hf
152
+ - logging
153
+ - log_every
154
+ - use_wandb
155
+ - project_root
156
+ - version
157
+ postfix:
158
+ - _path
159
+ - _file
160
+ - _dir
161
+ - _alias
162
+ - _prefix
163
+ prefix:
164
+ - _
165
+ add_special_tokens: true
166
+ alias: Gencode-MxDNA
167
+ arch: hnet
168
+ batch_size: 8
169
+ bidirectional_strategy: mean
170
+ cluster: mila
171
+ cmd: python src/scripts/rebuttal/train_mlm.py exp=rebuttal/mlm data=gencode_human_12.8k
172
+ model=hnet/mamba_64m max_len=12800 batch_size=8 eval_batch_size=1 grad_acc_steps=4
173
+ train_steps=7650 eval_steps=125 save_steps=750 log_every=2 num_valid_samples=3000
174
+ upload_to_hf=true wandb.project=DNAFM_v2 tokenizer=mxdna alias=Gencode-MxDNA use_wandb=true
175
+ hf_repo=jzshared/Gencode-MxDNA
176
+ config_path: null
177
+ data: gencode_human_12.8k
178
+ data_alias: ${.data}_${max_len}
179
+ dataset: ${_dataset_cfg_lookup[${data}]}
180
+ dataset_sequence_key: sequence
181
+ device: cuda
182
+ device_type: GPU
183
+ dirs:
184
+ data_cache: ${project_root}/data_cache/
185
+ data_storage: ${project_root}/data/
186
+ hydra: ${project_root}/temp/hydra/
187
+ output: ${project_root}/output/${data_alias}/${alias}/
188
+ temp: ${project_root}/temp/working_dir/${uid}/
189
+ wandb_cache: ${oc.env:WANDB_CACHE_DIR,${project_root}/temp/wandb_cache/}
190
+ epochs: 200
191
+ eval_batch_size: 1
192
+ eval_steps: 125
193
+ grad_acc_steps: 4
194
+ hf_private: false
195
+ hf_repo: jzshared/Gencode-MxDNA
196
+ hf_user: jzshared
197
+ is_distributed: true
198
+ local_rank: 0
199
+ log_every: 2
200
+ logging:
201
+ level: info
202
+ log_wandb_metric_to_stdout: true
203
+ lr: 0.001
204
+ mask_replace_prob: 0.8
205
+ master_port: '46807'
206
+ max_data_samples: null
207
+ max_grad_norm: 2.0
208
+ max_len: 12800
209
+ max_length: ${max_len}
210
+ max_routing_tokens: 0
211
+ max_train_steps: ${train_steps}
212
+ min_routing_tokens: 8
213
+ mixed_precision: bf16
214
+ mlm_probability: 0.15
215
+ mode: Formal
216
+ model:
217
+ arch: hnet
218
+ name: hnet_mamba_64m
219
+ model_alias: ${oc.select:model.name,UnknownModel}
220
+ model_cfg:
221
+ arch_layout:
222
+ - m4
223
+ - - m15
224
+ - m4
225
+ attn_cfg:
226
+ num_heads:
227
+ - 8
228
+ - 12
229
+ rotary_emb_dim:
230
+ - 16
231
+ - 24
232
+ window_size:
233
+ - 511
234
+ - -1
235
+ d_intermediate:
236
+ - 0
237
+ - 2048
238
+ d_model:
239
+ - 512
240
+ - 768
241
+ max_routing_tokens: ${max_routing_tokens}
242
+ min_routing_tokens: ${min_routing_tokens}
243
+ n_gpt: 1.0
244
+ r_hi: ${r_hi}
245
+ r_low: ${r_low}
246
+ ssm_cfg:
247
+ chunk_size: 256
248
+ d_conv: 4
249
+ d_state: 64
250
+ expand: 2
251
+ head_dim: 64
252
+ tie_embeddings: true
253
+ vocab_size: 9
254
+ mxdna_tokenizer_vocab_path: src/scripts/rebuttal/assets/mxdna_1mertokenizer/vocab.txt
255
+ name: hnet_base
256
+ num_test_samples: 0
257
+ num_train_samples: 0
258
+ num_valid_samples: 3000
259
+ project_root: ${hydra:runtime.cwd}
260
+ r_hi: 0.3
261
+ r_low: 0.0
262
+ random_replace_prob: 0.1
263
+ random_truncate: false
264
+ rank: 0
265
+ reference_loss: null
266
+ save_steps: 750
267
+ seed: 0
268
+ source: ${dataset.type}
269
+ tokenizer: mxdna
270
+ tokenizer_cache_dir: ${dirs.data_cache}/hf_tokenizers
271
+ tokenizer_max_length: null
272
+ tokenizer_name_or_path: null
273
+ tokenizer_name_or_path_resolved: /gpfs/scratch/guoh/DNAFM/src/scripts/rebuttal/assets/mxdna_1mertokenizer/vocab.txt
274
+ tokenizer_pad_to_multiple_of: null
275
+ tokenizer_trust_remote_code: false
276
+ tokenizer_use_fast: true
277
+ tokenizer_vocab_size: 9
278
+ train_steps: 7650
279
+ training:
280
+ adam_beta1: 0.9
281
+ adam_beta2: 0.95
282
+ bf16: true
283
+ dataloader_drop_last: true
284
+ dataloader_num_workers: 1
285
+ disable_tqdm: false
286
+ do_train: true
287
+ eval_steps: ${eval_steps}
288
+ eval_strategy: steps
289
+ gradient_accumulation_steps: ${grad_acc_steps}
290
+ gradient_checkpointing: false
291
+ group_by_length: false
292
+ hnet_initializer_range: 0.02
293
+ hnet_lr_multiplier: null
294
+ label_names:
295
+ - labels
296
+ learning_rate: ${lr}
297
+ logging_steps: ${log_every}
298
+ lr_scheduler_type: linear
299
+ max_grad_norm: ${max_grad_norm}
300
+ max_train_steps: ${max_train_steps}
301
+ num_train_epochs: ${epochs}
302
+ output_dir: ${dirs.output}
303
+ overrides: {}
304
+ per_device_eval_batch_size: ${eval_batch_size}
305
+ per_device_train_batch_size: ${batch_size}
306
+ remove_unused_columns: false
307
+ report_to: null
308
+ resume_from_checkpoint: null
309
+ save_steps: ${save_steps}
310
+ save_strategy: steps
311
+ use_lr_multiplier: true
312
+ warmup_steps: 500
313
+ weight_decay: 0.1
314
+ training_alias: mlm_${tokenizer}_lr${lr}_${train_steps}steps_ms${max_train_steps}_maxlen${max_len}
315
+ uid: ywrwxmjk
316
+ upload_to_hf: true
317
+ use_routing_ceiling: false
318
+ use_routing_floor: true
319
+ use_wandb: true
320
+ valid_test_downsample: null
321
+ version: NA
322
+ wandb:
323
+ dir: ${dirs.wandb_cache}
324
+ entity: ${oc.select:env.vars.wandb_entity,${oc.env:WANDB_ENTITY,null}}
325
+ id: ywrwxmjk
326
+ mode: online
327
+ name: Gencode-MxDNA
328
+ project: DNAFM_v2
329
+ step_metric: null
330
+ tags: []
331
+ url: https://wandb.ai/jzshared/DNAFM_v2/runs/ywrwxmjk
332
+ world_size: 8