| |
| seed_everything: 42 |
| trainer: |
| accelerator: auto |
| strategy: auto |
| devices: auto |
| num_nodes: 1 |
| precision: bf16-mixed |
| callbacks: |
| - class_path: lightning.pytorch.callbacks.ModelCheckpoint |
| init_args: |
| dirpath: null |
| filename: null |
| monitor: null |
| verbose: false |
| save_last: null |
| save_top_k: 1 |
| save_weights_only: false |
| mode: min |
| auto_insert_metric_name: true |
| every_n_train_steps: null |
| train_time_interval: null |
| every_n_epochs: null |
| save_on_train_epoch_end: null |
| enable_version_counter: true |
| fast_dev_run: false |
| max_epochs: null |
| min_epochs: null |
| max_steps: 200000 |
| min_steps: null |
| max_time: null |
| limit_train_batches: null |
| limit_val_batches: null |
| limit_test_batches: null |
| limit_predict_batches: null |
| overfit_batches: 0.0 |
| val_check_interval: 50000 |
| check_val_every_n_epoch: 1 |
| num_sanity_val_steps: null |
| log_every_n_steps: null |
| enable_checkpointing: null |
| enable_progress_bar: false |
| enable_model_summary: null |
| accumulate_grad_batches: 2 |
| gradient_clip_val: 1 |
| gradient_clip_algorithm: null |
| deterministic: null |
| benchmark: null |
| inference_mode: true |
| use_distributed_sampler: true |
| profiler: null |
| detect_anomaly: false |
| barebones: false |
| plugins: null |
| sync_batchnorm: false |
| reload_dataloaders_every_n_epochs: 0 |
| default_root_dir: null |
| model_registry: null |
| model: |
| class_path: tite.module.TiteModule |
| init_args: |
| model: |
| class_path: tite.model.TiteForPreTraining |
| init_args: |
| config: |
| class_path: tite.model.TiteConfig |
| init_args: |
| vocab_size: 30522 |
| num_hidden_layers: 12 |
| hidden_sizes: 768 |
| num_attention_heads: 12 |
| intermediate_sizes: 3072 |
| kernel_sizes: |
| - null |
| - null |
| - null |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| strides: |
| - null |
| - null |
| - null |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| - 2 |
| dropout_prob: 0.1 |
| max_position_embeddings: 512 |
| initializer_range: 0.02 |
| layer_norm_eps: 1.0e-12 |
| pad_token_id: 0 |
| hidden_act: gelu_pytorch_tanh |
| absolute_positional_embedding_type: null |
| relative_positional_embedding_type: rotary |
| pooling_location: intra |
| rotary_interleaved: true |
| norm_location: post |
| norm_type: layer |
| pooling_implementation: triton |
| rope_implementation: eager |
| positional_embedding_type: null |
| enhanced_masked_auto_encoding: true |
| bow_auto_encoding: true |
| tokenizer: |
| class_path: tite.model.TiteTokenizer |
| init_args: |
| vocab_file: tokenizers/tite/vocab.txt |
| tokenizer_file: tokenizers/tite/tokenizer.json |
| do_lower_case: true |
| unk_token: '[UNK]' |
| sep_token: '[SEP]' |
| pad_token: '[PAD]' |
| cls_token: '[CLS]' |
| mask_token: '[MASK]' |
| tokenize_chinese_chars: true |
| strip_accents: null |
| dict_kwargs: |
| model_max_length: 512 |
| validate_on_glue: true |
| validate_on_trec_dl: true |
| log_gradients: false |
| compile: true |
| data: |
| class_path: tite.datasets.FineWebDataModule |
| init_args: |
| collator: |
| class_path: tite.datasets.TransformationCollator |
| init_args: |
| text_keys: |
| - text |
| - null |
| string_transformations: null |
| token_transformations: |
| - class_path: tite.transformation.TokenMask |
| init_args: |
| mask_id: 103 |
| mask_prob: 0.3 |
| transformation_prob: 1.0 |
| max_length: 512 |
| path: HuggingFaceFW/fineweb-edu |
| batch_size: 128 |
| seed: null |
| num_workers: 8 |
| streaming: true |
| lr_scheduler: |
| class_path: tite.utils.lr_schedulers.SigmoidLRSchedulerWithLinearWarmup |
| init_args: |
| num_warmup_steps: 3000 |
| final_value: 0.02 |
| num_delay_steps: 0 |
| optimizer: |
| class_path: tite.utils.adamw.AdamWNoWeightDecayBiasNorm |
| init_args: |
| lr: 0.0001 |
| betas: |
| - 0.9 |
| - 0.999 |
| eps: 1.0e-08 |
| weight_decay: 0.01 |
| amsgrad: false |
| maximize: false |
| foreach: null |
| capturable: false |
| differentiable: false |
| fused: null |
| ckpt_path: null |
|
|