---
checkpoint_dir: "/lustre/scratch/data/polyglot_datasets/portuguese/checkpoints/models/tucano_v2"
train_dataset_dir:
  - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/gigaverbo_v2/4
  # NOTE(review): gigaverbo_v2_synth is listed three times — presumably intentional
  # upsampling of the synthetic subset in the training mixture; confirm before removing.
  - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/gigaverbo_v2_synth
  - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/gigaverbo_v2_synth
  - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/gigaverbo_v2_synth
  - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/nvidia_openscience
  - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/big_reasoning_traces
  - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/math_meta_reasoning_filtered
  - /lustre/scratch/data/polyglot_datasets/portuguese/tokenized/finemath/4
val_dataset_dir: "/lustre/scratch/data/polyglot_datasets/portuguese/tokenized/validation"
dataset_type: "parquet"
cache_dir: "/lustre/mlnvme/data/polyglot/.cache"

# Dataloader
pin_memory: true
num_workers_for_dataloader: 16
shuffle_dataset: true
mask_eos_token: false
mask_pad_token: false

# Model architecture
vocab_size: 49152
num_hidden_layers: 28
num_attention_heads: 16
num_key_value_heads: 8
head_dim: null
hidden_size: 1536
intermediate_size: 3072
max_position_embeddings: 4096
tie_word_embeddings: true
hidden_act: "silu"
output_hidden_states: false
attn_implementation: "flash_attention_2"
use_cache: false
no_rope_layer_interval: null
rope_theta: 50000.0
rope_scale_factor: null
rms_norm_eps: 0.000001

# Optimization
total_batch_size: 2097152
micro_batch_size: 16
eval_micro_batch_size: 8
num_train_epochs: 1
warmup_steps: 0
max_learning_rate: 0.0007
min_learning_rate: 0.0
muon_learning_rate: 0.007
weight_decay: 0.1
beta1: 0.9
beta2: 0.95
eps: 0.00000001
lr_decay_type: "wsd"
use_sqrt: true
lr_decay_iters_coef: 1.0
seed: 42
max_steps: 35000
max_grad_norm: 1.0

# Precision / performance
torch_compile: false
mat_mul_precision: "highest"
tf32: true
bf16: true
gradient_checkpointing: false
use_liger_kernel: true
static_graph: false

# Hugging Face Hub
push_to_hub: false
hub_token: null
hub_model_id: null

# Tokenizer / reference model
tokenizer_name_or_path: "/lustre/scratch/data/polyglot_datasets/portuguese/checkpoints/tokenizers/sentencepiece"
chat_template_path: null
reference_model: "HuggingFaceTB/SmolLM2-360M"
continual_pretraining: false

# Checkpointing / resumption
resume_from_checkpoint: null
checkpointing_steps: 5000
begin_new_stage: false
stage_name: "stage3"

# Debugging / logging
sanity_check: false
sanity_check_num_samples: 100000
wandb_token: null
wandb_id: "tucano-v2"
wandb_project: "Polyglot"
wandb_desc: "Developing LLMs for low-resource languages"
|