---
# Pretraining configuration — LilTii v0.2 (Bengali).
# NOTE(review): every line of the original file was wrapped in markdown-table
# pipes ("| ... |"), which is not valid YAML; the pipes are stripped here.
# All keys and values are otherwise unchanged.

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
checkpoint_dir: "/lustre/scratch/data/polyglot_datasets/bengali/checkpoints/models/LilTii/v2"
train_dataset_dir:
  # Each edu_score_1..5 shard is listed twice — presumably deliberate 2x
  # upsampling of these sources. TODO(review): confirm before deduplicating.
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_1"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_2"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_3"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_4"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_5"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_1"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_2"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_3"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_4"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_5"

  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/fineweb_edu/edu_score_3"

  # math_meta_reasoning_filtered is also listed twice — presumably the same
  # 2x upsampling pattern as above. TODO(review): confirm.
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/math_meta_reasoning_filtered"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/math_meta_reasoning_filtered"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/nvidia_openscience"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/big_reasoning_traces"

  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/finemath_34b/edu_score_4"
  - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/finemath_34b/edu_score_5"
val_dataset_dir: "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/validation_split"
dataset_type: "parquet"
cache_dir: "/lustre/mlnvme/data/polyglot/.cache"

# ---------------------------------------------------------------------------
# Dataloader
# ---------------------------------------------------------------------------
pin_memory: true
num_workers_for_dataloader: 32
shuffle_dataset: true

# ---------------------------------------------------------------------------
# Model architecture
# ---------------------------------------------------------------------------
vocab_size: 49152
num_hidden_layers: 28
num_attention_heads: 16
num_key_value_heads: 8
# null presumably means "derive head_dim from hidden_size / num_attention_heads"
# — TODO(review): confirm against the consuming model code.
head_dim: null
hidden_size: 1536
intermediate_size: 3072
max_position_embeddings: 4096
tie_word_embeddings: true
hidden_act: "silu"
output_hidden_states: false
attn_implementation: "flash_attention_2"
use_cache: false
no_rope_layer_interval: null
rope_theta: 50000.0
rope_scale_factor: null
rms_norm_eps: 0.000001

# ---------------------------------------------------------------------------
# Optimization
# ---------------------------------------------------------------------------
total_batch_size: 2097152  # 2^21 — presumably tokens per optimizer step; verify
micro_batch_size: 16
eval_micro_batch_size: 8
num_train_epochs: 1
warmup_steps: 2000
max_learning_rate: 0.0007
min_learning_rate: 0.0
weight_decay: 0.1
beta1: 0.9
beta2: 0.95
eps: 0.00000001
# "wsd" — presumably warmup-stable-decay, consistent with stage_name below;
# TODO(review): confirm against the scheduler implementation.
lr_decay_type: "wsd"
lr_decay_iters_coef: 0.0
seed: 1337
max_steps: 47500
max_grad_norm: 1.0

# ---------------------------------------------------------------------------
# Runtime / precision
# ---------------------------------------------------------------------------
torch_compile: false
# NOTE(review): in PyTorch, float32 matmul precision "highest" disables TF32
# matmuls, which appears to conflict with tf32: true below — confirm which
# setting the training code actually honors.
mat_mul_precision: "highest"
tf32: true
bf16: true
gradient_checkpointing: false
use_liger_kernel: true
static_graph: false

# ---------------------------------------------------------------------------
# Hugging Face Hub
# ---------------------------------------------------------------------------
push_to_hub: false
hub_token: null
hub_model_id: null

# ---------------------------------------------------------------------------
# Tokenizer / reference model
# ---------------------------------------------------------------------------
tokenizer_name_or_path: "Polygl0t/LilTii-v0.2"
reference_model: "HuggingFaceTB/SmolLM2-360M"

# ---------------------------------------------------------------------------
# Checkpointing
# ---------------------------------------------------------------------------
resume_from_checkpoint: null
checkpointing_steps: 2500
begin_new_stage: false
stage_name: "Warmup-Stable"

# ---------------------------------------------------------------------------
# Logging / sanity checks
# ---------------------------------------------------------------------------
sanity_check: false
sanity_check_num_samples: 100000
wandb_token: null  # keep secrets out of VCS; inject at runtime
wandb_id: "LilTii-v0.2"
wandb_project: "Polyglot"
wandb_desc: "Developing LLMs for low-resource languages"