---
# Hyperparameter configuration: model architecture, training loop, dataset,
# text generation, and memory settings.
#
# NOTE(review): section membership below is reconstructed from the order in
# which keys follow each section header — confirm against the code that loads
# this file before relying on a key's path.

model:
  vocab_size: 8000
  d_model: 1024
  n_layers: 24
  n_heads: 16  # d_model / n_heads = 64
  d_ff: 4096  # 4 x d_model
  max_seq_len: 2048
  dropout: 0.15
  use_swiglu: true
  use_flash_attention: true
  use_confidence_scoring: true
  # Same value as generation.min_confidence_threshold; presumably the two are
  # meant to move together — TODO confirm.
  min_confidence: 0.3

training:
  batch_size: 2
  # batch_size x accumulation_steps = 32; presumably the effective batch size
  # per optimizer step — TODO confirm.
  accumulation_steps: 16
  epochs: 25
  learning_rate: 0.0002
  min_lr: 0.000005
  weight_decay: 0.15
  max_grad_norm: 0.5
  num_threads: 4
  save_every: 5

  # Presumably early-stopping controls; the unit of `patience` (epochs vs.
  # evaluations) is not visible here — confirm against the training loop.
  patience: 10
  min_delta: 0.0003

  # Learning-rate schedule.
  warmup_steps: 500
  use_lr_scheduler: true

  # Loss shaping.
  label_smoothing: 0.15
  use_eos_loss_weight: true
  eos_weight: 3.0

  # These two flags are also set under `memory`; keep both copies in sync
  # until it is confirmed which section the loader actually reads.
  use_gradient_checkpointing: true
  use_fp16: true

data:
  corpus_path: corpus/mtp_mini_corpus.jsonl
  min_text_length: 100
  max_text_length: 4000
  validation_split: 0.2

  # Augmentation.
  use_augmentation: true
  augmentation_prob: 0.4

generation:
  default_max_tokens: 300
  default_temperature: 0.65
  default_top_k: 50
  default_top_p: 0.9
  default_repetition_penalty: 1.2
  min_response_length: 30

  # Perplexity / entropy limits.
  use_perplexity_filter: true
  max_perplexity: 80.0
  use_entropy_threshold: true
  max_entropy: 4.0

  # Confidence filter; threshold matches model.min_confidence.
  use_confidence_filter: true
  min_confidence_threshold: 0.3

  # Double-quoted on purpose: "\n" must be processed as an escape (four
  # newline characters), and the quotes stop "#" and the trailing ":" from
  # being read as YAML syntax.
  stop_sequences:
    - "###"
    - "\n\n\n\n"
    - "Instrucción:"
    - "Usuario:"

memory:
  # Duplicated from `training`; see the note there.
  use_fp16: true
  use_gradient_checkpointing: true
  max_memory_gb: 14