"""Step 2: Model configuration."""
|
|
|
| from dataclasses import dataclass
|
| from transformers import GPT2Config
|
|
|
@dataclass
class ModelConfig:
    """Hyperparameters for training a small GPT-2-style language model.

    Grouped into: architecture, optimization, data paths, checkpointing,
    and precision. Instantiating the config prints a human-readable
    summary (see ``__post_init__``).
    """

    # --- Architecture ---
    vocab_size: int = 8000        # SentencePiece vocabulary size (see tokenizer_path)
    n_positions: int = 256        # maximum context length in tokens
    n_embd: int = 512             # hidden / embedding dimension
    n_layer: int = 8              # number of transformer blocks
    n_head: int = 8               # attention heads (must divide n_embd)
    n_inner: int = 1024           # feed-forward inner dimension

    # --- Optimization ---
    batch_size: int = 8                   # per-step micro-batch
    gradient_accumulation: int = 4        # micro-batches per optimizer step
    learning_rate: float = 3e-4
    warmup_steps: int = 1000
    total_steps: int = 20000
    weight_decay: float = 0.1
    max_grad_norm: float = 1.0            # gradient clipping threshold

    # --- Data ---
    train_file: str = "./final_corpus/multilingual_corpus_train.txt"
    val_file: str = "./final_corpus/multilingual_corpus_val.txt"
    tokenizer_path: str = "./final_corpus/multilingual_spm.model"

    # --- Checkpointing / logging ---
    output_dir: str = "./checkpoints"
    save_steps: int = 1000
    eval_steps: int = 500
    logging_steps: int = 100

    # --- Precision ---
    fp16: bool = True                     # mixed-precision training

    def __post_init__(self):
        """Validate the configuration and print a summary.

        Raises:
            ValueError: if ``n_embd`` is not divisible by ``n_head``
                (multi-head attention splits the hidden dim evenly).
        """
        # Fail fast here rather than deep inside the attention layers.
        if self.n_embd % self.n_head != 0:
            raise ValueError(
                f"n_embd ({self.n_embd}) must be divisible by n_head ({self.n_head})"
            )
        print(f"\nModel Configuration (REALISTIC):")
        print(f"  Parameters: ~{self.total_params:.1f}M")
        print(f"  Hidden size: {self.n_embd}")
        print(f"  Layers: {self.n_layer}")
        print(f"  Context length: {self.n_positions}")
        print(f"  Effective batch: {self.effective_batch_size}")
        print(f"  Total steps: {self.total_steps} (~8-9 epochs)")
        print(f"  Learning rate: {self.learning_rate}")

    @property
    def effective_batch_size(self) -> int:
        """Tokens-per-update batch: micro-batch times accumulation steps."""
        return self.batch_size * self.gradient_accumulation

    @property
    def total_params(self) -> float:
        """Parameter count in millions for a GPT-2-style model.

        Fixes the previous estimate, which omitted the position-embedding
        table, all bias vectors, and the final LayerNorm. Assumes the LM
        head is weight-tied to the token embedding (the GPT-2 default in
        transformers), so the head adds no parameters — confirm against
        the actual model construction.
        """
        token_emb = self.vocab_size * self.n_embd
        pos_emb = self.n_positions * self.n_embd          # wpe — was missing before
        # Per transformer block:
        attn = 4 * self.n_embd * self.n_embd + 4 * self.n_embd   # qkv+proj weights and biases
        ffn = 2 * self.n_embd * self.n_inner + self.n_inner + self.n_embd  # two linears + biases
        ln = 2 * (2 * self.n_embd)                        # ln_1 and ln_2, weight + bias each
        per_layer = attn + ffn + ln
        final_ln = 2 * self.n_embd                        # ln_f after the last block
        total = token_emb + pos_emb + self.n_layer * per_layer + final_ln
        return total / 1e6