"""
Pre-training a small language model
===================================

Flow:
1. Uses the README text (article/abstract) as the initial corpus.
2. Trains a SentencePiece (BPE) tokenizer if one does not already exist.
3. Builds an LM Dataset (inputs + labels shifted by one position).
4. Trains `EpistemicLanguageModel` with cross-entropy and AdamW.
5. Saves the model weights and reuses the trained tokenizer.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import List, Tuple
import os

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm

from custom_tokenizer import SPConfig, train_sentencepiece, CustomSPTokenizer
from custom_lm_model import LMConfig, EpistemicLanguageModel, save_lm, generate_text
from corpus_utils import load_main_corpus

@dataclass
class TrainLMConfig:
    sp_config: SPConfig = field(default_factory=SPConfig)
    max_seq_len: int = 128
    batch_size: int = 16
    num_epochs: int = 3
    learning_rate: float = 3e-4
    grad_clip: float = 1.0
    grad_accum_steps: int = 1
    save_dir: str = "checkpoints_lm"


class LMDataset(Dataset):
    """
    Causal language-modeling dataset: splits the token stream into
    fixed-size blocks and pairs each input block with labels shifted
    by one position.
    """

    def __init__(self, token_ids: List[int], block_size: int) -> None:
        self.block_size = block_size
        # Keep one token beyond the last full block so that every block,
        # including the final one, has a complete shifted target.
        n = ((len(token_ids) - 1) // block_size) * block_size + 1
        self.data = token_ids[: max(n, 0)]

    def __len__(self) -> int:
        # One example per full block; the trailing token only serves as
        # the last target and does not start a block of its own.
        return max((len(self.data) - 1) // self.block_size, 0)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        start = idx * self.block_size
        end = start + self.block_size
        # x is a block of tokens; y is the same block shifted one position
        # to the right, so each position predicts the next token.
        x = torch.tensor(self.data[start:end], dtype=torch.long)
        y = torch.tensor(self.data[start + 1 : end + 1], dtype=torch.long)
        return x, y


def ensure_tokenizer(config: TrainLMConfig) -> CustomSPTokenizer:
    model_file = f"{config.sp_config.model_prefix}.model"
    if not os.path.exists(model_file):
        # SentencePiece trains from files on disk: write the corpus to a
        # temporary file, train the tokenizer, then remove the file.
        texts = load_main_corpus()
        tmp_corpus = "sp_corpus_tmp.txt"
        with open(tmp_corpus, "w", encoding="utf-8") as f:
            for t in texts:
                f.write(t.replace("\r\n", "\n") + "\n")
        train_sentencepiece([tmp_corpus], config.sp_config)
        os.remove(tmp_corpus)
    return CustomSPTokenizer(model_prefix=config.sp_config.model_prefix)


def build_token_stream(tokenizer: CustomSPTokenizer) -> Tuple[List[int], List[int]]:
    """
    Builds token streams for training and validation from the main corpus.
    Uses a simple document-level train/val split.
    """
    texts = load_main_corpus()
    if len(texts) == 1:
        # With a single document there is nothing to hold out: validation
        # reuses the training text and only serves as a sanity check.
        train_texts = texts
        val_texts = texts
    else:
        split = max(1, int(0.8 * len(texts)))
        train_texts = texts[:split]
        val_texts = texts[split:]

    def encode_all(lst: List[str]) -> List[int]:
        ids: List[int] = []
        for t in lst:
            ids.extend(tokenizer.encode(t, add_bos=True, add_eos=True))
        return ids

    return encode_all(train_texts), encode_all(val_texts)


def evaluate_lm(
    model: EpistemicLanguageModel,
    dataloader: DataLoader,
    device: torch.device,
    loss_fn,
) -> float:
    """Returns the average cross-entropy of the model over the dataloader."""
    model.eval()
    total_loss, steps = 0.0, 0
    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)
            logits = model(x)
            loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))
            total_loss += float(loss.item())
            steps += 1
    return total_loss / max(steps, 1)


def train_lm(config: TrainLMConfig) -> EpistemicLanguageModel:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = ensure_tokenizer(config)

    train_ids, val_ids = build_token_stream(tokenizer)

    train_dataset = LMDataset(train_ids, block_size=config.max_seq_len)
    val_dataset = LMDataset(val_ids, block_size=config.max_seq_len)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size)

    lm_config = LMConfig(
        vocab_size=tokenizer.vocab_size,
        max_seq_len=config.max_seq_len,
    )
    model = EpistemicLanguageModel(lm_config).to(device)

    # Wrap the model in DataParallel when more than one GPU is available.
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    optimizer = AdamW(model.parameters(), lr=config.learning_rate)
    scheduler = CosineAnnealingLR(optimizer, T_max=config.num_epochs)
    loss_fn = torch.nn.CrossEntropyLoss()

    os.makedirs(config.save_dir, exist_ok=True)

    for epoch in range(config.num_epochs):
        model.train()
        total_loss = 0.0
        steps = 0
        optimizer.zero_grad()

        for step, (x, y) in enumerate(
            tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.num_epochs}")
        ):
            x = x.to(device)
            y = y.to(device)

            logits = model(x)
            loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))

            # Scale only the backward pass for gradient accumulation; log
            # the unscaled loss so the reported average stays comparable.
            (loss / max(config.grad_accum_steps, 1)).backward()

            if (step + 1) % max(config.grad_accum_steps, 1) == 0:
                if config.grad_clip is not None and config.grad_clip > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
                optimizer.step()
                optimizer.zero_grad()

            total_loss += float(loss.item())
            steps += 1

        # Step on any gradients left over from a partial accumulation window
        # so they are not silently discarded at the start of the next epoch.
        if steps % max(config.grad_accum_steps, 1) != 0:
            if config.grad_clip is not None and config.grad_clip > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
            optimizer.step()
            optimizer.zero_grad()

        scheduler.step()
        avg_train_loss = total_loss / max(steps, 1)

        # Evaluate on the unwrapped model; perplexity is the exponential
        # of the average validation cross-entropy.
        val_loss = evaluate_lm(
            model.module if isinstance(model, torch.nn.DataParallel) else model,
            val_loader,
            device,
            loss_fn,
        )
        ppl = torch.exp(torch.tensor(val_loss)).item()

        print(
            f"Epoch {epoch+1} - train loss: {avg_train_loss:.4f} | "
            f"val loss: {val_loss:.4f} | ppl: {ppl:.2f}"
        )

        # Generate a short sample; the prompt stays in Portuguese to match
        # the language of the training corpus.
        base_model = model.module if isinstance(model, torch.nn.DataParallel) else model
        prompt = "A inteligência artificial"
        sample = generate_text(base_model, tokenizer, prompt, max_new_tokens=40)
        print(f"Sample generation: {sample}\n")

        # Save a checkpoint of the unwrapped model after every epoch.
        ckpt_path = os.path.join(config.save_dir, f"epistemic_lm_epoch{epoch+1}.pt")
        save_lm(base_model, ckpt_path)

    # Return the underlying model so callers never see a DataParallel wrapper.
    return model.module if isinstance(model, torch.nn.DataParallel) else model


def main() -> None:
    config = TrainLMConfig()
    model = train_lm(config)
    save_path = "epistemic_lm.pt"
    save_lm(model, save_path)
    print(f"Language model saved to '{save_path}'")


if __name__ == "__main__":
    main()