"""
Custom Language Model (TransformerEncoder + BPE)
================================================

Small causal language model based on TransformerEncoder, consuming tokens
produced by `CustomSPTokenizer` (SentencePiece).

Main components:
- `EpistemicLanguageModel`: the PyTorch architecture
- `generate_text`: autoregressive generation helper
- helpers to save/load weights
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F


@dataclass
class LMConfig:
    """Hyperparameters for `EpistemicLanguageModel`."""

    vocab_size: int
    d_model: int = 256
    n_heads: int = 4
    num_layers: int = 4
    dim_feedforward: int = 512
    max_seq_len: int = 256
    dropout: float = 0.1

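# Example of a slightly larger configuration (illustrative values only).
# Note that PyTorch's MultiheadAttention requires d_model to be divisible
# by n_heads, e.g. 512 / 8 = 64 dimensions per head:
#
#     cfg = LMConfig(vocab_size=8000, d_model=512, n_heads=8, num_layers=6)

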
class EpistemicLanguageModel(nn.Module):
    """
    Simple causal language model built on TransformerEncoder.
    """

    def __init__(self, config: LMConfig) -> None:
        super().__init__()
        self.config = config

        # Learned token embeddings plus learned absolute position embeddings.
        self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
        self.pos_emb = nn.Embedding(config.max_seq_len, config.d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.d_model,
            nhead=config.n_heads,
            dim_feedforward=config.dim_feedforward,
            dropout=config.dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=config.num_layers,
        )
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        bsz, seq_len = input_ids.shape
        device = input_ids.device
        if seq_len > self.config.max_seq_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len "
                f"{self.config.max_seq_len}"
            )

        pos_ids = torch.arange(seq_len, device=device).unsqueeze(0).expand(bsz, -1)
        x = self.token_emb(input_ids) + self.pos_emb(pos_ids)

        # Boolean causal mask: True above the diagonal marks positions that
        # may NOT be attended to, so each token only sees itself and the past.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, device=device, dtype=torch.bool),
            diagonal=1,
        )

        if attention_mask is not None:
            # HuggingFace-style mask (1 = real token, 0 = padding);
            # TransformerEncoder expects True where keys should be ignored.
            key_padding_mask = attention_mask == 0
        else:
            key_padding_mask = None

        hidden = self.encoder(
            x,
            mask=causal_mask,
            src_key_padding_mask=key_padding_mask,
        )
        logits = self.lm_head(hidden)
        return logits


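# Minimal training-step sketch (an illustrative addition, not part of the
# original module): it shows how the logits are turned into a next-token
# cross-entropy loss by shifting inputs and targets one position apart.
def lm_loss(model: EpistemicLanguageModel, input_ids: torch.Tensor) -> torch.Tensor:
    """Causal LM loss: predict token t+1 from tokens up to t."""
    logits = model(input_ids[:, :-1])  # (bsz, seq_len - 1, vocab_size)
    targets = input_ids[:, 1:]         # (bsz, seq_len - 1)
    return F.cross_entropy(
        logits.reshape(-1, model.config.vocab_size),
        targets.reshape(-1),
    )

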
def generate_text(
    model: EpistemicLanguageModel,
    tokenizer,
    prompt: str,
    max_new_tokens: int = 50,
    temperature: float = 1.0,
    top_k: int = 50,
    device: Optional[torch.device] = None,
) -> str:
    """
    Simple autoregressive generation with temperature and top-k sampling.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()
    model.to(device)

    ids = tokenizer.encode(prompt, add_bos=True, add_eos=False)
    input_ids = torch.tensor([ids], dtype=torch.long, device=device)

    for _ in range(max_new_tokens):
        # Stop rather than overflow the learned position embeddings.
        if input_ids.size(1) >= model.config.max_seq_len:
            break

        with torch.no_grad():
            logits = model(input_ids)
        # Logits of the last position; the clamp keeps temperature > 0.
        next_token_logits = logits[0, -1, :] / max(temperature, 1e-4)

        if top_k > 0:
            # Restrict sampling to the k most likely tokens.
            values, indices = torch.topk(
                next_token_logits, k=min(top_k, next_token_logits.size(-1))
            )
            probs = F.softmax(values, dim=-1)
            next_token = indices[torch.multinomial(probs, num_samples=1)]
        else:
            probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

        input_ids = torch.cat([input_ids, next_token.view(1, 1)], dim=1)

    generated_ids: List[int] = input_ids[0].tolist()
    return tokenizer.decode(generated_ids)


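# Usage sketch for generation. `CustomSPTokenizer` is the tokenizer mentioned
# in the module docstring; the import path, constructor, and vocab size below
# are hypothetical placeholders, while encode/decode with add_bos/add_eos are
# the methods this module already calls.
#
#     from custom_sp_tokenizer import CustomSPTokenizer  # hypothetical path
#
#     tokenizer = CustomSPTokenizer("spm.model")  # hypothetical constructor
#     model = EpistemicLanguageModel(LMConfig(vocab_size=8000))
#     print(generate_text(model, tokenizer, "Hello", max_new_tokens=20))

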
def save_lm(model: EpistemicLanguageModel, path: str) -> None:
    torch.save({"config": model.config.__dict__, "state_dict": model.state_dict()}, path)


def load_lm(path: str, vocab_size: int) -> EpistemicLanguageModel:
    data = torch.load(path, map_location="cpu")
    cfg_dict = data.get("config", {})
    # The caller-supplied vocabulary size overrides the stored one, so the
    # model always matches the tokenizer it will be used with.
    cfg_dict["vocab_size"] = vocab_size
    config = LMConfig(**cfg_dict)
    model = EpistemicLanguageModel(config)
    model.load_state_dict(data["state_dict"])
    return model
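

# Self-contained smoke test (an illustrative addition): builds a tiny model,
# checks the output shape, and round-trips the weights through
# save_lm/load_lm. The file name is a throwaway placeholder.
if __name__ == "__main__":
    cfg = LMConfig(vocab_size=100, d_model=32, n_heads=4, num_layers=2,
                   dim_feedforward=64, max_seq_len=16)
    model = EpistemicLanguageModel(cfg)

    dummy_ids = torch.randint(0, cfg.vocab_size, (2, 8))
    logits = model(dummy_ids)
    assert logits.shape == (2, 8, cfg.vocab_size)

    save_lm(model, "lm_smoke_test.pt")
    restored = load_lm("lm_smoke_test.pt", vocab_size=cfg.vocab_size)
    assert restored(dummy_ids).shape == logits.shape
    print("smoke test passed")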