Eeppa committed on
Commit
0247f2b
·
verified ·
1 Parent(s): 4954ba4

Update modeling_tinybuddy.py

Browse files
Files changed (1) hide show
  1. modeling_tinybuddy.py +34 -20
modeling_tinybuddy.py CHANGED
@@ -6,10 +6,37 @@ import math
6
  import torch
7
  import torch.nn as nn
8
  import torch.nn.functional as F
9
- from transformers import PreTrainedModel
10
- from configuration_tinybuddy import GPTConfig
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  class CausalSelfAttention(nn.Module):
14
  def __init__(self, cfg: GPTConfig):
15
  super().__init__()
@@ -20,7 +47,6 @@ class CausalSelfAttention(nn.Module):
20
  self.qkv = nn.Linear(cfg.n_embd, 3 * cfg.n_embd, bias=True)
21
  self.proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=True)
22
  self.drop = nn.Dropout(cfg.dropout)
23
- # causal mask
24
  mask = torch.tril(torch.ones(cfg.block_size, cfg.block_size)).bool()
25
  self.register_buffer("mask", mask, persistent=False)
26
 
@@ -31,7 +57,6 @@ class CausalSelfAttention(nn.Module):
31
  q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
32
  k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
33
  v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
34
- # use PyTorch's fused SDPA (faster on CPU than manual)
35
  y = F.scaled_dot_product_attention(q, k, v, is_causal=True,
36
  dropout_p=self.drop.p if self.training else 0.0)
37
  y = y.transpose(1, 2).contiguous().view(B, T, C)
@@ -82,23 +107,15 @@ class TinyGPT(PreTrainedModel):
82
 
83
  def _init_weights(self, module):
84
  if isinstance(module, nn.Linear):
85
- module.weight.data.normal_(mean=0.0, std=self.config.initializer_range if hasattr(self.config, 'initializer_range') else 0.02)
86
  if module.bias is not None:
87
  module.bias.data.zero_()
88
  elif isinstance(module, nn.Embedding):
89
- module.weight.data.normal_(mean=0.0, std=self.config.initializer_range if hasattr(self.config, 'initializer_range') else 0.02)
90
-
91
- def num_params(self, non_embedding=False):
92
- n = sum(p.numel() for p in self.parameters())
93
- if non_embedding:
94
- n -= self.tok_emb.weight.numel() + self.pos_emb.weight.numel()
95
- if not self.config.tie_weights:
96
- n -= self.lm_head.weight.numel()
97
- return n
98
 
99
  def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
100
  B, T = input_ids.shape
101
- assert T <= self.config.block_size, f"sequence length {T} > block_size {self.config.block_size}"
102
  pos = torch.arange(T, device=input_ids.device)
103
  x = self.tok_emb(input_ids) + self.pos_emb(pos)[None, :, :]
104
  x = self.drop(x)
@@ -128,10 +145,7 @@ class TinyGPT(PreTrainedModel):
128
 
129
 
130
  if __name__ == "__main__":
131
- from configuration_tinybuddy import GPTConfig
132
  cfg = GPTConfig()
133
  m = TinyGPT(cfg)
134
- total = m.num_params()
135
- nonemb = m.num_params(non_embedding=True)
136
- print(f"Total params : {total:,} (~{total/1e6:.2f}M)")
137
- print(f"Non-embedding params: {nonemb:,} (~{nonemb/1e6:.2f}M)")
 
6
  import torch
7
  import torch.nn as nn
8
  import torch.nn.functional as F
9
+ from transformers import PreTrainedModel, PretrainedConfig
 
10
 
11
 
12
+ # ========== CONFIG CLASS (embedded to avoid import issues) ==========
13
+ class GPTConfig(PretrainedConfig):
14
+ model_type = "tinybuddy"
15
+
16
+ def __init__(
17
+ self,
18
+ vocab_size: int = 50000,
19
+ block_size: int = 512,
20
+ n_layer: int = 6,
21
+ n_head: int = 8,
22
+ n_embd: int = 256,
23
+ mlp_ratio: int = 4,
24
+ dropout: float = 0.0,
25
+ tie_weights: bool = False,
26
+ **kwargs
27
+ ):
28
+ super().__init__(**kwargs)
29
+ self.vocab_size = vocab_size
30
+ self.block_size = block_size
31
+ self.n_layer = n_layer
32
+ self.n_head = n_head
33
+ self.n_embd = n_embd
34
+ self.mlp_ratio = mlp_ratio
35
+ self.dropout = dropout
36
+ self.tie_weights = tie_weights
37
+
38
+
39
+ # ========== MODEL ARCHITECTURE ==========
40
  class CausalSelfAttention(nn.Module):
41
  def __init__(self, cfg: GPTConfig):
42
  super().__init__()
 
47
  self.qkv = nn.Linear(cfg.n_embd, 3 * cfg.n_embd, bias=True)
48
  self.proj = nn.Linear(cfg.n_embd, cfg.n_embd, bias=True)
49
  self.drop = nn.Dropout(cfg.dropout)
 
50
  mask = torch.tril(torch.ones(cfg.block_size, cfg.block_size)).bool()
51
  self.register_buffer("mask", mask, persistent=False)
52
 
 
57
  q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
58
  k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
59
  v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
 
60
  y = F.scaled_dot_product_attention(q, k, v, is_causal=True,
61
  dropout_p=self.drop.p if self.training else 0.0)
62
  y = y.transpose(1, 2).contiguous().view(B, T, C)
 
107
 
108
  def _init_weights(self, module):
109
  if isinstance(module, nn.Linear):
110
+ module.weight.data.normal_(mean=0.0, std=0.02)
111
  if module.bias is not None:
112
  module.bias.data.zero_()
113
  elif isinstance(module, nn.Embedding):
114
+ module.weight.data.normal_(mean=0.0, std=0.02)
 
 
 
 
 
 
 
 
115
 
116
  def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
117
  B, T = input_ids.shape
118
+ assert T <= self.config.block_size
119
  pos = torch.arange(T, device=input_ids.device)
120
  x = self.tok_emb(input_ids) + self.pos_emb(pos)[None, :, :]
121
  x = self.drop(x)
 
145
 
146
 
147
  if __name__ == "__main__":
 
148
  cfg = GPTConfig()
149
  m = TinyGPT(cfg)
150
+ total = sum(p.numel() for p in m.parameters())
151
+ print(f"Total params: {total:,} (~{total/1e6:.2f}M)")