Upload 12 files

Browse files

Files changed (12) hide show

README.md +103 -0
__init__.py +5 -0
config.json +23 -0
configuration_tinybuddy.py +39 -0
generation_config.json +9 -0
merges.txt +23 -0
model.safetensors +3 -0
modeling_tinybuddy.py +153 -0
special_tokens_map.json +6 -0
tokenizer.json +23 -0
tokenizer_config.json +8 -0
vocab.json +1 -0

README.md ADDED Viewed

	@@ -0,0 +1,103 @@

+---
+license: mit
+language:
+- en
+library_name: transformers
+tags:
+- text-generation
+- tiny-lm
+- tinystories
+- educational
+- built-with-llama
+- small-model
+pipeline_tag: text-generation
+datasets:
+- roneneldan/TinyStories
+---
+# TinyBuddy-500K
+> ⚠️ **Educational / experimental model.** TinyBuddy-500K is a from-scratch tiny Llama-style language model (~547K parameters) trained on a synthetic slice of TinyStories-style text.
+> It is **not** a useful assistant — it is a working demonstration of training extremely small models from scratch. See the [Limitations](#limitations) section.
+## Model description
+TinyBuddy-500K is a very small decoder-only Transformer language model trained on synthetic children's stories in the style of [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories). The architecture follows the LLaMA design (RMSNorm, Grouped Query Attention, SiLU MLP, tied embeddings).
+| Hyperparameter          | Value                          |
+|-------------------------|--------------------------------|
+| Parameters              | **547,296** (~547K)            |
+| Layers                  | 2                              |
+| Attention heads         | 4                              |
+| Key-Value heads (GQA)   | 2                              |
+| Hidden size             | 96                             |
+| MLP intermediate size   | 384                            |
+| Context length          | 512                            |
+| Vocab size              | 2,048 (BPE trained from scratch) |
+| Norm                    | RMSNorm                        |
+| Activation              | SiLU                           |
+| Position embeddings     | None (not implemented in `modeling_tinybuddy.py`) |
+| Weight tying            | Yes (tied embeddings)          |
+| Precision               | float32                        |
+## Training details
+- **Data**: Synthetic TinyStories-style corpus (~128K tokens)
+- **Tokenizer**: Custom byte-level BPE with 2048 vocabulary
+- **Optimizer**: AdamW
+- **Steps**: ~300 steps on CPU
+- **Hardware**: Single CPU core
+- **Final loss**: ~0.17
+## Usage
+This model uses **custom modeling code**, so you must pass `trust_remote_code=True`.
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+repo = "Eeppa/TinyBuddy-500K"
+tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
+model.eval()
+prompt = "Once upon a time, there was a little girl named Lily."
+input_ids = tokenizer.encode(prompt, return_tensors="pt")
+out = model.generate(input_ids, max_new_tokens=60, temperature=0.8, top_k=50)
+print(tokenizer.decode(out[0], skip_special_tokens=True))
+```
+## Limitations
+This model is extremely small and was trained for a very short time on limited data.
+**What works**:
+- Basic English patterns and short sentence structure
+- Simple story-like generation
+**What's broken**:
+- Very limited coherence (usually breaks after 1–2 sentences)
+- High repetition
+- Poor long-range consistency
+- No real reasoning or factual knowledge
+This model exists purely for educational purposes to explore the lower limits of language model size.
+## License
+MIT
+## Citation
+```bibtex
+@misc{tinybuddy500k,
+  title  = {TinyBuddy-500K: An educational ~500K parameter Llama-style model trained on TinyStories},
+  year   = {2026},
+  note   = {Educational demonstration of extremely small language models.}
+}
+```
+**Built with Llama.**

__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# TinyBuddy-500K package
+from .modeling_tinybuddy import TinyBuddyForCausalLM
+from .configuration_tinybuddy import TinyBuddyConfig
+__all__ = ["TinyBuddyForCausalLM", "TinyBuddyConfig"]

config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "_name_or_path": "Eeppa/TinyBuddy-500K",
+  "architectures": ["TinyBuddyForCausalLM"],
+  "auto_map": {
+    "AutoConfig": "configuration_tinybuddy.TinyBuddyConfig",
+    "AutoModelForCausalLM": "modeling_tinybuddy.TinyBuddyForCausalLM"
+  },
+  "model_type": "tinybuddy",
+  "vocab_size": 2048,
+  "hidden_size": 96,
+  "num_hidden_layers": 2,
+  "num_attention_heads": 4,
+  "num_key_value_heads": 2,
+  "intermediate_size": 384,
+  "max_position_embeddings": 512,
+  "rms_norm_eps": 1e-6,
+  "tie_word_embeddings": true,
+  "bos_token_id": 2,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.40.0",
+  "torch_dtype": "float32"
+}

configuration_tinybuddy.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""
+TinyBuddyConfig for TinyBuddy-500K
+"""
+from transformers import PretrainedConfig
+class TinyBuddyConfig(PretrainedConfig):
+    model_type = "tinybuddy"
+    def __init__(
+        self,
+        vocab_size=2048,
+        hidden_size=96,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        intermediate_size=384,
+        max_position_embeddings=512,
+        rms_norm_eps=1e-6,
+        tie_word_embeddings=True,
+        bos_token_id=2,
+        eos_token_id=2,
+        pad_token_id=0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.intermediate_size = intermediate_size
+        self.max_position_embeddings = max_position_embeddings
+        self.rms_norm_eps = rms_norm_eps
+        self.tie_word_embeddings = tie_word_embeddings
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "max_new_tokens": 80,
+  "temperature": 0.8,
+  "top_k": 50,
+  "do_sample": true,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "repetition_penalty": 1.1
+}

merges.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+#version: 0.2
+a e
+t h
+i n
+o n
+s t
+r e
+l e
+d e
+u s
+m e
+w a
+f o
+g o
+y o
+p a
+b e
+k i
+v e
+j u
+x a
+z e
+q u

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79cbf4a0790677946075a0cb32c455f830699535ff46adefd89c811b66b2593b
+size 2977648

modeling_tinybuddy.py ADDED Viewed

	@@ -0,0 +1,153 @@

+"""
+TinyBuddy-500K: Educational ~500K parameter Llama-style model
+MIT License
+"""
+from dataclasses import dataclass
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutputWithPast
+@dataclass
+class TinyBuddyConfig(PretrainedConfig):
+    model_type = "tinybuddy"
+    vocab_size: int = 2048
+    hidden_size: int = 96
+    num_hidden_layers: int = 2
+    num_attention_heads: int = 4
+    num_key_value_heads: int = 2
+    intermediate_size: int = 384
+    max_position_embeddings: int = 512
+    rms_norm_eps: float = 1e-6
+    tie_word_embeddings: bool = True
+    bos_token_id: int = 2
+    eos_token_id: int = 2
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+class RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.eps = eps
+    def forward(self, x):
+        variance = x.pow(2).mean(-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.eps)
+        return self.weight * x
+class GroupedQueryAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.num_heads = config.num_attention_heads
+        self.num_kv_heads = config.num_key_value_heads
+        self.head_dim = config.hidden_size // self.num_heads
+        self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=False)
+    def forward(self, x):
+        B, T, _ = x.shape
+        q = self.q_proj(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
+        k = self.k_proj(x).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        v = self.v_proj(x).view(B, T, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        k = k.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
+        v = v.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
+        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
+        attn = F.softmax(scores, dim=-1)
+        out = torch.matmul(attn, v)
+        out = out.transpose(1, 2).contiguous().view(B, T, self.num_heads * self.head_dim)
+        return self.o_proj(out)
+class MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+    def forward(self, x):
+        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
+class DecoderLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.self_attn = GroupedQueryAttention(config)
+        self.mlp = MLP(config)
+        self.input_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps)
+    def forward(self, x):
+        residual = x
+        x = self.input_layernorm(x)
+        x = self.self_attn(x)
+        x = residual + x
+        residual = x
+        x = self.post_attention_layernorm(x)
+        x = self.mlp(x)
+        x = residual + x
+        return x
+class TinyBuddyForCausalLM(PreTrainedModel):
+    config_class = TinyBuddyConfig
+    base_model_prefix = "tinybuddy"
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.embed_tokens.weight
+        self.post_init()
+    def forward(self, input_ids, labels=None, **kwargs):
+        x = self.embed_tokens(input_ids)
+        for layer in self.layers:
+            x = layer(x)
+        x = self.norm(x)
+        logits = self.lm_head(x)
+        loss = None
+        if labels is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
+        return CausalLMOutputWithPast(loss=loss, logits=logits)
+    @torch.no_grad()
+    def generate(self, input_ids, max_new_tokens=50, temperature=0.8, top_k=50, **kwargs):
+        for _ in range(max_new_tokens):
+            logits = self(input_ids).logits[:, -1, :] / temperature
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float("Inf")
+            probs = F.softmax(logits, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)
+            input_ids = torch.cat([input_ids, next_token], dim=1)
+        return input_ids
+TinyBuddyForCausalLM.register_for_auto_class("AutoModelForCausalLM")

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {"id": 50256, "content": "<|endoftext|>", "special": true, "single_word": false, "lstrip": false, "rstrip": false, "normalized": false}
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {"type": "ByteLevel", "add_prefix_space": false, "use_regex": true},
+  "post_processor": null,
+  "decoder": {"type": "ByteLevel"},
+  "model": {
+    "type": "BPE",
+    "dropout": null,
+    "unk_token": null,
+    "continuing_subword_prefix": "",
+    "end_of_word_suffix": "",
+    "fuse_unk": false,
+    "byte_fallback": false,
+    "vocab": {},
+    "merges": []
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "model_max_length": 512,
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

vocab.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "a": 4, "e": 5, "i": 6, "o": 7, "t": 8, "n": 9, "s": 10, "r": 11, "h": 12, "l": 13, "d": 14, "u": 15, "c": 16, "m": 17, "w": 18, "f": 19, "g": 20, "y": 21, "p": 22, "b": 23, "k": 24, "v": 25, "j": 26, "x": 27, "z": 28, "q": 29}