PleIAs
/

CommonLingua

@@ -1,73 +1,47 @@
 """
-ByteHybrid v2: Byte-level document classifier with optional n-gram hash embeddings.
-Changes from v1:
-- Added ByteNgramEmbed: rolling hash of byte trigrams into fixed-size embedding table
-- New config "base_ngram" with ngram_buckets=4096, ngram_dim=64 (~262k extra params)
-- Backward compatible: existing configs work unchanged (ngram_buckets=0)
-"""
-import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-# ── Byte N-gram Hash Embedding ───────────────────────────────────────────
 class ByteNgramEmbed(nn.Module):
-    """Rolling hash of byte n-grams into fixed-size embedding table.
-    Supports single n-gram size or multi-scale (e.g., trigrams + 5-grams).
-    Uses polynomial hash. Collisions act as regularization.
     """
     def __init__(self, num_buckets=4096, embed_dim=64, n=3):
         super().__init__()
         self.n = n
         self.num_buckets = num_buckets
         self.embed = nn.Embedding(num_buckets, embed_dim)
-    def _hash(self, byte_ids, n):
         B, T = byte_ids.shape
         clamped = byte_ids.clamp(max=255)
-        padded = F.pad(clamped, (0, n - 1), value=0)
         h = torch.zeros(B, T, dtype=torch.long, device=byte_ids.device)
-        for i in range(n):
-            h = h * 257 + padded[:, i:i+T]
-        h = h % self.num_buckets
-        return h
-    def forward(self, byte_ids):
-        return self.embed(self._hash(byte_ids, self.n))
-class MultiScaleNgramEmbed(nn.Module):
-    """Multi-scale n-gram hash embeddings (e.g., 3-gram + 5-gram).
-    Each scale gets its own hash table and embedding. Outputs are summed.
-    """
-    def __init__(self, num_buckets=4096, embed_dim=64, scales=(3, 5)):
-        super().__init__()
-        self.scales = scales
-        self.ngrams = nn.ModuleList([
-            ByteNgramEmbed(num_buckets, embed_dim, n=n) for n in scales
-        ])
-    def forward(self, byte_ids):
-        out = self.ngrams[0](byte_ids)
-        for ng in self.ngrams[1:]:
-            out = out + ng(byte_ids)
-        return out
-# ── Causal Conv1d Block ──────────────────────────────────────────────────
 class ByteConvBlock(nn.Module):
-    """Causal conv1d + gated FFN. Captures local byte patterns."""
     def __init__(self, d_model, kernel_size=15, expand=2):
         super().__init__()
@@ -75,160 +49,108 @@ class ByteConvBlock(nn.Module):
         self.pad = kernel_size - 1
         self.conv = nn.Conv1d(d_model, d_model, kernel_size, groups=d_model)
         self.norm2 = nn.LayerNorm(d_model)
-        ffn_dim = d_model * expand
-        self.ffn_gate = nn.Linear(d_model, ffn_dim, bias=False)
-        self.ffn_up = nn.Linear(d_model, ffn_dim, bias=False)
-        self.ffn_down = nn.Linear(ffn_dim, d_model, bias=False)
     def forward(self, x):
         residual = x
-        x = self.norm1(x)
-        x = x.transpose(1, 2)
         x = F.pad(x, (self.pad, 0))
-        x = F.silu(self.conv(x))
-        x = x.transpose(1, 2)
         x = residual + x
         residual = x
         x = self.norm2(x)
         x = self.ffn_down(F.silu(self.ffn_gate(x)) * self.ffn_up(x))
-        x = residual + x
-        return x
-# ── Attention Block ──────────────────────────────────────────────────────
 class ByteAttnBlock(nn.Module):
-    """Standard bidirectional attention + SwiGLU FFN with RoPE."""
     def __init__(self, d_model, n_heads=4, expand=2):
         super().__init__()
         self.n_heads = n_heads
         self.head_dim = d_model // n_heads
         self.norm1 = nn.LayerNorm(d_model)
         self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)
         self.out_proj = nn.Linear(d_model, d_model, bias=False)
         self.norm2 = nn.LayerNorm(d_model)
-        ffn_dim = d_model * expand
-        self.ffn_gate = nn.Linear(d_model, ffn_dim, bias=False)
-        self.ffn_up = nn.Linear(d_model, ffn_dim, bias=False)
-        self.ffn_down = nn.Linear(ffn_dim, d_model, bias=False)
     def forward(self, x):
         B, T, D = x.shape
         residual = x
-        x = self.norm1(x)
-        qkv = self.qkv(x).reshape(B, T, 3, self.n_heads, self.head_dim)
-        q, k, v = qkv.unbind(dim=2)
-        q = q.transpose(1, 2)
-        k = k.transpose(1, 2)
-        v = v.transpose(1, 2)
-        q, k = apply_rope(q, k, T, self.head_dim, x.device)
         attn = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
         attn = attn.softmax(dim=-1)
         out = (attn @ v).transpose(1, 2).contiguous().view(B, T, D)
-        out = self.out_proj(out)
-        x = residual + out
         residual = x
-        x = self.norm2(x)
-        x = self.ffn_down(F.silu(self.ffn_gate(x)) * self.ffn_up(x))
-        x = residual + x
-        return x
-# ── Rotary Position Embedding ────────────────────────────────────────────
-def precompute_freqs(dim, max_len=4096, theta=10000.0):
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
-    t = torch.arange(max_len)
-    freqs = torch.outer(t, freqs)
-    return torch.cos(freqs), torch.sin(freqs)
-def apply_rope(q, k, seq_len, head_dim, device):
-    cos, sin = precompute_freqs(head_dim, seq_len)
-    cos = cos[:seq_len].to(device=device, dtype=q.dtype)
-    sin = sin[:seq_len].to(device=device, dtype=q.dtype)
-    def rotate(x):
-        x1, x2 = x[..., : head_dim // 2], x[..., head_dim // 2 :]
-        return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1)
-    return rotate(q), rotate(k)
-# ── Full Model ───────────────────────────────────────────────────────────
 class ByteHybrid(nn.Module):
-    """Byte-level classifier with optional n-gram hash embeddings.
-    Args:
-        num_classes: number of output classes
-        d_model: hidden dimension
-        n_conv: number of conv1d blocks
-        n_attn: number of attention blocks
-        n_heads: attention heads
-        max_len: maximum byte sequence length
-        conv_kernel: conv1d kernel size
-        ngram_buckets: hash table size for n-gram embeddings (0 = disabled)
-        ngram_dim: embedding dimension for n-gram hashes
-    """
     def __init__(
         self,
-        num_classes=13,
         d_model=256,
         n_conv=3,
         n_attn=1,
         n_heads=4,
         ffn_expand=2,
-        max_len=2048,
         conv_kernel=15,
         ngram_buckets=0,
         ngram_dim=64,
-        ngram_scales=None,
     ):
         super().__init__()
         self.max_len = max_len
-        # Byte embedding: 256 possible byte values + 1 padding
         self.embed = nn.Embedding(257, d_model, padding_idx=256)
-        # Optional n-gram hash embedding
-        # ngram_scales: tuple of n-gram sizes, e.g. (3,) or (3, 5)
         self.ngram_embed = None
         if ngram_buckets > 0:
-            scales = ngram_scales if ngram_scales else (3,)
-            if len(scales) == 1:
-                self.ngram_embed = ByteNgramEmbed(ngram_buckets, ngram_dim, n=scales[0])
-            else:
-                self.ngram_embed = MultiScaleNgramEmbed(ngram_buckets, ngram_dim, scales=scales)
             self.ngram_proj = nn.Linear(ngram_dim, d_model, bias=False)
-        # Conv blocks
-        self.conv_layers = nn.ModuleList([
-            ByteConvBlock(d_model, kernel_size=conv_kernel, expand=ffn_expand)
-            for _ in range(n_conv)
-        ])
-        # Attention blocks
-        self.attn_layers = nn.ModuleList([
-            ByteAttnBlock(d_model, n_heads, ffn_expand)
-            for _ in range(n_attn)
-        ])
         self.final_norm = nn.LayerNorm(d_model)
-        # Classification head
         self.head = nn.Sequential(
             nn.Linear(d_model, d_model),
             nn.GELU(),
@@ -237,82 +159,25 @@ class ByteHybrid(nn.Module):
         )
     def forward(self, byte_ids):
-        """
-        Args:
-            byte_ids: (B, T) long tensor of byte values [0-255], padded with 256
-        Returns:
-            logits: (B, num_classes)
-        """
         pad_mask = byte_ids != 256
         x = self.embed(byte_ids)
-        # Add n-gram features if enabled
         if self.ngram_embed is not None:
-            ng = self.ngram_embed(byte_ids)
-            x = x + self.ngram_proj(ng)
         for layer in self.conv_layers:
             x = layer(x)
         for layer in self.attn_layers:
             x = layer(x)
         x = self.final_norm(x)
         mask = pad_mask.unsqueeze(-1).to(x.dtype)
         x = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
         return self.head(x)
-    @staticmethod
-    def encode_text(text, max_len=2048):
-        """Convert text string to byte tensor, padded to max_len."""
-        raw = text.encode("utf-8", errors="replace")[:max_len]
-        byte_ids = list(raw) + [256] * (max_len - len(byte_ids))
-        return torch.tensor(byte_ids, dtype=torch.long)
-# ── Configurations ───────────────────────────────────────────────────────
 CONFIGS = {
-    # ~2M params: 3 conv + 1 attn, d=256 (original)
-    "base": dict(d_model=256, n_conv=3, n_attn=1, n_heads=4, conv_kernel=15),
-    # ~2.3M params: base + trigram hash embeddings (4k buckets × 64 dim)
-    "base_ngram": dict(d_model=256, n_conv=3, n_attn=1, n_heads=4, conv_kernel=15,
-                       ngram_buckets=4096, ngram_dim=64),
-    # ~2.5M params: larger hash table
-    "base_ngram_large": dict(d_model=256, n_conv=3, n_attn=1, n_heads=4, conv_kernel=15,
-                             ngram_buckets=8192, ngram_dim=64),
-    # ~3.5M params: 3 conv + 2 attn, d=256
-    "large": dict(d_model=256, n_conv=3, n_attn=2, n_heads=4, conv_kernel=15),
-    # ~2M params: deeper conv, no attn
-    "conv_only": dict(d_model=256, n_conv=5, n_attn=0, n_heads=4, conv_kernel=15),
-    # ~2M params: wider kernel conv
-    "wide_conv": dict(d_model=256, n_conv=3, n_attn=1, n_heads=4, conv_kernel=31),
-    # Scaled-up configs
-    "d384": dict(d_model=384, n_conv=3, n_attn=1, n_heads=4, conv_kernel=15),
-    "d384_2attn": dict(d_model=384, n_conv=3, n_attn=2, n_heads=4, conv_kernel=15),
-    "d512": dict(d_model=512, n_conv=3, n_attn=1, n_heads=8, conv_kernel=15),
-    # 4-gram variant
-    "base_4gram": dict(d_model=256, n_conv=3, n_attn=1, n_heads=4, conv_kernel=15,
-                       ngram_buckets=4096, ngram_dim=64, ngram_scales=(4,)),
-    # Multi-scale: 3-gram + 5-gram (two hash tables, summed)
-    "base_multiscale": dict(d_model=256, n_conv=3, n_attn=1, n_heads=4, conv_kernel=15,
-                            ngram_buckets=4096, ngram_dim=64, ngram_scales=(3, 5)),
-    # Multi-scale: 3-gram + 4-gram + 5-gram
-    "base_multiscale3": dict(d_model=256, n_conv=3, n_attn=1, n_heads=4, conv_kernel=15,
-                             ngram_buckets=4096, ngram_dim=64, ngram_scales=(3, 4, 5)),
 }
-def count_params(model):
-    return sum(p.numel() for p in model.parameters())
-if __name__ == "__main__":
-    for name, cfg in CONFIGS.items():
-        model = ByteHybrid(num_classes=334, max_len=512, **cfg)
-        byte_ids = torch.randint(0, 256, (4, 512))
-        logits = model(byte_ids)
-        print(f"{name:<20s} {count_params(model):>10,} params  output={logits.shape}")

 """
+ByteHybrid: byte-level language identification (CommonLingua v7.2.1).
+Operates directly on raw UTF-8 bytes — no tokenizer required:
+    raw bytes → byte-embed + trigram-hash-embed (summed)
+              → 3 × depthwise Conv1D (k=15)
+              → 1 × bidirectional attention (RoPE, 4 heads)
+              → masked mean-pool
+              → classification head (334 logits)
+The shipped checkpoint uses the `base_ngram` config: d_model=256, 4096 trigram
+hash buckets × 64 dim, max_len=512 bytes. Total parameters ≈ 2.35 M.
+"""
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 class ByteNgramEmbed(nn.Module):
     """Embed byte n-grams through a rolling polynomial hash.

     For every position ``t`` the ``n`` ids starting at ``t`` are folded into a
     base-257 polynomial hash, reduced modulo ``num_buckets``, and looked up in
     ``self.embed``. Ids above 255 are clamped to 255 before hashing, and the
     sequence is right-padded with zeros so the output keeps the input length.
     Distinct n-grams may share a bucket; the original author notes that such
     collisions act as regularisation.

     Args:
         num_buckets: number of rows in the hash embedding table.
         embed_dim: width of each embedding row.
         n: n-gram length (3 = trigrams).
     """

     def __init__(self, num_buckets=4096, embed_dim=64, n=3):
         super().__init__()
         self.n = n
         self.num_buckets = num_buckets
         self.embed = nn.Embedding(num_buckets, embed_dim)

     def forward(self, byte_ids):
         """Map ``(B, T)`` integer ids to ``(B, T, embed_dim)`` n-gram embeddings."""
         B, T = byte_ids.shape
         clamped = byte_ids.clamp(max=255)
         padded = F.pad(clamped, (0, self.n - 1), value=0)
         h = torch.zeros(B, T, dtype=torch.long, device=byte_ids.device)
         for i in range(self.n):
             # Reduce at every step: this yields the same residue as a single
             # final modulo, but the running value stays below
             # num_buckets * 257 + 255 and so cannot overflow int64 for large n.
             h = (h * 257 + padded[:, i:i + T]) % self.num_buckets
         return self.embed(h)
 class ByteConvBlock(nn.Module):
     """Causal depthwise Conv1D followed by a SwiGLU FFN, each with a residual.

     Both sub-layers are pre-norm: the input is layer-normalised, transformed,
     and added back onto the un-normalised residual stream. The convolution is
     depthwise (``groups=d_model``) and causal: the sequence is left-padded by
     ``kernel_size - 1`` so position ``t`` only sees positions ``<= t``.

     Args:
         d_model: channel width of the input and output.
         kernel_size: temporal width of the depthwise convolution.
         expand: FFN hidden width as a multiple of ``d_model``.
     """

     def __init__(self, d_model, kernel_size=15, expand=2):
         super().__init__()
         # forward() normalises the input before the convolution, so this
         # layer must exist; it was not visible in the truncated source.
         self.norm1 = nn.LayerNorm(d_model)
         self.pad = kernel_size - 1
         self.conv = nn.Conv1d(d_model, d_model, kernel_size, groups=d_model)
         self.norm2 = nn.LayerNorm(d_model)
         ffn = d_model * expand
         self.ffn_gate = nn.Linear(d_model, ffn, bias=False)
         self.ffn_up = nn.Linear(d_model, ffn, bias=False)
         self.ffn_down = nn.Linear(ffn, d_model, bias=False)

     def forward(self, x):
         """Apply the block to ``x`` of shape ``(B, T, d_model)``; shape is preserved."""
         residual = x
         # Conv1d expects (B, C, T), hence the transposes around it.
         x = self.norm1(x).transpose(1, 2)
         x = F.pad(x, (self.pad, 0))  # left-only padding keeps the conv causal
         x = F.silu(self.conv(x)).transpose(1, 2)
         x = residual + x
         residual = x
         x = self.norm2(x)
         x = self.ffn_down(F.silu(self.ffn_gate(x)) * self.ffn_up(x))
         return residual + x
def _rope(q, k):
    """Apply rotary position embeddings to a query/key pair.

    The last dimension of ``q`` is taken as ``head_dim`` and the second-to-last
    as the sequence position. The first and second halves of ``head_dim`` are
    rotated against each other by an angle of
    ``position * 10000 ** (-2i / head_dim)`` for pair index ``i``.
    ``k`` is rotated with the same angles, so it must broadcast against them
    (same sequence length and head_dim as ``q``).

    Returns:
        Tuple ``(q_rot, k_rot)`` with the shapes and dtype of the inputs.
    """
    head_dim = q.shape[-1]
    seq_len = q.shape[-2]
    half = head_dim // 2
    inv_freq = 1.0 / (
        10000.0 ** (torch.arange(0, head_dim, 2, device=q.device).float() / head_dim)
    )
    # Build positions directly in float so the outer product does not rely on
    # implicit int -> float promotion.
    pos = torch.arange(seq_len, device=q.device, dtype=inv_freq.dtype)
    angles = torch.outer(pos, inv_freq)
    cos = angles.cos().to(q.dtype)
    sin = angles.sin().to(q.dtype)

    def rot(x):
        x1, x2 = x[..., :half], x[..., half:]
        return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1)

    return rot(q), rot(k)
 class ByteAttnBlock(nn.Module):
     """Bidirectional multi-head self-attention with RoPE, then a SwiGLU FFN.

     Both sub-layers are pre-norm with a residual connection. Attention is
     computed over every position without any mask, so all positions attend to
     all others.

     Args:
         d_model: model width; must be divisible by ``n_heads``.
         n_heads: number of attention heads.
         expand: FFN hidden width as a multiple of ``d_model``.

     Raises:
         ValueError: if ``d_model`` is not divisible by ``n_heads``.
     """

     def __init__(self, d_model, n_heads=4, expand=2):
         super().__init__()
         if d_model % n_heads != 0:
             # Otherwise forward() fails later with an opaque reshape error.
             raise ValueError(
                 f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
             )
         self.n_heads = n_heads
         self.head_dim = d_model // n_heads
         self.norm1 = nn.LayerNorm(d_model)
         self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)
         self.out_proj = nn.Linear(d_model, d_model, bias=False)
         self.norm2 = nn.LayerNorm(d_model)
         ffn = d_model * expand
         self.ffn_gate = nn.Linear(d_model, ffn, bias=False)
         self.ffn_up = nn.Linear(d_model, ffn, bias=False)
         self.ffn_down = nn.Linear(ffn, d_model, bias=False)

     def forward(self, x):
         """Apply the block to ``x`` of shape ``(B, T, d_model)``; shape is preserved."""
         B, T, D = x.shape
         residual = x
         h = self.norm1(x)
         # One fused projection, split into q/k/v, each moved to (B, heads, T, head_dim).
         qkv = self.qkv(h).reshape(B, T, 3, self.n_heads, self.head_dim)
         q, k, v = (t.transpose(1, 2) for t in qkv.unbind(dim=2))
         q, k = _rope(q, k)
         attn = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
         attn = attn.softmax(dim=-1)
         out = (attn @ v).transpose(1, 2).contiguous().view(B, T, D)
         x = residual + self.out_proj(out)
         residual = x
         h = self.norm2(x)
         h = self.ffn_down(F.silu(self.ffn_gate(h)) * self.ffn_up(h))
         return residual + h
 class ByteHybrid(nn.Module):
     """Byte-level sequence classifier with optional trigram-hash augmentation.

     Pipeline: byte embedding (plus projected trigram-hash embedding when
     enabled) -> ``n_conv`` conv blocks -> ``n_attn`` attention blocks ->
     layer norm -> mean over non-padding positions -> classification head.

     Args:
         num_classes: number of output logits.
         d_model: hidden width.
         n_conv: number of ``ByteConvBlock`` layers.
         n_attn: number of ``ByteAttnBlock`` layers.
         n_heads: attention heads per attention block.
         ffn_expand: FFN width multiplier passed to every block.
         max_len: stored on the module as ``self.max_len``; not enforced here.
         conv_kernel: kernel size of the conv blocks.
         ngram_buckets: trigram hash-table size; 0 disables the n-gram branch.
         ngram_dim: width of the trigram embeddings before projection.
     """

     def __init__(
         self,
         num_classes,
         d_model=256,
         n_conv=3,
         n_attn=1,
         n_heads=4,
         ffn_expand=2,
         max_len=512,
         conv_kernel=15,
         ngram_buckets=0,
         ngram_dim=64,
     ):
         super().__init__()
         self.max_len = max_len
         # Byte values 0-255 plus index 256 = padding token
         self.embed = nn.Embedding(257, d_model, padding_idx=256)
         self.ngram_embed = None
         if ngram_buckets > 0:
             self.ngram_embed = ByteNgramEmbed(ngram_buckets, ngram_dim, n=3)
             self.ngram_proj = nn.Linear(ngram_dim, d_model, bias=False)
         self.conv_layers = nn.ModuleList(
             [ByteConvBlock(d_model, conv_kernel, ffn_expand) for _ in range(n_conv)]
         )
         self.attn_layers = nn.ModuleList(
             [ByteAttnBlock(d_model, n_heads, ffn_expand) for _ in range(n_attn)]
         )
         self.final_norm = nn.LayerNorm(d_model)
         # NOTE(review): the tail of this head was not visible in the truncated
         # source, which left num_classes unused. It is reconstructed here as
         # Dropout + Linear(d_model, num_classes); the dropout rate is an
         # assumption. Confirm the layer order against the checkpoint's
         # state_dict keys (the classifier should load as "head.3.*").
         self.head = nn.Sequential(
             nn.Linear(d_model, d_model),
             nn.GELU(),
             nn.Dropout(0.1),
             nn.Linear(d_model, num_classes),
         )

     def forward(self, byte_ids):
         """Classify ``byte_ids`` of shape ``(B, T)`` (padding id 256); returns ``(B, num_classes)``."""
         pad_mask = byte_ids != 256
         x = self.embed(byte_ids)
         if self.ngram_embed is not None:
             x = x + self.ngram_proj(self.ngram_embed(byte_ids))
         for layer in self.conv_layers:
             x = layer(x)
         for layer in self.attn_layers:
             x = layer(x)
         x = self.final_norm(x)
         # Masked mean-pool: padding positions contribute nothing, and the
         # clamp avoids dividing by zero when a row is entirely padding.
         mask = pad_mask.unsqueeze(-1).to(x.dtype)
         x = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
         return self.head(x)
# Single shipped configuration. Per the original note, the checkpoint records
# which config it was trained with under its "config" key.
CONFIGS = {
    "base_ngram": dict(
        d_model=256, n_conv=3, n_attn=1, n_heads=4, conv_kernel=15,
        ngram_buckets=4096, ngram_dim=64,
    ),
}