Spaces:

Ghostgim
/

ghostlm

Sleeping

App Files Files Community

Ghostgim committed 15 days ago

Commit

c6fa371

verified ·

1 Parent(s): 5126953

fix(model): sync ghostlm package to v0.9 (SwiGLU + RMSNorm + RoPE)

Browse files

The Space's bundled ghostlm/ package was the v0.4-era model.py without
SwiGLU / RMSNorm support, so loading the v0.9 chat checkpoint failed at
startup with size mismatches on every blocks.*.ffn.* weight: the v0.9
checkpoint stores SwiGLU-compressed FFN weights at hidden=2048
(int(d_ff * 2/3) rounded to a multiple of 64), but the old GELU
FeedForward in the Space's model.py allocated full d_ff=3072.

Syncs five files from the GhostLM main branch:
- ghostlm/__init__.py (re-exports + version)
- ghostlm/config.py (use_rope / use_swiglu / use_rmsnorm flags)
- ghostlm/model.py (SwiGLU class, RMSNorm class, RoPE rotary embed)
- ghostlm/tokenizer.py (50264 vocab + 7 special tokens, chat roles)
- ghostlm/trainer.py (kept aligned even though Space does not train)

Files changed (5) hide show

ghostlm/__init__.py +1 -1
ghostlm/config.py +14 -0
ghostlm/model.py +75 -7
ghostlm/tokenizer.py +127 -0
ghostlm/trainer.py +53 -6

ghostlm/__init__.py CHANGED Viewed

@@ -6,7 +6,7 @@ from ghostlm.tokenizer import GhostTokenizer
 from ghostlm.dataset import GhostDataset, build_dataloaders
 from ghostlm.trainer import GhostTrainer
-__version__ = "0.1.0"
 __author__ = "Joe Munene"
 __all__ = [

 from ghostlm.dataset import GhostDataset, build_dataloaders
 from ghostlm.trainer import GhostTrainer
+__version__ = "0.5.0"
 __author__ = "Joe Munene"
 __all__ = [

ghostlm/config.py CHANGED Viewed

@@ -21,6 +21,9 @@ class GhostLMConfig:
     dropout: float = 0.1
     bias: bool = True
     use_rope: bool = False
     use_flash_attention: bool = False
     # Training
@@ -107,6 +110,17 @@ class GhostLMConfig:
                 "n_heads": 12,
                 "d_ff": 3072,
             },
         }
         if preset not in presets:

     dropout: float = 0.1
     bias: bool = True
     use_rope: bool = False
+    rope_base: float = 10000.0
+    use_swiglu: bool = False
+    use_rmsnorm: bool = False
     use_flash_attention: bool = False
     # Training
                 "n_heads": 12,
                 "d_ff": 3072,
             },
+            # v0.5 preset — same param shape as ghost-small but flips on
+            # the modern-arch switches. Use this for the v0.4.2 retrain.
+            "ghost-small-v0.5": {
+                "n_layers": 6,
+                "d_model": 512,
+                "n_heads": 8,
+                "d_ff": 2048,
+                "use_rope": True,
+                "use_swiglu": True,
+                "use_rmsnorm": True,
+            },
         }
         if preset not in presets:

ghostlm/model.py CHANGED Viewed

@@ -58,6 +58,33 @@ def apply_rotary_pos_emb(q, k, cos, sin):
     return q_embed, k_embed
 class CausalSelfAttention(nn.Module):
     """Multi-head causal self-attention with autoregressive masking.
@@ -187,6 +214,40 @@ class FeedForward(nn.Module):
         return x
 class TransformerBlock(nn.Module):
     """Single transformer decoder block with pre-normalization.
@@ -199,13 +260,15 @@ class TransformerBlock(nn.Module):
         """Initialize the transformer block.
         Args:
-            config: GhostLMConfig passed to sub-modules.
         """
         super().__init__()
-        self.ln_1 = nn.LayerNorm(config.d_model)
         self.attn = CausalSelfAttention(config)
-        self.ln_2 = nn.LayerNorm(config.d_model)
-        self.ffn = FeedForward(config)
     def forward(self, x):
         """Forward pass through the transformer block.
@@ -250,8 +313,8 @@ class GhostLM(nn.Module):
             [TransformerBlock(config) for _ in range(config.n_layers)]
         )
-        # Final layer norm
-        self.ln_f = nn.LayerNorm(config.d_model)
         # Output head with weight tying (no bias)
         self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
@@ -385,7 +448,12 @@ class GhostLM(nn.Module):
         no_decay = set()
         whitelist = (nn.Linear,)
-        blacklist = (nn.LayerNorm, nn.Embedding)
         for mn, m in self.named_modules():
             for pn, p in m.named_parameters():

     return q_embed, k_embed
class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension.

    Unlike ``nn.LayerNorm`` there is no mean subtraction and no bias: the
    input is divided by its RMS and multiplied by a learned per-feature
    scale. Selected through ``GhostLMConfig.use_rmsnorm``.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        """Create the learned scale vector.

        Args:
            dim: Size of the last (feature) dimension being normalized.
            eps: Constant added to the mean square before taking the
                reciprocal square root, so an all-zero input stays finite.
        """
        super().__init__()
        self.eps = eps
        # Initialized to ones, so a fresh layer only rescales to unit RMS.
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Return ``x`` scaled to unit RMS along the last dim, times ``weight``.

        The arithmetic runs in float32 and the result is cast back to the
        dtype of ``x``.
        """
        x32 = x.float()
        inv_rms = torch.rsqrt(x32.pow(2).mean(-1, keepdim=True) + self.eps)
        scaled = x32 * inv_rms
        return (scaled * self.weight).to(x.dtype)
def make_norm(config: GhostLMConfig, dim: int) -> nn.Module:
    """Build the normalization layer selected by ``config.use_rmsnorm``.

    Args:
        config: Model configuration. A missing ``use_rmsnorm`` attribute is
            treated as ``False``.
        dim: Feature dimension to normalize over.

    Returns:
        ``RMSNorm(dim)`` when the flag is truthy, otherwise
        ``nn.LayerNorm(dim)``.
    """
    norm_cls = RMSNorm if getattr(config, "use_rmsnorm", False) else nn.LayerNorm
    return norm_cls(dim)
 class CausalSelfAttention(nn.Module):
     """Multi-head causal self-attention with autoregressive masking.
         return x
class SwiGLU(nn.Module):
    """Gated feed-forward layer: ``fc3(SiLU(fc1(x)) * fc2(x))``.

    Two bias-free projections map ``d_model`` to a hidden width of roughly
    ``2/3 * d_ff``; one is passed through SiLU and gates the other, and a
    third bias-free projection maps the product back to ``d_model``. The
    hidden width is reduced to offset the extra projection relative to a
    two-projection FFN at the same ``d_ff``. Selected through
    ``GhostLMConfig.use_swiglu``.
    """

    def __init__(self, config: GhostLMConfig):
        """Create the three projections and the output dropout.

        Args:
            config: Supplies ``d_model``, ``d_ff`` and ``dropout``.
        """
        super().__init__()
        # Target width is 2/3 of d_ff, truncated to an int ...
        target = int(config.d_ff * 2 / 3)
        # ... then rounded UP to the next multiple of 64 to keep matmul
        # shapes friendly on MPS / CUDA.
        n_chunks = (target + 63) // 64
        hidden = n_chunks * 64

        d_model = config.d_model
        self.fc1 = nn.Linear(d_model, hidden, bias=False)  # gate branch
        self.fc2 = nn.Linear(d_model, hidden, bias=False)  # value branch
        self.fc3 = nn.Linear(hidden, d_model, bias=False)  # output projection
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply the gated FFN followed by dropout."""
        gate = F.silu(self.fc1(x))
        value = self.fc2(x)
        projected = self.fc3(gate * value)
        return self.dropout(projected)
def make_ffn(config: GhostLMConfig) -> nn.Module:
    """Build the feed-forward layer selected by ``config.use_swiglu``.

    Args:
        config: Model configuration. A missing ``use_swiglu`` attribute is
            treated as ``False``.

    Returns:
        ``SwiGLU(config)`` when the flag is truthy, otherwise
        ``FeedForward(config)``.
    """
    ffn_cls = SwiGLU if getattr(config, "use_swiglu", False) else FeedForward
    return ffn_cls(config)
 class TransformerBlock(nn.Module):
     """Single transformer decoder block with pre-normalization.
         """Initialize the transformer block.
         Args:
+            config: GhostLMConfig passed to sub-modules. Switches between
+                LayerNorm / RMSNorm and FeedForward / SwiGLU based on the
+                ``use_rmsnorm`` and ``use_swiglu`` flags.
         """
         super().__init__()
+        self.ln_1 = make_norm(config, config.d_model)
         self.attn = CausalSelfAttention(config)
+        self.ln_2 = make_norm(config, config.d_model)
+        self.ffn = make_ffn(config)
     def forward(self, x):
         """Forward pass through the transformer block.
             [TransformerBlock(config) for _ in range(config.n_layers)]
         )
+        # Final layer norm — RMSNorm or LayerNorm depending on config.
+        self.ln_f = make_norm(config, config.d_model)
         # Output head with weight tying (no bias)
         self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
         no_decay = set()
         whitelist = (nn.Linear,)
+        # RMSNorm is custom (defined in this module), so we include it in the
+        # blacklist by class — its `.weight` should be no-decay just like
+        # LayerNorm's. Without this, the v0.5 ghost-small-v0.5 preset
+        # crashes at optimizer setup with every block's ln_*.weight
+        # uncategorized.
+        blacklist = (nn.LayerNorm, nn.Embedding, RMSNorm)
         for mn, m in self.named_modules():
             for pn, p in m.named_parameters():

ghostlm/tokenizer.py CHANGED Viewed

@@ -319,3 +319,130 @@ class GhostTokenizer:
             String like: GhostTokenizer(vocab_size=50261, special_tokens=4)
         """
         return f"GhostTokenizer(vocab_size={self.vocab_size}, special_tokens={len(self._special_tokens)})"

             String like: GhostTokenizer(vocab_size=50261, special_tokens=4)
         """
         return f"GhostTokenizer(vocab_size={self.vocab_size}, special_tokens={len(self._special_tokens)})"
class GhostTokenizerV05:
    """v0.5 tokenizer: a BPE vocabulary loaded through HuggingFace ``tokenizers``.

    Exposes the same API surface as ``GhostTokenizer`` (``encode``,
    ``decode``, ``encode_chat``, ``format_chat_prompt``, ``vocab_size``,
    ``_special_tokens``) so dataset / trainer / chat code can use either
    backend. Special tokens are looked up by name in the loaded vocabulary,
    so the chat-format logic depends only on the name -> ID mapping and not
    on where the IDs sit in the vocab.

    Trained by ``scripts/train_tokenizer.py``. Load via ``from_file``.
    """

    # Same special-token names as the legacy tokenizer.
    BOS = GhostTokenizer.BOS
    EOS = GhostTokenizer.EOS
    PAD = GhostTokenizer.PAD
    UNK = GhostTokenizer.UNK
    USER = GhostTokenizer.USER
    ASSISTANT = GhostTokenizer.ASSISTANT
    END = GhostTokenizer.END

    def __init__(self, path: str):
        """Load a trained ``tokenizer.json`` and resolve the special-token IDs.

        Args:
            path: Path to the serialized HuggingFace tokenizer file.
        """
        # Imported lazily so the legacy tokenizer works without `tokenizers`.
        from tokenizers import Tokenizer

        self._tok = Tokenizer.from_file(path)
        self._vocab_size = self._tok.get_vocab_size()
        # NOTE(review): token_to_id returns None for a name missing from the
        # vocab; such an entry is stored as-is here.
        self._special_tokens = {
            name: self._tok.token_to_id(name)
            for name in (
                self.BOS, self.EOS, self.PAD, self.UNK,
                self.USER, self.ASSISTANT, self.END,
            )
        }
        self._id_to_special = {v: k for k, v in self._special_tokens.items()}

    @classmethod
    def from_file(cls, path: str) -> "GhostTokenizerV05":
        """Alternate constructor; equivalent to ``GhostTokenizerV05(path)``."""
        return cls(path)

    @property
    def vocab_size(self) -> int:
        """Return the total vocabulary size reported by the loaded tokenizer."""
        return self._vocab_size

    def _special_token_ids(self) -> set:
        """Return the set of special-token integer IDs."""
        return set(self._special_tokens.values())

    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False) -> List[int]:
        """Encode text to token IDs.

        Args:
            text: Input string.
            add_bos: Prepend the BOS token ID when True.
            add_eos: Append the EOS token ID when True.

        Returns:
            List of token IDs.
        """
        ids = self._tok.encode(text).ids
        if add_bos:
            ids = [self._special_tokens[self.BOS]] + ids
        if add_eos:
            ids = ids + [self._special_tokens[self.EOS]]
        return ids

    def decode(self, ids: List[int], skip_special: bool = True) -> str:
        """Decode token IDs back to text.

        Args:
            ids: Token IDs to decode.
            skip_special: When True, the seven GhostLM special tokens are
                removed before decoding. When False, they are kept in the
                output.

        Returns:
            The decoded string.
        """
        if skip_special:
            specials = self._special_token_ids()
            ids = [i for i in ids if i not in specials]
        # Tokenizer.decode defaults to skip_special_tokens=True, which would
        # silently drop special tokens even when the caller asked to keep
        # them; forward the flag explicitly.
        return self._tok.decode(ids, skip_special_tokens=skip_special)

    def encode_batch(self, texts: List[str], add_bos: bool = False, add_eos: bool = False) -> List[List[int]]:
        """Encode each text with ``encode`` and return the list of ID lists."""
        return [self.encode(t, add_bos=add_bos, add_eos=add_eos) for t in texts]

    def encode_chat(self, turns: List[dict]) -> tuple:
        """Build ``(token_ids, loss_mask)`` for a chat conversation.

        Each turn is laid out as ``<role marker> content <end marker>``.
        The mask is 1 on assistant content and on the end marker closing an
        assistant turn; it is 0 on every role marker and on all user tokens.

        Args:
            turns: Dicts with ``"role"`` (``"user"`` or ``"assistant"``) and
                ``"content"`` keys.

        Returns:
            Tuple ``(ids, mask)`` of two equal-length lists of ints.

        Raises:
            ValueError: If a turn has a role other than user / assistant.
        """
        user_id = self._special_tokens[self.USER]
        assistant_id = self._special_tokens[self.ASSISTANT]
        end_id = self._special_tokens[self.END]

        ids: List[int] = []
        mask: List[int] = []
        for turn in turns:
            role = turn["role"]
            content_ids = self._tok.encode(turn["content"]).ids
            if role == "user":
                marker_id, learn = user_id, 0
            elif role == "assistant":
                marker_id, learn = assistant_id, 1
            else:
                raise ValueError(f"Unknown role: {role!r}")

            ids.append(marker_id)
            ids.extend(content_ids)
            ids.append(end_id)

            # The role marker is never a training target; content and the
            # closing end marker follow the role's `learn` flag.
            mask.append(0)
            mask.extend([learn] * len(content_ids))
            mask.append(learn)
        return ids, mask

    def format_chat_prompt(self, turns: List[dict]) -> List[int]:
        """Build an inference prompt: the encoded turns plus a trailing assistant marker."""
        ids, _ = self.encode_chat(turns)
        ids.append(self._special_tokens[self.ASSISTANT])
        return ids

    def to_tensor(self, ids: List[int], device: str = "cpu") -> torch.Tensor:
        """Convert ids to a ``(1, T)`` ``torch.long`` tensor on ``device``."""
        return torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(0)

    def __len__(self) -> int:
        """Return the vocab size."""
        return self._vocab_size

    def __repr__(self) -> str:
        """Return a concise summary with vocab size and special-token count."""
        return f"GhostTokenizerV05(vocab_size={self._vocab_size}, special_tokens={len(self._special_tokens)})"
def load_tokenizer(path: Optional[str] = None):
    """Return the tokenizer backend appropriate for ``path``.

    Args:
        path: Optional path to a trained ``tokenizer.json``.

    Returns:
        ``GhostTokenizerV05(path)`` when ``path`` is non-empty and the file
        exists; otherwise a legacy ``GhostTokenizer()``. Training code can
        pass ``config.tokenizer_path`` without caring which backend results.
    """
    if not path:
        return GhostTokenizer()
    if not Path(path).exists():
        # A path that does not exist falls back to the legacy tokenizer.
        return GhostTokenizer()
    return GhostTokenizerV05(path)

ghostlm/trainer.py CHANGED Viewed

@@ -48,19 +48,54 @@ class GhostTrainer:
         else:
             self.device = config.device
         self.model = self.model.to(self.device)
-        # Mixed precision (AMP) — only effective on CUDA
         if use_amp is None:
-            self.use_amp = self.device == "cuda"
         else:
-            self.use_amp = use_amp and self.device == "cuda"
         self.grad_scaler = torch.amp.GradScaler("cuda", enabled=self.use_amp)
-        # Optimizer
         self.optimizer = self.model.configure_optimizers(config)
         # Create directories
         self.checkpoint_dir = Path(config.checkpoint_dir)
         self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
@@ -188,13 +223,23 @@ class GhostTrainer:
         state dict, and config. Also saves as "best_model.pt" if the current
         validation loss is the best seen so far.
         Args:
             val_loss: Current validation loss for comparison.
         """
         checkpoint = {
             "step": self.step,
             "val_loss": val_loss,
-            "model_state_dict": self.model.state_dict(),
             "optimizer_state_dict": self.optimizer.state_dict(),
             "grad_scaler_state_dict": self.grad_scaler.state_dict(),
             "config": asdict(self.config),
@@ -222,7 +267,9 @@ class GhostTrainer:
         """
         checkpoint = torch.load(path, map_location=self.device, weights_only=False)
-        self.model.load_state_dict(checkpoint["model_state_dict"])
         self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
         if "grad_scaler_state_dict" in checkpoint:
             self.grad_scaler.load_state_dict(checkpoint["grad_scaler_state_dict"])

         else:
             self.device = config.device
+        # Distributed training support (issue #8). Detect whether we are
+        # running inside torchrun / torch.distributed.launch by reading the
+        # standard env vars; if so, set the local-rank device and wrap the
+        # model in DistributedDataParallel after moving to device.
+        # Single-GPU / CPU training is the default and unchanged.
+        self.is_distributed = (
+            "RANK" in os.environ
+            and "WORLD_SIZE" in os.environ
+            and int(os.environ.get("WORLD_SIZE", "1")) > 1
+        )
+        self.local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+        self.world_size = int(os.environ.get("WORLD_SIZE", "1"))
+        self.global_rank = int(os.environ.get("RANK", "0"))
+        self.is_main_process = self.global_rank == 0
+        if self.is_distributed:
+            import torch.distributed as dist
+            backend = "nccl" if torch.cuda.is_available() else "gloo"
+            if not dist.is_initialized():
+                dist.init_process_group(backend=backend)
+            if torch.cuda.is_available():
+                torch.cuda.set_device(self.local_rank)
+                self.device = f"cuda:{self.local_rank}"
         self.model = self.model.to(self.device)
+        # Mixed precision (AMP), only effective on CUDA
         if use_amp is None:
+            self.use_amp = self.device.startswith("cuda")
         else:
+            self.use_amp = use_amp and self.device.startswith("cuda")
         self.grad_scaler = torch.amp.GradScaler("cuda", enabled=self.use_amp)
+        # Optimizer (built BEFORE wrapping in DDP so param groups see raw modules)
         self.optimizer = self.model.configure_optimizers(config)
+        # DDP wrap. Each rank now sees a self.model that does the all-reduce
+        # transparently in backward(). Other code paths that touch
+        # self.model.* still work because DDP forwards attribute access.
+        if self.is_distributed:
+            from torch.nn.parallel import DistributedDataParallel as DDP
+            ddp_kwargs = {}
+            if torch.cuda.is_available():
+                ddp_kwargs["device_ids"] = [self.local_rank]
+                ddp_kwargs["output_device"] = self.local_rank
+            self.model = DDP(self.model, **ddp_kwargs)
         # Create directories
         self.checkpoint_dir = Path(config.checkpoint_dir)
         self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
         state dict, and config. Also saves as "best_model.pt" if the current
         validation loss is the best seen so far.
+        Under distributed training, only rank 0 writes; the saved state_dict
+        unwraps DDP so checkpoints remain compatible with single-GPU loading.
         Args:
             val_loss: Current validation loss for comparison.
         """
+        # Only rank 0 writes checkpoints in DDP runs
+        if getattr(self, "is_distributed", False) and not self.is_main_process:
+            return
+        # Unwrap DDP to keep checkpoints loadable on a single GPU
+        raw_model = self.model.module if hasattr(self.model, "module") else self.model
         checkpoint = {
             "step": self.step,
             "val_loss": val_loss,
+            "model_state_dict": raw_model.state_dict(),
             "optimizer_state_dict": self.optimizer.state_dict(),
             "grad_scaler_state_dict": self.grad_scaler.state_dict(),
             "config": asdict(self.config),
         """
         checkpoint = torch.load(path, map_location=self.device, weights_only=False)
+        # Load into the raw model (works for DDP-wrapped or single-GPU)
+        raw_model = self.model.module if hasattr(self.model, "module") else self.model
+        raw_model.load_state_dict(checkpoint["model_state_dict"])
         self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
         if "grad_scaler_state_dict" in checkpoint:
             self.grad_scaler.load_state_dict(checkpoint["grad_scaler_state_dict"])