Update BF16 weights + code to modelv2 shards (region LN + finetune support)

#32

by err805 - opened Feb 9

base: refs/heads/main

←

from: refs/pr/32

Discussion Files changed

+1175

-784

Files changed (11) hide show

hf_moondream.py +0 -1
layers.py +38 -13
lora.py +411 -56
model.safetensors.index.json +0 -0
modelv2-00001-of-00004.safetensors +3 -0
modelv2-00002-of-00004.safetensors +3 -0
modelv2-00003-of-00004.safetensors +3 -0
modelv2-00004-of-00004.safetensors +3 -0
moondream.py +35 -29
region.py +2 -0
text.py +12 -23

hf_moondream.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import torch
 import torch.nn as nn
 from transformers import PreTrainedModel, PretrainedConfig
 from typing import Union

 import torch
 import torch.nn as nn
 from transformers import PreTrainedModel, PretrainedConfig
 from typing import Union

layers.py CHANGED Viewed

@@ -5,6 +5,14 @@ import torch.nn.functional as F
 from dataclasses import dataclass
 from typing import Literal, Optional
 try:
     from torchao import quantize_
     from torchao.quantization import int4_weight_only
@@ -126,11 +134,12 @@ class MLPWeights:
     act: Literal["gelu_approx"] = "gelu_approx"
-def mlp(x: torch.Tensor, w: MLPWeights, lora: Optional[dict] = None) -> torch.Tensor:
     x0 = w.fc1(x)
     if lora is not None:
-        x1 = F.linear(F.linear(x, lora["fc1"]["A"]), lora["fc1"]["B"])
-        x = x0 + x1
     else:
         x = x0
@@ -138,8 +147,7 @@ def mlp(x: torch.Tensor, w: MLPWeights, lora: Optional[dict] = None) -> torch.Te
     x0 = w.fc2(x)
     if lora is not None:
-        x1 = F.linear(F.linear(x, lora["fc2"]["A"]), lora["fc2"]["B"])
-        x = x0 + x1
     else:
         x = x0
@@ -147,7 +155,10 @@ def mlp(x: torch.Tensor, w: MLPWeights, lora: Optional[dict] = None) -> torch.Te
 def moe_mlp(
-    x: torch.Tensor, mlp_module: nn.Module, experts_per_token: int
 ) -> torch.Tensor:
     B, T, C = x.shape
     x = x.reshape(-1, C)
@@ -167,21 +178,23 @@ def moe_mlp(
         flat_weights = topk_weights.view(-1)  # [T*A]
         # Select expert weights
-        w1_selected = w1_weight[flat_idxs]  # [T*A, H, D]
-        w2_selected = w2_weight[flat_idxs]  # [T*A, D, H]
         # Expand input for all token-expert pairs
         x_expanded = x.unsqueeze(1).expand(-1, top_k, -1).reshape(-1, C)  # [T*A, D]
         # First linear layer with GeGLU: [T*A, H, D] @ [T*A, D, 1] -> [T*A, H]
-        x1_full = torch.bmm(w1_selected, x_expanded.unsqueeze(-1)).squeeze(
-            -1
-        )  # [T*A, H]
         x1, g = x1_full.chunk(2, dim=-1)
         x1 = F.gelu(x1) * (g + 1)
         # Second linear layer: [T*A, D, H] @ [T*A, H, 1] -> [T*A, D]
         expert_outs = torch.bmm(w2_selected, x1.unsqueeze(-1)).squeeze(-1)  # [T*A, D]
         # Apply weights and reshape
         weighted_outs = expert_outs * flat_weights.unsqueeze(-1)  # [T*A, D]
@@ -203,10 +216,22 @@ def moe_mlp(
             x_tok = x.index_select(0, token_pos)
             gate_tok = topk_weights[token_pos, which_k]
-            h_full = F.linear(x_tok, mlp_module.fc1.weight[expert_id])
             h, g = h_full.chunk(2, dim=-1)
             h = F.gelu(h) * (g + 1)
-            y = F.linear(h, mlp_module.fc2.weight[expert_id])
             y.mul_(gate_tok.unsqueeze(-1))
             out.index_add_(0, token_pos, y)

 from dataclasses import dataclass
 from typing import Literal, Optional
+from .lora import (
+    DenseLoRALayer,
+    MoELoRALayer,
+    apply_dense_lora,
+    apply_moe_lora_fc1_flat,
+    apply_moe_lora_fc2_flat,
+)
 try:
     from torchao import quantize_
     from torchao.quantization import int4_weight_only
     act: Literal["gelu_approx"] = "gelu_approx"
+def mlp(
+    x: torch.Tensor, w: MLPWeights, lora: Optional[DenseLoRALayer] = None
+) -> torch.Tensor:
     x0 = w.fc1(x)
     if lora is not None:
+        x = x0 + apply_dense_lora(x, lora.up_a, lora.up_b)
     else:
         x = x0
     x0 = w.fc2(x)
     if lora is not None:
+        x = x0 + apply_dense_lora(x, lora.down_a, lora.down_b)
     else:
         x = x0
 def moe_mlp(
+    x: torch.Tensor,
+    mlp_module: nn.Module,
+    experts_per_token: int,
+    lora: Optional[MoELoRALayer] = None,
 ) -> torch.Tensor:
     B, T, C = x.shape
     x = x.reshape(-1, C)
         flat_weights = topk_weights.view(-1)  # [T*A]
         # Select expert weights
+        w1_selected = w1_weight[flat_idxs]
+        w2_selected = w2_weight[flat_idxs]
         # Expand input for all token-expert pairs
         x_expanded = x.unsqueeze(1).expand(-1, top_k, -1).reshape(-1, C)  # [T*A, D]
         # First linear layer with GeGLU: [T*A, H, D] @ [T*A, D, 1] -> [T*A, H]
+        x1_full = torch.bmm(w1_selected, x_expanded.unsqueeze(-1)).squeeze(-1)  # [T*A, H]
+        if lora is not None:
+            x1_full = x1_full + apply_moe_lora_fc1_flat(x_expanded, lora, flat_idxs)
         x1, g = x1_full.chunk(2, dim=-1)
         x1 = F.gelu(x1) * (g + 1)
         # Second linear layer: [T*A, D, H] @ [T*A, H, 1] -> [T*A, D]
         expert_outs = torch.bmm(w2_selected, x1.unsqueeze(-1)).squeeze(-1)  # [T*A, D]
+        if lora is not None:
+            expert_outs = expert_outs + apply_moe_lora_fc2_flat(x1, lora, flat_idxs)
         # Apply weights and reshape
         weighted_outs = expert_outs * flat_weights.unsqueeze(-1)  # [T*A, D]
             x_tok = x.index_select(0, token_pos)
             gate_tok = topk_weights[token_pos, which_k]
+            w1 = mlp_module.fc1.weight[expert_id]
+            h_full = F.linear(x_tok, w1)
+            if lora is not None:
+                lora_up_a = lora.up_a[expert_id]
+                lora_up_b = lora.up_b[expert_id]
+                lora_mid = F.linear(x_tok, lora_up_a)
+                h_full = h_full + F.linear(lora_mid, lora_up_b)
             h, g = h_full.chunk(2, dim=-1)
             h = F.gelu(h) * (g + 1)
+            w2 = mlp_module.fc2.weight[expert_id]
+            y = F.linear(h, w2)
+            if lora is not None:
+                lora_down_a = lora.down_a[expert_id]
+                lora_down_b = lora.down_b[expert_id]
+                lora_mid = F.linear(h, lora_down_a)
+                y = y + F.linear(lora_mid, lora_down_b)
             y.mul_(gate_tok.unsqueeze(-1))
             out.index_add_(0, token_pos, y)

lora.py CHANGED Viewed

@@ -1,82 +1,437 @@
-import functools
 import os
 import shutil
-import torch
 from pathlib import Path
 from urllib.request import Request, urlopen
-from typing import Optional
-def variant_cache_dir():
     hf_hub_cache = os.environ.get("HF_HUB_CACHE")
-    if hf_hub_cache is not None:
-        return Path(hf_hub_cache) / "md_variants"
     hf_home = os.environ.get("HF_HOME")
-    if hf_home is not None:
-        return Path(hf_home) / "hub" / "md_variants"
-    return Path("~/.cache/huggingface/hub").expanduser() / "md_variants"
-def cached_variant_path(variant_id: str):
-    variant, *rest = variant_id.split("/", 1)
-    step = rest[0] if rest else "final"
-    cache_dir = variant_cache_dir() / variant
-    os.makedirs(cache_dir, exist_ok=True)
-    dest = cache_dir / f"{step}.pt"
-    if dest.exists():
-        return dest
-    md_endpoint = os.getenv("MOONDREAM_ENDPOINT", "https://api.moondream.ai")
-    headers = {"User-Agent": "moondream-torch"}
     api_key = os.getenv("MOONDREAM_API_KEY")
-    if api_key is not None:
-        headers["X-Moondream-Auth"] = api_key
-    req = Request(f"{md_endpoint}/v1/variants/{variant_id}/download", headers=headers)
-    with urlopen(req) as r, open(dest, "wb") as f:
-        shutil.copyfileobj(r, f)
     return dest
-def nest(flat):
-    tree = {}
-    for k, v in flat.items():
-        parts = k.split(".")
-        d = tree
-        for p in parts[:-1]:
-            d = d.setdefault(p, {})
-        d[parts[-1]] = v
-    return tree
-@functools.lru_cache(maxsize=5)
-def variant_state_dict(variant_id: Optional[str] = None, device: str = "cpu"):
-    if variant_id is None:
         return None
-    state_dict = torch.load(
-        cached_variant_path(variant_id), map_location=device, weights_only=True
     )
-    # TODO: Move these into the training code that saves checkpoints...
-    rename_rules = [
-        ("text_model.transformer.h", "text.blocks"),
-        (".mixer", ".attn"),
-        (".out_proj", ".proj"),
-        (".Wqkv", ".qkv"),
-        (".parametrizations.weight.0", ""),
-    ]
-    new_state_dict = {}
-    for key, tensor in state_dict.items():
-        new_key = key
-        for old, new in rename_rules:
-            if old in new_key:
-                new_key = new_key.replace(old, new)
-        new_state_dict[new_key] = tensor
-    return nest(new_state_dict)

+import json
 import os
+import re
 import shutil
+from dataclasses import dataclass
 from pathlib import Path
+from typing import Any, Dict, Optional, Tuple
 from urllib.request import Request, urlopen
+import torch
+from .config import TextConfig
class AdapterLoadError(RuntimeError):
    """Raised when a finetune adapter cannot be resolved, downloaded, or parsed."""
+def _cache_root() -> Path:
     hf_hub_cache = os.environ.get("HF_HUB_CACHE")
+    if hf_hub_cache:
+        return Path(hf_hub_cache)
     hf_home = os.environ.get("HF_HOME")
+    if hf_home:
+        return Path(hf_home) / "hub"
+    return Path("~/.cache/huggingface/hub").expanduser()
def adapter_cache_dir() -> Path:
    """Return the ``md_finetunes`` directory under the hub cache root."""
    root = _cache_root()
    return root / "md_finetunes"
def normalize_adapter_id(value: Optional[str]) -> Optional[str]:
    """Extract a ``finetune_id@step`` adapter id from *value*.

    Only the text after the last ``/`` is considered, with surrounding
    whitespace stripped. Returns ``None`` when *value* is empty/``None`` or
    when that trailing segment contains no ``@``.
    """
    if not value:
        return None
    candidate = value.rpartition("/")[2].strip()
    return candidate if "@" in candidate else None
def parse_adapter_id(adapter_id: str) -> Tuple[str, str]:
    """Split ``finetune_id@step`` into its two parts.

    The split happens at the first ``@``; anything after it (including
    further ``@`` characters) is the step.

    Raises:
        AdapterLoadError: if *adapter_id* is empty, has no ``@``, or either
            side of the first ``@`` is empty.
    """
    if adapter_id and "@" in adapter_id:
        finetune_id, step = adapter_id.split("@", 1)
        if finetune_id and step:
            return finetune_id, step
    raise AdapterLoadError(
        f"Invalid adapter id '{adapter_id}'. Expected 'finetune_id@step'."
    )
def _fetch_presigned_url(finetune_id: str, step: str, *, timeout: float = 30.0) -> str:
    """Ask the Moondream API for the download URL of one finetune checkpoint.

    The API base comes from ``MOONDREAM_ENDPOINT`` (default
    ``https://api.moondream.ai``) and the request is authenticated with
    ``MOONDREAM_API_KEY`` via the ``X-Moondream-Auth`` header. The response
    body is parsed as JSON and its ``"url"`` field is returned.

    Args:
        finetune_id: Finetune identifier (left side of ``finetune_id@step``).
        step: Checkpoint step (right side of ``finetune_id@step``).
        timeout: Socket timeout in seconds for the API request, so a stalled
            connection fails instead of blocking forever.

    Raises:
        AdapterLoadError: if the API key is missing, the request or JSON
            decoding fails, or the response carries no usable ``"url"``.
    """
    # Function-scope import: nothing else in this module needs it.
    from urllib.parse import quote

    endpoint = os.getenv("MOONDREAM_ENDPOINT", "https://api.moondream.ai").rstrip("/")
    api_key = os.getenv("MOONDREAM_API_KEY")
    if not api_key:
        raise AdapterLoadError("MOONDREAM_API_KEY is required to load finetune adapters.")
    headers = {"User-Agent": "moondream-torch", "X-Moondream-Auth": api_key}
    # Percent-encode both ids so characters such as "/", "?" or "#" cannot
    # change which API route is hit.
    url = (
        f"{endpoint}/v1/tuning/finetunes/{quote(finetune_id, safe='')}"
        f"/checkpoints/{quote(step, safe='')}/download"
    )
    req = Request(url, headers=headers)
    try:
        with urlopen(req, timeout=timeout) as r:
            payload = json.loads(r.read().decode("utf-8"))
    except Exception as e:
        raise AdapterLoadError(f"Failed to fetch adapter URL: {e}") from e
    # Valid JSON that is not an object (list, string, null) has no ".get".
    presigned = payload.get("url") if isinstance(payload, dict) else None
    if not presigned:
        raise AdapterLoadError("Adapter URL response missing 'url' field.")
    return presigned
def cached_adapter_path(adapter_id: str) -> Path:
    """Return the local path of an adapter checkpoint, downloading it if needed.

    Files live under ``adapter_cache_dir() / finetune_id / step``. An existing
    non-empty ``adapter.pt`` or ``adapter.safetensors`` there is returned
    as-is; otherwise a presigned URL is fetched and the payload is saved as
    ``adapter.pt``.

    The download is written to a temporary sibling file and moved into place
    only after it completes, so an interrupted transfer can never leave a
    truncated ``adapter.pt`` that later calls would treat as a cache hit.

    Raises:
        AdapterLoadError: if *adapter_id* is malformed, contains path
            components that would escape the cache directory, or the
            download fails.
    """
    finetune_id, step = parse_adapter_id(adapter_id)
    # Both parts become directory names; refuse anything that could walk out
    # of (or sideways within) the cache tree.
    for part in (finetune_id, step):
        if part in (".", "..") or "/" in part or "\\" in part:
            raise AdapterLoadError(
                f"Invalid adapter id '{adapter_id}'. Expected 'finetune_id@step'."
            )
    cache_dir = adapter_cache_dir() / finetune_id / step
    cache_dir.mkdir(parents=True, exist_ok=True)
    for name in ("adapter.pt", "adapter.safetensors"):
        path = cache_dir / name
        if path.exists() and path.stat().st_size > 0:
            return path
    presigned_url = _fetch_presigned_url(finetune_id, step)
    dest = cache_dir / "adapter.pt"
    # Per-process temp name so concurrent downloaders do not share a file.
    partial = cache_dir / f"adapter.pt.{os.getpid()}.part"
    try:
        with urlopen(presigned_url, timeout=60) as r, open(partial, "wb") as f:
            shutil.copyfileobj(r, f)
        os.replace(partial, dest)
    except Exception as e:
        # Best-effort cleanup; the download error is the one worth reporting.
        try:
            partial.unlink()
        except OSError:
            pass
        raise AdapterLoadError(f"Failed to download adapter: {e}") from e
    return dest
+def _load_state_dict(path: Path, device: torch.device) -> Dict[str, Any]:
+    if path.suffix == ".safetensors":
+        try:
+            from safetensors.torch import safe_open
+        except Exception as e:
+            raise AdapterLoadError(
+                "safetensors is required to load .safetensors adapters."
+            ) from e
+        data = {}
+        with safe_open(str(path), framework="pt") as f:
+            for key in f.keys():
+                data[key] = f.get_tensor(key).to(device=device)
+        return data
+    try:
+        return torch.load(path, map_location=device, weights_only=True)
+    except TypeError:
+        return torch.load(path, map_location=device)
@dataclass
class DenseLoRALayer:
    """LoRA factors for one dense MLP layer.

    ``up_*`` is the pair added to the first projection (``fc1``) and
    ``down_*`` the pair added to the second (``fc2``). Instances are mutable:
    loaders overwrite individual fields after construction.
    """

    # Shapes below are the ones TextLoRA allocates for these fields.
    up_a: torch.Tensor  # (max_rank, d_model)
    up_b: torch.Tensor  # (d_ffn, max_rank)
    down_a: torch.Tensor  # (max_rank, d_ffn)
    down_b: torch.Tensor  # (d_model, max_rank)
@dataclass
class MoELoRALayer:
    """LoRA factors for one mixture-of-experts MLP layer.

    Every tensor carries a leading expert axis that is indexed by expert id.
    ``up_*`` is the pair added to ``fc1`` and ``down_*`` the pair added to
    ``fc2``. Instances are mutable: loaders overwrite individual fields after
    construction.
    """

    # Shapes below are the ones TextLoRA allocates for these fields
    # (r = max_rank_per_expert).
    up_a: torch.Tensor  # (num_experts, r, d_model)
    up_b: torch.Tensor  # (num_experts, 2 * d_expert, r)
    down_a: torch.Tensor  # (num_experts, r, d_expert)
    down_b: torch.Tensor  # (num_experts, d_model, r)
class TextLoRA:
    """Per-layer LoRA tensors for the text model's MLP blocks.

    Layers ``0 .. start_layer - 1`` get a ``DenseLoRALayer``; when
    ``text_config.moe`` is set, the remaining layers get a ``MoELoRALayer``.
    Every tensor is allocated at ``max_rank`` (``max_rank // experts_per_token``
    per expert for MoE layers) and zero-initialised, so tensor shapes depend
    only on the model config and ``max_rank``, never on the rank of the
    adapter that is loaded. Adapters with a smaller rank are zero-padded
    along their rank axis.
    """

    # "<dense|moe>.<layer index>.<tensor name>" at the end of a state-dict key.
    _KEY_PATTERN = re.compile(r"(dense|moe)\.(\d+)\.(up_a|up_b|down_a|down_b)$")
    _TENSOR_NAMES = ("up_a", "up_b", "down_a", "down_b")

    def __init__(
        self,
        text_config: TextConfig,
        *,
        rank: int,
        max_rank: int,
        dtype: torch.dtype,
        device: torch.device,
        adapter_id: Optional[str] = None,
    ) -> None:
        """Allocate zeroed LoRA tensors for every text layer.

        Args:
            text_config: Text model configuration (``dim``, ``ff_dim``,
                ``n_layers`` and optional ``moe`` are read).
            rank: Rank of the adapter these tensors will hold.
            max_rank: Rank every tensor is allocated at; must be >= ``rank``.
            dtype: Dtype of the allocated tensors.
            device: Device of the allocated tensors.
            adapter_id: Optional identifier stored on the instance.

        Raises:
            AdapterLoadError: if ``rank`` is not positive, ``max_rank`` is
                smaller than ``rank``, or (with MoE) either rank is smaller
                than ``experts_per_token``.
        """
        if rank <= 0:
            raise AdapterLoadError("LoRA rank must be positive.")
        if max_rank < rank:
            raise AdapterLoadError("max_rank must be >= rank.")

        self.text_config = text_config
        self.rank = rank
        self.max_rank = max_rank
        self.adapter_id = adapter_id

        moe_cfg = text_config.moe
        # Without MoE every layer is dense.
        self.start_layer = moe_cfg.start_layer if moe_cfg else text_config.n_layers

        if moe_cfg is not None:
            # The rank budget is divided evenly across the active experts.
            self.rank_per_expert = rank // moe_cfg.experts_per_token
            if self.rank_per_expert < 1:
                raise AdapterLoadError(
                    f"rank ({rank}) must be >= experts_per_token ({moe_cfg.experts_per_token})"
                )
            self.max_rank_per_expert = max_rank // moe_cfg.experts_per_token
            if self.max_rank_per_expert < 1:
                raise AdapterLoadError(
                    f"max_rank ({max_rank}) must be >= experts_per_token ({moe_cfg.experts_per_token})"
                )
        else:
            self.rank_per_expert = 0
            self.max_rank_per_expert = 0

        def zeros(*shape: int) -> torch.Tensor:
            return torch.zeros(shape, device=device, dtype=dtype)

        d_model = text_config.dim
        d_ffn = text_config.ff_dim

        self.dense: list[DenseLoRALayer] = [
            DenseLoRALayer(
                up_a=zeros(max_rank, d_model),
                up_b=zeros(d_ffn, max_rank),
                down_a=zeros(max_rank, d_ffn),
                down_b=zeros(d_model, max_rank),
            )
            for _ in range(self.start_layer)
        ]

        self.moe: list[MoELoRALayer] = []
        if moe_cfg is not None:
            num_experts = moe_cfg.num_experts
            d_expert = moe_cfg.expert_inner_dim
            r = self.max_rank_per_expert
            for _ in range(text_config.n_layers - self.start_layer):
                self.moe.append(
                    MoELoRALayer(
                        up_a=zeros(num_experts, r, d_model),
                        # fc1 output is later chunked in two, hence 2 * d_expert.
                        up_b=zeros(num_experts, d_expert * 2, r),
                        down_a=zeros(num_experts, r, d_expert),
                        down_b=zeros(num_experts, d_model, r),
                    )
                )

    def dense_layer(self, layer_idx: int) -> Optional[DenseLoRALayer]:
        """Return the dense LoRA for *layer_idx*, or ``None`` past the dense layers."""
        if layer_idx < len(self.dense):
            return self.dense[layer_idx]
        return None

    def moe_layer(self, layer_idx: int) -> Optional[MoELoRALayer]:
        """Return the MoE LoRA for absolute *layer_idx*, or ``None`` if it is not an MoE layer."""
        moe_idx = layer_idx - self.start_layer
        if 0 <= moe_idx < len(self.moe):
            return self.moe[moe_idx]
        return None

    @staticmethod
    def _pad_axis(tensor: torch.Tensor, target: int, axis: int) -> torch.Tensor:
        """Zero-pad *tensor* at the end of *axis* up to size *target*.

        Returns *tensor* itself when it already has the target size.

        Raises:
            AdapterLoadError: if the axis is already larger than *target*.
        """
        if tensor.shape[axis] == target:
            return tensor
        if tensor.shape[axis] > target:
            raise AdapterLoadError(
                f"LoRA tensor rank {tensor.shape[axis]} exceeds max {target}"
            )
        pad_shape = list(tensor.shape)
        pad_shape[axis] = target - tensor.shape[axis]
        pad = torch.zeros(pad_shape, device=tensor.device, dtype=tensor.dtype)
        return torch.cat([tensor, pad], dim=axis)

    @staticmethod
    def detect_rank(state_dict: Dict[str, Any], text_config: TextConfig) -> int:
        """Infer the adapter's total rank from the first ``up_a`` tensor found.

        Dense ``up_a`` tensors are preferred (rank is their first axis). If
        none exist, the first MoE ``up_a`` is used: its second axis is the
        per-expert rank, multiplied by ``experts_per_token`` when the config
        has MoE settings.

        Raises:
            AdapterLoadError: if no matching key exists.
        """
        for key, tensor in state_dict.items():
            if "dense" in key and "up_a" in key:
                return int(tensor.shape[0])
        for key, tensor in state_dict.items():
            if "moe" in key and "up_a" in key:
                rank_per_expert = int(tensor.shape[1])
                moe_cfg = text_config.moe
                if moe_cfg:
                    return rank_per_expert * moe_cfg.experts_per_token
                return rank_per_expert
        raise AdapterLoadError("Could not detect LoRA rank from state dict.")

    @classmethod
    def from_state_dict(
        cls,
        state_dict: Dict[str, Any],
        *,
        text_config: TextConfig,
        max_rank: int,
        dtype: torch.dtype,
        device: torch.device,
        adapter_id: Optional[str] = None,
    ) -> "TextLoRA":
        """Build a ``TextLoRA`` from a flat adapter state dict.

        Keys ending in ``dense.<i>.<name>`` or ``moe.<i>.<name>`` (``name`` in
        ``up_a``/``up_b``/``down_a``/``down_b``) are loaded; other keys are
        ignored. MoE indices are relative to the first MoE layer. Each tensor
        is moved to *device*/*dtype*, zero-padded along its rank axis, and
        checked against the shape the model config requires before being
        installed.

        Raises:
            AdapterLoadError: if the rank cannot be detected or exceeds
                *max_rank*, a layer index is out of range, a tensor has the
                wrong number of dimensions or wrong shape, or any expected
                tensor is missing.
        """
        rank = cls.detect_rank(state_dict, text_config)
        if rank > max_rank:
            raise AdapterLoadError(
                f"Adapter rank ({rank}) exceeds max_rank ({max_rank})."
            )

        lora = cls(
            text_config,
            rank=rank,
            max_rank=max_rank,
            dtype=dtype,
            device=device,
            adapter_id=adapter_id,
        )

        dense_seen = set()
        moe_seen = set()
        for key, tensor in state_dict.items():
            match = cls._KEY_PATTERN.search(key)
            if not match:
                continue
            kind, idx_str, name = match.groups()
            idx = int(idx_str)
            arr = tensor.to(device=device, dtype=dtype)

            if kind == "dense":
                if idx >= len(lora.dense):
                    raise AdapterLoadError(f"Dense LoRA layer index {idx} out of range.")
                layer, seen = lora.dense[idx], dense_seen
                target_rank, first_axis = lora.max_rank, 0
            else:
                if idx >= len(lora.moe):
                    raise AdapterLoadError(f"MoE LoRA layer index {idx} out of range.")
                layer, seen = lora.moe[idx], moe_seen
                # MoE tensors have a leading expert axis.
                target_rank, first_axis = lora.max_rank_per_expert, 1

            # "A" factors carry the rank on their first matrix axis, "B"
            # factors on their second.
            rank_axis = first_axis if name in ("up_a", "down_a") else first_axis + 1

            # The slot currently holds a tensor of the required shape, so use
            # it as the reference and fail here rather than with an opaque
            # matmul error at inference time.
            expected = getattr(layer, name)
            if arr.dim() != expected.dim():
                raise AdapterLoadError(
                    f"Adapter tensor '{key}' has {arr.dim()} dimensions, "
                    f"expected {expected.dim()}."
                )
            arr = cls._pad_axis(arr, target_rank, axis=rank_axis)
            if arr.shape != expected.shape:
                raise AdapterLoadError(
                    f"Adapter tensor '{key}' has shape {tuple(arr.shape)}, "
                    f"expected {tuple(expected.shape)}."
                )

            setattr(layer, name, arr)
            seen.add((idx, name))

        for layer_idx in range(len(lora.dense)):
            for name in cls._TENSOR_NAMES:
                if (layer_idx, name) not in dense_seen:
                    raise AdapterLoadError(
                        f"Adapter missing dense LoRA for layer {layer_idx} ({name})."
                    )
        for layer_idx in range(len(lora.moe)):
            for name in cls._TENSOR_NAMES:
                if (layer_idx, name) not in moe_seen:
                    raise AdapterLoadError(
                        f"Adapter missing MoE LoRA for layer {layer_idx} ({name})."
                    )

        return lora
def select_layer_lora(
    lora: Optional[TextLoRA], layer_idx: int, *, is_moe: bool
) -> Optional[object]:
    """Pick the LoRA weights for one text layer.

    Returns ``None`` when no adapter is active. Otherwise delegates to
    ``lora.moe_layer`` for MoE layers and ``lora.dense_layer`` for dense
    layers, either of which may itself return ``None``.
    """
    if lora is None:
        return None
    if is_moe:
        return lora.moe_layer(layer_idx)
    return lora.dense_layer(layer_idx)
def apply_dense_lora(
    x: torch.Tensor, lora_a: torch.Tensor, lora_b: torch.Tensor
) -> torch.Tensor:
    """Compute the low-rank update ``(x @ lora_a.T) @ lora_b.T``.

    Args:
        x: Input of shape ``(..., in_features)``; any number of leading
            dimensions is accepted, including empty ones.
        lora_a: Down-projection of shape ``(rank, in_features)``.
        lora_b: Up-projection of shape ``(out_features, rank)``.

    Returns:
        Tensor of shape ``(..., out_features)``.
    """
    lead_shape = x.shape[:-1]
    x_flat = x.reshape(-1, x.shape[-1])
    lora_mid = torch.matmul(x_flat, lora_a.t())
    lora_out = torch.matmul(lora_mid, lora_b.t())
    # Spell out the last dimension instead of using -1: with zero rows a -1
    # cannot be inferred and reshape would raise.
    return lora_out.reshape(*lead_shape, lora_b.shape[0])
def apply_moe_lora_fc1_flat(
    x_expanded: torch.Tensor, lora: MoELoRALayer, flat_idxs: torch.Tensor
) -> torch.Tensor:
    """Compute the per-row LoRA update for the MoE ``fc1`` projection.

    Row ``n`` of the result is ``up_b[e] @ (up_a[e] @ x_expanded[n])`` with
    ``e = flat_idxs[n]``.

    Args:
        x_expanded: One input vector per token-expert pair, ``(N, d_in)``.
        lora: Layer whose ``up_a`` / ``up_b`` are indexed by expert id on
            their first axis.
        flat_idxs: Expert id for each row of ``x_expanded``, length ``N``.

    Returns:
        Tensor with one output row per input row.
    """
    a_selected = lora.up_a[flat_idxs]
    b_selected = lora.up_b[flat_idxs]
    # Keep the trailing singleton axis between the two batched products.
    low_rank = torch.bmm(a_selected, x_expanded.unsqueeze(-1))
    return torch.bmm(b_selected, low_rank).squeeze(-1)
def apply_moe_lora_fc2_flat(
    h: torch.Tensor, lora: MoELoRALayer, flat_idxs: torch.Tensor
) -> torch.Tensor:
    """Compute the per-row LoRA update for the MoE ``fc2`` projection.

    Row ``n`` of the result is ``down_b[e] @ (down_a[e] @ h[n])`` with
    ``e = flat_idxs[n]``.

    Args:
        h: One hidden vector per token-expert pair, ``(N, d_hidden)``.
        lora: Layer whose ``down_a`` / ``down_b`` are indexed by expert id on
            their first axis.
        flat_idxs: Expert id for each row of ``h``, length ``N``.

    Returns:
        Tensor with one output row per input row.
    """
    a_selected = lora.down_a[flat_idxs]
    b_selected = lora.down_b[flat_idxs]
    # Keep the trailing singleton axis between the two batched products.
    low_rank = torch.bmm(a_selected, h.unsqueeze(-1))
    return torch.bmm(b_selected, low_rank).squeeze(-1)
+_ADAPTER_CACHE: Dict[Tuple[str, str, str, Tuple], TextLoRA] = {}
+_CACHE_ORDER: list[Tuple[str, str, str, Tuple]] = []
+_CACHE_SIZE = 8
+def _config_key(text_config: TextConfig) -> Tuple:
+    moe = text_config.moe
+    moe_key = None
+    if moe is not None:
+        moe_key = (
+            moe.num_experts,
+            moe.start_layer,
+            moe.experts_per_token,
+            moe.expert_inner_dim,
+        )
+    return (
+        text_config.dim,
+        text_config.ff_dim,
+        text_config.n_layers,
+        moe_key,
+    )
+def load_adapter(
+    adapter_id: Optional[str],
+    *,
+    text_config: TextConfig,
+    device: torch.device,
+    dtype: torch.dtype,
+    max_rank: int = 16,
+) -> Optional[TextLoRA]:
+    if adapter_id is None:
+        return None
+    adapter_id = normalize_adapter_id(adapter_id)
+    if adapter_id is None:
+        return None
+    key = (adapter_id, str(device), str(dtype), _config_key(text_config))
+    cached = _ADAPTER_CACHE.get(key)
+    if cached is not None:
+        return cached
+    path = cached_adapter_path(adapter_id)
+    checkpoint = _load_state_dict(path, device)
+    if not isinstance(checkpoint, dict):
+        raise AdapterLoadError("Invalid adapter checkpoint format.")
+    state_dict = checkpoint.get("lora_state_dict", checkpoint)
+    if not isinstance(state_dict, dict):
+        raise AdapterLoadError("Adapter checkpoint missing lora_state_dict.")
+    lora = TextLoRA.from_state_dict(
+        state_dict,
+        text_config=text_config,
+        max_rank=max_rank,
+        dtype=dtype,
+        device=device,
+        adapter_id=adapter_id,
     )
+    _ADAPTER_CACHE[key] = lora
+    _CACHE_ORDER.append(key)
+    if len(_CACHE_ORDER) > _CACHE_SIZE:
+        old = _CACHE_ORDER.pop(0)
+        _ADAPTER_CACHE.pop(old, None)
+    return lora

model.safetensors.index.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

modelv2-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79006ed488cca15b173cd5c0c7c1a467c20aaf5508e13934c36378d071d48c13
+size 4907406296

modelv2-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:40202c61286ec7386d9bbce31d87af3064e42931b10323ed4b3e44158c0521e3
+size 4736548872

modelv2-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff46835f23bac47c7409032391e02a095821e274f3faaeea3f826a960db9bf80
+size 4502742464

modelv2-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a4d39e1bcb0ab835b9a00c7f458dedca4faf8741fc0b23fd2caf2af4547bca6
+size 4390628760

moondream.py CHANGED Viewed

@@ -21,12 +21,12 @@ from .region import (
     SpatialRefs,
 )
 from .layers import QuantizedLinear
-from .lora import variant_state_dict
 from .utils import remove_outlier_points
 ImageEncodingSettings = TypedDict(
     "ImageEncodingSettings",
-    {"variant": str},
     total=False,
 )
@@ -36,14 +36,15 @@ TextSamplingSettings = TypedDict(
         "max_tokens": int,
         "temperature": float,
         "top_p": float,
-        "variant": str,
     },
     total=False,
 )
 ObjectSamplingSettings = TypedDict(
     "ObjectSamplingSettings",
-    {"max_objects": int, "variant": str},
     total=False,
 )
@@ -120,6 +121,7 @@ class MoondreamModel(nn.Module):
                 "size_decoder": linear_cls(
                     config.region.dim, config.region.size_out_dim, dtype=dtype
                 ),
             }
         )
         self.region.coord_features = nn.Parameter(
@@ -181,6 +183,29 @@ class MoondreamModel(nn.Module):
                 dtype=self.vision.pos_emb.dtype,
             )
     @property
     def device(self):
         return self.vision.pos_emb.device
@@ -303,11 +328,7 @@ class MoondreamModel(nn.Module):
         elif not isinstance(image, Image.Image):
             raise ValueError("image must be a PIL Image or EncodedImage")
-        lora = (
-            variant_state_dict(settings["variant"], device=self.device)
-            if settings is not None and "variant" in settings
-            else None
-        )
         # Run through text model in addition to the vision encoder, to minimize
         # re-computation if multiple queries are performed on this image.
@@ -408,11 +429,7 @@ class MoondreamModel(nn.Module):
             if settings
             else DEFAULT_TEMPERATURE
         )
-        lora = (
-            variant_state_dict(settings["variant"], device=self.device)
-            if settings is not None and "variant" in settings
-            else None
-        )
         top_p = settings.get("top_p", DEFAULT_TOP_P) if settings else DEFAULT_TOP_P
         eos_id = self.config.tokenizer.answer_id
@@ -524,11 +541,7 @@ class MoondreamModel(nn.Module):
         )
         top_p = settings.get("top_p", DEFAULT_TOP_P) if settings else DEFAULT_TOP_P
         eos_id = eos_id if eos_id is not None else self.config.tokenizer.eos_id
-        lora = (
-            variant_state_dict(settings["variant"], device=self.device)
-            if settings is not None and "variant" in settings
-            else None
-        )
         _, _, next_token, pos = self._prefill_prompt(
             prompt_tokens,
@@ -671,6 +684,7 @@ class MoondreamModel(nn.Module):
             reasoning_dict = {
                 "reasoning": {"text": reasoning_text, "grounding": reasoning_grounding}
             }
         else:
             prompt_tokens[0] += self.config.tokenizer.templates["query"]["suffix"]
             reasoning_dict = {}
@@ -834,11 +848,7 @@ class MoondreamModel(nn.Module):
             device=self.device,
         )
-        lora = (
-            variant_state_dict(settings["variant"], device=self.device)
-            if settings is not None and "variant" in settings
-            else None
-        )
         _, hidden, next_token, pos = self._prefill_prompt(
             prompt_tokens, image.pos, temperature=0, top_p=0, lora=lora
@@ -882,11 +892,7 @@ class MoondreamModel(nn.Module):
             device=self.device,
         )
-        lora = (
-            variant_state_dict(settings["variant"], device=self.device)
-            if settings is not None and "variant" in settings
-            else None
-        )
         _, hidden, next_token, pos = self._prefill_prompt(
             prompt_tokens, image.pos, temperature=0, top_p=0, lora=lora

     SpatialRefs,
 )
 from .layers import QuantizedLinear
+from .lora import load_adapter, normalize_adapter_id
 from .utils import remove_outlier_points
 ImageEncodingSettings = TypedDict(
     "ImageEncodingSettings",
+    {"adapter": str, "model": str},
     total=False,
 )
         "max_tokens": int,
         "temperature": float,
         "top_p": float,
+        "adapter": str,
+        "model": str,
     },
     total=False,
 )
 ObjectSamplingSettings = TypedDict(
     "ObjectSamplingSettings",
+    {"max_objects": int, "adapter": str, "model": str},
     total=False,
 )
                 "size_decoder": linear_cls(
                     config.region.dim, config.region.size_out_dim, dtype=dtype
                 ),
+                "ln": nn.LayerNorm(config.region.dim, dtype=dtype),
             }
         )
         self.region.coord_features = nn.Parameter(
                 dtype=self.vision.pos_emb.dtype,
             )
+    def _adapter_id_from_settings(self, settings: Optional[dict]) -> Optional[str]:
+        if settings is None:
+            return None
+        adapter = settings.get("adapter")
+        if adapter is not None:
+            return normalize_adapter_id(adapter)
+        model_value = settings.get("model")
+        if isinstance(model_value, str):
+            return normalize_adapter_id(model_value)
+        return None
+    def _resolve_lora(self, settings: Optional[dict]) -> Optional[object]:
+        adapter_id = self._adapter_id_from_settings(settings)
+        if adapter_id is None:
+            return None
+        return load_adapter(
+            adapter_id,
+            text_config=self.config.text,
+            device=self.device,
+            dtype=self.vision.pos_emb.dtype,
+        )
     @property
     def device(self):
         return self.vision.pos_emb.device
         elif not isinstance(image, Image.Image):
             raise ValueError("image must be a PIL Image or EncodedImage")
+        lora = self._resolve_lora(settings)
         # Run through text model in addition to the vision encoder, to minimize
         # re-computation if multiple queries are performed on this image.
             if settings
             else DEFAULT_TEMPERATURE
         )
+        lora = self._resolve_lora(settings)
         top_p = settings.get("top_p", DEFAULT_TOP_P) if settings else DEFAULT_TOP_P
         eos_id = self.config.tokenizer.answer_id
         )
         top_p = settings.get("top_p", DEFAULT_TOP_P) if settings else DEFAULT_TOP_P
         eos_id = eos_id if eos_id is not None else self.config.tokenizer.eos_id
+        lora = self._resolve_lora(settings)
         _, _, next_token, pos = self._prefill_prompt(
             prompt_tokens,
             reasoning_dict = {
                 "reasoning": {"text": reasoning_text, "grounding": reasoning_grounding}
             }
+            spatial_refs = None
         else:
             prompt_tokens[0] += self.config.tokenizer.templates["query"]["suffix"]
             reasoning_dict = {}
             device=self.device,
         )
+        lora = self._resolve_lora(settings)
         _, hidden, next_token, pos = self._prefill_prompt(
             prompt_tokens, image.pos, temperature=0, top_p=0, lora=lora
             device=self.device,
         )
+        lora = self._resolve_lora(settings)
         _, hidden, next_token, pos = self._prefill_prompt(
             prompt_tokens, image.pos, temperature=0, top_p=0, lora=lora

region.py CHANGED Viewed

@@ -52,6 +52,7 @@ def decode_coordinate(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
     Returns:
         A single logit representing the predicted coordinate value (x or y)
     """
     return w.coord_decoder(hidden_state)
@@ -88,6 +89,7 @@ def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
         A tensor containing logits for 1024 bins for width and height.
         Shape is (2, 1024) where the first dimension corresponds to width and height.
     """
     return w.size_decoder(hidden_state).view(2, -1)

     Returns:
         A single logit representing the predicted coordinate value (x or y)
     """
+    hidden_state = w.ln(hidden_state)
     return w.coord_decoder(hidden_state)
         A tensor containing logits for 1024 bins for width and height.
         Shape is (2, 1024) where the first dimension corresponds to width and height.
     """
+    hidden_state = w.ln(hidden_state)
     return w.size_decoder(hidden_state).view(2, -1)

text.py CHANGED Viewed

@@ -8,6 +8,7 @@ from typing import Optional
 from .layers import layer_norm, mlp, QuantizedLinear, moe_mlp
 from .rope import apply_rotary_emb, precompute_freqs_cis
 from .config import TextConfig
 def text_encoder(input_ids: torch.Tensor, w: nn.Module):
@@ -23,15 +24,12 @@ def attn(
     n_heads: int,
     n_kv_heads: int,
     position_ids: torch.Tensor,
-    lora: Optional[dict] = None,
     flex_block_mask_slice=None,
 ):
     bsz, q_len, d_model = x.shape
     head_dim = d_model // n_heads
     qkv_out = w.qkv(x)  # shape: (bsz, q_len, (n_heads + 2*n_kv_heads)*head_dim)
-    if lora is not None:
-        qkv_out += F.linear(F.linear(x, lora["qkv"]["A"]), lora["qkv"]["B"])
     q_dim = n_heads * head_dim
     kv_dim = n_kv_heads * head_dim
     q, k, v = qkv_out.split([q_dim, kv_dim, kv_dim], dim=-1)
@@ -69,14 +67,7 @@ def attn(
     out = out.transpose(1, 2).reshape(bsz, q_len, d_model)
-    out0 = w.proj(out)
-    if lora is not None:
-        out1 = F.linear(F.linear(x, lora["proj"]["A"]), lora["proj"]["B"])
-        out = out0 + out1
-    else:
-        out = out0
-    return out
 def text_decoder(
@@ -85,17 +76,13 @@ def text_decoder(
     attn_mask: torch.Tensor,
     position_ids: torch.Tensor,
     config: TextConfig,
-    lora: Optional[dict] = None,
     flex_block_mask_slice=None,
 ):
     for i, block in enumerate(w.blocks):
-        if lora is not None:
-            layer_lora = lora["text"]["blocks"][str(i)]
-            mlp_lora = layer_lora["mlp"]
-            attn_lora = layer_lora["attn"]
-        else:
-            mlp_lora = None
-            attn_lora = None
         l_in = layer_norm(x, block.ln)
         l_attn = attn(
@@ -107,14 +94,15 @@ def text_decoder(
             n_heads=config.n_heads,
             n_kv_heads=config.n_kv_heads,
             position_ids=position_ids,
-            lora=attn_lora,
             flex_block_mask_slice=flex_block_mask_slice,
         )
         if config.moe is not None and i >= config.moe.start_layer:
-            l_mlp = moe_mlp(l_in, block.mlp, config.moe.experts_per_token)
         else:
-            l_mlp = mlp(l_in, block.mlp, lora=mlp_lora)
         x = x + l_attn + l_mlp
@@ -145,7 +133,7 @@ def build_dense_mlp(d_model, d_ffn, dtype, linear_cls):
 def build_moe_mlp(d_model, d_ffn, n_experts, dtype):
     # For GeGLU, fc1 needs to output 2 * d_ffn (for gating)
-    return nn.ModuleDict(
         {
             "router": nn.Linear(d_model, n_experts, dtype=dtype),
             "fc1": nn.ParameterDict(
@@ -164,6 +152,7 @@ def build_moe_mlp(d_model, d_ffn, n_experts, dtype):
             ),
         }
     )
 def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:

 from .layers import layer_norm, mlp, QuantizedLinear, moe_mlp
 from .rope import apply_rotary_emb, precompute_freqs_cis
 from .config import TextConfig
+from .lora import select_layer_lora
 def text_encoder(input_ids: torch.Tensor, w: nn.Module):
     n_heads: int,
     n_kv_heads: int,
     position_ids: torch.Tensor,
     flex_block_mask_slice=None,
 ):
     bsz, q_len, d_model = x.shape
     head_dim = d_model // n_heads
     qkv_out = w.qkv(x)  # shape: (bsz, q_len, (n_heads + 2*n_kv_heads)*head_dim)
     q_dim = n_heads * head_dim
     kv_dim = n_kv_heads * head_dim
     q, k, v = qkv_out.split([q_dim, kv_dim, kv_dim], dim=-1)
     out = out.transpose(1, 2).reshape(bsz, q_len, d_model)
+    return w.proj(out)
 def text_decoder(
     attn_mask: torch.Tensor,
     position_ids: torch.Tensor,
     config: TextConfig,
+    lora: Optional[object] = None,
     flex_block_mask_slice=None,
 ):
     for i, block in enumerate(w.blocks):
+        layer_lora = select_layer_lora(
+            lora, i, is_moe=config.moe is not None and i >= config.moe.start_layer
+        )
         l_in = layer_norm(x, block.ln)
         l_attn = attn(
             n_heads=config.n_heads,
             n_kv_heads=config.n_kv_heads,
             position_ids=position_ids,
             flex_block_mask_slice=flex_block_mask_slice,
         )
         if config.moe is not None and i >= config.moe.start_layer:
+            l_mlp = moe_mlp(
+                l_in, block.mlp, config.moe.experts_per_token, lora=layer_lora
+            )
         else:
+            l_mlp = mlp(l_in, block.mlp, lora=layer_lora)
         x = x + l_attn + l_mlp
 def build_moe_mlp(d_model, d_ffn, n_experts, dtype):
     # For GeGLU, fc1 needs to output 2 * d_ffn (for gating)
+    mlp = nn.ModuleDict(
         {
             "router": nn.Linear(d_model, n_experts, dtype=dtype),
             "fc1": nn.ParameterDict(
             ),
         }
     )
+    return mlp
 def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module: