Commit 8d63dc0
Parent(s): 64b4595
update model

Files changed:
- app.py +70 -3
- basic_models.txt +10 -0
- best_models.txt +0 -10
- description.md +4 -3
- inference.py +671 -547
- tokenizer/__pycache__/__init__.cpython-310.pyc +0 -0
- tokenizer/__pycache__/my_tokenizers.cpython-310.pyc +0 -0
app.py CHANGED

@@ -20,7 +20,6 @@ from inference import (
     PeptiVersePredictor,
     read_best_manifest_csv,
     BestRow,
-    canon_model,
 )
 
 try:
@@ -75,6 +74,74 @@ ASSETS_DATA = ASSETS / "training_data_cleaned"; ASSETS_DATA.mkdir(parents=True
 MODEL_REPO = "ChatterjeeLab/PeptiVerse"   # model repo
 DATASET_REPO = "ChatterjeeLab/PeptiVerse" # dataset repo
 
+def canon_model(parsed) -> Optional[str]:
+    """Return the bare lowercase model name from a parsed (model, emb_tag) tuple or raw string."""
+    if parsed is None:
+        return None
+    if isinstance(parsed, tuple):
+        return parsed[0].lower() if parsed[0] else None
+    return str(parsed).lower()
+
+def get_required_patterns(manifest_path: Path) -> List[str]:
+    """Build allow_patterns from the manifest so we only download what we need."""
+    from inference import read_best_manifest_csv, EMB_TAG_TO_FOLDER_SUFFIX
+
+    manifest = read_best_manifest_csv(manifest_path)
+    patterns = set()
+
+    patterns.add("tokenizer/new_vocab.txt")
+    patterns.add("tokenizer/new_splits.txt")
+    patterns.add("training_data_cleaned/**/*.csv")
+
+    for prop_key, row in manifest.items():
+        disk_prop = "half_life" if prop_key == "halflife" else prop_key
+
+        for parsed in [row.best_wt, row.best_smiles]:
+            if parsed is None:
+                continue
+            model_name, emb_tag = parsed
+
+            if prop_key == "binding_affinity":
+                folder = model_name  # e.g. "wt_wt_pooled", "chemberta_smiles_pooled"
+                patterns.add(f"training_classifiers/binding_affinity/{folder}/best_model*")
+                continue
+
+            # infer emb_tag fallback
+            if emb_tag is None:
+                emb_tag = "wt" if parsed == row.best_wt else "smiles"
+
+            suffix = EMB_TAG_TO_FOLDER_SUFFIX.get(emb_tag, emb_tag)
+
+            # halflife special cases
+            if prop_key == "halflife" and emb_tag == "wt":
+                if model_name in {"transformer"}:
+                    for variant in ["transformer_wt_log", "transformer_wt"]:
+                        patterns.add(f"training_classifiers/{disk_prop}/{variant}/best_model*")
+                    continue
+                if model_name in {"xgb", "xgb_reg"}:
+                    patterns.add(f"training_classifiers/{disk_prop}/xgb_wt_log/best_model*")
+                    continue
+
+            patterns.add(f"training_classifiers/{disk_prop}/{model_name}_{suffix}/best_model*")
+            patterns.add(f"training_classifiers/{disk_prop}/{model_name}/best_model*")
+
+    return sorted(patterns)
+
+
+def fetch_models_and_data():
+    patterns = get_required_patterns(BEST_TXT)
+    print(f"Downloading {len(patterns)} targeted pattern(s):")
+    for p in patterns:
+        print(f" {p}")
+
+    snapshot_download(
+        repo_id=MODEL_REPO,
+        local_dir=str(ASSETS_MODELS),
+        local_dir_use_symlinks=False,
+        allow_patterns=patterns,
+    )
+
+"""
 def fetch_models_and_data():
     snapshot_download(
         repo_id=MODEL_REPO,
@@ -94,8 +161,8 @@ def fetch_models_and_data():
     )
 
 fetch_models_and_data()
-
-BEST_TXT = Path("
+"""
+BEST_TXT = Path("basic_models.txt")
 TRAINING_ROOT = ASSETS_MODELS / "training_classifiers"
 TOKENIZER_DIR = ASSETS_MODELS / "tokenizer"
 
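The new fetch step narrows the snapshot to only the artifacts named in the manifest. Below is a minimal standalone sketch of how huggingface_hub's snapshot_download behaves with an allow_patterns filter; the repo id matches the diff, but the pattern list and local directory are illustrative assumptions, not the app's exact values.

from pathlib import Path
from huggingface_hub import snapshot_download

# Hypothetical subset: tokenizer files plus one classifier's best_model artifacts.
patterns = [
    "tokenizer/new_vocab.txt",
    "tokenizer/new_splits.txt",
    "training_classifiers/hemolysis/xgb_wt/best_model*",  # assumed folder name
]

local_dir = Path("assets/models")  # illustrative location
snapshot_download(
    repo_id="ChatterjeeLab/PeptiVerse",
    local_dir=str(local_dir),
    allow_patterns=patterns,  # files not matching any glob are skipped
)

Only files matching at least one glob are downloaded, so a Space cold start pulls a few targeted weights instead of the full model repository.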
basic_models.txt ADDED

@@ -0,0 +1,10 @@
+Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
+Hemolysis, XGB, CNN (chemberta), Classifier, 0.2801, 0.564,
+Non-Fouling, Transformer, XGB (peptideclm), Classifier, 0.57, 0.3892,
+Solubility, CNN, Transformer (peptideclm), Classifier, 0.377, 0.329,
+Permeability (Penetrance), XGB, XGB (chemberta), Classifier, 0.4301, 0.5028,
+Toxicity, -, CNN (chemberta), Classifier, -, 0.49,
+Binding_affinity, wt_wt_pooled, chemberta_smiles_pooled, Regression, -, -,
+Permeability_PAMPA, -, CNN (chemberta), Regression, -, -,
+Permeability_CACO2, -, SVR (chemberta), Regression, -, -,
+Halflife, Transformer, XGB (peptideclm), Regression, -, -,
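Each manifest row names the best WT-input model and the best SMILES-input model, with the embedding backbone in parentheses and a decision threshold for classifiers. A small sketch of reading one row and applying its threshold is below; the regex mirrors the _parse_model_and_emb helper added in inference.py, and the example score is made up.

import csv, io, re

manifest_text = """Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
Hemolysis, XGB, CNN (chemberta), Classifier, 0.2801, 0.564,"""

reader = csv.DictReader(io.StringIO(manifest_text), skipinitialspace=True)
row = next(reader)

# Split "CNN (chemberta)" into the model name and its embedding tag.
m = re.match(r"^(.+?)\s*\((.+?)\)\s*$", row["Best_Model_SMILES"].strip())
model_name, emb_tag = (m.group(1), m.group(2)) if m else (row["Best_Model_SMILES"].strip(), None)

threshold = float(row["Threshold_SMILES"])
score = 0.61  # made-up classifier probability, for illustration only
label = int(score >= threshold)
print(model_name, emb_tag, threshold, label)  # -> CNN chemberta 0.564 1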
best_models.txt DELETED

@@ -1,10 +0,0 @@
-Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
-Hemolysis, XGB, Transformer, Classifier, 0.2801, 0.4343,
-Non-Fouling, MLP, XGB, Classifier, 0.57, 0.3982,
-Solubility, CNN, -, Classifier, 0.377, -,
-Permeability (Penetrance), XGB, -, Classifier, 0.4301, -,
-Toxicity, -, Transformer, Classifier, -, 0.3401,
-Binding_affinity, unpooled, unpooled, Regression, -, -,
-Permeability_PAMPA, -, CNN, Regression, -, -,
-Permeability_CACO2, -, SVR, Regression, -, -,
-Halflife, xgb_wt_log, xgb_smiles, Regression, -, -,
description.md CHANGED

@@ -16,8 +16,8 @@
 |---|---:|---:|---:|---:|
 | Hemolysis | 4765 | 1311 | 4765 | 1311 |
 | Non-Fouling | 13580 | 3600 | 13580 | 3600 |
-| Solubility | 9668 | 8785 |
-| Permeability (Penetrance) | 1162 | 1162 |
+| Solubility | 9668 | 8785 | 9668 | 8785 |
+| Permeability (Penetrance) | 1162 | 1162 | 1162 | 1162 |
 | Toxicity | – | – | 5518 | 5518 |
 
 #### Regression (total N)
@@ -27,7 +27,7 @@
 | Permeability (PAMPA) | – | 6869 |
 | Permeability (CACO2) | – | 606 |
 | Half-Life | 130 | 245 |
-| Binding Affinity |
+| Binding Affinity | 1433 | 1702 |
 
 
 Our models are trained on curated datasets from multiple sources. For detailed cleaning up procedures please refer to our [paper](https://www.biorxiv.org/content/10.64898/2025.12.31.697180v1).
@@ -90,6 +90,7 @@ Higher scores indicate stronger non-fouling behavior, desirable for circulation
 
 ### Model Training and Weight Hosting
 - More instructions can be found here at [PeptiVersse](https://huggingface.co/ChatterjeeLab/PeptiVerse)
+- Model uncertainty prediction is not supported in the app version, but the code is available at [PeptiVerse](https://huggingface.co/ChatterjeeLab/PeptiVerse) for local deployment.
 
 ### 🧪 Physicochemical Properties
 
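The uncertainty note above refers to the seed-ensemble and conformal helpers added in inference.py below. A minimal sketch of the ensemble idea (average the probabilities from models trained with different seeds, then score confidence as binary entropy of the mean) is shown here with made-up probabilities; it is an illustration of the technique, not the app's exact code path.

import numpy as np

def binary_entropy(p: float) -> float:
    # Entropy of a Bernoulli(p); 0 when the ensemble is certain, ln(2) at p=0.5.
    p = float(np.clip(p, 1e-9, 1 - 1e-9))
    return float(-p * np.log(p) - (1 - p) * np.log(1 - p))

# Probabilities from the same architecture trained with different seeds (made-up values).
seed_probs = [0.71, 0.64, 0.69, 0.75]
mean_p = float(np.mean(seed_probs))   # ensemble score reported to the user
uncertainty = binary_entropy(mean_p)  # higher = less confident
print(round(mean_p, 3), round(uncertainty, 3))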
inference.py
CHANGED
|
@@ -1,31 +1,46 @@
|
|
| 1 |
-
# peptiverse_infer.py
|
| 2 |
from __future__ import annotations
|
| 3 |
-
|
| 4 |
import csv, re, json
|
| 5 |
from dataclasses import dataclass
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Dict, Optional, Tuple, Any, List
|
| 8 |
-
|
| 9 |
import numpy as np
|
| 10 |
import torch
|
| 11 |
import torch.nn as nn
|
| 12 |
import joblib
|
| 13 |
import xgboost as xgb
|
| 14 |
-
|
| 15 |
from transformers import EsmModel, EsmTokenizer, AutoModelForMaskedLM
|
| 16 |
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 17 |
-
|
|
|
|
| 18 |
|
| 19 |
# -----------------------------
|
| 20 |
# Manifest
|
| 21 |
# -----------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
@dataclass(frozen=True)
|
| 23 |
class BestRow:
|
| 24 |
property_key: str
|
| 25 |
-
best_wt: Optional[str]
|
| 26 |
-
best_smiles: Optional[str]
|
| 27 |
-
task_type: str
|
| 28 |
-
thr_wt:
|
| 29 |
thr_smiles: Optional[float]
|
| 30 |
|
| 31 |
|
|
@@ -34,21 +49,16 @@ def _clean(s: str) -> str:
|
|
| 34 |
|
| 35 |
def _none_if_dash(s: str) -> Optional[str]:
|
| 36 |
s = _clean(s)
|
| 37 |
-
if s in {"", "-", "
|
| 38 |
-
return None
|
| 39 |
-
return s
|
| 40 |
|
| 41 |
def _float_or_none(s: str) -> Optional[float]:
|
| 42 |
s = _clean(s)
|
| 43 |
-
if s in {"", "-", "
|
| 44 |
-
return None
|
| 45 |
-
return float(s)
|
| 46 |
|
| 47 |
def normalize_property_key(name: str) -> str:
|
| 48 |
n = name.strip().lower()
|
| 49 |
n = re.sub(r"\s*\(.*?\)\s*", "", n)
|
| 50 |
n = n.replace("-", "_").replace(" ", "_")
|
| 51 |
-
|
| 52 |
if "permeability" in n and "pampa" not in n and "caco" not in n:
|
| 53 |
return "permeability_penetrance"
|
| 54 |
if n == "binding_affinity":
|
|
@@ -60,11 +70,40 @@ def normalize_property_key(name: str) -> str:
|
|
| 60 |
return n
|
| 61 |
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
|
| 64 |
-
"""
|
| 65 |
-
Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
|
| 66 |
-
Hemolysis, SVM, SGB, Classifier, 0.2801, 0.2223,
|
| 67 |
-
"""
|
| 68 |
p = Path(path)
|
| 69 |
out: Dict[str, BestRow] = {}
|
| 70 |
|
|
@@ -90,10 +129,13 @@ def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
|
|
| 90 |
continue
|
| 91 |
prop_key = normalize_property_key(prop_raw)
|
| 92 |
|
|
|
|
|
|
|
|
|
|
| 93 |
row = BestRow(
|
| 94 |
property_key=prop_key,
|
| 95 |
-
best_wt=
|
| 96 |
-
best_smiles=
|
| 97 |
task_type=_clean(rec.get("Type", "Classifier")),
|
| 98 |
thr_wt=_float_or_none(rec.get("Threshold_WT", "")),
|
| 99 |
thr_smiles=_float_or_none(rec.get("Threshold_SMILES", "")),
|
|
@@ -103,53 +145,32 @@ def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
|
|
| 103 |
return out
|
| 104 |
|
| 105 |
|
| 106 |
-
MODEL_ALIAS = {
|
| 107 |
-
"SVM": "svm_gpu",
|
| 108 |
-
"SVR": "svr",
|
| 109 |
-
"ENET": "enet_gpu",
|
| 110 |
-
"CNN": "cnn",
|
| 111 |
-
"MLP": "mlp",
|
| 112 |
-
"TRANSFORMER": "transformer",
|
| 113 |
-
"XGB": "xgb",
|
| 114 |
-
"XGB_REG": "xgb_reg",
|
| 115 |
-
"POOLED": "pooled",
|
| 116 |
-
"UNPOOLED": "unpooled",
|
| 117 |
-
"TRANSFORMER_WT_LOG": "transformer_wt_log",
|
| 118 |
-
}
|
| 119 |
-
def canon_model(label: Optional[str]) -> Optional[str]:
|
| 120 |
-
if label is None:
|
| 121 |
-
return None
|
| 122 |
-
k = label.strip().upper()
|
| 123 |
-
return MODEL_ALIAS.get(k, label.strip().lower())
|
| 124 |
-
|
| 125 |
-
|
| 126 |
# -----------------------------
|
| 127 |
# Generic artifact loading
|
| 128 |
# -----------------------------
|
| 129 |
def find_best_artifact(model_dir: Path) -> Path:
|
| 130 |
-
for pat in ["best_model.json", "best_model.pt", "best_model*.joblib"
|
|
|
|
| 131 |
hits = sorted(model_dir.glob(pat))
|
| 132 |
if hits:
|
| 133 |
return hits[0]
|
|
|
|
|
|
|
|
|
|
| 134 |
raise FileNotFoundError(f"No best_model artifact found in {model_dir}")
|
| 135 |
|
| 136 |
def load_artifact(model_dir: Path, device: torch.device) -> Tuple[str, Any, Path]:
|
| 137 |
art = find_best_artifact(model_dir)
|
| 138 |
-
|
| 139 |
if art.suffix == ".json":
|
| 140 |
booster = xgb.Booster()
|
| 141 |
-
print(str(art))
|
| 142 |
booster.load_model(str(art))
|
| 143 |
return "xgb", booster, art
|
| 144 |
-
|
| 145 |
if art.suffix == ".joblib":
|
| 146 |
obj = joblib.load(art)
|
| 147 |
return "joblib", obj, art
|
| 148 |
-
|
| 149 |
if art.suffix == ".pt":
|
| 150 |
ckpt = torch.load(art, map_location=device, weights_only=False)
|
| 151 |
return "torch_ckpt", ckpt, art
|
| 152 |
-
|
| 153 |
raise ValueError(f"Unknown artifact type: {art}")
|
| 154 |
|
| 155 |
|
|
@@ -157,7 +178,7 @@ def load_artifact(model_dir: Path, device: torch.device) -> Tuple[str, Any, Path
|
|
| 157 |
# NN architectures
|
| 158 |
# -----------------------------
|
| 159 |
class MaskedMeanPool(nn.Module):
|
| 160 |
-
def forward(self, X, M):
|
| 161 |
Mf = M.unsqueeze(-1).float()
|
| 162 |
denom = Mf.sum(dim=1).clamp(min=1.0)
|
| 163 |
return (X * Mf).sum(dim=1) / denom
|
|
@@ -167,34 +188,25 @@ class MLPHead(nn.Module):
|
|
| 167 |
super().__init__()
|
| 168 |
self.pool = MaskedMeanPool()
|
| 169 |
self.net = nn.Sequential(
|
| 170 |
-
nn.Linear(in_dim, hidden),
|
| 171 |
-
nn.GELU(),
|
| 172 |
-
nn.Dropout(dropout),
|
| 173 |
nn.Linear(hidden, 1),
|
| 174 |
)
|
| 175 |
def forward(self, X, M):
|
| 176 |
-
|
| 177 |
-
return self.net(z).squeeze(-1)
|
| 178 |
|
| 179 |
class CNNHead(nn.Module):
|
| 180 |
def __init__(self, in_ch, c=256, k=5, layers=2, dropout=0.1):
|
| 181 |
super().__init__()
|
| 182 |
-
blocks = []
|
| 183 |
-
ch = in_ch
|
| 184 |
for _ in range(layers):
|
| 185 |
-
blocks += [nn.Conv1d(ch, c, kernel_size=k, padding=k//2),
|
| 186 |
-
nn.GELU(),
|
| 187 |
-
nn.Dropout(dropout)]
|
| 188 |
ch = c
|
| 189 |
self.conv = nn.Sequential(*blocks)
|
| 190 |
self.head = nn.Linear(c, 1)
|
| 191 |
-
|
| 192 |
def forward(self, X, M):
|
| 193 |
-
|
| 194 |
-
Y = self.conv(Xc).transpose(1, 2) # (B,L,C)
|
| 195 |
Mf = M.unsqueeze(-1).float()
|
| 196 |
-
|
| 197 |
-
pooled = (Y * Mf).sum(dim=1) / denom
|
| 198 |
return self.head(pooled).squeeze(-1)
|
| 199 |
|
| 200 |
class TransformerHead(nn.Module):
|
|
@@ -207,28 +219,44 @@ class TransformerHead(nn.Module):
|
|
| 207 |
)
|
| 208 |
self.enc = nn.TransformerEncoder(enc_layer, num_layers=layers)
|
| 209 |
self.head = nn.Linear(d_model, 1)
|
| 210 |
-
|
| 211 |
def forward(self, X, M):
|
| 212 |
-
|
| 213 |
-
Z = self.proj(X)
|
| 214 |
-
Z = self.enc(Z, src_key_padding_mask=pad_mask)
|
| 215 |
Mf = M.unsqueeze(-1).float()
|
| 216 |
-
|
| 217 |
-
pooled = (Z * Mf).sum(dim=1) / denom
|
| 218 |
return self.head(pooled).squeeze(-1)
|
| 219 |
|
| 220 |
def _infer_in_dim_from_sd(sd: dict, model_name: str) -> int:
|
| 221 |
-
if model_name == "mlp":
|
| 222 |
-
return int(sd["
|
| 223 |
-
if model_name == "
|
| 224 |
-
return int(sd["conv.0.weight"].shape[1])
|
| 225 |
-
if model_name == "transformer":
|
| 226 |
-
return int(sd["proj.weight"].shape[1])
|
| 227 |
raise ValueError(model_name)
|
| 228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
def build_torch_model_from_ckpt(model_name: str, ckpt: dict, device: torch.device) -> nn.Module:
|
| 230 |
params = ckpt["best_params"]
|
| 231 |
-
sd
|
| 232 |
in_dim = int(ckpt.get("in_dim", _infer_in_dim_from_sd(sd, model_name)))
|
| 233 |
dropout = float(params.get("dropout", 0.1))
|
| 234 |
|
|
@@ -238,39 +266,132 @@ def build_torch_model_from_ckpt(model_name: str, ckpt: dict, device: torch.devic
|
|
| 238 |
model = CNNHead(in_ch=in_dim, c=int(params["channels"]), k=int(params["kernel"]),
|
| 239 |
layers=int(params["layers"]), dropout=dropout)
|
| 240 |
elif model_name == "transformer":
|
| 241 |
-
d_model = (
|
| 242 |
-
params.get("d_model")
|
| 243 |
-
or params.get("hidden")
|
| 244 |
-
or params.get("hidden_dim")
|
| 245 |
-
)
|
| 246 |
if d_model is None:
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
)
|
| 251 |
-
|
| 252 |
-
model = TransformerHead(
|
| 253 |
-
in_dim=in_dim,
|
| 254 |
-
d_model=int(d_model),
|
| 255 |
-
nhead=int(params["nhead"]),
|
| 256 |
-
layers=int(params["layers"]),
|
| 257 |
-
ff=int(params.get("ff", 4 * int(d_model))),
|
| 258 |
-
dropout=dropout
|
| 259 |
-
)
|
| 260 |
else:
|
| 261 |
raise ValueError(f"Unknown NN model_name={model_name}")
|
| 262 |
|
| 263 |
model.load_state_dict(sd)
|
| 264 |
-
model.to(device)
|
| 265 |
-
model.eval()
|
| 266 |
return model
|
| 267 |
|
| 268 |
|
| 269 |
# -----------------------------
|
| 270 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
# -----------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
def affinity_to_class(y: float) -> int:
|
| 273 |
-
# 0=High(>=9), 1=Moderate(7-9), 2=Low(<7)
|
| 274 |
if y >= 9.0: return 0
|
| 275 |
if y < 7.0: return 2
|
| 276 |
return 1
|
|
@@ -280,38 +401,31 @@ class CrossAttnPooled(nn.Module):
|
|
| 280 |
super().__init__()
|
| 281 |
self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
|
| 282 |
self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
|
| 283 |
-
|
| 284 |
self.layers = nn.ModuleList([])
|
| 285 |
for _ in range(n_layers):
|
| 286 |
self.layers.append(nn.ModuleDict({
|
| 287 |
"attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
|
| 288 |
"attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
|
| 289 |
-
"n1t": nn.LayerNorm(hidden),
|
| 290 |
-
"
|
| 291 |
-
"n1b": nn.LayerNorm(hidden),
|
| 292 |
-
"n2b": nn.LayerNorm(hidden),
|
| 293 |
"fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
|
| 294 |
"ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
|
| 295 |
}))
|
| 296 |
-
|
| 297 |
self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
|
| 298 |
self.reg = nn.Linear(hidden, 1)
|
| 299 |
self.cls = nn.Linear(hidden, 3)
|
| 300 |
|
| 301 |
def forward(self, t_vec, b_vec):
|
| 302 |
-
t = self.t_proj(t_vec).unsqueeze(0)
|
| 303 |
-
b = self.b_proj(b_vec).unsqueeze(0)
|
| 304 |
for L in self.layers:
|
| 305 |
t_attn, _ = L["attn_tb"](t, b, b)
|
| 306 |
t = L["n1t"]((t + t_attn).transpose(0,1)).transpose(0,1)
|
| 307 |
t = L["n2t"]((t + L["fft"](t)).transpose(0,1)).transpose(0,1)
|
| 308 |
-
|
| 309 |
b_attn, _ = L["attn_bt"](b, t, t)
|
| 310 |
b = L["n1b"]((b + b_attn).transpose(0,1)).transpose(0,1)
|
| 311 |
b = L["n2b"]((b + L["ffb"](b)).transpose(0,1)).transpose(0,1)
|
| 312 |
-
|
| 313 |
-
z = torch.cat([t[0], b[0]], dim=-1)
|
| 314 |
-
h = self.shared(z)
|
| 315 |
return self.reg(h).squeeze(-1), self.cls(h)
|
| 316 |
|
| 317 |
class CrossAttnUnpooled(nn.Module):
|
|
@@ -319,344 +433,247 @@ class CrossAttnUnpooled(nn.Module):
|
|
| 319 |
super().__init__()
|
| 320 |
self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
|
| 321 |
self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
|
| 322 |
-
|
| 323 |
self.layers = nn.ModuleList([])
|
| 324 |
for _ in range(n_layers):
|
| 325 |
self.layers.append(nn.ModuleDict({
|
| 326 |
"attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
|
| 327 |
"attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
|
| 328 |
-
"n1t": nn.LayerNorm(hidden),
|
| 329 |
-
"
|
| 330 |
-
"n1b": nn.LayerNorm(hidden),
|
| 331 |
-
"n2b": nn.LayerNorm(hidden),
|
| 332 |
"fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
|
| 333 |
"ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
|
| 334 |
}))
|
| 335 |
-
|
| 336 |
self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
|
| 337 |
self.reg = nn.Linear(hidden, 1)
|
| 338 |
self.cls = nn.Linear(hidden, 3)
|
| 339 |
|
| 340 |
def _masked_mean(self, X, M):
|
| 341 |
Mf = M.unsqueeze(-1).float()
|
| 342 |
-
|
| 343 |
-
return (X * Mf).sum(dim=1) / denom
|
| 344 |
|
| 345 |
def forward(self, T, Mt, B, Mb):
|
| 346 |
-
T = self.t_proj(T)
|
| 347 |
-
|
| 348 |
-
kp_t = ~Mt
|
| 349 |
-
kp_b = ~Mb
|
| 350 |
-
|
| 351 |
for L in self.layers:
|
| 352 |
T_attn, _ = L["attn_tb"](T, Bx, Bx, key_padding_mask=kp_b)
|
| 353 |
-
T = L["n1t"](T + T_attn)
|
| 354 |
-
T = L["n2t"](T + L["fft"](T))
|
| 355 |
-
|
| 356 |
B_attn, _ = L["attn_bt"](Bx, T, T, key_padding_mask=kp_t)
|
| 357 |
-
Bx = L["n1b"](Bx + B_attn)
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
t_pool = self._masked_mean(T, Mt)
|
| 361 |
-
b_pool = self._masked_mean(Bx, Mb)
|
| 362 |
-
z = torch.cat([t_pool, b_pool], dim=-1)
|
| 363 |
-
h = self.shared(z)
|
| 364 |
return self.reg(h).squeeze(-1), self.cls(h)
|
| 365 |
|
| 366 |
def load_binding_model(best_model_pt: Path, pooled_or_unpooled: str, device: torch.device) -> nn.Module:
|
| 367 |
ckpt = torch.load(best_model_pt, map_location=device, weights_only=False)
|
| 368 |
params = ckpt["best_params"]
|
| 369 |
-
sd
|
| 370 |
-
|
| 371 |
-
# infer Ht/Hb from projection weights
|
| 372 |
Ht = int(sd["t_proj.0.weight"].shape[1])
|
| 373 |
Hb = int(sd["b_proj.0.weight"].shape[1])
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
n_layers=int(params["n_layers"]),
|
| 380 |
-
dropout=float(params["dropout"]),
|
| 381 |
-
)
|
| 382 |
-
|
| 383 |
-
if pooled_or_unpooled == "pooled":
|
| 384 |
-
model = CrossAttnPooled(**common)
|
| 385 |
-
elif pooled_or_unpooled == "unpooled":
|
| 386 |
-
model = CrossAttnUnpooled(**common)
|
| 387 |
-
else:
|
| 388 |
-
raise ValueError(pooled_or_unpooled)
|
| 389 |
-
|
| 390 |
model.load_state_dict(sd)
|
| 391 |
-
model.to(device).eval()
|
| 392 |
-
return model
|
| 393 |
|
| 394 |
|
| 395 |
# -----------------------------
|
| 396 |
# Embedding generation
|
| 397 |
# -----------------------------
|
| 398 |
def _safe_isin(ids: torch.Tensor, test_ids: torch.Tensor) -> torch.Tensor:
|
| 399 |
-
"""
|
| 400 |
-
Pytorch patch
|
| 401 |
-
"""
|
| 402 |
if hasattr(torch, "isin"):
|
| 403 |
return torch.isin(ids, test_ids)
|
| 404 |
-
# Fallback: compare against each special id
|
| 405 |
-
# (B,L,1) == (1,1,K) -> (B,L,K)
|
| 406 |
return (ids.unsqueeze(-1) == test_ids.view(1, 1, -1)).any(dim=-1)
|
| 407 |
-
|
| 408 |
class SMILESEmbedder:
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
- pooled(): mean over tokens where attention_mask==1 AND token_id not in SPECIAL_IDS
|
| 412 |
-
- unpooled(): returns token embeddings filtered to valid tokens (specials removed),
|
| 413 |
-
plus a 1-mask of length Li (since already filtered).
|
| 414 |
-
"""
|
| 415 |
-
def __init__(
|
| 416 |
-
self,
|
| 417 |
-
device: torch.device,
|
| 418 |
-
vocab_path: str,
|
| 419 |
-
splits_path: str,
|
| 420 |
-
clm_name: str = "aaronfeller/PeptideCLM-23M-all",
|
| 421 |
-
max_len: int = 512,
|
| 422 |
-
use_cache: bool = True,
|
| 423 |
-
):
|
| 424 |
self.device = device
|
| 425 |
self.max_len = max_len
|
| 426 |
self.use_cache = use_cache
|
| 427 |
-
|
| 428 |
self.tokenizer = SMILES_SPE_Tokenizer(vocab_path, splits_path)
|
| 429 |
self.model = AutoModelForMaskedLM.from_pretrained(clm_name).roformer.to(device).eval()
|
| 430 |
-
|
| 431 |
self.special_ids = self._get_special_ids(self.tokenizer)
|
| 432 |
self.special_ids_t = (torch.tensor(self.special_ids, device=device, dtype=torch.long)
|
| 433 |
-
if
|
| 434 |
-
|
| 435 |
self._cache_pooled: Dict[str, torch.Tensor] = {}
|
| 436 |
self._cache_unpooled: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
|
| 437 |
|
| 438 |
@staticmethod
|
| 439 |
def _get_special_ids(tokenizer) -> List[int]:
|
| 440 |
-
cand = [
|
| 441 |
-
|
| 442 |
-
getattr(tokenizer, "cls_token_id", None),
|
| 443 |
-
getattr(tokenizer, "sep_token_id", None),
|
| 444 |
-
getattr(tokenizer, "bos_token_id", None),
|
| 445 |
-
getattr(tokenizer, "eos_token_id", None),
|
| 446 |
-
getattr(tokenizer, "mask_token_id", None),
|
| 447 |
-
]
|
| 448 |
return sorted({int(x) for x in cand if x is not None})
|
| 449 |
|
| 450 |
-
def _tokenize(self, smiles_list
|
| 451 |
-
tok = self.tokenizer(
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
padding=True,
|
| 455 |
-
truncation=True,
|
| 456 |
-
max_length=self.max_len,
|
| 457 |
-
)
|
| 458 |
-
for k in tok:
|
| 459 |
-
tok[k] = tok[k].to(self.device)
|
| 460 |
if "attention_mask" not in tok:
|
| 461 |
tok["attention_mask"] = torch.ones_like(tok["input_ids"], dtype=torch.long, device=self.device)
|
| 462 |
return tok
|
| 463 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
@torch.no_grad()
|
| 465 |
def pooled(self, smiles: str) -> torch.Tensor:
|
| 466 |
s = smiles.strip()
|
| 467 |
-
if self.use_cache and s in self._cache_pooled:
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
tok = self._tokenize([s])
|
| 471 |
-
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
|
| 474 |
-
out = self.model(input_ids=ids, attention_mask=tok["attention_mask"])
|
| 475 |
-
h = out.last_hidden_state # (1,L,H)
|
| 476 |
|
| 477 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 478 |
if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
|
| 479 |
valid = valid & (~_safe_isin(ids, self.special_ids_t))
|
|
|
|
| 480 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
vf = valid.unsqueeze(-1).float()
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
pooled = summed / denom # (1,H)
|
| 485 |
-
|
| 486 |
-
if self.use_cache:
|
| 487 |
-
self._cache_pooled[s] = pooled
|
| 488 |
return pooled
|
| 489 |
|
| 490 |
@torch.no_grad()
|
| 491 |
def unpooled(self, smiles: str) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 492 |
-
"""
|
| 493 |
-
Returns:
|
| 494 |
-
X: (1, Li, H) float32 on device
|
| 495 |
-
M: (1, Li) bool on device
|
| 496 |
-
where Li excludes padding + special tokens.
|
| 497 |
-
"""
|
| 498 |
s = smiles.strip()
|
| 499 |
-
if self.use_cache and s in self._cache_unpooled:
|
| 500 |
-
return self._cache_unpooled[s]
|
| 501 |
-
|
| 502 |
tok = self._tokenize([s])
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
out = self.model(input_ids=ids, attention_mask=tok["attention_mask"])
|
| 507 |
-
h = out.last_hidden_state # (1,L,H)
|
| 508 |
-
|
| 509 |
-
valid = attn
|
| 510 |
-
if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
|
| 511 |
-
valid = valid & (~_safe_isin(ids, self.special_ids_t))
|
| 512 |
-
|
| 513 |
-
# filter valid tokens
|
| 514 |
-
keep = valid[0] # (L,)
|
| 515 |
-
X = h[:, keep, :] # (1,Li,H)
|
| 516 |
M = torch.ones((1, X.shape[1]), dtype=torch.bool, device=self.device)
|
| 517 |
-
|
| 518 |
-
if self.use_cache:
|
| 519 |
-
self._cache_unpooled[s] = (X, M)
|
| 520 |
return X, M
|
| 521 |
|
| 522 |
|
| 523 |
class WTEmbedder:
|
| 524 |
-
""
|
| 525 |
-
ESM2 embeddings for AA sequences.
|
| 526 |
-
- pooled(): mean over tokens where attention_mask==1 AND token_id not in {CLS, EOS, PAD,...}
|
| 527 |
-
- unpooled(): returns token embeddings filtered to valid tokens (specials removed),
|
| 528 |
-
plus a 1-mask of length Li (since already filtered).
|
| 529 |
-
"""
|
| 530 |
-
def __init__(
|
| 531 |
-
self,
|
| 532 |
-
device: torch.device,
|
| 533 |
-
esm_name: str = "facebook/esm2_t33_650M_UR50D",
|
| 534 |
-
max_len: int = 1022,
|
| 535 |
-
use_cache: bool = True,
|
| 536 |
-
):
|
| 537 |
self.device = device
|
| 538 |
self.max_len = max_len
|
| 539 |
self.use_cache = use_cache
|
| 540 |
-
|
| 541 |
self.tokenizer = EsmTokenizer.from_pretrained(esm_name)
|
| 542 |
self.model = EsmModel.from_pretrained(esm_name, add_pooling_layer=False).to(device).eval()
|
| 543 |
-
|
| 544 |
self.special_ids = self._get_special_ids(self.tokenizer)
|
| 545 |
self.special_ids_t = (torch.tensor(self.special_ids, device=device, dtype=torch.long)
|
| 546 |
-
if
|
| 547 |
-
|
| 548 |
self._cache_pooled: Dict[str, torch.Tensor] = {}
|
| 549 |
self._cache_unpooled: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
|
| 550 |
|
| 551 |
@staticmethod
|
| 552 |
def _get_special_ids(tokenizer) -> List[int]:
|
| 553 |
-
cand = [
|
| 554 |
-
|
| 555 |
-
getattr(tokenizer, "cls_token_id", None),
|
| 556 |
-
getattr(tokenizer, "sep_token_id", None),
|
| 557 |
-
getattr(tokenizer, "bos_token_id", None),
|
| 558 |
-
getattr(tokenizer, "eos_token_id", None),
|
| 559 |
-
getattr(tokenizer, "mask_token_id", None),
|
| 560 |
-
]
|
| 561 |
return sorted({int(x) for x in cand if x is not None})
|
| 562 |
|
| 563 |
-
def _tokenize(self, seq_list
|
| 564 |
-
tok = self.tokenizer(
|
| 565 |
-
|
| 566 |
-
return_tensors="pt",
|
| 567 |
-
padding=True,
|
| 568 |
-
truncation=True,
|
| 569 |
-
max_length=self.max_len,
|
| 570 |
-
)
|
| 571 |
tok = {k: v.to(self.device) for k, v in tok.items()}
|
| 572 |
if "attention_mask" not in tok:
|
| 573 |
tok["attention_mask"] = torch.ones_like(tok["input_ids"], dtype=torch.long, device=self.device)
|
| 574 |
return tok
|
| 575 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
@torch.no_grad()
|
| 577 |
def pooled(self, seq: str) -> torch.Tensor:
|
| 578 |
s = seq.strip()
|
| 579 |
-
if self.use_cache and s in self._cache_pooled:
|
| 580 |
-
return self._cache_pooled[s]
|
| 581 |
-
|
| 582 |
tok = self._tokenize([s])
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
out = self.model(**tok)
|
| 587 |
-
h = out.last_hidden_state # (1,L,H)
|
| 588 |
-
|
| 589 |
-
valid = attn
|
| 590 |
-
if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
|
| 591 |
-
valid = valid & (~_safe_isin(ids, self.special_ids_t))
|
| 592 |
-
|
| 593 |
vf = valid.unsqueeze(-1).float()
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
pooled = summed / denom # (1,H)
|
| 597 |
-
|
| 598 |
-
if self.use_cache:
|
| 599 |
-
self._cache_pooled[s] = pooled
|
| 600 |
return pooled
|
| 601 |
|
| 602 |
@torch.no_grad()
|
| 603 |
def unpooled(self, seq: str) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 604 |
-
"""
|
| 605 |
-
Returns:
|
| 606 |
-
X: (1, Li, H) float32 on device
|
| 607 |
-
M: (1, Li) bool on device
|
| 608 |
-
where Li excludes padding + special tokens.
|
| 609 |
-
"""
|
| 610 |
s = seq.strip()
|
| 611 |
-
if self.use_cache and s in self._cache_unpooled:
|
| 612 |
-
return self._cache_unpooled[s]
|
| 613 |
-
|
| 614 |
tok = self._tokenize([s])
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
out = self.model(**tok)
|
| 619 |
-
h = out.last_hidden_state # (1,L,H)
|
| 620 |
-
|
| 621 |
-
valid = attn
|
| 622 |
-
if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
|
| 623 |
-
valid = valid & (~_safe_isin(ids, self.special_ids_t))
|
| 624 |
-
|
| 625 |
-
keep = valid[0] # (L,)
|
| 626 |
-
X = h[:, keep, :] # (1,Li,H)
|
| 627 |
M = torch.ones((1, X.shape[1]), dtype=torch.bool, device=self.device)
|
| 628 |
-
|
| 629 |
-
if self.use_cache:
|
| 630 |
-
self._cache_unpooled[s] = (X, M)
|
| 631 |
return X, M
|
| 632 |
|
| 633 |
-
def _clean_state_dict(sd: dict) -> dict:
|
| 634 |
-
# just for wt halflife transformer predictor
|
| 635 |
-
out = {}
|
| 636 |
-
for k, v in sd.items():
|
| 637 |
-
if k.startswith("module."):
|
| 638 |
-
k = k[len("module."):]
|
| 639 |
-
if k.startswith("model."):
|
| 640 |
-
k = k[len("model."):]
|
| 641 |
-
out[k] = v
|
| 642 |
-
return out
|
| 643 |
-
|
| 644 |
|
| 645 |
# -----------------------------
|
| 646 |
# Predictor
|
| 647 |
# -----------------------------
|
|
|
|
| 648 |
class PeptiVersePredictor:
|
| 649 |
-
"""
|
| 650 |
-
- loads best models from training_classifiers/
|
| 651 |
-
- computes embeddings as needed (pooled/unpooled)
|
| 652 |
-
- supports: xgb, joblib(ENET/SVM/SVR), NN(mlp/cnn/transformer), binding pooled/unpooled.
|
| 653 |
-
"""
|
| 654 |
def __init__(
|
| 655 |
self,
|
| 656 |
manifest_path: str | Path,
|
| 657 |
classifier_weight_root: str | Path,
|
| 658 |
esm_name="facebook/esm2_t33_650M_UR50D",
|
| 659 |
clm_name="aaronfeller/PeptideCLM-23M-all",
|
|
|
|
| 660 |
smiles_vocab="tokenizer/new_vocab.txt",
|
| 661 |
smiles_splits="tokenizer/new_splits.txt",
|
| 662 |
device: Optional[str] = None,
|
|
@@ -667,291 +684,398 @@ class PeptiVersePredictor:
|
|
| 667 |
|
| 668 |
self.manifest = read_best_manifest_csv(manifest_path)
|
| 669 |
|
| 670 |
-
self.wt_embedder
|
| 671 |
-
self.smiles_embedder
|
| 672 |
-
|
| 673 |
-
|
|
|
|
| 674 |
|
| 675 |
-
self.models:
|
| 676 |
-
self.meta:
|
|
|
|
|
|
|
| 677 |
|
| 678 |
self._load_all_best_models()
|
| 679 |
|
| 680 |
-
def
|
| 681 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 682 |
disk_prop = "half_life" if prop_key == "halflife" else prop_key
|
| 683 |
base = self.training_root / disk_prop
|
| 684 |
|
| 685 |
-
|
| 686 |
-
if prop_key == "halflife" and model_name in {"xgb_wt_log", "xgb_smiles"}:
|
| 687 |
-
d = base / model_name
|
| 688 |
-
if d.exists():
|
| 689 |
-
return d
|
| 690 |
|
| 691 |
-
if prop_key == "halflife" and
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
|
|
|
|
|
|
|
|
|
| 695 |
|
| 696 |
candidates = [
|
| 697 |
-
base / f"{model_name}_{
|
| 698 |
base / model_name,
|
| 699 |
]
|
| 700 |
-
if mode == "wt":
|
| 701 |
-
candidates += [base / f"{model_name}_wt"]
|
| 702 |
-
if mode == "smiles":
|
| 703 |
-
candidates += [base / f"{model_name}_smiles"]
|
| 704 |
-
|
| 705 |
for d in candidates:
|
| 706 |
-
if d.exists():
|
| 707 |
-
return d
|
| 708 |
|
| 709 |
raise FileNotFoundError(
|
| 710 |
-
f"Cannot find model
|
| 711 |
)
|
| 712 |
|
| 713 |
-
|
| 714 |
def _load_all_best_models(self):
|
| 715 |
for prop_key, row in self.manifest.items():
|
| 716 |
-
for
|
| 717 |
-
("wt",
|
| 718 |
-
("smiles", row.best_smiles,
|
| 719 |
]:
|
| 720 |
-
|
| 721 |
-
if m is None:
|
| 722 |
continue
|
|
|
|
| 723 |
|
| 724 |
-
#
|
| 725 |
if prop_key == "binding_affinity":
|
| 726 |
-
|
| 727 |
-
pooled_or_unpooled =
|
| 728 |
-
folder = f"wt_{mode}_{pooled_or_unpooled}" # wt_wt_pooled / wt_smiles_unpooled etc.
|
| 729 |
model_dir = self.training_root / "binding_affinity" / folder
|
| 730 |
art = find_best_artifact(model_dir)
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
"
|
| 737 |
-
"
|
| 738 |
-
"
|
| 739 |
-
"
|
|
|
|
| 740 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 741 |
continue
|
| 742 |
|
| 743 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 744 |
kind, obj, art = load_artifact(model_dir, self.device)
|
| 745 |
|
| 746 |
-
if kind
|
| 747 |
-
self.
|
|
|
|
| 748 |
else:
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
- pooled np array shape (1,H) for xgb/joblib
|
| 777 |
-
- unpooled torch tensors (X,M) for NN
|
| 778 |
-
"""
|
| 779 |
-
model = self.models[(prop_key, mode)]
|
| 780 |
-
meta = self.meta[(prop_key, mode)]
|
| 781 |
-
kind = meta.get("kind", None)
|
| 782 |
-
model_name = meta.get("model_name", "")
|
| 783 |
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 788 |
if kind == "torch_ckpt":
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
""
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
|
| 819 |
if prop_key == "binding_affinity":
|
| 820 |
raise RuntimeError("Use predict_binding_affinity().")
|
| 821 |
|
| 822 |
-
#
|
| 823 |
if kind == "torch_ckpt":
|
| 824 |
-
X, M = self.
|
| 825 |
with torch.no_grad():
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
and mode == "wt"
|
| 832 |
-
and model_name in {"xgb_wt_log", "transformer_wt_log"}
|
| 833 |
-
):
|
| 834 |
-
y = float(np.expm1(y))
|
| 835 |
if task_type == "classifier":
|
| 836 |
-
|
| 837 |
-
out
|
|
|
|
| 838 |
if thr is not None:
|
| 839 |
-
out["label"] = int(
|
| 840 |
-
out["threshold"] = float(thr)
|
| 841 |
-
return out
|
| 842 |
else:
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
model_name = meta.get("model_name", "")
|
| 852 |
-
if (
|
| 853 |
-
prop_key == "halflife"
|
| 854 |
-
and mode == "wt"
|
| 855 |
-
and model_name in {"xgb_wt_log", "transformer_wt_log"}
|
| 856 |
-
):
|
| 857 |
pred = float(np.expm1(pred))
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
# joblib
|
| 864 |
-
|
| 865 |
-
feats = self.
|
| 866 |
-
# classifier vs regressor behavior differs by estimator
|
| 867 |
if task_type == "classifier":
|
| 868 |
if hasattr(model, "predict_proba"):
|
| 869 |
pred = float(model.predict_proba(feats)[:, 1][0])
|
|
|
|
|
|
|
| 870 |
else:
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
else:
|
| 875 |
-
pred = float(model.predict(feats)[0])
|
| 876 |
-
out = {"property": prop_key, "mode": mode, "score": pred}
|
| 877 |
if thr is not None:
|
| 878 |
-
out["label"] = int(pred >= float(thr))
|
| 879 |
-
out["threshold"] = float(thr)
|
| 880 |
-
return out
|
| 881 |
else:
|
| 882 |
pred = float(model.predict(feats)[0])
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
|
|
|
| 886 |
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
"""
|
| 892 |
-
prop_key = "binding_affinity"
|
| 893 |
-
if (prop_key, mode) not in self.models:
|
| 894 |
-
raise KeyError(f"No binding model loaded for ({prop_key}, {mode}).")
|
| 895 |
|
| 896 |
-
|
| 897 |
-
pooled_or_unpooled = self.meta[(prop_key, mode)]["model_name"] # pooled/unpooled
|
| 898 |
|
| 899 |
-
|
| 900 |
-
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
| 905 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 906 |
with torch.no_grad():
|
| 907 |
reg, logits = model(t_vec, b_vec)
|
| 908 |
-
affinity = float(reg.squeeze().cpu().item())
|
| 909 |
-
cls_logit = int(torch.argmax(logits, dim=-1).cpu().item())
|
| 910 |
-
cls_thr = affinity_to_class(affinity)
|
| 911 |
else:
|
| 912 |
T, Mt = self.wt_embedder.unpooled(target_seq)
|
| 913 |
-
if
|
| 914 |
-
|
| 915 |
-
|
| 916 |
-
B, Mb = self.smiles_embedder.unpooled(binder_str)
|
| 917 |
with torch.no_grad():
|
| 918 |
reg, logits = model(T, Mt, B, Mb)
|
| 919 |
-
|
| 920 |
-
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
names
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
"
|
| 927 |
-
"
|
|
|
|
| 928 |
"class_by_threshold": names[cls_thr],
|
| 929 |
-
"class_by_logits":
|
| 930 |
-
"binding_model":
|
| 931 |
}
|
| 932 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 933 |
|
| 934 |
if __name__ == "__main__":
|
| 935 |
-
|
| 936 |
-
manifest_path="best_models.txt",
|
| 937 |
-
classifier_weight_root="./Classifier_Weight"
|
| 938 |
-
)
|
| 939 |
-
print(predictor.predict_property("hemolysis", "wt", "GIGAVLKVLTTGLPALISWIKRKRQQ"))
|
| 940 |
-
print(predictor.predict_binding_affinity("wt", target_seq="...", binder_str="..."))
|
| 941 |
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
-
|
| 946 |
-
wt = WTEmbedder(device)
|
| 947 |
-
sm = SMILESEmbedder(device,
|
| 948 |
-
vocab_path="./tokeizner/new_vocab.txt",
|
| 949 |
-
splits_path="./tokenizer/new_splits.txt"
|
| 950 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 951 |
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
|
| 955 |
-
|
| 956 |
-
|
| 957 |
-
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
|
|
|
| 2 |
import csv, re, json
|
| 3 |
from dataclasses import dataclass
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import Dict, Optional, Tuple, Any, List
|
|
|
|
| 6 |
import numpy as np
|
| 7 |
import torch
|
| 8 |
import torch.nn as nn
|
| 9 |
import joblib
|
| 10 |
import xgboost as xgb
|
|
|
|
| 11 |
from transformers import EsmModel, EsmTokenizer, AutoModelForMaskedLM
|
| 12 |
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 13 |
+
from lightning.pytorch import seed_everything
|
| 14 |
+
seed_everything(1986)
|
| 15 |
|
| 16 |
# -----------------------------
|
| 17 |
# Manifest
|
| 18 |
# -----------------------------
|
| 19 |
+
|
| 20 |
+
EMB_TAG_TO_FOLDER_SUFFIX = {
|
| 21 |
+
"wt": "wt",
|
| 22 |
+
"peptideclm": "smiles",
|
| 23 |
+
"chemberta": "chemberta",
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
EMB_TAG_TO_RUNTIME_MODE = {
|
| 27 |
+
"wt": "wt",
|
| 28 |
+
"peptideclm": "smiles",
|
| 29 |
+
"chemberta": "chemberta",
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
MAPIE_REGRESSION_MODELS = {"svr", "enet_gpu"}
|
| 33 |
+
DNN_ARCHS = {"mlp", "cnn", "transformer"}
|
| 34 |
+
XGB_MODELS = {"xgb", "xgb_reg", "xgb_wt_log", "xgb_smiles"}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
@dataclass(frozen=True)
|
| 38 |
class BestRow:
|
| 39 |
property_key: str
|
| 40 |
+
best_wt: Optional[Tuple[str, Optional[str]]]
|
| 41 |
+
best_smiles: Optional[Tuple[str, Optional[str]]]
|
| 42 |
+
task_type: str
|
| 43 |
+
thr_wt: Optional[float]
|
| 44 |
thr_smiles: Optional[float]
|
| 45 |
|
| 46 |
|
|
|
|
| 49 |
|
| 50 |
def _none_if_dash(s: str) -> Optional[str]:
|
| 51 |
s = _clean(s)
|
| 52 |
+
return None if s in {"", "-", "-", "NA", "N/A"} else s
|
|
|
|
|
|
|
| 53 |
|
| 54 |
def _float_or_none(s: str) -> Optional[float]:
|
| 55 |
s = _clean(s)
|
| 56 |
+
return None if s in {"", "-", "-", "NA", "N/A"} else float(s)
|
|
|
|
|
|
|
| 57 |
|
| 58 |
def normalize_property_key(name: str) -> str:
|
| 59 |
n = name.strip().lower()
|
| 60 |
n = re.sub(r"\s*\(.*?\)\s*", "", n)
|
| 61 |
n = n.replace("-", "_").replace(" ", "_")
|
|
|
|
| 62 |
if "permeability" in n and "pampa" not in n and "caco" not in n:
|
| 63 |
return "permeability_penetrance"
|
| 64 |
if n == "binding_affinity":
|
|
|
|
| 70 |
return n
|
| 71 |
|
| 72 |
|
| 73 |
+
MODEL_ALIAS = {
|
| 74 |
+
"SVM": "svm_gpu",
|
| 75 |
+
"SVR": "svr",
|
| 76 |
+
"ENET": "enet_gpu",
|
| 77 |
+
"CNN": "cnn",
|
| 78 |
+
"MLP": "mlp",
|
| 79 |
+
"TRANSFORMER": "transformer",
|
| 80 |
+
"XGB": "xgb",
|
| 81 |
+
"XGB_REG": "xgb_reg",
|
| 82 |
+
"POOLED": "pooled",
|
| 83 |
+
"UNPOOLED": "unpooled",
|
| 84 |
+
"TRANSFORMER_WT_LOG": "transformer_wt_log",
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
def _parse_model_and_emb(raw: Optional[str]) -> Optional[Tuple[str, Optional[str]]]:
|
| 88 |
+
if raw is None:
|
| 89 |
+
return None
|
| 90 |
+
raw = _clean(raw)
|
| 91 |
+
if not raw or raw in {"-", "-", "NA", "N/A"}:
|
| 92 |
+
return None
|
| 93 |
+
|
| 94 |
+
m = re.match(r"^(.+?)\s*\((.+?)\)\s*$", raw)
|
| 95 |
+
if m:
|
| 96 |
+
model_raw = m.group(1).strip()
|
| 97 |
+
emb_tag = m.group(2).strip().lower()
|
| 98 |
+
else:
|
| 99 |
+
model_raw = raw
|
| 100 |
+
emb_tag = None
|
| 101 |
+
|
| 102 |
+
canon = MODEL_ALIAS.get(model_raw.upper(), model_raw.lower())
|
| 103 |
+
return canon, emb_tag
|
| 104 |
+
|
| 105 |
+
|
| 106 |
def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
p = Path(path)
|
| 108 |
out: Dict[str, BestRow] = {}
|
| 109 |
|
|
|
|
| 129 |
continue
|
| 130 |
prop_key = normalize_property_key(prop_raw)
|
| 131 |
|
| 132 |
+
best_wt = _parse_model_and_emb(_none_if_dash(rec.get("Best_Model_WT", "")))
|
| 133 |
+
best_smiles = _parse_model_and_emb(_none_if_dash(rec.get("Best_Model_SMILES", "")))
|
| 134 |
+
|
| 135 |
row = BestRow(
|
| 136 |
property_key=prop_key,
|
| 137 |
+
best_wt=best_wt,
|
| 138 |
+
best_smiles=best_smiles,
|
| 139 |
task_type=_clean(rec.get("Type", "Classifier")),
|
| 140 |
thr_wt=_float_or_none(rec.get("Threshold_WT", "")),
|
| 141 |
thr_smiles=_float_or_none(rec.get("Threshold_SMILES", "")),
|
|
|
|
| 145 |
return out
|
| 146 |
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
# -----------------------------
|
| 149 |
# Generic artifact loading
|
| 150 |
# -----------------------------
|
| 151 |
def find_best_artifact(model_dir: Path) -> Path:
|
| 152 |
+
for pat in ["best_model.json", "best_model.pt", "best_model*.joblib",
|
| 153 |
+
"model.json", "model.ubj", "final_model.json"]:
|
| 154 |
hits = sorted(model_dir.glob(pat))
|
| 155 |
if hits:
|
| 156 |
return hits[0]
|
| 157 |
+
seed_pt = model_dir / "seed_1986" / "model.pt"
|
| 158 |
+
if seed_pt.exists():
|
| 159 |
+
return seed_pt
|
| 160 |
raise FileNotFoundError(f"No best_model artifact found in {model_dir}")
|
| 161 |
|
| 162 |
def load_artifact(model_dir: Path, device: torch.device) -> Tuple[str, Any, Path]:
|
| 163 |
art = find_best_artifact(model_dir)
|
|
|
|
| 164 |
if art.suffix == ".json":
|
| 165 |
booster = xgb.Booster()
|
|
|
|
| 166 |
booster.load_model(str(art))
|
| 167 |
return "xgb", booster, art
|
|
|
|
| 168 |
if art.suffix == ".joblib":
|
| 169 |
obj = joblib.load(art)
|
| 170 |
return "joblib", obj, art
|
|
|
|
| 171 |
if art.suffix == ".pt":
|
| 172 |
ckpt = torch.load(art, map_location=device, weights_only=False)
|
| 173 |
return "torch_ckpt", ckpt, art
|
|
|
|
| 174 |
raise ValueError(f"Unknown artifact type: {art}")
|
| 175 |
|
| 176 |
|
|
|
|
| 178 |
# NN architectures
|
| 179 |
# -----------------------------
|
| 180 |
class MaskedMeanPool(nn.Module):
|
| 181 |
+
def forward(self, X, M):
|
| 182 |
Mf = M.unsqueeze(-1).float()
|
| 183 |
denom = Mf.sum(dim=1).clamp(min=1.0)
|
| 184 |
return (X * Mf).sum(dim=1) / denom
|
|
|
|
| 188 |
super().__init__()
|
| 189 |
self.pool = MaskedMeanPool()
|
| 190 |
self.net = nn.Sequential(
|
| 191 |
+
nn.Linear(in_dim, hidden), nn.GELU(), nn.Dropout(dropout),
|
|
|
|
|
|
|
| 192 |
nn.Linear(hidden, 1),
|
| 193 |
)
|
| 194 |
def forward(self, X, M):
|
| 195 |
+
return self.net(self.pool(X, M)).squeeze(-1)
|
|
|
|
| 196 |
|
| 197 |
class CNNHead(nn.Module):
|
| 198 |
def __init__(self, in_ch, c=256, k=5, layers=2, dropout=0.1):
|
| 199 |
super().__init__()
|
| 200 |
+
blocks, ch = [], in_ch
|
|
|
|
| 201 |
for _ in range(layers):
|
| 202 |
+
blocks += [nn.Conv1d(ch, c, kernel_size=k, padding=k//2), nn.GELU(), nn.Dropout(dropout)]
|
|
|
|
|
|
|
| 203 |
ch = c
|
| 204 |
self.conv = nn.Sequential(*blocks)
|
| 205 |
self.head = nn.Linear(c, 1)
|
|
|
|
| 206 |
def forward(self, X, M):
|
| 207 |
+
Y = self.conv(X.transpose(1, 2)).transpose(1, 2)
|
|
|
|
| 208 |
Mf = M.unsqueeze(-1).float()
|
| 209 |
+
pooled = (Y * Mf).sum(dim=1) / Mf.sum(dim=1).clamp(min=1.0)
|
|
|
|
| 210 |
return self.head(pooled).squeeze(-1)
|
| 211 |
|
| 212 |
class TransformerHead(nn.Module):
|
|
|
|
| 219 |
)
|
| 220 |
self.enc = nn.TransformerEncoder(enc_layer, num_layers=layers)
|
| 221 |
self.head = nn.Linear(d_model, 1)
|
|
|
|
| 222 |
def forward(self, X, M):
|
| 223 |
+
Z = self.enc(self.proj(X), src_key_padding_mask=~M)
|
|
|
|
|
|
|
| 224 |
Mf = M.unsqueeze(-1).float()
|
| 225 |
+
pooled = (Z * Mf).sum(dim=1) / Mf.sum(dim=1).clamp(min=1.0)
|
|
|
|
| 226 |
return self.head(pooled).squeeze(-1)
|
| 227 |
|
| 228 |
def _infer_in_dim_from_sd(sd: dict, model_name: str) -> int:
|
| 229 |
+
if model_name == "mlp": return int(sd["net.0.weight"].shape[1])
|
| 230 |
+
if model_name == "cnn": return int(sd["conv.0.weight"].shape[1])
|
| 231 |
+
if model_name == "transformer": return int(sd["proj.weight"].shape[1])
|
|
|
|
|
|
|
|
|
|
| 232 |
raise ValueError(model_name)
|
| 233 |
|
| 234 |
+
def _infer_num_layers_from_sd(sd: dict, prefix: str = "enc.layers.") -> int:
|
| 235 |
+
idxs = set()
|
| 236 |
+
for k in sd.keys():
|
| 237 |
+
if k.startswith(prefix):
|
| 238 |
+
m = re.match(r"(\d+)\.", k[len(prefix):])
|
| 239 |
+
if m:
|
| 240 |
+
idxs.add(int(m.group(1)))
|
| 241 |
+
return (max(idxs) + 1) if idxs else 1
|
| 242 |
+
|
| 243 |
+
def _infer_transformer_arch_from_sd(sd: dict) -> Tuple[int, int, int]:
|
| 244 |
+
if "proj.weight" not in sd:
|
| 245 |
+
raise KeyError("Missing proj.weight in state_dict")
|
| 246 |
+
d_model = int(sd["proj.weight"].shape[0])
|
| 247 |
+
layers = _infer_num_layers_from_sd(sd, prefix="enc.layers.")
|
| 248 |
+
ff = int(sd["enc.layers.0.linear1.weight"].shape[0]) if "enc.layers.0.linear1.weight" in sd else 4 * d_model
|
| 249 |
+
return d_model, layers, ff
|
| 250 |
+
|
| 251 |
+
def _pick_nhead(d_model: int) -> int:
|
| 252 |
+
for h in (8, 6, 4, 3, 2, 1):
|
| 253 |
+
if d_model % h == 0:
|
| 254 |
+
return h
|
| 255 |
+
return 1
|
| 256 |
+
|
| 257 |
def build_torch_model_from_ckpt(model_name: str, ckpt: dict, device: torch.device) -> nn.Module:
|
| 258 |
params = ckpt["best_params"]
|
| 259 |
+
sd = ckpt["state_dict"]
|
| 260 |
in_dim = int(ckpt.get("in_dim", _infer_in_dim_from_sd(sd, model_name)))
|
| 261 |
dropout = float(params.get("dropout", 0.1))
|
| 262 |
|
|
|
|
| 266 |
model = CNNHead(in_ch=in_dim, c=int(params["channels"]), k=int(params["kernel"]),
|
| 267 |
layers=int(params["layers"]), dropout=dropout)
|
| 268 |
elif model_name == "transformer":
|
| 269 |
+
d_model = params.get("d_model") or params.get("hidden") or params.get("hidden_dim")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
if d_model is None:
|
| 271 |
+
d_model_i, layers_i, ff_i = _infer_transformer_arch_from_sd(sd)
|
| 272 |
+
nhead_i = _pick_nhead(d_model_i)
|
| 273 |
+
model = TransformerHead(
|
| 274 |
+
in_dim=in_dim, d_model=int(d_model_i), nhead=int(params.get("nhead", nhead_i)),
|
| 275 |
+
layers=int(params.get("layers", layers_i)), ff=int(params.get("ff", ff_i)),
|
| 276 |
+
dropout=float(params.get("dropout", dropout)),
|
| 277 |
+
)
|
| 278 |
+
else:
|
| 279 |
+
d_model = int(d_model)
|
| 280 |
+
model = TransformerHead(
|
| 281 |
+
in_dim=in_dim, d_model=d_model,
|
| 282 |
+
nhead=int(params.get("nhead", _pick_nhead(d_model))),
|
| 283 |
+
layers=int(params.get("layers", 2)),
|
| 284 |
+
ff=int(params.get("ff", 4 * d_model)),
|
| 285 |
+
dropout=dropout,
|
| 286 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
else:
|
| 288 |
raise ValueError(f"Unknown NN model_name={model_name}")
|
| 289 |
|
| 290 |
model.load_state_dict(sd)
|
| 291 |
+
model.to(device).eval()
|
|
|
|
| 292 |
return model
|
| 293 |
|
| 294 |
|
| 295 |
# -----------------------------
|
| 296 |
+
# Wrappers
|
| 297 |
+
# -----------------------------
|
| 298 |
+
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
|
| 299 |
+
|
| 300 |
+
class PassthroughRegressor(BaseEstimator, RegressorMixin):
|
| 301 |
+
def __init__(self, preds: np.ndarray):
|
| 302 |
+
self.preds = preds
|
| 303 |
+
def fit(self, X, y): return self
|
| 304 |
+
def predict(self, X): return self.preds[:len(X)]
|
| 305 |
+
|
| 306 |
+
class PassthroughClassifier(BaseEstimator, ClassifierMixin):
|
| 307 |
+
def __init__(self, preds: np.ndarray):
|
| 308 |
+
self.preds = preds
|
| 309 |
+
self.classes_ = np.array([0, 1])
|
| 310 |
+
def fit(self, X, y): return self
|
| 311 |
+
def predict(self, X): return (self.preds[:len(X)] >= 0.5).astype(int)
|
| 312 |
+
def predict_proba(self, X):
|
| 313 |
+
p = self.preds[:len(X)]
|
| 314 |
+
return np.stack([1 - p, p], axis=1)
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
# -----------------------------
|
| 318 |
+
# Uncertainty helpers
|
| 319 |
# -----------------------------
|
| 320 |
+
SEED_DIRS = ["seed_1986", "seed_42", "seed_0", "seed_123", "seed_12345"]
|
| 321 |
+
|
| 322 |
+
def load_seed_ensemble(model_dir: Path, arch: str, device: torch.device) -> List[nn.Module]:
|
| 323 |
+
ensemble = []
|
| 324 |
+
for sd_name in SEED_DIRS:
|
| 325 |
+
pt = model_dir / sd_name / "model.pt"
|
| 326 |
+
if not pt.exists():
|
| 327 |
+
continue
|
| 328 |
+
ckpt = torch.load(pt, map_location=device, weights_only=False)
|
| 329 |
+
ensemble.append(build_torch_model_from_ckpt(arch, ckpt, device))
|
| 330 |
+
return ensemble
|
| 331 |
+
|
| 332 |
+
def _binary_entropy(p: float) -> float:
|
| 333 |
+
p = float(np.clip(p, 1e-9, 1 - 1e-9))
|
| 334 |
+
return float(-p * np.log(p) - (1 - p) * np.log(1 - p))
|
| 335 |
+
|
| 336 |
+
def _ensemble_clf_uncertainty(ensemble: List[nn.Module], X: torch.Tensor, M: torch.Tensor) -> float:
|
| 337 |
+
probs = []
|
| 338 |
+
with torch.no_grad():
|
| 339 |
+
for m in ensemble:
|
| 340 |
+
logit = m(X, M).squeeze().float().cpu().item()
|
| 341 |
+
probs.append(1.0 / (1.0 + np.exp(-logit)))
|
| 342 |
+
return _binary_entropy(float(np.mean(probs)))
|
| 343 |
+
|
| 344 |
+
def _ensemble_reg_uncertainty(ensemble: List[nn.Module], X: torch.Tensor, M: torch.Tensor) -> float:
|
| 345 |
+
preds = []
|
| 346 |
+
with torch.no_grad():
|
| 347 |
+
for m in ensemble:
|
| 348 |
+
preds.append(m(X, M).squeeze().float().cpu().item())
|
| 349 |
+
return float(np.std(preds))
|
| 350 |
+
|
| 351 |
+
def _mapie_uncertainty(mapie_bundle: dict, score: float,
|
| 352 |
+
embedding: Optional[np.ndarray] = None) -> Tuple[float, float]:
|
| 353 |
+
"""
|
| 354 |
+
Returns (ci_low, ci_high) from a conformal bundle.
|
| 355 |
+
- adaptive: {"quantile": q, "sigma_model": xgb, "emb_tag": ..., "adaptive": True}
|
| 356 |
+
Input-dependent: interval = score +/- q * sigma(embedding)
|
| 357 |
+
- plain_quantile: {"quantile": q, "alpha": ...}
|
| 358 |
+
Fixed-width: interval = score +/- q
|
| 359 |
+
"""
|
| 360 |
+
# Adaptive format is input-dependent interval
|
| 361 |
+
if mapie_bundle.get("adaptive") and "sigma_model" in mapie_bundle:
|
| 362 |
+
q = float(mapie_bundle["quantile"])
|
| 363 |
+
if embedding is not None:
|
| 364 |
+
# Adaptive interval: y_hat ± q * sigma_hat(x).
|
| 365 |
+
# Equivalent to MAPIE's get_estimation_distribution():
|
| 366 |
+
# y_pred + conformity_scores * r_pred
|
| 367 |
+
# where conformity_scores=q and r_pred=sigma_hat(x).
|
| 368 |
+
# (ResidualNormalisedScore, Cordier et al. 2023)
|
| 369 |
+
sigma_model = mapie_bundle["sigma_model"]
|
| 370 |
+
sigma = float(sigma_model.predict(xgb.DMatrix(embedding.reshape(1, -1)))[0])
|
| 371 |
+
sigma = max(sigma, 1e-6)
|
| 372 |
+
else:
|
| 373 |
+
# No embedding available - fall back to fixed interval with sigma=1
|
| 374 |
+
sigma = 1.0
|
| 375 |
+
return float(score - q * sigma), float(score + q * sigma)
|
| 376 |
+
|
| 377 |
+
# Plain quantile format
|
| 378 |
+
if "quantile" in mapie_bundle:
|
| 379 |
+
q = float(mapie_bundle["quantile"])
|
| 380 |
+
return float(score - q), float(score + q)
|
| 381 |
+
|
| 382 |
+
X_dummy = np.zeros((1, 1))
|
| 383 |
+
result = mapie.predict(X_dummy)
|
| 384 |
+
if isinstance(result, tuple):
|
| 385 |
+
intervals = np.asarray(result[1])
|
| 386 |
+
if intervals.ndim == 3:
|
| 387 |
+
return float(intervals[0, 0, 0]), float(intervals[0, 1, 0])
|
| 388 |
+
return float(intervals[0, 0]), float(intervals[0, 1])
|
| 389 |
+
raise RuntimeError(
|
| 390 |
+
f"Cannot extract intervals: unknown MAPIE bundle format. "
|
| 391 |
+
f"Bundle keys: {list(mapie_bundle.keys())}."
|
| 392 |
+
)

def affinity_to_class(y: float) -> int:
    if y >= 9.0: return 0
    if y < 7.0: return 2
    return 1
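
# Example values (illustrative): affinity_to_class(9.3) -> 0 ("High"),
# affinity_to_class(8.1) -> 1 ("Moderate"), affinity_to_class(6.2) -> 2 ("Low");
# the same labels are used by predict_binding_affinity() below.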

# ... (unchanged context lines not shown in the diff) ...
        super().__init__()
        self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
        self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
        self.layers = nn.ModuleList([])
        for _ in range(n_layers):
            self.layers.append(nn.ModuleDict({
                "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
                "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
                "n1t": nn.LayerNorm(hidden), "n2t": nn.LayerNorm(hidden),
                "n1b": nn.LayerNorm(hidden), "n2b": nn.LayerNorm(hidden),
                "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
                "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
            }))
        self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
        self.reg = nn.Linear(hidden, 1)
        self.cls = nn.Linear(hidden, 3)

    def forward(self, t_vec, b_vec):
        t = self.t_proj(t_vec).unsqueeze(0)
        b = self.b_proj(b_vec).unsqueeze(0)
        for L in self.layers:
            t_attn, _ = L["attn_tb"](t, b, b)
            t = L["n1t"]((t + t_attn).transpose(0,1)).transpose(0,1)
            t = L["n2t"]((t + L["fft"](t)).transpose(0,1)).transpose(0,1)
            b_attn, _ = L["attn_bt"](b, t, t)
            b = L["n1b"]((b + b_attn).transpose(0,1)).transpose(0,1)
            b = L["n2b"]((b + L["ffb"](b)).transpose(0,1)).transpose(0,1)
        h = self.shared(torch.cat([t[0], b[0]], dim=-1))
        return self.reg(h).squeeze(-1), self.cls(h)

class CrossAttnUnpooled(nn.Module):
# ... (unchanged context lines not shown in the diff) ...
        super().__init__()
        self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
        self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
        self.layers = nn.ModuleList([])
        for _ in range(n_layers):
            self.layers.append(nn.ModuleDict({
                "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
                "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
                "n1t": nn.LayerNorm(hidden), "n2t": nn.LayerNorm(hidden),
                "n1b": nn.LayerNorm(hidden), "n2b": nn.LayerNorm(hidden),
                "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
                "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
            }))
        self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
        self.reg = nn.Linear(hidden, 1)
        self.cls = nn.Linear(hidden, 3)

    def _masked_mean(self, X, M):
        Mf = M.unsqueeze(-1).float()
        return (X * Mf).sum(dim=1) / Mf.sum(dim=1).clamp(min=1.0)

    def forward(self, T, Mt, B, Mb):
        T = self.t_proj(T); Bx = self.b_proj(B)
        kp_t, kp_b = ~Mt, ~Mb
        for L in self.layers:
            T_attn, _ = L["attn_tb"](T, Bx, Bx, key_padding_mask=kp_b)
            T = L["n1t"](T + T_attn); T = L["n2t"](T + L["fft"](T))
            B_attn, _ = L["attn_bt"](Bx, T, T, key_padding_mask=kp_t)
            Bx = L["n1b"](Bx + B_attn); Bx = L["n2b"](Bx + L["ffb"](Bx))
        h = self.shared(torch.cat([self._masked_mean(T, Mt), self._masked_mean(Bx, Mb)], dim=-1))
        return self.reg(h).squeeze(-1), self.cls(h)
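
# Note on the two binding heads (summary of the code above, not extra behaviour):
# CrossAttnPooled cross-attends two pooled per-sequence vectors, while
# CrossAttnUnpooled cross-attends per-token embeddings with boolean masks and
# mean-pools the result; both return (affinity regression, 3-class logits).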

def load_binding_model(best_model_pt: Path, pooled_or_unpooled: str, device: torch.device) -> nn.Module:
    ckpt = torch.load(best_model_pt, map_location=device, weights_only=False)
    params = ckpt["best_params"]
    sd = ckpt["state_dict"]
    Ht = int(sd["t_proj.0.weight"].shape[1])
    Hb = int(sd["b_proj.0.weight"].shape[1])
    common = dict(Ht=Ht, Hb=Hb, hidden=int(params["hidden_dim"]),
                  n_heads=int(params["n_heads"]), n_layers=int(params["n_layers"]),
                  dropout=float(params["dropout"]))
    cls = CrossAttnPooled if pooled_or_unpooled == "pooled" else CrossAttnUnpooled
    model = cls(**common)
    model.load_state_dict(sd)
    return model.to(device).eval()


# -----------------------------
# Embedding generation
# -----------------------------
def _safe_isin(ids: torch.Tensor, test_ids: torch.Tensor) -> torch.Tensor:
    if hasattr(torch, "isin"):
        return torch.isin(ids, test_ids)
    return (ids.unsqueeze(-1) == test_ids.view(1, 1, -1)).any(dim=-1)

class SMILESEmbedder:
    def __init__(self, device, vocab_path, splits_path,
                 clm_name="aaronfeller/PeptideCLM-23M-all", max_len=512, use_cache=True):
        self.device = device
        self.max_len = max_len
        self.use_cache = use_cache
        self.tokenizer = SMILES_SPE_Tokenizer(vocab_path, splits_path)
        self.model = AutoModelForMaskedLM.from_pretrained(clm_name).roformer.to(device).eval()
        self.special_ids = self._get_special_ids(self.tokenizer)
        self.special_ids_t = (torch.tensor(self.special_ids, device=device, dtype=torch.long)
                              if self.special_ids else None)
        self._cache_pooled: Dict[str, torch.Tensor] = {}
        self._cache_unpooled: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}

    @staticmethod
    def _get_special_ids(tokenizer) -> List[int]:
        cand = [getattr(tokenizer, f"{x}_token_id", None)
                for x in ("pad", "cls", "sep", "bos", "eos", "mask")]
        return sorted({int(x) for x in cand if x is not None})

    def _tokenize(self, smiles_list):
        tok = self.tokenizer(smiles_list, return_tensors="pt", padding=True,
                             truncation=True, max_length=self.max_len)
        for k in tok: tok[k] = tok[k].to(self.device)
        if "attention_mask" not in tok:
            tok["attention_mask"] = torch.ones_like(tok["input_ids"], dtype=torch.long, device=self.device)
        return tok

    def _valid_mask(self, ids, attn):
        valid = attn.bool()
        if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
            valid = valid & (~_safe_isin(ids, self.special_ids_t))
        return valid

    @torch.no_grad()
    def pooled(self, smiles: str) -> torch.Tensor:
        s = smiles.strip()
        if self.use_cache and s in self._cache_pooled: return self._cache_pooled[s]
        tok = self._tokenize([s])
        h = self.model(input_ids=tok["input_ids"], attention_mask=tok["attention_mask"]).last_hidden_state
        valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
        vf = valid.unsqueeze(-1).float()
        pooled = (h * vf).sum(dim=1) / vf.sum(dim=1).clamp(min=1e-9)
        if self.use_cache: self._cache_pooled[s] = pooled
        return pooled

    @torch.no_grad()
    def unpooled(self, smiles: str) -> Tuple[torch.Tensor, torch.Tensor]:
        s = smiles.strip()
        if self.use_cache and s in self._cache_unpooled: return self._cache_unpooled[s]
        tok = self._tokenize([s])
        h = self.model(input_ids=tok["input_ids"], attention_mask=tok["attention_mask"]).last_hidden_state
        valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
        X = h[:, valid[0], :]
        M = torch.ones((1, X.shape[1]), dtype=torch.bool, device=self.device)
        if self.use_cache: self._cache_unpooled[s] = (X, M)
        return X, M
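
# Interface note (summary, not additional behaviour): each embedder exposes
# pooled(s) -> a (1, hidden) mean over the non-special token states, and
# unpooled(s) -> (X, M) with X of shape (1, n_valid_tokens, hidden) and M an
# all-True boolean mask; both are cached per input string when use_cache is set.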


class ChemBERTaEmbedder:
    def __init__(self, device, model_name="DeepChem/ChemBERTa-77M-MLM",
                 max_len=512, use_cache=True):
        from transformers import AutoTokenizer, AutoModel
        self.device = device
        self.max_len = max_len
        self.use_cache = use_cache
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device).eval()
        self.special_ids = self._get_special_ids(self.tokenizer)
        self.special_ids_t = (torch.tensor(self.special_ids, device=device, dtype=torch.long)
                              if self.special_ids else None)
        self._cache_pooled: Dict[str, torch.Tensor] = {}
        self._cache_unpooled: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}

    @staticmethod
    def _get_special_ids(tokenizer) -> List[int]:
        cand = [getattr(tokenizer, f"{x}_token_id", None)
                for x in ("pad", "cls", "sep", "bos", "eos", "mask")]
        return sorted({int(x) for x in cand if x is not None})

    def _tokenize(self, smiles_list):
        tok = self.tokenizer(smiles_list, return_tensors="pt", padding=True,
                             truncation=True, max_length=self.max_len)
        for k in tok: tok[k] = tok[k].to(self.device)
        if "attention_mask" not in tok:
            tok["attention_mask"] = torch.ones_like(tok["input_ids"], dtype=torch.long, device=self.device)
        return tok

    def _valid_mask(self, ids, attn):
        valid = attn.bool()
        if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
            valid = valid & (~_safe_isin(ids, self.special_ids_t))
        return valid

    @torch.no_grad()
    def pooled(self, smiles: str) -> torch.Tensor:
        s = smiles.strip()
        if self.use_cache and s in self._cache_pooled: return self._cache_pooled[s]
        tok = self._tokenize([s])
        h = self.model(input_ids=tok["input_ids"], attention_mask=tok["attention_mask"]).last_hidden_state
        valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
        vf = valid.unsqueeze(-1).float()
        pooled = (h * vf).sum(dim=1) / vf.sum(dim=1).clamp(min=1e-9)
        if self.use_cache: self._cache_pooled[s] = pooled
        return pooled

    @torch.no_grad()
    def unpooled(self, smiles: str) -> Tuple[torch.Tensor, torch.Tensor]:
        s = smiles.strip()
        if self.use_cache and s in self._cache_unpooled: return self._cache_unpooled[s]
        tok = self._tokenize([s])
        h = self.model(input_ids=tok["input_ids"], attention_mask=tok["attention_mask"]).last_hidden_state
        valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
        X = h[:, valid[0], :]
        M = torch.ones((1, X.shape[1]), dtype=torch.bool, device=self.device)
        if self.use_cache: self._cache_unpooled[s] = (X, M)
        return X, M


class WTEmbedder:
    def __init__(self, device, esm_name="facebook/esm2_t33_650M_UR50D", max_len=1022, use_cache=True):
        self.device = device
        self.max_len = max_len
        self.use_cache = use_cache
        self.tokenizer = EsmTokenizer.from_pretrained(esm_name)
        self.model = EsmModel.from_pretrained(esm_name, add_pooling_layer=False).to(device).eval()
        self.special_ids = self._get_special_ids(self.tokenizer)
        self.special_ids_t = (torch.tensor(self.special_ids, device=device, dtype=torch.long)
                              if self.special_ids else None)
        self._cache_pooled: Dict[str, torch.Tensor] = {}
        self._cache_unpooled: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}

    @staticmethod
    def _get_special_ids(tokenizer) -> List[int]:
        cand = [getattr(tokenizer, f"{x}_token_id", None)
                for x in ("pad", "cls", "sep", "bos", "eos", "mask")]
        return sorted({int(x) for x in cand if x is not None})

    def _tokenize(self, seq_list):
        tok = self.tokenizer(seq_list, return_tensors="pt", padding=True,
                             truncation=True, max_length=self.max_len)
        tok = {k: v.to(self.device) for k, v in tok.items()}
        if "attention_mask" not in tok:
            tok["attention_mask"] = torch.ones_like(tok["input_ids"], dtype=torch.long, device=self.device)
        return tok

    def _valid_mask(self, ids, attn):
        valid = attn.bool()
        if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
            valid = valid & (~_safe_isin(ids, self.special_ids_t))
        return valid

    @torch.no_grad()
    def pooled(self, seq: str) -> torch.Tensor:
        s = seq.strip()
        if self.use_cache and s in self._cache_pooled: return self._cache_pooled[s]
        tok = self._tokenize([s])
        h = self.model(**tok).last_hidden_state
        valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
        vf = valid.unsqueeze(-1).float()
        pooled = (h * vf).sum(dim=1) / vf.sum(dim=1).clamp(min=1e-9)
        if self.use_cache: self._cache_pooled[s] = pooled
        return pooled

    @torch.no_grad()
    def unpooled(self, seq: str) -> Tuple[torch.Tensor, torch.Tensor]:
        s = seq.strip()
        if self.use_cache and s in self._cache_unpooled: return self._cache_unpooled[s]
        tok = self._tokenize([s])
        h = self.model(**tok).last_hidden_state
        valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
        X = h[:, valid[0], :]
        M = torch.ones((1, X.shape[1]), dtype=torch.bool, device=self.device)
        if self.use_cache: self._cache_unpooled[s] = (X, M)
        return X, M


# -----------------------------
# Predictor
# -----------------------------

class PeptiVersePredictor:
    def __init__(
        self,
        manifest_path: str | Path,
        classifier_weight_root: str | Path,
        esm_name="facebook/esm2_t33_650M_UR50D",
        clm_name="aaronfeller/PeptideCLM-23M-all",
        chemberta_name="DeepChem/ChemBERTa-77M-MLM",
        smiles_vocab="tokenizer/new_vocab.txt",
        smiles_splits="tokenizer/new_splits.txt",
        device: Optional[str] = None,
# ... (unchanged context lines not shown in the diff) ...

        self.manifest = read_best_manifest_csv(manifest_path)

        self.wt_embedder = WTEmbedder(self.device, esm_name=esm_name)
        self.smiles_embedder = SMILESEmbedder(self.device, clm_name=clm_name,
                                              vocab_path=str(self.root / smiles_vocab),
                                              splits_path=str(self.root / smiles_splits))
        self.chemberta_embedder = ChemBERTaEmbedder(self.device, model_name=chemberta_name)

        self.models: Dict[Tuple[str, str], Any] = {}
        self.meta: Dict[Tuple[str, str], Dict[str, Any]] = {}
        self.mapie: Dict[Tuple[str, str], dict] = {}
        self.ensembles: Dict[Tuple[str, str], List] = {}

        self._load_all_best_models()

    def _get_embedder(self, emb_tag: str):
        if emb_tag == "wt": return self.wt_embedder
        if emb_tag == "peptideclm": return self.smiles_embedder
        if emb_tag == "chemberta": return self.chemberta_embedder
        raise ValueError(f"Unknown emb_tag={emb_tag!r}")

    def _embed_pooled(self, emb_tag: str, input_str: str) -> np.ndarray:
        v = self._get_embedder(emb_tag).pooled(input_str)
        feats = v.detach().cpu().numpy().astype(np.float32)
        feats = np.nan_to_num(feats, nan=0.0)
        return np.clip(feats, np.finfo(np.float32).min, np.finfo(np.float32).max)

    def _embed_unpooled(self, emb_tag: str, input_str: str) -> Tuple[torch.Tensor, torch.Tensor]:
        return self._get_embedder(emb_tag).unpooled(input_str)

    def _resolve_dir(self, prop_key: str, model_name: str, emb_tag: str) -> Path:
        disk_prop = "half_life" if prop_key == "halflife" else prop_key
        base = self.training_root / disk_prop

        folder_suffix = EMB_TAG_TO_FOLDER_SUFFIX.get(emb_tag, emb_tag)

        if prop_key == "halflife" and emb_tag == "wt":
            if model_name == "transformer":
                for d in [base / "transformer_wt_log", base / "transformer_wt"]:
                    if d.exists(): return d
            if model_name in {"xgb", "xgb_reg"}:
                d = base / "xgb_wt_log"
                if d.exists(): return d

        candidates = [
            base / f"{model_name}_{folder_suffix}",
            base / model_name,
        ]
        for d in candidates:
            if d.exists(): return d

        raise FileNotFoundError(
            f"Cannot find model dir for {prop_key}/{model_name}/{emb_tag}. Tried: {candidates}"
        )
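
    # Layout note (inferred from the lookup above; names are illustrative): each
    # property keeps one folder per model/embedding combination under
    # self.training_root/<property>/, tried first as <model>_<folder suffix> and
    # then as plain <model>, with the explicit half-life fallbacks
    # transformer_wt_log and xgb_wt_log handled separately.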

    def _load_all_best_models(self):
        for prop_key, row in self.manifest.items():
            for col, parsed, thr in [
                ("wt", row.best_wt, row.thr_wt),
                ("smiles", row.best_smiles, row.thr_smiles),
            ]:
                if parsed is None:
                    continue
                model_name, emb_tag = parsed

                # binding affinity
                if prop_key == "binding_affinity":
                    folder = model_name
                    pooled_or_unpooled = "unpooled" if "unpooled" in folder else "pooled"
                    model_dir = self.training_root / "binding_affinity" / folder
                    art = find_best_artifact(model_dir)
                    model = load_binding_model(art, pooled_or_unpooled, self.device)
                    self.models[(prop_key, col)] = model
                    self.meta[(prop_key, col)] = {
                        "task_type": "Regression",
                        "threshold": None,
                        "artifact": str(art),
                        "model_name": pooled_or_unpooled,
                        "emb_tag": emb_tag,
                        "folder": folder,
                        "kind": "binding",
                    }
                    print(f" [LOAD] binding_affinity ({col}): folder={folder}, arch={pooled_or_unpooled}, emb_tag={emb_tag}, art={art.name}")
                    mapie_path = model_dir / "mapie_calibration.joblib"
                    if mapie_path.exists():
                        try:
                            self.mapie[(prop_key, col)] = joblib.load(mapie_path)
                            print(f" MAPIE loaded from {mapie_path.name}")
                        except Exception as e:
                            print(f" MAPIE load FAILED for ({prop_key}, {col}): {e}")
                    else:
                        print(f" No MAPIE bundle found (uncertainty will be unavailable)")
                    continue

                # infer emb_tag
                if emb_tag is None:
                    emb_tag = col

                model_dir = self._resolve_dir(prop_key, model_name, emb_tag)
                kind, obj, art = load_artifact(model_dir, self.device)

                if kind == "torch_ckpt":
                    arch = self._base_arch(model_name)
                    model = build_torch_model_from_ckpt(arch, obj, self.device)
                else:
                    model = obj

                self.models[(prop_key, col)] = model
                self.meta[(prop_key, col)] = {
                    "task_type": row.task_type,
                    "threshold": thr,
                    "artifact": str(art),
                    "model_name": model_name,
                    "emb_tag": emb_tag,
                    "kind": kind,
                }

                print(f" [LOAD] ({prop_key}, {col}): kind={kind}, model={model_name}, emb={emb_tag}, task={row.task_type}, art={art.name}")

                # MAPIE: SVR/ElasticNet, XGBoost regression, AND all regression torch_ckpt
                is_regression = row.task_type.lower() == "regression"
                wants_mapie = (
                    (model_name in MAPIE_REGRESSION_MODELS and is_regression)
                    or (kind == "xgb" and is_regression)
                    or (kind == "torch_ckpt" and is_regression)
                )
                if wants_mapie:
                    mapie_path = model_dir / "mapie_calibration.joblib"
                    if mapie_path.exists():
                        try:
                            self.mapie[(prop_key, col)] = joblib.load(mapie_path)
                            print(f" MAPIE loaded from {mapie_path.name}")
                        except Exception as e:
                            print(f" MAPIE load FAILED for ({prop_key}, {col}): {e}")
                    else:
                        print(f" No MAPIE bundle found at {mapie_path} (will fall back to ensemble if available)")

                # Seed ensembles: DNN only, used when MAPIE not available
                if kind == "torch_ckpt":
                    arch = self._base_arch(model_name)
                    ens = load_seed_ensemble(model_dir, arch, self.device)
                    if ens:
                        self.ensembles[(prop_key, col)] = ens
                        if (prop_key, col) in self.mapie:
                            print(f" Seed ensemble: {len(ens)} seeds loaded (MAPIE takes priority for regression)")
                        else:
                            unc_type = "ensemble_predictive_entropy" if row.task_type.lower() == "classifier" else "ensemble_std"
                            print(f" Seed ensemble: {len(ens)} seeds loaded - uncertainty method: {unc_type}")
                    else:
                        if (prop_key, col) in self.mapie:
                            print(f" No seed ensemble (MAPIE covers uncertainty)")
                        else:
                            print(f" No seed ensemble found (checked: {SEED_DIRS}) - uncertainty unavailable")

                # XGBoost/SVM classifiers: binary entropy
                if kind in ("xgb", "joblib") and row.task_type.lower() == "classifier":
                    print(f" Uncertainty method: binary_predictive_entropy (computed at inference)")

    @staticmethod
    def _base_arch(model_name: str) -> str:
        if model_name.startswith("transformer"): return "transformer"
        if model_name.startswith("mlp"): return "mlp"
        if model_name.startswith("cnn"): return "cnn"
        return model_name

    # Feature extraction
    def _get_features(self, prop_key: str, col: str, input_str: str):
        meta = self.meta[(prop_key, col)]
        emb_tag = meta["emb_tag"]
        kind = meta["kind"]
        if kind == "torch_ckpt":
            return self._embed_unpooled(emb_tag, input_str)
        return self._embed_pooled(emb_tag, input_str)

    # Uncertainty
    def _compute_uncertainty(self, prop_key: str, col: str, input_str: str,
                             score: float) -> Tuple[Any, str]:
        meta = self.meta[(prop_key, col)]
        kind = meta["kind"]
        model_name = meta["model_name"]
        task_type = meta["task_type"].lower()
        emb_tag = meta["emb_tag"]

        # Pooled embedding for adaptive MAPIE sigma model
        def get_pooled_emb():
            return self._embed_pooled(emb_tag, input_str) if emb_tag else None

        # DNN
        if kind == "torch_ckpt":
            # Regression: prefer MAPIE if available
            if task_type == "regression":
                mapie_bundle = self.mapie.get((prop_key, col))
                if mapie_bundle:
                    emb = get_pooled_emb() if mapie_bundle.get("adaptive") else None
                    lo, hi = _mapie_uncertainty(mapie_bundle, score, emb)
                    return (lo, hi), "conformal_prediction_interval"
                # Fall back to seed ensemble std
                ens = self.ensembles.get((prop_key, col))
                if ens:
                    X, M = self._embed_unpooled(emb_tag, input_str)
                    return _ensemble_reg_uncertainty(ens, X, M), "ensemble_std"
                return None, "unavailable (no MAPIE bundle and no seed ensemble)"
            # Classifier: ensemble predictive entropy
            ens = self.ensembles.get((prop_key, col))
            if not ens:
                return None, "unavailable (no seed ensemble found)"
            X, M = self._embed_unpooled(emb_tag, input_str)
            return _ensemble_clf_uncertainty(ens, X, M), "ensemble_predictive_entropy"

        # XGBoost
        if kind == "xgb":
            if task_type == "classifier":
                return _binary_entropy(score), "binary_predictive_entropy"
            mapie_bundle = self.mapie.get((prop_key, col))
            if mapie_bundle:
                emb = get_pooled_emb() if mapie_bundle.get("adaptive") else None
                lo, hi = _mapie_uncertainty(mapie_bundle, score, emb)
                return (lo, hi), "conformal_prediction_interval"
            return None, "unavailable (no MAPIE bundle for XGBoost regression)"

        # SVR / ElasticNet regression: MAPIE
        if kind == "joblib" and model_name in MAPIE_REGRESSION_MODELS and task_type == "regression":
            mapie_bundle = self.mapie.get((prop_key, col))
            if mapie_bundle:
                emb = get_pooled_emb() if mapie_bundle.get("adaptive") else None
                lo, hi = _mapie_uncertainty(mapie_bundle, score, emb)
                return (lo, hi), "conformal_prediction_interval"
            return None, "unavailable (MAPIE bundle not found)"

        # joblib classifiers (SVM, ElasticNet used as classifier)
        if kind == "joblib" and task_type == "classifier":
            return _binary_entropy(score), "binary_predictive_entropy_single_model"

        return None, "unavailable"
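
    # Dispatch summary (restates the branches above): torch_ckpt regression ->
    # MAPIE interval, else seed-ensemble std; torch_ckpt classifier -> ensemble
    # predictive entropy; xgb/joblib classifiers -> binary entropy of the score;
    # xgb/SVR/ElasticNet regression -> MAPIE interval when a bundle exists.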

    def predict_property(self, prop_key: str, col: str, input_str: str,
                         uncertainty: bool = False) -> Dict[str, Any]:
        if (prop_key, col) not in self.models:
            raise KeyError(f"No model loaded for ({prop_key}, {col}).")

        meta = self.meta[(prop_key, col)]
        model = self.models[(prop_key, col)]
        task_type = meta["task_type"].lower()
        thr = meta.get("threshold")
        kind = meta["kind"]
        model_name = meta["model_name"]

        if prop_key == "binding_affinity":
            raise RuntimeError("Use predict_binding_affinity().")

        # DNN
        if kind == "torch_ckpt":
            X, M = self._get_features(prop_key, col, input_str)
            with torch.no_grad():
                raw = model(X, M).squeeze().float().cpu().item()

            if prop_key == "halflife" and col == "wt" and "log" in model_name:
                raw = float(np.expm1(raw))

            if task_type == "classifier":
                score = float(1.0 / (1.0 + np.exp(-raw)))
                out = {"property": prop_key, "col": col, "score": score,
                       "emb_tag": meta["emb_tag"]}
                if thr is not None:
                    out["label"] = int(score >= float(thr)); out["threshold"] = float(thr)
            else:
                out = {"property": prop_key, "col": col, "score": float(raw),
                       "emb_tag": meta["emb_tag"]}

        # XGBoost
        elif kind == "xgb":
            feats = self._get_features(prop_key, col, input_str)
            pred = float(model.predict(xgb.DMatrix(feats))[0])
            if prop_key == "halflife" and col == "wt" and "log" in model_name:
                pred = float(np.expm1(pred))
            out = {"property": prop_key, "col": col, "score": pred,
                   "emb_tag": meta["emb_tag"]}
            if task_type == "classifier" and thr is not None:
                out["label"] = int(pred >= float(thr)); out["threshold"] = float(thr)

        # joblib (SVM / ElasticNet / SVR)
        elif kind == "joblib":
            feats = self._get_features(prop_key, col, input_str)
            if task_type == "classifier":
                if hasattr(model, "predict_proba"):
                    pred = float(model.predict_proba(feats)[:, 1][0])
                elif hasattr(model, "decision_function"):
                    pred = float(1.0 / (1.0 + np.exp(-model.decision_function(feats)[0])))
                else:
                    pred = float(model.predict(feats)[0])
                out = {"property": prop_key, "col": col, "score": pred,
                       "emb_tag": meta["emb_tag"]}
                if thr is not None:
                    out["label"] = int(pred >= float(thr)); out["threshold"] = float(thr)
            else:
                pred = float(model.predict(feats)[0])
                out = {"property": prop_key, "col": col, "score": pred,
                       "emb_tag": meta["emb_tag"]}
        else:
            raise RuntimeError(f"Unknown kind={kind}")

        if uncertainty:
            u_val, u_type = self._compute_uncertainty(prop_key, col, input_str, out["score"])
            out["uncertainty"] = u_val
            out["uncertainty_type"] = u_type

        return out
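
    # Example of the returned dict for a classifier head (keys follow the code
    # above; values are hypothetical): {"property": "hemolysis", "col": "wt",
    # "score": 0.12, "emb_tag": "wt", "label": 0, "threshold": 0.5,
    # "uncertainty": 0.37, "uncertainty_type": "ensemble_predictive_entropy"}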

    def predict_binding_affinity(self, col: str, target_seq: str, binder_str: str,
                                 uncertainty: bool = False) -> Dict[str, Any]:
        prop_key = "binding_affinity"
        if (prop_key, col) not in self.models:
            raise KeyError(f"No binding model loaded for ({prop_key}, {col}).")

        model = self.models[(prop_key, col)]
        meta = self.meta[(prop_key, col)]
        arch = meta["model_name"]
        emb_tag = meta.get("emb_tag")

        if arch == "pooled":
            t_vec = self.wt_embedder.pooled(target_seq)
            b_vec = self._get_embedder(emb_tag or col).pooled(binder_str) if emb_tag else \
                    (self.wt_embedder.pooled(binder_str) if col == "wt" else self.smiles_embedder.pooled(binder_str))
            with torch.no_grad():
                reg, logits = model(t_vec, b_vec)
        else:
            T, Mt = self.wt_embedder.unpooled(target_seq)
            binder_emb = self._get_embedder(emb_tag or col) if emb_tag else \
                         (self.wt_embedder if col == "wt" else self.smiles_embedder)
            B, Mb = binder_emb.unpooled(binder_str)
            with torch.no_grad():
                reg, logits = model(T, Mt, B, Mb)

        affinity = float(reg.squeeze().cpu().item())
        cls_logit = int(torch.argmax(logits, dim=-1).cpu().item())
        cls_thr = affinity_to_class(affinity)
        names = {0: "High (≥9)", 1: "Moderate (7-9)", 2: "Low (<7)"}

        out = {
            "property": "binding_affinity",
            "col": col,
            "affinity": affinity,
            "class_by_threshold": names[cls_thr],
            "class_by_logits": names[cls_logit],
            "binding_model": arch,
        }

        if uncertainty:
            mapie_bundle = self.mapie.get((prop_key, col))
            if mapie_bundle:
                if mapie_bundle.get("adaptive") and "sigma_model" in mapie_bundle:
                    # Concatenate target + binder pooled embeddings for sigma model
                    binder_emb_tag = mapie_bundle.get("emb_tag") or col
                    target_emb_tag = mapie_bundle.get("target_emb_tag", "wt")
                    t_vec = self.wt_embedder.pooled(target_seq).cpu().float().numpy()
                    b_vec = self._get_embedder(binder_emb_tag).pooled(binder_str).cpu().float().numpy()
                    emb = np.concatenate([t_vec, b_vec], axis=1)
                else:
                    emb = None
                lo, hi = _mapie_uncertainty(mapie_bundle, affinity, emb)
                out["uncertainty"] = (lo, hi)
                out["uncertainty_type"] = "conformal_prediction_interval"
            else:
                out["uncertainty"] = None
                out["uncertainty_type"] = "unavailable (no MAPIE bundle found)"

        return out

if __name__ == "__main__":
    root = Path(__file__).resolve().parent  # current script folder

    predictor = PeptiVersePredictor(
        manifest_path=root / "best_models.txt",
        classifier_weight_root=root
    )
    print(predictor.training_root)
    print("MAPIE keys:", list(predictor.mapie.keys()))
    print("Ensemble keys:", list(predictor.ensembles.keys()))

    seq = "GIGAVLKVLTTGLPALISWIKRKRQQ"
    smiles = "C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O"

    print(predictor.predict_property("hemolysis", "wt", seq))
    print(predictor.predict_property("hemolysis", "smiles", smiles, uncertainty=True))
    print(predictor.predict_property("nf", "wt", seq, uncertainty=True))
    print(predictor.predict_property("nf", "smiles", smiles, uncertainty=True))
    print(predictor.predict_binding_affinity("wt", target_seq=seq, binder_str="GIGAVLKVLT"))
    print(predictor.predict_binding_affinity("wt", target_seq=seq, binder_str="GIGAVLKVLT", uncertainty=True))
    seq1 = "GIGAVLKVLTTGLPALISWIKRKRQQ"
    seq2 = "ACDEFGHIKLMNPQRSTVWY"

    r1 = predictor.predict_binding_affinity("wt", target_seq=seq2, binder_str="GIGAVLKVLT", uncertainty=True)
    r2 = predictor.predict_property("nf", "wt", seq1, uncertainty=True)
    r3 = predictor.predict_property("nf", "wt", seq2, uncertainty=True)
    print(r1)
    print(r2)
    print(r3)
tokenizer/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (136 Bytes)

tokenizer/__pycache__/my_tokenizers.cpython-310.pyc
DELETED
Binary file (16.2 kB)