File size: 8,333 Bytes

411e478

"""
ESM2 backbone + pluggable aggregation head + classification head.

The ESM2 backbone is always frozen.  Only the aggregation module and the
classifier head are trained.

ESM2 model variants (all from facebook):
    esm2_t6_8M_UR50D      ->  d=320,   8M params
    esm2_t12_35M_UR50D    ->  d=480,  35M params   (default)
    esm2_t30_150M_UR50D   ->  d=640, 150M params
    esm2_t33_650M_UR50D   ->  d=1280, 650M params
    esm2_t36_3B_UR50D     ->  d=2560,  3B params
"""

from typing import Dict, List, Optional, Union

import torch
import torch.nn as nn
from transformers import AutoTokenizer, EsmModel

from .aggregators import (
    CLSPooling,
    CovariancePooling,
    GLOTPooling,
    GLOTResidueGraphPooling,
    MaxPooling,
    MeanPooling,
)

# Map of aggregation method names to classes.
# The keys are the values accepted by the `aggregation` argument of
# ProteinSequenceClassifier; the key order is what the "Unknown aggregation"
# error message prints.  Each class is instantiated as `cls(d_in=..., **kwargs)`
# and the resulting object is expected to expose an `out_dim` attribute and to
# be callable as `aggregator(token_embeddings, mask, **extra)`.
AGGREGATOR_REGISTRY: Dict[str, type] = {
    "mean": MeanPooling,
    "max": MaxPooling,
    "cls": CLSPooling,
    "glot": GLOTPooling,
    "glot_residue": GLOTResidueGraphPooling,
    "covariance": CovariancePooling,
}

# ESM2 hidden dimensions by model name.
# Keys are full HuggingFace model IDs (including the "facebook/" prefix);
# values are the per-token embedding width d.  Model names that are not listed
# here are still usable: the classifier falls back to the loaded model's
# `config.hidden_size`.
ESM2_HIDDEN_DIMS: Dict[str, int] = {
    "facebook/esm2_t6_8M_UR50D": 320,
    "facebook/esm2_t12_35M_UR50D": 480,
    "facebook/esm2_t30_150M_UR50D": 640,
    "facebook/esm2_t33_650M_UR50D": 1280,
    "facebook/esm2_t36_3B_UR50D": 2560,
}


class ProteinSequenceClassifier(nn.Module):
    """End-to-end model: frozen ESM2 -> aggregation -> classification.

    The ESM2 backbone has ``requires_grad=False`` on every parameter and is
    kept in eval mode even when the surrounding model is switched to training
    mode (see :meth:`train`).  Only the aggregator and the classifier head
    receive gradients.

    Args:
        esm2_model_name:  HuggingFace model ID for ESM2.
        aggregation:      Name of aggregation method (see AGGREGATOR_REGISTRY).
        num_classes:      Number of output classes.
        aggregator_kwargs: Extra arguments passed to the aggregator constructor.
        classifier_hidden: If >0, adds a hidden layer in the classifier head.
        dropout:          Dropout rate before the classifier.
        strip_special_tokens: Whether to drop the <cls> position and mask out
                              the <eos> position before aggregation.  ``None``
                              (the default) strips for every aggregation except
                              "cls", which operates on the raw ESM2 output.

    Raises:
        ValueError: If ``aggregation`` is not a key of AGGREGATOR_REGISTRY.
    """

    def __init__(
        self,
        esm2_model_name: str = "facebook/esm2_t12_35M_UR50D",
        aggregation: str = "mean",
        num_classes: int = 10,
        aggregator_kwargs: Optional[Dict] = None,
        classifier_hidden: int = 0,
        dropout: float = 0.1,
        strip_special_tokens: Optional[bool] = None,
    ):
        super().__init__()

        # Validate first so a typo fails before any weights are loaded.
        if aggregation not in AGGREGATOR_REGISTRY:
            raise ValueError(
                f"Unknown aggregation '{aggregation}'. "
                f"Choose from: {list(AGGREGATOR_REGISTRY.keys())}"
            )

        self.esm2_model_name = esm2_model_name
        self.aggregation_name = aggregation

        # ---- ESM2 backbone (frozen) ----
        self.esm2 = EsmModel.from_pretrained(esm2_model_name)
        for param in self.esm2.parameters():
            param.requires_grad = False
        self.esm2.eval()

        # ---- Determine hidden size ----
        # Known checkpoints come from the table; anything else (e.g. a local
        # path) uses the size reported by the loaded config.
        self.d_esm2 = ESM2_HIDDEN_DIMS.get(
            esm2_model_name, self.esm2.config.hidden_size
        )

        # ---- Aggregation head ----
        agg_cls = AGGREGATOR_REGISTRY[aggregation]
        agg_kwargs = aggregator_kwargs or {}
        self.aggregator = agg_cls(d_in=self.d_esm2, **agg_kwargs)

        # Whether to strip <cls>/<eos> before aggregation
        if strip_special_tokens is None:
            self.strip_special = aggregation != "cls"
        else:
            self.strip_special = bool(strip_special_tokens)

        # ---- Classification head ----
        agg_dim = self.aggregator.out_dim
        if classifier_hidden > 0:
            self.classifier = nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(agg_dim, classifier_hidden),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(classifier_hidden, num_classes),
            )
        else:
            self.classifier = nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(agg_dim, num_classes),
            )

    def train(self, mode: bool = True):
        """Set the training mode of the trainable heads only.

        ``nn.Module.train`` recurses into every submodule, which would put the
        frozen backbone back into training mode.  The backbone is forced back
        to eval mode afterwards so its output stays deterministic.

        Args:
            mode: ``True`` for training mode, ``False`` for evaluation mode.

        Returns:
            ``self``, matching ``nn.Module.train``.
        """
        super().train(mode)
        self.esm2.eval()
        return self

    @property
    def tokenizer(self):
        """Tokenizer for ``esm2_model_name``, loaded on first access and cached."""
        if getattr(self, "_tokenizer", None) is None:
            self._tokenizer = AutoTokenizer.from_pretrained(self.esm2_model_name)
        return self._tokenizer

    def get_residue_embeddings(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> tuple:
        """Extract per-residue embeddings from frozen ESM2.

        When ``self.strip_special`` is set, the first position (<cls>) is
        sliced off both outputs and the last valid position of each row
        (<eos>) is zeroed in the returned mask; its embedding column is kept
        so the tensor stays rectangular.  This assumes right-padded input of
        the form ``[<cls>, AA1, ..., AAN, <eos>, <pad>, ...]``.

        Args:
            input_ids:      [B, L_full] token ids.
            attention_mask: [B, L_full] mask, non-zero for real tokens.  It is
                            never modified in place.

        Returns:
            token_embeddings: [B, L, d]  (L = L_full - 1 when stripping)
            mask:             [B, L]
        """
        with torch.no_grad():
            outputs = self.esm2(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )

        hidden_states = outputs.last_hidden_state  # [B, L_full, d]

        if not self.strip_special:
            return hidden_states, attention_mask

        token_embeddings = hidden_states[:, 1:, :]  # remove <cls>
        mask = attention_mask[:, 1:]

        # After dropping <cls>, the <eos> token sits at index (valid count - 1).
        # Rows with no valid tokens give index -1, which matches no position,
        # so they are left untouched.
        eos_index = mask.sum(dim=1).long() - 1  # [B]
        positions = torch.arange(mask.shape[1], device=mask.device)  # [L]
        is_eos = positions.unsqueeze(0) == eos_index.unsqueeze(1)  # [B, L]

        # Out-of-place fill: one vectorised op and the caller's mask is safe.
        mask = mask.masked_fill(is_eos, 0)

        return token_embeddings, mask

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
        pdb_paths: Optional[List[Optional[str]]] = None,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """Run backbone, aggregation and classification.

        Args:
            input_ids:      [B, L] tokenized protein sequences.
            attention_mask: [B, L] attention mask.
            labels:         [B] class labels (optional, for loss computation).
            pdb_paths:      List of PDB file paths (only for glot_residue aggregation).
                            Forwarded to the aggregator only when not None.
            **kwargs:       Accepted and ignored, so batches carrying extra
                            keys can be splatted into the call.

        Returns:
            Dict with keys 'logits' [B, num_classes] and 'embeddings'
            [B, agg_dim]; 'loss' (scalar cross-entropy) is added when
            ``labels`` is given.
        """
        # Extract residue embeddings from frozen ESM2
        token_embeddings, mask = self.get_residue_embeddings(input_ids, attention_mask)

        # Aggregate to sequence-level
        extra_kwargs = {}
        if pdb_paths is not None:
            extra_kwargs["pdb_paths"] = pdb_paths

        sequence_embedding = self.aggregator(
            token_embeddings, mask, **extra_kwargs
        )  # [B, agg_dim]

        # Classify
        logits = self.classifier(sequence_embedding)  # [B, num_classes]

        result = {"logits": logits, "embeddings": sequence_embedding}

        if labels is not None:
            result["loss"] = nn.functional.cross_entropy(logits, labels)

        return result

    def encode(
        self,
        sequences: Union[str, List[str]],
        pdb_paths: Optional[List[Optional[str]]] = None,
        max_length: int = 1024,
        device: Optional[torch.device] = None,
    ) -> torch.Tensor:
        """Convenience method: tokenize + forward to get sequence embeddings.

        The model is evaluated in eval mode without gradients; the training
        mode it had on entry is restored before returning.

        Args:
            sequences: Single protein sequence or list of sequences.
            pdb_paths: Optional PDB paths for glot_residue aggregation.
            max_length: Maximum sequence length (ESM2 supports up to 1026).
            device: Device to run on.  Defaults to the device of the model's
                    first parameter.

        Returns:
            Sequence-level embeddings [B, agg_dim].
        """
        if isinstance(sequences, str):
            sequences = [sequences]

        if device is None:
            device = next(self.parameters()).device

        inputs = self.tokenizer(
            sequences,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(device)

        # Remember the caller's mode so encoding mid-training does not leave
        # dropout disabled for the rest of the run.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                outputs = self.forward(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    pdb_paths=pdb_paths,
                )
        finally:
            self.train(was_training)

        return outputs["embeddings"]