Shrey Goel committed on
Commit ·
94c2704
0
Parent(s):
initial commit
Browse files- README.md +26 -0
- __init__.py +0 -0
- configs/wt_pep.yaml +84 -0
- setup.py +10 -0
- src/__init__.py +0 -0
- src/madsbm/__init__.py +0 -0
- src/madsbm/wt_peptide/__init__.py +0 -0
- src/madsbm/wt_peptide/control_field.py +199 -0
- src/madsbm/wt_peptide/dataloader.py +93 -0
- src/madsbm/wt_peptide/main.py +96 -0
- src/madsbm/wt_peptide/sbm_module.py +320 -0
- src/sampling/diffusion_sampler.py +106 -0
- src/sampling/guided_sample.py +121 -0
- src/sampling/madsbm_sampler.py +198 -0
- src/sampling/path_tracer.py +46 -0
- src/utils/__init__.py +0 -0
- src/utils/eval_utils.py +27 -0
- src/utils/fbd_score_model.py +104 -0
- src/utils/generate_utils.py +118 -0
- src/utils/model_utils.py +42 -0
- src/utils/time_utils.py +41 -0
README.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Token-Level Guided Discrete Diffusion for Membrane Protein Design
|
| 2 |
+
|
| 3 |
+

|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
arXiv preprint: ...
|
| 7 |
+
|
| 8 |
+
Reparameterized diffusion models (RDMs) have recently matched autoregressive methods in protein generation, motivating their use for challenging tasks such as designing membrane proteins, which possess interleaved soluble and transmembrane (TM) regions.
|
| 9 |
+
|
| 10 |
+
We introduce ***Membrane Diffusion Language Model (MemDLM)***, a fine-tuned RDM-based protein language model that enables controllable membrane protein sequence design. MemDLM-generated sequences recapitulate the TM residue density and structural features of natural membrane proteins, achieving comparable biological plausibility and outperforming state-of-the-art diffusion baselines in motif scaffolding tasks by producing:
|
| 11 |
+
|
| 12 |
+
- Lower perplexity
|
| 13 |
+
- Higher BLOSUM-62 scores
|
| 14 |
+
- Improved pLDDT confidence
|
| 15 |
+
|
| 16 |
+
To enhance controllability, we develop ***Per-Token Guidance (PET)***, a novel classifier-guided sampling strategy that selectively solubilizes residues while preserving conserved TM domains. This yields sequences with reduced TM density but intact functional cores.
|
| 17 |
+
|
| 18 |
+
Importantly, MemDLM designs validated in TOXCAT β-lactamase growth assays demonstrate successful TM insertion, distinguishing high-quality generated sequences from poor ones.
|
| 19 |
+
|
| 20 |
+
Together, our framework establishes the first experimentally validated diffusion-based model for rational membrane protein generation, integrating *de novo* design, motif scaffolding, and targeted property optimization.
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
## **Repository Authors**
|
| 25 |
+
- <u>[Shrey Goel](https://shreygoel09.github.io/)</u> – undergraduate student at Duke University
|
| 26 |
+
- <u>[Pranam Chatterjee](mailto:pranam@seas.upenn.edu)</u> – Assistant Professor at University of Pennsylvania
|
__init__.py
ADDED
|
File without changes
|
configs/wt_pep.yaml
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
seed: 42
|
| 3 |
+
base_dir: /scratch/pranamlab/sgoel/MadSBM
|
| 4 |
+
|
| 5 |
+
training:
|
| 6 |
+
mode: test # train / test / resume_from_ckpt
|
| 7 |
+
n_unfrozen: 3
|
| 8 |
+
n_epochs: 50
|
| 9 |
+
log_every_n_steps: 50
|
| 10 |
+
num_sanity_val_steps: 2
|
| 11 |
+
val_check_interval:
|
| 12 |
+
enable_progress_bar: true
|
| 13 |
+
grad_clip_val: 10.0
|
| 14 |
+
accumulate_grad_batches: 16 # to workaround dynamic batching
|
| 15 |
+
devices: 1 # number of GPUs
|
| 16 |
+
|
| 17 |
+
model:
|
| 18 |
+
ablate: false
|
| 19 |
+
evoflow_model: fredzzp/EvoFlow-150M-afdbseq
|
| 20 |
+
esm_model: facebook/esm2_t33_650M_UR50D
|
| 21 |
+
n_layers: 2 #8
|
| 22 |
+
n_heads: 16 #8
|
| 23 |
+
hidden_dim: 1280
|
| 24 |
+
attn_drop: 0.0
|
| 25 |
+
resid_drop: 0.0
|
| 26 |
+
mlp_ratio: 4.0
|
| 27 |
+
beta1: 1e-6
|
| 28 |
+
beta2: 1e-6
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
time_embed:
|
| 32 |
+
time_dim: 512
|
| 33 |
+
fourier_dim: 64
|
| 34 |
+
fourier_scale: 30.0
|
| 35 |
+
time_schedule: uniform # linear / exponential / uniform
|
| 36 |
+
anneal_frac: 0.75
|
| 37 |
+
min_time: 1e-6
|
| 38 |
+
n_timesteps: 500
|
| 39 |
+
|
| 40 |
+
data:
|
| 41 |
+
batch_size: 1
|
| 42 |
+
#max_seq_len: 500
|
| 43 |
+
train: /scratch/pranamlab/tong/data/peptide/tokenized_peptide_batched/train
|
| 44 |
+
test: /scratch/pranamlab/tong/data/peptide/tokenized_peptide_batched/test
|
| 45 |
+
val: /scratch/pranamlab/tong/data/peptide/tokenized_peptide_batched/val
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
optim:
|
| 49 |
+
type: adamw
|
| 50 |
+
scheduler: cosine
|
| 51 |
+
lr: 1e-4
|
| 52 |
+
lr_end: 1e-5
|
| 53 |
+
warmup_init_lr: 1e-6
|
| 54 |
+
warmup_epochs: 2
|
| 55 |
+
weight_decay: 0.01
|
| 56 |
+
beta1: 0.9
|
| 57 |
+
beta2: 0.999
|
| 58 |
+
power: 1
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
wandb:
|
| 62 |
+
project: MadSBM_PEPTIDE
|
| 63 |
+
group: programmablebio
|
| 64 |
+
name: peptide_og-madsbm_esm_no-gclip_lr=1e-4_n-layers=2_n-heads=16_trainable-lm-head_logits-sum-SM_gate-esm
|
| 65 |
+
#name: peptide_og-madsbm_esm_no-gclip_lr=1e-4_n-layers=2_n-heads=16_trainable-lm-head_logits-sum-SM_ABLATE-gate-esm
|
| 66 |
+
id: ${.name}_${seed}
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
checkpointing:
|
| 70 |
+
save_every_n_epochs: 1
|
| 71 |
+
save_dir: ${base_dir}/checkpoints/wt_pep/${wandb.name}
|
| 72 |
+
resume_ckpt_path: ${checkpointing.save_dir}/last.ckpt
|
| 73 |
+
best_ckpt_path: ${checkpointing.save_dir}/best-model_epoch=41_step=106890.ckpt
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
sampling:
|
| 77 |
+
model_type: madsbm # madsbm / diffusion / dfm
|
| 78 |
+
n_steps: 32
|
| 79 |
+
top_p: 0.9
|
| 80 |
+
rate_scale: 0.01
|
| 81 |
+
jump_scale: 0.05
|
| 82 |
+
tau: 0.5
|
| 83 |
+
M: 16
|
| 84 |
+
beta: 2.0
|
setup.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Minimal packaging script for the MadSBM project.
from setuptools import setup, find_packages

setup(
    name='MadSBM',
    version='1.0',
    packages=find_packages(),  # discovers src/ packages via their __init__.py files
    install_requires=[],  # dependencies are managed outside of setup.py
    author='Shrey Goel',
    author_email='shrey.goel@duke.edu'
)
|
src/__init__.py
ADDED
|
File without changes
|
src/madsbm/__init__.py
ADDED
|
File without changes
|
src/madsbm/wt_peptide/__init__.py
ADDED
|
File without changes
|
src/madsbm/wt_peptide/control_field.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
|
| 6 |
+
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer
|
| 7 |
+
|
| 8 |
+
from src.utils.time_utils import TimeEmbedding
|
| 9 |
+
from src.utils.model_utils import _print
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# -------------------------
|
| 13 |
+
# DiT building blocks
|
| 14 |
+
# -------------------------
|
| 15 |
+
|
| 16 |
+
class MLP(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout.

    The hidden width is ``int(dim * mlp_ratio)``; input and output widths are equal.
    """

    def __init__(self, dim, mlp_ratio, dropout):
        super().__init__()
        inner_dim = int(dim * mlp_ratio)
        self.fc1 = nn.Linear(dim, inner_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(inner_dim, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = self.dropout(self.act(self.fc1(x)))
        return self.dropout(self.fc2(hidden))
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class DiTBlock1D(nn.Module):
    """One DiT-style transformer block over a 1-D token sequence.

    Pre-norm self-attention and MLP sub-blocks, each modulated by a FiLM-style
    (scale, shift) pair projected from the time embedding (adaptive LayerNorm,
    as in DiT). Residual connections are taken from the un-modulated input.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.hidden_dim = cfg.model.hidden_dim
        self.time_dim = cfg.time_embed.time_dim

        self.norm1 = nn.LayerNorm(self.hidden_dim, eps=1e-6)
        self.norm2 = nn.LayerNorm(self.hidden_dim, eps=1e-6)

        # time-conditioned scale & shift for both norms
        self.time_proj1 = nn.Linear(self.time_dim, 2 * self.hidden_dim)  # scale1, shift1
        self.time_proj2 = nn.Linear(self.time_dim, 2 * self.hidden_dim)  # scale2, shift2

        self.attn = nn.MultiheadAttention(
            embed_dim=self.hidden_dim,
            num_heads=cfg.model.n_heads,
            dropout=cfg.model.attn_drop,
            batch_first=True
        )

        self.mlp = MLP(
            self.hidden_dim,
            mlp_ratio=cfg.model.mlp_ratio,
            dropout=cfg.model.resid_drop
        )

    def forward(self, x, t_emb, key_padding_mask=None):
        """Apply attention + MLP sub-blocks.

        Args:
            x: token features, shape [B, L, hidden_dim].
            t_emb: time embedding, shape [B, time_dim].
            key_padding_mask: optional bool mask, True at padded positions.
        """
        # ----- Self-attention branch -----
        # Adaptive LayerNorm (AdaLN) + FiLM from time embedding
        scale1, shift1 = self.time_proj1(t_emb).chunk(2, dim=-1)  # [B, D] and [B, D]
        h = self.norm1(x)
        h = h * (1 + scale1.unsqueeze(1)) + shift1.unsqueeze(1)  # [B, L, D]

        attn_out, _ = self.attn(
            h,
            h,
            h,
            key_padding_mask=key_padding_mask,  # True for pads
            need_weights=False,
        )
        # Residual from the raw (pre-norm) input.
        x = x + attn_out

        # ----- MLP branch -----
        scale2, shift2 = self.time_proj2(t_emb).chunk(2, dim=-1)
        h2 = self.norm2(x)
        h2 = h2 * (1 + scale2.unsqueeze(1)) + shift2.unsqueeze(1)

        mlp_out = self.mlp(h2)
        x = x + mlp_out

        return x
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class PeptideControlField(nn.Module):
    """Control field for peptide MadSBM: frozen ESM backbone + trainable DiT head.

    The frozen masked-LM supplies both a base logit field (gated by ``1 - t``)
    and the hidden states consumed by the DiT blocks. The DiT output projection
    is zero-initialized so training starts from the pure ESM field.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        pth = cfg.model.esm_model
        self.embed_model = AutoModelForMaskedLM.from_pretrained(pth, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(pth, trust_remote_code=True)

        # Freeze params
        self.embed_model.eval()
        for param in self.embed_model.parameters():
            param.requires_grad = False

        # # Unfreeze QKV in last few encoder layers
        # encoder_layers = self.embed_model.esm.encoder.layer
        # for layer in encoder_layers[-cfg.training.n_unfrozen:]:
        #     for param in layer.parameters():
        #         param.requires_grad = True

        self.time_embed = TimeEmbedding(
            hidden_dim=cfg.time_embed.time_dim,
            fourier_dim=cfg.time_embed.fourier_dim,
            scale=cfg.time_embed.fourier_scale
        )

        self.blocks = nn.ModuleList([
            DiTBlock1D(self.cfg)
            for _ in range(cfg.model.n_layers)
        ])

        self.final_norm = nn.LayerNorm(cfg.model.hidden_dim, eps=1e-6)

        # self.output_proj = self.embed_model.lm_head
        # for param in self.output_proj.parameters():
        #     param.requires_grad = False

        # Zero-init: the DiT correction contributes nothing at step 0, so the
        # combined field starts out identical to the gated ESM logits.
        self.output_proj = nn.Linear(cfg.model.hidden_dim, self.tokenizer.vocab_size)
        nn.init.zeros_(self.output_proj.weight)
        nn.init.zeros_(self.output_proj.bias)

    def forward(self, t, xt, attention_mask):
        """Return the base ("esm"), correction ("dit"), and combined ("madsbm") logit fields.

        Args:
            t: per-sample diffusion times, shape [B] (assumed in [0, 1] — values
                outside that range would make the gate negative; TODO confirm).
            xt: noisy token ids, shape [B, L].
            attention_mask: 1 for real tokens, 0 for padding, shape [B, L].
        """
        # Backbone is frozen; no gradients needed through it.
        with torch.no_grad():
            outs = self.embed_model(input_ids=xt, attention_mask=attention_mask, output_hidden_states=True)

        # Gate the ESM logits down as t -> 1 (fully noised).
        gate = (1.0 - t).view(-1, 1, 1)
        u_base = gate * outs.logits

        h = outs.hidden_states[-1]
        t_emb = self.time_embed(t)  # [B, time_dim]

        # Transformer head (key_padding_mask=True for pads)
        key_padding_mask = (attention_mask == 0)  # (B, L) bool
        for dit_block in self.blocks:
            h = dit_block(h, t_emb, key_padding_mask=key_padding_mask)

        # Final norm + projection to vocab logits
        h = self.final_norm(h)  # [B, L, hidden_dim]
        logits = self.output_proj(h)  # [B, L, V]

        return {
            "esm": u_base,
            "dit": logits,
            "madsbm": u_base + logits
        }
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
# def forward(self, t, xt, attention_mask):
|
| 158 |
+
# outs = self.embed_model(input_ids=xt, attention_mask=attention_mask, output_hidden_states=True)
|
| 159 |
+
# h = outs.hidden_states[-1]
|
| 160 |
+
# t_emb = self.time_embed(t) # [B, time_dim]
|
| 161 |
+
|
| 162 |
+
# # Transformer head (key_padding_mask=True for pads)
|
| 163 |
+
# key_padding_mask = (attention_mask == 0) # (B, L) bool
|
| 164 |
+
# for dit_block in self.blocks:
|
| 165 |
+
# h = dit_block(h, t_emb, key_padding_mask=key_padding_mask)
|
| 166 |
+
|
| 167 |
+
# # Final norm + projection to vocab logits
|
| 168 |
+
# h = self.final_norm(h) # [B, L, hidden_dim]
|
| 169 |
+
# logits = self.output_proj(h) # [B, L, V]
|
| 170 |
+
# return logits
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
# def forward(self, xt, attention_mask, t):
|
| 174 |
+
# with torch.no_grad():
|
| 175 |
+
# base_out = self.embed_model(
|
| 176 |
+
# input_ids=xt,
|
| 177 |
+
# attention_mask=attention_mask,
|
| 178 |
+
# output_hidden_states=True
|
| 179 |
+
# )
|
| 180 |
+
|
| 181 |
+
# logits_base = base_out.logits
|
| 182 |
+
# h_base = base_out.hidden_states[-1]
|
| 183 |
+
|
| 184 |
+
# norm = self.token_norm_sqrd.view(1,1,-1) # 1, 1, V
|
| 185 |
+
|
| 186 |
+
# log_R0 = (self.beta1 * logits_base) - (self.beta2 * norm)
|
| 187 |
+
|
| 188 |
+
# t_emb = self.time_embed(t) # [B, time_dim]
|
| 189 |
+
# key_padding_mask = (attention_mask == 0) # (B, L) bool
|
| 190 |
+
|
| 191 |
+
# h_ctrl = h_base
|
| 192 |
+
# for dit_block in self.blocks:
|
| 193 |
+
# h_ctrl = dit_block(h_ctrl, t_emb, key_padding_mask=key_padding_mask)
|
| 194 |
+
|
| 195 |
+
# h_ctrl = self.final_norm(h_ctrl)
|
| 196 |
+
# u_theta = self.output_proj(h_ctrl)
|
| 197 |
+
# tot_logits = log_R0 + u_theta
|
| 198 |
+
|
| 199 |
+
# return tot_logits, u_theta
|
src/madsbm/wt_peptide/dataloader.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import lightning.pytorch as pl
|
| 4 |
+
|
| 5 |
+
from omegaconf import OmegaConf
|
| 6 |
+
from datasets import load_from_disk
|
| 7 |
+
from torch.utils.data import DataLoader
|
| 8 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 9 |
+
from functools import partial
|
| 10 |
+
from src.utils.model_utils import _print
|
| 11 |
+
|
| 12 |
+
config = OmegaConf.load('/scratch/pranamlab/sgoel/MadSBM/configs/wt_pep.yaml')
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# class DNADataset(Dataset):
|
| 16 |
+
# def __init__(self, config, data_path):
|
| 17 |
+
# self.config = config
|
| 18 |
+
# self.data = pd.read_csv(data_path)
|
| 19 |
+
# self.custom_tokenizer = CustomDNATokenizer(config.model.dna_model_path)
|
| 20 |
+
|
| 21 |
+
# def __len__(self):
|
| 22 |
+
# return len(self.data)
|
| 23 |
+
|
| 24 |
+
# def __getitem__(self, idx):
|
| 25 |
+
# sequence = self.data.iloc[idx]["Sequence"]
|
| 26 |
+
# seq = sequence.upper()
|
| 27 |
+
|
| 28 |
+
# tokenized = self.custom_tokenizer(seq, max_length=self.config.data.max_seq_len)
|
| 29 |
+
|
| 30 |
+
# return {
|
| 31 |
+
# "input_ids": tokenized["input_ids"].squeeze(0),
|
| 32 |
+
# "attention_mask": tokenized["attention_mask"].squeeze(0)
|
| 33 |
+
# }
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def collate_fn(batch, pad_id=None):
    """Collate for dynamic batching with DataLoader batch_size=1.

    Each dataset row already holds a pre-batched [B, L] grid of token ids,
    so this simply unwraps the single element and converts it to tensors.

    Args:
        batch: list containing exactly one pre-batched example dict with
            'input_ids' and 'attention_mask' keys.
        pad_id: unused; kept for call-site compatibility (padding was applied
            when the dataset was tokenized).

    Returns:
        dict with 'input_ids' and 'attention_mask' tensors.
    """
    example = batch[0]
    # as_tensor avoids an extra copy (and the PyTorch UserWarning) when the
    # stored values are already tensors, while still converting plain lists.
    return {
        'input_ids': torch.as_tensor(example['input_ids']),
        'attention_mask': torch.as_tensor(example['attention_mask'])
    }
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class PeptideDataModule(pl.LightningDataModule):
    """Lightning data module serving pre-batched peptide datasets.

    The datasets are dynamically batched on disk, so the DataLoader batch
    size must stay at 1 and ``collate_fn`` simply unwraps each element.
    The three split loaders share one construction path (previously the
    DataLoader call was triplicated verbatim).
    """

    def __init__(self, config, train_dataset, val_dataset, test_dataset, tokenizer, collate_fn=collate_fn):
        super().__init__()
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.tokenizer = tokenizer
        self.collate_fn = collate_fn
        self.batch_size = config.data.batch_size
        assert self.batch_size == 1, f'Batch size = {self.batch_size}. Needs to be 1 for dynamic batching'

    def _make_loader(self, dataset):
        """Shared DataLoader construction for all three splits (no shuffling:
        order is fixed by the pre-batched dataset on disk)."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          collate_fn=partial(self.collate_fn),
                          num_workers=8,
                          shuffle=False,
                          pin_memory=True)

    def train_dataloader(self):
        return self._make_loader(self.train_dataset)

    def val_dataloader(self):
        return self._make_loader(self.val_dataset)

    def test_dataloader(self):
        return self._make_loader(self.test_dataset)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def get_datasets(config):
    """Helper method to grab datasets to quickly init data module in main.py"""
    split_paths = {
        "train": config.data.train,
        "val": config.data.val,
        "test": config.data.test,
    }
    return {split: load_from_disk(path) for split, path in split_paths.items()}
|
| 93 |
+
|
src/madsbm/wt_peptide/main.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
"""Train / evaluate the MadSBM peptide model, driven by configs/wt_pep.yaml."""

import sys
import os
import torch
import wandb
import lightning.pytorch as pl

from omegaconf import OmegaConf
from lightning.pytorch.strategies import DDPStrategy
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor


from src.madsbm.wt_peptide.sbm_module import MadSBM
from src.madsbm.wt_peptide.dataloader import PeptideDataModule, get_datasets

# SECURITY FIX: an API key was previously hard-coded here; it must be revoked
# and never committed. wandb reads WANDB_API_KEY from the environment, and
# key=None falls back to cached credentials / the env var.
wandb.login(key=os.environ.get("WANDB_API_KEY"))


# Load yaml config
config = OmegaConf.load("/scratch/pranamlab/sgoel/MadSBM/configs/wt_pep.yaml")

# Initialize WandB for logging
wandb.init(project=config.wandb.project, name=config.wandb.name)
wandb_logger = WandbLogger(**config.wandb)

# PL checkpoints
lr_monitor = LearningRateMonitor(logging_interval="step")

# Save a checkpoint at the end of every epoch (save_top_k=-1 keeps them all).
every_epoch_cb = ModelCheckpoint(
    dirpath=config.checkpointing.save_dir,
    filename="{epoch:02d}_{step}",
    save_top_k=-1,
    every_n_epochs=1,
    save_on_train_epoch_end=True,
    verbose=True,
)

# Track the single best model by validation loss.
best_ckpt_cb = ModelCheckpoint(
    monitor="val/loss",
    dirpath=config.checkpointing.save_dir,
    filename="best-model_{epoch:02d}_{step}",
    save_top_k=1,
    mode="min",
    verbose=True,
    save_last=False,
)

# PL trainer
trainer = pl.Trainer(
    #max_steps=None, # Ensure training is based on epochs so we can compare with MOG-DFM and DirichletFM
    max_epochs=config.training.n_epochs,
    accelerator="cuda" if torch.cuda.is_available() else "cpu",
    devices=config.training.devices if config.training.mode == 'train' else [0],
    strategy=DDPStrategy(find_unused_parameters=True),
    callbacks=[every_epoch_cb, best_ckpt_cb, lr_monitor],
    logger=wandb_logger
)


# Folder to save checkpoints (exist_ok replaces the old try/except FileExistsError)
ckpt_path = config.checkpointing.save_dir
os.makedirs(ckpt_path, exist_ok=True)

# PL Model for training
sbm_model = MadSBM(config)
sbm_model.validate_config()

# Get datasets
datasets = get_datasets(config)
data_module = PeptideDataModule(
    config=config,
    train_dataset=datasets['train'],
    val_dataset=datasets['val'],
    test_dataset=datasets['test'],
    tokenizer=sbm_model.tokenizer,
)

# Start/resume training or evaluate the model
if config.training.mode == "train":
    trainer.fit(sbm_model, datamodule=data_module)

elif config.training.mode == "test":
    state_dict = sbm_model.get_state_dict(config.checkpointing.best_ckpt_path)
    sbm_model.load_state_dict(state_dict)
    trainer.test(sbm_model, datamodule=data_module, ckpt_path=config.checkpointing.best_ckpt_path)

elif config.training.mode in ("resume_from_ckpt", "resume_from_checkpoint"):
    # BUG FIXES: (1) the yaml documents the mode as 'resume_from_ckpt', which
    # the old 'resume_from_checkpoint'-only check could never match; (2) the
    # resume path lives under `checkpointing`, not `training`, in the yaml;
    # (3) Trainer.fit needs the checkpoint FILE, not the save directory.
    resume_path = config.checkpointing.resume_ckpt_path
    state_dict = sbm_model.get_state_dict(resume_path)
    sbm_model.load_state_dict(state_dict)
    trainer.fit(sbm_model, datamodule=data_module, ckpt_path=resume_path)

else:
    # Previously an unknown mode fell through silently and did nothing.
    raise ValueError(f"Unrecognized training mode: {config.training.mode}")

wandb.finish()
|
| 96 |
+
|
src/madsbm/wt_peptide/sbm_module.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gc
|
| 2 |
+
import os
|
| 3 |
+
import math
|
| 4 |
+
from re import L
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
import lightning as pl
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
|
| 10 |
+
from transformers import AutoModel
|
| 11 |
+
|
| 12 |
+
from src.madsbm.wt_peptide.control_field import PeptideControlField
|
| 13 |
+
from src.PeptiVerse.inference import PeptiVersePredictor
|
| 14 |
+
from src.utils.model_utils import CosineWarmup, _print, compute_grad_norms
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class MadSBM(pl.LightningModule):
|
| 18 |
+
    def __init__(self, config, guidance=None):
        """Lightning module wrapping the peptide control field for SBM training.

        Args:
            config: OmegaConf config (see configs/wt_pep.yaml).
            guidance: optional guidance object — not used in this constructor;
                presumably consumed elsewhere (TODO confirm).
        """
        super().__init__()

        self.config = config
        self.model = PeptideControlField(config)
        self.tokenizer = self.model.tokenizer
        self.vocab_size = self.tokenizer.vocab_size

        # Special token ids used when corrupting / padding sequences.
        self.mask_id = self.tokenizer.mask_token_id
        self.pad_id = self.tokenizer.pad_token_id

        # Frozen ESM encoder (separate from the control field's own backbone).
        self.embed_model = AutoModel.from_pretrained(config.model.esm_model)
        self.embed_model.eval()
        for param in self.embed_model.parameters():
            param.requires_grad = False

        self.beta = 1.0 / self.config.model.hidden_dim

        # self.L = config.data.max_seq_len
        # self.V = self.vocab_size
        # self.log_R0 = - math.log(self.L * self.V) # uninformed generator is constant

        # Diffusion-time sampling configuration; t is clamped to [eps, 1 - eps].
        self.time_schedule = config.time_embed.time_schedule
        self.anneal_frac = config.time_embed.anneal_frac
        self.eps = float(config.time_embed.min_time)
        self.t_max = 1.0 - self.eps
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# -------# Forward Pass #-------- #
|
| 47 |
+
def forward(self, input_ids, attention_mask, t):
|
| 48 |
+
return self.model(xt=input_ids, attention_mask=attention_mask, t=t)
|
| 49 |
+
|
| 50 |
+
    def step(self, batch):
        """Shared train/val step: noise the clean batch, predict, compute CE loss.

        Returns:
            (loss, ppl, max_u_logit, max_esm_logit) — the two max logits are
            returned for monitoring the DiT vs. ESM field magnitudes.
        """
        x1 = batch['input_ids']
        attn_mask = batch['attention_mask']
        maskable = self.is_maskable(x1)

        # Sample per-sequence diffusion times and mask tokens accordingly.
        t = self.sample_t(x1)
        xt = self.noise_seq(x1, t, maskable_mask=maskable)

        outs = self.forward(xt, attn_mask, t)
        if self.config.model.ablate:
            # Ablation: train on the DiT head alone (no gated ESM base field).
            logits = outs['dit']
        else:
            logits = outs['madsbm']
        max_u_logit = outs['dit'].max().item()
        max_esm_logit = outs['esm'].max().item()

        # Per-token cross-entropy against the clean sequence; pads ignored.
        loss_token = F.cross_entropy(
            logits.view(-1, logits.size(-1)),
            x1.view(-1),
            reduction = 'none',
            ignore_index=self.pad_id
        )
        loss_token = loss_token.view(x1.size(0), x1.size(1))

        # Average only over maskable positions; clamp avoids divide-by-zero
        # when a sequence has no maskable tokens.
        sample_loss = (loss_token * maskable.float()).sum(dim=1) / maskable.float().sum(dim=1).clamp(min=1.0)

        loss = sample_loss.mean()
        ppl = torch.exp(loss)

        _print(f'loss: {loss}')
        _print(f'ppl: {ppl}')

        return loss, ppl, max_u_logit, max_esm_logit
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# def step(self, batch):
|
| 87 |
+
# x1 = batch['input_ids']
|
| 88 |
+
# attn_mask = batch['attention_mask']
|
| 89 |
+
# maskable = self.is_maskable(x1)
|
| 90 |
+
|
| 91 |
+
# t = self.sample_t(x1)
|
| 92 |
+
# xt = self.noise_seq(x1, t, maskable_mask=maskable)
|
| 93 |
+
|
| 94 |
+
# u_theta = self.forward(xt, attn_mask, t)
|
| 95 |
+
# b, l, v_target = self.compute_target(x1, xt, t, maskable_mask=maskable)
|
| 96 |
+
# loss, ppl = self.compute_loss(u_theta, v_target, x1, b, l)
|
| 97 |
+
|
| 98 |
+
# _print(f'loss: {loss}')
|
| 99 |
+
# _print(f'ppl: {ppl}')
|
| 100 |
+
|
| 101 |
+
# return loss, ppl
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# -------# Main Training Logic #-------- #
|
| 105 |
+
def noise_seq(self, x1, t, maskable_mask):
|
| 106 |
+
B, L = x1.shape
|
| 107 |
+
t = t.unsqueeze(1) # B, 1
|
| 108 |
+
|
| 109 |
+
# reveal if u < t, mask if u >= t
|
| 110 |
+
u = torch.rand((B, L), device=x1.device)
|
| 111 |
+
masked = (u < t) & maskable_mask
|
| 112 |
+
|
| 113 |
+
xt = x1.clone()
|
| 114 |
+
xt = xt.masked_fill(masked, self.mask_id)
|
| 115 |
+
|
| 116 |
+
return xt
|
| 117 |
+
|
| 118 |
+
# def compute_target(self, x1, xt, t, maskable_mask):
|
| 119 |
+
# L = x1.size(1)
|
| 120 |
+
# V = self.vocab_size
|
| 121 |
+
# device = x1.device
|
| 122 |
+
|
| 123 |
+
# mask = (xt == self.mask_id) & maskable_mask
|
| 124 |
+
# b, l = torch.nonzero(mask, as_tuple=True)
|
| 125 |
+
|
| 126 |
+
# if b.numel() == 0:
|
| 127 |
+
# return b, l, torch.empty(0, device=device, dtype=torch.long)
|
| 128 |
+
|
| 129 |
+
# log_R0 = - math.log(L * V) # uniform generator with rates (1 / L*V)
|
| 130 |
+
# time = - torch.log(1 - t[b])
|
| 131 |
+
|
| 132 |
+
# v_target = time - log_R0 # log(1/1-t) - log(1/L*V)
|
| 133 |
+
# v_target = v_target.clamp(min=-100.0, max=100.0)
|
| 134 |
+
|
| 135 |
+
# return b, l, v_target
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
# def compute_loss(self, u_theta, v_target, x1, b, l):
|
| 139 |
+
# if b.numel() == 0:
|
| 140 |
+
# dummy_loss = 0.0 * u_theta.sum()
|
| 141 |
+
# return dummy_loss, torch.tensor(0.0, device=u_theta.device)
|
| 142 |
+
|
| 143 |
+
# true_toks = x1[b, l]
|
| 144 |
+
# u_pred = u_theta[b, l, :] # N_masks, V
|
| 145 |
+
|
| 146 |
+
# tgt = torch.zeros_like(u_pred)
|
| 147 |
+
# tgt.scatter_(1, true_toks.unsqueeze(1), v_target.unsqueeze(1))
|
| 148 |
+
|
| 149 |
+
# sse = F.mse_loss(u_pred, tgt, reduction='sum')
|
| 150 |
+
# loss = sse / b.numel() if b.numel != 0 else sse # normalize by number of masks
|
| 151 |
+
|
| 152 |
+
# with torch.no_grad():
|
| 153 |
+
# ppl = torch.exp(F.cross_entropy(u_pred, true_toks))
|
| 154 |
+
|
| 155 |
+
# return loss, ppl
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
# -------# Time Schedules #-------- #
|
| 159 |
+
def sample_t(self, x1):
|
| 160 |
+
ts = self.time_schedule
|
| 161 |
+
if ts == 'linear':
|
| 162 |
+
return self.sample_linear_t(x1)
|
| 163 |
+
elif ts == 'exponential':
|
| 164 |
+
return self.sample_exp_t(x1)
|
| 165 |
+
elif ts == 'uniform':
|
| 166 |
+
return self.sample_uni_t(x1)
|
| 167 |
+
else:
|
| 168 |
+
raise ValueError(f"Unrecognized time scheduler type: {ts}")
|
| 169 |
+
|
| 170 |
+
def sample_uni_t(self, x1):
|
| 171 |
+
B = x1.size(0)
|
| 172 |
+
T = self.config.time_embed.n_timesteps
|
| 173 |
+
|
| 174 |
+
discrete_ts = torch.randint(1, T+1, (B,), device=x1.device)
|
| 175 |
+
timesteps = discrete_ts.float() / float(T)
|
| 176 |
+
_print(f'timesteps: {timesteps}')
|
| 177 |
+
return timesteps.clamp(min=self.eps, max=self.t_max)
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def sample_linear_t(self, x1):
|
| 181 |
+
B = x1.size(0)
|
| 182 |
+
eps = self.eps
|
| 183 |
+
|
| 184 |
+
# fraction of total training steps completed
|
| 185 |
+
frac = float(self.global_step) / float(self.tot_steps)
|
| 186 |
+
t_max = 1.0 - eps
|
| 187 |
+
|
| 188 |
+
if frac < self.anneal_frac:
|
| 189 |
+
# normalize progress within the anneal window
|
| 190 |
+
prog = frac / max(1e-12, self.anneal_frac) # maps [0, anneal_frac) to [0,1)
|
| 191 |
+
t_min = eps + prog * (t_max - eps) # linear increase from eps to 1.0-eps
|
| 192 |
+
t = t_min + (t_max - t_min) * torch.rand(B, device=x1.device)
|
| 193 |
+
else:
|
| 194 |
+
# after anneal_frac of training steps completed, then uniform sample over entire range [eps, 1.0-eps]
|
| 195 |
+
t = eps + (t_max - eps) * torch.rand(B, device=x1.device)
|
| 196 |
+
|
| 197 |
+
return t.clamp(min=eps, max=t_max)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def sample_t_exponential(self, x1, t_min=1e-6, t_max=1.0-1e-6):
|
| 201 |
+
# TODO - FIX THIS METHOD IF NEEDED !!
|
| 202 |
+
"""
|
| 203 |
+
Exponentially anneal center of t from t_min to t_max over training.
|
| 204 |
+
|
| 205 |
+
Implement if linear schedule isn't expressive enough
|
| 206 |
+
But for annealing over training steps, which can be a very large quantity,
|
| 207 |
+
exponential approximates linear schedule
|
| 208 |
+
"""
|
| 209 |
+
# k controls how fast the curve rises.
|
| 210 |
+
k = self.config.training.exp_time_k
|
| 211 |
+
progress = self.trainer.step / self.tot_steps
|
| 212 |
+
frac = 1.0 - torch.exp(-k * torch.tensor(progress))
|
| 213 |
+
center = t_min + frac * (t_max - t_min)
|
| 214 |
+
|
| 215 |
+
# add small jitter so we don't collapse onto a distribution
|
| 216 |
+
t = torch.randn(x1.size(0)) * self.config.training.time_sigma + center
|
| 217 |
+
return t.clamp(min=t_min, max=t_max)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
# -------# Model Training / Evaluation #-------- #
|
| 222 |
+
def training_step(self, batch):
|
| 223 |
+
loss, ppl = self.step(batch)
|
| 224 |
+
self.log("train/loss", loss, on_step=True, on_epoch=False, prog_bar=True)
|
| 225 |
+
self.log("train/ppl", ppl, on_step=True, on_epoch=False, prog_bar=False)
|
| 226 |
+
return loss
|
| 227 |
+
|
| 228 |
+
def validation_step(self, batch):
|
| 229 |
+
loss, ppl = self.step(batch)
|
| 230 |
+
self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=True, sync_dist=True)
|
| 231 |
+
self.log("val/ppl", ppl, on_step=False, on_epoch=True, prog_bar=False, sync_dist=True)
|
| 232 |
+
return loss
|
| 233 |
+
|
| 234 |
+
    def test_step(self, batch):
        """Lightning hook: evaluate one test batch and log epoch-aggregated metrics.

        NOTE(review): unpacks four values from self.step(batch) while
        training_step/validation_step unpack two — confirm step() returns the
        extra max-logit diagnostics in test mode only.
        """
        loss, ppl, max_u, max_esm = self.step(batch)
        # All test metrics are aggregated per epoch and synced across ranks.
        self.log('test/loss', loss, on_step=False, on_epoch=True, prog_bar=True, sync_dist=True)
        self.log("test/ppl", ppl, on_step=False, on_epoch=True, prog_bar=False, sync_dist=True)
        self.log("test/max_madsbm_logit", max_u, on_step=False, on_epoch=True, prog_bar=False, sync_dist=True)
        self.log("test/max_esm_logit", max_esm, on_step=False, on_epoch=True, prog_bar=False, sync_dist=True)
        return loss
|
| 241 |
+
|
| 242 |
+
    def on_after_backward(self):
        """Lightning hook: log the global gradient norm right after backward().

        Manual clipping below is commented out — presumably clipping is
        delegated to the trainer (gradient_clip_val); confirm before
        re-enabling it here, or gradients would be clipped twice.
        """
        pre_norm = compute_grad_norms(self.parameters())
        self.log('train/grad_norm_PRE_clip', pre_norm, on_step=True, on_epoch=False, prog_bar=False, sync_dist=True)

        # torch.nn.utils.clip_grad_norm_(self.parameters(), float(self.config.training.grad_clip_val))
        # post_norm = compute_grad_norms(self.parameters())
        # self.log('train/grad_norm_POST_clip', post_norm, on_step=True, on_epoch=False, prog_bar=False, sync_dist=True)
|
| 249 |
+
|
| 250 |
+
    def configure_optimizers(self):
        """Lightning hook: build the AdamW optimizer and cosine-warmup scheduler.

        Side effect: caches self.tot_steps (the trainer's estimated total
        number of optimizer steps), which the time-sampling schedules read
        during training.

        Returns:
            Lightning optimizer/scheduler config; the scheduler is stepped
            once per optimizer step.
        """
        optimizer = torch.optim.AdamW(
            params = self.model.parameters(),
            lr = self.config.optim.lr,
            weight_decay = self.config.optim.weight_decay,
            betas = (self.config.optim.beta1, self.config.optim.beta2)
        )

        # Convert warmup measured in epochs into optimizer steps:
        # warmup_epochs * (total steps / total epochs).
        self.tot_steps = self.trainer.estimated_stepping_batches
        warmup_steps = int(self.config.optim.warmup_epochs * self.tot_steps / self.config.training.n_epochs)

        lr_scheduler = CosineWarmup(
            optimizer = optimizer,
            warmup_steps = warmup_steps,
            total_steps = self.tot_steps
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "interval": "step",
                "frequency": 1
            }
        }
|
| 275 |
+
|
| 276 |
+
def on_save_checkpoint(self, checkpoint: dict):
|
| 277 |
+
"""
|
| 278 |
+
Don't save the classifier model used for FBD calculation in the ckpt
|
| 279 |
+
"""
|
| 280 |
+
sd = checkpoint.get('state_dict', None)
|
| 281 |
+
if sd is None:
|
| 282 |
+
return
|
| 283 |
+
keys_to_remove = [k for k in sd.keys() if k.startswith("score_model.")]
|
| 284 |
+
for k in keys_to_remove:
|
| 285 |
+
sd.pop(k, None)
|
| 286 |
+
checkpoint['state_dict'] = sd
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
# -------# Helper methods #-------- #
|
| 290 |
+
def is_maskable(self, input_ids: torch.Tensor):
|
| 291 |
+
return (
|
| 292 |
+
(input_ids != self.tokenizer.pad_token_id)
|
| 293 |
+
& (input_ids != self.tokenizer.cls_token_id)
|
| 294 |
+
& (input_ids != self.tokenizer.eos_token_id)
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
+
def validate_config(self):
|
| 298 |
+
assert os.path.isdir(self.config.checkpointing.save_dir), "invalid checkpointing path"
|
| 299 |
+
assert self.config.model.hidden_dim % 2 == 0, 'odd value for embedding dim'
|
| 300 |
+
assert self.config.time_embed.time_dim % 2 == 0, 'odd value for time dim'
|
| 301 |
+
assert self.config.time_embed.fourier_dim % 2 == 0, 'odd value for fourier dim'
|
| 302 |
+
|
| 303 |
+
def get_state_dict(self, ckpt_path):
|
| 304 |
+
def remove_model_prefix(state_dict):
|
| 305 |
+
for k, v in state_dict.items():
|
| 306 |
+
if "model." in k:
|
| 307 |
+
k.replace('model.', '')
|
| 308 |
+
return state_dict
|
| 309 |
+
|
| 310 |
+
checkpoint = torch.load(ckpt_path, map_location='cuda:3' if torch.cuda.is_available() else 'cpu')
|
| 311 |
+
state_dict = checkpoint.get("state_dict", checkpoint)
|
| 312 |
+
|
| 313 |
+
if any(k.startswith("model.") for k in state_dict.keys()):
|
| 314 |
+
state_dict = remove_model_prefix(state_dict)
|
| 315 |
+
|
| 316 |
+
return state_dict
|
| 317 |
+
|
| 318 |
+
    def cleanup(self):
        """Free cached CUDA blocks and run the Python garbage collector.

        empty_cache is a no-op when CUDA was never initialized.
        """
        torch.cuda.empty_cache()
        gc.collect()
|
src/sampling/diffusion_sampler.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import torch
|
| 3 |
+
import random
|
| 4 |
+
import numpy as np
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
from src.utils.model_utils import _print
|
| 7 |
+
|
| 8 |
+
class DiffusionSampler:
    """Iterative-refinement (stochastic remasking) sampler for a masked diffusion LM."""

    def __init__(self, model, tokenizer):
        """
        Args:
            model: masked-LM whose forward returns an object with `.logits`
                and which exposes a `.device` attribute.
            tokenizer: tokenizer providing `mask_token_id`.
        """
        self.model = model
        self.tokenizer = tokenizer

        self.device = self.model.device
        self.mask_id = self.tokenizer.mask_token_id
        self.seed_everything(seed=42)

    @torch.inference_mode()
    def sample_unconditional(self, xt, num_steps, tracer, tau=1.0, kappa_fn=lambda t: t, eta=1, alpha=1.):
        """
        Stochastic remasking sampling method for iterative refinement of sequences.

        Args:
            xt (Tensor): Initial token tensor (masked positions = mask_id);
                mutated in place.
            num_steps (int): Number of refinement steps.
            tracer: optional logger with `.log_step(xt, step_idx)`; may be None.
            tau (float): Temperature for softmax sampling (0 = greedy argmax).
            kappa_fn (callable): Unmasking schedule; kappa(1) should reach 1.
            eta (float): Scaling factor applied to scores of remask candidates.
            alpha (float): Weight between log-prob and negative entropy scores.

        Returns:
            Tensor: Final sampled sequence tensor.

        Fix: every tracer call is now guarded, so passing tracer=None no
        longer crashes inside the loop or at the final log call (previously
        only the step-0 call was guarded).
        """
        dt = 1 / num_steps
        fix_mask = xt != self.mask_id  # originally-known tokens, never resampled
        attention_mask = torch.ones_like(xt).to(self.device)

        if tracer:
            tracer.log_step(xt=xt, step_idx=0)

        for i in range(1, num_steps + 1):
            kappa_t = kappa_fn(i * dt)
            logits = self.model(input_ids=xt, attention_mask=attention_mask).logits
            last_mask = xt == self.mask_id          # positions still masked
            unmask_t = ~last_mask & ~fix_mask       # revealed-but-not-fixed: remask candidates

            x0, logp = self.stochastic_sample_from_categorical(logits, tau)

            # Confidence-based scoring: blend token log-prob with (neg) entropy.
            entropy = torch.distributions.Categorical(logits=logits).entropy()
            score = alpha * logp + (1 - alpha) * -entropy  # alpha = 1 --> score = logp
            score = score.masked_fill(fix_mask, float('inf'))  # fixed tokens are never remasked

            score[unmask_t] = score[unmask_t] * eta

            # Remask the lowest-confidence fraction per the kappa schedule.
            num_to_mask = ((~fix_mask).sum(1, keepdim=True).float() * (1 - kappa_t)).long()
            lowest_k_mask = self.topk_lowest_masking(score, num_to_mask)

            xt[lowest_k_mask] = self.mask_id
            mask_2_x0 = last_mask & ~lowest_k_mask  # surviving masked positions: reveal
            xt[mask_2_x0] = x0[mask_2_x0]

            if tracer:
                tracer.log_step(xt=xt, step_idx=i)

        # Fill any residual masks with the last prediction.
        xt[xt == self.mask_id] = x0[xt == self.mask_id]

        if tracer:
            tracer.log_step(xt, num_steps + 1)

        return xt

    def stochastic_sample_from_categorical(self, logits, temperature, noise_scale=1.0):
        """
        Sample from a categorical distribution with optional temperature
        scaling and Gumbel noise; temperature 0 gives the deterministic argmax.

        Returns:
            (tokens, scores): chosen token ids and their log-probabilities.
        """
        logits = logits.double()
        if temperature != 0:
            gumbel_noise = -torch.log(-torch.log(torch.rand_like(logits) + 1e-8) + 1e-8)
            logits = logits / temperature + noise_scale * gumbel_noise
        scores, tokens = logits.log_softmax(dim=-1).max(dim=-1)

        return tokens, scores

    def topk_lowest_masking(self, scores, cutoff_len):
        """
        scores: [b, n]
        cutoff_len: [b, 1]
        returns:
            mask: [b, n], True where the token is among the cutoff_len lowest scores.
        """
        sorted_index = scores.sort(-1)[0]
        cutoff = sorted_index.gather(dim=-1, index=cutoff_len)
        return scores < cutoff

    def seed_everything(self, seed):
        """Seed python / numpy / torch (and CUDA) RNGs for reproducibility."""
        if seed is None:
            return
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)  # if using multi-GPU
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
|
src/sampling/guided_sample.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import random
|
| 5 |
+
import torch
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from omegaconf import OmegaConf
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
|
| 12 |
+
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 13 |
+
|
| 14 |
+
from src.madsbm.wt_peptide.sbm_module import MadSBM
|
| 15 |
+
from src.sampling.madsbm_sampler import MadSBMSampler
|
| 16 |
+
|
| 17 |
+
from src.utils.generate_utils import calc_entropy, mask_for_de_novo, calc_ppl
|
| 18 |
+
from src.utils.model_utils import _print
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# NOTE(review): module-level side effects run at import time — device pin,
# working-directory change, and config load. The hard-coded 'cuda:3' and
# absolute scratch path assume one specific machine; confirm before reuse.
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
os.chdir('/scratch/pranamlab/sgoel/MadSBM')
config = OmegaConf.load("./configs/wt_pep.yaml")

# Date stamp used in output CSV filenames.
date = datetime.now().strftime("%Y-%m-%d")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def generate_sequence(masked_seq, target_toks, tokenizer, generator, device, tracer=None):
    """Generate one unconditional and one guided sequence from a masked input.

    Args:
        masked_seq: string of mask tokens sized to the desired length.
        target_toks: tokenized target-protein ids used for binding guidance.
        tokenizer / generator / device: shared inference objects.
        tracer: optional step logger forwarded to the sampler. New parameter
            (defaults to None, so existing callers are unaffected) — fixes
            the previous calls, which never supplied the sampler's required
            `tracer` argument and therefore raised TypeError.

    Returns:
        (uncond_seq, guided_seq, uncond_affinity, guided_affinity)
    """
    input_ids = tokenizer(masked_seq, return_tensors="pt").to(device)['input_ids']

    uncond_ids, uncond_bind = generator.sample(xt=input_ids, num_steps=config.sampling.n_steps, tracer=tracer, target_toks=target_toks, guidance=False)
    guided_ids, guided_bind = generator.sample(xt=input_ids, num_steps=config.sampling.n_steps, tracer=tracer, target_toks=target_toks, guidance=True)

    # Strip bos/eos tokens (5 chars each) and the spaces between residues.
    uncond_seq = tokenizer.decode(uncond_ids[0].squeeze())[5:-5].replace(" ", "")
    guided_seq = tokenizer.decode(guided_ids[0].squeeze())[5:-5].replace(" ", "")

    return uncond_seq, guided_seq, uncond_bind, guided_bind
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def main():
    """Generate unconditional + binding-guided peptides for one target,
    score them (ESM perplexity, PeptiVerse affinity), and save a CSV."""
    csv_save_path = f'./results/guided/'

    # Load ESM model for eval
    esm_pth = config.model.esm_model
    esm_model = AutoModelForMaskedLM.from_pretrained(esm_pth).to(device)
    esm_model.eval()

    # Load SBM model
    gen_model = MadSBM(config)
    state_dict = gen_model.get_state_dict(config.checkpointing.best_ckpt_path)
    gen_model.load_state_dict(state_dict)
    gen_model.to(device)
    gen_model.eval()
    tokenizer = gen_model.tokenizer
    generator = MadSBMSampler(gen_model, config, device, guidance=True)

    tgt_name = "3HVE"
    # Create the per-target results dir up front. Fixed: previously only the
    # parent dir was created, so the final to_csv failed on the missing
    # f"{csv_save_path}/{tgt_name}" subdirectory.
    os.makedirs(os.path.join(csv_save_path, tgt_name), exist_ok=True)

    df = pd.read_csv("./data/wt_pep/targets.csv")
    tgt_seq = df.loc[df['Target'] == tgt_name, 'Sequence'].iloc[0]
    target_toks = tokenizer(tgt_seq, return_tensors='pt')['input_ids'].to(device)

    # Score the known binder as a reference point.
    existing_binder = df.loc[df['Target'] == tgt_name, 'Existing Binder'].iloc[0]
    existing_binder_pred = generator.peptiverse.predict_binding_affinity(
        mode = 'wt',
        target_ids = target_toks,
        binder_ids = tokenizer(existing_binder, return_tensors='pt')['input_ids'].to(device).detach()
    )['affinity']

    _print(f'EXISTING BINDER AFFINITY: {existing_binder_pred}')

    # 20 samples at each of three lengths.
    seq_lengths = [length for length in [10, 15, 20] for _ in range(20)]
    generation_results = []

    for seq_len in tqdm(seq_lengths, desc=f"Generating sequences: "):

        masked_seq = mask_for_de_novo(seq_len)  # Sequence of all <MASK> tokens
        uncond_seq, guided_seq, uncond_bind, guided_bind = generate_sequence(masked_seq, target_toks, tokenizer, generator, device)

        uncond_ppl = calc_ppl(esm_model, tokenizer, uncond_seq, [i for i in range(len(uncond_seq))], model_type='esm')
        # Fixed: guided PPL was previously computed on the *unconditional*
        # sequence (copy-paste), making the two metrics identical.
        guided_ppl = calc_ppl(esm_model, tokenizer, guided_seq, [i for i in range(len(guided_seq))], model_type='esm')

        _print(f'uncond seq: {uncond_seq}')
        _print(f'uncond ppl: {uncond_ppl}')
        _print(f'uncond bind: {uncond_bind}')

        _print(f'guided seq: {guided_seq}')
        _print(f'guided ppl: {guided_ppl}')
        _print(f'guided bind: {guided_bind}')

        generation_results.append({
            "Uncond Generated Sequence": uncond_seq,
            "Guided Generated Sequence": guided_seq,
            "Uncond PPL": uncond_ppl,
            "Guided PPL": guided_ppl,
            "Uncond Affinity": uncond_bind,
            "Guided Affinity": guided_bind
        })

    df = pd.DataFrame(generation_results)

    _print(f"Uncond PPL Res: {df['Uncond PPL'].mean()}, {df['Uncond PPL'].std()}")
    _print(f"Guided PPL Res: {df['Guided PPL'].mean()}, {df['Guided PPL'].std()}")

    _print(f"Uncond Affinity Res: {df['Uncond Affinity'].mean()}, {df['Uncond Affinity'].std()}")
    _print(f"Guided Affinity Res: {df['Guided Affinity'].mean()}, {df['Guided Affinity'].std()}")

    df.to_csv(
        csv_save_path + f"/{tgt_name}/tau=0.5_topp=0.9_no-gumbel_rate=0.01_jump=0.05_ablate={config.model.ablate}_nsteps={config.sampling.n_steps}_seqs_with_ppl_{date}.csv",
        index=False
    )
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
src/sampling/madsbm_sampler.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
|
| 8 |
+
from src.PeptiVerse.inference import PeptiVersePredictor
|
| 9 |
+
from src.utils.model_utils import _print
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class MadSBMSampler:
    """Reverse-time jump-process sampler for a MadSBM model, with optional
    binding-affinity-guided candidate selection via PeptiVerse."""

    def __init__(self, model, config, device, guidance=None):
        """
        Args:
            model: MadSBM module; forward(input_ids, attention_mask, t) must
                return a dict with 'dit', 'madsbm' and 'esm' logits.
            config: OmegaConf-style config (sampling / time_embed / model keys).
            device: torch device for sampling.
            guidance: truthy to attach the PeptiVerse affinity classifier.
        """
        self.config = config
        self.device = device
        self.model = model
        self.tokenizer = model.tokenizer
        self.mask_id = self.tokenizer.mask_token_id
        self.eps = config.time_embed.min_time
        # Always assigned, so unguided instances still have the attribute
        # (previously only set when guidance was truthy).
        self.guidance = bool(guidance)
        self.seed_everything(seed=42)

        if guidance:
            # NOTE(review): hard-coded cluster paths — move these into config.
            self.peptiverse = PeptiVersePredictor(
                manifest_path="/scratch/pranamlab/sgoel/MadSBM/src/PeptiVerse/best_models.txt",
                classifier_weight_root="/scratch/pranamlab/sgoel/MadSBM/src/PeptiVerse",
                device=self.device
            )

    @torch.inference_mode()
    def sample(self, xt, num_steps, tracer=None, target_toks=None, guidance=None):
        """Sample a sequence by simulating the reverse jump process.

        Args:
            xt: (1, L) token ids; masked positions carry mask_id. Cloned, so
                the caller's tensor is untouched.
            num_steps: number of reverse-time steps (t anneals 1 -> 0).
            tracer: optional logger with .log_step(xt, step_idx); may be None.
                Fixed: previously a required positional argument that was
                dereferenced unconditionally (crashing for callers that did
                not pass one).
            target_toks: tokenized binding target (needed for guidance and
                the final affinity prediction).
            guidance: when truthy, pick each update from M candidates weighted
                by predicted binding affinity.

        Returns:
            (xt, binding_affinity) — affinity is None when no PeptiVerse
            classifier is attached or no target was given (fixed: previously
            an unguided sampler crashed here on the missing attribute).
        """
        xt = xt.clone()
        B, L = xt.shape
        assert B == 1, "Do only 1 sequence at a time"

        t_max = 1.0 - self.eps
        dt = 1.0 / num_steps
        attn_mask = torch.ones_like(xt, device=self.device)

        # Action-functional bookkeeping (informational / eval only).
        action_traj = {}
        tot_action = 0.0

        if tracer:
            tracer.log_step(xt=xt, step_idx=0)

        total_logits = None  # last model logits; resolves leftover masks

        for k in range(num_steps):
            # t decreases 1 -> 0: the model was trained with t=1 as pure noise.
            prog = (k + 1) / float(num_steps)
            t_val = t_max - (t_max - self.eps) * prog
            t = torch.full((B,), fill_value=float(t_val), device=self.device)

            # Predicted control field and logits — each (B, L, V).
            outs = self.model(input_ids=xt, attention_mask=attn_mask, t=t)
            u_tilt = outs['dit']
            total_logits = outs['madsbm']
            esm_logits = outs['esm']

            # The ablation drops the ESM prior from the action functional.
            if self.config.model.ablate:
                actional = self.compute_action(u_tilt, esm_logits=None)
            else:
                actional = self.compute_action(u_tilt, esm_logits=esm_logits)
            action_traj[f"action_step_{k+1}"] = actional
            tot_action += (actional * dt)

            # Jump rates and probabilities: P(jump) = 1 - exp(-rate * dt).
            r_theta = torch.exp(u_tilt * self.config.sampling.rate_scale)
            R_tot = r_theta.sum(dim=-1)  # (1, L)
            rate = (- R_tot * self.config.sampling.jump_scale * dt).clamp(min=-40.0, max=0.0)
            jump_prob = 1.0 - torch.exp(rate)

            # Temperature-scale then nucleus-filter the token logits.
            logits = total_logits.clone()
            logits /= self.config.sampling.tau
            logits = self.top_p_filter(logits, self.config.sampling.top_p)

            # Sample one candidate token per position.
            probs = F.softmax(logits, dim=-1)
            probs = probs.view(-1, probs.size(-1))
            candidate_toks = torch.multinomial(probs, 1).view(B, L)

            # A position updates only if it jumps AND is still masked.
            can_jump = torch.rand(B, L, device=self.device) < jump_prob
            updatable = can_jump & self.is_masked(xt)

            if guidance:
                chosen_candidate = self.binding_guidance(probs, target_toks, B, L)
                xt[updatable] = chosen_candidate[updatable]
            else:
                xt[updatable] = candidate_toks[updatable]

            if tracer:
                tracer.log_step(xt=xt, step_idx=k + 1)

        # Resolve positions that never jumped with the last argmax prediction.
        still_masked = self.is_masked(xt)
        if still_masked.any() and total_logits is not None:
            final_toks = total_logits.argmax(dim=-1)
            xt[still_masked] = final_toks[still_masked]

        if tracer:
            tracer.log_step(xt, num_steps + 1)

        # Affinity is only computable when the classifier is attached.
        binding_affin = None
        predictor = getattr(self, 'peptiverse', None)
        if predictor is not None and target_toks is not None:
            binding_affin = predictor.predict_binding_affinity(
                mode = 'wt',
                target_ids = target_toks,
                binder_ids = xt
            )['affinity']

        return xt, binding_affin

    def binding_guidance(self, probs, target_toks, B, L):
        """Draw M candidate sequences and pick one with probability given by
        a softmax over their predicted binding affinities (temperature tau)."""
        M = self.config.sampling.M
        candidate_toks = [torch.multinomial(probs, 1).view(B, L) for _ in range(M)]

        affinities = []
        for toks in candidate_toks:
            pred = self.peptiverse.predict_binding_affinity(
                mode = 'wt',
                target_ids = target_toks,
                binder_ids = toks.detach()
            )['affinity']
            affinities.append(pred)

        weights = F.softmax(torch.tensor(affinities, dtype=torch.float32) / self.config.sampling.tau, dim=0)
        chosen_idx = torch.multinomial(weights, 1).item()
        return candidate_toks[chosen_idx]

    def compute_action(self, u_tilt, esm_logits=None):
        """Compute the mean per-token action functional for evals.

        R0 is the reference process: the ESM token distribution when given,
        otherwise uniform over the vocabulary.
        """
        if esm_logits is not None:
            R0 = torch.softmax(esm_logits, dim=-1)
        else:
            R0 = 1.0 / self.tokenizer.vocab_size

        # psi(u) = e^u - u - 1 >= 0, zero iff u == 0.
        psi_u = torch.exp(u_tilt) - u_tilt - 1.0
        action_per_tok = (R0 * psi_u).sum(dim=-1)  # R0 sums to 1 in both cases

        return action_per_tok.mean().item()

    def top_p_filter(self, logits, p_val):
        """
        Nucleus / top-p sampling filter: sets to -inf the logits of tokens in
        the tail contributing the bottom (1 - p) cumulative probability.
        Mutates and returns `logits`.
        """
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        sorted_idx_to_remove = cum_probs > p_val

        # Shift right so the first token crossing the threshold is kept.
        sorted_idx_to_remove[..., 1:] = sorted_idx_to_remove[..., :-1].clone()
        sorted_idx_to_remove[..., 0] = 0

        idx_to_remove = sorted_idx_to_remove.scatter(-1, sorted_indices, sorted_idx_to_remove)
        logits[idx_to_remove] = float('-inf')
        return logits

    def is_masked(self, xt):
        """Bool mask of positions still holding the mask token."""
        return (xt == self.mask_id)

    def seed_everything(self, seed):
        """Seed python / numpy / torch (and CUDA) RNGs for reproducibility."""
        if seed is None:
            return
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)  # if using multi-GPU
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
|
src/sampling/path_tracer.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ProbabilityPathTracer:
    """Tracks how plausible the partially-revealed sequence looks to an
    oracle LM at each sampling step.

    `history` maps "trace_step_<i>" to the mean oracle log-likelihood of the
    revealed tokens at that step.
    """

    def __init__(self, oracle_model, tokenizer, device):
        self.oracle = oracle_model
        self.tokenizer = tokenizer
        self.device = device
        self.mask_id = tokenizer.mask_token_id
        self.history = {}  # {"trace_step_<i>": mean log-likelihood}

    @torch.inference_mode()
    def compute_loglikeli(self, xt):
        """Mean oracle log-likelihood over the currently-revealed positions.

        Returns 0.0 when nothing has been revealed yet. Assumes a single
        sequence per batch (calls .item() on a (B,)-shaped result).
        """
        revealed = (xt != self.mask_id)

        if not revealed.any():
            return 0.0

        # One oracle (ESM-style) forward pass over the current sequence.
        oracle_logits = self.oracle(
            input_ids=xt,
            attention_mask=torch.ones_like(xt, device=xt.device)
        ).logits

        # Token-wise negative log-likelihood against the current tokens.
        flat_nll = F.cross_entropy(
            oracle_logits.view(-1, oracle_logits.size(-1)),
            xt.view(-1),
            reduction='none'
        ).view(xt.shape)

        # Average over revealed positions only; negate so higher = better.
        weights = revealed.float()
        mean_ll = -(flat_nll * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        return mean_ll.item()

    def log_step(self, xt, step_idx):
        """Record the current likelihood under key trace_step_<step_idx>."""
        self.history[f"trace_step_{step_idx}"] = self.compute_loglikeli(xt)

    def get_trace(self):
        """Return the accumulated {step: score} history."""
        return self.history
|
src/utils/__init__.py
ADDED
|
File without changes
|
src/utils/eval_utils.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import numpy as np
|
| 3 |
+
from scipy.linalg import sqrtm
|
| 4 |
+
|
| 5 |
+
def dna_to_tensor(seq):
    """Encode a DNA string over {A, C, G, T} as a LongTensor of indices 0..3."""
    lut = dict(zip("ACGT", range(4)))
    return torch.tensor([lut[base] for base in seq], dtype=torch.long)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def compute_fbd(true_seqs, gen_seqs, score_model):
    """
    Frechet Biological Distance (FBD): the closed-form Frechet (Gaussian
    2-Wasserstein) distance between the score-model embeddings of the real
    and generated sequence sets.

    Args:
        true_seqs: reference sequences.
        gen_seqs: generated sequences.
        score_model: callable mapping a batch of sequences to an (N, D)
            embedding array.

    Returns:
        float distance, or NaN when either embedding set is empty / invalid.

    Fix: embeddings are now computed from the provided sequence sets —
    previously both arguments were ignored and score_model() was called with
    no input, so the two embedding sets compared nothing.
    """
    embeds1 = score_model(true_seqs)
    embeds2 = score_model(gen_seqs)

    if np.isnan(embeds2).any() or np.isnan(embeds1).any() or len(embeds1) == 0 or len(embeds2) == 0:
        return float('nan')
    mu1, sigma1 = embeds1.mean(axis=0), np.cov(embeds1, rowvar=False)
    mu2, sigma2 = embeds2.mean(axis=0), np.cov(embeds2, rowvar=False)
    # Squared mean difference + covariance term of the Gaussian W2 distance.
    ssdiff = np.sum((mu1 - mu2) ** 2.0)
    covmean = sqrtm(sigma1.dot(sigma2))
    if np.iscomplexobj(covmean):
        # Numerical noise in sqrtm can introduce tiny imaginary parts.
        covmean = covmean.real
    dist = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)
    return dist
|
src/utils/fbd_score_model.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import copy
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torch import nn
|
| 6 |
+
from src.utils.time_utils import GaussianFourierProjection
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Dense(nn.Module):
    """
    A fully connected layer that reshapes outputs to feature maps.
    """
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.dense = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        projected = self.dense(x)
        # Ellipsis index is an identity view; kept for parity with the
        # original implementation.
        return projected[...]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class Swish(nn.Module):
    """Swish activation: x * sigmoid(x) (a.k.a. SiLU)."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Elementwise self-gated activation.
        return x * x.sigmoid()
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class CNNClassifier(nn.Module):
    """Dilated 1-D CNN over token sequences, usable as a denoiser or classifier.

    With classifier=False the network maps a sequence to per-position outputs
    over the alphabet; with classifier=True it mean-pools over positions and
    predicts `num_cls` class logits through `cls_head`.

    NOTE(review): `forward` references `self.time_embedder` on the
    non-clean_data path, but no such attribute is created in `__init__`
    (GaussianFourierProjection is imported at file top but never used) — that
    path raises AttributeError as written; confirm where the time embedder
    should be instantiated.
    """
    def __init__(self, args, alphabet_size, num_cls, classifier=False):
        super().__init__()
        self.alphabet_size = alphabet_size
        self.args = args
        self.classifier = classifier
        self.num_cls = num_cls

        # Token embedding; applied directly to integer ids when args.clean_data.
        self.linear = nn.Embedding(self.alphabet_size, embedding_dim=args.hidden_dim)

        # One stack = the 5 conv layers below; stacks are deep-copied so weights
        # are NOT shared across repetitions.
        self.num_layers = 5 * args.num_cnn_stacks
        self.convs = [
            nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=9, padding=4),
            nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=9, padding=4),
            nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=9, dilation=4, padding=16),
            nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=9, dilation=16, padding=64),
            nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=9, dilation=64, padding=256)
        ]
        self.convs = nn.ModuleList([copy.deepcopy(layer) for layer in self.convs for i in range(args.num_cnn_stacks)])
        # Per-layer projections of the time embedding (and class embedding below).
        self.time_layers = nn.ModuleList([Dense(args.hidden_dim, args.hidden_dim) for _ in range(self.num_layers)])
        self.norms = nn.ModuleList([nn.LayerNorm(args.hidden_dim) for _ in range(self.num_layers)])
        # Head: hidden_dim features for the classifier path, alphabet logits otherwise.
        self.final_conv = nn.Sequential(
            nn.Conv1d(args.hidden_dim, args.hidden_dim, kernel_size=1),
            nn.ReLU(),
            nn.Conv1d(args.hidden_dim, args.hidden_dim if classifier else self.alphabet_size, kernel_size=1)
        )
        self.dropout = nn.Dropout(args.dropout)

        if classifier:
            self.cls_head = nn.Sequential(
                nn.Linear(args.hidden_dim, args.hidden_dim),
                nn.ReLU(),
                nn.Linear(args.hidden_dim, self.num_cls)
            )

        # Extra class-conditioning pathway for classifier-free guidance; the
        # "+ 1" reserves an embedding slot for the unconditional/null class.
        if self.args.cls_free_guidance and not self.classifier:
            self.cls_embedder = nn.Embedding(num_embeddings=self.num_cls + 1, embedding_dim=args.hidden_dim)
            self.cls_layers = nn.ModuleList([Dense(args.hidden_dim, args.hidden_dim) for _ in range(self.num_layers)])

    def forward(self, seq, t, cls = None, return_embedding=False):
        """Run the CNN.

        Args:
            seq: token ids (B, L) when args.clean_data; otherwise presumably a
                dense (B, L, C) representation — TODO confirm: on that path
                `self.linear` (an nn.Embedding) is applied after a permute,
                which requires integer input.
            t: diffusion timestep(s); only used when not args.clean_data.
            cls: optional class labels for classifier-free guidance.
            return_embedding: classifier mode only — also return the
                penultimate embedding.
        """
        if self.args.clean_data:
            feat = self.linear(seq)
            feat = feat.permute(0, 2, 1)
        else:
            # NOTE(review): self.time_embedder is never defined in __init__.
            time_emb = F.relu(self.time_embedder(t))
            feat = seq.permute(0, 2, 1)
            feat = F.relu(self.linear(feat))

        if self.args.cls_free_guidance and not self.classifier and cls is not None:
            cls_emb = self.cls_embedder(cls)

        for i in range(self.num_layers):
            h = self.dropout(feat.clone())
            # Broadcast time/class conditioning across the length dimension.
            if not self.args.clean_data:
                h = h + self.time_layers[i](time_emb)[:, :, None]
            if self.args.cls_free_guidance and not self.classifier and cls is not None:
                h = h + self.cls_layers[i](cls_emb)[:, :, None]
            h = self.norms[i]((h).permute(0, 2, 1))
            h = F.relu(self.convs[i](h.permute(0, 2, 1)))
            # Residual connection whenever shapes line up.
            if h.shape == feat.shape:
                feat = h + feat
            else:
                feat = h

        feat = self.final_conv(feat)
        feat = feat.permute(0, 2, 1)

        if self.classifier:
            # Mean-pool over sequence positions before the classification head.
            feat = feat.mean(dim=1)
            if return_embedding:
                # Split cls_head to expose the pre-ReLU embedding alongside logits.
                embedding = self.cls_head[:1](feat)
                return self.cls_head[1:](embedding), embedding
            else:
                return self.cls_head(feat)

        return feat
|
src/utils/generate_utils.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import sys
|
| 3 |
+
import torch
|
| 4 |
+
import math
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
|
| 9 |
+
from collections import Counter
|
| 10 |
+
from omegaconf import OmegaConf
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# NOTE(review): hard-coded absolute path to a cluster-specific config file;
# this breaks portability — consider deriving the path from __file__ or an
# environment variable instead.
config = OmegaConf.load("/scratch/pranamlab/sgoel/MeMDLM_v2/src/configs/lm.yaml")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# -------# Masking #-------- #
|
| 17 |
+
def mask_for_de_novo(sequence_length):
    """Build a fully-masked template of `sequence_length` mask tokens."""
    return "".join("<mask>" for _ in range(sequence_length))
|
| 19 |
+
|
| 20 |
+
def mask_for_scaffold(sequence, generate_type, mask_token):
    """Mask one case-class of residues for scaffold generation.

    "uppercase": every uppercase residue becomes `mask_token` and all other
    residues are uppercased. "lowercase": every lowercase residue becomes
    `mask_token` and other residues are kept as-is. Any other `generate_type`
    leaves the sequence unchanged.
    """
    if generate_type == "uppercase":
        chars = [mask_token if aa.isupper() else aa.upper() for aa in sequence]
        sequence = ''.join(chars)
    elif generate_type == "lowercase":
        chars = [mask_token if aa.islower() else aa for aa in sequence]
        sequence = ''.join(chars)
    return sequence
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# -------# Generation #-------- #
|
| 29 |
+
def evodiff_infill(motif_seq, tokenizer, model, device, batch_size=1):
    """
    Infill the lowercase positions of `motif_seq` one token at a time with an
    EvoDiff order-agnostic model, following the given evodiff example
    https://github.com/microsoft/evodiff/blob/main/examples/evodiff.ipynb

    Args:
        motif_seq: sequence where lowercase residues mark positions to infill.
        tokenizer: EvoDiff tokenizer exposing tokenize/untokenize/mask_id/all_aas.
        model: EvoDiff model called as model(sample, timestep).
        device: torch device used for the sampling loop.
        batch_size: NOTE(review): only batch_size=1 appears supported —
            `sample` is built from a single sequence and unsqueezed; confirm
            before passing larger values.

    Returns:
        The infilled sequence string.
    """
    # Manual masking of infilling sequence
    motif_seq = ''.join(["#" if aa.islower() else aa for aa in motif_seq]) # Mask token is "#" in evodiff tokenizer
    tkns = tokenizer.tokenize([motif_seq])
    sample = torch.as_tensor(tkns).to(device)

    # Indices of masked positions, visited in random order (order-agnostic decoding)
    loc = torch.arange(0, len(motif_seq)).to(device)[sample==tokenizer.mask_id].cpu().numpy()
    np.random.shuffle(loc)

    sample = sample.to(device).unsqueeze(0)
    # og_sample = sample.clone()

    with torch.no_grad():
        for i in loc:
            timestep = torch.tensor([0] * batch_size).to(device) # placeholder but not called in model
            timestep = timestep.to(device)
            prediction = model(sample, timestep)
            # NOTE(review): assumes the last 6 vocabulary entries are
            # non-canonical tokens — confirm against the tokenizer vocab.
            p = prediction[:, i, :len(tokenizer.all_aas) - 6] # only canonical
            p = F.softmax(p, dim=1) # softmax over logits
            p_sample = torch.multinomial(p, num_samples=1) # sample from categorical distribution
            sample[:, i] = p_sample.squeeze()
    output = [tokenizer.untokenize(s) for s in sample]
    return output[0] #if batch_size==1 else output, og_sample, loc
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def dplm_infill(masked_seq, tokenizer, model, device):
    """Infill a masked sequence with a DPLM unconditional sampler.

    Runs `config.sampling.n_steps` denoising steps, then returns the decoded
    sequence with spaces removed and the 5-character special tokens trimmed
    from each end.
    """
    from src.lm.dplm.diffusion_module import DPLM
    from src.lm.dplm.unconditional_sampler import UnconditionalSampler as DPLMUnconditionalSampler

    sampler = DPLMUnconditionalSampler(tokenizer, model)
    noisy_ids = tokenizer(masked_seq, return_tensors='pt')['input_ids'].to(model.device)
    denoised = sampler.sample_unconditional(noisy_ids, config.sampling.n_steps)[0]
    decoded = tokenizer.decode(denoised.squeeze())
    return decoded.replace(" ", "")[5:-5]
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# -------# Metrics #-------- #
|
| 71 |
+
def calc_progen_ppl(model, tokenizer, target, device, fp16=True):
    """Compute perplexity of `target` under a causal (ProGen-style) LM.

    Args:
        model: causal LM returning `.logits` when called with input_ids.
        tokenizer: unused; kept for a uniform metric interface.
        target: token tensor to score.
            NOTE(review): the shift `logits[:-1, ...]` / `target[1:]` slices
            the FIRST dimension, which assumes an unbatched 1-D token tensor;
            confirm callers never pass a (B, L) batch.
        device: unused; tensors are assumed to already live on the model's device.
        fp16: run the forward pass under CUDA autocast when True.

    Returns:
        float: exp(mean next-token cross-entropy), i.e. the perplexity.
    """
    with torch.no_grad():
        # NOTE(review): torch.cuda.amp.autocast is deprecated in newer torch
        # in favor of torch.amp.autocast('cuda', ...).
        with torch.cuda.amp.autocast(enabled=fp16):
            logits = model(
                input_ids = target,
                attention_mask = torch.ones_like(target)
            ).logits
        # Shift so logits at position i predict the token at position i+1.
        logits = logits[:-1, ...]
        target = target[1:]
        loss = torch.nn.functional.cross_entropy(
            input=logits,
            target=target,
            reduction='mean'
        )
        return torch.exp(loss).item()
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def calc_ppl(model, tokenizer, generated_sequence, mask_token_indices, model_type):
    """Compute masked-LM pseudo-perplexity over the given token positions.

    Each index in `mask_token_indices` is masked one at a time and scored with
    the model's masked-token loss; per-position losses are averaged and
    exponentiated.

    Args:
        model: masked LM called as model(input_ids, labels=...), exposing
            `.device` and returning an object with `.loss`.
        tokenizer: tokenizer with `.encode` and `.mask_token_id`.
        generated_sequence: sequence string to score.
        mask_token_indices: token positions (into the encoded input) to score.
        model_type: unused; kept for interface compatibility.

    Returns:
        float: exp(mean masked-token loss), or NaN when no positions are given.
    """
    # Robustness: avoid ZeroDivisionError when nothing is scored.
    if len(mask_token_indices) == 0:
        return float('nan')

    total_loss = 0.0
    tensor_input = tokenizer.encode(generated_sequence, return_tensors='pt').to(model.device)

    for i in mask_token_indices:
        masked_input = tensor_input.clone()
        masked_input[0, i] = tokenizer.mask_token_id

        # Score only position i: every other label is -100 (ignored by the loss).
        labels = torch.full(tensor_input.shape, -100).to(model.device)
        labels[0, i] = tensor_input[0, i]

        with torch.no_grad():
            loss = model(masked_input, labels=labels).loss.item()
        total_loss += loss

    # BUG FIX: average over the number of scored (masked) positions, not the
    # raw string length — the original divided by len(generated_sequence),
    # deflating the loss whenever fewer positions were scored.
    avg_loss = total_loss / len(mask_token_indices)
    perplexity = math.exp(avg_loss)

    return perplexity
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def calc_entropy(seq):
    """Shannon entropy (in bits) of the character distribution of `seq`."""
    total = len(seq)
    bits = 0.0
    # Accumulate -p*log2(p) over the observed character frequencies.
    for freq in Counter(seq).values():
        p = freq / total
        bits -= p * math.log2(p)
    return bits
|
src/utils/model_utils.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import math
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _print(s):
    """Print `s` and flush stdout so output appears immediately in buffered logs."""
    print(s, flush=True)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def compute_grad_norms(params):
    """Compute the global L2 norm over the gradients of `params`.

    Parameters whose `.grad` is None (frozen, or not yet backpropagated) are
    skipped.

    Args:
        params: iterable of tensors/parameters with an optional `.grad`.

    Returns:
        float: sqrt of the sum of squared per-parameter gradient L2 norms.
    """
    # Idiom fix: compare against None with `is not None`, not `!=` — identity
    # is the correct (and cheapest) check for the None sentinel.
    sqrd_sum = sum(
        p.grad.norm(2).item() ** 2 for p in params if p.grad is not None
    )
    return sqrd_sum ** 0.5
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class CosineWarmup(torch.optim.lr_scheduler._LRScheduler):
    """Linear warmup followed by cosine decay.

    The learning rate ramps linearly from 0 to each base LR over the first
    `warmup_steps` steps, then follows a cosine curve down to
    `eta_ratio * base_lr` at `total_steps`.
    """

    def __init__(self, optimizer, warmup_steps, total_steps, eta_ratio=0.1, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.eta_ratio = eta_ratio  # ratio of minimum to maximum learning rate
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        current = self.last_epoch

        # Warmup phase: scale each base LR linearly with the step count.
        if current < self.warmup_steps:
            return [lr * current / self.warmup_steps for lr in self.base_lrs]

        # Decay phase: progress goes 0 -> 1 between the end of warmup and
        # total_steps; the cosine multiplier goes 1 -> eta_ratio.
        progress = (current - self.warmup_steps) / (self.total_steps - self.warmup_steps)
        decay = 0.5 * (1 + np.cos(np.pi * progress))
        multiplier = (1 - self.eta_ratio) * decay + self.eta_ratio
        return [lr * multiplier for lr in self.base_lrs]
|
src/utils/time_utils.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# -------------------------
|
| 6 |
+
# Timestep embeddings
|
| 7 |
+
# -------------------------
|
| 8 |
+
|
| 9 |
+
class GaussianFourierProjection(nn.Module):
    """
    Random Fourier features for a continuous time variable t in [0, 1].

    A fixed frequency vector W is drawn once at construction and frozen; the
    forward pass returns [sin(W t), cos(W t)] concatenated, for a total of
    `embed_dim` features.
    """
    def __init__(self, embed_dim, scale):
        super().__init__()
        assert embed_dim % 2 == 0, "embed_dim must be even."
        self.embed_dim = embed_dim
        # Frequencies are a non-trainable buffer; persistent=False keeps them
        # out of the state dict.
        frequencies = torch.randn(embed_dim // 2) * scale
        self.register_buffer("W", frequencies, persistent=False)

    def forward(self, t):
        # (B,) -> (B, 1) so the frequencies broadcast over the batch.
        phase = t.float().unsqueeze(-1) * self.W  # (B, embed_dim // 2)
        return torch.cat((torch.sin(phase), torch.cos(phase)), dim=-1)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class TimeEmbedding(nn.Module):
    """Map a scalar timestep to a learned `hidden_dim` embedding.

    The timestep is lifted to `fourier_dim` Gaussian Fourier features and then
    passed through a two-layer SiLU MLP.
    """
    def __init__(self, hidden_dim, fourier_dim, scale):
        super().__init__()
        assert fourier_dim % 2 == 0, "fourier_dim must be even for sine/cosine pairs."

        self.fourier = GaussianFourierProjection(fourier_dim, scale)
        layers = [
            nn.Linear(fourier_dim, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, t):
        # (B,) -> (B, fourier_dim) -> (B, hidden_dim)
        return self.mlp(self.fourier(t))
|