File size: 20,722 Bytes
ee6da62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
"""
TD3B Loss Functions
Implements contrastive loss for separating agonist/antagonist embeddings.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple


class ContrastiveLoss(nn.Module):
    """
    Margin-based contrastive loss for separating agonist and antagonist embeddings.

    Every ordered pair (i, j) with i != j in the batch is considered:
        - label_i * label_j < 0 -> dissimilar pair (one agonist, one antagonist): push apart
        - otherwise             -> similar pair: pull together

    Loss as implemented (each mean runs over its own pair set; an empty set contributes 0):
        L_ctr = mean_similar( d² ) + mean_dissimilar( max(0, margin - d)² )

    where:
        - d = ||emb_i - emb_j||_2 for 'euclidean', or 1 - cos(emb_i, emb_j) for 'cosine'
        - margin = minimum desired distance between dissimilar pairs
    """

    def __init__(self, margin: float = 1.0, distance_metric: str = 'euclidean', adaptive_margin: bool = False):
        """
        Args:
            margin: Minimum distance between dissimilar pairs (base margin)
            distance_metric: 'euclidean' or 'cosine'
            adaptive_margin: If True, the margin used in forward() is
                max(margin, 1.5 * mean distance of the dissimilar pairs)
        """
        super().__init__()
        self.base_margin = margin
        self.distance_metric = distance_metric
        self.adaptive_margin = adaptive_margin

    def compute_distance(self, emb1: torch.Tensor, emb2: torch.Tensor) -> torch.Tensor:
        """
        Compute the row-wise distance between two aligned sets of embeddings.

        Args:
            emb1: (batch_size, embedding_dim)
            emb2: (batch_size, embedding_dim)
        Returns:
            distances: (batch_size,)
        Raises:
            ValueError: if self.distance_metric is neither 'euclidean' nor 'cosine'
        """
        if self.distance_metric == 'euclidean':
            # L2 distance
            return torch.norm(emb1 - emb2, p=2, dim=-1)  # (B,)
        if self.distance_metric == 'cosine':
            # Cosine distance = 1 - cosine_similarity
            return 1.0 - F.cosine_similarity(emb1, emb2, dim=-1)  # (B,)
        raise ValueError(f"Unknown distance metric: {self.distance_metric}")

    def _pairwise_distances(self, embeddings: torch.Tensor) -> torch.Tensor:
        """Return the (B, B) matrix of distances between all rows of `embeddings`."""
        if self.distance_metric == 'euclidean':
            return torch.cdist(embeddings, embeddings, p=2)  # (B, B)
        if self.distance_metric == 'cosine':
            emb_norm = F.normalize(embeddings, p=2, dim=-1)
            return 1.0 - torch.matmul(emb_norm, emb_norm.T)  # (B, B)
        raise ValueError(f"Unknown distance metric: {self.distance_metric}")

    def forward(
        self,
        embeddings: torch.Tensor,
        labels: torch.Tensor,
        confidences: Optional[torch.Tensor] = None,
        debug: bool = False
    ) -> torch.Tensor:
        """
        Compute contrastive loss for a batch.

        Args:
            embeddings: (batch_size, embedding_dim) sequence embeddings
            labels: (batch_size,) directional labels in {-1, +1}
                +1 = agonist, -1 = antagonist
            confidences: (batch_size,) oracle confidence scores; pairs with product <= 0 are masked out
            debug: If True, print detailed debugging information
        Returns:
            loss: scalar contrastive loss (a constant 0 tensor when the batch has
                fewer than two samples or no pair survives the masks)
        Raises:
            ValueError: if labels/confidences do not hold exactly batch_size values,
                or the distance metric is unknown
        """
        batch_size = embeddings.size(0)
        if batch_size < 2:
            if debug:
                print(f"[ContrastiveLoss DEBUG] Batch size {batch_size} < 2, returning 0 loss")
            return torch.tensor(0.0, device=embeddings.device)

        if confidences is not None:
            # as_tensor accepts lists/arrays and also moves existing tensors to the right device
            confidences = torch.as_tensor(confidences, device=embeddings.device).view(-1)
            if confidences.numel() != batch_size:
                raise ValueError(
                    f"Confidences size {confidences.numel()} does not match batch size {batch_size}"
                )

        # Compute pairwise distances (all pairs)
        distances = self._pairwise_distances(embeddings)  # (B, B)

        # Flatten labels and align them with the embeddings' device. Without the size
        # check a length-1 label tensor would broadcast silently and mark every pair similar.
        labels = torch.as_tensor(labels, device=embeddings.device).view(-1)
        if labels.numel() != batch_size:
            raise ValueError(
                f"Labels size {labels.numel()} does not match batch size {batch_size}"
            )

        # label_product > 0 means same class (both +1 or both -1)
        # label_product < 0 means different class
        label_product = labels.unsqueeze(1) * labels.unsqueeze(0)  # (B, B)
        dissimilar_mask = label_product < 0  # (B, B) bool

        # Exclude self-pairs on the diagonal
        off_diagonal = ~torch.eye(batch_size, device=embeddings.device, dtype=torch.bool)
        pos_mask = ~dissimilar_mask & off_diagonal
        neg_mask = dissimilar_mask & off_diagonal

        # Apply confidence mask: remove pairs with confidence product <= 0
        conf_mask = None
        if confidences is not None:
            conf_mask = (confidences.unsqueeze(0) * confidences.unsqueeze(1)) > 0
            pos_mask = pos_mask & conf_mask
            neg_mask = neg_mask & conf_mask

        # Adaptive margin: 150% of the mean dissimilar distance, never below the base
        # margin. .item() detaches it, so the margin is a constant w.r.t. gradients.
        if self.adaptive_margin and neg_mask.any():
            margin = max(self.base_margin, 1.5 * distances[neg_mask].mean().item())
        else:
            margin = self.base_margin

        pos_count = pos_mask.sum()
        neg_count = neg_mask.sum()
        if (pos_count + neg_count).item() == 0:
            if debug:
                print("[ContrastiveLoss DEBUG] No valid pairs after filtering, returning 0 loss")
            return torch.tensor(0.0, device=embeddings.device)

        # Similar pairs: mean squared distance.
        # Dissimilar pairs: mean squared hinge on (margin - distance).
        # The epsilon keeps an empty group at 0 instead of 0/0.
        pos_loss = distances[pos_mask].pow(2).sum() / (pos_count + 1e-8)
        neg_loss = torch.clamp(margin - distances[neg_mask], min=0.0).pow(2).sum() / (neg_count + 1e-8)
        loss = pos_loss + neg_loss

        if debug:
            self._print_debug(
                embeddings, labels, distances, pos_mask, neg_mask, conf_mask,
                margin, pos_loss, neg_loss, loss
            )

        return loss

    def _print_debug(
        self,
        embeddings: torch.Tensor,
        labels: torch.Tensor,
        distances: torch.Tensor,
        pos_mask: torch.Tensor,
        neg_mask: torch.Tensor,
        conf_mask: Optional[torch.Tensor],
        margin: float,
        pos_loss: torch.Tensor,
        neg_loss: torch.Tensor,
        loss: torch.Tensor
    ) -> None:
        """Print a human-readable report of the quantities computed in forward()."""
        print(f"\n[ContrastiveLoss DEBUG]")
        print(f"  Batch size: {embeddings.size(0)}")
        print(f"  Labels: {labels.cpu().tolist()}")
        print(f"  Unique labels: {torch.unique(labels).cpu().tolist()}")
        print(f"  Embedding shape: {embeddings.shape}")
        print(f"  Embedding norm (mean): {embeddings.norm(dim=-1).mean().item():.4f}")
        print(f"  Embedding norm (std): {embeddings.norm(dim=-1).std().item():.4f}")
        valid_mask = pos_mask | neg_mask
        if valid_mask.any():
            valid_dists = distances[valid_mask]
            print(f"  Distance stats (valid pairs): mean={valid_dists.mean().item():.4f} "
                  f"min={valid_dists.min().item():.4f} max={valid_dists.max().item():.4f}")
        if self.adaptive_margin and neg_mask.any():
            print(f"  Margin: {margin:.4f} (adaptive, base={self.base_margin})")
        else:
            print(f"  Margin: {margin:.4f} (fixed)")
        print(f"  Num similar pairs: {pos_mask.sum().item():.0f}")
        print(f"  Num dissimilar pairs: {neg_mask.sum().item():.0f}")
        if conf_mask is not None:
            print(f"  Confidence-passing pairs: {conf_mask.sum().item():.0f}")
        print(f"  Similar loss (mean): {pos_loss.item():.4f}")
        print(f"  Dissimilar loss (mean): {neg_loss.item():.4f}")
        print(f"  Total loss: {loss.item():.4f}")

        # Show which dissimilar pairs have margin violations
        margin_violations = (distances < margin) & neg_mask
        if margin_violations.sum() > 0:
            print(f"  Margin violations: {margin_violations.sum().item():.0f} dissimilar pairs have distance < margin")
        else:
            print(f"  Margin violations: 0 (all dissimilar pairs are already separated)")


class InfoNCELoss(nn.Module):
    """
    Alternative: InfoNCE contrastive loss (used in SimCLR, CLIP).
    Treats same-label samples as positives and opposite-label samples as negatives.

    For each agonist, pull it close to other agonists and push away from antagonists.
    For each antagonist, pull it close to other antagonists and push away from agonists.

    Per anchor i (with at least one positive and one negative):
        loss_i = -log( sum_pos exp(s_ip) / (sum_pos exp(s_ip) + sum_neg exp(s_in)) )
    where s = cosine similarity / temperature. The returned loss is the mean of
    loss_i over all usable anchors.
    """

    def __init__(self, temperature: float = 0.1):
        """
        Args:
            temperature: Temperature parameter for softmax
        """
        super().__init__()
        self.temperature = temperature

    def forward(
        self,
        embeddings: torch.Tensor,
        labels: torch.Tensor,
        confidences: Optional[torch.Tensor] = None,
        debug: bool = False
    ) -> torch.Tensor:
        """
        Compute InfoNCE loss.

        Args:
            embeddings: (batch_size, embedding_dim)
            labels: (batch_size,) in {-1, +1}
            confidences: (batch_size,) oracle confidence scores; pairs with product <= 0 are masked out
            debug: Unused (kept for API compatibility)
        Returns:
            loss: scalar (a constant 0 tensor when the batch has fewer than two
                samples or no anchor has both a positive and a negative)
        Raises:
            ValueError: if labels/confidences do not hold exactly batch_size values
        """
        batch_size = embeddings.size(0)
        if confidences is not None:
            # as_tensor accepts lists/arrays and also moves existing tensors to the right device
            confidences = torch.as_tensor(confidences, device=embeddings.device).view(-1)
            if confidences.numel() != batch_size:
                raise ValueError(
                    f"Confidences size {confidences.numel()} does not match batch size {batch_size}"
                )
        if batch_size < 2:
            return torch.tensor(0.0, device=embeddings.device)

        # Flatten labels and align them with the embeddings' device so (B, 1)-shaped
        # or CPU-resident labels behave the same as a flat on-device tensor.
        labels = torch.as_tensor(labels, device=embeddings.device).view(-1)
        if labels.numel() != batch_size:
            raise ValueError(
                f"Labels size {labels.numel()} does not match batch size {batch_size}"
            )

        # Normalize embeddings so the dot product is a cosine similarity
        embeddings = F.normalize(embeddings, p=2, dim=-1)  # (B, D)

        # Temperature-scaled similarity matrix
        similarity = torch.matmul(embeddings, embeddings.T) / self.temperature  # (B, B)

        # Positive = same class (excluding self), negative = different class
        label_product = labels.unsqueeze(1) * labels.unsqueeze(0)  # (B, B)
        off_diagonal = ~torch.eye(batch_size, dtype=torch.bool, device=embeddings.device)
        positive_mask = (label_product > 0) & off_diagonal
        negative_mask = label_product < 0

        if confidences is not None:
            conf_mask = (confidences.unsqueeze(0) * confidences.unsqueeze(1)) > 0
            positive_mask = positive_mask & conf_mask
            negative_mask = negative_mask & conf_mask

        # Only anchors that have both a positive and a negative contribute
        usable_rows = positive_mask.any(dim=1) & negative_mask.any(dim=1)
        if not usable_rows.any():
            return torch.tensor(0.0, device=embeddings.device)

        # Work in log space: exp(sim / T) overflows float32 once sim / T exceeds ~88,
        # which a small temperature reaches easily and which turns the ratio into NaN.
        # Masked-out entries are set to -inf so they contribute exp(-inf) = 0. No epsilon
        # is needed because the log of the denominator is never taken of zero here.
        neg_inf = float('-inf')
        pos_logits = similarity.masked_fill(~positive_mask, neg_inf)
        all_logits = similarity.masked_fill(~(positive_mask | negative_mask), neg_inf)
        pos_lse = torch.logsumexp(pos_logits[usable_rows], dim=1)  # log sum_pos exp
        all_lse = torch.logsumexp(all_logits[usable_rows], dim=1)  # log (sum_pos + sum_neg) exp

        # -log(pos / (pos + neg)) = log(pos + neg) - log(pos), averaged over anchors
        return (all_lse - pos_lse).mean()


class TD3BTotalLoss:
    """
    Combined TD3B loss: L_total = L_WDCE + λ * L_ctr + β * L_KL

    Components:
        - L_WDCE: Weighted Denoising Cross-Entropy (from TR2-D2), supplied precomputed
        - L_ctr: Contrastive loss for agonist/antagonist separation
        - L_KL: KL divergence regularization between policy and reference model
    """

    def __init__(
        self,
        contrastive_weight: float = 0.1,
        contrastive_margin: float = 1.0,
        contrastive_type: str = 'margin',  # 'margin' or 'infonce'
        kl_beta: float = 0.1,  # β coefficient for KL divergence
        reference_model: Optional[nn.Module] = None,
        adaptive_margin: bool = True,  # Enable adaptive margin by default
        contrastive_temperature: float = 0.1,
        contrastive_distance_metric: str = 'euclidean'
    ):
        """
        Args:
            contrastive_weight: λ coefficient for contrastive loss
            contrastive_margin: Margin for margin-based contrastive loss (base margin if adaptive)
            contrastive_type: Type of contrastive loss ('margin' or 'infonce')
            kl_beta: β coefficient for KL divergence regularization
            reference_model: Frozen reference model for KL divergence; it is put in
                eval mode and all of its parameters get requires_grad=False here
            adaptive_margin: If True, automatically adjust margin based on dissimilar distances
            contrastive_temperature: Softmax temperature, used only when
                contrastive_type == 'infonce' (default matches the previous fixed 0.1)
            contrastive_distance_metric: 'euclidean' or 'cosine', used only when
                contrastive_type == 'margin' (default matches the previous fixed 'euclidean')
        Raises:
            ValueError: if contrastive_type is not 'margin' or 'infonce'
        """
        self.contrastive_weight = contrastive_weight
        self.kl_beta = kl_beta
        self.reference_model = reference_model

        # Freeze reference model if provided
        if self.reference_model is not None:
            self.reference_model.eval()
            for param in self.reference_model.parameters():
                param.requires_grad = False

            # Verify all parameters are frozen
            assert all(not p.requires_grad for p in self.reference_model.parameters()), \
                "ERROR: Reference model has parameters with requires_grad=True!"

        if contrastive_type == 'margin':
            self.contrastive_loss = ContrastiveLoss(
                margin=contrastive_margin,
                distance_metric=contrastive_distance_metric,
                adaptive_margin=adaptive_margin
            )
        elif contrastive_type == 'infonce':
            self.contrastive_loss = InfoNCELoss(temperature=contrastive_temperature)
        else:
            raise ValueError(f"Unknown contrastive type: {contrastive_type}")

    def compute_kl_categorical(
        self,
        log_p: torch.Tensor,
        log_ref_p: torch.Tensor
    ) -> torch.Tensor:
        """
        Compute KL divergence between categorical distributions.

        KL(P || Q) = Σ P(x) * log(P(x) / Q(x))
                   = Σ P(x) * (log P(x) - log Q(x))

        Args:
            log_p: (B, L, Vocab) log-probabilities from policy model
            log_ref_p: (B, L, Vocab) log-probabilities from reference model
        Returns:
            kl: (B, L) KL divergence per position
        """
        # Convert log-probs to probabilities
        p = torch.exp(log_p)  # (B, L, Vocab)

        # KL divergence element-wise
        kl_elementwise = p * (log_p - log_ref_p)  # (B, L, Vocab)

        # Handle numerical issues: 0 * log(0) should be 0
        # Replace NaNs or Infs that occur at -inf locations with 0
        kl_elementwise = torch.where(
            torch.isfinite(kl_elementwise),
            kl_elementwise,
            torch.zeros_like(kl_elementwise)
        )

        # Sum over vocabulary dimension
        return kl_elementwise.sum(dim=-1)  # (B, L)

    def compute_kl_loss(
        self,
        policy_model: nn.Module,
        sequences: torch.Tensor,
        attn_mask: torch.Tensor,
        sigma: torch.Tensor
    ) -> torch.Tensor:
        """
        Compute KL divergence loss between policy model and reference model.

        Both models are called as model(sequences, attn_mask=attn_mask, sigma=sigma)
        and their outputs are treated as logits over the last dimension.

        Args:
            policy_model: Current policy model (being trained)
            sequences: (B, L) input sequences
            attn_mask: (B, L) attention mask
            sigma: (B,) noise schedule
        Returns:
            kl_loss: Scalar KL(policy || reference) averaged over positions where
                attn_mask is non-zero; a constant 0 tensor when no reference model is set
        """
        if self.reference_model is None:
            return torch.tensor(0.0, device=sequences.device)

        # Ensure reference model is in eval mode
        assert not self.reference_model.training, \
            "ERROR: Reference model is in training mode! It should always be in eval mode."

        # Forward through policy model (already computed in WDCE, but need logits)
        policy_logits = policy_model(sequences, attn_mask=attn_mask, sigma=sigma)  # (B, L, Vocab)

        # Forward through reference model (frozen, no gradients)
        with torch.no_grad():
            ref_logits = self.reference_model(sequences, attn_mask=attn_mask, sigma=sigma)  # (B, L, Vocab)

        # Convert to log-probabilities
        log_p = F.log_softmax(policy_logits, dim=-1)  # (B, L, Vocab)
        log_ref_p = F.log_softmax(ref_logits, dim=-1)  # (B, L, Vocab)

        # Compute KL divergence
        kl_per_position = self.compute_kl_categorical(log_p, log_ref_p)  # (B, L)

        # Mask out padding positions and average over the remaining ones;
        # the epsilon guards against a fully masked batch.
        weights = attn_mask.float()  # (B, L)
        return (kl_per_position * weights).sum() / (weights.sum() + 1e-8)

    def compute_loss(
        self,
        wdce_loss: torch.Tensor,
        embeddings: torch.Tensor,
        directional_labels: torch.Tensor,
        confidences: Optional[torch.Tensor] = None,
        kl_loss: Optional[torch.Tensor] = None,
        debug: bool = False
    ) -> Tuple[torch.Tensor, dict]:
        """
        Compute total TD3B loss.

        Args:
            wdce_loss: Precomputed WDCE loss (scalar)
            embeddings: (batch_size, embedding_dim) sequence embeddings from MDLM
            directional_labels: (batch_size,) labels in {-1, +1}
            confidences: (batch_size,) oracle confidence scores; pairs with product <= 0 are masked out
            kl_loss: Precomputed KL divergence loss (optional; treated as 0 when None)
            debug: If True, enable debugging output in contrastive loss
        Returns:
            total_loss: Combined loss
            loss_dict: Dictionary with individual loss components as Python numbers
                (keys: 'total_loss', 'wdce_loss', 'contrastive_loss', 'kl_loss')
        """
        # Contrastive loss (pass debug flag)
        contrastive_loss = self.contrastive_loss(
            embeddings,
            directional_labels,
            confidences=confidences,
            debug=debug
        )

        # KL divergence loss
        if kl_loss is None:
            kl_loss = torch.tensor(0.0, device=embeddings.device)

        # Total loss: L_total = L_WDCE + λ * L_ctr + β * L_KL
        total_loss = wdce_loss + self.contrastive_weight * contrastive_loss + self.kl_beta * kl_loss

        loss_dict = {
            'total_loss': total_loss.item(),
            'wdce_loss': wdce_loss.item(),
            'contrastive_loss': contrastive_loss.item(),
            'kl_loss': kl_loss.item() if isinstance(kl_loss, torch.Tensor) else kl_loss
        }

        return total_loss, loss_dict


def extract_embeddings_from_mdlm(
    model,
    sequences: torch.Tensor,
    pool_method: str = 'mean',
    pad_token_id: int = 0
) -> torch.Tensor:
    """
    Extract sequence-level embeddings from MDLM backbone.

    Gradients are intentionally kept (no torch.no_grad()) so a loss on the
    returned embeddings can backpropagate into the backbone.

    Args:
        model: object exposing `model.backbone.model(input_ids=..., attention_mask=...,
            output_hidden_states=True, return_dict=True)` whose result has a
            `hidden_states` sequence (presumably a RoFormerForMaskedLM — confirm against caller)
        sequences: (batch_size, seq_len) token sequences
        pool_method: 'mean', 'max', or 'cls'
        pad_token_id: token id treated as padding when building the attention mask
    Returns:
        embeddings: (batch_size, hidden_dim)
    Raises:
        ValueError: if pool_method is not one of 'mean', 'max', 'cls'
    """
    # Reject a bad pool method before paying for the backbone forward pass
    if pool_method not in ('mean', 'max', 'cls'):
        raise ValueError(f"Unknown pool method: {pool_method}")

    # Create attention mask (1 for real tokens, 0 for padding)
    attn_mask = (sequences != pad_token_id).long()  # (B, L)

    # Forward through the backbone and request hidden states.
    # IMPORTANT: DO NOT use torch.no_grad() here - we need gradients for backprop!
    outputs = model.backbone.model(
        input_ids=sequences,
        attention_mask=attn_mask,
        output_hidden_states=True,
        return_dict=True
    )

    # Last entry of outputs.hidden_states is used as the token representation
    hidden_states = outputs.hidden_states[-1]  # (B, L, D)

    if pool_method == 'cls':
        # Use first token (CLS-style)
        return hidden_states[:, 0, :]  # (B, D)

    if pool_method == 'mean':
        # Mean pooling (ignore padding); epsilon avoids 0/0 on all-padding rows
        mask = attn_mask.float().unsqueeze(-1)  # (B, L, 1)
        return (hidden_states * mask).sum(dim=1) / (mask.sum(dim=1) + 1e-8)  # (B, D)

    # Max pooling (ignore padding): padded positions are pushed to -inf so they can
    # never win the max, matching how mean pooling excludes them.
    is_token = attn_mask.bool().unsqueeze(-1)  # (B, L, 1)
    pooled = hidden_states.masked_fill(~is_token, float('-inf')).max(dim=1)[0]  # (B, D)
    # A row made only of padding would be all -inf; return zeros for it, as mean pooling does
    has_tokens = is_token.any(dim=1)  # (B, 1)
    return torch.where(has_tokens, pooled, torch.zeros_like(pooled))