ALJIACHI
/

Mizan-Rerank-V2

@@ -205,15 +205,17 @@ class RotaryEmbedding(torch.nn.Module):
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
     def forward(self, x, seq_len=None):
         # x: [bs, num_attention_heads, seq_len, head_size]
-        if seq_len > self.max_seq_len_cached:
-            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
-        return (
-            self.cos_cached[:seq_len, ...].to(dtype=x.dtype),
-            self.sin_cached[:seq_len, ...].to(dtype=x.dtype),
-        )
 class NTKScalingRotaryEmbedding(RotaryEmbedding):
@@ -250,6 +252,21 @@ class NTKScalingRotaryEmbedding(RotaryEmbedding):
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 class RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):

         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+    def _compute_cos_sin(self, seq_len, device, dtype):
+        """Build the rotary cos/sin tables for positions ``0 .. seq_len - 1``.
+
+        The tables are recomputed on every call from ``self.base`` and
+        ``self.dim``; the ``cos_cached`` / ``sin_cached`` buffers are not read
+        here. The original note on this method gives the motivation as
+        avoiding buffer corruption on Python 3.13 (not verifiable from this
+        hunk alone).
+
+        Args:
+            seq_len: Number of positions to generate; used as the end value
+                of ``torch.arange``.
+            device: Device on which all intermediate tensors are created.
+            dtype: Dtype of the returned tensors; the math runs in float32.
+
+        Returns:
+            Tuple ``(cos, sin)``, each of shape ``(seq_len, 2 * len(inv_freq))``,
+            which is ``(seq_len, self.dim)`` when ``self.dim`` is even.
+        """
+        # Inverse frequencies: base ** (-k / dim) for k = 0, 2, 4, ... < dim.
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
+        # Position indices, kept in float32 regardless of the requested dtype.
+        t = torch.arange(seq_len, device=device, dtype=torch.float32)
+        # Outer product: freqs[p, k] = t[p] * inv_freq[k].
+        freqs = torch.einsum("i,j->ij", t, inv_freq)
+        # Repeat the angles along the last dim so both halves share them.
+        emb = torch.cat((freqs, freqs), dim=-1)
+        # Cast to the caller's dtype only after cos/sin are taken in float32.
+        return emb.cos().to(dtype=dtype), emb.sin().to(dtype=dtype)
     def forward(self, x, seq_len=None):
+        """Return the ``(cos, sin)`` tables for the first ``seq_len`` positions.
+
+        Only ``x.device`` and ``x.dtype`` are read from ``x``; its values and
+        shape are not used. The work is delegated to ``self._compute_cos_sin``.
+
+        NOTE(review): ``seq_len`` defaults to ``None`` but is forwarded
+        unchanged and becomes the end value of ``torch.arange`` inside
+        ``_compute_cos_sin``; callers presumably always pass an int - confirm.
+        """
         # x: [bs, num_attention_heads, seq_len, head_size]
+        return self._compute_cos_sin(seq_len, x.device, x.dtype)
 class NTKScalingRotaryEmbedding(RotaryEmbedding):
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+    def _compute_cos_sin(self, seq_len, device, dtype):
+        """Build NTK-scaled rotary cos/sin tables for positions ``0 .. seq_len - 1``.
+
+        The tables are recomputed on every call from ``self.base``,
+        ``self.dim``, ``self.scaling_factor`` and ``self.mixed_b``; the
+        ``cos_cached`` / ``sin_cached`` buffers are not read here. The
+        original note on this method gives the motivation as avoiding buffer
+        corruption. The scaling is applied for every ``seq_len``; there is no
+        length threshold in this method.
+
+        Args:
+            seq_len: Number of positions to generate; used as the end value
+                of ``torch.arange``.
+            device: Device on which all intermediate tensors are created.
+            dtype: Dtype of the returned tensors; the math runs in float32.
+
+        Returns:
+            Tuple ``(cos, sin)``, each of shape ``(seq_len, 2 * len(inv_freq))``,
+            which is ``(seq_len, self.dim)`` when ``self.dim`` is even.
+        """
+        # The base is multiplied by scaling_factor only in the plain
+        # (mixed_b is None) variant; the mixed variant keeps the original base.
+        base = self.base * (self.scaling_factor if self.mixed_b is None else 1)
+        # Inverse frequencies: base ** (-k / dim) for k = 0, 2, 4, ... < dim.
+        inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
+        if self.mixed_b is None:
+            # Uniform correction: every frequency is divided by
+            # scaling_factor ** (2 / dim).
+            inv_freq = inv_freq / self.scaling_factor ** (2 / self.dim)
+        else:
+            # Per-frequency divisor lambda_m = exp(a * m ** mixed_b) for
+            # m = 1 .. dim // 2, with a chosen so that the divisor at
+            # m = dim / 2 equals scaling_factor.
+            a = torch.tensor(self.scaling_factor, device=device).log() / (self.dim / 2) ** self.mixed_b
+            lambda_1_m = (a * torch.arange(1, self.dim // 2 + 1, device=device, dtype=torch.float32) ** self.mixed_b).exp()
+            inv_freq = inv_freq / lambda_1_m
+        # Position indices, kept in float32 regardless of the requested dtype.
+        t = torch.arange(seq_len, device=device, dtype=torch.float32)
+        # Outer product: freqs[p, k] = t[p] * inv_freq[k].
+        freqs = torch.einsum("i,j->ij", t, inv_freq)
+        # Repeat the angles along the last dim so both halves share them.
+        emb = torch.cat((freqs, freqs), dim=-1)
+        # Cast to the caller's dtype only after cos/sin are taken in float32.
+        return emb.cos().to(dtype=dtype), emb.sin().to(dtype=dtype)
 class RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):