Upload dflash_mlx/model.py

dflash_mlx/model.py  CHANGED  (+73 -8)
@@ -1,10 +1,13 @@
 """
 MLX implementation of the DFlash block diffusion draft model.
 
-
+Implements the core architecture from the DFlash paper (arXiv:2602.06036):
 - Block-level diffusion for parallel token drafting
 - KV injection of target model hidden features
 - Causal attention within blocks with cross-block masking
+- Position-dependent loss decay
+
+Architecture-agnostic: works with any target model family via adapters.
 """
 
 import math
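The "position-dependent loss decay" bullet is not elaborated elsewhere in this diff. One common scheme, shown purely as an illustration and not necessarily what DFlash uses, weights later block positions geometrically less during training:

```python
import mlx.core as mx

# Illustrative only: geometric per-position loss weights for a drafted block.
# The decay factor is a made-up value, not taken from this repo.
block_size, decay = 8, 0.8
weights = mx.power(decay, mx.arange(block_size))  # [1.0, 0.8, 0.64, ...]
weights = weights / mx.sum(weights)               # normalized, shape [8]
```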
@@ -28,14 +31,26 @@ class RMSNorm(nn.Module):
 
 
 def apply_rotary_emb(x, cos, sin):
-    """Apply rotary positional embeddings."""
+    """Apply rotary positional embeddings to x.
+
+    Args:
+        x: [..., seq_len, head_dim]
+        cos, sin: [seq_len, head_dim]
+
+    Returns:
+        Rotated tensor same shape as x
+    """
     x1, x2 = x[..., ::2], x[..., 1::2]
     rotated = mx.stack([-x2, x1], axis=-1).reshape(x.shape)
     return x * cos + rotated * sin
 
 
 def build_rope_cache(seq_len: int, head_dim: int, base: float = 10000.0):
-    """Build rotary positional embedding cache."""
+    """Build rotary positional embedding cache.
+
+    Returns:
+        cos, sin: [seq_len, head_dim] each interleaved for all dims
+    """
     theta = 1.0 / (base ** (mx.arange(0, head_dim, 2) / head_dim))
     positions = mx.arange(seq_len)
     angles = mx.outer(positions, theta)
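For orientation, here is how the two helpers fit together as a standalone sketch. The interleave step (repeating each angle twice) is an assumption based on the docstring's "interleaved for all dims" note, since the tail of `build_rope_cache` is not part of this diff:

```python
import mlx.core as mx

# Standalone sketch: build an interleaved cos/sin cache and rotate a tensor.
# The mx.repeat interleave is assumed, not copied from the repo.
seq_len, head_dim = 8, 64
theta = 1.0 / (10000.0 ** (mx.arange(0, head_dim, 2) / head_dim))
angles = mx.outer(mx.arange(seq_len), theta)  # [seq_len, head_dim // 2]
angles = mx.repeat(angles, 2, axis=-1)        # [seq_len, head_dim], interleaved
cos, sin = mx.cos(angles), mx.sin(angles)

x = mx.random.normal((1, seq_len, head_dim))
x1, x2 = x[..., ::2], x[..., 1::2]            # even/odd pairs
rotated = mx.stack([-x2, x1], axis=-1).reshape(x.shape)
out = x * cos + rotated * sin                 # same shape as x
```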
@@ -47,12 +62,25 @@ def build_rope_cache(seq_len: int, head_dim: int, base: float = 10000.0):
     return cos, sin
 
 
+def create_causal_mask(seq_len: int, dtype=mx.float32) -> mx.array:
+    """Create a causal attention mask for self-attention.
+
+    Returns [1, 1, seq_len, seq_len] mask with -inf in upper triangle.
+    """
+    mask = mx.triu(mx.ones((seq_len, seq_len), dtype=dtype), k=1)
+    mask = mx.where(mask == 1, -1e9, 0.0)
+    return mask[None, None, :, :]  # [1, 1, seq_len, seq_len]
+
+
 class DFlashAttention(nn.Module):
     """Multi-head attention with KV injection from target model features.
 
     This is the core of DFlash: the draft model's attention keys and values
     are augmented with projected target model hidden states, providing rich
     conditioning that enables high acceptance rates.
+
+    Supports both standard attention and KV-injected cross-attention within
+    the same layer.
     """
 
     def __init__(
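Concretely, for a 4-token block the new `create_causal_mask` evaluates to the matrix below (each position attends to itself and earlier positions; the large negative value stands in for -inf before softmax):

```python
import mlx.core as mx

# create_causal_mask(4)[0, 0] evaluates to:
# [[   0., -1e9, -1e9, -1e9],
#  [   0.,    0., -1e9, -1e9],
#  [   0.,    0.,    0., -1e9],
#  [   0.,    0.,    0.,    0.]]
mask = mx.triu(mx.ones((4, 4)), k=1)   # 1s strictly above the diagonal
mask = mx.where(mask == 1, -1e9, 0.0)  # mask out future positions
mask = mask[None, None, :, :]          # [1, 1, 4, 4]
```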
@@ -78,7 +106,7 @@ class DFlashAttention(nn.Module):
         self.v_proj = nn.Linear(hidden_size, num_kv_heads * head_dim, bias=False)
         self.o_proj = nn.Linear(num_heads * head_dim, hidden_size, bias=False)
 
-        # Layer norms
+        # Layer norms for Q, K (Qwen3.5-style pre-norm in attention)
         self.q_norm = RMSNorm(head_dim, eps=1e-6)
         self.k_norm = RMSNorm(head_dim, eps=1e-6)
 
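These `q_norm`/`k_norm` modules are per-head RMSNorms over `head_dim`, as in recent QK-norm attention designs. A minimal sketch of where they slot in; the reshape and its placement before RoPE are assumptions here, since the surrounding forward code is not part of this diff:

```python
import mlx.core as mx
import mlx.nn as nn

# QK-norm sketch: normalize each head's query vector over head_dim after
# splitting heads (placement assumed; this diff only defines the modules).
bsz, seq_len, num_heads, head_dim = 1, 8, 4, 64
q_norm = nn.RMSNorm(head_dim, eps=1e-6)

q = mx.random.normal((bsz, seq_len, num_heads * head_dim))
q = q.reshape(bsz, seq_len, num_heads, head_dim)
q = q_norm(q)  # RMSNorm over the last axis, broadcast across heads
```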
@@ -90,6 +118,18 @@ class DFlashAttention(nn.Module):
         position_embeddings: Optional[Tuple[mx.array, mx.array]] = None,
         past_key_values: Optional[Tuple[mx.array, mx.array]] = None,
     ) -> mx.array:
+        """Forward pass with KV injection.
+
+        Args:
+            hidden_states: Draft token embeddings [bsz, q_len, hidden_size]
+            target_hidden: Target context features [bsz, ctx_len, hidden_size]
+            attention_mask: Optional mask [1, 1, q_len, kv_len]
+            position_embeddings: Optional (cos, sin) for RoPE
+            past_key_values: Not used in DFlash (diffusion is non-autoregressive)
+
+        Returns:
+            Attention output [bsz, q_len, hidden_size]
+        """
         bsz, q_len = hidden_states.shape[:2]
         ctx_len = target_hidden.shape[1]
 
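The docstring's `kv_len` reflects the injection pattern: keys and values are computed for the target context and concatenated ahead of the block's own keys and values, so queries from the draft block attend over [context ; block]. A minimal standalone sketch; the shared K/V projections are an assumption, and the actual module may differ:

```python
import mlx.core as mx
import mlx.nn as nn

# Minimal KV-injection sketch: kv_len = ctx_len + q_len.
hidden_size, q_len, ctx_len = 64, 4, 16
k_proj = nn.Linear(hidden_size, hidden_size, bias=False)
v_proj = nn.Linear(hidden_size, hidden_size, bias=False)

hidden_states = mx.random.normal((1, q_len, hidden_size))    # draft block
target_hidden = mx.random.normal((1, ctx_len, hidden_size))  # injected context

k = mx.concatenate([k_proj(target_hidden), k_proj(hidden_states)], axis=1)
v = mx.concatenate([v_proj(target_hidden), v_proj(hidden_states)], axis=1)
assert k.shape == (1, ctx_len + q_len, hidden_size)          # kv_len axis
```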
@@ -209,6 +249,8 @@ class DFlashDraftModel(nn.Module):
     - Target context feature projection (fuses cross-layer hidden states)
     - Rotary position embeddings
     - Block-wise parallel diffusion
+
+    Universal: config auto-detected from target model or specified explicitly.
     """
 
     def __init__(
@@ -281,7 +323,7 @@ class DFlashDraftModel(nn.Module):
         """Select target model layer indices for feature extraction.
 
         Uniformly samples from shallow to deep layers for cross-layer
-        feature fusion.
+        feature fusion, as described in the DFlash paper.
         """
         if num_draft_layers == 1:
             return [num_target_layers // 2]
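Only the single-layer case (pick the middle target layer) is visible in this hunk. For intuition, uniform selection over more draft layers might look like the sketch below; the spacing formula is illustrative, not the repo's exact code:

```python
# Illustrative uniform spacing from shallow to deep layers.
def select_uniform(num_draft_layers: int, num_target_layers: int) -> list[int]:
    if num_draft_layers == 1:
        return [num_target_layers // 2]
    step = (num_target_layers - 1) / (num_draft_layers - 1)
    return [round(i * step) for i in range(num_draft_layers)]

assert select_uniform(1, 32) == [16]
assert select_uniform(4, 32) == [0, 10, 21, 31]  # shallow to deep
```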
@@ -308,13 +350,25 @@ class DFlashDraftModel(nn.Module):
         """Extract and fuse target model hidden features.
 
         Args:
-            hidden_states: List of hidden states from target model layers
+            hidden_states: List of hidden states from target model layers.
+                hidden_states[0] is typically embedding layer output.
 
         Returns:
             Fused target context feature [bsz, seq_len, hidden_size]
         """
-        offset = 1  # Skip embedding layer
-        selected = [hidden_states[layer_id + offset] for layer_id in self.target_layer_ids]
+        offset = 1  # Skip embedding layer (usually index 0)
+        selected = []
+        for layer_id in self.target_layer_ids:
+            idx = layer_id + offset
+            if idx < len(hidden_states):
+                selected.append(hidden_states[idx])
+            else:
+                # Fallback: use last available hidden state
+                selected.append(hidden_states[-1])
+
+        if not selected:
+            raise RuntimeError("[DFlashDraftModel] No hidden states available for extraction")
+
         target_hidden = mx.concatenate(selected, axis=-1)
         return self.hidden_norm(self.fc(target_hidden))
 
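A shape walk-through of the fusion step above, with illustrative sizes (four selected layers of a 1024-dim target fused into a 512-dim draft model; `fc` and `hidden_norm` mirror the modules used in the hunk):

```python
import mlx.core as mx
import mlx.nn as nn

# Illustrative sizes only.
bsz, seq_len, tgt_h, draft_h, n_layers = 1, 16, 1024, 512, 4
selected = [mx.random.normal((bsz, seq_len, tgt_h)) for _ in range(n_layers)]
fc = nn.Linear(n_layers * tgt_h, draft_h, bias=False)
hidden_norm = nn.RMSNorm(draft_h, eps=1e-6)

target_hidden = mx.concatenate(selected, axis=-1)  # [1, 16, 4096]
fused = hidden_norm(fc(target_hidden))             # [1, 16, 512]
```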
@@ -366,6 +420,10 @@ class DFlashDenoiser:
 
     Implements the iterative denoising process where masked tokens
     are progressively revealed in parallel within each block.
+
+    For simplicity, this uses single-step denoising (the draft model
+    predicts all masked positions at once). The full DFlash paper
+    uses multiple denoising steps with noise scheduling.
     """
 
     def __init__(self, model: DFlashDraftModel, num_steps: int = 12):
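For contrast with the single-step scheme, here is a runnable toy of the multi-step reveal idea on dummy logits. The confidence rule and fixed schedule are illustrative stand-ins, not the paper's noise schedule:

```python
import mlx.core as mx

# Toy progressive reveal: each step commits the most confident still-masked
# positions. Random logits stand in for a real draft model call.
block_size, vocab, num_steps = 8, 100, 4
revealed = mx.zeros((block_size,), dtype=mx.bool_)
tokens = mx.zeros((block_size,), dtype=mx.int32)
for _ in range(num_steps):
    logits = mx.random.normal((block_size, vocab))
    conf = mx.max(mx.softmax(logits, axis=-1), axis=-1)
    conf = mx.where(revealed, -1.0, conf)              # skip revealed slots
    threshold = mx.sort(conf)[-(block_size // num_steps)]
    pick = conf >= threshold                           # top-k this step
    tokens = mx.where(pick, mx.argmax(logits, axis=-1).astype(mx.int32), tokens)
    revealed = mx.logical_or(revealed, pick)
```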
@@ -382,6 +440,8 @@ class DFlashDenoiser:
     ) -> mx.array:
         """Denoise a block of masked tokens in parallel.
 
+        Single-step: embed tokens, run draft model, sample predictions.
+
         Args:
             draft_tokens: Token IDs with mask tokens [bsz, block_size]
             target_hidden: Target context features
@@ -394,11 +454,16 @@ class DFlashDenoiser:
         # Embed tokens
         embeddings = self.model.embed_tokens(draft_tokens)
 
+        # Build causal mask for the block (tokens attend to context + earlier positions)
+        seq_len = draft_tokens.shape[1]
+        mask = create_causal_mask(seq_len)
+
         # Run draft model
         hidden_states = self.model(
             noise_embedding=embeddings,
             target_hidden=target_hidden,
             position_ids=position_ids,
+            attention_mask=mask,
         )
 
         # Get logits and sample
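The sampling code following this hunk is not shown in the diff. A greedy stand-in on dummy logits, assuming one token is sampled per block position (the real code may use temperature or top-p sampling instead):

```python
import mlx.core as mx

# Greedy stand-in for "get logits and sample": one token per position.
bsz, block_size, vocab = 1, 8, 1000
logits = mx.random.normal((bsz, block_size, vocab))   # stand-in for lm_head output
tokens = mx.argmax(logits, axis=-1).astype(mx.int32)  # [1, 8]
```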