Upload tests/test_model.py
Browse files- tests/test_model.py +204 -23
tests/test_model.py
CHANGED
|
@@ -1,4 +1,8 @@
|
|
| 1 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import unittest
|
| 4 |
import mlx.core as mx
|
|
@@ -8,61 +12,238 @@ from dflash_mlx.model import (
|
|
| 8 |
DFlashMLP,
|
| 9 |
DFlashDecoderLayer,
|
| 10 |
DFlashDraftModel,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
)
|
| 12 |
|
| 13 |
|
| 14 |
class TestRMSNorm(unittest.TestCase):
|
|
|
|
|
|
|
| 15 |
def test_shape_preservation(self):
|
|
|
|
| 16 |
norm = RMSNorm(dims=128)
|
| 17 |
-
x = mx.random.normal(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
out = norm(x)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
self.assertEqual(out.shape, x.shape)
|
| 20 |
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
class TestDFlashAttention(unittest.TestCase):
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
| 24 |
attn = DFlashAttention(
|
| 25 |
hidden_size=256,
|
| 26 |
num_heads=4,
|
| 27 |
num_kv_heads=2,
|
| 28 |
head_dim=64,
|
| 29 |
-
layer_idx=0,
|
| 30 |
)
|
| 31 |
-
hidden = mx.random.normal(
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
class TestDFlashDraftModel(unittest.TestCase):
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
| 39 |
model = DFlashDraftModel(
|
| 40 |
vocab_size=1000,
|
| 41 |
hidden_size=256,
|
| 42 |
-
num_layers=
|
| 43 |
num_heads=4,
|
| 44 |
num_kv_heads=2,
|
| 45 |
-
intermediate_size=
|
| 46 |
-
|
| 47 |
-
block_size=16,
|
| 48 |
)
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
def test_logits(self):
|
| 55 |
model = DFlashDraftModel(
|
| 56 |
vocab_size=1000,
|
| 57 |
-
hidden_size=
|
| 58 |
num_layers=2,
|
| 59 |
num_heads=4,
|
| 60 |
num_kv_heads=2,
|
| 61 |
-
intermediate_size=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
)
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
logits = model.get_logits(hidden)
|
| 65 |
-
self.assertEqual(logits.shape, (1, 8,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
|
| 68 |
if __name__ == "__main__":
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for DFlash draft model and denoiser.
|
| 3 |
+
|
| 4 |
+
Validates model creation, forward pass, and KV injection mechanism.
|
| 5 |
+
"""
|
| 6 |
|
| 7 |
import unittest
|
| 8 |
import mlx.core as mx
|
|
|
|
| 12 |
DFlashMLP,
|
| 13 |
DFlashDecoderLayer,
|
| 14 |
DFlashDraftModel,
|
| 15 |
+
DFlashDenoiser,
|
| 16 |
+
build_rope_cache,
|
| 17 |
+
apply_rotary_emb,
|
| 18 |
+
create_causal_mask,
|
| 19 |
)
|
| 20 |
|
| 21 |
|
| 22 |
class TestRMSNorm(unittest.TestCase):
    """Unit tests for the RMSNorm layer."""

    def test_shape_preservation(self):
        """Output shape must match input shape exactly."""
        layer = RMSNorm(dims=128)
        inputs = mx.random.normal((2, 10, 128))
        result = layer(inputs)
        self.assertEqual(result.shape, inputs.shape)

    def test_unit_variance(self):
        """Normalization keeps scaled-up inputs within a sane magnitude."""
        layer = RMSNorm(dims=64, eps=1e-6)
        # Inflate the input so un-normalized values would be large.
        inputs = mx.random.normal((1, 5, 64)) * 5.0
        result = layer(inputs)
        # After RMS normalization every value should be well-bounded.
        self.assertTrue(mx.all(mx.abs(result) < 100).item())
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class TestRoPE(unittest.TestCase):
    """Unit tests for the rotary positional embedding helpers."""

    def test_cache_shape(self):
        """Cos/sin caches come back as [seq_len, head_dim]."""
        cos_cache, sin_cache = build_rope_cache(seq_len=100, head_dim=64)
        self.assertEqual(cos_cache.shape, (100, 64))
        self.assertEqual(sin_cache.shape, (100, 64))

    def test_application_shape(self):
        """Applying RoPE leaves the tensor shape unchanged."""
        cos_cache, sin_cache = build_rope_cache(seq_len=10, head_dim=64)
        # Layout: [bsz, seq, heads, dim]
        queries = mx.random.normal((1, 2, 8, 64))
        rotated = apply_rotary_emb(queries, cos_cache[:10], sin_cache[:10])
        self.assertEqual(rotated.shape, queries.shape)
|
| 56 |
|
| 57 |
|
| 58 |
+
class TestCausalMask(unittest.TestCase):
    """Unit tests for causal attention mask construction."""

    def test_mask_shape(self):
        """Mask is broadcastable over batch and heads: [1, 1, seq, seq]."""
        causal = create_causal_mask(seq_len=16)
        self.assertEqual(causal.shape, (1, 1, 16, 16))

    def test_mask_values(self):
        """Future positions carry a large negative value; past is open."""
        causal = create_causal_mask(seq_len=4)
        # Attending backwards (row 3, col 0) must be unmasked.
        self.assertTrue(causal[0, 0, 3, 0] == 0.0)
        # Attending forwards (row 0, col 3) must be blocked.
        self.assertTrue(causal[0, 0, 0, 3] < -1e8)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
class TestDFlashAttention(unittest.TestCase):
    """Unit tests for DFlash attention with KV injection."""

    def test_forward_shape(self):
        """The attention output keeps the draft sequence's shape."""
        attention = DFlashAttention(
            hidden_size=256,
            num_heads=4,
            num_kv_heads=2,
            head_dim=64,
        )
        # Draft tokens attend over a longer target context.
        draft_h = mx.random.normal((1, 8, 256))  # draft tokens
        context_h = mx.random.normal((1, 16, 256))  # target context

        result = attention(draft_h, context_h)
        self.assertEqual(result.shape, (1, 8, 256))

    def test_kv_injection(self):
        """Changing the injected target hidden states changes the output."""
        attention = DFlashAttention(
            hidden_size=64,
            num_heads=2,
            num_kv_heads=1,
            head_dim=32,
        )
        draft_h = mx.random.normal((1, 4, 64))
        context_a = mx.random.normal((1, 8, 64))
        context_b = mx.random.normal((1, 8, 64)) * 2.0

        result_a = attention(draft_h, context_a)
        result_b = attention(draft_h, context_b)

        # Distinct contexts must not produce identical outputs.
        self.assertFalse(mx.allclose(result_a, result_b))
|
| 109 |
|
| 110 |
|
| 111 |
class TestDFlashDraftModel(unittest.TestCase):
    """Unit tests for the complete draft model."""

    def test_model_creation(self):
        """Constructor wires up the configured layer count and width."""
        draft = DFlashDraftModel(
            vocab_size=1000,
            hidden_size=256,
            num_layers=3,
            num_heads=4,
            num_kv_heads=2,
            intermediate_size=704,
            block_size=8,
        )
        self.assertEqual(draft.num_layers, 3)
        self.assertEqual(draft.hidden_size, 256)

    def test_forward_pass(self):
        """A forward pass returns one hidden state per draft position."""
        draft = DFlashDraftModel(
            vocab_size=1000,
            hidden_size=128,
            num_layers=2,
            num_heads=4,
            num_kv_heads=2,
            intermediate_size=352,
            block_size=8,
        )

        # Simulate already-embedded noisy draft tokens plus target context.
        embedded_noise = mx.random.normal((1, 8, 128))
        context = mx.random.normal((1, 16, 128))

        result = draft(embedded_noise, context)
        self.assertEqual(result.shape, (1, 8, 128))

    def test_target_layer_ids(self):
        """One target layer is selected per draft layer, in ascending order."""
        draft = DFlashDraftModel(
            vocab_size=1000,
            hidden_size=128,
            num_layers=5,
            num_target_layers=32,
        )
        layer_ids = draft.target_layer_ids
        self.assertEqual(len(layer_ids), 5)
        # The selection should span from early to late target layers.
        self.assertGreater(layer_ids[-1], layer_ids[0])

    def test_extract_features(self):
        """Context features are extracted from a stack of layer states."""
        draft = DFlashDraftModel(
            vocab_size=1000,
            hidden_size=128,
            num_layers=3,
            num_target_layers=10,
        )

        # Fabricate per-layer hidden states for a 10-layer target model.
        layer_states = [mx.random.normal((1, 5, 128)) for _ in range(10)]

        features = draft.extract_context_features(layer_states)
        # Batch and sequence dimensions are preserved...
        self.assertEqual(features.shape[0], 1)
        self.assertEqual(features.shape[1], 5)
        # ...and the feature dim is projected back to hidden_size.
        self.assertEqual(features.shape[2], 128)

    def test_get_logits(self):
        """Logit projection maps hidden states onto the vocabulary."""
        vocab_size = 500
        draft = DFlashDraftModel(
            vocab_size=vocab_size,
            hidden_size=128,
            num_layers=2,
        )

        states = mx.random.normal((1, 8, 128))
        logits = draft.get_logits(states)
        self.assertEqual(logits.shape, (1, 8, vocab_size))
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
class TestDFlashDenoiser(unittest.TestCase):
    """Unit tests for denoising block generation."""

    def test_denoise_shape(self):
        """Denoising a block returns one token id per draft position."""
        draft = DFlashDraftModel(
            vocab_size=1000,
            hidden_size=128,
            num_layers=2,
            block_size=8,
        )
        denoiser = DFlashDenoiser(draft)

        tokens = mx.array([[100, 200, 300, 400, 500, 600, 700, 800]])
        context = mx.random.normal((1, 16, 128))
        positions = mx.arange(8)

        denoised = denoiser.denoise_block(tokens, context, positions)
        self.assertEqual(denoised.shape, (1, 8))

    def test_denoise_greedy(self):
        """Temperature 0 (greedy) gives identical results on repeat calls."""
        draft = DFlashDraftModel(
            vocab_size=100,
            hidden_size=64,
            num_layers=1,
            block_size=4,
        )
        denoiser = DFlashDenoiser(draft)

        tokens = mx.array([[0, 1, 2, 3]])
        context = mx.random.normal((1, 8, 64))
        positions = mx.arange(4)

        # Same inputs, greedy decoding — both calls must agree exactly.
        first = denoiser.denoise_block(tokens, context, positions, temperature=0.0)
        second = denoiser.denoise_block(tokens, context, positions, temperature=0.0)

        mx.eval(first, second)
        self.assertTrue(mx.array_equal(first, second))
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
class TestModelSerialization(unittest.TestCase):
    """Unit tests for exporting model parameters."""

    def test_parameter_dict(self):
        """The model exposes a non-empty parameter dict incl. the LM head."""
        draft = DFlashDraftModel(
            vocab_size=100,
            hidden_size=64,
            num_layers=2,
        )
        exported = dict(draft.parameters())
        self.assertTrue(len(exported) > 0)
        self.assertIn("lm_head.weight", exported)
|
| 247 |
|
| 248 |
|
| 249 |
if __name__ == "__main__":
|