Upload dflash_mlx/universal.py

dflash_mlx/universal.py CHANGED (+97 -47)
@@ -3,26 +3,34 @@ Universal DFlash decoder for any MLX-converted model.
 
 Provides a high-level interface that works with any mlx_lm model,
 including those without pre-built DFlash drafters.
+
+Now uses the architecture-agnostic adapter system for proper target model
+interaction across all supported families (Qwen3, Qwen3.5, LLaMA, Mistral, Gemma).
 """
 
 from typing import Optional, List, Dict, Any
 import mlx.core as mx
 from .model import DFlashDraftModel
 from .speculative_decode import DFlashSpeculativeDecoder
+from .adapters import load_target_model, LoadedTargetModel, detect_model_architecture
+from .convert import load_mlx_dflash
 
 
 class UniversalDFlashDecoder:
     """Universal DFlash decoder that works with any MLX-converted model.
 
     This class handles:
-    1. Loading pre-converted DFlash drafters
+    1. Loading pre-converted DFlash drafters with architecture detection
    2. Creating generic drafters for unsupported models
    3. Training custom drafters on-the-fly
+
+    Key improvement: automatically detects the target model architecture and
+    selects the correct adapter for hidden-state extraction and KV-cache management.
    """

    def __init__(
        self,
-        target_model,
+        target_model: Any,
        tokenizer,
        draft_model_path: Optional[str] = None,
        draft_layers: int = 5,
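For orientation, a minimal end-to-end usage sketch (assuming mlx_lm is installed; the model ID is illustrative):

    from mlx_lm import load
    from dflash_mlx.universal import UniversalDFlashDecoder

    # Any MLX-converted model works; the decoder builds or loads a drafter for it
    model, tokenizer = load("mlx-community/Qwen3-4B-4bit")  # illustrative model ID
    decoder = UniversalDFlashDecoder(model, tokenizer, draft_layers=5)
    print(decoder.generate("Explain speculative decoding in one paragraph.", max_tokens=128))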
@@ -33,7 +41,7 @@
        """Initialize the universal decoder.

        Args:
-            target_model: Any mlx_lm loaded model
+            target_model: Any mlx_lm loaded model, or path/ID to load
            tokenizer: Tokenizer for the model
            draft_model_path: Optional path to pre-converted DFlash drafter
            draft_layers: Number of draft layers (if creating generic drafter)
@@ -41,19 +49,47 @@
            block_size: Number of tokens per draft block
            device: MLX device
        """
-        self.target_model = target_model
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.device = device

+        # Resolve target model
+        if isinstance(target_model, str):
+            print(f"[UniversalDFlash] Loading target model: {target_model}...")
+            self.loaded_target = load_target_model(target_model)
+            self.target_model = self.loaded_target.model
+        elif hasattr(target_model, 'adapter'):
+            # Already a LoadedTargetModel
+            self.loaded_target = target_model
+            self.target_model = target_model.model
+        else:
+            # Raw mlx_lm model — detect architecture
+            print("[UniversalDFlash] Detecting model architecture...")
+            self.target_model = target_model
+            # Try to build adapter from model attributes
+            arch = detect_model_architecture(target_model)
+            print(f"[UniversalDFlash] Detected architecture: {arch}")
+            # Create minimal LoadedTargetModel wrapper
+            from .adapters import MLXTargetAdapter, adapter_for_model_type
+            adapter_cls = adapter_for_model_type(arch)
+            if adapter_cls is None:
+                adapter_cls = MLXTargetAdapter
+            adapter = adapter_cls(model=target_model, config={"model_type": arch})
+            self.loaded_target = LoadedTargetModel(
+                requested_model="unknown",
+                resolved_model_path=None,
+                model=target_model,
+                tokenizer=tokenizer,
+                adapter=adapter,
+            )
+
        # Determine model type and vocab size
        self.vocab_size = getattr(tokenizer, "vocab_size", 151936)
-        self.target_config = self._extract_target_config(target_model)
+        self.target_config = self._extract_target_config(self.target_model)

        # Load or create draft model
        if draft_model_path:
-            print(f"[UniversalDFlash] Loading pre-built drafter from {draft_model_path}")
-            from .convert import load_mlx_dflash
+            print(f"[UniversalDFlash] Loading pre-built drafter from {draft_model_path}...")
            self.draft_model, self.draft_config = load_mlx_dflash(draft_model_path)
        else:
            print("[UniversalDFlash] Creating generic drafter for your model...")
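The resolution logic above accepts three forms of target_model; a sketch of each (model IDs illustrative):

    # 1. A path or Hub ID string -- resolved through load_target_model()
    decoder = UniversalDFlashDecoder("mlx-community/Qwen3-4B-4bit", tokenizer)

    # 2. An existing LoadedTargetModel (detected via its .adapter attribute)
    loaded = load_target_model("mlx-community/Qwen3-4B-4bit")
    decoder = UniversalDFlashDecoder(loaded, loaded.tokenizer)

    # 3. A raw mlx_lm model -- architecture is detected and an adapter is built
    decoder = UniversalDFlashDecoder(model, tokenizer)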
@@ -63,9 +99,9 @@
            )
            self.draft_config = None

-        # Create the speculative decoder
+        # Create the speculative decoder with architecture-aware adapter
        self.decoder = DFlashSpeculativeDecoder(
-            target_model=self.target_model,
+            target_model=self.loaded_target,
            draft_model=self.draft_model,
            tokenizer=tokenizer,
            block_size=block_size,
@@ -85,6 +121,7 @@
        config['intermediate_size'] = getattr(model_config, 'intermediate_size', 14336)
        config['num_attention_heads'] = getattr(model_config, 'num_attention_heads', 32)
        config['num_key_value_heads'] = getattr(model_config, 'num_key_value_heads', 8)
+        config['model_type'] = getattr(model_config, 'model_type', 'unknown')
    else:
        # Default Qwen3-4B-like config
        config = {
@@ -94,6 +131,7 @@
            'intermediate_size': 14336,
            'num_attention_heads': 32,
            'num_key_value_heads': 8,
+            'model_type': 'unknown',
        }

        return config
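The extraction is getattr-based, so attributes missing from the model config silently fall back to the Qwen3-4B-like defaults; a toy illustration (SimpleNamespace stands in for an mlx_lm config object):

    from types import SimpleNamespace

    cfg = SimpleNamespace(hidden_size=2048, model_type="llama")
    getattr(cfg, 'hidden_size', 4096)       # -> 2048 (present on the config)
    getattr(cfg, 'num_key_value_heads', 8)  # -> 8 (fallback default)
    getattr(cfg, 'model_type', 'unknown')   # -> 'llama'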
@@ -107,16 +145,26 @@

        This creates an untrained drafter that can be trained or used
        with pre-trained weights from a similar architecture.
+
+        The draft model is sized proportionally to the target model's
+        hidden dimension for feature compatibility.
        """
        # Determine architecture compatibility
        hidden_size = self.target_config.get('hidden_size', 4096)
        vocab_size = self.target_config.get('vocab_size', 151936)
+        num_layers = self.target_config.get('num_layers', 32)

        # Scale drafter based on target model size
+        # Aim for ~1B params (common for draft models)
        num_heads = draft_hidden_size // 64  # ~64 dims per head
        num_kv_heads = max(1, num_heads // 4)
        intermediate_size = int(draft_hidden_size * 2.75)  # Standard SwiGLU ratio

+        # Target layer ids for feature extraction
+        target_layer_ids = DFlashDraftModel._build_target_layer_ids(
+            None, num_layers, draft_layers
+        )
+
        drafter = DFlashDraftModel(
            vocab_size=vocab_size,
            hidden_size=draft_hidden_size,
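To make the sizing rule concrete, a worked example (a draft_hidden_size of 1024 is illustrative):

    draft_hidden_size = 1024
    num_heads = draft_hidden_size // 64                # -> 16 heads (~64 dims each)
    num_kv_heads = max(1, num_heads // 4)              # -> 4 KV heads (4:1 GQA ratio)
    intermediate_size = int(draft_hidden_size * 2.75)  # -> 2816 (SwiGLU ratio)

Note that 2816 is also the fallback intermediate_size the save path writes to config.json later in this diff.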
@@ -126,8 +174,9 @@
            intermediate_size=intermediate_size,
            max_seq_len=8192,
            block_size=self.block_size,
-            mask_token_id=0,  # Will be
-            num_target_layers=
+            mask_token_id=0,  # Will be overridden by tokenizer
+            num_target_layers=num_layers,
+            target_layer_ids=target_layer_ids,
        )

        return drafter
@@ -145,14 +194,20 @@
    ) -> str:
        """Train a custom DFlash drafter for your target model.

+        Uses the training recipe from the DFlash paper:
+        - KV injection with target model features
+        - Random anchor sampling for block construction
+        - Sparse attention masking within blocks
+        - Position-dependent loss decay
+
        Args:
            dataset: Path to training dataset or HF dataset name
            max_seq_length: Maximum sequence length for training
-            epochs: Number of training epochs
+            epochs: Number of training epochs (paper: 6)
            batch_size: Training batch size
-            lr: Learning rate
-            warmup_ratio: Warmup ratio for cosine schedule
-            grad_clip: Gradient clipping threshold
+            lr: Learning rate (paper: 6e-4)
+            warmup_ratio: Warmup ratio for cosine schedule (paper: 0.04)
+            grad_clip: Gradient clipping threshold (paper: 1.0)
            output_path: Where to save the trained drafter

        Returns:
@@ -161,6 +216,9 @@
        from .trainer import DFlashTrainer

        print(f"[UniversalDFlash] Training custom drafter...")
+        print(f"  Dataset: {dataset}")
+        print(f"  Epochs: {epochs}, Batch size: {batch_size}, LR: {lr}")
+
        trainer = DFlashTrainer(
            target_model=self.target_model,
            drafter=self.draft_model,
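A hedged invocation sketch: the method name train_drafter and the dataset name are assumptions (the def line sits outside the hunks shown), and the values mirror the paper recipe quoted in the docstring:

    drafter_path = decoder.train_drafter(        # method name assumed from the docstring
        dataset="HuggingFaceH4/ultrachat_200k",  # illustrative dataset
        max_seq_length=2048,
        epochs=6,            # paper: 6
        batch_size=8,
        lr=6e-4,             # paper: 6e-4
        warmup_ratio=0.04,   # paper: 0.04
        grad_clip=1.0,       # paper: 1.0
        output_path="./my_drafter",
    )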
@@ -196,7 +254,17 @@

        # Save weights
        weights = dict(self.draft_model.parameters())
-        mx.savez(str(path / "weights.npz"), **weights)
+
+        # Try multiple formats
+        try:
+            np_weights = {k: np.array(v) for k, v in weights.items()}
+            np.savez(str(path / "weights.npz"), **np_weights)
+        except Exception:
+            try:
+                mx.savez(str(path / "weights.npz"), **weights)
+            except Exception as e:
+                print(f"[Save] Error saving weights: {e}")
+                raise

        # Save config
        config = {
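For round-tripping, a sketch of reading the saved artifacts back (file names come from the save code above; the output directory is illustrative):

    import json
    import numpy as np

    with open("my_drafter/config.json") as f:       # written by the save path above
        cfg = json.load(f)

    weights = dict(np.load("my_drafter/weights.npz"))  # {param name: array}
    print(cfg["block_size"], len(weights), "tensors")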
@@ -205,9 +273,11 @@
            "num_hidden_layers": self.draft_model.num_layers,
            "num_attention_heads": self.draft_model.num_heads,
            "num_key_value_heads": self.draft_model.num_heads // 4,
-            "intermediate_size": self.draft_model.layers[0].mlp.gate_proj.weight.shape[1],
+            "intermediate_size": self.draft_model.layers[0].mlp.gate_proj.weight.shape[1]
+            if hasattr(self.draft_model.layers[0].mlp.gate_proj, 'weight') else 2816,
            "max_position_embeddings": self.draft_model.max_seq_len,
            "block_size": self.draft_model.block_size,
+            "target_layer_ids": self.draft_model.target_layer_ids,
        }

        with open(path / "config.json", "w") as f:
@@ -221,7 +291,8 @@
        max_tokens: int = 2048,
        temperature: float = 0.0,
        stop_strings: Optional[List[str]] = None,
-    ) -> str:
+        stream: bool = False,
+    ) -> str | Any:
        """Generate text using DFlash speculative decoding.

        Args:
@@ -229,15 +300,17 @@
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            stop_strings: Optional stop strings
+            stream: If True, returns a generator yielding text deltas

        Returns:
-            Generated text
+            Generated text string, or generator if stream=True
        """
        return self.decoder.generate(
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stop_strings=stop_strings,
+            stream=stream,
        )

    def benchmark(
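Usage sketch for the new stream flag (per the docstring above, the generator yields text deltas):

    # Non-streaming: returns the complete string
    text = decoder.generate("Write a haiku about MLX.", max_tokens=64)

    # Streaming: iterate deltas as they are accepted
    for delta in decoder.generate("Write a haiku about MLX.", max_tokens=64, stream=True):
        print(delta, end="", flush=True)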
@@ -256,31 +329,8 @@
        Returns:
            Dict with speedup metrics
        """
-        import time
-
-        # Warmup run
-        self.generate(prompt, max_tokens=10)
-
-        # DFlash generation
-        dflash_times = []
-        for _ in range(num_runs):
-            start = time.time()
-            self.generate(prompt, max_tokens=max_tokens)
-            dflash_times.append(time.time() - start)
-
-        # Baseline generation (without speculative decoding)
-        # We estimate based on token count vs time
-        # In practice you'd run a full baseline comparison
-
-        avg_time = sum(dflash_times) / len(dflash_times)
-        tokens_per_sec = max_tokens / avg_time
-
-        print(f"[Benchmark] Avg time: {avg_time:.2f}s, Speed: {tokens_per_sec:.1f} tok/s")
-
-        return {
-            "avg_time_sec": avg_time,
-            "tokens_per_sec": tokens_per_sec,
-            "num_runs": num_runs,
-        }
+        return self.decoder.benchmark(
+            prompt=prompt,
+            max_tokens=max_tokens,
+            num_runs=num_runs,
+        )
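Benchmark usage sketch; the returned keys are assumed to match the old inline implementation removed above (avg_time_sec, tokens_per_sec, num_runs):

    stats = decoder.benchmark("Summarize the DFlash approach.", max_tokens=256, num_runs=3)
    print(f"{stats['tokens_per_sec']:.1f} tok/s over {stats['num_runs']} runs")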