tritesh
/

dflash-mlx-universal

@@ -3,19 +3,24 @@ Convert PyTorch DFlash drafter models to MLX format.
 Handles weight conversion from PyTorch safetensors to MLX arrays,
 compatible with any z-lab DFlash drafter.
 """
 import json
 import os
 from pathlib import Path
-from typing import Optional, Dict
 import mlx.core as mx
-from transformers import AutoConfig, AutoModel
 from huggingface_hub import hf_hub_download, snapshot_download
 def _convert_key(key: str) -> str:
-    """Convert PyTorch parameter names to MLX format."""
     # Replace PyTorch-specific prefixes
     key = key.replace("model.", "")
     # Standardize naming
@@ -45,8 +50,10 @@ def _convert_key(key: str) -> str:
 def _transpose_if_needed(key: str, tensor) -> mx.array:
-    """Transpose linear layer weights from PyTorch to MLX format."""
-    # Linear layers in PyTorch are [out, in], MLX expects [in, out]
     if "proj" in key or "fc" in key or "lm_head" in key or "embed" in key:
         if len(tensor.shape) == 2:
             return mx.array(tensor.T)
@@ -79,12 +86,12 @@ def convert_dflash_to_mlx(
     repo_path = snapshot_download(
         repo_id=pytorch_model_id,
         token=token,
-        ignore_patterns=["*.md", "*.png", "*.jpg"],
     )
     repo_path = Path(repo_path)
-    # Load PyTorch model to extract config
-    print("[Convert] Loading PyTorch model for config extraction...")
     config = AutoConfig.from_pretrained(
         repo_path,
         trust_remote_code=trust_remote_code,
@@ -103,26 +110,43 @@ def convert_dflash_to_mlx(
         "block_size": getattr(config, "block_size", 16),
         "rope_base": getattr(config, "rope_theta", 10000.0),
     }
     # Load weights from safetensors
     print("[Convert] Loading weights from safetensors...")
     try:
         from safetensors.torch import load_file
-        weights_file = repo_path / "model.safetensors"
-        if weights_file.exists():
-            pt_weights = load_file(str(weights_file))
         else:
-            # Try to find any .safetensors file
-            safetensors_files = list(repo_path.glob("*.safetensors"))
-            if safetensors_files:
-                pt_weights = load_file(str(safetensors_files[0]))
             else:
-                raise FileNotFoundError("No safetensors file found")
     except ImportError:
         # Fallback to torch load
         import torch
         weights_file = repo_path / "pytorch_model.bin"
-        pt_weights = torch.load(str(weights_file), map_location="cpu")
     # Convert weights
     print(f"[Convert] Converting {len(pt_weights)} parameters...")
@@ -131,72 +155,114 @@ def convert_dflash_to_mlx(
         mlx_key = _convert_key(key)
         mlx_weights[mlx_key] = _transpose_if_needed(key, tensor)
-    # Save MLX weights
-    weights_path = output_path / "weights.safetensors"
-    print(f"[Convert] Saving to {weights_path}...")
-    # Save using MLX
-    mx.save_safetensors(str(weights_path), mlx_weights)
     # Save config
     config_path = output_path / "config.json"
     with open(config_path, "w") as f:
         json.dump(dflash_config, f, indent=2)
-    # Save target model info
     target_info = {
         "source_model": pytorch_model_id,
-        "target_model": _infer_target_model(pytorch_model_id),
     }
     info_path = output_path / "model_info.json"
     with open(info_path, "w") as f:
         json.dump(target_info, f, indent=2)
     print(f"[Convert] Done! Model saved to {output_path}")
     return str(output_path)
def _infer_target_model(dflash_model_id: str) -> str:
    """Infer the target model from a DFlash drafter ID.

    The drafter ID is first checked against a table of known drafter
    checkpoint names; if none matches, a per-family default is returned.
    All matching is a case-insensitive substring test, so casing variants of
    a known checkpoint (e.g. ``qwen3.5-27b-dflash``) resolve to the exact
    target instead of silently falling through to a family default of a
    different size.

    Args:
        dflash_model_id: Drafter identifier, typically a Hugging Face repo ID.

    Returns:
        The target model repo ID, or ``"unknown"`` when nothing matches.
    """
    # Map drafter IDs to target models (checked in declaration order).
    mapping = {
        "Qwen3-4B-DFlash": "Qwen/Qwen3-4B",
        "Qwen3-8B-DFlash": "Qwen/Qwen3-8B",
        "Qwen3.5-9B-DFlash": "Qwen/Qwen3.5-9B",
        "Qwen3.5-27B-DFlash": "Qwen/Qwen3.5-27B",
        "Qwen3.6-27B-DFlash": "Qwen/Qwen3.6-27B",
        "Qwen3.6-35B-A3B-DFlash": "Qwen/Qwen3.6-35B-A3B",
        "Qwen3-Coder-30B-A3B-DFlash": "Qwen/Qwen3-Coder-30B-A3B",
        "Qwen3.5-122B-A10B-DFlash": "Qwen/Qwen3.5-122B-A10B",
        "LLaMA3.1-8B-Instruct-DFlash": "meta-llama/Llama-3.1-8B-Instruct",
        "gemma-4-31B-it-DFlash": "google/gemma-4-31b-it",
        "gpt-oss-20b-DFlash": "openai/gpt-oss-20b",
        "Kimi-K2.5-DFlash": "moonshotai/Kimi-K2.5",
        "MiniMax-M2.5-DFlash": "MiniMax/MiniMax-M2.5",
    }

    model_id = dflash_model_id.lower()

    for key, target in mapping.items():
        if key.lower() in model_id:
            return target

    # Generic inference. Order matters: "qwen3" is a prefix of the more
    # specific "qwen3.5" / "qwen3.6" markers, so it must be tested last.
    family_defaults = (
        ("qwen3.6", "Qwen/Qwen3.6-27B"),
        ("qwen3.5", "Qwen/Qwen3.5-9B"),
        ("qwen3", "Qwen/Qwen3-4B"),
        ("llama", "meta-llama/Llama-3.1-8B-Instruct"),
        ("gemma", "google/gemma-4-31b-it"),
    )
    for marker, target in family_defaults:
        if marker in model_id:
            return target

    return "unknown"
 def load_mlx_dflash(
     model_path: str,
-) -> tuple:
     """Load a converted MLX DFlash model.
     Args:
@@ -214,8 +280,20 @@ def load_mlx_dflash(
         config = json.load(f)
     # Load weights
-    weights = mx.load(str(model_path / "weights.safetensors"))
     # Build model
     model = DFlashDraftModel(
         vocab_size=config["vocab_size"],
@@ -227,9 +305,32 @@ def load_mlx_dflash(
         max_seq_len=config["max_position_embeddings"],
         block_size=config.get("block_size", 16),
         rope_base=config.get("rope_base", 10000.0),
     )
     # Load weights into model
     model.update(weights)
     return model, config

 Handles weight conversion from PyTorch safetensors to MLX arrays,
 compatible with any z-lab DFlash drafter.
+Updated to work with the universal adapter system for any target model family.
 """
 import json
 import os
 from pathlib import Path
+from typing import Optional, Dict, Tuple
 import mlx.core as mx
+from transformers import AutoConfig
 from huggingface_hub import hf_hub_download, snapshot_download
 def _convert_key(key: str) -> str:
+    """Convert PyTorch parameter names to MLX format.
+    Handles various naming conventions across model families.
+    """
     # Replace PyTorch-specific prefixes
     key = key.replace("model.", "")
     # Standardize naming
 def _transpose_if_needed(key: str, tensor) -> mx.array:
+    """Transpose linear layer weights from PyTorch to MLX format.
+    Linear layers in PyTorch are [out, in], MLX expects [in, out].
+    """
     if "proj" in key or "fc" in key or "lm_head" in key or "embed" in key:
         if len(tensor.shape) == 2:
             return mx.array(tensor.T)
     repo_path = snapshot_download(
         repo_id=pytorch_model_id,
         token=token,
+        ignore_patterns=["*.md", "*.png", "*.jpg", "*.gif", "*.jpeg"],
     )
     repo_path = Path(repo_path)
+    # Load PyTorch model config
+    print("[Convert] Loading PyTorch config...")
     config = AutoConfig.from_pretrained(
         repo_path,
         trust_remote_code=trust_remote_code,
         "block_size": getattr(config, "block_size", 16),
         "rope_base": getattr(config, "rope_theta", 10000.0),
     }
+    # Extract target layer IDs if present in config
+    if hasattr(config, "target_layer_ids"):
+        dflash_config["target_layer_ids"] = config.target_layer_ids
+    elif hasattr(config, "dflash_config") and hasattr(config.dflash_config, "target_layer_ids"):
+        dflash_config["target_layer_ids"] = config.dflash_config.target_layer_ids
     # Load weights from safetensors
     print("[Convert] Loading weights from safetensors...")
     try:
         from safetensors.torch import load_file
+        # Find all safetensors files
+        safetensors_files = sorted(repo_path.glob("*.safetensors"))
+        if safetensors_files:
+            pt_weights = {}
+            for st_file in safetensors_files:
+                print(f"  Loading {st_file.name}...")
+                partial = load_file(str(st_file))
+                pt_weights.update(partial)
         else:
+            # Try pytorch_model.bin
+            bin_file = repo_path / "pytorch_model.bin"
+            if bin_file.exists():
+                import torch
+                pt_weights = torch.load(str(bin_file), map_location="cpu")
             else:
+                raise FileNotFoundError("No safetensors or pytorch_model.bin found")
     except ImportError:
         # Fallback to torch load
         import torch
         weights_file = repo_path / "pytorch_model.bin"
+        if weights_file.exists():
+            pt_weights = torch.load(str(weights_file), map_location="cpu")
+        else:
+            raise FileNotFoundError("No weight files found and safetensors not installed")
     # Convert weights
     print(f"[Convert] Converting {len(pt_weights)} parameters...")
         mlx_key = _convert_key(key)
         mlx_weights[mlx_key] = _transpose_if_needed(key, tensor)
+    # Save MLX weights as weights.npz (NumPy savez first, falling back to mx.savez)
+    weights_path = output_path / "weights.npz"
+    try:
+        # Use numpy format if safetensors save is problematic
+        import numpy as np
+        np_weights = {k: np.array(v) for k, v in mlx_weights.items()}
+        np.savez(str(weights_path), **np_weights)
+        print(f"[Convert] Saved weights to {weights_path}")
+    except Exception as e:
+        print(f"[Convert] Warning: Could not save weights: {e}")
+        # Try direct mlx save
+        try:
+            mx.savez(str(weights_path), **mlx_weights)
+        except Exception as e2:
+            print(f"[Convert] Error saving weights: {e2}")
+            raise
     # Save config
     config_path = output_path / "config.json"
     with open(config_path, "w") as f:
         json.dump(dflash_config, f, indent=2)
+    # Save target model mapping
     target_info = {
         "source_model": pytorch_model_id,
+        "target_model": infer_target_model(pytorch_model_id),
+        "conversion_date": str(Path(__file__).stat().st_mtime),
     }
     info_path = output_path / "model_info.json"
     with open(info_path, "w") as f:
         json.dump(target_info, f, indent=2)
     print(f"[Convert] Done! Model saved to {output_path}")
+    print(f"  Config: {dflash_config}")
+    print(f"  Target: {target_info['target_model']}")
     return str(output_path)
def infer_target_model(dflash_model_id: str) -> str:
    """Infer the target model from a DFlash drafter ID.

    The drafter ID is first checked against a table of known drafter
    checkpoint names; if none matches, a per-family default is returned.
    All matching is a case-insensitive substring test, so casing variants of
    a known checkpoint (e.g. ``gemma-4-26b-a4b-it-dflash``) resolve to the
    exact target instead of silently falling through to a family default of
    a different size.

    Args:
        dflash_model_id: Drafter identifier, typically a Hugging Face repo ID.

    Returns:
        The target model repo ID, or ``"unknown"`` when nothing matches.
    """
    # Map drafter IDs to target models (checked in declaration order).
    mapping = {
        # Qwen3 series
        "Qwen3-4B-DFlash": "Qwen/Qwen3-4B",
        "Qwen3-8B-DFlash": "Qwen/Qwen3-8B",
        "Qwen3-32B-DFlash": "Qwen/Qwen3-32B",
        # Qwen3.5 series
        "Qwen3.5-4B-DFlash": "Qwen/Qwen3.5-4B",
        "Qwen3.5-9B-DFlash": "Qwen/Qwen3.5-9B",
        "Qwen3.5-27B-DFlash": "Qwen/Qwen3.5-27B",
        "Qwen3.5-35B-A3B-DFlash": "Qwen/Qwen3.5-35B-A3B",
        "Qwen3.5-122B-A10B-DFlash": "Qwen/Qwen3.5-122B-A10B",
        # Qwen3.6 series
        "Qwen3.6-27B-DFlash": "Qwen/Qwen3.6-27B",
        "Qwen3.6-35B-A3B-DFlash": "Qwen/Qwen3.6-35B-A3B",
        # Qwen Coder
        "Qwen3-Coder-Next-DFlash": "Qwen/Qwen3-Coder-Next",
        "Qwen3-Coder-30B-A3B-DFlash": "Qwen/Qwen3-Coder-30B-A3B",
        # LLaMA
        "LLaMA3.1-8B-Instruct-DFlash": "meta-llama/Llama-3.1-8B-Instruct",
        "LLaMA3.1-70B-Instruct-DFlash": "meta-llama/Llama-3.1-70B-Instruct",
        # Gemma
        "gemma-4-31B-it-DFlash": "google/gemma-4-31b-it",
        "gemma-4-26B-A4B-it-DFlash": "google/gemma-4-26b-a4b-it",
        # GPT-OSS
        "gpt-oss-20b-DFlash": "openai/gpt-oss-20b",
        "gpt-oss-120b-DFlash": "openai/gpt-oss-120b",
        # Kimi
        "Kimi-K2.5-DFlash": "moonshotai/Kimi-K2.5",
        # MiniMax
        "MiniMax-M2.5-DFlash": "MiniMax/MiniMax-M2.5",
    }

    model_id = dflash_model_id.lower()

    # Direct mapping lookup
    for key, target in mapping.items():
        if key.lower() in model_id:
            return target

    # Generic inference by model family. Order matters: "qwen3" is a prefix
    # of the more specific Qwen markers, so it must be tested after them.
    family_defaults = (
        ("qwen3.6", "Qwen/Qwen3.6-27B"),
        ("qwen3.5", "Qwen/Qwen3.5-9B"),
        ("qwen3-coder", "Qwen/Qwen3-Coder-Next"),
        ("qwen3", "Qwen/Qwen3-4B"),
        ("llama", "meta-llama/Llama-3.1-8B-Instruct"),
        ("gemma", "google/gemma-4-31b-it"),
        ("gpt-oss", "openai/gpt-oss-20b"),
        ("kimi", "moonshotai/Kimi-K2.5"),
        ("minimax", "MiniMax/MiniMax-M2.5"),
    )
    for marker, target in family_defaults:
        if marker in model_id:
            return target

    return "unknown"
 def load_mlx_dflash(
     model_path: str,
+) -> Tuple:
     """Load a converted MLX DFlash model.
     Args:
         config = json.load(f)
     # Load weights
+    weights_path = model_path / "weights.npz"
+    if not weights_path.exists():
+        # Try alternative extensions
+        for ext in [".safetensors", ".mlx", ".npz"]:
+            alt = model_path / f"weights{ext}"
+            if alt.exists():
+                weights_path = alt
+                break
+    if not weights_path.exists():
+        raise FileNotFoundError(f"No weights found in {model_path}")
+    weights = mx.load(str(weights_path))
     # Build model
     model = DFlashDraftModel(
         vocab_size=config["vocab_size"],
         max_seq_len=config["max_position_embeddings"],
         block_size=config.get("block_size", 16),
         rope_base=config.get("rope_base", 10000.0),
+        target_layer_ids=config.get("target_layer_ids", None),
     )
     # Load weights into model
     model.update(weights)
     return model, config
def main(argv=None):
    """CLI entry point for conversion.

    Parses the command-line options and forwards them to
    ``convert_dflash_to_mlx``.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads the process command line.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Convert PyTorch DFlash drafter to MLX")
    parser.add_argument("--model", required=True, help="HF model ID of PyTorch drafter")
    parser.add_argument("--output", required=True, help="Output directory")
    # The default stays True so existing invocations behave as before.
    # NOTE(review): trusting remote code by default executes repository-supplied
    # Python during config loading; consider flipping the default.
    parser.add_argument(
        "--trust-remote-code",
        dest="trust_remote_code",
        action="store_true",
        default=True,
        help="Allow custom code from the model repo (default: enabled)",
    )
    # With store_true and default=True alone the option could never be
    # disabled; this companion flag provides the off switch.
    parser.add_argument(
        "--no-trust-remote-code",
        dest="trust_remote_code",
        action="store_false",
        help="Do not execute custom code from the model repo",
    )
    parser.add_argument("--token", default=None, help="HF token for gated models")
    args = parser.parse_args(argv)

    convert_dflash_to_mlx(
        pytorch_model_id=args.model,
        output_path=args.output,
        trust_remote_code=args.trust_remote_code,
        token=args.token,
    )


if __name__ == "__main__":
    main()