Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

README.md +69 -0
config.json +68 -0
config.py +95 -0
model.safetensors +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,69 @@

+---
+library_name: speculators
+base_model:
+- poolside/Laguna-XS.2
+license: apache-2.0
+tags:
+- speculative-decoding
+- dflash
+- speculators
+---
+# RedHatAI/Laguna-XS.2-speculator.dflash
+This is a DFlash speculator model for [poolside/Laguna-XS.2](https://huggingface.co/poolside/Laguna-XS.2).
+## Training Details
+This model was trained using the [Speculators](https://github.com/vllm-project/speculators) library on a combination of [Magpie-Align/Magpie-Llama-3.1-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.1-Pro-300K-Filtered) and the `train_sft` split of [HuggingFaceH4/ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k). Responses were regenerated by Laguna-XS.2 (with reasoning).
+## Model Specifications
+| | |
+|---|---|
+| **Base Model** | poolside/Laguna-XS.2 |
+| **Chat Template** | poolside/Laguna-XS.2 (use `/chat/completions` endpoint) |
+| **Format** | Safetensors |
+| **License** | Apache 2.0 |
+| **Validation Hardware** | NVIDIA A100 |
+## Deployment
+```bash
+# Install vLLM from the required PR
+pip install git+https://github.com/vllm-project/vllm.git@refs/pull/41880/head
+# Deploy with speculative decoding
+VLLM_USE_DEEP_GEMM=0 vllm serve poolside/Laguna-XS.2 \
+    --tensor-parallel-size 1 \
+    --max-model-len 16384 \
+    --tool-call-parser poolside_v1 \
+    --reasoning-parser poolside_v1 \
+    --enable-auto-tool-choice \
+    --default-chat-template-kwargs '{"enable_thinking": true}' \
+    --speculative-config '{
+        "model": "poolside/Laguna-XS.2-speculator.dflash",
+        "num_speculative_tokens": 7,
+        "method": "dflash"
+    }'
+```
+## Preliminary Evaluations
+Per-position token acceptance rates across datasets (with reasoning enabled):
+| Dataset | Pos 1 | Pos 2 | Pos 3 | Pos 4 | Pos 5 | Pos 6 | Pos 7 | Avg Length |
+|---------|-------|-------|-------|-------|-------|-------|-------|------------|
+| HumanEval | 74.0% | 48.6% | 29.9% | 17.7% | 9.9% | 5.1% | 2.4% | 2.876 |
+| math_reasoning | 76.9% | 53.2% | 34.6% | 21.2% | 12.1% | 6.0% | 2.6% | 3.066 |
+| qa | 68.5% | 41.8% | 24.8% | 14.7% | 8.4% | 4.6% | 2.2% | 2.650 |
+| question | 70.6% | 44.1% | 26.2% | 15.0% | 8.4% | 4.5% | 2.3% | 2.711 |
+| rag | 71.7% | 45.7% | 27.6% | 16.0% | 8.9% | 4.8% | 2.3% | 2.770 |
+| summarization | 68.8% | 40.8% | 22.7% | 12.3% | 6.5% | 3.3% | 1.5% | 2.559 |
+| translation | 70.8% | 44.3% | 25.0% | 13.0% | 6.5% | 3.1% | 1.2% | 2.639 |
+| writing | 70.9% | 44.6% | 26.8% | 15.8% | 9.4% | 5.4% | 2.3% | 2.752 |
+## References
+**Paper**: [DFlash: Block Diffusion for Flash Speculative Decoding](https://arxiv.org/abs/2602.06036)

config.json ADDED Viewed

	@@ -0,0 +1,68 @@

+{
+  "architectures": [
+    "DFlashDraftModel"
+  ],
+  "auto_map": {
+    "": "config.DFlashSpeculatorConfig"
+  },
+  "aux_hidden_state_layer_ids": [
+    1,
+    9,
+    17,
+    36,
+    39
+  ],
+  "block_size": 8,
+  "draft_vocab_size": 32000,
+  "dtype": "bfloat16",
+  "mask_token_id": 12,
+  "max_anchors": 3072,
+  "speculators_config": {
+    "algorithm": "dflash",
+    "default_proposal_method": "greedy",
+    "proposal_methods": [
+      {
+        "accept_tolerance": 0.0,
+        "proposal_type": "greedy",
+        "speculative_tokens": 7,
+        "verifier_accept_k": 1
+      }
+    ],
+    "verifier": {
+      "architectures": [],
+      "name_or_path": "poolside/Laguna-XS.2"
+    }
+  },
+  "speculators_model_type": "dflash",
+  "speculators_version": "0.5.0.dev97",
+  "target_hidden_size": null,
+  "tie_word_embeddings": false,
+  "transformer_layer_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": 8192,
+    "max_position_embeddings": 131072,
+    "mlp_bias": false,
+    "model_type": "llama",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 5,
+    "num_key_value_heads": 8,
+    "pad_token_id": null,
+    "pretraining_tp": 1,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "rope_theta": 10000.0,
+      "rope_type": "default"
+    },
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "vocab_size": 100352
+  },
+  "transformers_version": "5.6.2"
+}

config.py ADDED Viewed

	@@ -0,0 +1,95 @@

+from typing import Any, Literal
+from pydantic import Field, field_serializer, field_validator
+from transformers import AutoConfig, PretrainedConfig
+from transformers.models.qwen3.modeling_qwen3 import (
+    Qwen3Config,
+)
+from speculators import SpeculatorModelConfig
+__all__ = [
+    "DFlashSpeculatorConfig",
+]
+@SpeculatorModelConfig.register("dflash")
+class DFlashSpeculatorConfig(SpeculatorModelConfig):
+    """
+    Configuration for DFlash speculator with vocabulary mapping.
+    DFlash features vocabulary mapping between draft (64K) and target (128K)
+    vocabularies, enabling cross-tokenizer speculation.
+    :param transformer_layer_config: Configuration for the transformer decoder layer
+    :param draft_vocab_size: Size of draft model vocabulary for speculation
+    """
+    speculators_model_type: Literal["dflash"] = "dflash"
+    architectures: list[str] = Field(
+        default_factory=lambda: ["DFlashSpeculator"],
+        description="Model architectures that can load these weights",
+    )
+    transformer_layer_config: PretrainedConfig = Field(
+        default_factory=Qwen3Config,
+        description="Configuration for the transformer decoder layer",
+    )
+    draft_vocab_size: int = Field(
+        default=32000,
+        description="Size of draft model vocabulary for speculation",
+    )
+    block_size: int = Field(
+        default=8,
+        description=(
+            "Default size of the draft block predicted with a forward pass of the model"
+        ),
+    )
+    max_anchors: int = Field(
+        default=256,
+        description=(
+            "Maximum number of anchor positions to sample during training "
+            "(controls memory usage and training efficiency)"
+        ),
+    )
+    target_hidden_size: int | None = Field(
+        default=None,
+        description="Hidden size of the target model (if different from draft model)",
+    )
+    aux_hidden_state_layer_ids: list[int] | None = Field(
+        default=None,
+        description="Layer IDs of the DFlash auxiliary hidden state layers",
+    )
+    mask_token_id: int | None = Field(
+        default=None,
+        description="Token ID used for masking",
+    )
+    @field_serializer("transformer_layer_config")
+    def serialize_transformer_config(self, value: PretrainedConfig) -> dict:
+        """Serialize transformer config to dict."""
+        return value.to_diff_dict()
+    @field_validator("transformer_layer_config", mode="before")
+    @classmethod
+    def validate_transformer_config(cls, value: Any) -> PretrainedConfig:
+        """Validate and convert transformer config."""
+        if isinstance(value, dict):
+            config_class: type[PretrainedConfig] = Qwen3Config
+            if "model_type" in value:
+                config_class = AutoConfig.for_model(
+                    model_type=value["model_type"]
+                ).__class__
+            return config_class(**value)
+        return value
+    @property
+    def target_vocab_size(self) -> int:
+        """Get target vocabulary size from transformer config."""
+        return self.transformer_layer_config.vocab_size

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e08bf72a99f27d92af25595422baf5421a58d8d5634ddfec7a60f7cb21d964a7
+size 1213617016