# ==================== configuration_neollm.py ====================
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging

logger = logging.get_logger(__name__)


class NeoLLMConfig(PretrainedConfig):
    r"""
    Configuration class for the NeoLLM model architecture.

    Instantiates a NeoLLM model according to the specified arguments, defining
    the full architecture including attention mechanisms, normalization,
    periodicity modeling, an optional Leviathan continuous token embedding
    generator, an optional Leviathan-JTok-M token-indexed modulation module,
    and optional Spelling Bee character-level embedding augmentation.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
    and can be used to control the model outputs. Read the documentation from
    :class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, *optional*, defaults to 64402):
            Vocabulary size of the NeoLLM model. Defines the number of
            different tokens that can be represented by the ``input_ids``.
        hidden_size (:obj:`int`, *optional*, defaults to 512):
            Dimensionality of the hidden representations.
        intermediate_size (:obj:`int`, *optional*, defaults to 1536):
            Dimensionality of the MLP feed-forward intermediate
            representations.
        num_hidden_layers (:obj:`int`, *optional*, defaults to 12):
            Number of decoder Transformer layers.
        num_attention_heads (:obj:`int`, *optional*, defaults to 8):
            Number of query attention heads per layer.
        num_key_value_heads (:obj:`int`, *optional*, defaults to 4):
            Number of key/value attention heads (GQA). Must divide
            ``num_attention_heads`` evenly.
        hidden_act (:obj:`str`, *optional*, defaults to ``"xielu"``):
            Non-linear activation function used in the MLP layers.
        max_position_embeddings (:obj:`int`, *optional*, defaults to 32768):
            Maximum sequence length supported by the positional encoding.
        initializer_range (:obj:`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated-normal weight initializer.
        rms_norm_eps (:obj:`float`, *optional*, defaults to 1e-6):
            Epsilon for RMS normalization and SeeDNorm layers.
        tie_word_embeddings (:obj:`bool`, *optional*, defaults to ``False``):
            Whether to share weights between the input embedding matrix and
            the output language-model head. Automatically forced to ``False``
            when ``use_token_generator=True``, because the generator produces
            input representations via a learned smooth surface and the output
            head must remain an independent dense projection.
        rope_theta (:obj:`float`, *optional*, defaults to 10000.0):
            Base period for Rotary Position Embeddings (RoPE).
        rope_scaling (:obj:`dict`, *optional*):
            Dictionary containing the RoPE scaling configuration. Must contain
            at least the key ``"rope_type"`` (or ``"type"``). Validated by
            :func:`~transformers.modeling_rope_utils.rope_config_validation`.
        partial_rotary_factor (:obj:`float`, *optional*, defaults to 0.25):
            Fraction of each attention head's dimension to rotate with RoPE.
        attention_bias (:obj:`bool`, *optional*, defaults to ``False``):
            Whether to add a bias term to Q, K, V, and output projections.
        attention_dropout (:obj:`float`, *optional*, defaults to 0.1):
            Dropout probability applied to attention weights during training.
        head_dim (:obj:`int`, *optional*, defaults to 64):
            Dimensionality of each attention head.
        use_momentum_attention (:obj:`bool`, *optional*, defaults to ``True``):
            Enable post-RoPE Momentum Attention: applies a causal
            first-difference shear to Q and K before the dot-product score
            computation.
        momentum_gamma (:obj:`float`, *optional*, defaults to 0.10):
            Mixing coefficient for the Momentum Attention shear. Ignored when
            ``use_momentum_attention=False``.
        use_mea_attention (:obj:`bool`, *optional*, defaults to ``False``):
            Enable Multi-head Explicit Attention (MEA), which applies a
            learned head-level linear composition over K and V initialized as
            identity, allowing inter-head interaction to emerge freely from
            step 0.
        mea_component_key_value_heads (:obj:`int`, *optional*):
            Number of component K/V heads used by MEA. Defaults to
            ``num_key_value_heads`` when ``None``.
        mea_groupnorm_eps (:obj:`float`, *optional*, defaults to 1e-6):
            Epsilon for the GQA-grouped SeeDNorm applied to the MEA attention
            output.
        use_lucid_attention (:obj:`bool`, *optional*, defaults to ``False``):
            Enable LUCID attention: applies a lower-triangular solve to
            precondition the value states using the causal key-key similarity
            matrix in RKHS, decorrelating keys to reduce attentional noise in
            long-context settings (Duvvuri et al., 2026).
        lucid_attention_eps (:obj:`float`, *optional*, defaults to 1e-6):
            Epsilon for RMS key normalization inside the LUCID preconditioner.
        use_affine_scaled_attention (:obj:`bool`, *optional*, defaults to ``True``):
            Enable Affine-Scaled Attention (Bae et al., 2026). Applies an
            input-dependent per-head scaling factor α and a moving-average
            bias β directly to the softmax-normalized attention weights before
            the weighted sum with V:

                [α(X) · softmax(QKᵀ/√dk) + β(X)] V

            This relaxes the unit-sum constraint of softmax, reduces
            first-token bias, increases attention entropy, and promotes head
            diversity. Orthogonal to Gated Attention: the gate modulates the
            post-SDPA output, while Affine-Scaled modulates the softmax
            weights directly. Only active in eager attention mode (flash
            kernels do not expose intermediate softmax weights).
        affine_momentum (:obj:`float`, *optional*, defaults to 0.9):
            EMA momentum coefficient ρ for the running average α_ma used to
            compute the bias term β(X) = (α_ma − α(X)) / N. Controls the
            trade-off between the running estimate and the current batch
            statistic. Ignored when ``use_affine_scaled_attention=False``.
        use_xsa (:obj:`bool`, *optional*, defaults to ``True``):
            Enable Exclusive Self Attention (Zhai, 2026). After the SDPA
            output is computed, removes from each head's output the component
            that falls along the direction of the token's own value vector,
            forcing the attention layer to carry only contextual information
            orthogonal to self-position. Two paths depending on active
            components:

            - **MEA active or LUCID active**: ``v_ref`` is the value vector
              after MEA mixing and after LUCID preconditioning — the vector
              that actually participated in the SDPA aggregation.
            - **MEA and LUCID inactive**: ``v_ref`` is the raw value
              projection — standard XSA as described in the paper.

            Applied after MEAHeadSeeDNorm and before the Gated Attention gate.
            Gains increase with sequence length.
        xsa_eps (:obj:`float`, *optional*, defaults to 1e-6):
            Epsilon for the denominator of the XSA projection to prevent
            division by zero: ``‖v_ref‖² + xsa_eps``. Ignored when
            ``use_xsa=False``.
        fan_ratio (:obj:`float`, *optional*, defaults to 0.125):
            Ratio controlling the periodic-dimension size in FANformer
            attention. The transformed representation has dimension
            ``hidden_size * (1 + fan_ratio)``.
        fan_ratio_ffn (:obj:`float`, *optional*, defaults to 0.0625):
            Ratio controlling the periodic-dimension size in FANformer MLP
            layers. Set to half of ``fan_ratio`` to model complementary
            periodicities in the feature space.
        dropout_rate (:obj:`float`, *optional*, defaults to 0.1):
            General dropout probability for attention outputs and MLP states.
        use_token_generator (:obj:`bool`, *optional*, defaults to ``False``):
            Replace the discrete vocabulary embedding lookup table with a
            **Leviathan** continuous token generator (Batley & Saha, 2026).
            When enabled:

            - ``tie_word_embeddings`` is forced to ``False``.
            - ``model.embed_tokens`` is replaced by ``model.token_generator``
              (:class:`LeviathanGenerator`).
            - The input-embedding parameter budget scales as
              ``O(k · ⌈V^{1/k}⌉ · d_seed)`` instead of ``O(V · D)``.
            - When ``use_jtokm=True``, the generator additionally returns
              ``z_tilde`` and ``B_vals`` for reuse by every decoder layer,
              avoiding redundant B-spline evaluation.

            See :class:`LeviathanGenerator` in ``modeling_neollm.py``.
        generator_d_seed (:obj:`int`, *optional*, defaults to 128):
            Dimensionality of the latent seed space ``z̃ ∈ [0,1]^{d_seed}``.
            Also used as the per-dimension input to each generator head's
            B-spline expansion and as the residual dimension for JTok-M
            surfaces when ``use_jtokm=True``.
        generator_num_modes (:obj:`int`, *optional*, defaults to 8):
            Number of independent per-head generator modes. Each mode has its
            own preprocessing (Dense + LayerNorm + sigmoid(x/2)), its own
            learnable per-dimension scale, its own spline weights, and its own
            output projection. Head outputs are summed to form the embedding.
        generator_num_knots (:obj:`int`, *optional*, defaults to 32):
            Number of B-spline knot points on ``[0, 1]``. Shared between the
            input generator heads and all JTok-M surfaces.
        generator_spline_degree (:obj:`int`, *optional*, defaults to 2):
            Polynomial degree of the B-spline basis. Kept for documentation;
            the closed-form KHRONOS quadratic kernel is used in practice.
        generator_k (:obj:`int`, *optional*, defaults to 3):
            Number of coordinate dimensions for latent compositional indexing.
        generator_krank (:obj:`int`, *optional*, defaults to 32):
            Output rank of each per-head KHRONOS tensor-product kernel inside
            the Leviathan generator. Each of the ``generator_num_modes`` heads
            produces a vector of this dimensionality via the tensor product
            aggregation, which is then projected independently to
            ``hidden_size`` via ``head_out[i]``. The original ``ckhronos.py``
            implementation used ``krank=64``. Ignored when
            ``use_token_generator=False``.
        use_jtokm (:obj:`bool`, *optional*, defaults to ``False``):
            Enable the **Leviathan-JTok-M** token-indexed modulation module
            (Yang et al., 2026; fused with Leviathan geometry).

            Unlike the original JTok paper which maintains discrete embedding
            tables of size ``V × d`` per layer — reintroducing the vocabulary
            tax in every decoder layer — this implementation operates over the
            Leviathan latent coordinate ``z̃_x``. Parameter cost scales with
            ``n_e × M_mod × d_seed × n_knots`` per layer rather than
            ``V × d``, breaking the linear dependency on vocabulary size.
            Additionally, tokens with nearby latent coordinates receive
            structurally related modulations, introducing continuity that the
            discrete formulation cannot express.

            Architecture per decoder layer when active:

            1. **Surface pool**: ``n_e`` independent CP-separable surfaces,
               each sharing the same ``z̃_x`` and ``B(z̃_x)`` produced by the
               generator. Surface ``i`` computes:

               .. math::
                   m^{\\ell}_{x,i} = W^{\\ell,i}_{\\text{out}}
                   [M^{\\ell,i}_1, \\ldots, M^{\\ell,i}_{M_{\\text{mod}}}]^\\top
                   + W^{\\ell,i}_{\\text{res}}\\, \\tilde{z}_x

            2. **Context router**: a linear projection of ``RMSNorm(h̃^ℓ_x)``
               — the hidden state *after* attention — produces ``n_e`` routing
               logits. TopK selects K surfaces; Sigmoid-normalized weights
               (not Softmax) avoid inter-surface competition:

               .. math::
                   w^{\\ell}_i = \\frac{\\sigma(g^{\\ell}_i)}
                   {\\sum_{j \\in \\mathcal{G}^{\\ell}_x} \\sigma(g^{\\ell}_j)}

            3. **Additive injection** with LNS-coordinated scaling:

               .. math::
                   \\Delta r^{\\ell}_x = \\frac{1}{\\sqrt{2\\ell}} \\cdot
                   s^{\\ell} \\odot \\text{Norm}_{\\varepsilon}(e^{\\ell}_x)

               .. math::
                   h^{\\ell+1}_x = \\tilde{h}^{\\ell}_x + \\Delta m^{\\ell}_x
                   + \\Delta r^{\\ell}_x

               The ``1/√(2ℓ)`` factor — where ``ℓ`` is the 1-indexed layer
               index — is coordinated with the existing LNS factor ``1/√ℓ``
               to maintain a **constant JTok-M / backbone contribution ratio**
               of ``1/√2 ≈ 0.707`` at every depth.

            4. **Load-balancing loss** (averaged over all layers):

               .. math::
                   \\mathcal{L}_{\\text{aux}} = \\lambda \\cdot n_e
                   \\sum_{i=1}^{n_e} p_i f_i

            Requires ``use_token_generator=True``.
        jtokm_num_experts (:obj:`int`, *optional*, defaults to 4):
            Number of independent CP-separable modulation surfaces ``n_e`` per
            decoder layer.
        jtokm_top_k (:obj:`int`, *optional*, defaults to 2):
            Number of surfaces selected by the router per token per layer (K).
            Must satisfy ``1 ≤ jtokm_top_k < jtokm_num_experts``.
        jtokm_num_modes (:obj:`int`, *optional*, defaults to 4):
            Number of rank-1 separable modes ``M_mod`` per JTok-M surface.
        jtokm_aux_loss_weight (:obj:`float`, *optional*, defaults to 1e-4):
            Coefficient ``λ`` for the load-balancing auxiliary loss.
        jtokm_norm_eps (:obj:`float`, *optional*, defaults to 1e-6):
            Epsilon for L2 normalisation of modulation vectors.
        use_spelling_bee_embeddings (:obj:`bool`, *optional*, defaults to ``True``):
            Augment token embeddings with character-level byte information
            (Rabe, Clymo & Dong, 2026). Each token's UTF-8 encoding (up to 16
            bytes) is embedded through a shared ``nn.Embedding(256, d)``
            table. Byte embeddings are position-encoded with RoPE using
            intra-token byte positions (not sequence positions), summed and
            normalised by ``√byte_len``, then averaged with the standard token
            embedding:

            .. math::
                e_{\\text{bee}}(t) = \\frac{1}{2}\\left(e_{\\text{tok}}(t) +
                \\frac{1}{\\sqrt{|t|}}\\sum_{i=1}^{16}
                \\text{RoPE}(e_{\\text{byte}}[b_i], i)\\right)

            Adds ``256 × hidden_size`` parameters (≈0.13M for d=512). Zero
            inference overhead when ``bake_inference_table()`` is called after
            training. Compatible with all four combinations of
            ``use_token_generator`` and ``use_spelling_bee_embeddings``.

            **Setup required**: call
            ``model.model.spelling_bee.set_byte_table(tokenizer)`` once after
            model instantiation (handled automatically by ``setup_model`` in
            ``train.py``).

            Reference: Rabe, Clymo & Dong (2026). *Spelling Bee Embeddings for
            Language Modeling.* arXiv:2601.18030.
        use_hadamard_o_proj (:obj:`bool`, *optional*, defaults to ``True``):
            Replace the dense ``W_O ∈ R^{d×d}`` output projection in every
            multi-head attention block with a fixed Walsh–Hadamard Transform
            followed by a learnable per-channel affine rescaling
            ``α ⊙ FWHT(x)/√d + β``. The WHT is a parameter-free orthogonal
            matrix whose singular values are all identically 1, so the
            effective condition number is ``κ = 1`` by construction and cannot
            grow during training. This directly addresses the high-κ
            pathology (κ up to 10^5) observed in the dense ``o_proj``
            matrices, which causes FP8 per-tensor quantisation to lose
            low-magnitude directions entirely.

            Parameter reduction: replaces ``d²`` weights with ``2d`` (``α``
            and ``β``), saving ≈25% of attention parameters per block.
            Requires ``hidden_size`` to be a power of 2 (512 ✓, 1024 ✓,
            768 ✗).

            Reference: Aggarwal & Kumar (2026). *Rethinking Attention Output
            Projection: Structured Hadamard Transforms for Efficient
            Transformers.* arXiv:2603.08343.
        use_repo (:obj:`bool`, *optional*, defaults to ``True``):
            Enable Context Re-Positioning (REPO) in attention layers at or
            above ``repo_start_layer`` (Li et al., 2026). REPO replaces the
            fixed linear position indices ``0…L-1`` fed to RoPE with
            continuous, data-dependent positions ``z_i = f_ϕ(h_i)`` learned
            end-to-end. The attention score between tokens ``i`` and ``j``
            becomes:

            .. math::
                A^{\\text{REPO}}_{i,j} = q_i^\\top\\,
                g_\\theta(z_j - z_i)\\, k_j

            where ``g_θ`` is the standard RoPE rotation and ``z_i`` is
            predicted from the hidden state ``h_i`` by a lightweight SwiGLU
            sub-layer ``f_ϕ``:

            .. math::
                r_i = \\text{Swish}(h_i W_g) \\odot (h_i W_c), \\quad
                z_i^{(h)} = r_i w_z^{(h)}

            ``W_g, W_c \\in \\mathbb{R}^{d \\times d_p}`` are shared across
            all query heads within a layer; ``w_z^{(h)} \\in
            \\mathbb{R}^{d_p}`` is learned independently per head.

            The assigned positions are real-valued and unconstrained — the
            model may learn constant (NoPE-like), monotonic (RoPE-like), or
            hybrid patterns as needed. Lower layers
            (``layer_idx < repo_start_layer``) retain the standard integer
            RoPE positions because they primarily capture surface-level,
            locally-dependent features that benefit less from re-positioning
            (Li et al., 2026, §3).

            Overhead: +0.9% parameters; inference latency negligible.
        repo_start_layer (:obj:`int`, *optional*, defaults to ``num_hidden_layers // 3``):
            Index of the first decoder layer to which REPO is applied. Layers
            ``[0, repo_start_layer)`` continue to use standard integer RoPE
            positions. Must satisfy
            ``0 <= repo_start_layer < num_hidden_layers``. Ignored when
            ``use_repo=False``.
        repo_d_p (:obj:`int`, *optional*, defaults to ``hidden_size // 8``):
            Dimensionality of the intermediate position representation
            ``r_i \\in \\mathbb{R}^{d_p}`` inside ``f_ϕ``. The paper sets
            ``d_p = d/8`` on the assumption that positional information is
            less rich than the full hidden representation. Ignored when
            ``use_repo=False``.
        use_laurel (:obj:`bool`, *optional*, defaults to ``False``):
            Enable the Learned Augmented Residual Layer (LAUREL) framework
            (Menghani, Kumar & Kumar, ICML 2025). LAUREL generalises the
            canonical residual connection:

            .. math::
                x_{i+1} = \\alpha \\cdot f(x_i) + g(x_i)

            where :math:`g` is a learned linear function operating on the
            residual stream. Applied independently to both the attention and
            MLP sublayers of every decoder layer. At least one of
            ``use_laurel_rw`` or ``use_laurel_lr`` must be ``True`` when this
            flag is active; both may be active simultaneously, producing the
            combined **LAUREL-RW+LR** variant (paper eq. 5).

            Incompatible with ``use_attn_res=True`` — both methods modify the
            residual stream and their interaction is undefined.

            Reference: Menghani, G., Kumar, R. & Kumar, S. (2025). *LAUREL:
            Learned Augmented Residual Layer.* ICML 2025.
        use_laurel_rw (:obj:`bool`, *optional*, defaults to ``False``):
            Enable the **LAUREL-RW** (Residual Weights) variant. Assigns
            independent learned scalars :math:`\\alpha, \\beta` to the
            sublayer output and residual respectively:

            .. math::
                x_{i+1} = \\alpha_s \\cdot f(x_i) + \\beta_s \\cdot x_i

            :math:`\\alpha_s, \\beta_s =
            \\text{softmax}([\\tilde{\\alpha}, \\tilde{\\beta}])` so that they
            are non-negative and sum to 1, preventing unbounded growth (paper
            §2.1). Adds **2 parameters per sublayer** (4 per decoder layer).

            When combined with ``use_laurel_lr=True`` (LAUREL-RW+LR, paper
            eq. 5):

            .. math::
                x_{i+1} = \\alpha_s \\cdot f(x_i) +
                \\beta_s \\cdot (B A x_i + x_i)

            Ignored when ``use_laurel=False``.
        use_laurel_lr (:obj:`bool`, *optional*, defaults to ``False``):
            Enable the **LAUREL-LR** (Low-Rank) variant. Augments the residual
            with a rank-``laurel_lr_rank`` correction:

            .. math::
                x_{i+1} = f(x_i) + B A x_i + x_i

            where :math:`A \\in \\mathbb{R}^{D \\times r}` and
            :math:`B \\in \\mathbb{R}^{r \\times D}` are learnable matrices
            (paper eq. 3). :math:`A` is initialised with column-orthogonal
            values :math:`A_{i,j} = 1/\\sqrt{rD}` if :math:`i \\bmod r = j`
            else 0; :math:`B` is initialised to zero — matching the LoRA
            convention and ensuring the residual starts as identity (paper
            §3.3). Adds **2·r·D parameters per sublayer** (4·r·D per decoder
            layer). Ignored when ``use_laurel=False``.
        laurel_lr_rank (:obj:`int`, *optional*, defaults to ``32``):
            Rank ``r`` of the low-rank matrices in LAUREL-LR. The paper
            recommends :math:`r \\in \\{32, 48, 64\\}` for LLMs (paper §3.3).
            Ignored when ``use_laurel=False`` or ``use_laurel_lr=False``.

    .. note::
        Additional experimental flags — Directional Routing
        (``use_directional_routing``), Attention Residuals (``use_attn_res``),
        the ResFormer cross-layer FAN residual (``use_fan_residual``),
        VersatileFFN (``use_versatile_ffn``), PolyNorm exclusivity
        (``polynorm_exclusive``), and Interleaved Head Attention
        (``use_iha``) — are documented by the inline comments next to their
        parameters and assignments in :meth:`__init__`.

    Constraints:
        - ``use_jtokm=True`` requires ``use_token_generator=True``.
        - ``1 ≤ jtokm_top_k < jtokm_num_experts`` when ``use_jtokm=True``.
        - ``use_spelling_bee_embeddings=True`` requires calling
          ``model.model.spelling_bee.set_byte_table(tokenizer)`` before
          training (handled automatically by ``setup_model``).
        - ``repo_start_layer`` must satisfy
          ``0 <= repo_start_layer < num_hidden_layers`` when
          ``use_repo=True``.
        - ``use_laurel=True`` is incompatible with ``use_attn_res=True``.
        - When ``use_laurel=True``, at least one of ``use_laurel_rw`` or
          ``use_laurel_lr`` must be ``True``.

    Examples::

        >>> from configuration_neollm import NeoLLMConfig
        >>> from modeling_neollm import NeoLLMForCausalLM

        >>> # Standard dense-embedding model
        >>> config = NeoLLMConfig(use_token_generator=False,
        ...                       tie_word_embeddings=True)
        >>> model = NeoLLMForCausalLM(config)

        >>> # Full attention stack
        >>> config_full = NeoLLMConfig(
        ...     use_affine_scaled_attention=True,
        ...     use_xsa=True,
        ...     use_lucid_attention=True,
        ... )
        >>> model_full = NeoLLMForCausalLM(config_full)

        >>> # Leviathan generator + JTok-M
        >>> config_jtokm = NeoLLMConfig(
        ...     use_token_generator=True,
        ...     use_jtokm=True,
        ... )

        >>> # REPO: context re-positioning from layer 4 onward (default for 12 layers)
        >>> config_repo = NeoLLMConfig(
        ...     use_repo=True,
        ...     # repo_start_layer defaults to num_hidden_layers // 3 = 4
        ...     # repo_d_p defaults to hidden_size // 8 = 64
        ... )

    References:
        Bae, J. et al. (2026). *Affine-Scaled Attention: Towards Flexible and
        Stable Transformer Attention.* arXiv:2602.23057.
        Duvvuri, S. et al. (2026). *LUCID: Attention with Preconditioned
        Representations.* arXiv:2602.10410.
        Zhai, S. (2026). *Exclusive Self Attention.* arXiv:2603.09078.
        Batley, R. T. & Saha, S. (2026). *A Separable Architecture for
        Continuous Token Representation in Language Models.* arXiv:2601.22040.
        Yang, Y. et al. (2026). *JTok: On Token Embedding as Another Axis of
        Scaling Law via Joint Token Self-Modulation.* arXiv:2602.00800.
        Robinson, M. et al. (2025). *Token Embeddings Violate the Manifold
        Hypothesis.* arXiv:2504.01002.
        Rabe, M. N., Clymo, J. & Dong, Z. (2026). *Spelling Bee Embeddings for
        Language Modeling.* arXiv:2601.18030.
        Li, H., Zhao, T., Cai, D. & Sproat, R. (2026). *REPO: Language Models
        with Context Re-Positioning.* arXiv:2512.14391.
        Menghani, G., Kumar, R. & Kumar, S. (2025). *LAUREL: Learned Augmented
        Residual Layer.* ICML 2025. arXiv:2411.07501.
    """

    model_type = "neollm"
    keys_to_ignore_at_inference = []

    def __init__(
        self,
        vocab_size=64402,
        hidden_size=512,
        intermediate_size=1536,
        num_hidden_layers=12,
        num_attention_heads=8,
        num_key_value_heads=4,
        hidden_act="xielu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.25,
        attention_bias=False,
        attention_dropout=0.1,
        head_dim=64,
        use_momentum_attention=True,
        momentum_gamma=0.10,
        use_mea_attention=False,
        mea_component_key_value_heads=None,
        mea_groupnorm_eps=1e-6,
        use_lucid_attention=False,
        lucid_attention_eps=1e-6,
        use_affine_scaled_attention=True,
        affine_momentum=0.9,
        use_xsa=True,
        xsa_eps=1e-6,
        # ── Directional Routing (Taylor, 2026) ────────────────────────────
        use_directional_routing=False,
        directional_routing_k=4,
        directional_routing_temp=3.0,
        # ── Attention Residuals (Kimi Team, 2026) ─────────────────────────
        use_attn_res=False,
        attn_res_num_blocks=4,
        # ── ResFormer cross-layer FAN residual (He et al., 2023) ─────────
        use_fan_residual=False,
        fan_ratio=0.125,
        fan_ratio_ffn=0.0625,
        dropout_rate=0.1,
        # ── Leviathan continuous token generator ──────────────────────────
        use_token_generator=False,
        generator_d_seed=128,
        generator_num_modes=8,
        generator_num_knots=32,
        generator_spline_degree=2,
        generator_k=3,
        generator_krank=32,
        # ── Leviathan-JTok-M token-indexed modulation ─────────────────────
        use_jtokm=False,
        jtokm_num_experts=4,
        jtokm_top_k=2,
        jtokm_num_modes=4,
        jtokm_aux_loss_weight=1e-4,
        jtokm_norm_eps=1e-6,
        # ── Hadamard output projection (Aggarwal & Kumar, 2026) ───────────
        use_hadamard_o_proj=True,
        # ── PolyNorm exclusivity ──────────────────────────────────────────
        polynorm_exclusive=False,
        # ── Spelling Bee Embeddings (Rabe et al., 2026) ───────────────────
        use_spelling_bee_embeddings=True,
        # ── Context Re-Positioning (Li et al., 2026) ──────────────────────
        use_repo=True,
        repo_start_layer=None,
        repo_d_p=None,
        # ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
        use_versatile_ffn=False,
        versatile_total_experts=4,
        versatile_active_experts=2,
        versatile_max_depth=2,
        versatile_gumbel_temp_start=5.0,
        versatile_gumbel_temp_end=0.1,
        versatile_gumbel_temp_decay=0.99984,
        versatile_aux_loss_weight=1e-5,
        # ── LAuReL: Learned Augmented Residual Layer (Menghani et al., 2025) ─
        use_laurel=False,
        use_laurel_rw=False,
        use_laurel_lr=False,
        laurel_lr_rank=32,
        # ── Interleaved Head Attention (Duvvuri et al., 2026) ─────────────
        use_iha=False,
        iha_num_pseudo_heads=2,  # P=2 → 2×2 = 4 patterns per head
        iha_local_global_pattern="LLLLG",  # 4 local + 1 global (paper §5.1)
        iha_sliding_window=None,  # auto = N // (2*P^2) using the actual batch sequence length
        **kwargs,
    ):
        # ── Generator / tying consistency ─────────────────────────────────
        # The Leviathan generator replaces the input lookup table, so tied
        # input/output embeddings are structurally impossible: silently
        # untie and warn rather than raise.
        if use_token_generator and tie_word_embeddings:
            logger.warning(
                "`use_token_generator=True` is incompatible with "
                "`tie_word_embeddings=True`. "
                "Automatically setting `tie_word_embeddings=False`. "
                "The continuous generator replaces the discrete lookup table "
                "with a learned smooth surface, so input and output parameters "
                "are always structurally decoupled."
            )
            tie_word_embeddings = False

        # ── JTok-M / generator dependency ─────────────────────────────────
        # JTok-M surfaces consume the generator's latent coordinate z̃_x, so
        # they cannot exist without the generator.
        if use_jtokm and not use_token_generator:
            raise ValueError(
                "`use_jtokm=True` requires `use_token_generator=True`. "
                "The JTok-M surfaces are defined over the Leviathan latent "
                "coordinate z̃_x, which is only produced when the generator "
                "is active. Set `use_token_generator=True` or disable JTok-M."
            )

        # ── JTok-M top-k sanity ────────────────────────────────────────────
        if use_jtokm and not (1 <= jtokm_top_k < jtokm_num_experts):
            raise ValueError(
                f"`jtokm_top_k` must satisfy 1 <= jtokm_top_k < jtokm_num_experts, "
                f"got jtokm_top_k={jtokm_top_k}, jtokm_num_experts={jtokm_num_experts}."
            )

        # ── REPO: resolve defaults and validate ───────────────────────────
        # repo_start_layer defaults to num_hidden_layers // 3, matching the
        # paper's 1/3-of-depth heuristic (Li et al., 2026, §3).
        # repo_d_p defaults to hidden_size // 8, matching the paper's
        # assumption that positional information is less rich than the full
        # hidden representation (Li et al., 2026, §3.2).
        # Note: both defaults are resolved even when use_repo=False, so the
        # stored attributes are always concrete integers.
        if repo_start_layer is None:
            repo_start_layer = num_hidden_layers // 3
        if repo_d_p is None:
            repo_d_p = hidden_size // 8
        if use_repo and not (0 <= repo_start_layer < num_hidden_layers):
            raise ValueError(
                f"`repo_start_layer` must satisfy "
                f"0 <= repo_start_layer < num_hidden_layers, "
                f"got repo_start_layer={repo_start_layer}, "
                f"num_hidden_layers={num_hidden_layers}."
            )

        # ── VersatileFFN: validate expert configuration ────────────────────
        # Experts partition the FFN intermediate dimension, hence the
        # divisibility requirement.
        if use_versatile_ffn:
            if not (1 <= versatile_active_experts < versatile_total_experts):
                raise ValueError(
                    f"`versatile_active_experts` must satisfy "
                    f"1 <= versatile_active_experts < versatile_total_experts, "
                    f"got {versatile_active_experts} vs {versatile_total_experts}."
                )
            if intermediate_size % versatile_total_experts != 0:
                raise ValueError(
                    f"`intermediate_size` ({intermediate_size}) must be divisible by "
                    f"`versatile_total_experts` ({versatile_total_experts})."
                )

        # ── IHA / MEA compatibility ───────────────────────────────────────
        # The implementation keeps both modules in-place:
        #   IHA acts first on Q/K/V component heads.
        #   MEA then applies its [H_comp, H_kv] mixing independently inside
        #   each IHA pseudo-slot on K/V.
        # This preserves IHA's pseudo-head structure and the GQA ratio
        # (H_q*P) / (H_kv*P) = H_q / H_kv without moving other attention ops.
        if use_iha and iha_num_pseudo_heads < 1:
            raise ValueError(
                f"`iha_num_pseudo_heads` must be >= 1, got {iha_num_pseudo_heads}."
            )

        # ── LAuReL: mutual exclusion and sub-flag consistency ─────────────
        # use_laurel and use_attn_res both modify the residual stream and are
        # structurally incompatible: AttnRes replaces the accumulation entirely
        # with learned depth-wise attention, while LAuReL scales/augments the
        # additive residual in-place.
        if use_laurel and use_attn_res:
            raise ValueError(
                "`use_laurel=True` is incompatible with `use_attn_res=True`. "
                "Both methods modify the residual stream: AttnRes replaces it "
                "with depth-wise softmax attention, while LAuReL applies learned "
                "scalar/low-rank augmentation in-place. Enable at most one."
            )
        if use_laurel and not use_laurel_rw and not use_laurel_lr:
            raise ValueError(
                "`use_laurel=True` requires at least one sub-variant to be active. "
                "Set `use_laurel_rw=True` and/or `use_laurel_lr=True`."
            )

        # PretrainedConfig consumes tie_word_embeddings (possibly untied
        # above) plus any remaining HF-standard kwargs.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

        # ── Core Transformer ──────────────────────────────────────────────
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps

        # ── Positional encoding ───────────────────────────────────────────
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.partial_rotary_factor = partial_rotary_factor

        # ── Attention ─────────────────────────────────────────────────────
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim
        self.use_momentum_attention = use_momentum_attention
        self.momentum_gamma = momentum_gamma
        self.use_mea_attention = use_mea_attention
        # MEA component heads fall back to the GQA K/V head count when not
        # explicitly overridden.
        self.mea_component_key_value_heads = (
            num_key_value_heads
            if mea_component_key_value_heads is None
            else int(mea_component_key_value_heads)
        )
        self.mea_groupnorm_eps = mea_groupnorm_eps
        self.use_lucid_attention = use_lucid_attention
        self.lucid_attention_eps = lucid_attention_eps
        self.use_affine_scaled_attention = use_affine_scaled_attention
        self.affine_momentum = affine_momentum
        self.use_xsa = use_xsa
        self.xsa_eps = xsa_eps

        # ── Directional Routing ───────────────────────────────────────────
        self.use_directional_routing = use_directional_routing
        self.directional_routing_k = directional_routing_k
        self.directional_routing_temp = directional_routing_temp

        # ── Attention Residuals ───────────────────────────────────────────
        # use_attn_res=True: replace fixed residual accumulation with learned
        # depth-wise softmax attention over preceding layer outputs.
        # attn_res_num_blocks=0: Full AttnRes — all previous layer outputs
        #   are kept as sources (N grows to num_hidden_layers+1).
        # attn_res_num_blocks=4: Block AttnRes — 4 block summaries maximum,
        #   block_size = num_hidden_layers // 4 = 3 layers per block.
        # Memory cost: O(num_blocks × batch × seq × hidden) instead of
        # O(num_layers × batch × seq × hidden).
        self.use_attn_res = use_attn_res
        self.attn_res_num_blocks = attn_res_num_blocks

        # Validate rope_theta / rope_scaling / partial_rotary_factor — must
        # run after those attributes are assigned above.
        rope_config_validation(self)

        # ── FANformer periodicity ─────────────────────────────────────────
        self.use_fan_residual = use_fan_residual
        self.fan_ratio = fan_ratio
        self.fan_ratio_ffn = fan_ratio_ffn

        # ── Regularization ────────────────────────────────────────────────
        self.dropout_rate = dropout_rate

        # ── Leviathan generator ───────────────────────────────────────────
        self.use_token_generator = use_token_generator
        self.generator_d_seed = generator_d_seed
        self.generator_num_modes = generator_num_modes
        self.generator_num_knots = generator_num_knots
        self.generator_spline_degree = generator_spline_degree
        self.generator_k = generator_k
        self.generator_krank = generator_krank

        # ── Leviathan-JTok-M ─────────────────────────────────────────────
        self.use_jtokm = use_jtokm
        self.jtokm_num_experts = jtokm_num_experts
        self.jtokm_top_k = jtokm_top_k
        self.jtokm_num_modes = jtokm_num_modes
        self.jtokm_aux_loss_weight = jtokm_aux_loss_weight
        self.jtokm_norm_eps = jtokm_norm_eps

        # ── Hadamard output projection (Aggarwal & Kumar, 2026) ───────────
        self.use_hadamard_o_proj = use_hadamard_o_proj

        # ── PolyNorm exclusivity ──────────────────────────────────────────
        self.polynorm_exclusive = polynorm_exclusive

        # ── Spelling Bee Embeddings (Rabe et al., 2026) ───────────────────
        self.use_spelling_bee_embeddings = use_spelling_bee_embeddings

        # ── Context Re-Positioning (Li et al., 2026) ──────────────────────
        self.use_repo = use_repo
        self.repo_start_layer = repo_start_layer
        self.repo_d_p = repo_d_p

        # ── VersatileFFN (Nie et al., 2026) ───────────────────────────────
        self.use_versatile_ffn = use_versatile_ffn
        self.versatile_total_experts = versatile_total_experts
        self.versatile_active_experts = versatile_active_experts
        self.versatile_max_depth = versatile_max_depth
        self.versatile_gumbel_temp_start = versatile_gumbel_temp_start
        self.versatile_gumbel_temp_end = versatile_gumbel_temp_end
        self.versatile_gumbel_temp_decay = versatile_gumbel_temp_decay
        self.versatile_aux_loss_weight = versatile_aux_loss_weight

        # ── LAuReL: Learned Augmented Residual Layer (Menghani et al., 2025) ─
        self.use_laurel = use_laurel
        self.use_laurel_rw = use_laurel_rw
        self.use_laurel_lr = use_laurel_lr
        self.laurel_lr_rank = laurel_lr_rank

        # ── Interleaved Head Attention (Duvvuri et al., 2026) ─────────────
        # use_iha=True: enables learned cross-head mixing of Q, K, V.
        # iha_num_pseudo_heads (P): number of pseudo-heads per original head.
        #   P=1: lightweight cross-head linear mixing, fully shape-preserving,
        #        compatible with all other attention flags.
        #   P>1: full IHA with pseudo-head expansion and collapse.
        #        If MEA is active, MEA composes K/V independently inside each
        #        pseudo-slot after IHA, so both remain compatible.
        # iha_local_global_pattern: paper Sec. 5.1 hybrid schedule.
        #   "LLLLG" → 4 sliding-window local layers + 1 global layer per cycle.
        #   Applied only when P>1 (P=1 never needs FLOP compensation).
        # iha_sliding_window: window size W for local-IHA layers.
        #   None → auto = N/(2P²) with N = actual sequence length at forward
        #          time (paper Sec. 5.1 / Appendix C exact recipe).
        #   int  → use the provided explicit window size as-is.
        # Init: identity (IHA ≡ MHA at step 0, Theorem 2 inclusion proof).
        self.use_iha = use_iha
        self.iha_num_pseudo_heads = iha_num_pseudo_heads
        self.iha_local_global_pattern = iha_local_global_pattern
        self.iha_sliding_window = iha_sliding_window

        # Map the custom classes for AutoConfig/AutoModel resolution when the
        # files are loaded with trust_remote_code.
        self.auto_map = {
            "AutoConfig": "configuration_neollm.NeoLLMConfig",
            "AutoModel": "modeling_neollm.NeoLLMModel",
            "AutoModelForCausalLM": "modeling_neollm.NeoLLMForCausalLM",
        }


__all__ = ["NeoLLMConfig"]