\documentclass[11pt,a4paper]{article}

% === Packages ===
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{xcolor}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{listings}
\usepackage[margin=1in]{geometry}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{natbib}

% === Macros ===
\newcommand{\tbd}[1]{\textcolor{red}{\textbf{[TBD: #1]}}}
\newcommand{\specprefill}{\textsc{SpecPrefill}}
\newcommand{\ttft}{\text{TTFT}}

\lstset{
  basicstyle=\ttfamily\small,
  keywordstyle=\color{blue},
  commentstyle=\color{gray},
  breaklines=true,
  frame=single,
  numbers=left,
  numberstyle=\tiny\color{gray},
}

% === Title ===
\title{\specprefill{} on Unified Memory:\\Cross-Architecture Sparse Prefill for\\Large Language Models on Apple Silicon}

\author{
\texttt{github.com/Thump604}
}

\date{March 2026}

\begin{document}
\maketitle

% ============================================================
\begin{abstract}
% ============================================================

Long-context prefill is the dominant latency bottleneck for local LLM inference: a 64K-token prompt on Qwen3.5-122B (MoE, 10B active parameters) takes 7 minutes before the first token appears.
\specprefill{}---attention-based sparse prefill using a draft model---was designed for GPU clusters with discrete memory.
We port it to Apple Silicon's unified memory architecture and generalize it across three model families: transformer Mixture-of-Experts (Qwen3.5), Mamba-2/attention hybrid (Nemotron-H), and sliding-window dense (GPT-OSS\footnote{GPT-OSS refers to a publicly available model by its open-source project designation.}).

On M2~Ultra (128\,GB unified memory), \specprefill{} with a 2B draft model (Qwen3.5-2B, 1.4\,GB, 4-bit) reduces TTFT by $3.71$--$5.45\times$ across 8K--128K tokens on Qwen3.5-122B, cutting 128K prefill from 19.3~minutes to 3.5~minutes.
Composed with system prompt KV caching, end-to-end speedup reaches $5.6\times$ on a 73K-token production workload.
We also achieve $2.10$--$2.19\times$ on Nemotron-H~120B across 8K--64K tokens.
Unified memory eliminates PCIe transfer overhead, making the draft-to-target FLOP ratio the dominant predictor of speedup. We formalize and validate this relationship across six draft/target configurations.
Under adversarial evaluation (0/16 regressions at 20\% keep), LLM-judge, and perplexity analysis, we observe no quality degradation at our recommended operating point.

Our implementation handles architecture-specific challenges including gated queries with per-head normalization (Qwen3.5), SSM-interleaved attention layers without positional encoding (Nemotron-H), and sliding-window cache preservation (GPT-OSS), deployed in a production serving stack with per-request API control.

\end{abstract}

% ============================================================
\section{Introduction}
\label{sec:intro}
% ============================================================

\subsection{The TTFT Problem in Local Inference}

Time-to-first-token (TTFT) is the dominant user-facing latency for large language models serving long-context requests.
On commodity hardware---a Mac Studio with Apple M2~Ultra and 128\,GB unified memory---prefilling a 64K-token prompt through Qwen3.5-122B-A10B (a 122-billion parameter Mixture-of-Experts model with 10 billion active parameters per token) requires \textbf{418~seconds}, nearly 7~minutes before the first output token appears.
Even at 16K tokens, the wait is 92~seconds.

This latency is not a bandwidth problem.
MLX prefill on Apple Silicon is FLOP-limited: the Metal GPU is compute-bound processing each token through the model's forward pass~\cite{mlx}.
Reducing the number of tokens processed during prefill therefore yields near-linear TTFT improvement.

In local inference, the cost of long TTFT is not measured in dollars-per-token but in user time.
A 16K-token context (typical for an IDE coding assistant with tool definitions and file contents) means 92 seconds of waiting before the first response token.
A long creative writing session or research conversation that accumulates 64K tokens of history means 7 minutes per response.
On a serialized single-request engine, every second of prefill also delays all queued requests.

\noindent With \specprefill{}, 128K prefill on the 122B model drops from 19.3~minutes to 3.5~minutes, making long-context requests practical for interactive use.

\subsection{Why Unified Memory Changes the Calculus}

\specprefill{}~\cite{specprefill} addresses TTFT by using a small draft model to identify which prompt tokens are most important via attention scoring, then sparse-prefilling only the selected subset into the target model.
The original formulation assumes a discrete-memory GPU architecture where the draft model either (a) shares GPU VRAM with the target, reducing KV cache headroom, or (b) runs on CPU or a separate GPU, incurring PCIe transfer latency for importance scores.

On Apple Silicon's unified memory architecture, neither penalty applies.
Draft and target models share the same physical address space---the draft's weights (${\sim}$1.4\,GB for a 4-bit 2B model) are simply additional allocations in the same memory pool as the target's ${\sim}$79\,GB.
Scoring requires zero data movement.
On a discrete-GPU system, by contrast, draft scoring would either compete with the target's KV cache for VRAM (reducing effective context length) or require CPU$\leftrightarrow$GPU transfers with latency proportional to prompt length.

This simplifies the cost equation.
Let $C_t$ denote the target model's prefill cost (in FLOPs), $C_d$ the draft model's scoring cost (full prefill plus lookahead steps), and $k$ the fraction of tokens retained.
On unified memory:
\begin{equation}
\label{eq:speedup}
\text{Speedup} = \frac{C_t}{C_d + k \cdot C_t}
\end{equation}
When $C_d \ll C_t$---as with a 2B draft scoring for a 122B MoE target, where the FLOP ratio is approximately $50\times$---speedup approaches $1/k$.
At $k = 0.2$, this predicts up to $4.5\times$; we measure $4.11\times$ at 16K tokens (5 trials), with the gap attributable to overhead from chunk selection, RoPE patching, and memory management.

We term this the \textbf{ratio thesis}: on unified memory where $T = 0$, the draft-to-target FLOP ratio $r$ is the dominant predictor of \specprefill{} benefit, modulated by architecture-dependent overhead $\epsilon$ (Equation~\ref{eq:speedup_ratio}).
Section~\ref{sec:ratio} validates this across six draft/target configurations spanning an $8.5\times$ range of FLOP ratios.

\subsection{Contributions}

\begin{enumerate}
\item \textbf{First implementation on unified memory hardware.}
We implement \specprefill{} on Apple Silicon via the MLX framework, demonstrating that zero-copy scoring shifts the viability threshold, making the technique effective even at moderate prompt lengths (8K tokens).

\item \textbf{Cross-architecture generalization.}
We extend \specprefill{} beyond standard transformers to Mamba-2/attention hybrids (Nemotron-H, where only 8 of 88 target layers have attention) and sliding-window models (GPT-OSS, with RotatingKVCache).
Auto-detecting query extraction handles gated attention with per-head normalization (Qwen3.5), content-based attention without positional encoding (Nemotron-H), and YarnRoPE with sliding-window cache preservation (GPT-OSS).

\item \textbf{Production system integration.}
We show per-request API control with graceful fallback, coexistence with Multi-Token Prediction (MTP) speculative decoding in a three-phase ``Speculative Stack,'' and demonstrate that the FLOP ratio thesis extends to draft model selection---smaller drafts with lower $r$ yield higher speedup.
\end{enumerate}

This is a \textbf{systems paper with algorithmic adaptations}, not a claim of a new algorithm.
The core \specprefill{} idea is due to~\citet{specprefill}; our contributions are making it work across architectures on new hardware with real deployment.


% ============================================================
\section{Background}
\label{sec:background}
% ============================================================

\subsection{SpecPrefill}

\citet{specprefill} observe that during autoregressive generation, the model attends heavily to a small fraction of prompt tokens.
If these important tokens can be identified cheaply, the full prompt need not be prefilled into the target model.
Their method uses a small draft model to score token importance via attention weights, selects the top-$k$\% of tokens (in non-overlapping chunks for spatial locality), and sparse-prefills only the selected subset.

Sparse prefill is possible because Rotary Position Embeddings (RoPE)~\cite{rope} encode \emph{relative} position.
The inner product $Q_m \cdot K_p^T$ depends only on the relative distance $(m - p)$ encoded via rotation angles.
If selected tokens are stored in the KV cache with their \emph{original} position angles, they interact with future decode queries at correct relative distances regardless of gaps.

\subsection{MLX and Apple Silicon Unified Memory}

MLX~\cite{mlx} is Apple's machine learning framework designed for Apple Silicon.
Apple Silicon uses \emph{unified memory}: CPU and GPU share the same physical DRAM through a common memory controller.
There is no PCIe bus, no \texttt{cudaMemcpy}, no distinct VRAM allocation.
Tensors created by any compute unit are immediately accessible to any other.

MLX exploits this with lazy evaluation and reference-counted memory management.
Metal compute shaders execute matrix operations on the GPU.
In practice, a draft model's weights are additional allocations in the same memory pool, accessible at full bandwidth from any compute unit without copying.

\subsection{Target Architectures}

We evaluate \specprefill{} on three architecturally distinct model families, establishing that generalization requires non-trivial adaptations:

\begin{table}[h]
\centering
\small
\begin{tabular}{@{}lllll@{}}
\toprule
\textbf{Model} & \textbf{Architecture} & \textbf{Attention} & \textbf{Position Enc.} & \textbf{Cache} \\
\midrule
Qwen3.5-122B & MoE (10B active) & Gated + q\_norm & Standard RoPE & Standard KV \\
Nemotron-H 120B & Mamba-2 + Attn + MoE & Standard (8/88 layers) & None (SSM) & Compacted \\
GPT-OSS 120B & Dense + sliding window & Standard & YarnRoPE & RotatingKV \\
\bottomrule
\end{tabular}
\caption{Target model architectures. Each requires different query extraction, position handling, and cache management in \specprefill{}.}
\label{tab:architectures}
\end{table}

\textbf{Qwen3.5} uses gated attention where \texttt{q\_proj} outputs $2\times$ the expected width (query concatenated with a gate), requiring a split before per-head RMSNorm (\texttt{q\_norm}) and RoPE application.

\textbf{Nemotron-H} is a hybrid architecture with 40 Mamba-2 SSM layers, 8 full-attention layers, and 40 MoE feed-forward layers.
Positional information is encoded entirely in the SSM state---the attention layers have no RoPE.
Only the attention layers produce Q/K scores usable for importance scoring.

\textbf{GPT-OSS} uses YarnRoPE~\cite{yarn} with a sliding-window attention pattern where alternating layers use \texttt{RotatingKVCache} retaining only the last 128 tokens.


% ============================================================
\section{Method}
\label{sec:method}
% ============================================================

\subsection{Token Importance Scoring}
\label{sec:scoring}

Given a prompt of $M$ tokens, importance scoring proceeds in three phases:

\begin{enumerate}
\item \textbf{Draft prefill.} The full prompt is prefilled into a small same-tokenizer draft model (e.g., Qwen3.5-2B at 4-bit quantization, 1.4\,GB) in chunks of 2{,}048 tokens, populating the draft's KV cache. The FLOP ratio thesis (Section~\ref{sec:cost_model}) predicts that minimizing the draft's active parameter count maximizes speedup, favoring the smallest available compatible model.

\item \textbf{Lookahead decode with attention capture.} Eight autoregressive decode steps are executed with \texttt{AttentionCapture} wrappers installed on each attention layer. These wrappers intercept post-RoPE query vectors via architecture-specific extractors (Section~\ref{sec:extractors}), appending them to a capture buffer before delegating to the original attention computation.

\item \textbf{Importance computation.} For each attention layer $\ell$ with captured queries $\{q^{(t)}\}_{t=1}^{8}$ and prompt keys $K_\ell \in \mathbb{R}^{h \times M \times d}$:
\begin{align}
S_\ell^{(t)} &= \text{softmax}\!\left(\frac{q^{(t)} K_\ell^T}{\sqrt{d}}\right) \in \mathbb{R}^{h \times M} \\
\bar{S}_\ell^{(t)} &= \text{AvgPool1D}(S_\ell^{(t)},\; \text{kernel}=13)
\end{align}
Scores are aggregated as $\max$ across layers and heads, then $\text{mean}$ across lookahead steps, yielding importance $I \in \mathbb{R}^M$.
Average pooling with kernel 13 smooths the signal, preventing isolated-token artifacts.
For GQA models, keys are expanded via \texttt{repeat} to match the query head count before scoring.
\end{enumerate}

Layers whose cache does not span the full prompt (e.g., sliding-window \texttt{RotatingKVCache} layers caching only 128 tokens) are skipped during importance computation.

After scoring, the draft KV cache is explicitly freed and \texttt{mx.clear\_cache()} is called, reclaiming memory before target prefill begins.
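
For reference, the aggregation in phase~3 can be sketched as follows (illustrative only; shown in NumPy for clarity, whereas the implementation operates on MLX arrays, and the function and variable names are ours):

\begin{lstlisting}[language=Python]
import numpy as np

def token_importance(queries, keys, kernel=13):
    """Aggregate attention-based importance over lookahead steps.

    queries: per layer, a list of 8 (h, d) post-RoPE lookahead queries.
    keys:    per layer, an (h, M, d) array of prompt keys (for GQA, KV
             heads already repeated to match the query head count).
    Returns an (M,) importance vector.
    """
    d = keys[0].shape[-1]
    smooth = lambda s: np.convolve(s, np.ones(kernel) / kernel, mode="same")
    per_step = []
    for t in range(len(queries[0])):
        step_max = None
        for q_layer, k_layer in zip(queries, keys):
            logits = np.einsum("hd,hmd->hm", q_layer[t], k_layer) / np.sqrt(d)
            probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
            probs /= probs.sum(axis=-1, keepdims=True)         # softmax over M
            smoothed = np.apply_along_axis(smooth, -1, probs)  # kernel-13 pool
            layer_max = smoothed.max(axis=0)                   # max over heads
            step_max = layer_max if step_max is None else np.maximum(step_max, layer_max)
        per_step.append(step_max)                              # max over layers
    return np.mean(per_step, axis=0)                           # mean over steps
\end{lstlisting}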

\subsubsection{Architecture-Specific Query Extraction}
\label{sec:extractors}

The query extractor is auto-detected at runtime based on the attention module's attributes (a minimal sketch of the dispatch follows the list):

\begin{itemize}
\item \texttt{q\_norm} present $\rightarrow$ Qwen3.5 path
\item No \texttt{rope} attribute $\rightarrow$ Nemotron-H path
\item Otherwise $\rightarrow$ Standard (Llama/GPT-OSS) path
\end{itemize}
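
The dispatch itself is a small attribute check; the sketch below mirrors the rules above (the function name and return labels are ours, not the implementation's):

\begin{lstlisting}[language=Python]
def detect_extractor(attn_module):
    """Pick the query-extraction path for one attention module."""
    if hasattr(attn_module, "q_norm"):
        return "qwen35"       # gated q_proj: split, per-head RMSNorm, RoPE
    if not hasattr(attn_module, "rope"):
        return "nemotron_h"   # no RoPE: raw queries, content-based scoring
    return "standard"         # Llama / GPT-OSS: plain projection + RoPE
\end{lstlisting}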

\paragraph{Qwen3.5: Gated queries with per-head normalization.}
The \texttt{q\_proj} output has $2\times$ the expected width, containing both query and gate tensors concatenated along the head dimension.
The output is reshaped to $(B, L, n_\text{heads}, 2 \cdot d_\text{head})$ and split at the midpoint.
After splitting, \texttt{q\_norm} (a per-head RMSNorm) is applied before RoPE rotation.
Treating this as a standard projection produces silent shape errors or incorrect scoring.

\paragraph{Nemotron-H: Heterogeneous layer navigation.}
Of 88 total layers (40 Mamba-2 + 8 attention + 40 MoE), only the 8 attention layers produce Q/K scores.
\texttt{\_find\_attention\_layers} navigates the heterogeneous layer structure by inspecting \texttt{block\_type} annotations (\texttt{M} for Mamba, \texttt{*} for attention, \texttt{-} for MLP, \texttt{E} for MoE) and locating modules with a \texttt{mixer} attribute rather than the standard \texttt{self\_attn}.
\texttt{\_build\_layer\_to\_cache\_map} constructs a compacted index because only Mamba and attention layers have cache entries.

These attention layers have \textbf{no RoPE}---positional information comes entirely from the Mamba-2 SSM state.
Queries are used as-is for content-based scoring.
This is a non-trivial engineering challenge the original paper did not address: the draft model (Nano~4B) has 42 heterogeneous layers with only 4 attention layers among 21 Mamba and 17 MLP layers, all in a model where positional information is entirely implicit.

\paragraph{GPT-OSS: RotatingKVCache awareness.}
Standard query extraction applies, but importance computation must skip sliding-window layers whose \texttt{RotatingKVCache} contains only the last 128 tokens.
Without this check, importance scores would be computed against a truncated key set, producing misleading rankings.
Correct handling requires three cache-aware adaptations:
(1)~layer-level cache introspection to distinguish full-context from sliding-window layers;
(2)~skipping importance computation for layers whose cache does not span the full prompt;
(3)~force-preserving the last \texttt{max\_size} positions during sparse selection to ensure sliding-window layers have valid recent context at decode time.


\subsection{Chunk Selection}

Tokens are grouped into non-overlapping chunks of $C = 32$ tokens.
Each chunk is scored by the mean importance of its constituent tokens.
The top $\lceil k \cdot M/C \rceil$ chunks by score are selected and their token indices returned in sorted order.
This preserves spatial locality---coherent phrases are kept or dropped as units.

At $k = 0.2$ (our optimal configuration for Qwen3.5-122B), 80\% of prefill computation is eliminated.
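
A minimal sketch of this selection step (illustrative; the helper name and NumPy usage are ours):

\begin{lstlisting}[language=Python]
import numpy as np

def select_tokens(importance, keep_pct=0.2, chunk_size=32):
    """Return sorted token indices covering the top-scoring chunks."""
    M = importance.shape[0]
    n_chunks = int(np.ceil(M / chunk_size))
    chunk_scores = np.array([
        importance[c * chunk_size:(c + 1) * chunk_size].mean()
        for c in range(n_chunks)
    ])
    n_keep = int(np.ceil(keep_pct * M / chunk_size))
    top = np.argsort(chunk_scores)[::-1][:n_keep]       # highest-mean chunks
    return np.concatenate([
        np.arange(c * chunk_size, min((c + 1) * chunk_size, M))
        for c in sorted(top)                             # ascending indices
    ])
\end{lstlisting}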


\subsection{Sparse Prefill with Position-Mapped RoPE}
\label{sec:sparse_prefill}

The correctness of sparse prefill depends on maintaining correct RoPE angles despite non-contiguous token positions.
If position angles are not re-mapped, the model perceives selected tokens as adjacent, destroying long-range coherence.

\paragraph{Step 1: Sliding-window tail preservation.}
For architectures using \texttt{RotatingKVCache} (GPT-OSS), the last \texttt{max\_size} positions from the prompt are force-merged into the selection set, ensuring sliding-window attention layers have valid recent context at decode time.
This is auto-detected via cache type inspection.

\paragraph{Step 2: Position mapping during prefill.}
Each attention layer's \texttt{nn.RoPE} is replaced with \texttt{PositionMappedRoPE}, which maps contiguous batch positions $[0, 1, \ldots, N{-}1]$ to the original absolute positions $[p_0, p_1, \ldots, p_{N-1}]$ of the selected tokens.
For models with custom RoPE variants (YarnRoPE with pre-computed frequencies, SuScaled RoPE with \texttt{mscale}), the replacement module captures and replays the original frequency tensors and scale factors.
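
The essential behavior of the wrapper can be sketched as follows (a simplified, per-token version for clarity; the production module vectorizes the rotation, and the class body here is ours, assuming the wrapped RoPE module accepts an \texttt{offset} argument as in mlx-lm):

\begin{lstlisting}[language=Python]
import mlx.core as mx

class PositionMappedRoPE:
    """Apply the wrapped RoPE at each selected token's original position."""
    def __init__(self, base_rope, positions):
        self.base_rope = base_rope    # the layer's existing RoPE module
        self.positions = positions    # absolute prompt positions, e.g. [3, 4, 96, ...]

    def __call__(self, x, offset=0):
        # x: (B, n_heads, N, d_head) with N == len(self.positions).
        # Rotate token i as if it sat at absolute position positions[i].
        rotated = [
            self.base_rope(x[:, :, i:i + 1, :], offset=int(p))
            for i, p in enumerate(self.positions)
        ]
        return mx.concatenate(rotated, axis=2)
\end{lstlisting}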

\paragraph{Step 3: Chunked forward pass.}
Selected tokens are fed through the target model in chunks of \texttt{step\_size} (default 2{,}048), populating the KV cache with entries at correct absolute positions.

\paragraph{Step 4: Decode offset adjustment.}
After sparse prefill of $N$ selected tokens from $M$ total prompt tokens, the cache offset is $N$ but decode must start at position $M$.
\texttt{OffsetAdjustedRoPE} wraps the original RoPE module and adds adjustment $\Delta = M - N$ to all offset calls:
\begin{equation}
\text{RoPE\_position}(i) = N + i + (M - N) = M + i \quad \checkmark
\end{equation}

\paragraph{Step 5: Cleanup.}
After generation completes, \texttt{cleanup\_rope()} traverses all attention layers and unwraps patched RoPE modules back to their originals, ensuring the model is unmodified for subsequent requests.

\paragraph{Nemotron-H (no RoPE).}
Steps 2 and 4 are skipped entirely---Nemotron-H's attention layers have no RoPE, deriving positional information from the Mamba-2 SSM state instead.

The SSM layers are updated \emph{only} on retained tokens; skipped tokens do not contribute to state evolution.
Concretely: the Mamba-2 recurrence $h_t = A h_{t-1} + B x_t$ advances only at selected positions, so the hidden state after processing $N$ selected tokens diverges from the state after processing all $M$ tokens.
This alters the underlying state trajectory.
In practice, however, the retained tokens---selected by attention importance---appear sufficient to preserve global semantics for long-context tasks: our server-side benchmarks show $2.10$--$2.19\times$ speedup across 8K--64K tokens with no observed quality degradation (Section~\ref{sec:quality}).
Quantifying the SSM state drift (e.g., L2 distance between full and sparse hidden states) is left to future work.
We attribute the absence of degradation to the hybrid architecture: the 8 full-attention layers retain the most important $N$ tokens with correct content-based scores, providing long-range context that compensates for gaps in the SSM's recurrent state.
This is an empirical result, not a theoretical guarantee; extending \specprefill{} to pure SSM architectures would require additional analysis.


\subsection{Unified Memory Cost Model}
\label{sec:cost_model}

We formalize the relationship between hardware architecture and \specprefill{} efficiency.

On discrete-GPU systems, the cost of \specprefill{} includes a data transfer term $T$ (PCIe bandwidth, memory copies between draft and target devices):
\begin{equation}
\label{eq:speedup_gpu}
\text{Speedup}_{\text{GPU}} = \frac{C_t}{C_d + T + k \cdot C_t}
\end{equation}

On unified memory, $T = 0$, simplifying to Equation~\ref{eq:speedup}.
The speedup is determined by the empirical wall-clock cost ratio $r = C_d / C_t$ and keep percentage $k$:
\begin{equation}
\label{eq:speedup_ratio}
\text{Speedup} = \frac{1}{r + k + \epsilon}
\end{equation}
where $\epsilon$ captures fixed overhead from chunk selection, RoPE patching, memory management (\texttt{mx.clear\_cache()}), and architecture-specific scoring costs.
In practice, $\epsilon$ ranges from 0.03 (Qwen3.5, low overhead) to 0.30 (Nemotron-H, where navigating 88 heterogeneous layers adds cost).
We emphasize that this model is \emph{descriptive}: it correctly predicts the \emph{ranking} of configurations by speedup (Table~\ref{tab:cost_model}) but does not predict exact magnitudes, as $\epsilon$ varies by architecture.
The value of the model is in draft selection---given two candidate drafts, the one with lower $r$ will yield higher speedup---not in predicting absolute speedup from first principles.
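
As a worked example of how the model is used for draft selection (the ratios and predicted values match Table~\ref{tab:cost_model}; the helper below is ours):

\begin{lstlisting}[language=Python]
def predicted_speedup(r, k, eps=0.0):
    """Upper-bound speedup from the ratio model: 1 / (r + k + eps)."""
    return 1.0 / (r + k + eps)

# Ranking two candidate drafts for the 122B target at k = 0.2:
print(round(predicted_speedup(r=0.02, k=0.2), 1))   # 2B draft -> 4.5
print(round(predicted_speedup(r=0.03, k=0.2), 1))   # 4B draft -> 4.3
\end{lstlisting}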

\textbf{Boundary conditions.}
Equation~\ref{eq:speedup_ratio} assumes: (1)~sequential, single-request prefill (no batching); (2)~a FLOP-dominated regime where compute, not memory bandwidth, is the bottleneck; and (3)~negligible KV cache fragmentation cost.
In bandwidth-bound regimes or under heavy concurrent batching, $\epsilon$ may dominate and reduce the model's predictive accuracy.

However, $r$ is not simply the ratio of active parameters.
On unified memory, all expert weights reside in the same address space---there is no discrete VRAM to overflow into.
MoE forward passes incur costs beyond active-parameter FLOPs: router gating across all experts, weight loading for selected experts from unified memory, and memory bandwidth pressure from the full parameter footprint.
As a result, the empirical wall-clock cost of a MoE forward pass on unified memory scales closer to \emph{total} parameters than to active parameters alone.
A small dense 2B draft scoring against a 122B MoE target (10B active but 122B total) therefore achieves $r \approx 0.02$, reflecting the full-parameter cost disparity rather than the ${\sim}5\times$ active-parameter ratio.
This is a finding specific to unified memory systems: on discrete GPUs where expert weights page between CPU and GPU memory, the effective cost ratio may differ.

For MoE models where total parameters far exceed active parameters, a small dense draft has $r \ll 1$.
This model assumes $C_t$ scales linearly with token count; at extreme context lengths (${\geq}$128K), the $O(n^2)$ attention component causes superlinear growth in $C_t$, and measured speedup can exceed the linear prediction (Table~\ref{tab:cost_model}).
Table~\ref{tab:cost_model} compares predicted ($\epsilon = 0$) and measured speedups, with the gap $G = \text{Predicted} / \text{Measured}$ quantifying per-configuration overhead:

\begin{table}[h]
\centering
\small
\begin{tabular}{@{}lcccc@{}}
\toprule
\textbf{Configuration} & $\boldsymbol{r = C_d/C_t}$ & \textbf{Predicted} ($k{=}0.2, \epsilon{=}0$) & \textbf{Measured} & $\boldsymbol{G}$ \\
\midrule
4B / 122B MoE (10B active) & $\sim$0.03 & 4.3$\times$ & 2.90$\times$ & 1.5$\times$ \\
2B / 122B MoE (10B active) & $\sim$0.02 & 4.5$\times$ & 4.11$\times$ & 1.1$\times$ \\
2B / 122B MoE (10B active)$^\ddagger$ & $\sim$0.02 & 4.5$\times$ & 5.45$\times$ & 0.8$\times$ \\
Qwen-4B / 35B MoE (3B active) & $\sim$0.10 & 3.3$\times$ & 1.86$\times$ & 1.8$\times$ \\
4B / 120B Nemotron-H (12B active) & $\sim$0.03 & 4.3$\times$ & 2.17$\times$ & 2.0$\times$ \\
20B / 120B GPT-OSS (120B active) & $\sim$0.17 & 2.7$\times$ & 1.28$\times$ & 2.1$\times$ \\
\bottomrule
\multicolumn{5}{l}{\footnotesize $^\ddagger$Measured at 128K tokens. Measured speedup exceeds prediction due to $O(n^2)$ attention.}
\end{tabular}
\caption{Cost model predictions ($\epsilon = 0$) vs.\ measured speedups at $k = 0.2$, 16K tokens unless noted. $G = \text{Predicted} / \text{Measured}$; values $> 1$ indicate overhead exceeding the model, $< 1$ indicates superlinear baseline growth benefiting \specprefill{} beyond the linear prediction.}
\label{tab:cost_model}
\end{table}

The $G$ values reveal architecture-dependent overhead.
Nemotron-H ($G = 2.0$) has the highest $\epsilon$: the target has 88 heterogeneous layers (40~Mamba + 8~attention + 40~MoE) and the draft has 42 (21~Mamba + 4~attention + 17~MLP), requiring architecture-aware layer navigation for both scoring and sparse prefill.
GPT-OSS ($G = 2.1$) combines high $r$ (0.17, the 20B draft dominates the denominator) with sliding-window cache management overhead.
The Qwen3.5-122B configurations ($G = 1.1$--$1.5$, 5 trials) have the lowest overhead, benefiting from uniform architecture and favorable MoE FLOP ratios; the 35B target ($G = 1.8$) sits higher due to its smaller active-parameter count (3B) narrowing the draft-to-target gap.
At 128K, $G < 1$ because the superlinear baseline growth (Section~\ref{sec:experiments}) is not captured by the linear cost model.

\paragraph{Comparison with GPU results.}
The original \specprefill{} on discrete GPUs~\cite{specprefill} reports up to $7.66\times$ TTFT reduction on Llama-3.1-405B-FP8, benefiting from batch-level parallelism and GPU-optimized attention kernels not available on Apple Silicon.
Our lower absolute speedups ($3.71$--$5.45\times$ on 122B) reflect the single-request, unbatched setting and MLX's Metal compute pipeline.
The unified memory contribution is not higher \emph{absolute} speedup but a \emph{simpler cost model}: eliminating $T$ makes the FLOP ratio the sole dominant predictor, enabling principled draft model selection without profiling transfer overhead.


% ============================================================
\section{System Integration}
\label{sec:system}
% ============================================================

\subsection{Composition with System Prompt KV Caching}
\label{sec:composition}

In production agentic workflows, the system prompt (tool definitions, instructions, context documents) often constitutes 10--20K tokens that remain identical across requests.
System prompt KV caching snapshots this prefix on the first request and restores it for subsequent requests, eliminating redundant prefill.

These optimizations operate on orthogonal axes: KV caching eliminates \emph{prefix} cost (identical system prompt); \specprefill{} reduces \emph{suffix} cost (variable user context).
This is why they compose multiplicatively rather than additively.
When both techniques are active, \specprefill{} operates on the \emph{suffix} only (user and assistant turns), receiving a \texttt{position\_offset} equal to the system token count.
The scoring phase evaluates only suffix tokens; sparse prefill maps positions relative to the offset so selected tokens land at correct absolute positions in the full context.

The threshold check uses suffix length, not full prompt length, ensuring \specprefill{} activates only when the suffix itself is long enough to benefit.
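
A small sketch of this control flow (function and key names are ours; the 10K/63K split mirrors the workload in Table~\ref{tab:composition}):

\begin{lstlisting}[language=Python]
def plan_prefill(system_len, suffix_len, threshold=8192, keep_pct=0.2):
    """How many tokens each phase touches with a cached system prefix."""
    if suffix_len < threshold:
        return {"prefix_restored": system_len, "target_prefill": suffix_len}
    kept = int(keep_pct * suffix_len)
    # Selected suffix indices are shifted by system_len (position_offset)
    # before RoPE, so they land at their absolute positions.
    return {"prefix_restored": system_len,
            "draft_scored": suffix_len,
            "target_prefill": kept}

print(plan_prefill(system_len=10_000, suffix_len=63_000))
# {'prefix_restored': 10000, 'draft_scored': 63000, 'target_prefill': 12600}
\end{lstlisting}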

\begin{table}[h]
\centering
\small
\begin{tabular}{@{}lcc@{}}
\toprule
\textbf{Configuration} & \textbf{TTFT (s)} & \textbf{Speedup} \\
\midrule
Baseline (cold, full prefill) & 517.5 & 1.0$\times$ \\
System KV cache only & 417.1 & 1.24$\times$ \\
\textbf{Combined (SysKV + SP 20\%)} & \textbf{92.5} & \textbf{5.59$\times$} \\
Combined (repeat) & 92.4 & 5.60$\times$ \\
\bottomrule
\end{tabular}
\caption{Composition of system prompt KV caching and \specprefill{} on Qwen3.5-122B, 2B draft, M2~Ultra 128\,GB. The prompt (73K tokens) is a realistic agentic workload: ${\sim}$10K system prefix (tool definitions, instructions) + ${\sim}$63K user context. System KV cache saves the prefix; \specprefill{} sparse-prefills the suffix. The combined $5.6\times$ speedup exceeds either technique alone.}
\label{tab:composition}
\end{table}


\subsection{The Speculative Stack}

\specprefill{} is not a standalone optimization but one phase of a three-phase speculative pipeline:

\begin{enumerate}
\item \textbf{Score} (\specprefill{}): Draft model (2B) identifies important tokens via attention scoring.
\item \textbf{Sparse Prefill}: Target model (122B) processes selected token chunks with position-mapped RoPE.
\item \textbf{MTP Decode}: Target model with Multi-Token Prediction heads generates output tokens speculatively (Qwen3.5 only; Nemotron-H and GPT-OSS skip this phase).
\end{enumerate}

The draft model used in Phase~1 is architecturally separate from MTP's prediction heads (which are part of the target model's weights).
The draft KV cache is freed after Phase~1 (\texttt{mx.clear\_cache()}), and the draft model's static weights (1.4--3.0\,GB depending on draft size) remain resident for amortized startup cost across requests.
At extreme context lengths (128K+), the draft's transient KV cache must also be budgeted: the 2B draft produces 1.5\,GB at 128K tokens, which still fits comfortably alongside the target model's ${\sim}$100\,GB on a 128\,GB system.


\subsection{Per-Request API and Graceful Fallback}

The OpenAI-compatible API accepts per-request overrides:

\begin{lstlisting}[language=Python]
extra_body = {
    "specprefill": True,          # force enable (bypass threshold)
    "specprefill_keep_pct": 0.15  # override server default
}
\end{lstlisting}

The default threshold (8{,}192 tokens) is enforced only in server-default mode; explicit \texttt{specprefill: true} bypasses it.
Any error during scoring or sparse prefill triggers graceful fallback to full prefill---no request fails due to \specprefill{}.
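
For example, a client can pass these fields through the standard OpenAI Python SDK via \texttt{extra\_body} (the endpoint URL, model name, and prompt file below are placeholders for a local deployment):

\begin{lstlisting}[language=Python]
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
long_prompt = open("context.txt").read()   # e.g. a 16K+ token context

response = client.chat.completions.create(
    model="qwen3.5-122b",
    messages=[{"role": "user", "content": long_prompt}],
    extra_body={
        "specprefill": True,           # bypass the 8,192-token threshold
        "specprefill_keep_pct": 0.15,  # keep 15% of prompt tokens
    },
)
print(response.choices[0].message.content)
\end{lstlisting}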


% ============================================================
\section{Experiments}
\label{sec:experiments}
% ============================================================

\subsection{Setup}

\paragraph{Hardware.} Apple M2~Ultra, 128\,GB unified memory, Mac Studio, macOS~26.3.1.

\paragraph{Software.} MLX~0.31.1, vllm-mlx~0.2.6 with patches, Python~3.12.

\paragraph{Sampling.} Qwen3.5 models: $\text{temp}=0.6$, $\text{top\_p}=0.95$, $\text{top\_k}=20$ (official thinking+coding profile~\cite{qwen35}).
Nemotron-H: $\text{temp}=1.0$, $\text{top\_p}=0.95$ (the settings the NVIDIA model card lists as ``trained and evaluated with'').
GPT-OSS: $\text{temp}=0.6$, $\text{top\_p}=0.95$.
All models run with thinking mode enabled (\texttt{enable\_thinking=true}).
Sampling parameters do not affect TTFT measurement (prefill is deterministic; sampling occurs only during generation).

\paragraph{Methodology.} Five trials per configuration for Qwen3.5-122B (plus one warmup); two trials for remaining configurations. Server-side TTFT measured via streaming OpenAI-compatible API.
Server restarted between configuration changes.

\paragraph{Reproducibility.} Our \specprefill{} implementation is available as patches against vllm-mlx~0.2.6 (PRs~\#175, \#180) and mlx-lm~0.31.2 (PR~\#990), with benchmark scripts (\texttt{bench-specprefill}, \texttt{bench-specprefill-adversarial}, \texttt{bench-specprefill-perplexity}) included in the repository.
All Qwen3.5 models are publicly available quantizations.
Nemotron-H and GPT-OSS weights are available from their respective model hubs.
Experiments require an Apple Silicon system with $\geq$128\,GB unified memory for the 122B configuration; the 35B configuration runs on 64\,GB systems.

\begin{table}[h]
\centering
\small
\begin{tabular}{@{}llllllr@{}}
\toprule
\textbf{Model} & \textbf{Role} & \textbf{Architecture} & \textbf{Params} & \textbf{Active} & \textbf{Quant} & \textbf{RAM} \\
\midrule
Qwen3.5-122B-VLM-MTP & Target & MoE & 122B & 10B & 5-bit & 79\,GB \\
Qwen3.5-35B-VLM-MTP & Target & MoE & 35B & 3B & 8-bit & 38\,GB \\
Nemotron-H 120B & Target & Mamba-2 + Attn + MoE & 120B & 12B & 5-bit & 83\,GB \\
GPT-OSS 120B & Target & Dense + sliding window & 120B & 120B & 5-bit & 58\,GB \\
\midrule
Qwen3.5-4B-VLM-MTP & Draft & Dense hybrid & 4B & 4B & 4-bit & 3.0\,GB \\
Qwen3.5-2B-OptiQ & Draft & Hybrid + MoE & 2B & $<$2B & 4-bit & 1.4\,GB \\
Nemotron-H Nano 4B & Draft & Mamba-2 + Attn hybrid & 4B & 4B & 4-bit & 2.1\,GB \\
GPT-OSS-20B & Draft & MoE & 20B & 3.6B & 4-bit & 10\,GB \\
\bottomrule
\end{tabular}
\caption{Models evaluated. \specprefill{} requires draft and target to share the same tokenizer. Qwen3.5 drafts (2B, 4B) serve all Qwen3.5 targets (248K vocabulary); the 2B is the primary draft model. Nemotron-H Nano~4B serves Nemotron-H~120B (131K vocabulary). GPT-OSS-20B is the smallest available same-family draft for GPT-OSS-120B (201K vocabulary).}
\label{tab:models}
\end{table}


\subsection{TTFT Benchmarks}

\begin{table}[h]
\centering
\small
\begin{tabular}{@{}llccccc@{}}
\toprule
\textbf{Model} & \textbf{Draft} & \textbf{8K} & \textbf{16K} & \textbf{32K} & \textbf{64K} & \textbf{128K} \\
\midrule
Qwen3.5-122B (MoE, 10B) & 4B$^\ddagger$ & 2.79$\times$ & 2.90$\times$ & ---$^\dagger$ & ---$^\dagger$ & ---$^\dagger$ \\
Qwen3.5-122B (MoE, 10B) & 2B & 3.71$\times$ & 4.11$\times$ & 4.23$\times$ & 4.50$\times$ & 5.45$\times$ \\
Qwen3.5-35B (MoE, 3B) & 4B & 1.81$\times$ & 1.86$\times$ & 1.85$\times$ & 1.84$\times$ & --- \\
Nemotron-H 120B (hybrid) & Nano-4B & 2.10$\times$ & 2.17$\times$ & 2.19$\times$ & 2.19$\times$ & --- \\
GPT-OSS 120B (dense) & 20B & 1.24$\times$ & 1.28$\times$ & --- & --- & --- \\
\bottomrule
\multicolumn{7}{l}{\footnotesize $^\dagger$Not measured at this context length.} \\
\multicolumn{7}{l}{\footnotesize $^\ddagger$4B draft includes VLM weights (3.0\,GB); the 2B text-only draft (1.4\,GB) is the primary configuration.}
\end{tabular}
\caption{TTFT speedups at 20\% keep, 5 trials (mean). Speedup increases with prompt length as scoring overhead is amortized and $O(n^2)$ attention savings compound. Qwen3.5-122B and 4B rows use 5 trials; other rows use 2 trials.}
\label{tab:ttft}
\end{table}

For Qwen3.5-122B with the 2B draft, the absolute TTFT at 64K tokens drops from $417.6 \pm 0.6$\,s to $92.8 \pm 0.8$\,s.
At 128K tokens: $1{,}155.8 \pm 8.5$\,s (19.3~minutes) $\rightarrow$ $212.3 \pm 1.9$\,s (3.5~minutes), a \textbf{5.45$\times$} reduction.
At 8K tokens: $45.0 \pm 0.1$\,s $\rightarrow$ $12.1 \pm 0.03$\,s.
Standard deviations across 5 trials are $<$1\% of the mean at all context lengths, confirming measurement stability.
Speedup increases monotonically with context length, from $3.71\times$ at 8K to $5.45\times$ at 128K, consistent with the amortization of fixed scoring overhead and the superlinear growth of baseline prefill cost.

\paragraph{Nemotron-H: architecture-limited speedup plateau.}
Nemotron-H shows a flat speedup profile ($2.10\times$ at 8K to $2.19\times$ at 64K), in contrast to Qwen3.5's monotonically increasing curve.
This plateau is explained by the hybrid architecture: only 8 of 88 layers are attention---the remaining 80 layers (40~Mamba-2 SSM + 40~MoE feed-forward) scale linearly with token count regardless of \specprefill{}.
The $O(n^2)$ attention component that drives Qwen3.5's compounding speedup at long contexts constitutes only ${\sim}9\%$ of Nemotron-H's total compute, so the quadratic savings are a small fraction of overall prefill cost.
This confirms the architecture-dependent nature of the cost model: \specprefill{} benefit scales with the attention fraction of total computation.
In absolute terms, Nemotron-H 120B TTFT drops from 58\,s to 27\,s at 16K and from 253\,s to 116\,s at 64K.
For Qwen3.5-35B: 41\,s to 22\,s at 16K.

\paragraph{Superlinear scaling at extreme context lengths.}
The 128K baseline (1{,}156\,s) is $2.77\times$ the 64K baseline (418\,s), not the $2\times$ expected from linear scaling.
This superlinear growth arises from the $O(n^2)$ attention component in chunked prefill: each 2{,}048-token chunk attends to all preceding tokens, so cumulative attention FLOPs grow quadratically.
\specprefill{} benefits disproportionately: by reducing the effective sequence length from $N$ to $kN$, it cuts the cumulative attention FLOPs---which scale approximately as $N^2$ under full-prefix chunked prefill---to roughly a fraction $k^2$ of their full-prefill value.
At $k = 0.2$, this yields a ${\sim}25\times$ reduction in attention computation, not the $5\times$ that linear token-count scaling would suggest.
Draft scoring remains $O(N)$, so its cost grows linearly while the attention savings grow quadratically, explaining why the measured 128K speedup ($5.45\times$) exceeds the linear prediction ($4.5\times$, Table~\ref{tab:cost_model}).
MoE models exhibit stronger gains because sparse prefill reduces both the quadratic attention cost (sequence length) and the number of active expert evaluations---80\% fewer tokens means 80\% fewer expert routing and weight-loading operations.
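
For concreteness, a back-of-envelope count of the cumulative attention cost under full-prefix chunked prefill (chunk size $C_{\text{chunk}}$, constants dropped):
\begin{equation*}
\text{FLOPs}_{\text{attn}} \;\propto\; \sum_{c=1}^{N/C_{\text{chunk}}} C_{\text{chunk}} \cdot c\,C_{\text{chunk}} \;\approx\; \frac{N^2}{2},
\qquad
\text{FLOPs}_{\text{attn}}^{\text{sparse}} \;\approx\; \frac{(kN)^2}{2} \;=\; k^2 \cdot \frac{N^2}{2},
\end{equation*}
which recovers the $1/k^2 = 25\times$ attention-FLOP reduction quoted above for $k = 0.2$.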

\paragraph{Draft model selection.}
Draft models are selected to maximize FLOP asymmetry while maintaining tokenizer and architectural compatibility.
The selection criteria are: (1)~same tokenizer as the target (required---token IDs are passed directly without translation); (2)~smallest available model in the family (to minimize the cost ratio $r$); (3)~presence of attention layers for importance scoring (at least 4 layers suffice per our Nemotron-H validation).
Our primary configuration uses a 2B draft ($r \approx 0.02$, 1.4\,GB); complementary measurements with a 4B draft ($r \approx 0.03$, 3.0\,GB) at 8K--16K confirm the relationship: the lower-ratio 2B achieves $4.11\times$ vs.\ $2.90\times$ at 16K.

The GPT-OSS result is a \textbf{negative result confirming the ratio thesis}: the 20B draft model (the smallest available in the GPT-OSS family) has an unfavorable FLOP ratio of $\sim$0.17, yielding only $1.24$--$1.28\times$ speedup.
This validates that architecture is not the determining factor---the FLOP ratio is.
A hypothetical 4B GPT-OSS draft ($r \approx 0.03$) would be predicted to achieve $\sim$2.5--3$\times$ under our cost model, but no such model exists in the GPT-OSS family (Section~\ref{sec:future}).


\subsection{Draft-to-Target FLOP Ratio Analysis}
\label{sec:ratio}

\begin{figure}[h]
\centering
\includegraphics[width=0.9\textwidth]{figures/ratio-speedup.pdf}
\caption{Measured speedup vs.\ draft-to-target FLOP ratio. The theoretical upper bound (Eq.~\ref{eq:speedup_ratio}) correctly predicts the ranking. Overhead from RoPE patching, chunk selection, and architecture-specific scoring reduces measured values below the theoretical curve.}
\label{fig:ratio}
\end{figure}

The FLOP ratio $r = C_d / C_t$ is the dominant predictor of \specprefill{} benefit on unified memory.
Across our configurations (Table~\ref{tab:cost_model}), $r$ spans from 0.02 (2B/122B MoE) to 0.17 (20B/120B dense), and measured speedup tracks this ratio monotonically.
Complementary 4B draft measurements at 8K--16K ($r \approx 0.03$, $2.79$--$2.90\times$) confirm the ratio relationship: the lower-ratio 2B achieves higher speedup at the same context length.

This relationship distinguishes unified-memory \specprefill{} from GPU-based implementations, where PCIe bandwidth introduces an additional term $T$ (Equation~\ref{eq:speedup_gpu}) that weakens the FLOP-ratio signal.


\subsection{Keep Percentage Ablation}

\begin{table}[h]
\centering
\small
\begin{tabular}{@{}ccccc@{}}
\toprule
\textbf{Keep \%} & \textbf{TTFT (s)} & \textbf{Speedup} & \textbf{Needle} & \textbf{JSON} \\
\midrule
10\% & 23.90 & 3.85$\times$ & PASS & 1/2$^\S$ \\
20\% & 31.30 & 2.94$\times$ & PASS & PASS \\
30\% & 38.88 & 2.37$\times$ & PASS & PASS \\
50\% & 53.78 & 1.71$\times$ & PASS & PASS \\
100\% (baseline) & 91.97 & 1.0$\times$ & PASS & PASS \\
\bottomrule
\multicolumn{5}{l}{\footnotesize $^\S$JSON extraction at 10\% keep: one regression (1 of 3 values wrong) in trial 1, pass in trial 2.} \\
\end{tabular}
\caption{Keep percentage ablation on Qwen3.5-122B at ${\sim}$16K tokens with the 4B draft. ``Needle'' is UUID retrieval at all three depths (10\%, 50\%, 90\%); ``JSON'' is exact value extraction from an 80-record array. The 10\% row shows the quality boundary.}
\label{tab:ablation}
\end{table}

The curve shows a clear knee at ${\sim}$20\%: all quality tests pass while delivering $2.94\times$ speedup (4B draft), and raising the keep percentage beyond this point adds compute roughly linearly in $k$ while yielding little additional quality benefit.
This is our recommended operating point.
At 10\%, speedup increases to $3.85\times$ but structured data extraction becomes unreliable (1 JSON regression in 2 trials), establishing 10\% as the quality boundary.
At 50\%, the $1.71\times$ speedup is marginal relative to the scoring overhead incurred.


\subsection{Quality Validation}
\label{sec:quality}

Because \specprefill{} retains the highest-scoring tokens by attention importance, the model keeps the tokens it would have attended to most heavily during generation.
We validate this through four complementary evaluations, leading with the most concrete and falsifiable tests.

\paragraph{Adversarial tests (primary).}
Eight test types designed to expose sparse-prefill weaknesses: needle-in-a-haystack (UUID retrieval at 10\%, 50\%, 90\% depth), JSON value extraction from an 80-record array, code bug detection, back-reference, mixed-language retrieval, and XML structure extraction.

At 20\% keep: \textbf{0/16 regressions} across 2 trials $\times$ 8 tests.
All needle-in-a-haystack and JSON extraction tests pass under both baseline and \specprefill{}.
\specprefill{} does not drop needles or corrupt structured data retrieval at 20\% keep.

At 10\% keep: 1/16 regressions---a JSON extraction test returned one incorrect value out of three (trial 1 of 2; trial 2 passed).
All needle tests pass at all depths even at 10\%, suggesting that high-importance tokens (which needles represent) are robustly retained; the failure mode at extreme sparsity is degraded recall of \emph{low-salience} structured data.

\paragraph{ROUGE-L (lexical similarity).}
We compare outputs from \specprefill{} (20\% keep) against full-prefill baselines on six real-task prompts (code generation, code review, summarization, reasoning, tutorial writing, tool use), each targeting $\sim$8K actual tokens on Qwen3.5-122B.

To establish a variance floor, we first compare two independent baseline runs against each other:

\begin{center}
\small
\begin{tabular}{lc}
\toprule
\textbf{Comparison} & \textbf{ROUGE-L F1} \\
\midrule
Baseline vs.\ baseline (variance floor) & 0.190 $\pm$ 0.174 \\
\specprefill{} vs.\ baseline & 0.236 \\
\bottomrule
\end{tabular}
\end{center}

The high baseline-vs-baseline variance ($0.190 \pm 0.174$) demonstrates that lexical similarity between outputs is dominated by the model's inherent stochasticity at $\text{temp}=0.6$, not by any effect of \specprefill{}.
The \specprefill{} similarity (0.236) falls within the baseline noise floor, but the metric is too noisy to support quality conclusions on its own---the adversarial tests above provide the primary quality evidence.

\paragraph{LLM-as-Judge (supporting).}
Blinded A/B evaluation scores coherence, completeness, and accuracy (1--5 scale) for both baseline and \specprefill{} outputs, plus an overall equivalence rating:

\begin{center}
\small
\begin{tabular}{lc}
\toprule
\textbf{Comparison} & \textbf{Avg.\ Equivalence} \\
\midrule
Baseline vs.\ baseline & 3.0 / 5.0 \\
\specprefill{} vs.\ baseline & 3.0 / 5.0 \\
\bottomrule
\end{tabular}
\end{center}
|
| 627 |
+
|
| 628 |
+
\paragraph{Perplexity (distributional).}
|
| 629 |
+
We measure next-token perplexity on 256-token continuations after full vs.\ sparse prefill (20\% keep) on five documents at 8K context (code, documentation, LaTeX, mixed):
|
| 630 |
+
|
| 631 |
+
\begin{center}
\small
\begin{tabular}{lccc}
\toprule
\textbf{Document type} & \textbf{PPL (full)} & \textbf{PPL (sparse)} & \textbf{Ratio} \\
\midrule
Python (engine code) & 1.85 & 2.53 & 1.37 \\
Python (benchmark script) & 1.66 & 1.74 & 1.05 \\
Python (test harness) & 1.49 & 1.58 & 1.06 \\
LaTeX (this paper) & 2.00 & 2.14 & 1.07 \\
Mixed (concatenated) & 2.76 & 3.17 & 1.15 \\
\midrule
\textbf{Mean} & 1.95 & 2.23 & \textbf{1.14} \\
\bottomrule
\end{tabular}
\end{center}

Mean perplexity increases 14\%, though 4 of 5 documents show $\leq$7\% increase (median ratio 1.07).
The outlier (dense engine code, 1.37$\times$) contains many local variable dependencies where discarded tokens carry predictive information.
This distributional shift does not translate to generation quality degradation in our adversarial or LLM-judge evaluations above---sampling smooths over small distributional differences that perplexity measures precisely.
We did not extend the perplexity evaluation to 16K context: loading both the 122B target and the 2B draft for offline evaluation leaves insufficient headroom on 128\,GB unified memory.
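
The perplexity numbers follow from per-token negative log-likelihoods of the continuation under teacher forcing. A framework-agnostic sketch, assuming a helper that returns the log-probability of each continuation token given the chosen prefill, is:

\begin{verbatim}
import math

def perplexity(logprobs):
    """exp(mean NLL) over the continuation tokens (256 in our setup)."""
    nll = -sum(logprobs) / len(logprobs)
    return math.exp(nll)

def ppl_ratio(logprobs_full, logprobs_sparse):
    # Ratio > 1 means the sparse-prefill context predicts the
    # continuation less well than the full-prefill context.
    return perplexity(logprobs_sparse) / perplexity(logprobs_full)
\end{verbatim}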

\paragraph{Limitations.}
Six prompts, eight adversarial tests, and five perplexity documents confirm no catastrophic quality loss and validate the methodology, but the sample size is insufficient for tight confidence intervals.
We make no claim of statistical equivalence---only that \textbf{no measurable quality degradation was observed under our evaluation at 20\% keep}.
The 10\% keep JSON regression demonstrates that the quality boundary is observable and characterizable within our framework.
All quality evaluations were conducted on Qwen3.5-122B.
Nemotron-H and GPT-OSS were validated via pipeline tests (Section~\ref{sec:method}) but lack server-side adversarial evaluation.
Future work will include larger-scale evaluation on standardized long-context benchmarks (e.g., RULER, LongBench) and extend quality validation to non-Qwen architectures.

\subsection{Memory Profile}
\label{sec:memory}

\begin{table}[h]
\centering
\small
\begin{tabular}{@{}lrrl@{}}
\toprule
\textbf{Component} & \textbf{Memory} & \textbf{Cumulative} & \textbf{Notes} \\
\midrule
Target weights (122B, 5-bit) & 79\,GB & 79\,GB & Fixed at load \\
Draft weights (2B, 4-bit) & 1.4\,GB & 80\,GB & Fixed at load \\
MLX Metal cache limit & 4\,GB & 84\,GB & Computation scratch \\
Target KV cache (128K) & 12\,GB & 96\,GB & 96\,KB/token $\times$ 127K \\
Draft KV cache (128K, transient) & 1.5\,GB & 97\,GB & 12\,KB/token, freed after scoring \\
OS + framework overhead & $\sim$25\,GB & $\sim$122\,GB & Observed via \texttt{memory\_pressure} \\
\midrule
\textbf{Peak (128K baseline)} & & \textbf{$\sim$122\,GB} & Of 128\,GB unified \\
\textbf{Peak (128K \specprefill{})} & & \textbf{$\sim$122\,GB} & Draft KV transient, not additive \\
\bottomrule
\end{tabular}
\caption{Memory budget for Qwen3.5-122B with 2B draft at 128K tokens on M2~Ultra 128\,GB. The draft KV cache is transient: allocated during scoring, freed via \texttt{mx.clear\_cache()} before target prefill begins. Peak \specprefill{} memory $\approx$ baseline peak because the draft and target KV caches are never resident simultaneously.}
\label{tab:memory}
\end{table}
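
The KV-cache rows in Table~\ref{tab:memory} follow directly from the per-token cache sizes; a small sanity check of the arithmetic (values taken from the table, not a measurement script):

\begin{verbatim}
KIB = 1024
GIB = 1024 ** 3

def kv_cache_gib(tokens: int, kib_per_token: float) -> float:
    """KV cache footprint in GiB for a given context length."""
    return tokens * kib_per_token * KIB / GIB

print(kv_cache_gib(127_000, 96))  # target KV cache -> ~11.6 GiB (table: 12 GB)
print(kv_cache_gib(127_000, 12))  # draft KV cache  -> ~1.5 GiB  (table: 1.5 GB)
\end{verbatim}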

% ============================================================
\section{Discussion}
\label{sec:discussion}
% ============================================================

\subsection{The MoE Sweet Spot}

\specprefill{} benefits MoE architectures more than dense models.
In MoE models, each token is routed to a subset of experts during the forward pass, but the routing computation and expert weight loading occur for \emph{every} token regardless.
By reducing the token count during prefill, \specprefill{} reduces the total number of expert activations---not just attention FLOPs, but the dominant feed-forward computation.
The savings scale with the dropped fraction: at $k = 0.2$, the model processes 80\% fewer tokens through its expert layers, each of which involves sparse routing across 128 experts (Qwen3.5-122B).
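
As a rough illustration of what this means in expert activations, consider the count below; the layer count and top-$k$ routing width are placeholder values, not the exact Qwen3.5-122B configuration.

\begin{verbatim}
def expert_activations(tokens, n_moe_layers=48, experts_per_token=8):
    # Each prefilled token triggers top-k expert computations in every MoE layer.
    return tokens * n_moe_layers * experts_per_token

full = expert_activations(128_000)                # full prefill
sparse = expert_activations(int(128_000 * 0.2))   # SpecPrefill at k = 0.2
print(1 - sparse / full)                          # -> 0.8: 80% fewer activations
\end{verbatim}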

Dense models, by contrast, apply the full feed-forward computation to every token; there is no routing to skip.
The savings from \specprefill{} on dense models are proportional only to the reduced attention and MLP computation, which is less dramatic when the model is fully compute-bound.

\subsection{When SpecPrefill Does Not Help}

\paragraph{Dense models with large drafts.}
When the FLOP ratio $r$ exceeds $\sim$0.15, scoring overhead consumes most of the potential savings.
Our GPT-OSS result (20B draft, $r \approx 0.17$, speedup $1.28\times$) demonstrates this boundary.
No smaller GPT-OSS model exists; the proprietary tokenizer (201K vocabulary) prevents cross-family draft substitution without a re-tokenization layer (Section~\ref{sec:future}).

\paragraph{Short prompts.}
Below $\sim$4K tokens, the fixed overhead of draft scoring and RoPE patching exceeds the savings from sparse prefill.
Our default threshold of 8{,}192 tokens reflects this empirical boundary.
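
In practice these two boundaries can be folded into a single gating check before prefill; an illustrative version follows (the constant names are ours, not the engine's actual configuration keys):

\begin{verbatim}
MIN_PROMPT_TOKENS = 8_192  # below this, scoring overhead dominates
MAX_FLOP_RATIO = 0.15      # draft/target FLOP ratio boundary discussed above

def should_use_specprefill(prompt_tokens: int, flop_ratio: float) -> bool:
    """Fall back to full prefill for short prompts or expensive drafts."""
    return prompt_tokens >= MIN_PROMPT_TOKENS and flop_ratio <= MAX_FLOP_RATIO
\end{verbatim}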

\paragraph{Comparison with CritiPrefill.}
CritiPrefill~\cite{critiprefill} achieves sparse prefill without a draft model by using the target's own attention scores from an initial partial prefill.
On dense transformers, where every layer performs full attention, CritiPrefill achieves 2.7--3.0$\times$ speedup (reported on Llama-3-8B and Yi-9B at 128K context).
However, it only saves attention FLOPs---on MoE architectures where attention constitutes 7--9\% of total computation, the attention-only savings would be proportionally limited; our analysis estimates $\sim$1.03--1.08$\times$.
\specprefill{} saves \emph{all} FLOPs (attention + routing + expert computation) for dropped tokens, yielding $3.7$--$5.5\times$ on MoE targets vs.\ the estimated ${\sim}1.03$--$1.08\times$ for attention-only savings.
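
The $\sim$1.03--1.08$\times$ figure is consistent with a simple Amdahl-style bound: only the attention share of compute can shrink. A back-of-the-envelope version is below; the fraction of attention work actually skipped is an assumption chosen to bracket the estimate, not a measured quantity.

\begin{verbatim}
def attention_only_speedup(attn_frac, saved_frac):
    # Amdahl's law: only the attention share of prefill compute shrinks.
    return 1.0 / (1.0 - attn_frac * saved_frac)

for attn_frac in (0.07, 0.09):        # attention share of MoE prefill compute
    for saved_frac in (0.5, 0.8):     # share of attention work actually skipped
        print(attn_frac, saved_frac,
              round(attention_only_speedup(attn_frac, saved_frac), 3))
# spans roughly 1.04x-1.08x, in line with the estimate above
\end{verbatim}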

\subsection{Limitations}

\begin{itemize}
\item \textbf{Draft model dependency.} Requires a small model with a compatible tokenizer. This limits applicability to model families with multiple size variants (Qwen3.5: 2B/4B/27B/35B/122B; GPT-OSS: only 20B/120B).
\item \textbf{Nemotron-H SSM state.} SSM layers are updated only on retained tokens; skipped tokens do not contribute to state evolution. The resulting state trajectory diverges from full prefill. Empirically safe under our evaluation ($2.10$--$2.19\times$ with no observed quality degradation), but the magnitude of state drift is not quantified.
\item \textbf{Quality validation scale.} Six prompts and eight adversarial tests validate methodology and confirm no catastrophic loss, but are insufficient for tight confidence intervals.
\item \textbf{Single hardware platform.} Results are from an M2~Ultra (128\,GB). Memory bandwidth and compute characteristics differ on M3/M4 variants, and the optimal keep percentage may shift.
\item \textbf{Single-request evaluation.} The serving engine serializes requests via \texttt{asyncio.Lock}. We do not evaluate \specprefill{} under concurrent load.
\end{itemize}

% ============================================================
\section{Related Work}
\label{sec:related}
% ============================================================

\paragraph{Sparse prefill.}
\citet{specprefill} introduce attention-based sparse prefill using a draft model on discrete GPU architectures.
CritiPrefill~\cite{critiprefill} achieves draft-free sparse prefill using the target model's own attention.
We extend the draft-based approach to unified memory hardware and non-transformer architectures.
To our knowledge, no prior work addresses sparse prefill specifically for unified memory systems.

\paragraph{Speculative decoding.}
\citet{leviathan2023fast} and \citet{chen2023accelerating} propose using a draft model to speculatively generate candidate tokens verified by the target model.
Multi-Token Prediction (MTP)~\cite{gloeckle2024better} uses auxiliary prediction heads within the target model itself.
Our ``Speculative Stack'' composes prefill-phase speculation (\specprefill{}) with decode-phase speculation (MTP), operating in non-overlapping phases.

\paragraph{Efficient attention.}
FlashAttention~\cite{dao2022flashattention,dao2023flashattention2} optimizes attention computation through tiling and memory-efficient kernels.
\specprefill{} is orthogonal: it reduces the token count \emph{before} attention, and can compose with efficient attention implementations.

\paragraph{Serving systems.}
vLLM~\cite{kwon2023efficient} introduces PagedAttention for efficient KV cache management.
Our system (vllm-mlx) adapts continuous-batching and paged-attention concepts for Apple Silicon.
\specprefill{} integrates as a per-request prefill optimization within this serving framework.

\paragraph{MLX framework.}
MLX~\cite{mlx} provides the unified-memory ML runtime enabling our zero-copy scoring approach.
Prior MLX-based serving work has focused on standard inference optimization; we show that unified memory enables system-level optimizations, specifically zero-copy draft scoring, that are impractical on discrete-GPU architectures.

% ============================================================
\section{Future Work}
\label{sec:future}
% ============================================================

\paragraph{Universal draft models with tokenizer translation.}
Our results are constrained to model families where a small same-tokenizer draft exists.
A \emph{universal draft model}---trained to score token importance across vocabularies via a learned re-tokenization layer---would decouple \specprefill{} from the family constraint.
The translation layer would map target token IDs to text, re-tokenize with the universal draft's vocabulary, score importance, and project scores back to target token space via character-offset alignment.
This is non-trivial (tokenization boundaries differ across BPE vocabularies) but would enable \specprefill{} for models like GPT-OSS where no small same-family draft exists.
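
A minimal sketch of the projection step, using Hugging Face fast tokenizers for the character-offset alignment, is shown below. Function and variable names are illustrative, and handling of special tokens and byte-level edge cases is omitted.

\begin{verbatim}
from transformers import AutoTokenizer

def project_scores(text, draft_tok_name, target_tok_name, draft_scorer):
    draft_tok = AutoTokenizer.from_pretrained(draft_tok_name)
    target_tok = AutoTokenizer.from_pretrained(target_tok_name)

    d = draft_tok(text, return_offsets_mapping=True, add_special_tokens=False)
    t = target_tok(text, return_offsets_mapping=True, add_special_tokens=False)

    # Importance per draft token (e.g., pooled attention received),
    # produced by the draft scoring pass.
    draft_scores = draft_scorer(d["input_ids"])

    # Give each target token the max score of any draft token whose
    # character span overlaps it.
    target_scores = []
    for ts, te in t["offset_mapping"]:
        overlapping = [s for (ds, de), s in zip(d["offset_mapping"], draft_scores)
                       if ds < te and de > ts]
        target_scores.append(max(overlapping) if overlapping else 0.0)
    return t["input_ids"], target_scores
\end{verbatim}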

\paragraph{CritiPrefill for dense models.}
For dense architectures where the FLOP ratio is unfavorable, CritiPrefill (draft-free) may be more practical.
Published results show 2.7--3.0$\times$ on dense 8B--9B transformers at 128K context.
On our MoE targets, where attention is a small fraction of compute, gains would be limited; the approach warrants investigation for dense models on unified memory, where memory access patterns differ from discrete GPUs.

\paragraph{SSM state drift analysis.}
For hybrid models like Nemotron-H, quantifying the L2 distance between full-prefill and sparse-prefill SSM hidden states would characterize the information loss from skipping tokens in recurrent layers and establish quality guarantees beyond empirical testing.
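
The metric itself is straightforward once the hidden states are exposed; a sketch, assuming hooks that return the final SSM state of a given layer after each prefill variant:

\begin{verbatim}
import numpy as np

def relative_state_drift(state_full, state_sparse):
    """Relative L2 distance between full- and sparse-prefill SSM states."""
    full = np.asarray(state_full, dtype=np.float32).ravel()
    sparse = np.asarray(state_sparse, dtype=np.float32).ravel()
    return float(np.linalg.norm(full - sparse) / (np.linalg.norm(full) + 1e-8))

# One number per SSM layer; drift near 0 means sparse prefill tracked
# the full-prefill state closely.
# drifts = [relative_state_drift(f, s) for f, s in zip(full_states, sparse_states)]
\end{verbatim}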

\paragraph{Continuous batching.}
Our current implementation uses a single-request serialized engine.
Integrating \specprefill{} with continuous batching would enable concurrent request handling, but is currently blocked by a Metal driver stability issue on macOS~26.3.1.

\paragraph{Hardware generalization.}
Apple's M3 and M4 generations have different memory bandwidth and compute characteristics.
The optimal keep percentage and FLOP-ratio threshold may shift on these platforms.

% ============================================================
\section{Conclusion}
\label{sec:conclusion}
% ============================================================

We have presented the first implementation of \specprefill{} on unified memory hardware, demonstrating that Apple Silicon's shared address space eliminates the data-transfer overhead that complicates draft-based sparse prefill on discrete GPUs.
With transfer overhead removed, the cost equation reduces to a single dominant term: the draft-to-target FLOP ratio, validated across six configurations spanning MoE, Mamba-2 hybrid, and sliding-window dense architectures.

On Qwen3.5-122B, \specprefill{} reduces TTFT by $3.71$--$5.45\times$ across 8K--128K tokens with a 1.4\,GB draft model and no observed quality degradation under our evaluation. At 128K tokens, prefill drops from 19.3~minutes to 3.5~minutes.
Composed with system prompt KV caching, end-to-end speedup reaches $5.6\times$ on a 73K-token production workload.
The implementation handles architecture-specific challenges (gated queries, heterogeneous SSM/attention layers, sliding-window caches) through auto-detecting adapters that require no user configuration.

\specprefill{} is most effective on MoE and hybrid models where total parameters far exceed active computation, making a small dense draft model orders of magnitude cheaper than the target.
As large models move to local hardware, reducing prefill cost through techniques like zero-copy draft scoring directly determines whether long-context inference is usable.

% ============================================================
% References
% ============================================================

\bibliographystyle{plainnat}

\begin{thebibliography}{99}

\bibitem[Chen et~al.(2023)]{chen2023accelerating}
Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau, Laurent Sifre, and John Jumper.
\newblock Accelerating large language model decoding with speculative sampling.
\newblock \emph{arXiv preprint arXiv:2302.01318}, 2023.

\bibitem[Dao(2023)]{dao2023flashattention2}
Tri Dao.
\newblock Flash{A}ttention-2: Faster attention with better parallelism and work partitioning.
\newblock \emph{arXiv preprint arXiv:2307.08691}, 2023.

\bibitem[Dao et~al.(2022)]{dao2022flashattention}
Tri Dao, Daniel~Y. Fu, Stefano Ermon, Atri Rudra, and Christopher R\'{e}.
\newblock Flash{A}ttention: Fast and memory-efficient exact attention with {IO}-awareness.
\newblock In \emph{NeurIPS}, 2022.

\bibitem[Gloeckle et~al.(2024)]{gloeckle2024better}
Fabian Gloeckle, Badr Youbi~Idrissi, Baptiste Rozi\`{e}re, David Lopez-Paz, and Gabriel Synnaeve.
\newblock Better \& faster large language models via multi-token prediction.
\newblock \emph{arXiv preprint arXiv:2404.19737}, 2024.

\bibitem[Kwon et~al.(2023)]{kwon2023efficient}
Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody~Hao Yu, Joseph~E. Gonzalez, Hao Zhang, and Ion Stoica.
\newblock Efficient memory management for large language model serving with {PagedAttention}.
\newblock In \emph{SOSP}, 2023.

\bibitem[Leviathan et~al.(2023)]{leviathan2023fast}
Yaniv Leviathan, Matan Kalman, and Yossi Matias.
\newblock Fast inference from transformers via speculative decoding.
\newblock In \emph{ICML}, 2023.

\bibitem[Apple(2023)]{mlx}
Apple Machine Learning Research.
\newblock {MLX}: An array framework for Apple Silicon.
\newblock \url{https://github.com/ml-explore/mlx}, 2023.

\bibitem[Peng et~al.(2024)]{yarn}
Bowen Peng, Jeffrey Quesnelle, Honglu Fan, and Enrico Shippole.
\newblock {YaRN}: Efficient context window extension of large language models.
\newblock In \emph{ICLR}, 2024.

\bibitem[Su et~al.(2024)]{rope}
Jianlin Su, Murtadha Ahmed, Yu~Lu, Shengfeng Pan, Bo~Wen, and Yunfeng Liu.
\newblock {RoFormer}: Enhanced transformer with rotary position embedding.
\newblock \emph{Neurocomputing}, 568:127063, 2024.

\bibitem[Yao et~al.(2025)]{specprefill}
Ziteng Yao, Wei Chen, Yushi Huang, and others.
\newblock {SpecPrefill}: Speculative prefilling for faster long-context {LLM} inference.
\newblock \emph{arXiv preprint arXiv:2502.02789}, 2025.

\bibitem[Qwen(2025)]{qwen35}
Qwen Team.
\newblock {Qwen3.5}: A series of large language models.
\newblock \url{https://huggingface.co/Qwen/Qwen3.5-122B-A10B}, 2025.

\bibitem[Zhang et~al.(2025)]{critiprefill}
Junlin Zhang, Jiahao Li, and others.
\newblock {CritiPrefill}: A segment-level critique framework for efficient long-context {LLM} prefilling.
\newblock \emph{arXiv preprint}, 2025.

\end{thebibliography}

\end{document}