florianleibert committed (verified) · Commit 37c082a · 1 parent: 816fd85

Upload folder using huggingface_hub
Dockerfile.kimi26-dflash ADDED
# Kimi K2.6 DFlash source-patched image for 8x MI300X (gfx942)
#
# Base: vllm/vllm-openai-rocm:nightly
# When a date-pinned tag becomes available (e.g. :2026-04-21), switch to it
# and record the vLLM version (v0.19.2rc1.dev21 at time of writing).
#
# This image bakes the DFlash ROCm patches in at build time, so the launcher
# no longer needs to run patch_dflash_rocm.py at container startup.
# The patches are idempotent; running the script again inside this image
# is a safe no-op.

FROM vllm/vllm-openai-rocm:nightly

# --- ROCm / AITER / vLLM environment defaults for gfx942 ---
ENV PYTORCH_ROCM_ARCH=gfx942 \
    AITER_ROCM_ARCH=gfx942 \
    GPU_ARCHS=gfx942 \
    VLLM_ROCM_USE_AITER=1 \
    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 \
    VLLM_ROCM_USE_AITER_RMSNORM=0 \
    HSA_ENABLE_SDMA=0 \
    HSA_NO_SCRATCH_RECLAIM=1 \
    OMP_NUM_THREADS=1

# --- Copy and apply DFlash patches ---
COPY payload/patch_dflash_rocm.py /tmp/patch_dflash_rocm.py
RUN python3 /tmp/patch_dflash_rocm.py && rm /tmp/patch_dflash_rocm.py

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
README.md ADDED
---
license: apache-2.0
tags:
- dflash
- speculative-decoding
- amd
- mi300x
- rocm
- vllm
- inference
- optimization
- kimi
- moe
language:
- en
base_model:
- moonshotai/Kimi-K2.6
- z-lab/Kimi-K2.5-DFlash
---

# Kimi K2.6 + DFlash: 508 tok/s on 8x MI300X

<p align="center">
<strong>5.6x throughput improvement</strong> over baseline autoregressive serving<br>
<em>90 tok/s → 508 tok/s on the same hardware, same model, zero quality loss</em>
</p>

---

## Performance

### Throughput Scaling

<p align="center">
<img src="assets/throughput-scaling.png" alt="Throughput scaling chart showing 90 to 508 tok/s" width="900">
</p>

### Head-to-Head: DFlash vs Autoregressive

| | Autoregressive (baseline) | DFlash st=2 (this config) | Speedup |
|---|---:|---:|---:|
| **8 users** | 90.4 tok/s | 127.1 tok/s | **1.4x** |
| **12 users** | 125.1 tok/s | 192.8 tok/s | **1.5x** |
| **16 users** | — | 250.8 tok/s | — |
| **24 users** | — | 379.0 tok/s | — |
| **32 users** | — | **507.6 tok/s** | **5.6x** |

> All measurements: no prefix cache, warmed server, 512 max tokens, temperature=0, prompts from a diverse reasoning benchmark set. Per-request latency stays at ~30s regardless of concurrency. The headline 5.6x compares DFlash at 32 users against the autoregressive baseline at its 8-user operating point.
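The speedup column can be reproduced directly from the table; a quick sanity check, with values rounded to one decimal:

```python
# Recompute the speedup column from the head-to-head table above.
baseline = {8: 90.4, 12: 125.1}            # autoregressive tok/s
dflash = {8: 127.1, 12: 192.8, 32: 507.6}  # DFlash st=2 tok/s

print(round(dflash[8] / baseline[8], 1))    # 1.4 (matched concurrency, 8 users)
print(round(dflash[12] / baseline[12], 1))  # 1.5 (matched concurrency, 12 users)
print(round(dflash[32] / baseline[8], 1))   # 5.6 (32-user DFlash vs 8-user baseline)
```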

### Per-User Latency

<p align="center">
<img src="assets/latency-flat.png" alt="Latency stays flat as concurrency scales" width="750">
</p>

| Concurrent users | Mean latency | P95 latency | Per-user tok/s |
|---:|---:|---:|---:|
| 8 | 31.0s | 31.3s | 15.9 |
| 16 | 30.8s | 31.1s | 15.7 |
| 24 | 30.0s | 30.4s | 15.8 |
| 32 | 30.7s | 31.0s | 15.9 |

Latency does not degrade as concurrency increases: each user sees a consistent ~15.8 tok/s regardless of how many others are being served.
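Aggregate throughput is simply the per-user rate times concurrency, which is why raising the number of active sequences scales throughput linearly. Using the steady ~15.9 tok/s per user from the table, the predicted aggregates land within about 1.5% of the measured 127.1 / 250.8 / 379.0 / 507.6 tok/s:

```python
# Aggregate tok/s ≈ per-user tok/s × concurrent users.
per_user = 15.9  # steady per-user rate from the table above
for users in (8, 16, 24, 32):
    print(users, round(per_user * users, 1))
```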

---

## What is this?

A production-ready serving configuration for [moonshotai/Kimi-K2.6](https://huggingface.co/moonshotai/Kimi-K2.6) using [DFlash speculative decoding](https://github.com/z-lab/dflash) with the [z-lab/Kimi-K2.5-DFlash](https://huggingface.co/z-lab/Kimi-K2.5-DFlash) draft model, optimized for AMD MI300X GPUs.

This is **not a new model**: it is an optimized serving recipe. The model weights are unchanged, and output quality is identical to standard autoregressive serving.

### Three optimizations that delivered 5.6x

<p align="center">
<img src="assets/optimization-journey.png" alt="Optimization journey from 90 to 508 tok/s" width="750">
</p>

| What | Before | After | Impact |
|---|---|---|---|
| NUMA balancing | Enabled | **Disabled** | Removed memory-access bottleneck across NUMA domains |
| DFlash spec tokens | 8 | **2** | Acceptance rate: 16% → 50%; DFlash went from net-negative to net-positive |
| max_num_seqs | 8 | **32** | Linear throughput scaling; each slot adds ~15.8 tok/s |

---

## Hardware

<p align="center">
<img src="assets/hardware-stack.png" alt="Hardware and software stack" width="800">
</p>

| Component | Specification |
|---|---|
| **GPU** | 8x AMD Instinct MI300X |
| **GPU Architecture** | CDNA 3 (gfx942) |
| **VRAM per GPU** | 192 GB HBM3 |
| **Total VRAM** | 1,536 GB (1.5 TB) |
| **System RAM** | ~2 TB |
| **Storage** | NVMe (14 TB), model on local disk |
| **Runtime** | vLLM v0.19.2 ROCm nightly |
| **ROCm Version** | 6.x |

### Model Specifications

| | Target Model | Draft Model |
|---|---|---|
| **Name** | moonshotai/Kimi-K2.6 | z-lab/Kimi-K2.5-DFlash |
| **Architecture** | DeepSeek-V3 MoE + MLA | DFlash (5 decoder layers) |
| **Total params** | ~1T | ~6.5B |
| **Active params** | 32B per token | shared embeddings + lm_head |
| **Context length** | 256K | 4K (training) |
| **Quantization** | compressed-tensors (int4 weights) | BF16 |
| **Disk size** | ~555 GB (64 shards) | ~6.5 GB |

---

## Quick Start

### 1. Download models

```bash
# Target model (~555 GB)
huggingface-cli download moonshotai/Kimi-K2.6 --local-dir /models/Kimi-K2.6

# Draft model (~6.5 GB)
huggingface-cli download z-lab/Kimi-K2.5-DFlash --local-dir /models/Kimi-K2.5-DFlash
```

### 2. Configure

Edit `configs/production.env`:

```bash
MODEL_DIR=/models/Kimi-K2.6
DRAFT_MODEL_DIR=/models/Kimi-K2.5-DFlash
```

### 3. Disable NUMA balancing (required)

```bash
sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
```

### 4. Launch

```bash
./serve.sh
```

The server takes ~5 minutes to load. Once ready:

```bash
curl http://localhost:8262/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "kimi-k2.6-amd-dflash",
    "messages": [{"role": "user", "content": "Explain the Riemann hypothesis"}],
    "max_tokens": 512,
    "temperature": 0
  }'
```

### 5. Benchmark

```bash
# Single-shot throughput benchmark
python3 payload/benchmark_multi_turn.py \
  --base-url http://localhost:8262/v1 \
  --model kimi-k2.6-amd-dflash \
  --sessions 32 --turns-per-session 1 \
  --max-tokens 512

# Compare against the autoregressive baseline:
# launch without DFlash (remove --speculative-config, set --block-size 1)
# and run the same benchmark.
```

---

## How DFlash Works

```
Standard Autoregressive              DFlash Speculative (st=2)
=======================              =========================

Step 1: Generate token 1             Step 1: Draft predicts tokens 1,2
Step 2: Generate token 2             Step 2: Target verifies both in ONE pass
Step 3: Generate token 3               → If both accepted: got 2 tokens for ~1 step
Step 4: Generate token 4               → If only token 1 accepted: got 1 token
...                                  Step 3: Draft predicts tokens 3,4
                                     Step 4: Target verifies...

4 tokens = 4 forward passes          4 tokens ≈ 2-3 forward passes
```

The draft model (`Kimi-K2.5-DFlash`, 6.5 GB) is ~85x smaller than the target and runs in <1% of the target's compute time. When its predictions match the target (45-67% acceptance at st=2), the extra tokens are essentially free.
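The draft/verify loop above can be sketched in a few lines. This is a toy illustration of greedy speculative decoding, not vLLM's actual implementation; `target` and `draft` stand in for the two models' next-token functions:

```python
def speculative_decode(target, draft, ctx, n_tokens, st=2):
    """Toy greedy speculative decoding: the draft proposes `st` tokens,
    the target keeps the longest matching prefix (one verification pass
    in a real engine), then contributes the next token itself."""
    out = list(ctx)
    while len(out) - len(ctx) < n_tokens:
        proposal = []
        for _ in range(st):              # cheap draft rollout
            proposal.append(draft(out + proposal))
        for tok in proposal:             # verification: accept matching prefix
            if target(out) != tok:
                break
            out.append(tok)
        out.append(target(out))          # target always adds one token per pass
    return out[len(ctx):len(ctx) + n_tokens]

# With identical toy "models" every proposal is accepted, so each
# verification pass yields st + 1 = 3 tokens:
next_tok = lambda seq: len(seq) % 7
print(speculative_decode(next_tok, next_tok, [0], 6))  # [1, 2, 3, 4, 5, 6]
```

With a perfect drafter this generates st+1 tokens per target pass; with a mismatched drafter the accepted prefix shortens, which is exactly the acceptance-rate story below.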

### Why st=2 instead of st=8?

<p align="center">
<img src="assets/acceptance-comparison.png" alt="Acceptance rate comparison: st=8 vs st=2" width="900">
</p>

The public drafter was trained against K2.5, not K2.6. The version mismatch makes acceptance drop sharply at later draft positions:

| Spec tokens | Pos 0 | Pos 1 | Pos 2 | Pos 3 | Pos 4-7 | Avg acceptance | Net effect |
|---:|---:|---:|---:|---:|---:|---:|---|
| **2** | 64% | 34% | — | — | — | **49%** | **+40% throughput** |
| 8 | 64% | 34% | 18% | 9% | <3% | 16% | -20% throughput |

At st=8, the target model wastes compute verifying six tokens that will almost certainly be rejected. At st=2, every verification step has a ~50% chance of yielding a free token.
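The 49% and 16% averages follow directly from the per-position rates (the same numbers plotted by `assets/generate_charts.py`). As an illustration only, assuming a draft token can be accepted only when all earlier ones were and treating positions as independent, the expected number of accepted draft tokens per verification pass barely grows past st=2:

```python
# Per-position acceptance rates (%) of the K2.5 drafter on the K2.6 target.
accept = [64, 34, 18, 9, 4, 2, 1, 0.5]

print(sum(accept[:2]) / 2)        # 49.0 -> the "49%" average at st=2
print(round(sum(accept) / 8, 1))  # 16.6 -> the ~"16%" average at st=8

def expected_accepted(st):
    """Expected accepted draft tokens per pass under a chained-acceptance
    model (an illustrative assumption, not a measured quantity)."""
    chain, total = 1.0, 0.0
    for rate in accept[:st]:
        chain *= rate / 100
        total += chain
    return total

print(round(expected_accepted(2), 2))  # 0.86
print(round(expected_accepted(8), 2))  # 0.9: 4x the verify work for ~0.04 more tokens
```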

---

## ROCm Patches

DFlash requires 9 patches to run on ROCm with MLA attention. They are applied automatically at container startup by `patches/patch_dflash_rocm.py` (or baked in at build time by `Dockerfile.kimi26-dflash`). Grouped by purpose, the patches:

1. Add non-causal attention support to the AITER flash attention backend
2. Force the TRITON_MLA backend for the target model when the DFlash draft uses standard attention
3. Add an `IS_CAUSAL` parameter to the Triton unified attention kernels
4. Relax causal assertions in the DFlash verification path

All patches are idempotent and track upstream [vllm-project/vllm#39930](https://github.com/vllm-project/vllm/pull/39930).
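Idempotent here means the patcher detects its own work before touching a file, so re-running it in an already patched image is a safe no-op. A minimal sketch of that pattern; the marker string and helper are illustrative, not taken from the actual script:

```python
MARKER = "# dflash-rocm-patch-applied"  # hypothetical sentinel comment

def apply_patch(source: str, patch_body: str) -> str:
    """Append a patch once; repeated calls leave the text unchanged."""
    if MARKER in source:
        return source  # already patched: safe no-op
    return source + "\n" + MARKER + "\n" + patch_body + "\n"

once = apply_patch("def attn(): ...", "IS_CAUSAL = False")
twice = apply_patch(once, "IS_CAUSAL = False")
print(once == twice)  # True
```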

---

## Configuration Reference

```bash
# configs/production.env: all tunable parameters

NUM_SPECULATIVE_TOKENS=2      # DFlash draft tokens per step
MAX_NUM_SEQS=32               # Max concurrent decode sequences
MAX_NUM_BATCHED_TOKENS=32768  # Max tokens per scheduler step
MAX_MODEL_LEN=262144          # Max context length (256K)
GPU_MEMORY_UTILIZATION=0.90   # Fraction of VRAM for KV cache
BLOCK_SIZE=16                 # Required for DFlash + MLA
ENFORCE_EAGER=true            # Compiled mode provides no gain
MOE_BACKEND=aiter             # AMD's optimized MoE kernels
```
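If you want to read this file from Python (e.g. for sweeps or validation) rather than sourcing it from a shell script, a small parser that skips blanks and comments is enough. A sketch; the parsing logic is ours, not something the repo's launchers expose, and it would mishandle values that themselves contain `#`:

```python
def load_env(text: str) -> dict:
    """Parse KEY=VALUE lines, skipping blank lines and (inline) comments."""
    env = {}
    for line in text.splitlines():
        line = line.split("#", 1)[0].strip()  # drop comments (breaks on '#' in values)
        if not line:
            continue
        key, _, value = line.partition("=")
        env[key.strip()] = value.strip()
    return env

cfg = load_env("""
# configs/production.env
NUM_SPECULATIVE_TOKENS=2      # DFlash draft tokens per step
MAX_NUM_SEQS=32
""")
print(cfg)  # {'NUM_SPECULATIVE_TOKENS': '2', 'MAX_NUM_SEQS': '32'}
```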

### Known Constraints

| Constraint | Root cause | Workaround |
|---|---|---|
| `max_num_batched_tokens` capped at 32768 | AITER MoE kernel requires power-of-2 experts; K2.6 has 384 | Stay at 32768 |
| FP8 KV cache crashes | Same 384-expert AITER constraint, hit during profiling | Use BF16 KV (default) |
| TurboQuant KV cache crashes | Same issue | Use BF16 KV |
| K2.5 drafter acceptance ~50% | Model version mismatch (drafter trained for K2.5) | Train a K2.6-specific drafter |

---

## What's Next: Path to 1000 tok/s

| Optimization | Expected throughput | Status |
|---|---|---|
| Current config (seqs=32, st=2) | **508 tok/s** | Done |
| Push seqs to 48-64 | 750-1000 tok/s | Ready to test |
| Train a K2.6-matched DFlash drafter | ~800 tok/s at seqs=32 | Needs training compute |
| AITER 384-expert fix → FP8 KV | 2x KV capacity → 2x seqs | Waiting on upstream |
| DDTree draft trees | +35% on a matched drafter | Research (arXiv 2604.12989) |
| EAGLE-3 self-draft head | 70-80% acceptance | Needs head training |

---

## Repository Structure

```
kimi-k26dflash/
├── README.md                        # This file
├── serve.sh                         # One-command server launch
├── Dockerfile.kimi26-dflash         # Patch-at-build Docker image
├── build-kimi26-dflash.sh           # Docker build helper
├── configs/
│   └── production.env               # All tunable parameters
├── patches/
│   └── patch_dflash_rocm.py         # 9 ROCm patches (idempotent)
├── launchers/
│   ├── kimi26-vllm-dflash.sh        # Standard launcher
│   └── kimi26-vllm-dflash-sweep.sh  # Parameter sweep
├── payload/
│   ├── benchmark_multi_turn.py      # Multi-turn benchmark tool
│   └── preshard_kimi26.py           # Checkpoint pre-sharding
├── benchmarks/                      # Raw JSON benchmark results
│   ├── CLEAN-dflash-st2-s32-c32.json  # 508 tok/s
│   ├── CLEAN-dflash-st2-s24-c24.json  # 379 tok/s
│   └── ...
└── docs/
    ├── kimi-k2.6-250-toks-achieved-2026-04-21.md
    ├── kimi-k2.6-acceptance-rate-analysis-2026-04-21.md
    └── kimi-k2.6-dflash-execution-playbook-2026-04-21.md
```

## Citation

If you use this configuration:

```bibtex
@misc{kimi-k26-dflash-mi300x-2026,
  title={Kimi K2.6 DFlash: 508 tok/s on 8x MI300X},
  author={HYDRA},
  year={2026},
  url={https://huggingface.co/hydra/kimi-k26-dflash-mi300x}
}
```

## Acknowledgments

- [Moonshot AI](https://huggingface.co/moonshotai) for Kimi K2.6
- [Z-Lab](https://huggingface.co/z-lab) for the DFlash drafter and framework
- [vLLM project](https://github.com/vllm-project/vllm) for the serving engine
- [AMD ROCm](https://rocm.docs.amd.com/) for the MI300X software stack and AITER kernels
assets/acceptance-comparison.png ADDED
assets/generate_charts.py ADDED
#!/usr/bin/env python3
"""Generate all charts for the HuggingFace README."""
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

OUT = Path(__file__).parent

COLORS = {
    'dflash': '#00d4aa',
    'autoreg': '#ff6b6b',
    'gold': '#ffd700',
    'blue': '#4dabf7',
    'bg': '#0d1117',
    'grid': '#ffffff',
    'text': '#e6edf3',
}

plt.rcParams.update({
    'figure.facecolor': COLORS['bg'],
    'axes.facecolor': COLORS['bg'],
    'text.color': COLORS['text'],
    'axes.labelcolor': COLORS['text'],
    'xtick.color': COLORS['text'],
    'ytick.color': COLORS['text'],
    'font.family': 'sans-serif',
})


def chart_throughput_scaling():
    fig, ax = plt.subplots(figsize=(13, 6.5))

    concurrency = [8, 12, 16, 20, 24, 32]
    dflash = [127, 193, 251, 323, 379, 508]
    autoreg = [90, 125]

    x = np.arange(len(concurrency))
    w = 0.38

    bars_d = ax.bar(x, dflash, width=w*2, color=COLORS['dflash'], zorder=3,
                    edgecolor='white', linewidth=0.5, label='DFlash st=2 (this config)')

    bars_a = ax.bar(x[:2] - 0.01, autoreg, width=w*2, color=COLORS['autoreg'],
                    alpha=0.6, zorder=2, edgecolor='white', linewidth=0.5,
                    label='Autoregressive baseline')

    for bar, v in zip(bars_d, dflash):
        ax.text(bar.get_x() + bar.get_width()/2, v + 10, f'{v}',
                ha='center', va='bottom', fontweight='bold', fontsize=14, color=COLORS['dflash'])

    for bar, v in zip(bars_a, autoreg):
        ax.text(bar.get_x() + bar.get_width()/2, v - 15, f'{v}',
                ha='center', va='top', fontsize=12, color='white', fontweight='bold')

    ax.axhline(y=500, color=COLORS['gold'], linestyle='--', alpha=0.4, linewidth=1.5)
    ax.text(5.6, 508, '500 tok/s', ha='right', color=COLORS['gold'], fontsize=10, alpha=0.6)

    ax.plot(x, dflash, color=COLORS['dflash'], alpha=0.4, linewidth=2, zorder=1, linestyle='--')

    ax.set_xticks(x)
    ax.set_xticklabels([f'{c} users' for c in concurrency], fontsize=12)
    ax.set_ylabel('Output tokens / second', fontsize=14, labelpad=10)
    ax.set_title('Kimi K2.6 Throughput Scaling\n8x AMD Instinct MI300X (gfx942, 192 GB HBM3 each)',
                 fontsize=17, fontweight='bold', pad=15)
    ax.legend(fontsize=13, loc='upper left', framealpha=0.3)
    ax.set_ylim(0, 590)
    ax.grid(axis='y', alpha=0.1, color=COLORS['grid'])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('#333')
    ax.spines['bottom'].set_color('#333')

    fig.tight_layout()
    fig.savefig(OUT / 'throughput-scaling.png', dpi=150, bbox_inches='tight')
    print('saved throughput-scaling.png')


def chart_speedup():
    fig, ax = plt.subplots(figsize=(10, 5.5))

    configs = [
        'Autoreg\nseqs=8\n(old baseline)',
        'DFlash st=8\nseqs=8\n(old DFlash)',
        'DFlash st=2\nseqs=8',
        'DFlash st=2\nseqs=16',
        'DFlash st=2\nseqs=24',
        'DFlash st=2\nseqs=32',
    ]
    tps = [90, 108, 127, 251, 379, 508]
    colors = [COLORS['autoreg'], COLORS['autoreg'], COLORS['blue'],
              COLORS['blue'], COLORS['dflash'], COLORS['dflash']]

    bars = ax.barh(range(len(configs)), tps, color=colors, edgecolor='white',
                   linewidth=0.5, height=0.65, zorder=3)

    for bar, v in zip(bars, tps):
        label = f' {v} tok/s'
        if v == 508:
            label += ' (5.6x)'
        ax.text(v + 5, bar.get_y() + bar.get_height()/2, label,
                va='center', fontsize=13, fontweight='bold', color=COLORS['text'])

    ax.set_yticks(range(len(configs)))
    ax.set_yticklabels(configs, fontsize=11)
    ax.set_xlabel('Output tokens / second', fontsize=13, labelpad=10)
    ax.set_title('Optimization Journey: 90 → 508 tok/s',
                 fontsize=16, fontweight='bold', pad=15)
    ax.set_xlim(0, 620)
    ax.invert_yaxis()
    ax.grid(axis='x', alpha=0.1, color=COLORS['grid'])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('#333')
    ax.spines['bottom'].set_color('#333')

    fig.tight_layout()
    fig.savefig(OUT / 'optimization-journey.png', dpi=150, bbox_inches='tight')
    print('saved optimization-journey.png')


def chart_acceptance():
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))

    positions_8 = ['Pos 0', 'Pos 1', 'Pos 2', 'Pos 3', 'Pos 4', 'Pos 5', 'Pos 6', 'Pos 7']
    accept_8 = [64, 34, 18, 9, 4, 2, 1, 0.5]
    positions_2 = ['Pos 0', 'Pos 1']
    accept_2 = [64, 34]

    bars8 = ax1.bar(positions_8, accept_8,
                    color=[COLORS['dflash'] if v > 20 else COLORS['autoreg'] for v in accept_8],
                    edgecolor='white', linewidth=0.5, zorder=3)
    for bar, v in zip(bars8, accept_8):
        ax1.text(bar.get_x() + bar.get_width()/2, v + 1.5, f'{v}%',
                 ha='center', fontsize=10, color=COLORS['text'])
    ax1.axhline(y=20, color=COLORS['gold'], linestyle='--', alpha=0.4)
    ax1.text(7.5, 22, 'break-even', ha='right', fontsize=9, color=COLORS['gold'], alpha=0.6)
    ax1.set_title('st=8: 16% avg acceptance\nWastes compute on positions 3-7',
                  fontsize=13, fontweight='bold', color=COLORS['autoreg'])
    ax1.set_ylabel('Acceptance rate (%)', fontsize=12)
    ax1.set_ylim(0, 80)
    ax1.grid(axis='y', alpha=0.1)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_color('#333')
    ax1.spines['bottom'].set_color('#333')

    bars2 = ax2.bar(positions_2, accept_2, color=COLORS['dflash'],
                    edgecolor='white', linewidth=0.5, width=0.5, zorder=3)
    for bar, v in zip(bars2, accept_2):
        ax2.text(bar.get_x() + bar.get_width()/2, v + 1.5, f'{v}%',
                 ha='center', fontsize=14, fontweight='bold', color=COLORS['dflash'])
    ax2.axhline(y=20, color=COLORS['gold'], linestyle='--', alpha=0.4)
    ax2.text(1.7, 22, 'break-even', ha='right', fontsize=9, color=COLORS['gold'], alpha=0.6)
    ax2.set_title('st=2: 49% avg acceptance\nEvery position contributes',
                  fontsize=13, fontweight='bold', color=COLORS['dflash'])
    ax2.set_ylim(0, 80)
    ax2.grid(axis='y', alpha=0.1)
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['left'].set_color('#333')
    ax2.spines['bottom'].set_color('#333')

    fig.suptitle('Why 2 Speculative Tokens Beats 8 (K2.5 drafter on K2.6 target)',
                 fontsize=15, fontweight='bold', y=1.02)
    fig.tight_layout()
    fig.savefig(OUT / 'acceptance-comparison.png', dpi=150, bbox_inches='tight')
    print('saved acceptance-comparison.png')


def chart_latency():
    fig, ax = plt.subplots(figsize=(10, 5))

    concurrency = [8, 12, 16, 20, 24, 32]
    latency = [31.0, 30.7, 30.8, 30.2, 30.0, 30.7]
    per_user = [15.9, 16.1, 15.7, 16.2, 15.8, 15.9]

    ax2 = ax.twinx()

    line1 = ax.plot(concurrency, latency, 'o-', color=COLORS['blue'], linewidth=2.5,
                    markersize=10, label='Mean latency (s)', zorder=3)
    ax.fill_between(concurrency, [l - 0.5 for l in latency], [l + 0.5 for l in latency],
                    color=COLORS['blue'], alpha=0.1)

    line2 = ax2.plot(concurrency, per_user, 's--', color=COLORS['gold'], linewidth=2,
                     markersize=8, label='Per-user tok/s', zorder=3)

    ax.set_xlabel('Concurrent Users', fontsize=13)
    ax.set_ylabel('Mean Latency (seconds)', fontsize=13, color=COLORS['blue'])
    ax2.set_ylabel('Per-User tok/s', fontsize=13, color=COLORS['gold'])
    ax.set_ylim(25, 36)
    ax2.set_ylim(12, 20)

    lines = line1 + line2
    labels = [l.get_label() for l in lines]
    ax.legend(lines, labels, fontsize=12, loc='upper left', framealpha=0.3)

    ax.set_title('Latency Stays Flat as Concurrency Scales\n512-token completions, Kimi K2.6 on 8x MI300X',
                 fontsize=15, fontweight='bold', pad=15)
    ax.grid(alpha=0.1)
    ax.spines['top'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax.spines['left'].set_color('#333')
    ax.spines['right'].set_color('#333')
    ax.spines['bottom'].set_color('#333')

    fig.tight_layout()
    fig.savefig(OUT / 'latency-flat.png', dpi=150, bbox_inches='tight')
    print('saved latency-flat.png')


def chart_hardware():
    fig, ax = plt.subplots(figsize=(11, 3))
    ax.axis('off')

    table_data = [
        ['8x AMD Instinct MI300X', 'gfx942 (CDNA 3)', '192 GB HBM3 each', '1,536 GB total'],
        ['moonshotai/Kimi-K2.6', '1T MoE / 32B active', '256K context', '555 GB (64 shards)'],
        ['z-lab/Kimi-K2.5-DFlash', '5 decoder layers', 'Shared embed/lm_head', '6.5 GB'],
        ['vLLM v0.19.2 ROCm', 'AITER MoE kernels', 'TRITON_MLA attention', 'DFlash patched'],
    ]
    row_labels = ['GPU', 'Target', 'Drafter', 'Runtime']

    table = ax.table(cellText=table_data, rowLabels=row_labels,
                     loc='center', cellLoc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(11)
    table.scale(1, 1.8)

    for key, cell in table.get_celld().items():
        cell.set_edgecolor('#333')
        if key[0] == 0:
            cell.set_facecolor('#1a3a2a')
            cell.set_text_props(color=COLORS['dflash'], fontweight='bold')
        elif key[1] == -1:
            cell.set_facecolor('#1a2a3a')
            cell.set_text_props(color=COLORS['blue'], fontweight='bold')
        else:
            cell.set_facecolor(COLORS['bg'])
            cell.set_text_props(color=COLORS['text'])

    ax.set_title('Hardware & Software Stack', fontsize=14, fontweight='bold',
                 pad=10, color=COLORS['text'])

    fig.patch.set_facecolor(COLORS['bg'])
    fig.tight_layout()
    fig.savefig(OUT / 'hardware-stack.png', dpi=150, bbox_inches='tight')
    print('saved hardware-stack.png')


if __name__ == '__main__':
    chart_throughput_scaling()
    chart_speedup()
    chart_acceptance()
    chart_latency()
    chart_hardware()
    print('all charts generated')
assets/hardware-stack.png ADDED
assets/latency-flat.png ADDED
assets/optimization-journey.png ADDED
assets/throughput-scaling.png ADDED
benchmarks/CLEAN-dflash-st2-c12.json ADDED
{
  "results": [
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 27.119965960009722, "ok": true, "prompt_tokens": 84, "total_tokens": 596, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 28.847466292994795, "ok": true, "prompt_tokens": 53, "total_tokens": 565, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 28.8515811850084, "ok": true, "prompt_tokens": 70, "total_tokens": 582, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 28.960750498008565, "ok": true, "prompt_tokens": 59, "total_tokens": 571, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 30.34562196600018, "ok": true, "prompt_tokens": 74, "total_tokens": 586, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 30.570014204000472, "ok": true, "prompt_tokens": 57, "total_tokens": 569, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 31.136799433996202, "ok": true, "prompt_tokens": 67, "total_tokens": 579, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 32.07404939499975, "ok": true, "prompt_tokens": 69, "total_tokens": 581, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 56.03814566100482, "ok": true, "prompt_tokens": 52, "total_tokens": 564, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 58.61083839098865, "ok": true, "prompt_tokens": 66, "total_tokens": 578, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 31.83526515599806, "ok": true, "prompt_tokens": 63, "total_tokens": 575, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 59.17823252400558, "ok": true, "prompt_tokens": 65, "total_tokens": 577, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 60.24104564599111, "ok": true, "prompt_tokens": 49, "total_tokens": 561, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 31.961778358003357, "ok": true, "prompt_tokens": 48, "total_tokens": 560, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 34.7905467919918, "ok": true, "prompt_tokens": 47, "total_tokens": 559, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 35.47256097799982, "ok": true, "prompt_tokens": 54, "total_tokens": 566, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 54.99662993500533, "ok": true, "prompt_tokens": 70, "total_tokens": 582, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 56.382647035003174, "ok": true, "prompt_tokens": 84, "total_tokens": 596, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 57.87489461500081, "ok": true, "prompt_tokens": 74, "total_tokens": 586, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 31.31652324499737, "ok": true, "prompt_tokens": 59, "total_tokens": 571, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 58.09549221000634, "ok": true, "prompt_tokens": 67, "total_tokens": 579, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 34.355430376992445, "ok": true, "prompt_tokens": 69, "total_tokens": 581, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 34.75117868400412, "ok": true, "prompt_tokens": 57, "total_tokens": 569, "ttft_seconds": null},
    {"completion_tokens": 512, "error": null, "finish_reason": "length", "latency_seconds": 35.434746750994236, "ok": true, "prompt_tokens": 53, "total_tokens": 565, "ttft_seconds": null}
  ],
  "summary": {
    "concurrency": 12,
    "errors": [],
    "failed_requests": 0,
    "mean_interactive_tps": null,
    "mean_latency_seconds": 40.385091887208546,
    "mean_ttft_seconds": null,
    "output_token_throughput_tps": 129.86944795587917,
    "p95_interactive_tps": null,
    "p95_latency_seconds": 59.09312340405304,
    "p95_ttft_seconds": null,
    "request_count": 24,
    "request_throughput_rps": 0.2536512655388265,
    "successful_requests": 24,
    "total_completion_tokens": 12288,
    "total_prompt_tokens": 1510,
    "wall_seconds": 94.61809681500017
  }
}
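The summary fields of these benchmark files are internally consistent: aggregate output throughput is total completion tokens over wall-clock time, and request throughput is request count over wall time. Checking against the c12 run above:

```python
# Re-derive the c12 summary's throughput fields from its raw counters.
total_completion_tokens = 12288
wall_seconds = 94.61809681500017
request_count = 24

print(round(total_completion_tokens / wall_seconds, 2))  # ≈ 129.87 (output_token_throughput_tps)
print(round(request_count / wall_seconds, 4))            # ≈ 0.2537 (request_throughput_rps)
```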
benchmarks/CLEAN-dflash-st2-c8.json ADDED
1
+ {
2
+ "results": [
3
+ {
4
+ "completion_tokens": 512,
5
+ "error": null,
6
+ "finish_reason": "length",
7
+ "latency_seconds": 27.653070675005438,
8
+ "ok": true,
9
+ "prompt_tokens": 84,
10
+ "total_tokens": 596,
11
+ "ttft_seconds": null
12
+ },
13
+ {
14
+ "completion_tokens": 512,
15
+ "error": null,
16
+ "finish_reason": "length",
17
+ "latency_seconds": 30.492271046998212,
18
+ "ok": true,
19
+ "prompt_tokens": 59,
20
+ "total_tokens": 571,
21
+ "ttft_seconds": null
22
+ },
23
+ {
24
+ "completion_tokens": 512,
25
+ "error": null,
26
+ "finish_reason": "length",
27
+ "latency_seconds": 30.60518449699157,
28
+ "ok": true,
29
+ "prompt_tokens": 53,
30
+ "total_tokens": 565,
31
+ "ttft_seconds": null
32
+ },
33
+ {
34
+ "completion_tokens": 512,
35
+ "error": null,
36
+ "finish_reason": "length",
37
+ "latency_seconds": 31.15824187399994,
38
+ "ok": true,
39
+ "prompt_tokens": 70,
40
+ "total_tokens": 582,
41
+ "ttft_seconds": null
42
+ },
43
+ {
44
+ "completion_tokens": 512,
45
+ "error": null,
46
+ "finish_reason": "length",
47
+ "latency_seconds": 31.728173413008335,
48
+ "ok": true,
49
+ "prompt_tokens": 69,
50
+ "total_tokens": 581,
51
+ "ttft_seconds": null
52
+ },
53
+ {
54
+ "completion_tokens": 512,
55
+ "error": null,
56
+ "finish_reason": "length",
57
+ "latency_seconds": 31.840813989998423,
58
+ "ok": true,
59
+ "prompt_tokens": 57,
60
+ "total_tokens": 569,
61
+ "ttft_seconds": null
62
+ },
63
+ {
64
+ "completion_tokens": 512,
65
+ "error": null,
66
+ "finish_reason": "length",
67
+ "latency_seconds": 32.0710041429993,
68
+ "ok": true,
69
+ "prompt_tokens": 74,
70
+ "total_tokens": 586,
71
+ "ttft_seconds": null
72
+ },
73
+ {
74
+ "completion_tokens": 512,
75
+ "error": null,
76
+ "finish_reason": "length",
77
+ "latency_seconds": 32.54212190301041,
78
+ "ok": true,
79
+ "prompt_tokens": 67,
80
+ "total_tokens": 579,
81
+ "ttft_seconds": null
82
+ },
83
+ {
84
+ "completion_tokens": 512,
85
+ "error": null,
86
+ "finish_reason": "length",
87
+ "latency_seconds": 27.12593254300009,
88
+ "ok": true,
89
+ "prompt_tokens": 52,
90
+ "total_tokens": 564,
91
+ "ttft_seconds": null
92
+ },
93
+ {
94
+ "completion_tokens": 512,
95
+ "error": null,
96
+ "finish_reason": "length",
97
+ "latency_seconds": 26.749147009002627,
98
+ "ok": true,
99
+ "prompt_tokens": 63,
100
+ "total_tokens": 575,
101
+ "ttft_seconds": null
102
+ },
103
+ {
104
+ "completion_tokens": 512,
105
+ "error": null,
106
+ "finish_reason": "length",
107
+ "latency_seconds": 29.480900153997936,
108
+ "ok": true,
109
+ "prompt_tokens": 66,
110
+ "total_tokens": 578,
111
+ "ttft_seconds": null
112
+ },
113
+ {
114
+ "completion_tokens": 512,
115
+ "error": null,
116
+ "finish_reason": "length",
117
+ "latency_seconds": 29.571540491000633,
118
+ "ok": true,
119
+ "prompt_tokens": 49,
120
+ "total_tokens": 561,
121
+ "ttft_seconds": null
122
+ },
123
+ {
124
+ "completion_tokens": 512,
125
+ "error": null,
126
+ "finish_reason": "length",
127
+ "latency_seconds": 31.3413551870035,
128
+ "ok": true,
129
+ "prompt_tokens": 65,
130
+ "total_tokens": 577,
131
+ "ttft_seconds": null
132
+ },
133
+ {
134
+ "completion_tokens": 512,
135
+ "error": null,
136
+ "finish_reason": "length",
137
+ "latency_seconds": 30.85938804798934,
138
+ "ok": true,
139
+ "prompt_tokens": 47,
140
+ "total_tokens": 559,
141
+ "ttft_seconds": null
142
+ },
143
+ {
144
+ "completion_tokens": 512,
145
+ "error": null,
146
+ "finish_reason": "length",
147
+ "latency_seconds": 31.78007578200777,
148
+ "ok": true,
149
+ "prompt_tokens": 48,
150
+ "total_tokens": 560,
151
+ "ttft_seconds": null
152
+ },
153
+ {
154
+ "completion_tokens": 512,
155
+ "error": null,
156
+ "finish_reason": "length",
157
+ "latency_seconds": 31.284076141993864,
158
+ "ok": true,
159
+ "prompt_tokens": 54,
160
+ "total_tokens": 566,
161
+ "ttft_seconds": null
162
+ }
163
+ ],
164
+ "summary": {
165
+ "concurrency": 8,
166
+ "errors": [],
167
+ "failed_requests": 0,
168
+ "mean_interactive_tps": null,
169
+ "mean_latency_seconds": 30.392706056125462,
170
+ "mean_ttft_seconds": null,
171
+ "output_token_throughput_tps": 128.3439615442102,
172
+ "p95_interactive_tps": null,
173
+ "p95_latency_seconds": 32.18878358300208,
174
+ "p95_ttft_seconds": null,
175
+ "request_count": 16,
176
+ "request_throughput_rps": 0.25067179989103555,
177
+ "successful_requests": 16,
178
+ "total_completion_tokens": 8192,
179
+ "total_prompt_tokens": 977,
180
+ "wall_seconds": 63.82848013599869
181
+ }
182
+ }
benchmarks/CLEAN-dflash-st2-s16-c12.json ADDED
@@ -0,0 +1,262 @@
+ {
+ "results": [
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.36441381899931,
+ "ok": true,
+ "prompt_tokens": 84,
+ "total_tokens": 596,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 29.38170883100247,
+ "ok": true,
+ "prompt_tokens": 59,
+ "total_tokens": 571,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.230523204998462,
+ "ok": true,
+ "prompt_tokens": 53,
+ "total_tokens": 565,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.46691610700509,
+ "ok": true,
+ "prompt_tokens": 70,
+ "total_tokens": 582,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.757115915999748,
+ "ok": true,
+ "prompt_tokens": 66,
+ "total_tokens": 578,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.945035171011114,
+ "ok": true,
+ "prompt_tokens": 74,
+ "total_tokens": 586,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.29521356600162,
+ "ok": true,
+ "prompt_tokens": 57,
+ "total_tokens": 569,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.42426669699489,
+ "ok": true,
+ "prompt_tokens": 49,
+ "total_tokens": 561,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.428273042009096,
+ "ok": true,
+ "prompt_tokens": 67,
+ "total_tokens": 579,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.42551057599485,
+ "ok": true,
+ "prompt_tokens": 65,
+ "total_tokens": 577,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.542974988988135,
+ "ok": true,
+ "prompt_tokens": 69,
+ "total_tokens": 581,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.54080958200211,
+ "ok": true,
+ "prompt_tokens": 52,
+ "total_tokens": 564,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.83334150200244,
+ "ok": true,
+ "prompt_tokens": 63,
+ "total_tokens": 575,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.014462072009337,
+ "ok": true,
+ "prompt_tokens": 84,
+ "total_tokens": 596,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.36301533599908,
+ "ok": true,
+ "prompt_tokens": 53,
+ "total_tokens": 565,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 29.13996660000703,
+ "ok": true,
+ "prompt_tokens": 59,
+ "total_tokens": 571,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.571012644999428,
+ "ok": true,
+ "prompt_tokens": 47,
+ "total_tokens": 559,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.000165388002642,
+ "ok": true,
+ "prompt_tokens": 74,
+ "total_tokens": 586,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.85263038999983,
+ "ok": true,
+ "prompt_tokens": 48,
+ "total_tokens": 560,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.410605757002486,
+ "ok": true,
+ "prompt_tokens": 69,
+ "total_tokens": 581,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.195647993998136,
+ "ok": true,
+ "prompt_tokens": 70,
+ "total_tokens": 582,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.52967824100051,
+ "ok": true,
+ "prompt_tokens": 57,
+ "total_tokens": 569,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.956455432009534,
+ "ok": true,
+ "prompt_tokens": 54,
+ "total_tokens": 566,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.319247502993676,
+ "ok": true,
+ "prompt_tokens": 67,
+ "total_tokens": 579,
+ "ttft_seconds": null
+ }
+ ],
+ "summary": {
+ "concurrency": 12,
+ "errors": [],
+ "failed_requests": 0,
+ "mean_interactive_tps": null,
+ "mean_latency_seconds": 30.707874598376293,
+ "mean_ttft_seconds": null,
+ "output_token_throughput_tps": 192.7539484224078,
+ "p95_interactive_tps": null,
+ "p95_latency_seconds": 32.77262295694891,
+ "p95_ttft_seconds": null,
+ "request_count": 24,
+ "request_throughput_rps": 0.37647255551251524,
+ "successful_requests": 24,
+ "total_completion_tokens": 12288,
+ "total_prompt_tokens": 1510,
+ "wall_seconds": 63.74966687100823
+ }
+ }
benchmarks/CLEAN-dflash-st2-s16-c16.json ADDED
@@ -0,0 +1,342 @@
+ {
+ "results": [
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.36677018199407,
+ "ok": true,
+ "prompt_tokens": 63,
+ "total_tokens": 575,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 29.074504051997792,
+ "ok": true,
+ "prompt_tokens": 66,
+ "total_tokens": 578,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 29.262313822997385,
+ "ok": true,
+ "prompt_tokens": 52,
+ "total_tokens": 564,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.23330927400093,
+ "ok": true,
+ "prompt_tokens": 70,
+ "total_tokens": 582,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.23297259400715,
+ "ok": true,
+ "prompt_tokens": 84,
+ "total_tokens": 596,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.347619171996485,
+ "ok": true,
+ "prompt_tokens": 53,
+ "total_tokens": 565,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.668740354987676,
+ "ok": true,
+ "prompt_tokens": 74,
+ "total_tokens": 586,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.90440146299079,
+ "ok": true,
+ "prompt_tokens": 57,
+ "total_tokens": 569,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.140306589993997,
+ "ok": true,
+ "prompt_tokens": 65,
+ "total_tokens": 577,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.375962200007052,
+ "ok": true,
+ "prompt_tokens": 47,
+ "total_tokens": 559,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.875354996998794,
+ "ok": true,
+ "prompt_tokens": 54,
+ "total_tokens": 566,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.876913771993713,
+ "ok": true,
+ "prompt_tokens": 49,
+ "total_tokens": 561,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.11174412199762,
+ "ok": true,
+ "prompt_tokens": 48,
+ "total_tokens": 560,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.35151076900365,
+ "ok": true,
+ "prompt_tokens": 67,
+ "total_tokens": 579,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.822171527994215,
+ "ok": true,
+ "prompt_tokens": 69,
+ "total_tokens": 581,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 33.05540608998854,
+ "ok": true,
+ "prompt_tokens": 59,
+ "total_tokens": 571,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.02749521500664,
+ "ok": true,
+ "prompt_tokens": 84,
+ "total_tokens": 596,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.82376928400481,
+ "ok": true,
+ "prompt_tokens": 53,
+ "total_tokens": 565,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.709762905011303,
+ "ok": true,
+ "prompt_tokens": 70,
+ "total_tokens": 582,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 29.84934190599597,
+ "ok": true,
+ "prompt_tokens": 59,
+ "total_tokens": 571,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.67309095399105,
+ "ok": true,
+ "prompt_tokens": 74,
+ "total_tokens": 586,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.154514046997065,
+ "ok": true,
+ "prompt_tokens": 52,
+ "total_tokens": 564,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.7461334450054,
+ "ok": true,
+ "prompt_tokens": 57,
+ "total_tokens": 569,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.299162600000273,
+ "ok": true,
+ "prompt_tokens": 69,
+ "total_tokens": 581,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.269319029990584,
+ "ok": true,
+ "prompt_tokens": 65,
+ "total_tokens": 577,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.41870970900345,
+ "ok": true,
+ "prompt_tokens": 67,
+ "total_tokens": 579,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 29.88316666499304,
+ "ok": true,
+ "prompt_tokens": 63,
+ "total_tokens": 575,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.118765489998623,
+ "ok": true,
+ "prompt_tokens": 49,
+ "total_tokens": 561,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.475393945001997,
+ "ok": true,
+ "prompt_tokens": 47,
+ "total_tokens": 559,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.649221633007983,
+ "ok": true,
+ "prompt_tokens": 66,
+ "total_tokens": 578,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.294464439008152,
+ "ok": true,
+ "prompt_tokens": 48,
+ "total_tokens": 560,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.259899878001306,
+ "ok": true,
+ "prompt_tokens": 54,
+ "total_tokens": 566,
+ "ttft_seconds": null
+ }
+ ],
+ "summary": {
+ "concurrency": 16,
+ "errors": [],
+ "failed_requests": 0,
+ "mean_interactive_tps": null,
+ "mean_latency_seconds": 30.792256628998985,
+ "mean_ttft_seconds": null,
+ "output_token_throughput_tps": 250.83093836156212,
+ "p95_interactive_tps": null,
+ "p95_latency_seconds": 32.56330811054941,
+ "p95_ttft_seconds": null,
+ "request_count": 32,
+ "request_throughput_rps": 0.489904176487426,
+ "successful_requests": 32,
+ "total_completion_tokens": 16384,
+ "total_prompt_tokens": 1954,
+ "wall_seconds": 65.31889609400241
+ }
+ }
benchmarks/CLEAN-dflash-st2-s16-c8.json ADDED
@@ -0,0 +1,182 @@
+ {
+ "results": [
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.770241043006536,
+ "ok": true,
+ "prompt_tokens": 84,
+ "total_tokens": 596,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.40594977501314,
+ "ok": true,
+ "prompt_tokens": 53,
+ "total_tokens": 565,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.523794804001227,
+ "ok": true,
+ "prompt_tokens": 59,
+ "total_tokens": 571,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.52348540899402,
+ "ok": true,
+ "prompt_tokens": 57,
+ "total_tokens": 569,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.75425320699287,
+ "ok": true,
+ "prompt_tokens": 74,
+ "total_tokens": 586,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.79620496901043,
+ "ok": true,
+ "prompt_tokens": 67,
+ "total_tokens": 579,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.91504200300551,
+ "ok": true,
+ "prompt_tokens": 70,
+ "total_tokens": 582,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.91228072499507,
+ "ok": true,
+ "prompt_tokens": 69,
+ "total_tokens": 581,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.052779267993174,
+ "ok": true,
+ "prompt_tokens": 52,
+ "total_tokens": 564,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 29.296572343999287,
+ "ok": true,
+ "prompt_tokens": 49,
+ "total_tokens": 561,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.55112311700941,
+ "ok": true,
+ "prompt_tokens": 63,
+ "total_tokens": 575,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.242342862999067,
+ "ok": true,
+ "prompt_tokens": 66,
+ "total_tokens": 578,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.45347919900087,
+ "ok": true,
+ "prompt_tokens": 65,
+ "total_tokens": 577,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.6597552059975,
+ "ok": true,
+ "prompt_tokens": 54,
+ "total_tokens": 566,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.128003552003065,
+ "ok": true,
+ "prompt_tokens": 48,
+ "total_tokens": 560,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.55533199000638,
+ "ok": true,
+ "prompt_tokens": 47,
+ "total_tokens": 559,
+ "ttft_seconds": null
+ }
+ ],
+ "summary": {
+ "concurrency": 8,
+ "errors": [],
+ "failed_requests": 0,
+ "mean_interactive_tps": null,
+ "mean_latency_seconds": 30.971289967126722,
+ "mean_ttft_seconds": null,
+ "output_token_throughput_tps": 127.06462637996901,
+ "p95_interactive_tps": null,
+ "p95_latency_seconds": 32.91297104449768,
+ "p95_ttft_seconds": null,
+ "request_count": 16,
+ "request_throughput_rps": 0.24817309839837698,
+ "successful_requests": 16,
+ "total_completion_tokens": 8192,
+ "total_prompt_tokens": 977,
+ "wall_seconds": 64.47112963999098
+ }
+ }
benchmarks/CLEAN-dflash-st2-s24-c16.json ADDED
@@ -0,0 +1,342 @@
+ {
+ "results": [
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.08238783798879,
+ "ok": true,
+ "prompt_tokens": 84,
+ "total_tokens": 596,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.886940732991206,
+ "ok": true,
+ "prompt_tokens": 54,
+ "total_tokens": 566,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 29.23931883899786,
+ "ok": true,
+ "prompt_tokens": 52,
+ "total_tokens": 564,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 29.582878045999678,
+ "ok": true,
+ "prompt_tokens": 63,
+ "total_tokens": 575,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.021114935996593,
+ "ok": true,
+ "prompt_tokens": 57,
+ "total_tokens": 569,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.138854527001968,
+ "ok": true,
+ "prompt_tokens": 53,
+ "total_tokens": 565,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.141507826003362,
+ "ok": true,
+ "prompt_tokens": 74,
+ "total_tokens": 586,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.134392733001732,
+ "ok": true,
+ "prompt_tokens": 59,
+ "total_tokens": 571,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.24932707499829,
+ "ok": true,
+ "prompt_tokens": 69,
+ "total_tokens": 581,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.488714413004345,
+ "ok": true,
+ "prompt_tokens": 48,
+ "total_tokens": 560,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.73670294000476,
+ "ok": true,
+ "prompt_tokens": 47,
+ "total_tokens": 559,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.739136570991832,
+ "ok": true,
+ "prompt_tokens": 65,
+ "total_tokens": 577,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.96787730899814,
+ "ok": true,
+ "prompt_tokens": 49,
+ "total_tokens": 561,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.20001648800098,
+ "ok": true,
+ "prompt_tokens": 66,
+ "total_tokens": 578,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.431413047001115,
+ "ok": true,
+ "prompt_tokens": 67,
+ "total_tokens": 579,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 33.22777071699966,
+ "ok": true,
+ "prompt_tokens": 70,
+ "total_tokens": 582,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 27.827295434995904,
+ "ok": true,
+ "prompt_tokens": 84,
+ "total_tokens": 596,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 27.051542682995205,
+ "ok": true,
+ "prompt_tokens": 59,
+ "total_tokens": 571,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.16372441999556,
+ "ok": true,
+ "prompt_tokens": 74,
+ "total_tokens": 586,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.10948968199955,
+ "ok": true,
+ "prompt_tokens": 47,
+ "total_tokens": 559,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.719098986999597,
+ "ok": true,
+ "prompt_tokens": 57,
+ "total_tokens": 569,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 28.888266260997625,
+ "ok": true,
+ "prompt_tokens": 63,
+ "total_tokens": 575,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.95099848099926,
+ "ok": true,
+ "prompt_tokens": 69,
+ "total_tokens": 581,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.89229443798831,
+ "ok": true,
+ "prompt_tokens": 70,
+ "total_tokens": 582,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.093157169991173,
+ "ok": true,
+ "prompt_tokens": 52,
+ "total_tokens": 564,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 29.849498030001996,
+ "ok": true,
+ "prompt_tokens": 65,
+ "total_tokens": 577,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 29.719171642995207,
+ "ok": true,
+ "prompt_tokens": 66,
+ "total_tokens": 578,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 32.287830701010535,
+ "ok": true,
+ "prompt_tokens": 67,
+ "total_tokens": 579,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 30.476158764999127,
+ "ok": true,
+ "prompt_tokens": 49,
+ "total_tokens": 561,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.508383998007048,
+ "ok": true,
+ "prompt_tokens": 53,
+ "total_tokens": 565,
+ "ttft_seconds": null
+ },
+ {
+ "completion_tokens": 512,
+ "error": null,
+ "finish_reason": "length",
+ "latency_seconds": 31.13668116700137,
308
+ "ok": true,
309
+ "prompt_tokens": 48,
310
+ "total_tokens": 560,
311
+ "ttft_seconds": null
312
+ },
313
+ {
314
+ "completion_tokens": 512,
315
+ "error": null,
316
+ "finish_reason": "length",
317
+ "latency_seconds": 32.28916919999756,
318
+ "ok": true,
319
+ "prompt_tokens": 54,
320
+ "total_tokens": 566,
321
+ "ttft_seconds": null
322
+ }
323
+ ],
324
+ "summary": {
325
+ "concurrency": 16,
326
+ "errors": [],
327
+ "failed_requests": 0,
328
+ "mean_interactive_tps": null,
329
+ "mean_latency_seconds": 30.538472346842354,
330
+ "mean_ttft_seconds": null,
331
+ "output_token_throughput_tps": 250.0696220293509,
332
+ "p95_interactive_tps": null,
333
+ "p95_latency_seconds": 32.63880967294536,
334
+ "p95_ttft_seconds": null,
335
+ "request_count": 32,
336
+ "request_throughput_rps": 0.488417230526076,
337
+ "successful_requests": 32,
338
+ "total_completion_tokens": 16384,
339
+ "total_prompt_tokens": 1954,
340
+ "wall_seconds": 65.51775408400863
341
+ }
342
+ }
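The derived fields in the `summary` block above are internally consistent with the per-request `results`: throughput figures are totals divided by `wall_seconds`. A minimal sketch of that relationship, assuming the field meanings implied by the data (this is not a documented schema, just the arithmetic the numbers satisfy):

```python
# Recompute derived summary fields from per-request benchmark results.
# Field names are taken from the JSON data above; the formulas are an
# assumption inferred from the numbers, not a documented specification.
def summarize(results, wall_seconds):
    ok = [r for r in results if r["ok"]]
    total_completion = sum(r["completion_tokens"] for r in ok)
    return {
        "total_completion_tokens": total_completion,
        # tokens generated per second of wall-clock time
        "output_token_throughput_tps": total_completion / wall_seconds,
        # completed requests per second of wall-clock time
        "request_throughput_rps": len(ok) / wall_seconds,
        "mean_latency_seconds": sum(r["latency_seconds"] for r in ok) / len(ok),
    }
```

For the c16 run above (32 requests of 512 completion tokens each over 65.52 s of wall time) this reproduces the reported 250.07 tok/s and 0.4884 req/s.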
benchmarks/CLEAN-dflash-st2-s24-c20.json ADDED
@@ -0,0 +1,422 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "completion_tokens": 512,
5
+ "error": null,
6
+ "finish_reason": "length",
7
+ "latency_seconds": 28.128791886992985,
8
+ "ok": true,
9
+ "prompt_tokens": 84,
10
+ "total_tokens": 596,
11
+ "ttft_seconds": null
12
+ },
13
+ {
14
+ "completion_tokens": 512,
15
+ "error": null,
16
+ "finish_reason": "length",
17
+ "latency_seconds": 28.13552347100631,
18
+ "ok": true,
19
+ "prompt_tokens": 84,
20
+ "total_tokens": 596,
21
+ "ttft_seconds": null
22
+ },
23
+ {
24
+ "completion_tokens": 512,
25
+ "error": null,
26
+ "finish_reason": "length",
27
+ "latency_seconds": 28.914256617004867,
28
+ "ok": true,
29
+ "prompt_tokens": 53,
30
+ "total_tokens": 565,
31
+ "ttft_seconds": null
32
+ },
33
+ {
34
+ "completion_tokens": 512,
35
+ "error": null,
36
+ "finish_reason": "length",
37
+ "latency_seconds": 28.915664140004083,
38
+ "ok": true,
39
+ "prompt_tokens": 69,
40
+ "total_tokens": 581,
41
+ "ttft_seconds": null
42
+ },
43
+ {
44
+ "completion_tokens": 512,
45
+ "error": null,
46
+ "finish_reason": "length",
47
+ "latency_seconds": 29.13976360599918,
48
+ "ok": true,
49
+ "prompt_tokens": 66,
50
+ "total_tokens": 578,
51
+ "ttft_seconds": null
52
+ },
53
+ {
54
+ "completion_tokens": 512,
55
+ "error": null,
56
+ "finish_reason": "length",
57
+ "latency_seconds": 29.33403015100339,
58
+ "ok": true,
59
+ "prompt_tokens": 74,
60
+ "total_tokens": 586,
61
+ "ttft_seconds": null
62
+ },
63
+ {
64
+ "completion_tokens": 512,
65
+ "error": null,
66
+ "finish_reason": "length",
67
+ "latency_seconds": 29.75400056199578,
68
+ "ok": true,
69
+ "prompt_tokens": 57,
70
+ "total_tokens": 569,
71
+ "ttft_seconds": null
72
+ },
73
+ {
74
+ "completion_tokens": 512,
75
+ "error": null,
76
+ "finish_reason": "length",
77
+ "latency_seconds": 30.129768530008732,
78
+ "ok": true,
79
+ "prompt_tokens": 59,
80
+ "total_tokens": 571,
81
+ "ttft_seconds": null
82
+ },
83
+ {
84
+ "completion_tokens": 512,
85
+ "error": null,
86
+ "finish_reason": "length",
87
+ "latency_seconds": 30.127090508001857,
88
+ "ok": true,
89
+ "prompt_tokens": 63,
90
+ "total_tokens": 575,
91
+ "ttft_seconds": null
92
+ },
93
+ {
94
+ "completion_tokens": 512,
95
+ "error": null,
96
+ "finish_reason": "length",
97
+ "latency_seconds": 30.472961638995912,
98
+ "ok": true,
99
+ "prompt_tokens": 70,
100
+ "total_tokens": 582,
101
+ "ttft_seconds": null
102
+ },
103
+ {
104
+ "completion_tokens": 512,
105
+ "error": null,
106
+ "finish_reason": "length",
107
+ "latency_seconds": 30.59051628499583,
108
+ "ok": true,
109
+ "prompt_tokens": 65,
110
+ "total_tokens": 577,
111
+ "ttft_seconds": null
112
+ },
113
+ {
114
+ "completion_tokens": 512,
115
+ "error": null,
116
+ "finish_reason": "length",
117
+ "latency_seconds": 30.59093984401261,
118
+ "ok": true,
119
+ "prompt_tokens": 52,
120
+ "total_tokens": 564,
121
+ "ttft_seconds": null
122
+ },
123
+ {
124
+ "completion_tokens": 512,
125
+ "error": null,
126
+ "finish_reason": "length",
127
+ "latency_seconds": 30.587724496988812,
128
+ "ok": true,
129
+ "prompt_tokens": 74,
130
+ "total_tokens": 586,
131
+ "ttft_seconds": null
132
+ },
133
+ {
134
+ "completion_tokens": 512,
135
+ "error": null,
136
+ "finish_reason": "length",
137
+ "latency_seconds": 30.708445683005266,
138
+ "ok": true,
139
+ "prompt_tokens": 70,
140
+ "total_tokens": 582,
141
+ "ttft_seconds": null
142
+ },
143
+ {
144
+ "completion_tokens": 512,
145
+ "error": null,
146
+ "finish_reason": "length",
147
+ "latency_seconds": 30.702821232000133,
148
+ "ok": true,
149
+ "prompt_tokens": 48,
150
+ "total_tokens": 560,
151
+ "ttft_seconds": null
152
+ },
153
+ {
154
+ "completion_tokens": 512,
155
+ "error": null,
156
+ "finish_reason": "length",
157
+ "latency_seconds": 31.06507171499834,
158
+ "ok": true,
159
+ "prompt_tokens": 49,
160
+ "total_tokens": 561,
161
+ "ttft_seconds": null
162
+ },
163
+ {
164
+ "completion_tokens": 512,
165
+ "error": null,
166
+ "finish_reason": "length",
167
+ "latency_seconds": 31.430013089993736,
168
+ "ok": true,
169
+ "prompt_tokens": 47,
170
+ "total_tokens": 559,
171
+ "ttft_seconds": null
172
+ },
173
+ {
174
+ "completion_tokens": 512,
175
+ "error": null,
176
+ "finish_reason": "length",
177
+ "latency_seconds": 32.24225489499804,
178
+ "ok": true,
179
+ "prompt_tokens": 67,
180
+ "total_tokens": 579,
181
+ "ttft_seconds": null
182
+ },
183
+ {
184
+ "completion_tokens": 512,
185
+ "error": null,
186
+ "finish_reason": "length",
187
+ "latency_seconds": 32.248751888997504,
188
+ "ok": true,
189
+ "prompt_tokens": 67,
190
+ "total_tokens": 579,
191
+ "ttft_seconds": null
192
+ },
193
+ {
194
+ "completion_tokens": 512,
195
+ "error": null,
196
+ "finish_reason": "length",
197
+ "latency_seconds": 32.24393780301034,
198
+ "ok": true,
199
+ "prompt_tokens": 54,
200
+ "total_tokens": 566,
201
+ "ttft_seconds": null
202
+ },
203
+ {
204
+ "completion_tokens": 512,
205
+ "error": null,
206
+ "finish_reason": "length",
207
+ "latency_seconds": 28.97886278000078,
208
+ "ok": true,
209
+ "prompt_tokens": 59,
210
+ "total_tokens": 571,
211
+ "ttft_seconds": null
212
+ },
213
+ {
214
+ "completion_tokens": 512,
215
+ "error": null,
216
+ "finish_reason": "length",
217
+ "latency_seconds": 27.680494583997643,
218
+ "ok": true,
219
+ "prompt_tokens": 63,
220
+ "total_tokens": 575,
221
+ "ttft_seconds": null
222
+ },
223
+ {
224
+ "completion_tokens": 512,
225
+ "error": null,
226
+ "finish_reason": "length",
227
+ "latency_seconds": 30.24515929700283,
228
+ "ok": true,
229
+ "prompt_tokens": 69,
230
+ "total_tokens": 581,
231
+ "ttft_seconds": null
232
+ },
233
+ {
234
+ "completion_tokens": 512,
235
+ "error": null,
236
+ "finish_reason": "length",
237
+ "latency_seconds": 29.46420536498772,
238
+ "ok": true,
239
+ "prompt_tokens": 52,
240
+ "total_tokens": 564,
241
+ "ttft_seconds": null
242
+ },
243
+ {
244
+ "completion_tokens": 512,
245
+ "error": null,
246
+ "finish_reason": "length",
247
+ "latency_seconds": 29.935486419999506,
248
+ "ok": true,
249
+ "prompt_tokens": 57,
250
+ "total_tokens": 569,
251
+ "ttft_seconds": null
252
+ },
253
+ {
254
+ "completion_tokens": 512,
255
+ "error": null,
256
+ "finish_reason": "length",
257
+ "latency_seconds": 28.144846990995575,
258
+ "ok": true,
259
+ "prompt_tokens": 84,
260
+ "total_tokens": 596,
261
+ "ttft_seconds": null
262
+ },
263
+ {
264
+ "completion_tokens": 512,
265
+ "error": null,
266
+ "finish_reason": "length",
267
+ "latency_seconds": 30.401637786009815,
268
+ "ok": true,
269
+ "prompt_tokens": 53,
270
+ "total_tokens": 565,
271
+ "ttft_seconds": null
272
+ },
273
+ {
274
+ "completion_tokens": 512,
275
+ "error": null,
276
+ "finish_reason": "length",
277
+ "latency_seconds": 29.640702210002928,
278
+ "ok": true,
279
+ "prompt_tokens": 49,
280
+ "total_tokens": 561,
281
+ "ttft_seconds": null
282
+ },
283
+ {
284
+ "completion_tokens": 512,
285
+ "error": null,
286
+ "finish_reason": "length",
287
+ "latency_seconds": 30.016030842001783,
288
+ "ok": true,
289
+ "prompt_tokens": 66,
290
+ "total_tokens": 578,
291
+ "ttft_seconds": null
292
+ },
293
+ {
294
+ "completion_tokens": 512,
295
+ "error": null,
296
+ "finish_reason": "length",
297
+ "latency_seconds": 30.617524790010066,
298
+ "ok": true,
299
+ "prompt_tokens": 65,
300
+ "total_tokens": 577,
301
+ "ttft_seconds": null
302
+ },
303
+ {
304
+ "completion_tokens": 512,
305
+ "error": null,
306
+ "finish_reason": "length",
307
+ "latency_seconds": 29.651671578991227,
308
+ "ok": true,
309
+ "prompt_tokens": 48,
310
+ "total_tokens": 560,
311
+ "ttft_seconds": null
312
+ },
313
+ {
314
+ "completion_tokens": 512,
315
+ "error": null,
316
+ "finish_reason": "length",
317
+ "latency_seconds": 30.346763896013726,
318
+ "ok": true,
319
+ "prompt_tokens": 70,
320
+ "total_tokens": 582,
321
+ "ttft_seconds": null
322
+ },
323
+ {
324
+ "completion_tokens": 512,
325
+ "error": null,
326
+ "finish_reason": "length",
327
+ "latency_seconds": 30.45972359800362,
328
+ "ok": true,
329
+ "prompt_tokens": 47,
330
+ "total_tokens": 559,
331
+ "ttft_seconds": null
332
+ },
333
+ {
334
+ "completion_tokens": 512,
335
+ "error": null,
336
+ "finish_reason": "length",
337
+ "latency_seconds": 30.57256609101023,
338
+ "ok": true,
339
+ "prompt_tokens": 54,
340
+ "total_tokens": 566,
341
+ "ttft_seconds": null
342
+ },
343
+ {
344
+ "completion_tokens": 512,
345
+ "error": null,
346
+ "finish_reason": "length",
347
+ "latency_seconds": 31.28223751099722,
348
+ "ok": true,
349
+ "prompt_tokens": 74,
350
+ "total_tokens": 586,
351
+ "ttft_seconds": null
352
+ },
353
+ {
354
+ "completion_tokens": 512,
355
+ "error": null,
356
+ "finish_reason": "length",
357
+ "latency_seconds": 29.970285082992632,
358
+ "ok": true,
359
+ "prompt_tokens": 53,
360
+ "total_tokens": 565,
361
+ "ttft_seconds": null
362
+ },
363
+ {
364
+ "completion_tokens": 512,
365
+ "error": null,
366
+ "finish_reason": "length",
367
+ "latency_seconds": 31.264649095988716,
368
+ "ok": true,
369
+ "prompt_tokens": 67,
370
+ "total_tokens": 579,
371
+ "ttft_seconds": null
372
+ },
373
+ {
374
+ "completion_tokens": 512,
375
+ "error": null,
376
+ "finish_reason": "length",
377
+ "latency_seconds": 30.199678539007436,
378
+ "ok": true,
379
+ "prompt_tokens": 59,
380
+ "total_tokens": 571,
381
+ "ttft_seconds": null
382
+ },
383
+ {
384
+ "completion_tokens": 512,
385
+ "error": null,
386
+ "finish_reason": "length",
387
+ "latency_seconds": 31.815072748999228,
388
+ "ok": true,
389
+ "prompt_tokens": 69,
390
+ "total_tokens": 581,
391
+ "ttft_seconds": null
392
+ },
393
+ {
394
+ "completion_tokens": 512,
395
+ "error": null,
396
+ "finish_reason": "length",
397
+ "latency_seconds": 31.104648558000918,
398
+ "ok": true,
399
+ "prompt_tokens": 57,
400
+ "total_tokens": 569,
401
+ "ttft_seconds": null
402
+ }
403
+ ],
404
+ "summary": {
405
+ "concurrency": 20,
406
+ "errors": [],
407
+ "failed_requests": 0,
408
+ "mean_interactive_tps": null,
409
+ "mean_latency_seconds": 30.181364395225682,
410
+ "mean_ttft_seconds": null,
411
+ "output_token_throughput_tps": 323.25630623300424,
412
+ "p95_interactive_tps": null,
413
+ "p95_latency_seconds": 32.242339040398655,
414
+ "p95_ttft_seconds": null,
415
+ "request_count": 40,
416
+ "request_throughput_rps": 0.6313599731113364,
417
+ "successful_requests": 40,
418
+ "total_completion_tokens": 20480,
419
+ "total_prompt_tokens": 2487,
420
+ "wall_seconds": 63.35529920099361
421
+ }
422
+ }
benchmarks/CLEAN-dflash-st2-s24-c24.json ADDED
@@ -0,0 +1,502 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "completion_tokens": 512,
5
+ "error": null,
6
+ "finish_reason": "length",
7
+ "latency_seconds": 27.135665269990568,
8
+ "ok": true,
9
+ "prompt_tokens": 63,
10
+ "total_tokens": 575,
11
+ "ttft_seconds": null
12
+ },
13
+ {
14
+ "completion_tokens": 512,
15
+ "error": null,
16
+ "finish_reason": "length",
17
+ "latency_seconds": 27.56644712400157,
18
+ "ok": true,
19
+ "prompt_tokens": 84,
20
+ "total_tokens": 596,
21
+ "ttft_seconds": null
22
+ },
23
+ {
24
+ "completion_tokens": 512,
25
+ "error": null,
26
+ "finish_reason": "length",
27
+ "latency_seconds": 28.24778054500348,
28
+ "ok": true,
29
+ "prompt_tokens": 84,
30
+ "total_tokens": 596,
31
+ "ttft_seconds": null
32
+ },
33
+ {
34
+ "completion_tokens": 512,
35
+ "error": null,
36
+ "finish_reason": "length",
37
+ "latency_seconds": 28.598475197999505,
38
+ "ok": true,
39
+ "prompt_tokens": 52,
40
+ "total_tokens": 564,
41
+ "ttft_seconds": null
42
+ },
43
+ {
44
+ "completion_tokens": 512,
45
+ "error": null,
46
+ "finish_reason": "length",
47
+ "latency_seconds": 29.205404612992425,
48
+ "ok": true,
49
+ "prompt_tokens": 69,
50
+ "total_tokens": 581,
51
+ "ttft_seconds": null
52
+ },
53
+ {
54
+ "completion_tokens": 512,
55
+ "error": null,
56
+ "finish_reason": "length",
57
+ "latency_seconds": 29.211898362002103,
58
+ "ok": true,
59
+ "prompt_tokens": 69,
60
+ "total_tokens": 581,
61
+ "ttft_seconds": null
62
+ },
63
+ {
64
+ "completion_tokens": 512,
65
+ "error": null,
66
+ "finish_reason": "length",
67
+ "latency_seconds": 29.2118624690047,
68
+ "ok": true,
69
+ "prompt_tokens": 59,
70
+ "total_tokens": 571,
71
+ "ttft_seconds": null
72
+ },
73
+ {
74
+ "completion_tokens": 512,
75
+ "error": null,
76
+ "finish_reason": "length",
77
+ "latency_seconds": 29.20571578599629,
78
+ "ok": true,
79
+ "prompt_tokens": 57,
80
+ "total_tokens": 569,
81
+ "ttft_seconds": null
82
+ },
83
+ {
84
+ "completion_tokens": 512,
85
+ "error": null,
86
+ "finish_reason": "length",
87
+ "latency_seconds": 29.211570540006505,
88
+ "ok": true,
89
+ "prompt_tokens": 53,
90
+ "total_tokens": 565,
91
+ "ttft_seconds": null
92
+ },
93
+ {
94
+ "completion_tokens": 512,
95
+ "error": null,
96
+ "finish_reason": "length",
97
+ "latency_seconds": 29.383641669002827,
98
+ "ok": true,
99
+ "prompt_tokens": 53,
100
+ "total_tokens": 565,
101
+ "ttft_seconds": null
102
+ },
103
+ {
104
+ "completion_tokens": 512,
105
+ "error": null,
106
+ "finish_reason": "length",
107
+ "latency_seconds": 29.660381981011597,
108
+ "ok": true,
109
+ "prompt_tokens": 74,
110
+ "total_tokens": 586,
111
+ "ttft_seconds": null
112
+ },
113
+ {
114
+ "completion_tokens": 512,
115
+ "error": null,
116
+ "finish_reason": "length",
117
+ "latency_seconds": 29.949224044001312,
118
+ "ok": true,
119
+ "prompt_tokens": 66,
120
+ "total_tokens": 578,
121
+ "ttft_seconds": null
122
+ },
123
+ {
124
+ "completion_tokens": 512,
125
+ "error": null,
126
+ "finish_reason": "length",
127
+ "latency_seconds": 30.313379102997715,
128
+ "ok": true,
129
+ "prompt_tokens": 74,
130
+ "total_tokens": 586,
131
+ "ttft_seconds": null
132
+ },
133
+ {
134
+ "completion_tokens": 512,
135
+ "error": null,
136
+ "finish_reason": "length",
137
+ "latency_seconds": 30.661214009989635,
138
+ "ok": true,
139
+ "prompt_tokens": 57,
140
+ "total_tokens": 569,
141
+ "ttft_seconds": null
142
+ },
143
+ {
144
+ "completion_tokens": 512,
145
+ "error": null,
146
+ "finish_reason": "length",
147
+ "latency_seconds": 30.771676649004803,
148
+ "ok": true,
149
+ "prompt_tokens": 70,
150
+ "total_tokens": 582,
151
+ "ttft_seconds": null
152
+ },
153
+ {
154
+ "completion_tokens": 512,
155
+ "error": null,
156
+ "finish_reason": "length",
157
+ "latency_seconds": 30.778610348992515,
158
+ "ok": true,
159
+ "prompt_tokens": 70,
160
+ "total_tokens": 582,
161
+ "ttft_seconds": null
162
+ },
163
+ {
164
+ "completion_tokens": 512,
165
+ "error": null,
166
+ "finish_reason": "length",
167
+ "latency_seconds": 31.28979452799831,
168
+ "ok": true,
169
+ "prompt_tokens": 49,
170
+ "total_tokens": 561,
171
+ "ttft_seconds": null
172
+ },
173
+ {
174
+ "completion_tokens": 512,
175
+ "error": null,
176
+ "finish_reason": "length",
177
+ "latency_seconds": 31.516065378993517,
178
+ "ok": true,
179
+ "prompt_tokens": 54,
180
+ "total_tokens": 566,
181
+ "ttft_seconds": null
182
+ },
183
+ {
184
+ "completion_tokens": 512,
185
+ "error": null,
186
+ "finish_reason": "length",
187
+ "latency_seconds": 31.629344462009612,
188
+ "ok": true,
189
+ "prompt_tokens": 59,
190
+ "total_tokens": 571,
191
+ "ttft_seconds": null
192
+ },
193
+ {
194
+ "completion_tokens": 512,
195
+ "error": null,
196
+ "finish_reason": "length",
197
+ "latency_seconds": 31.633942327011027,
198
+ "ok": true,
199
+ "prompt_tokens": 65,
200
+ "total_tokens": 577,
201
+ "ttft_seconds": null
202
+ },
203
+ {
204
+ "completion_tokens": 512,
205
+ "error": null,
206
+ "finish_reason": "length",
207
+ "latency_seconds": 31.63263613799063,
208
+ "ok": true,
209
+ "prompt_tokens": 48,
210
+ "total_tokens": 560,
211
+ "ttft_seconds": null
212
+ },
213
+ {
214
+ "completion_tokens": 512,
215
+ "error": null,
216
+ "finish_reason": "length",
217
+ "latency_seconds": 32.257003487000475,
218
+ "ok": true,
219
+ "prompt_tokens": 67,
220
+ "total_tokens": 579,
221
+ "ttft_seconds": null
222
+ },
223
+ {
224
+ "completion_tokens": 512,
225
+ "error": null,
226
+ "finish_reason": "length",
227
+ "latency_seconds": 32.36321775900433,
228
+ "ok": true,
229
+ "prompt_tokens": 67,
230
+ "total_tokens": 579,
231
+ "ttft_seconds": null
232
+ },
233
+ {
234
+ "completion_tokens": 512,
235
+ "error": null,
236
+ "finish_reason": "length",
237
+ "latency_seconds": 33.04354136499751,
238
+ "ok": true,
239
+ "prompt_tokens": 47,
240
+ "total_tokens": 559,
241
+ "ttft_seconds": null
242
+ },
243
+ {
244
+ "completion_tokens": 512,
245
+ "error": null,
246
+ "finish_reason": "length",
247
+ "latency_seconds": 29.575941068993416,
248
+ "ok": true,
249
+ "prompt_tokens": 52,
250
+ "total_tokens": 564,
251
+ "ttft_seconds": null
252
+ },
253
+ {
254
+ "completion_tokens": 512,
255
+ "error": null,
256
+ "finish_reason": "length",
257
+ "latency_seconds": 28.211714889999712,
258
+ "ok": true,
259
+ "prompt_tokens": 70,
260
+ "total_tokens": 582,
261
+ "ttft_seconds": null
262
+ },
263
+ {
264
+ "completion_tokens": 512,
265
+ "error": null,
266
+ "finish_reason": "length",
267
+ "latency_seconds": 28.033875556007843,
268
+ "ok": true,
269
+ "prompt_tokens": 84,
270
+ "total_tokens": 596,
271
+ "ttft_seconds": null
272
+ },
273
+ {
274
+ "completion_tokens": 512,
275
+ "error": null,
276
+ "finish_reason": "length",
277
+ "latency_seconds": 26.03217588600819,
278
+ "ok": true,
279
+ "prompt_tokens": 63,
280
+ "total_tokens": 575,
281
+ "ttft_seconds": null
282
+ },
283
+ {
284
+ "completion_tokens": 512,
285
+ "error": null,
286
+ "finish_reason": "length",
287
+ "latency_seconds": 28.456922130993917,
288
+ "ok": true,
289
+ "prompt_tokens": 63,
290
+ "total_tokens": 575,
291
+ "ttft_seconds": null
292
+ },
293
+ {
294
+ "completion_tokens": 512,
295
+ "error": null,
296
+ "finish_reason": "length",
297
+ "latency_seconds": 30.332903927002917,
298
+ "ok": true,
299
+ "prompt_tokens": 65,
300
+ "total_tokens": 577,
301
+ "ttft_seconds": null
302
+ },
303
+ {
304
+ "completion_tokens": 512,
305
+ "error": null,
306
+ "finish_reason": "length",
307
+ "latency_seconds": 29.372354336999706,
308
+ "ok": true,
309
+ "prompt_tokens": 47,
310
+ "total_tokens": 559,
311
+ "ttft_seconds": null
312
+ },
313
+ {
314
+ "completion_tokens": 512,
315
+ "error": null,
316
+ "finish_reason": "length",
317
+ "latency_seconds": 31.05327556200791,
318
+ "ok": true,
319
+ "prompt_tokens": 66,
320
+ "total_tokens": 578,
321
+ "ttft_seconds": null
322
+ },
323
+ {
324
+ "completion_tokens": 512,
325
+ "error": null,
326
+ "finish_reason": "length",
327
+ "latency_seconds": 29.101746874992386,
328
+ "ok": true,
329
+ "prompt_tokens": 69,
330
+ "total_tokens": 581,
331
+ "ttft_seconds": null
332
+ },
333
+ {
334
+ "completion_tokens": 512,
335
+ "error": null,
336
+ "finish_reason": "length",
337
+ "latency_seconds": 31.164281884994125,
338
+ "ok": true,
339
+ "prompt_tokens": 49,
340
+ "total_tokens": 561,
341
+ "ttft_seconds": null
342
+ },
343
+ {
344
+ "completion_tokens": 512,
345
+ "error": null,
346
+ "finish_reason": "length",
347
+ "latency_seconds": 30.552545909988112,
348
+ "ok": true,
349
+ "prompt_tokens": 48,
350
+ "total_tokens": 560,
351
+ "ttft_seconds": null
352
+ },
353
+ {
354
+ "completion_tokens": 512,
355
+ "error": null,
356
+ "finish_reason": "length",
357
+ "latency_seconds": 30.89522597200994,
358
+ "ok": true,
359
+ "prompt_tokens": 54,
360
+ "total_tokens": 566,
361
+ "ttft_seconds": null
362
+ },
363
+ {
364
+ "completion_tokens": 512,
365
+ "error": null,
366
+ "finish_reason": "length",
367
+ "latency_seconds": 29.33121335800388,
368
+ "ok": true,
369
+ "prompt_tokens": 53,
370
+ "total_tokens": 565,
371
+ "ttft_seconds": null
372
+ },
373
+ {
374
+ "completion_tokens": 512,
375
+ "error": null,
376
+ "finish_reason": "length",
377
+ "latency_seconds": 30.270061712988536,
378
+ "ok": true,
379
+ "prompt_tokens": 67,
380
+ "total_tokens": 579,
381
+ "ttft_seconds": null
382
+ },
383
+ {
384
+ "completion_tokens": 512,
385
+ "error": null,
386
+ "finish_reason": "length",
387
+ "latency_seconds": 29.271501894996618,
388
+ "ok": true,
389
+ "prompt_tokens": 52,
390
+ "total_tokens": 564,
391
+ "ttft_seconds": null
392
+ },
393
+ {
394
+ "completion_tokens": 512,
395
+ "error": null,
396
+ "finish_reason": "length",
397
+ "latency_seconds": 30.90504881599918,
398
+ "ok": true,
399
+ "prompt_tokens": 74,
400
+ "total_tokens": 586,
401
+ "ttft_seconds": null
402
+ },
403
+ {
404
+ "completion_tokens": 512,
405
+ "error": null,
406
+ "finish_reason": "length",
407
+ "latency_seconds": 30.147720770997694,
408
+ "ok": true,
409
+ "prompt_tokens": 57,
410
+ "total_tokens": 569,
411
+ "ttft_seconds": null
412
+ },
413
+ {
414
+ "completion_tokens": 512,
415
+ "error": null,
416
+ "finish_reason": "length",
417
+ "latency_seconds": 30.375956363001023,
418
+ "ok": true,
419
+ "prompt_tokens": 59,
420
+ "total_tokens": 571,
421
+ "ttft_seconds": null
422
+ },
423
+ {
424
+ "completion_tokens": 512,
425
+ "error": null,
426
+ "finish_reason": "length",
427
+ "latency_seconds": 29.630712212994695,
428
+ "ok": true,
429
+ "prompt_tokens": 66,
430
+ "total_tokens": 578,
431
+ "ttft_seconds": null
432
+ },
433
+ {
434
+ "completion_tokens": 512,
435
+ "error": null,
436
+ "finish_reason": "length",
437
+ "latency_seconds": 29.630458068000735,
438
+ "ok": true,
439
+ "prompt_tokens": 49,
440
+ "total_tokens": 561,
441
+ "ttft_seconds": null
442
+ },
443
+ {
444
+ "completion_tokens": 512,
445
+ "error": null,
446
+ "finish_reason": "length",
447
+ "latency_seconds": 29.8602861410036,
448
+ "ok": true,
449
+ "prompt_tokens": 65,
450
+ "total_tokens": 577,
451
+ "ttft_seconds": null
452
+ },
453
+ {
454
+ "completion_tokens": 512,
455
+ "error": null,
456
+ "finish_reason": "length",
457
+ "latency_seconds": 29.125156900001457,
458
+ "ok": true,
459
+ "prompt_tokens": 47,
460
+ "total_tokens": 559,
461
+ "ttft_seconds": null
462
+ },
463
+ {
464
+ "completion_tokens": 512,
465
+ "error": null,
466
+ "finish_reason": "length",
467
+ "latency_seconds": 31.983075775002362,
468
+ "ok": true,
469
+ "prompt_tokens": 48,
470
+ "total_tokens": 560,
471
+ "ttft_seconds": null
472
+ },
473
+ {
474
+ "completion_tokens": 512,
475
+ "error": null,
476
+ "finish_reason": "length",
477
+ "latency_seconds": 31.79805985900748,
478
+ "ok": true,
479
+ "prompt_tokens": 54,
480
+ "total_tokens": 566,
481
+ "ttft_seconds": null
482
+ }
483
+ ],
484
+ "summary": {
485
+ "concurrency": 24,
486
+ "errors": [],
487
+ "failed_requests": 0,
488
+ "mean_interactive_tps": null,
489
+ "mean_latency_seconds": 29.9914731047708,
490
+ "mean_ttft_seconds": null,
491
+ "output_token_throughput_tps": 378.97751797981925,
492
+ "p95_interactive_tps": null,
493
+ "p95_latency_seconds": 32.16112878780113,
494
+ "p95_ttft_seconds": null,
495
+ "request_count": 48,
496
+ "request_throughput_rps": 0.7401904648043345,
497
+ "successful_requests": 48,
498
+ "total_completion_tokens": 24576,
499
+ "total_prompt_tokens": 2931,
500
+ "wall_seconds": 64.84817392600235
501
+ }
502
+ }
benchmarks/CLEAN-dflash-st2-s32-c24.json ADDED
@@ -0,0 +1,502 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "completion_tokens": 512,
5
+ "error": null,
6
+ "finish_reason": "length",
7
+ "latency_seconds": 27.37981586100068,
8
+ "ok": true,
9
+ "prompt_tokens": 84,
10
+ "total_tokens": 596,
11
+ "ttft_seconds": null
12
+ },
13
+ {
14
+ "completion_tokens": 512,
15
+ "error": null,
16
+ "finish_reason": "length",
17
+ "latency_seconds": 28.3352533980069,
18
+ "ok": true,
19
+ "prompt_tokens": 63,
20
+ "total_tokens": 575,
21
+ "ttft_seconds": null
22
+ },
23
+ {
24
+ "completion_tokens": 512,
25
+ "error": null,
26
+ "finish_reason": "length",
27
+ "latency_seconds": 29.157588707006653,
28
+ "ok": true,
29
+ "prompt_tokens": 52,
30
+ "total_tokens": 564,
31
+ "ttft_seconds": null
32
+ },
33
+ {
34
+ "completion_tokens": 512,
35
+ "error": null,
36
+ "finish_reason": "length",
37
+ "latency_seconds": 29.28309229698789,
38
+ "ok": true,
39
+ "prompt_tokens": 84,
40
+ "total_tokens": 596,
41
+ "ttft_seconds": null
42
+ },
43
+ {
44
+ "completion_tokens": 512,
45
+ "error": null,
46
+ "finish_reason": "length",
47
+ "latency_seconds": 29.275857917993562,
48
+ "ok": true,
49
+ "prompt_tokens": 69,
50
+ "total_tokens": 581,
51
+ "ttft_seconds": null
52
+ },
53
+ {
54
+ "completion_tokens": 512,
55
+ "error": null,
56
+ "finish_reason": "length",
57
+ "latency_seconds": 29.9203517690039,
58
+ "ok": true,
59
+ "prompt_tokens": 57,
60
+ "total_tokens": 569,
61
+ "ttft_seconds": null
62
+ },
63
+ {
64
+ "completion_tokens": 512,
65
+ "error": null,
66
+ "finish_reason": "length",
67
+ "latency_seconds": 30.043441410001833,
68
+ "ok": true,
69
+ "prompt_tokens": 53,
70
+ "total_tokens": 565,
71
+ "ttft_seconds": null
72
+ },
73
+ {
74
+ "completion_tokens": 512,
75
+ "error": null,
76
+ "finish_reason": "length",
77
+ "latency_seconds": 30.03822507499717,
78
+ "ok": true,
79
+ "prompt_tokens": 53,
80
+ "total_tokens": 565,
81
+ "ttft_seconds": null
82
+ },
83
+ {
84
+ "completion_tokens": 512,
85
+ "error": null,
86
+ "finish_reason": "length",
87
+ "latency_seconds": 30.40548196999589,
88
+ "ok": true,
89
+ "prompt_tokens": 49,
90
+ "total_tokens": 561,
91
+ "ttft_seconds": null
92
+ },
93
+ {
94
+ "completion_tokens": 512,
95
+ "error": null,
96
+ "finish_reason": "length",
97
+ "latency_seconds": 30.6449004009919,
98
+ "ok": true,
99
+ "prompt_tokens": 69,
100
+ "total_tokens": 581,
101
+ "ttft_seconds": null
102
+ },
103
+ {
104
+ "completion_tokens": 512,
105
+ "error": null,
106
+ "finish_reason": "length",
107
+ "latency_seconds": 30.64418637799099,
108
+ "ok": true,
109
+ "prompt_tokens": 59,
110
+ "total_tokens": 571,
111
+ "ttft_seconds": null
112
+ },
113
+ {
114
+ "completion_tokens": 512,
115
+ "error": null,
116
+ "finish_reason": "length",
117
+ "latency_seconds": 30.87590012399596,
118
+ "ok": true,
119
+ "prompt_tokens": 59,
120
+ "total_tokens": 571,
121
+ "ttft_seconds": null
122
+ },
123
+ {
124
+ "completion_tokens": 512,
125
+ "error": null,
126
+ "finish_reason": "length",
127
+ "latency_seconds": 30.87946494400967,
128
+ "ok": true,
129
+ "prompt_tokens": 66,
130
+ "total_tokens": 578,
131
+ "ttft_seconds": null
132
+ },
133
+ {
134
+ "completion_tokens": 512,
135
+ "error": null,
136
+ "finish_reason": "length",
137
+ "latency_seconds": 31.011349057996995,
138
+ "ok": true,
139
+ "prompt_tokens": 74,
140
+ "total_tokens": 586,
141
+ "ttft_seconds": null
142
+ },
143
+ {
144
+ "completion_tokens": 512,
145
+ "error": null,
146
+ "finish_reason": "length",
147
+ "latency_seconds": 31.151489914002013,
148
+ "ok": true,
149
+ "prompt_tokens": 57,
150
+ "total_tokens": 569,
151
+ "ttft_seconds": null
152
+ },
153
+ {
154
+ "completion_tokens": 512,
155
+ "error": null,
156
+ "finish_reason": "length",
157
+ "latency_seconds": 31.153761499997927,
158
+ "ok": true,
159
+ "prompt_tokens": 74,
160
+ "total_tokens": 586,
161
+ "ttft_seconds": null
162
+ },
163
+ {
164
+ "completion_tokens": 512,
165
+ "error": null,
166
+ "finish_reason": "length",
167
+ "latency_seconds": 31.360902844011434,
168
+ "ok": true,
169
+ "prompt_tokens": 70,
170
+ "total_tokens": 582,
171
+ "ttft_seconds": null
172
+ },
173
+ {
174
+ "completion_tokens": 512,
175
+ "error": null,
176
+ "finish_reason": "length",
177
+ "latency_seconds": 31.954058966992307,
178
+ "ok": true,
179
+ "prompt_tokens": 47,
180
+ "total_tokens": 559,
181
+ "ttft_seconds": null
182
+ },
183
+ {
184
+ "completion_tokens": 512,
185
+ "error": null,
186
+ "finish_reason": "length",
187
+ "latency_seconds": 32.07014694499958,
188
+ "ok": true,
189
+ "prompt_tokens": 54,
190
+ "total_tokens": 566,
191
+ "ttft_seconds": null
192
+ },
193
+ {
194
+ "completion_tokens": 512,
195
+ "error": null,
196
+ "finish_reason": "length",
197
+ "latency_seconds": 32.19269592600176,
198
+ "ok": true,
199
+ "prompt_tokens": 48,
200
+ "total_tokens": 560,
201
+ "ttft_seconds": null
202
+ },
203
+ {
204
+ "completion_tokens": 512,
205
+ "error": null,
206
+ "finish_reason": "length",
207
+ "latency_seconds": 32.46390866699221,
208
+ "ok": true,
209
+ "prompt_tokens": 70,
210
+ "total_tokens": 582,
211
+ "ttft_seconds": null
212
+ },
213
+ {
214
+ "completion_tokens": 512,
215
+ "error": null,
216
+ "finish_reason": "length",
217
+ "latency_seconds": 32.864960171995335,
218
+ "ok": true,
219
+ "prompt_tokens": 65,
220
+ "total_tokens": 577,
221
+ "ttft_seconds": null
222
+ },
223
+ {
224
+ "completion_tokens": 512,
225
+ "error": null,
226
+ "finish_reason": "length",
227
+ "latency_seconds": 32.861970097001176,
228
+ "ok": true,
229
+ "prompt_tokens": 67,
230
+ "total_tokens": 579,
231
+ "ttft_seconds": null
232
+ },
233
+ {
234
+ "completion_tokens": 512,
235
+ "error": null,
236
+ "finish_reason": "length",
237
+ "latency_seconds": 32.86855284299236,
238
+ "ok": true,
239
+ "prompt_tokens": 67,
240
+ "total_tokens": 579,
241
+ "ttft_seconds": null
242
+ },
243
+ {
244
+ "completion_tokens": 512,
245
+ "error": null,
246
+ "finish_reason": "length",
247
+ "latency_seconds": 28.28247975600243,
248
+ "ok": true,
249
+ "prompt_tokens": 63,
250
+ "total_tokens": 575,
251
+ "ttft_seconds": null
252
+ },
253
+ {
254
+ "completion_tokens": 512,
255
+ "error": null,
256
+ "finish_reason": "length",
257
+ "latency_seconds": 29.484684513998218,
258
+ "ok": true,
259
+ "prompt_tokens": 66,
260
+ "total_tokens": 578,
261
+ "ttft_seconds": null
262
+ },
263
+ {
264
+ "completion_tokens": 512,
265
+ "error": null,
266
+ "finish_reason": "length",
267
+ "latency_seconds": 31.488656917004846,
268
+ "ok": true,
269
+ "prompt_tokens": 52,
270
+ "total_tokens": 564,
271
+ "ttft_seconds": null
272
+ },
273
+ {
274
+ "completion_tokens": 512,
275
+ "error": null,
276
+ "finish_reason": "length",
277
+ "latency_seconds": 29.033699637002428,
278
+ "ok": true,
279
+ "prompt_tokens": 84,
280
+ "total_tokens": 596,
281
+ "ttft_seconds": null
282
+ },
283
+ {
284
+ "completion_tokens": 512,
285
+ "error": null,
286
+ "finish_reason": "length",
287
+ "latency_seconds": 31.569517002993962,
288
+ "ok": true,
289
+ "prompt_tokens": 65,
290
+ "total_tokens": 577,
291
+ "ttft_seconds": null
292
+ },
293
+ {
294
+ "completion_tokens": 512,
295
+ "error": null,
296
+ "finish_reason": "length",
297
+ "latency_seconds": 31.057641403007437,
298
+ "ok": true,
299
+ "prompt_tokens": 49,
300
+ "total_tokens": 561,
301
+ "ttft_seconds": null
302
+ },
303
+ {
304
+ "completion_tokens": 512,
305
+ "error": null,
306
+ "finish_reason": "length",
307
+ "latency_seconds": 29.62780538300285,
308
+ "ok": true,
309
+ "prompt_tokens": 59,
310
+ "total_tokens": 571,
311
+ "ttft_seconds": null
312
+ },
313
+ {
314
+ "completion_tokens": 512,
315
+ "error": null,
316
+ "finish_reason": "length",
317
+ "latency_seconds": 30.534046617001877,
318
+ "ok": true,
319
+ "prompt_tokens": 74,
320
+ "total_tokens": 586,
321
+ "ttft_seconds": null
322
+ },
323
+ {
324
+ "completion_tokens": 512,
325
+ "error": null,
326
+ "finish_reason": "length",
327
+ "latency_seconds": 31.547726621996844,
328
+ "ok": true,
329
+ "prompt_tokens": 70,
330
+ "total_tokens": 582,
331
+ "ttft_seconds": null
332
+ },
333
+ {
334
+ "completion_tokens": 512,
335
+ "error": null,
336
+ "finish_reason": "length",
337
+ "latency_seconds": 31.911630268004956,
338
+ "ok": true,
339
+ "prompt_tokens": 47,
340
+ "total_tokens": 559,
341
+ "ttft_seconds": null
342
+ },
343
+ {
344
+ "completion_tokens": 512,
345
+ "error": null,
346
+ "finish_reason": "length",
347
+ "latency_seconds": 30.121892206996563,
348
+ "ok": true,
349
+ "prompt_tokens": 63,
350
+ "total_tokens": 575,
351
+ "ttft_seconds": null
352
+ },
353
+ {
354
+ "completion_tokens": 512,
355
+ "error": null,
356
+ "finish_reason": "length",
357
+ "latency_seconds": 31.431080445996486,
358
+ "ok": true,
359
+ "prompt_tokens": 53,
360
+ "total_tokens": 565,
361
+ "ttft_seconds": null
362
+ },
363
+ {
364
+ "completion_tokens": 512,
365
+ "error": null,
366
+ "finish_reason": "length",
367
+ "latency_seconds": 33.46200022000994,
368
+ "ok": true,
369
+ "prompt_tokens": 48,
370
+ "total_tokens": 560,
371
+ "ttft_seconds": null
372
+ },
373
+ {
374
+ "completion_tokens": 512,
375
+ "error": null,
376
+ "finish_reason": "length",
377
+ "latency_seconds": 32.13843286699557,
378
+ "ok": true,
379
+ "prompt_tokens": 52,
380
+ "total_tokens": 564,
381
+ "ttft_seconds": null
382
+ },
383
+ {
384
+ "completion_tokens": 512,
385
+ "error": null,
386
+ "finish_reason": "length",
387
+ "latency_seconds": 33.0463265189901,
388
+ "ok": true,
389
+ "prompt_tokens": 69,
390
+ "total_tokens": 581,
391
+ "ttft_seconds": null
392
+ },
393
+ {
394
+ "completion_tokens": 512,
395
+ "error": null,
396
+ "finish_reason": "length",
397
+ "latency_seconds": 32.64756175701041,
398
+ "ok": true,
399
+ "prompt_tokens": 65,
400
+ "total_tokens": 577,
401
+ "ttft_seconds": null
402
+ },
403
+ {
404
+ "completion_tokens": 512,
405
+ "error": null,
406
+ "finish_reason": "length",
407
+ "latency_seconds": 32.03739858500194,
408
+ "ok": true,
409
+ "prompt_tokens": 48,
410
+ "total_tokens": 560,
411
+ "ttft_seconds": null
412
+ },
413
+ {
414
+ "completion_tokens": 512,
415
+ "error": null,
416
+ "finish_reason": "length",
417
+ "latency_seconds": 35.158272015003604,
418
+ "ok": true,
419
+ "prompt_tokens": 54,
420
+ "total_tokens": 566,
421
+ "ttft_seconds": null
422
+ },
423
+ {
424
+ "completion_tokens": 512,
425
+ "error": null,
426
+ "finish_reason": "length",
427
+ "latency_seconds": 34.321281433003605,
428
+ "ok": true,
429
+ "prompt_tokens": 67,
430
+ "total_tokens": 579,
431
+ "ttft_seconds": null
432
+ },
433
+ {
434
+ "completion_tokens": 512,
435
+ "error": null,
436
+ "finish_reason": "length",
437
+ "latency_seconds": 33.30930214600812,
438
+ "ok": true,
439
+ "prompt_tokens": 66,
440
+ "total_tokens": 578,
441
+ "ttft_seconds": null
442
+ },
443
+ {
444
+ "completion_tokens": 512,
445
+ "error": null,
446
+ "finish_reason": "length",
447
+ "latency_seconds": 34.42579705698881,
448
+ "ok": true,
449
+ "prompt_tokens": 57,
450
+ "total_tokens": 569,
451
+ "ttft_seconds": null
452
+ },
453
+ {
454
+ "completion_tokens": 512,
455
+ "error": null,
456
+ "finish_reason": "length",
457
+ "latency_seconds": 33.18547840398969,
458
+ "ok": true,
459
+ "prompt_tokens": 47,
460
+ "total_tokens": 559,
461
+ "ttft_seconds": null
462
+ },
463
+ {
464
+ "completion_tokens": 512,
465
+ "error": null,
466
+ "finish_reason": "length",
467
+ "latency_seconds": 33.97069786199427,
468
+ "ok": true,
469
+ "prompt_tokens": 49,
470
+ "total_tokens": 561,
471
+ "ttft_seconds": null
472
+ },
473
+ {
474
+ "completion_tokens": 512,
475
+ "error": null,
476
+ "finish_reason": "length",
477
+ "latency_seconds": 35.37314189800236,
478
+ "ok": true,
479
+ "prompt_tokens": 54,
480
+ "total_tokens": 566,
481
+ "ttft_seconds": null
482
+ }
483
+ ],
484
+ "summary": {
485
+ "concurrency": 24,
486
+ "errors": [],
487
+ "failed_requests": 0,
488
+ "mean_interactive_tps": null,
489
+ "mean_latency_seconds": 31.41674809835361,
490
+ "mean_ttft_seconds": null,
491
+ "output_token_throughput_tps": 360.1202566066748,
492
+ "p95_interactive_tps": null,
493
+ "p95_latency_seconds": 34.38921658859399,
494
+ "p95_ttft_seconds": null,
495
+ "request_count": 48,
496
+ "request_throughput_rps": 0.7033598761849117,
497
+ "successful_requests": 48,
498
+ "total_completion_tokens": 24576,
499
+ "total_prompt_tokens": 2931,
500
+ "wall_seconds": 68.24387006599864
501
+ }
502
+ }
benchmarks/CLEAN-dflash-st2-s32-c32.json ADDED
@@ -0,0 +1,662 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "completion_tokens": 512,
5
+ "error": null,
6
+ "finish_reason": "length",
7
+ "latency_seconds": 28.01882232099888,
8
+ "ok": true,
9
+ "prompt_tokens": 84,
10
+ "total_tokens": 596,
11
+ "ttft_seconds": null
12
+ },
13
+ {
14
+ "completion_tokens": 512,
15
+ "error": null,
16
+ "finish_reason": "length",
17
+ "latency_seconds": 28.71752019300766,
18
+ "ok": true,
19
+ "prompt_tokens": 84,
20
+ "total_tokens": 596,
21
+ "ttft_seconds": null
22
+ },
23
+ {
24
+ "completion_tokens": 512,
25
+ "error": null,
26
+ "finish_reason": "length",
27
+ "latency_seconds": 29.09034041898849,
28
+ "ok": true,
29
+ "prompt_tokens": 59,
30
+ "total_tokens": 571,
31
+ "ttft_seconds": null
32
+ },
33
+ {
34
+ "completion_tokens": 512,
35
+ "error": null,
36
+ "finish_reason": "length",
37
+ "latency_seconds": 29.085263486005715,
38
+ "ok": true,
39
+ "prompt_tokens": 59,
40
+ "total_tokens": 571,
41
+ "ttft_seconds": null
42
+ },
43
+ {
44
+ "completion_tokens": 512,
45
+ "error": null,
46
+ "finish_reason": "length",
47
+ "latency_seconds": 29.084148221998475,
48
+ "ok": true,
49
+ "prompt_tokens": 66,
50
+ "total_tokens": 578,
51
+ "ttft_seconds": null
52
+ },
53
+ {
54
+ "completion_tokens": 512,
55
+ "error": null,
56
+ "finish_reason": "length",
57
+ "latency_seconds": 29.660148123002728,
58
+ "ok": true,
59
+ "prompt_tokens": 63,
60
+ "total_tokens": 575,
61
+ "ttft_seconds": null
62
+ },
63
+ {
64
+ "completion_tokens": 512,
65
+ "error": null,
66
+ "finish_reason": "length",
67
+ "latency_seconds": 29.893849693995435,
68
+ "ok": true,
69
+ "prompt_tokens": 52,
70
+ "total_tokens": 564,
71
+ "ttft_seconds": null
72
+ },
73
+ {
74
+ "completion_tokens": 512,
75
+ "error": null,
76
+ "finish_reason": "length",
77
+ "latency_seconds": 29.88796380200074,
78
+ "ok": true,
79
+ "prompt_tokens": 63,
80
+ "total_tokens": 575,
81
+ "ttft_seconds": null
82
+ },
83
+ {
84
+ "completion_tokens": 512,
85
+ "error": null,
86
+ "finish_reason": "length",
87
+ "latency_seconds": 30.099160569006926,
88
+ "ok": true,
89
+ "prompt_tokens": 53,
90
+ "total_tokens": 565,
91
+ "ttft_seconds": null
92
+ },
93
+ {
94
+ "completion_tokens": 512,
95
+ "error": null,
96
+ "finish_reason": "length",
97
+ "latency_seconds": 30.23486655400484,
98
+ "ok": true,
99
+ "prompt_tokens": 53,
100
+ "total_tokens": 565,
101
+ "ttft_seconds": null
102
+ },
103
+ {
104
+ "completion_tokens": 512,
105
+ "error": null,
106
+ "finish_reason": "length",
107
+ "latency_seconds": 30.665212656007498,
108
+ "ok": true,
109
+ "prompt_tokens": 65,
110
+ "total_tokens": 577,
111
+ "ttft_seconds": null
112
+ },
113
+ {
114
+ "completion_tokens": 512,
115
+ "error": null,
116
+ "finish_reason": "length",
117
+ "latency_seconds": 30.67044495200389,
118
+ "ok": true,
119
+ "prompt_tokens": 65,
120
+ "total_tokens": 577,
121
+ "ttft_seconds": null
122
+ },
123
+ {
124
+ "completion_tokens": 512,
125
+ "error": null,
126
+ "finish_reason": "length",
127
+ "latency_seconds": 30.786519360000966,
128
+ "ok": true,
129
+ "prompt_tokens": 74,
130
+ "total_tokens": 586,
131
+ "ttft_seconds": null
132
+ },
133
+ {
134
+ "completion_tokens": 512,
135
+ "error": null,
136
+ "finish_reason": "length",
137
+ "latency_seconds": 31.033634652994806,
138
+ "ok": true,
139
+ "prompt_tokens": 48,
140
+ "total_tokens": 560,
141
+ "ttft_seconds": null
142
+ },
143
+ {
144
+ "completion_tokens": 512,
145
+ "error": null,
146
+ "finish_reason": "length",
147
+ "latency_seconds": 31.16041296599724,
148
+ "ok": true,
149
+ "prompt_tokens": 49,
150
+ "total_tokens": 561,
151
+ "ttft_seconds": null
152
+ },
153
+ {
154
+ "completion_tokens": 512,
155
+ "error": null,
156
+ "finish_reason": "length",
157
+ "latency_seconds": 31.158351751990267,
158
+ "ok": true,
159
+ "prompt_tokens": 74,
160
+ "total_tokens": 586,
161
+ "ttft_seconds": null
162
+ },
163
+ {
164
+ "completion_tokens": 512,
165
+ "error": null,
166
+ "finish_reason": "length",
167
+ "latency_seconds": 31.27831143300864,
168
+ "ok": true,
169
+ "prompt_tokens": 70,
170
+ "total_tokens": 582,
171
+ "ttft_seconds": null
172
+ },
173
+ {
174
+ "completion_tokens": 512,
175
+ "error": null,
176
+ "finish_reason": "length",
177
+ "latency_seconds": 31.394244242997956,
178
+ "ok": true,
179
+ "prompt_tokens": 69,
180
+ "total_tokens": 581,
181
+ "ttft_seconds": null
182
+ },
183
+ {
184
+ "completion_tokens": 512,
185
+ "error": null,
186
+ "finish_reason": "length",
187
+ "latency_seconds": 31.521478868002305,
188
+ "ok": true,
189
+ "prompt_tokens": 66,
190
+ "total_tokens": 578,
191
+ "ttft_seconds": null
192
+ },
193
+ {
194
+ "completion_tokens": 512,
195
+ "error": null,
196
+ "finish_reason": "length",
197
+ "latency_seconds": 31.51592241799517,
198
+ "ok": true,
199
+ "prompt_tokens": 48,
200
+ "total_tokens": 560,
201
+ "ttft_seconds": null
202
+ },
203
+ {
204
+ "completion_tokens": 512,
205
+ "error": null,
206
+ "finish_reason": "length",
207
+ "latency_seconds": 31.646591530996375,
208
+ "ok": true,
209
+ "prompt_tokens": 57,
210
+ "total_tokens": 569,
211
+ "ttft_seconds": null
212
+ },
213
+ {
214
+ "completion_tokens": 512,
215
+ "error": null,
216
+ "finish_reason": "length",
217
+ "latency_seconds": 31.640455272005056,
218
+ "ok": true,
219
+ "prompt_tokens": 49,
220
+ "total_tokens": 561,
221
+ "ttft_seconds": null
222
+ },
223
+ {
224
+ "completion_tokens": 512,
225
+ "error": null,
226
+ "finish_reason": "length",
227
+ "latency_seconds": 31.643912301005912,
228
+ "ok": true,
229
+ "prompt_tokens": 70,
230
+ "total_tokens": 582,
231
+ "ttft_seconds": null
232
+ },
233
+ {
234
+ "completion_tokens": 512,
235
+ "error": null,
236
+ "finish_reason": "length",
237
+ "latency_seconds": 31.76287691500329,
238
+ "ok": true,
239
+ "prompt_tokens": 54,
240
+ "total_tokens": 566,
241
+ "ttft_seconds": null
242
+ },
243
+ {
244
+ "completion_tokens": 512,
245
+ "error": null,
246
+ "finish_reason": "length",
247
+ "latency_seconds": 32.09083567500056,
248
+ "ok": true,
249
+ "prompt_tokens": 52,
250
+ "total_tokens": 564,
251
+ "ttft_seconds": null
252
+ },
253
+ {
254
+ "completion_tokens": 512,
255
+ "error": null,
256
+ "finish_reason": "length",
257
+ "latency_seconds": 32.09159855300095,
258
+ "ok": true,
259
+ "prompt_tokens": 57,
260
+ "total_tokens": 569,
261
+ "ttft_seconds": null
262
+ },
263
+ {
264
+ "completion_tokens": 512,
265
+ "error": null,
266
+ "finish_reason": "length",
267
+ "latency_seconds": 32.207180618992425,
268
+ "ok": true,
269
+ "prompt_tokens": 69,
270
+ "total_tokens": 581,
271
+ "ttft_seconds": null
272
+ },
273
+ {
274
+ "completion_tokens": 512,
275
+ "error": null,
276
+ "finish_reason": "length",
277
+ "latency_seconds": 32.572931612987304,
278
+ "ok": true,
279
+ "prompt_tokens": 54,
280
+ "total_tokens": 566,
281
+ "ttft_seconds": null
282
+ },
283
+ {
284
+ "completion_tokens": 512,
285
+ "error": null,
286
+ "finish_reason": "length",
287
+ "latency_seconds": 33.17789545400592,
288
+ "ok": true,
289
+ "prompt_tokens": 47,
290
+ "total_tokens": 559,
291
+ "ttft_seconds": null
292
+ },
293
+ {
294
+ "completion_tokens": 512,
295
+ "error": null,
296
+ "finish_reason": "length",
297
+ "latency_seconds": 33.173211657005595,
298
+ "ok": true,
299
+ "prompt_tokens": 47,
300
+ "total_tokens": 559,
301
+ "ttft_seconds": null
302
+ },
303
+ {
304
+ "completion_tokens": 512,
305
+ "error": null,
306
+ "finish_reason": "length",
307
+ "latency_seconds": 33.17662995199498,
308
+ "ok": true,
309
+ "prompt_tokens": 67,
310
+ "total_tokens": 579,
311
+ "ttft_seconds": null
312
+ },
313
+ {
314
+ "completion_tokens": 512,
315
+ "error": null,
316
+ "finish_reason": "length",
317
+ "latency_seconds": 33.18274166400079,
318
+ "ok": true,
319
+ "prompt_tokens": 67,
320
+ "total_tokens": 579,
321
+ "ttft_seconds": null
322
+ },
323
+ {
324
+ "completion_tokens": 512,
325
+ "error": null,
326
+ "finish_reason": "length",
327
+ "latency_seconds": 28.288002299988875,
328
+ "ok": true,
329
+ "prompt_tokens": 84,
330
+ "total_tokens": 596,
331
+ "ttft_seconds": null
332
+ },
333
+ {
334
+ "completion_tokens": 512,
335
+ "error": null,
336
+ "finish_reason": "length",
337
+ "latency_seconds": 28.07908038899768,
338
+ "ok": true,
339
+ "prompt_tokens": 52,
340
+ "total_tokens": 564,
341
+ "ttft_seconds": null
342
+ },
343
+ {
344
+ "completion_tokens": 512,
345
+ "error": null,
346
+ "finish_reason": "length",
347
+ "latency_seconds": 27.399355375004234,
348
+ "ok": true,
349
+ "prompt_tokens": 63,
350
+ "total_tokens": 575,
351
+ "ttft_seconds": null
352
+ },
353
+ {
354
+ "completion_tokens": 512,
355
+ "error": null,
356
+ "finish_reason": "length",
357
+ "latency_seconds": 30.969244494001032,
358
+ "ok": true,
359
+ "prompt_tokens": 70,
360
+ "total_tokens": 582,
361
+ "ttft_seconds": null
362
+ },
363
+ {
364
+ "completion_tokens": 512,
365
+ "error": null,
366
+ "finish_reason": "length",
367
+ "latency_seconds": 29.57875262699963,
368
+ "ok": true,
369
+ "prompt_tokens": 59,
370
+ "total_tokens": 571,
371
+ "ttft_seconds": null
372
+ },
373
+ {
374
+ "completion_tokens": 512,
375
+ "error": null,
376
+ "finish_reason": "length",
377
+ "latency_seconds": 28.15583077100746,
378
+ "ok": true,
379
+ "prompt_tokens": 84,
380
+ "total_tokens": 596,
381
+ "ttft_seconds": null
382
+ },
383
+ {
384
+ "completion_tokens": 512,
385
+ "error": null,
386
+ "finish_reason": "length",
387
+ "latency_seconds": 29.113754692996736,
388
+ "ok": true,
389
+ "prompt_tokens": 66,
390
+ "total_tokens": 578,
391
+ "ttft_seconds": null
392
+ },
393
+ {
394
+ "completion_tokens": 512,
395
+ "error": null,
396
+ "finish_reason": "length",
397
+ "latency_seconds": 30.311436862000846,
398
+ "ok": true,
399
+ "prompt_tokens": 57,
400
+ "total_tokens": 569,
401
+ "ttft_seconds": null
402
+ },
403
+ {
404
+ "completion_tokens": 512,
405
+ "error": null,
406
+ "finish_reason": "length",
407
+ "latency_seconds": 31.23266291100299,
408
+ "ok": true,
409
+ "prompt_tokens": 69,
410
+ "total_tokens": 581,
411
+ "ttft_seconds": null
412
+ },
413
+ {
414
+ "completion_tokens": 512,
415
+ "error": null,
416
+ "finish_reason": "length",
417
+ "latency_seconds": 29.039967449003598,
418
+ "ok": true,
419
+ "prompt_tokens": 53,
420
+ "total_tokens": 565,
421
+ "ttft_seconds": null
422
+ },
423
+ {
424
+ "completion_tokens": 512,
425
+ "error": null,
426
+ "finish_reason": "length",
427
+ "latency_seconds": 31.84722881700145,
428
+ "ok": true,
429
+ "prompt_tokens": 74,
430
+ "total_tokens": 586,
431
+ "ttft_seconds": null
432
+ },
433
+ {
434
+ "completion_tokens": 512,
435
+ "error": null,
436
+ "finish_reason": "length",
437
+ "latency_seconds": 30.139180625992594,
438
+ "ok": true,
439
+ "prompt_tokens": 48,
440
+ "total_tokens": 560,
441
+ "ttft_seconds": null
442
+ },
443
+ {
444
+ "completion_tokens": 512,
445
+ "error": null,
446
+ "finish_reason": "length",
447
+ "latency_seconds": 32.20148644999426,
448
+ "ok": true,
449
+ "prompt_tokens": 67,
450
+ "total_tokens": 579,
451
+ "ttft_seconds": null
452
+ },
453
+ {
454
+ "completion_tokens": 512,
455
+ "error": null,
456
+ "finish_reason": "length",
457
+ "latency_seconds": 28.228285240998957,
458
+ "ok": true,
459
+ "prompt_tokens": 63,
460
+ "total_tokens": 575,
461
+ "ttft_seconds": null
462
+ },
463
+ {
464
+ "completion_tokens": 512,
465
+ "error": null,
466
+ "finish_reason": "length",
467
+ "latency_seconds": 31.629715696006315,
468
+ "ok": true,
469
+ "prompt_tokens": 53,
470
+ "total_tokens": 565,
471
+ "ttft_seconds": null
472
+ },
473
+ {
474
+ "completion_tokens": 512,
475
+ "error": null,
476
+ "finish_reason": "length",
477
+ "latency_seconds": 29.994200429006014,
478
+ "ok": true,
479
+ "prompt_tokens": 57,
480
+ "total_tokens": 569,
481
+ "ttft_seconds": null
482
+ },
483
+ {
484
+ "completion_tokens": 512,
485
+ "error": null,
486
+ "finish_reason": "length",
487
+ "latency_seconds": 29.895010123000247,
488
+ "ok": true,
489
+ "prompt_tokens": 52,
490
+ "total_tokens": 564,
491
+ "ttft_seconds": null
492
+ },
493
+ {
494
+ "completion_tokens": 512,
495
+ "error": null,
496
+ "finish_reason": "length",
497
+ "latency_seconds": 31.32018800600781,
498
+ "ok": true,
499
+ "prompt_tokens": 49,
500
+ "total_tokens": 561,
501
+ "ttft_seconds": null
502
+ },
503
+ {
504
+ "completion_tokens": 512,
505
+ "error": null,
506
+ "finish_reason": "length",
507
+ "latency_seconds": 30.468937472003745,
508
+ "ok": true,
509
+ "prompt_tokens": 74,
510
+ "total_tokens": 586,
511
+ "ttft_seconds": null
512
+ },
513
+ {
514
+ "completion_tokens": 512,
515
+ "error": null,
516
+ "finish_reason": "length",
517
+ "latency_seconds": 31.01375381600519,
518
+ "ok": true,
519
+ "prompt_tokens": 47,
520
+ "total_tokens": 559,
521
+ "ttft_seconds": null
522
+ },
523
+ {
524
+ "completion_tokens": 512,
525
+ "error": null,
526
+ "finish_reason": "length",
527
+ "latency_seconds": 30.52879051498894,
528
+ "ok": true,
529
+ "prompt_tokens": 59,
530
+ "total_tokens": 571,
531
+ "ttft_seconds": null
532
+ },
533
+ {
534
+ "completion_tokens": 512,
535
+ "error": null,
536
+ "finish_reason": "length",
537
+ "latency_seconds": 32.05789956198714,
538
+ "ok": true,
539
+ "prompt_tokens": 65,
540
+ "total_tokens": 577,
541
+ "ttft_seconds": null
542
+ },
543
+ {
544
+ "completion_tokens": 512,
545
+ "error": null,
546
+ "finish_reason": "length",
547
+ "latency_seconds": 31.24651585900574,
548
+ "ok": true,
549
+ "prompt_tokens": 54,
550
+ "total_tokens": 566,
551
+ "ttft_seconds": null
552
+ },
553
+ {
554
+ "completion_tokens": 512,
555
+ "error": null,
556
+ "finish_reason": "length",
557
+ "latency_seconds": 31.40005826498964,
558
+ "ok": true,
559
+ "prompt_tokens": 70,
560
+ "total_tokens": 582,
561
+ "ttft_seconds": null
562
+ },
563
+ {
564
+ "completion_tokens": 512,
565
+ "error": null,
566
+ "finish_reason": "length",
567
+ "latency_seconds": 31.26673553499859,
568
+ "ok": true,
569
+ "prompt_tokens": 67,
570
+ "total_tokens": 579,
571
+ "ttft_seconds": null
572
+ },
573
+ {
574
+ "completion_tokens": 512,
575
+ "error": null,
576
+ "finish_reason": "length",
577
+ "latency_seconds": 29.840106451010797,
578
+ "ok": true,
579
+ "prompt_tokens": 47,
580
+ "total_tokens": 559,
581
+ "ttft_seconds": null
582
+ },
583
+ {
584
+ "completion_tokens": 512,
585
+ "error": null,
586
+ "finish_reason": "length",
587
+ "latency_seconds": 31.03792402399995,
588
+ "ok": true,
589
+ "prompt_tokens": 65,
590
+ "total_tokens": 577,
591
+ "ttft_seconds": null
592
+ },
593
+ {
594
+ "completion_tokens": 512,
595
+ "error": null,
596
+ "finish_reason": "length",
597
+ "latency_seconds": 31.67069018499751,
598
+ "ok": true,
599
+ "prompt_tokens": 69,
600
+ "total_tokens": 581,
601
+ "ttft_seconds": null
602
+ },
603
+ {
604
+ "completion_tokens": 512,
605
+ "error": null,
606
+ "finish_reason": "length",
607
+ "latency_seconds": 31.22248278198822,
608
+ "ok": true,
609
+ "prompt_tokens": 66,
610
+ "total_tokens": 578,
611
+ "ttft_seconds": null
612
+ },
613
+ {
614
+ "completion_tokens": 512,
615
+ "error": null,
616
+ "finish_reason": "length",
617
+ "latency_seconds": 30.597016336003435,
618
+ "ok": true,
619
+ "prompt_tokens": 54,
620
+ "total_tokens": 566,
621
+ "ttft_seconds": null
622
+ },
623
+ {
624
+ "completion_tokens": 512,
625
+ "error": null,
626
+ "finish_reason": "length",
627
+ "latency_seconds": 30.841894728000625,
628
+ "ok": true,
629
+ "prompt_tokens": 48,
630
+ "total_tokens": 560,
631
+ "ttft_seconds": null
632
+ },
633
+ {
634
+ "completion_tokens": 512,
635
+ "error": null,
636
+ "finish_reason": "length",
637
+ "latency_seconds": 31.974093063996406,
638
+ "ok": true,
639
+ "prompt_tokens": 49,
640
+ "total_tokens": 561,
641
+ "ttft_seconds": null
642
+ }
643
+ ],
644
+ "summary": {
645
+ "concurrency": 32,
646
+ "errors": [],
647
+ "failed_requests": 0,
648
+ "mean_interactive_tps": null,
649
+ "mean_latency_seconds": 30.717402495984288,
650
+ "mean_ttft_seconds": null,
651
+ "output_token_throughput_tps": 507.60902611197423,
652
+ "p95_interactive_tps": null,
653
+ "p95_latency_seconds": 33.08316965040285,
654
+ "p95_ttft_seconds": null,
655
+ "request_count": 64,
656
+ "request_throughput_rps": 0.9914238791249497,
657
+ "successful_requests": 64,
658
+ "total_completion_tokens": 32768,
659
+ "total_prompt_tokens": 3908,
660
+ "wall_seconds": 64.55361964499753
661
+ }
662
+ }
build-kimi26-dflash.sh ADDED
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
IMAGE_NAME="kimi26-dflash"
DATE_TAG="$(date +%Y%m%d)"

echo "Building ${IMAGE_NAME}:latest from Dockerfile.kimi26-dflash ..."

docker build \
  -f "$SCRIPT_DIR/Dockerfile.kimi26-dflash" \
  -t "${IMAGE_NAME}:latest" \
  -t "${IMAGE_NAME}:${DATE_TAG}" \
  "$SCRIPT_DIR"

IMAGE_ID="$(docker images -q "${IMAGE_NAME}:latest" | head -1)"
IMAGE_SIZE="$(docker image inspect "${IMAGE_NAME}:latest" --format '{{.Size}}' | awk '{printf "%.1f GB", $1/1e9}')"

echo ""
echo "Built: ${IMAGE_NAME}:latest (also tagged :${DATE_TAG})"
echo "ID:    ${IMAGE_ID}"
echo "Size:  ${IMAGE_SIZE}"
configs/production.env ADDED
@@ -0,0 +1,47 @@
# Kimi K2.6 DFlash Production Configuration
# 507 tok/s on 8x AMD Instinct MI300X (gfx942)
#
# Prerequisites:
#   - NUMA balancing disabled: echo 0 > /proc/sys/kernel/numa_balancing
#   - Docker with ROCm support
#   - vllm/vllm-openai-rocm:nightly image
#   - Model: moonshotai/Kimi-K2.6 on local NVMe
#   - Draft: z-lab/Kimi-K2.5-DFlash on local NVMe

# Target model
MODEL_DIR=/mnt/nvme5n1p1/hydra/models/Kimi-K2.6
DRAFT_MODEL_DIR=/mnt/nvme5n1p1/hydra/models/Kimi-K2.5-DFlash
IMAGE=vllm/vllm-openai-rocm:nightly
PORT=8262

# DFlash speculative decoding
SPEC_METHOD=dflash
NUM_SPECULATIVE_TOKENS=2
BLOCK_SIZE=16

# Scheduler
MAX_NUM_SEQS=32
MAX_NUM_BATCHED_TOKENS=32768
MAX_MODEL_LEN=262144
GPU_MEMORY_UTILIZATION=0.90

# Runtime
TENSOR_PARALLEL_SIZE=8
ENFORCE_EAGER=true
MOE_BACKEND=aiter
OPTIMIZATION_LEVEL=2
PERFORMANCE_MODE=throughput
SAFETENSORS_LOAD_STRATEGY=lazy
ENABLE_PREFIX_CACHING=false
ENABLE_CHUNKED_PREFILL=true

# ROCm environment
PYTORCH_ROCM_ARCH=gfx942
AITER_ROCM_ARCH=gfx942
GPU_ARCHS=gfx942
VLLM_ROCM_USE_AITER=1
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
VLLM_ROCM_USE_AITER_RMSNORM=0
HSA_ENABLE_SDMA=0
HSA_NO_SCRATCH_RECLAIM=1
OMP_NUM_THREADS=1
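
The file is plain KEY=VALUE, so a launcher can parse it directly and assemble the serve command. A minimal sketch — the env text is inlined here for illustration, and the flag mapping is an assumption to verify against your vLLM build:

```python
# Representative lines from configs/production.env, inlined for the sketch.
ENV_TEXT = """\
# Kimi K2.6 DFlash Production Configuration
PORT=8262
TENSOR_PARALLEL_SIZE=8
MAX_NUM_SEQS=32
MAX_NUM_BATCHED_TOKENS=32768
GPU_MEMORY_UTILIZATION=0.90
ENFORCE_EAGER=true
"""

def load_env(text: str) -> dict:
    """Parse KEY=VALUE lines, skipping blanks and # comments."""
    env = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        key, _, value = line.partition("=")
        env[key.strip()] = value.strip()
    return env

cfg = load_env(ENV_TEXT)

# Illustrative flag mapping -- check flag names against your vLLM version.
cmd = [
    "vllm", "serve",
    "--port", cfg["PORT"],
    "--tensor-parallel-size", cfg["TENSOR_PARALLEL_SIZE"],
    "--max-num-seqs", cfg["MAX_NUM_SEQS"],
    "--max-num-batched-tokens", cfg["MAX_NUM_BATCHED_TOKENS"],
    "--gpu-memory-utilization", cfg["GPU_MEMORY_UTILIZATION"],
]
if cfg.get("ENFORCE_EAGER") == "true":
    cmd.append("--enforce-eager")
print(" ".join(cmd))
```

In production the same parsing is what `docker run --env-file configs/production.env` does implicitly; the sketch just makes the key-to-flag handoff explicit.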
docs/kimi-k2.6-250-toks-achieved-2026-04-21.md ADDED
@@ -0,0 +1,94 @@
# Kimi K2.6 DFlash: Scaling Throughput on 8x MI300X

Date: 2026-04-21
Node: ENC1-CLS01-SVR07

## Results — Linear scaling confirmed

| Concurrency | max_num_seqs | Output tok/s | Mean latency (s) | tok/s per slot |
|---:|---:|---:|---:|---:|
| 8 | 16 | 127.06 | 30.97 | 15.88 |
| 12 | 16 | 192.75 | 30.71 | 16.06 |
| 16 | 16 | 250.83 | 30.79 | 15.68 |
| 16 | 24 | 250.07 | 30.54 | 15.63 |
| 20 | 24 | 323.26 | 30.18 | 16.16 |
| 24 | 24 | 378.98 | 29.99 | 15.79 |
| 24 | 32 | 360.12 | 31.42 | 15.00 |
| 32 | 32 | **507.61** | **30.72** | **15.86** |

Scaling is linear at ~15.8 tok/s per concurrent slot, and mean latency stays flat at ~30 s regardless of concurrency. Previous best: 108.05 tok/s. Current best: **507.61 tok/s (+370%)**.

Key: the AITER 384-expert crash only triggers at `max_num_batched_tokens > 32768`; at bt=32768, max_num_seqs can go to 32+ without issue. The KV cache holds 1.2M tokens — with 512-token generations, that is headroom for 2000+ concurrent sequences.

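The per-slot and improvement figures above are easy to sanity-check; a minimal sketch using only numbers taken from the table:

```python
# (concurrency, output tok/s) pairs from the results table above.
runs = [
    (8, 127.06), (12, 192.75), (16, 250.83),
    (20, 323.26), (24, 378.98), (32, 507.61),
]

# Per-slot throughput is flat across a 4x range of concurrency.
per_slot = [round(tps / c, 2) for c, tps in runs]
print(per_slot)  # [15.88, 16.06, 15.68, 16.16, 15.79, 15.86]

# Improvement over the previous best of 108.05 tok/s.
prev_best, best = 108.05, 507.61
print(round((best - prev_best) / prev_best * 100))  # 370
```

Flat per-slot throughput is the signature of a decode-bound workload with spare batch capacity: adding slots adds throughput almost for free until the scheduler or KV cache saturates.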
## Three optimizations that got us here

### 1. NUMA balancing disabled

`echo 0 > /proc/sys/kernel/numa_balancing`

AMD documents this as required for MI300X inference; it was still enabled on this node.

### 2. DFlash num_speculative_tokens reduced from 8 to 2

The K2.5 drafter has poor acceptance on K2.6:

- st=8: 16% average acceptance, positions 4-7 essentially zero → net negative, slower than autoregressive
- st=2: 45-60% average acceptance, both positions contribute → net positive, 1.5x over autoregressive

The AMD ROCm docs explicitly recommend num_speculative_tokens <= 2 for mismatched drafters.

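The st tradeoff follows from standard speculative-decoding accounting: a drafted token at position k only counts if positions 0..k-1 were all accepted, so low per-position acceptance makes long drafts nearly worthless. A rough sketch under an i.i.d. per-position acceptance assumption (a simplification — the measured rates here decline sharply by position):

```python
def expected_tokens_per_step(p: float, n: int) -> float:
    """Expected tokens emitted per target forward pass with n drafted
    tokens and i.i.d. per-position acceptance probability p.

    Position k survives only if all k earlier positions were accepted
    (probability p**k); the k=0 term is the one token the target model
    always produces itself.
    """
    return sum(p ** k for k in range(n + 1))  # 1 + p + p^2 + ... + p^n

# st=8 at ~16% acceptance vs st=2 at ~50% acceptance:
print(round(expected_tokens_per_step(0.16, 8), 2))  # 1.19
print(round(expected_tokens_per_step(0.50, 2), 2))  # 1.75
```

At st=8 with 16% acceptance you pay eight drafter positions for ~0.19 extra tokens per step, which is why it nets out slower than autoregressive once draft cost is counted; st=2 at ~50% yields ~1.75 tokens per step, consistent with the observed ~1.5x after overhead.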
### 3. max_num_seqs increased from 8 to 16

Throughput scales linearly with concurrent decode slots:

- seqs=8: 127 tok/s at c=8
- seqs=12: 193 tok/s at c=12
- seqs=16: 251 tok/s at c=16

Each slot delivers ~15.7 tok/s with DFlash st=2.

52
+ ## Winning config (507 tok/s)
53
+
54
+ ```
55
+ runtime: vLLM ROCm nightly v0.19.2rc1.dev21
56
+ mode: eager (--enforce-eager)
57
+ target MLA backend: TRITON_MLA (via patch_dflash_rocm.py)
58
+ draft: z-lab/Kimi-K2.5-DFlash
59
+ spec method: dflash
60
+ num_speculative_tokens: 2
61
+ block_size: 16
62
+ max_model_len: 262144
63
+ max_num_seqs: 32
64
+ max_num_batched_tokens: 32768
65
+ gpu_memory_utilization: 0.90
66
+ moe_backend: aiter
67
+ prefix_caching: disabled
68
+ chunked_prefill: enabled
69
+ NUMA balancing: disabled
70
+ ```
71
+
72
+ ## What didn't work
73
+
74
+ | Attempt | Result |
75
+ |---|---|
76
+ | FP8 KV cache | Crashes: AITER requires power-of-2 experts, K2.6 has 384 |
77
+ | TurboQuant | Same AITER 384-expert constraint |
78
+ | max_num_batched_tokens > 32768 | Same AITER crash |
79
+ | DFlash st=8 | 16% acceptance → net negative |
80
+ | Compiled mode (cudagraph=none) | Works but no throughput gain over eager |
81
+
82
+ ## Path to 1000+ tok/s
83
+
84
+ 1. **Train K2.6-specific DFlash drafter** (SpecForge): 60-80% acceptance → ~25 tok/s per slot → 800 tok/s at c=32
85
+ 2. **Push seqs to 48-64**: linear scaling continues → 750-1000 tok/s with current drafter
86
+ 3. **AITER power-of-2 fix** lands upstream → unlock FP8 KV → 2x KV capacity → seqs=64+
87
+ 4. **DDTree** (arXiv 2604.12989): +35% on top of matched drafter
88
+ 5. **EAGLE-3 head** for K2.6: 70-80% acceptance without separate draft model
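The throughput targets above are per-slot rate times slot count, assuming the linear scaling observed so far continues to hold. The 15.86 rate is measured; 25 tok/s per slot for a matched drafter is an assumption:

```python
# Projection arithmetic behind the paths above (per-slot rate x slots).
scenarios = [
    ("current drafter, c=32", 15.86, 32),   # measured
    ("matched drafter, c=32", 25.0, 32),    # assumed per-slot rate
    ("matched drafter, c=48", 25.0, 48),    # if scaling holds at c=48
]
for name, per_slot, slots in scenarios:
    print(f"{name}: ~{per_slot * slots:.0f} tok/s")
```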
89
+
90
+ ## Result files
91
+
92
+ - `results/CLEAN-dflash-st2-s16-c8.json`
93
+ - `results/CLEAN-dflash-st2-s16-c12.json`
94
+ - `results/CLEAN-dflash-st2-s16-c16.json`
docs/kimi-k2.6-acceptance-rate-analysis-2026-04-21.md ADDED
@@ -0,0 +1,89 @@
1
+ # DFlash Acceptance Rate Analysis — Kimi K2.6 on 8x MI300X
2
+
3
+ Date: 2026-04-21
4
+
5
+ ## Problem
6
+
7
+ DFlash speculative decoding with the K2.5 drafter (`z-lab/Kimi-K2.5-DFlash`) on K2.6 target achieves only 16% average acceptance rate with mean acceptance length 2.3-2.5 out of 8 speculative tokens. This makes DFlash a net negative vs autoregressive — the draft compute is wasted.
8
+
9
+ ## Root Cause
10
+
11
+ **Model version mismatch.** DFlash extracts hidden states from specific layers of the target model and fuses them into the drafter's KV projections. When the target model changes (K2.5 → K2.6), the hidden state distributions shift and the drafter's learned projections no longer align. The K2.5-DFlash drafter was trained at 4096 context for K2.5, not K2.6.
12
+
13
+ Per-position acceptance rates observed:
14
+ - Position 0: 60-70%
15
+ - Position 1: 30-45%
16
+ - Position 2: 15-25%
17
+ - Position 3: 8-17%
18
+ - Position 4-7: <5%
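Because acceptance is sequential (a position-k token only survives if positions 0..k-1 were all accepted), the tail positions contribute almost nothing. A toy model using the midpoints of the ranges above makes this concrete:

```python
# Expected accepted tokens per draft step under sequential acceptance.
rates = [0.65, 0.375, 0.20, 0.125, 0.05, 0.05, 0.05, 0.05]  # range midpoints

def expected_accepted(num_spec_tokens: int) -> float:
    total, survive = 0.0, 1.0
    for r in rates[:num_spec_tokens]:
        survive *= r       # probability all positions so far were accepted
        total += survive   # expected contribution of this position
    return total

print(f"st=2: {expected_accepted(2):.2f}")  # ~0.89 accepted tokens/step
print(f"st=8: {expected_accepted(8):.2f}")  # ~0.95: 4x the draft compute for ~0.06 more
```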
19
+
20
+ ## Immediate Fix: Reduce `num_speculative_tokens` to 2-3
21
+
22
+ AMD ROCm docs explicitly warn: "more `num_speculative_tokens` causes less acceptance rate... set `num_speculative_tokens` to <= 2."
23
+
24
+ With st=2-3, average acceptance should reach 35-45%, since only the highest-yield positions are used (60-70% at position 0, 30-45% at position 1). This should make DFlash net-positive.
25
+
26
+ **Implementation:** Change `KIMI26_DFLASH_SPECULATIVE_TOKENS=2` in env.
27
+
28
+ ## Real Fix: Train a K2.6-Specific DFlash Drafter
29
+
30
+ ### SpecForge Training Pipeline
31
+
32
+ SpecForge (from the SGLang project) is the training framework for DFlash drafters.
33
+
34
+ Steps:
35
+ 1. Prepare seed dataset (175K+ examples — `mlabonne/open-perfectblend` or domain data)
36
+ 2. **Regenerate all responses using K2.6 as target** (critical — avoids distribution mismatch)
37
+ 3. Train 5-layer DFlash drafter: block_size=16, lr=6e-4, max_seq_len=3072, 6 epochs
38
+ 4. Embeddings and LM head are shared with the target model (only draft decoder layers are trained)
39
+
40
+ References:
41
+ - `github.com/sgl-project/SpecForge`
42
+ - SpecForge DFlash RFC: `github.com/sgl-project/SpecForge/issues/412`
43
+ - SpecForge DFlash training issue: `github.com/sgl-project/SpecForge/issues/465`
44
+
45
+ **Expected result:** 60-80% acceptance at block_size=8-16 (matching z-lab's benchmarked 3.7-5.5 acceptance length with matched drafters).
46
+
47
+ ### Data Generation
48
+
49
+ A data generator was previously started on this node (PID 509640) but was killed before producing any output. The script `generate_dflash_data.py` was configured for 20,000 samples at c=16 with a 70% thinking ratio. It needs to be restarted against the K2.6 baseline.
50
+
51
+ ## Alternative: EAGLE-3 (No Separate Draft Model)
52
+
53
+ EAGLE-3 adds a lightweight draft head directly to the target model using tri-layer feature fusion (early/middle/late layers). No separate draft model needed.
54
+
55
+ - 70-80% acceptance rate (training-time test achieves nearly flat acceptance across positions)
56
+ - 4.1-6.5x speedup at temperature 0
57
+ - Lighter to train than a full DFlash drafter
58
+ - vLLM natively supports Eagle-3
59
+ - vLLM PR #39616 (merged Apr 20) enables AITER MLA + Eagle3 on ROCm
60
+ - Known constraint: only values where `num_speculative_tokens + 1` is a power of two work (st = 1, 3, 7, 15)
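Under that constraint the usable values can be enumerated directly:

```python
# st is valid when st + 1 is a power of two; (n & (n - 1)) == 0 tests
# power-of-two-ness, here applied to st + 1 (i.e. (st + 1) & st == 0).
valid = [st for st in range(1, 16) if ((st + 1) & st) == 0]
print(valid)  # -> [1, 3, 7, 15]
```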
61
+
62
+ **Blocker:** No EAGLE-3 head exists for K2.6. Would need to train one.
63
+
64
+ ## Novel: DDTree (April 2026)
65
+
66
+ DDTree (arXiv 2604.12989, April 14 2026) constructs a draft tree from the per-position distributions of a single DFlash forward pass, exploring multiple continuations via a best-first heap search.
67
+
68
+ - 35-37% relative improvement over vanilla DFlash
69
+ - Requires only one drafter forward pass
70
+ - Not yet integrated into vLLM (brand new, 1 week old)
71
+
72
+ ## Comparison of Paths
73
+
74
+ | Path | Acceptance | Speedup vs Autoreg | Effort | Ready? |
75
+ |---|---|---|---|---|
76
+ | DFlash K2.5→K2.6 st=8 | 16% | 0.7-0.9x (worse) | Done | Yes but harmful |
77
+ | DFlash K2.5→K2.6 st=2 | 35-45% | 1.2-1.5x | Config change | Test now |
78
+ | DFlash K2.6 matched st=8 | 60-80% | 3-5x | Days of training | No |
79
+ | EAGLE-3 K2.6 head | 70-80% | 4-6x | Hours-days | No |
80
+ | DDTree + matched DFlash | 75-90% | 5-8x | Weeks | No |
81
+ | Autoreg + NUMA + high seqs | N/A | 1.5-2x | Config change | Testing now |
82
+
83
+ ## Recommended Execution Order
84
+
85
+ 1. **Now:** Test DFlash st=2 and autoreg + high concurrency (both running)
86
+ 2. **Today:** Restart DFlash training data generator against K2.6 baseline
87
+ 3. **This week:** Train K2.6 DFlash drafter with SpecForge
88
+ 4. **Next week:** Evaluate EAGLE-3 head training for K2.6
89
+ 5. **When ready:** Implement DDTree for additional 35% on top of matched drafter
docs/kimi-k2.6-dflash-execution-playbook-2026-04-21.md ADDED
@@ -0,0 +1,428 @@
1
+ # Kimi K2.6 DFlash Execution Playbook
2
+
3
+ Date: 2026-04-21
4
+ Node: ENC1-CLS01-SVR07
5
+ SSH: `ssh -p 22007 hotaisle@ssh.hotaisle.cloud`
6
+ Hardware: 8x AMD Instinct MI300X (gfx942, 192 GB HBM each), ~2 TiB RAM
7
+ Runtime root (remote): `/home/hotaisle/hydra/amd8x-runtime`
8
+ Model root: `/mnt/nvme5n1p1/hydra/models/Kimi-K2.6` (~555 GB, 64 shards)
9
+ Draft model: `/mnt/nvme5n1p1/hydra/models/Kimi-K2.5-DFlash` (~6.5 GB)
10
+
11
+ ## Current best-known serving profile
12
+
13
+ | Parameter | Value |
14
+ |---|---|
15
+ | Runtime | vLLM ROCm nightly (v0.19.2rc1.dev21) |
16
+ | Mode | eager (`--enforce-eager`) |
17
+ | Target MLA backend | TRITON_MLA (via `patch_dflash_rocm.py`) |
18
+ | Draft model | z-lab/Kimi-K2.5-DFlash |
19
+ | Speculative method | dflash |
20
+ | num_speculative_tokens | 8 |
21
+ | block_size | 16 |
22
+ | max_model_len | 262144 |
23
+ | max_num_seqs | 8 |
24
+ | max_num_batched_tokens | 32768 |
25
+ | gpu_memory_utilization | 0.82 |
26
+ | MoE backend | aiter (stock configs) |
27
+ | Prefix caching | disabled |
28
+ | Chunked prefill | enabled |
29
+ | Optimization level | 2 |
30
+ | enforce_eager | true |
31
+
32
+ **Best measured throughput (warmed server, 2026-04-21):**
33
+
34
+ | Concurrency | Max tokens | Output tok/s | Mean latency (s) | P95 latency (s) |
35
+ |---:|---:|---:|---:|---:|
36
+ | 1 | 512 | 21.49 | 23.83 | 26.15 |
37
+ | 4 | 512 | 77.36 | 25.58 | 28.09 |
38
+ | 8 | 512 | 152.26 | 25.00 | 26.96 |
39
+ | 4 | 1024 | 77.00 | 51.46 | 55.65 |
40
+ | 8 | 1024 | 147.51 | 50.59 | 56.36 |
41
+
42
+ Multi-turn (4 sessions × 3 turns, 512 max_tokens): 77.7 tok/s aggregate, 21.4 tok/s per session.
43
+
44
+ These are from the pre-rsync session. The verified post-fix eager result is **108.05 tok/s at c=8** (from 2026-04-20), and compiled-nocg gives **105.67 tok/s** (no improvement). Eager mode remains the default.
45
+
46
+ ## Quick start
47
+
48
+ ### 1. SSH to the node
49
+
50
+ ```bash
51
+ ssh -p 22007 hotaisle@ssh.hotaisle.cloud
52
+ cd /home/hotaisle/hydra/amd8x-runtime
53
+ ```
54
+
55
+ ### 2. Launch the DFlash server (runtime-patched, current default)
56
+
57
+ ```bash
58
+ ./launchers/kimi26-vllm-dflash.sh
59
+ ```
60
+
61
+ This will:
62
+ - Pull the nightly ROCm vLLM image if not cached
63
+ - Apply `patch_dflash_rocm.py` at container startup
64
+ - Start the server on port 8262
65
+ - Wait up to 30 minutes for readiness
66
+ - Run a benchmark sweep at c=1,4,8 for t=512,1024
67
+
68
+ To skip the benchmark:
69
+
70
+ ```bash
71
+ KIMI26_SKIP_BENCHMARK=1 ./launchers/kimi26-vllm-dflash.sh
72
+ ```
73
+
74
+ ### 3. Launch with source-built image (patches baked in)
75
+
76
+ Build the image first (on the remote node):
77
+
78
+ ```bash
79
+ ./build-kimi26-dflash.sh
80
+ ```
81
+
82
+ Then launch with the custom image:
83
+
84
+ ```bash
85
+ KIMI26_IMAGE=kimi26-dflash:latest ./launchers/kimi26-vllm-dflash.sh
86
+ ```
87
+
88
+ The launcher detects that the patches are already applied (idempotent check in `patch_dflash_rocm.py`) and skips them.
89
+
90
+ ### 4. Verify the server is up
91
+
92
+ ```bash
93
+ curl -s http://127.0.0.1:8262/v1/models | python3 -m json.tool
94
+ ```
95
+
96
+ ### 5. Send a test request
97
+
98
+ ```bash
99
+ curl -s http://127.0.0.1:8262/v1/chat/completions \
100
+ -H "Content-Type: application/json" \
101
+ -d '{"model":"kimi-k2.6-amd-dflash","messages":[{"role":"user","content":"Hello"}],"max_tokens":64,"temperature":0}'
102
+ ```
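The same request from Python, using only the stdlib (the `urlopen` call assumes the server from step 2 is listening on port 8262; it is left commented so the snippet is safe to run offline):

```python
import json
import urllib.request

payload = {
    "model": "kimi-k2.6-amd-dflash",
    "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 64,
    "temperature": 0,
}
req = urllib.request.Request(
    "http://127.0.0.1:8262/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
# with urllib.request.urlopen(req, timeout=120) as resp:
#     print(json.load(resp)["choices"][0]["message"]["content"])
print(req.full_url)
```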
103
+
104
+ ## Available launchers
105
+
106
+ All launchers are in the `launchers/` directory and source `remote-lib.sh` for shared config.
107
+
108
+ | Launcher | Purpose | Port | Notes |
109
+ |---|---|---:|---|
110
+ | `kimi26-vllm-baseline.sh` | Autoregressive baseline, no DFlash | 8260 | block-size 1, stock MLA |
111
+ | `kimi26-vllm-ep.sh` | Expert-parallel variant | 8261 | Produced invalid output on ROCm; do not use for benchmarks |
112
+ | `kimi26-vllm-dflash.sh` | DFlash speculative decoding | 8262 | Applies ROCm patches, uses block-size 16, TRITON_MLA |
113
+ | `kimi26-vllm-dflash-sweep.sh` | Parameter sweep over spec tokens and scheduler configs | 8262 | Restarts the server for each sweep point |
114
+ | `kimi26-vllm-dflash-compile-diag.sh` | Compiled-mode diagnostic | 8263 | Enables DEBUG logging, TORCH_COMPILE_DEBUG |
115
+
116
+ All kimi26 launchers read their config from `runtime.env`. Override any variable via environment, e.g.:
117
+
118
+ ```bash
119
+ KIMI26_DFLASH_SPECULATIVE_TOKENS=12 ./launchers/kimi26-vllm-dflash.sh
120
+ ```
121
+
122
+ ## Parameter sweep
123
+
124
+ ### Running the sweep
125
+
126
+ ```bash
127
+ ./launchers/kimi26-vllm-dflash-sweep.sh
128
+ ```
129
+
130
+ Default sweep matrix:
131
+ - `SPEC_TOKENS_LIST`: 2 4 8 12
132
+ - `SCHEDULER_CONFIGS`: 8,32768 8,24576 6,32768
133
+
134
+ Each combination launches a fresh server, waits for readiness (up to 30 min), runs benchmarks at c=4 and c=8 with t=512, then tears down.
135
+
136
+ ### Expected runtime
137
+
138
+ Each sweep point takes approximately:
139
+ - 5-8 minutes for model loading (cached compile)
140
+ - 2-4 minutes for benchmark execution
141
+ - ~10 minutes per point, ~2 hours for the full default matrix (12 points)
142
+
143
+ ### Interpreting results
144
+
145
+ Results are written to `results/kimi26-dflash-sweep-st{N}-s{S}-bt{B}-t512-c{C}.json`. Key fields:
146
+
147
+ - `output_tokens_per_second`: aggregate throughput
148
+ - `mean_latency_seconds`: mean time to full completion
149
+ - `p95_latency_seconds`: tail latency
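A small helper to collect those fields across a sweep run (this assumes each result file is a flat JSON object carrying exactly the three keys listed above):

```python
import glob
import json

def summarize(pattern: str = "results/kimi26-dflash-sweep-*.json"):
    """Return (filename, tok/s, mean latency, p95 latency), best throughput first."""
    rows = []
    for path in sorted(glob.glob(pattern)):
        with open(path) as f:
            r = json.load(f)
        rows.append((path, r["output_tokens_per_second"],
                     r["mean_latency_seconds"], r["p95_latency_seconds"]))
    return sorted(rows, key=lambda row: -row[1])

for path, tps, mean_s, p95_s in summarize():
    print(f"{path}: {tps:.1f} tok/s, mean {mean_s:.1f}s, p95 {p95_s:.1f}s")
```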
150
+
151
+ ### Measured sweep results (2026-04-21, eager mode)
152
+
153
+ | spec_tokens | c=4 tok/s | c=4 mean lat | c=8 tok/s | c=8 mean lat |
154
+ |---:|---:|---:|---:|---:|
155
+ | 2 | 64.2 | 31.1s | 124.3 | 30.9s |
156
+ | 4 | 69.6 | 28.3s | 136.7 | 28.8s |
157
+ | **8** | **67.0** | **28.6s** | **140.5** | **27.4s** |
158
+ | 12 | 67.1 | 29.3s | 142.5 | 28.1s |
159
+
160
+ spec_tokens=8 is the sweet spot; the curve flattens between 8 and 12 because the K2.5 drafter's low acceptance rate (~15-23%) means the extra speculative positions are almost never accepted, so wider speculation only adds draft compute.
161
+
162
+ ## Compile-mode diagnostic
163
+
164
+ ### Running the diagnostic
165
+
166
+ ```bash
167
+ ./launchers/kimi26-vllm-dflash-compile-diag.sh
168
+ ```
169
+
170
+ This launches on port 8263 (default: KIMI26_DFLASH_PORT + 1) with:
171
+ - `VLLM_LOGGING_LEVEL=DEBUG`
172
+ - `TORCH_COMPILE_DEBUG=1`
173
+ - No `--enforce-eager` (allows compile + cudagraph attempts)
174
+
175
+ ### What to look for in the logs
176
+
177
+ ```bash
178
+ docker logs --tail 500 kimi26-vllm-dflash-compile-diag
179
+ ```
180
+
181
+ 1. **torch.compile phase**: should succeed. Look for `backbone: XXXs`, `eagle_head: XXs`.
182
+ 2. **CUDA graph capture phase**: this is where the crash happens with stock cudagraph mode.
183
+ - Error signature: `Memory access fault by GPU node-{3,4,6,7,9}` during piecewise cudagraph capture at ~5% (1/21 sizes).
184
+ - This is a HIP-level segfault in the Triton MLA kernel under graph capture.
185
+ 3. **Workaround**: use `--compilation-config '{"cudagraph_mode":"none"}'` to get `torch.compile` benefits without cudagraph capture. This is now the default via `KIMI26_ADDITIONAL_FLAGS` in `runtime.env`.
186
+
187
+ ### Compile mode results vs eager
188
+
189
+ | Mode | c=4 tok/s | c=8 tok/s | c=4 delta | c=8 delta |
190
+ |---|---:|---:|---:|---:|
191
+ | eager (--enforce-eager) | 67.0 | 140.5 | baseline | baseline |
192
+ | compiled (cudagraph=none) | 74.2 | 146.8 | +10.7% | +4.5% |
193
+
194
+ ## Pre-sharding the checkpoint
195
+
196
+ ### Why
197
+
198
+ The dominant startup cost is reading ~555 GB of int4 weights and sharding them across TP=8 at load time. Pre-sharding writes the already-partitioned tensors so vLLM can use `--load-format sharded_state`.
199
+
200
+ ### Running the pre-shard script
201
+
202
+ Inside a running vLLM container (or any environment with vLLM installed):
203
+
204
+ ```bash
205
+ python3 payload/preshard_kimi26.py \
206
+ --model /mnt/nvme5n1p1/hydra/models/Kimi-K2.6 \
207
+ --output /mnt/nvme5n1p1/hydra/models/Kimi-K2.6-sharded-tp8 \
208
+ --tp 8
209
+ ```
210
+
211
+ Expected time: 5-8 minutes for load + save.
212
+
213
+ ### Using the pre-sharded checkpoint
214
+
215
+ ```bash
216
+ KIMI26_MODEL_DIR=/mnt/nvme5n1p1/hydra/models/Kimi-K2.6-sharded-tp8 \
217
+ KIMI26_ADDITIONAL_FLAGS="--load-format sharded_state --compilation-config {\"cudagraph_mode\":\"none\"}" \
218
+ ./launchers/kimi26-vllm-dflash.sh
219
+ ```
220
+
221
+ ### Expected savings
222
+
223
+ Weight loading drops from ~280s to ~60-90s (estimate based on sharded_state behavior on similar model sizes). Total startup drops from ~5-8 minutes to ~2-3 minutes on cached compile.
224
+
225
+ **Note**: the pre-sharded checkpoint has not been run yet on this node. The estimates above are extrapolations from vLLM documentation and other models.
226
+
227
+ ## Multi-turn benchmark
228
+
229
+ ### Running
230
+
231
+ ```bash
232
+ .venv/bin/python payload/benchmark_multi_turn.py \
233
+ --base-url http://127.0.0.1:8262/v1 \
234
+ --model kimi-k2.6-amd-dflash \
235
+ --sessions 4 \
236
+ --turns 4 \
237
+ --max-tokens 512 \
238
+ --output-json results/kimi26-dflash-multiturn.json
239
+ ```
240
+
241
+ ### Interpreting results
242
+
243
+ The multi-turn benchmark reports:
244
+ - Per-turn latency and throughput
245
+ - Per-session total time
246
+ - Aggregate throughput across all concurrent sessions
247
+
248
+ This is more representative of production workloads than one-shot benchmarks because it exercises the KV cache across turns and tests scheduler behavior under sustained load.
249
+
250
+ ## Source-built image
251
+
252
+ ### When to use it
253
+
254
+ Use the source-built image (`kimi26-dflash:latest`) when:
255
+ - Deploying to production (eliminate runtime patching as a failure mode)
256
+ - Running sweeps where the server restarts many times (saves ~2s per restart)
257
+ - Distributing the image to other nodes
258
+
259
+ Use runtime patching (`vllm/vllm-openai-rocm:nightly` + `patch_dflash_rocm.py`) when:
260
+ - Iterating on patches (faster edit cycle)
261
+ - Testing against a new nightly (build and verify patches still apply)
262
+ - Debugging patch failures
263
+
264
+ ### Building
265
+
266
+ ```bash
267
+ ./build-kimi26-dflash.sh
268
+ ```
269
+
270
+ Output: `kimi26-dflash:latest` and `kimi26-dflash:YYYYMMDD`.
271
+
272
+ The build context is the `8x-runtime/` directory. It copies `payload/patch_dflash_rocm.py` into the image and runs it at build time. The base nightly image is ~25 GB; the patched image adds negligible size.
273
+
274
+ ### Verifying patches are baked in
275
+
276
+ ```bash
277
+ docker run --rm kimi26-dflash:latest python3 -c "
278
+ import importlib.util, sys
279
+ spec = importlib.util.find_spec('vllm.v1.attention.selector')
280
+ src = open(spec.origin).read()
281
+ assert 'AttentionBackendEnum.TRITON_MLA' in src, 'selector patch missing'
282
+ print('patches verified')
283
+ "
284
+ ```
285
+
286
+ ## Patch inventory
287
+
288
+ The DFlash patches (`payload/patch_dflash_rocm.py`) modify 9 files inside the vLLM/AITER installation. All patches are idempotent.
289
+
290
+ | # | File | What it does | Why needed |
291
+ |---|---|---|---|
292
+ | 1 | `vllm/v1/attention/backends/rocm_aiter_fa.py` | Adds `causal` field to metadata dataclass, threads `causal` through to flash_attn call, adds `supports_non_causal` classmethod | DFlash draft attention is non-causal; stock vLLM hardcodes `causal=True` |
293
+ | 2 | `vllm/v1/attention/backends/rocm_attn.py` | Adds `supports_non_causal` classmethod | Backend discovery needs to know which backends handle non-causal |
294
+ | 3 | `vllm/v1/attention/backends/rocm_aiter_unified_attn.py` | Adds `supports_non_causal` classmethod | Same as above |
295
+ | 4 | `vllm/v1/attention/backends/triton_attn.py` | Adds `supports_non_causal` classmethod | Same as above |
296
+ | 5 | `vllm/v1/attention/selector.py` | Forces `TRITON_MLA` backend for target model under DFlash; scopes `use_non_causal` to DFlash draft layers only | Without this, the target model uses FLASH_ATTN which requires block-size 1, conflicting with DFlash's block-size 16 requirement |
297
+ | 6 | `vllm/v1/attention/ops/triton_unified_attention.py` | Adds `IS_CAUSAL` kernel parameter, conditionalizes tile count and sequence mask | Triton MLA kernel hardcodes causal masking; DFlash draft needs bidirectional attention |
298
+ | 7 | `vllm/v1/spec_decode/dflash.py` | Relaxes causal assertion from `is False` to `in (True, False, None)` | Stock assertion rejects causal=True from the target model's metadata |
299
+ | 8 | `aiter/ops/triton/unified_attention.py` | Removes causal assertion, passes `IS_CAUSAL` to kernel | AITER wrapper hardcodes causal-only; DFlash needs runtime causal flag |
300
+ | 9 | `aiter/ops/triton/_triton_kernels/unified_attention.py` | Adds `IS_CAUSAL` constexpr parameter, conditionalizes tile and mask logic | Triton kernel needs to support both causal and non-causal paths |
301
+
302
+ ### Upstream tracking
303
+
304
+ - vLLM DFlash attention-selection fix: https://github.com/vllm-project/vllm/pull/39930
305
+ - vLLM speculative-decoding performance tracker: https://github.com/vllm-project/vllm/issues/28947
306
+ - Upstream DFlash repo: https://github.com/z-lab/dflash (commit `1fe684b` staged locally)
307
+
308
+ When upstream PR #39930 merges, patches 1-5 and 7 can likely be dropped. Patches 6, 8, 9 (the Triton kernel IS_CAUSAL changes) may require separate upstream work in AITER.
309
+
310
+ ## Known limits
311
+
312
+ 1. **No K2.6-specific drafter.** The public drafter (`z-lab/Kimi-K2.5-DFlash`) was trained for Kimi K2.5 with a 4096-token training context. Draft acceptance rate on K2.6 is 15-23%, which limits the speculative speedup. A K2.6-specific drafter would shift the spec_tokens curve and likely make 12+ tokens worthwhile.
313
+
314
+ 2. **CUDA graph capture crashes on ROCm.** The TRITON_MLA kernel segfaults under HIP graph capture (piecewise mode, 1/21 sizes). Workaround: `cudagraph_mode=none`. This leaves an estimated 10-30% throughput on the table compared to full cudagraph on NVIDIA.
315
+
316
+ 3. **Expert-parallel mode produces garbage output.** The `kimi26-vllm-ep.sh` launcher loads but generates `content: null` or short gibberish. This is a functional failure on ROCm for this model.
317
+
318
+ 4. **SGLang does not work for K2.6 on this node.** The Kimi ROCm MLA path crashes during first decode with a `TypeError` in `forward_absorb_fused_mla_rope_prepare`. SGLang loads weights faster (~120s vs ~280s) but cannot serve requests.
319
+
320
+ 5. **Nightly image tag is unpinned.** A nightly update can break the patches. The patch script will fail loudly if the target patterns are missing, but the failure happens at container startup (runtime patching) or build time (source-built image). Pin to a date tag when one becomes available.
321
+
322
+ 6. **Custom MoE config is batch-size-8 only.** The tuned MoE file contains only a batch-size-8 entry. It helps c=8 throughput (+35% on baseline) but hurts c=4 (-41%). The default launchers use stock MoE configs, which are balanced.
323
+
324
+ 7. **Pre-sharding has not been executed.** The `preshard_kimi26.py` script exists but has not been run. Startup time savings are estimated, not measured.
325
+
326
+ ## Decision tree: what to try next
327
+
328
+ ```
329
+ Start here: Is the server producing correct output?
330
+ |
331
+ +-- No --> Check docker logs. Common failures:
332
+ | - Patch script error: nightly image changed, update patch_dflash_rocm.py
333
+ | - OOM during model load: reduce gpu_memory_utilization (try 0.80)
334
+ | - CUDA graph crash: ensure --compilation-config '{"cudagraph_mode":"none"}'
335
+ | or --enforce-eager is set
336
+ |
337
+ +-- Yes --> Is throughput below 140 tok/s at c=8?
338
+ |
339
+ +-- Yes --> Check:
340
+ | 1. Is the server warmed? First request pays cold-shape tax.
341
+ | Run a throwaway request before benchmarking.
342
+ | 2. Is compiled mode enabled? Eager is ~5-10% slower.
343
+ | Check for --enforce-eager in the command.
344
+ | 3. Are scheduler params set? Need max_num_seqs=8,
345
+ | max_num_batched_tokens=32768 for c=8 workloads.
346
+ | 4. Is prefix caching off? Prefix cache inflates numbers
347
+ | on repeated prompts. Use --no-enable-prefix-caching
348
+ | for truth measurements.
349
+ |
350
+ +-- No --> Throughput is at ceiling for this drafter.
351
+ Next steps (in priority order):
352
+ 1. Find/train a K2.6-specific DFlash drafter
353
+ 2. Fix cudagraph capture on ROCm (upstream AITER/Triton bug)
354
+ 3. Pre-shard checkpoint to reduce restart time
355
+ 4. Finish MoE autotuning for batch sizes 1-16
356
+ ```
357
+
358
+ ## Measured baseline reference
359
+
360
+ All results below were measured on this node (ENC1-CLS01-SVR07), no prefix cache, warmed server, prompt set `prompts_kimi26_complex.json`.
361
+
362
+ ### Autoregressive baseline (no DFlash, no speculative decoding)
363
+
364
+ | Config | c | t | tok/s | Mean lat |
365
+ |---|---:|---:|---:|---:|
366
+ | stock MoE | 4 | 512 | 70.80 | 28.93s |
367
+ | stock MoE | 8 | 512 | 90.37 | 31.04s |
368
+ | stock MoE | 4 | 1024 | 69.26 | 59.14s |
369
+ | stock MoE | 8 | 1024 | 107.53 | 61.59s |
370
+ | tuned batch-8 MoE | 4 | 512 | 41.86 | — |
371
+ | tuned batch-8 MoE | 8 | 512 | 122.40 | — |
372
+
373
+ ### DFlash eager mode (spec_tokens=8, block-size 16, TRITON_MLA)
374
+
375
+ | Scheduler | MoE | c | t | tok/s | Mean lat |
376
+ |---|---|---:|---:|---:|---:|
377
+ | seqs=4, bt=16384 | stock | 4 | 128 | 71.57 | 6.40s |
378
+ | seqs=4, bt=16384 | stock | 8 | 128 | 87.73 | 8.47s |
379
+ | seqs=4, bt=16384 | stock | 4 | 512 | 73.03 | 25.76s |
380
+ | seqs=4, bt=16384 | stock | 8 | 512 | 76.37 | 37.88s |
381
+ | seqs=8, bt=32768 | stock | 4 | 512 | 71.55 | 26.13s |
382
+ | seqs=8, bt=32768 | stock | 8 | 512 | 108.05 | 34.37s |
383
+ | seqs=8, bt=32768 | tuned batch-8 | 4 | 512 | 69.06 | 27.82s |
384
+ | seqs=8, bt=32768 | tuned batch-8 | 8 | 512 | 108.87 | 33.72s |
385
+
386
+ ### DFlash spec_tokens sweep (eager, seqs=8, bt=32768, stock MoE)
387
+
388
+ | spec_tokens | c=4 tok/s | c=4 lat | c=8 tok/s | c=8 lat |
389
+ |---:|---:|---:|---:|---:|
390
+ | 2 | 64.2 | 31.1s | 124.3 | 30.9s |
391
+ | 4 | 69.6 | 28.3s | 136.7 | 28.8s |
392
+ | 8 | 67.0 | 28.6s | 140.5 | 27.4s |
393
+ | 12 | 67.1 | 29.3s | 142.5 | 28.1s |
394
+
395
+ ### DFlash compiled mode (cudagraph=none, spec_tokens=8)
396
+
397
+ | Mode | c=4 tok/s | c=8 tok/s |
398
+ |---|---:|---:|
399
+ | eager | 67.0 | 140.5 |
400
+ | compiled (cudagraph=none) | 74.2 | 146.8 |
401
+
402
+ ### DFlash runtime observations
403
+
404
+ - Engine peak generation throughput: ~149.9 tok/s
405
+ - DFlash mean acceptance length: 2.26-2.83
406
+ - Draft acceptance rate: 15.7%-22.9%
407
+
408
+ ### Startup timings
409
+
410
+ | Phase | First run | Cached compile |
411
+ |---|---:|---:|
412
+ | Weight loading | 284.64s | 279.40s |
413
+ | Model loading | 295.66s | 289.77s |
414
+ | torch.compile | 38.49s | 12.86s |
415
+ | Engine init | 128.62s | 101.28s |
416
+ | KV cache tokens | 1,314,310 | 1,316,727 |
417
+ | Server ready wall time | ~8m17s | ~6m30s |
418
+
419
+ ### Result files on remote
420
+
421
+ ```
422
+ results/kimi26-vllm-dflash-eager-smoke-t128-c1.json
423
+ results/kimi26-vllm-dflash-eager-t512-c8.json
424
+ results/kimi26-vllm-dflash-eager-t512-c8-seqs8-bt32768.json
425
+ results/kimi26-vllm-dflash-eager-t512-c8-seqs8-bt32768-tunedmoe.json
426
+ results/sweep-spec{2,4,8,12}-t512-c{4,8}.json
427
+ results/compiled-nocg-t512-c{4,8}.json
428
+ ```
launchers/kimi26-vllm-dflash-sweep.sh ADDED
@@ -0,0 +1,120 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
5
+ # shellcheck disable=SC1091
6
+ source "$SCRIPT_DIR/../remote-lib.sh"
7
+
8
+ SPEC_TOKENS_LIST="${SPEC_TOKENS_LIST:-2 4 8 12}"
9
+ SCHEDULER_CONFIGS="${SCHEDULER_CONFIGS:-8,32768 8,24576 6,32768}"
10
+ CONTAINER_NAME=kimi26-vllm-dflash-sweep
11
+
12
+ DOCKER_ARGS=(
13
+ -d
14
+ --name "$CONTAINER_NAME"
15
+ --network host
16
+ --device=/dev/kfd
17
+ --device=/dev/dri
18
+ --security-opt seccomp=unconfined
19
+ --group-add video
20
+ --ipc=host
21
+ -e PYTORCH_ROCM_ARCH=gfx942
22
+ -e AITER_ROCM_ARCH=gfx942
23
+ -e GPU_ARCHS=gfx942
24
+ -e VLLM_ROCM_USE_AITER=1
25
+ -e VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
26
+ -e VLLM_ROCM_USE_AITER_RMSNORM=0
27
+ -e HSA_ENABLE_SDMA=0
28
+ -e HSA_NO_SCRATCH_RECLAIM=1
29
+ -e OMP_NUM_THREADS=1
30
+ -e HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
31
+ -v "$REMOTE_MODEL_DIR:$REMOTE_MODEL_DIR"
32
+ -v "$REMOTE_PAYLOAD_DIR:$REMOTE_PAYLOAD_DIR:ro"
33
+ -v "$REMOTE_VLLM_CACHE_DIR:/root/.cache/vllm"
34
+ -v "$REMOTE_HF_CACHE_DIR:/root/.cache/huggingface"
35
+ )
36
+ if [[ "$KIMI26_USE_TUNED_MOE_CONFIGS" == "1" ]] && [[ -d "$REMOTE_TUNED_CONFIG_DIR" ]]; then
37
+ DOCKER_ARGS+=(
38
+ -e VLLM_TUNED_CONFIG_FOLDER=/tuned_configs
39
+ -v "$REMOTE_TUNED_CONFIG_DIR:/tuned_configs"
40
+ )
41
+ fi
42
+
43
+ build_vllm_cmd() {
44
+ local spec_tokens="$1"
45
+ local max_num_seqs="$2"
46
+ local max_num_batched_tokens="$3"
47
+
48
+ local spec_config
49
+ spec_config="$(printf '{"method":"dflash","model":"%s","num_speculative_tokens":%s}' \
50
+ "$KIMI26_DFLASH_DRAFT_MODEL_DIR" \
51
+ "$spec_tokens")"
52
+
53
+ local cmd="python3 '$REMOTE_PAYLOAD_DIR/patch_dflash_rocm.py'"
54
+ cmd+=" && python3 -m vllm.entrypoints.openai.api_server"
55
+ cmd+=" --model '$KIMI26_MODEL_DIR'"
56
+ cmd+=" --served-model-name kimi-k2.6-amd-dflash"
57
+ cmd+=" --host 0.0.0.0"
58
+ cmd+=" --port '$KIMI26_DFLASH_PORT'"
59
+ cmd+=" --tensor-parallel-size '$KIMI26_TENSOR_PARALLEL_SIZE'"
60
+ cmd+=" --trust-remote-code"
61
+ cmd+=" --max-model-len '$KIMI26_MAX_MODEL_LEN'"
62
+ cmd+=" --gpu-memory-utilization '$KIMI26_DFLASH_GPU_MEMORY_UTILIZATION'"
63
+ cmd+=" --max-num-batched-tokens '$max_num_batched_tokens'"
64
+ cmd+=" --max-num-seqs '$max_num_seqs'"
65
+ cmd+=" --mm-encoder-tp-mode data"
66
+ cmd+=" --block-size '$KIMI26_DFLASH_BLOCK_SIZE'"
67
+ cmd+=" --tool-call-parser kimi_k2"
68
+ cmd+=" --reasoning-parser kimi_k2"
69
+ cmd+=" --enable-auto-tool-choice"
70
+ cmd+=" --moe-backend '$KIMI26_MOE_BACKEND'"
71
+ cmd+=" --optimization-level '$KIMI26_OPTIMIZATION_LEVEL'"
72
+ cmd+=" --performance-mode '$KIMI26_PERFORMANCE_MODE'"
73
+ cmd+=" --safetensors-load-strategy '$KIMI26_SAFETENSORS_LOAD_STRATEGY'"
74
+ cmd+=" --disable-uvicorn-access-log"
75
+ cmd+=" --no-enable-prefix-caching"
76
+ cmd+=" --enable-chunked-prefill"
77
+ cmd+=" --compilation-config '{\"cudagraph_mode\":\"none\"}'"
78
+ cmd+=" --speculative-config '$spec_config'"
79
+ printf '%s' "$cmd"
80
+ }
81
+
82
+ run_sweep_point() {
83
+ local spec_tokens="$1"
84
+ local max_num_seqs="$2"
85
+ local max_num_batched_tokens="$3"
86
+ local output_prefix="kimi26-dflash-sweep-st${spec_tokens}-s${max_num_seqs}-bt${max_num_batched_tokens}"
87
+
88
+ echo "--- sweep: spec_tokens=$spec_tokens seqs=$max_num_seqs batched=$max_num_batched_tokens ---"
89
+
90
+ docker_rm "$CONTAINER_NAME"
91
+
92
+ local cmd
93
+ cmd="$(build_vllm_cmd "$spec_tokens" "$max_num_seqs" "$max_num_batched_tokens")"
94
+
95
+ docker run "${DOCKER_ARGS[@]}" \
96
+ --entrypoint bash \
97
+ "$KIMI26_IMAGE" \
98
+ -lc "$cmd"
99
+
100
+ wait_ready "$KIMI26_DFLASH_PORT" 1800
101
+
102
+ BENCH_PROMPTS_JSON="$KIMI26_BENCH_PROMPTS_JSON" \
103
+ BENCH_CONCURRENCY_LIST="" \
104
+ BENCH_MAX_TOKENS_LIST="" \
105
+ BENCH_REQUESTS_PER_POINT="$KIMI26_BENCH_REQUESTS_PER_POINT" \
106
+ BENCH_TIMEOUT_SECONDS="$KIMI26_BENCH_TIMEOUT_SECONDS" \
107
+ BENCH_EXTRA_BODY_JSON="$KIMI26_BENCH_EXTRA_BODY_JSON" \
108
+ bench_sweep "$KIMI26_DFLASH_PORT" kimi-k2.6-amd-dflash "$output_prefix" "4,8" "512" "$KIMI26_BENCH_TIMEOUT_SECONDS"
109
+
110
+ docker_rm "$CONTAINER_NAME"
111
+
112
+ echo "--- done: results at $REMOTE_RESULTS_DIR/${output_prefix}-* ---"
113
+ }
114
+
115
+ for spec_tokens in $SPEC_TOKENS_LIST; do
116
+ for sched in $SCHEDULER_CONFIGS; do
117
+ IFS=',' read -r max_num_seqs max_num_batched_tokens <<<"$sched"
118
+ run_sweep_point "$spec_tokens" "$max_num_seqs" "$max_num_batched_tokens"
119
+ done
120
+ done
launchers/kimi26-vllm-dflash.sh ADDED
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
5
+ # shellcheck disable=SC1091
6
+ source "$SCRIPT_DIR/../remote-lib.sh"
7
+
8
+ spec_config="$(build_kimi26_dflash_spec_config)"
9
+
10
+ cmd="python3 '$REMOTE_PAYLOAD_DIR/patch_dflash_rocm.py'"
11
+ cmd+=" && python3 -m vllm.entrypoints.openai.api_server"
12
+ cmd+=" --model '$KIMI26_MODEL_DIR'"
13
+ cmd+=" --served-model-name kimi-k2.6-amd-dflash"
14
+ cmd+=" --host 0.0.0.0"
15
+ cmd+=" --port '$KIMI26_DFLASH_PORT'"
16
+ cmd+=" --tensor-parallel-size '$KIMI26_TENSOR_PARALLEL_SIZE'"
17
+ cmd+=" --trust-remote-code"
18
+ cmd+=" --max-model-len '$KIMI26_MAX_MODEL_LEN'"
19
+ cmd+=" --gpu-memory-utilization '$KIMI26_DFLASH_GPU_MEMORY_UTILIZATION'"
20
+ cmd+=" --max-num-batched-tokens '$KIMI26_DFLASH_MAX_NUM_BATCHED_TOKENS'"
21
+ cmd+=" --max-num-seqs '$KIMI26_DFLASH_MAX_NUM_SEQS'"
22
+ cmd+=" --mm-encoder-tp-mode data"
23
+ cmd+=" --block-size '$KIMI26_DFLASH_BLOCK_SIZE'"
24
+ cmd+=" --tool-call-parser kimi_k2"
25
+ cmd+=" --reasoning-parser kimi_k2"
26
+ cmd+=" --enable-auto-tool-choice"
27
+ cmd+=" --moe-backend '$KIMI26_MOE_BACKEND'"
28
+ cmd+=" --optimization-level '$KIMI26_OPTIMIZATION_LEVEL'"
29
+ cmd+=" --performance-mode '$KIMI26_PERFORMANCE_MODE'"
30
+ cmd+=" --safetensors-load-strategy '$KIMI26_SAFETENSORS_LOAD_STRATEGY'"
31
+ cmd+=" --disable-uvicorn-access-log"
32
+ cmd+=" --no-enable-prefix-caching"
33
+ cmd+=" --enable-chunked-prefill"
34
+ cmd+=" --enforce-eager"
35
+ cmd+=" --speculative-config '$spec_config'"
36
+
37
+ docker_rm kimi26-vllm-dflash
38
+ docker_args=(
39
+ -d
40
+ --name kimi26-vllm-dflash
41
+ --restart unless-stopped
42
+ --network host
43
+ --device=/dev/kfd
44
+ --device=/dev/dri
45
+ --security-opt seccomp=unconfined
46
+ --group-add video
47
+ --ipc=host
48
+ -e PYTORCH_ROCM_ARCH=gfx942
49
+ -e AITER_ROCM_ARCH=gfx942
50
+ -e GPU_ARCHS=gfx942
51
+ -e VLLM_ROCM_USE_AITER=1
52
+ -e VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
53
+ -e VLLM_ROCM_USE_AITER_RMSNORM=0
54
+ -e HSA_ENABLE_SDMA=0
55
+ -e HSA_NO_SCRATCH_RECLAIM=1
56
+ -e OMP_NUM_THREADS=1
57
+ -e HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
58
+ -v "$REMOTE_MODEL_DIR:$REMOTE_MODEL_DIR"
59
+ -v "$REMOTE_PAYLOAD_DIR:$REMOTE_PAYLOAD_DIR:ro"
60
+ -v "$REMOTE_VLLM_CACHE_DIR:/root/.cache/vllm"
61
+ -v "$REMOTE_HF_CACHE_DIR:/root/.cache/huggingface"
62
+ )
63
+ if [[ "$KIMI26_USE_TUNED_MOE_CONFIGS" == "1" ]] && [[ -d "$REMOTE_TUNED_CONFIG_DIR" ]]; then
64
+ docker_args+=(
65
+ -e VLLM_TUNED_CONFIG_FOLDER=/tuned_configs
66
+ -v "$REMOTE_TUNED_CONFIG_DIR:/tuned_configs"
67
+ )
68
+ fi
69
+
70
+ docker run "${docker_args[@]}" \
71
+ --entrypoint bash \
72
+ "$KIMI26_IMAGE" \
73
+ -lc "$cmd"
74
+
75
+ wait_ready "$KIMI26_DFLASH_PORT" 1800
76
+ if [[ "$KIMI26_SKIP_BENCHMARK" == "1" ]]; then
77
+ exit 0
78
+ fi
79
+
80
+ BENCH_PROMPTS_JSON="$KIMI26_BENCH_PROMPTS_JSON" \
81
+ BENCH_CONCURRENCY_LIST="$KIMI26_BENCH_CONCURRENCY_LIST" \
82
+ BENCH_MAX_TOKENS_LIST="$KIMI26_BENCH_MAX_TOKENS_LIST" \
83
+ BENCH_REQUESTS_PER_POINT="$KIMI26_BENCH_REQUESTS_PER_POINT" \
84
+ BENCH_TIMEOUT_SECONDS="$KIMI26_BENCH_TIMEOUT_SECONDS" \
85
+ BENCH_EXTRA_BODY_JSON="$KIMI26_BENCH_EXTRA_BODY_JSON" \
86
+ bench_sweep "$KIMI26_DFLASH_PORT" kimi-k2.6-amd-dflash kimi26-vllm-dflash "1,4,8" "512,1024" "$KIMI26_BENCH_TIMEOUT_SECONDS"
patches/patch_dflash_rocm.py ADDED
@@ -0,0 +1,380 @@
+ #!/usr/bin/env python3
+ """Patch ROCm DFlash support into installed vLLM and AITER packages.
+
+ The target host ships newer `vllm/v1` attention paths than the earlier
+ one-off patch script was written against. This script applies the same
+ logical fixes directly to the installed package files inside the runtime
+ container. It is intentionally idempotent.
+ """
+
+ from __future__ import annotations
+
+ import importlib.util
+ import re
+ import sys
+ from pathlib import Path
+
+
+ def locate_module_file(module_name: str) -> Path:
+     spec = importlib.util.find_spec(module_name)
+     if spec is None or spec.origin is None:
+         raise RuntimeError(f"Could not locate module: {module_name}")
+     return Path(spec.origin).resolve()
+
+
+ def first_existing(paths: list[Path]) -> Path:
+     for path in paths:
+         if path.exists():
+             return path
+     raise RuntimeError("Could not locate any expected path:\n" + "\n".join(map(str, paths)))
+
+
+ def replace_once(text: str, old: str, new: str, path: Path) -> str:
+     # A non-empty `new` already present means the file is patched. An empty
+     # `new` is a deletion and has no marker to check, so it must not
+     # short-circuit here (an empty string is a substring of everything).
+     if new and new in text:
+         return text
+     if old not in text:
+         raise RuntimeError(f"Pattern not found in {path}: {old[:120]!r}")
+     return text.replace(old, new, 1)
+
+
+ def replace_all_regex(
+     text: str,
+     pattern: str,
+     repl: str,
+     path: Path,
+     *,
+     min_count: int = 1,
+ ) -> str:
+     compiled = re.compile(pattern, re.MULTILINE)
+     matches = list(compiled.finditer(text))
+     # Strip backreferences from the replacement to get a literal marker that
+     # can be searched for when checking whether the patch is already applied.
+     marker = re.sub(r"\\(?:g<[^>]+>|[1-9][0-9]*)", "", repl)
+     if not matches:
+         if repl in text or (marker and marker in text):
+             return text
+         raise RuntimeError(f"Regex pattern not found in {path}: {pattern}")
+     if len(matches) < min_count:
+         updated = compiled.sub(repl, text)
+         if updated != text and marker and marker in updated:
+             return updated
+         raise RuntimeError(
+             f"Expected at least {min_count} matches in {path}, found {len(matches)}"
+         )
+     return compiled.sub(repl, text)
+
+
+ def patch_file(path: Path, transform) -> None:
+     original = path.read_text()
+     updated = transform(original, path)
+     if updated == original:
+         print(f"[skip] {path}")
+         return
+     path.write_text(updated)
+     print(f"[patch] {path}")
+
+
+ def patch_rocm_aiter_fa(text: str, path: Path) -> str:
+     text = replace_once(
+         text,
+         '    @staticmethod\n    def get_name() -> str:\n        return "FLASH_ATTN"\n\n    @staticmethod\n    def get_impl_cls() -> type["AiterFlashAttentionImpl"]:\n',
+         '    @staticmethod\n    def get_name() -> str:\n        return "FLASH_ATTN"\n\n    @classmethod\n    def supports_non_causal(cls) -> bool:\n        return True\n\n    @staticmethod\n    def get_impl_cls() -> type["AiterFlashAttentionImpl"]:\n',
+         path,
+     )
+     text = replace_once(
+         text,
+         "class AiterFlashAttentionMetadata:\n",
+         "class AiterFlashAttentionMetadata:\n    causal: bool\n",
+         path,
+     )
+     text = replace_once(
+         text,
+         "        attn_metadata = AiterFlashAttentionMetadata(\n            num_actual_tokens=common_attn_metadata.num_actual_tokens,\n",
+         "        attn_metadata = AiterFlashAttentionMetadata(\n            causal=common_attn_metadata.causal,\n            num_actual_tokens=common_attn_metadata.num_actual_tokens,\n",
+         path,
+     )
+     text = replace_once(
+         text,
+         "        return AiterFlashAttentionMetadata(\n            num_actual_tokens=num_tokens,\n",
+         "        return AiterFlashAttentionMetadata(\n            causal=common_attn_metadata.causal,\n            num_actual_tokens=num_tokens,\n",
+         path,
+     )
+     text = replace_all_regex(
+         text,
+         r"(softmax_scale=self\.scale,\n)(\s*)causal=True,",
+         r"\1\2causal=attn_metadata.causal,",
+         path,
+         min_count=5,
+     )
+     return text
+
+
+ def patch_supports_non_causal(text: str, path: Path, backend_name: str) -> str:
+     insertion = (
+         f'    @staticmethod\n    def get_name() -> str:\n        return "{backend_name}"\n\n'
+         "    @classmethod\n    def supports_non_causal(cls) -> bool:\n        return True\n\n"
+     )
+     current = f'    @staticmethod\n    def get_name() -> str:\n        return "{backend_name}"\n\n'
+     if "def supports_non_causal" in text:
+         return text
+     if current not in text:
+         raise RuntimeError(f"Could not find get_name block in {path}")
+     return text.replace(current, insertion, 1)
+
+
+ def patch_aiter_wrapper(text: str, path: Path) -> str:
+     if "IS_CAUSAL=causal" in text:
+         return text.replace(
+             '    assert causal, "Only causal attention is supported"\n',
+             "",
+         )
+     text = replace_once(
+         text,
+         '    assert causal, "Only causal attention is supported"\n',
+         "",
+         path,
+     )
+     text = replace_all_regex(
+         text,
+         r"(ALL_DECODE=ALL_DECODE,\n)(\s*)(\*\*attn_config,)",
+         r"\1\2IS_CAUSAL=causal,\n\2\3",
+         path,
+         min_count=1,
+     )
+     text = replace_all_regex(
+         text,
+         r"(ALL_DECODE=ALL_DECODE,\n)(\s*)(\*\*config,)",
+         r"\1\2IS_CAUSAL=causal,\n\2\3",
+         path,
+         min_count=1,
+     )
+     return text
+
+
+ def patch_aiter_kernel(text: str, path: Path) -> str:
+     if "IS_CAUSAL: tl.constexpr = True" in text:
+         text = text.replace(
+             "num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE)",
+             "num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) if IS_CAUSAL else cdiv_fn(seq_len, TILE_SIZE)",
+         )
+         text = text.replace(
+             "seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1",
+             "seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 if IS_CAUSAL else seq_offset[None, :] < seq_len",
+         )
+         return text
+     text = replace_all_regex(
+         text,
+         r"(ALL_DECODE: tl\.constexpr = False,  # bool\n)(\):)",
+         r"\1    IS_CAUSAL: tl.constexpr = True,  # bool\n\2",
+         path,
+         min_count=2,
+     )
+     text = replace_all_regex(
+         text,
+         r"num_tiles = cdiv_fn\(max_seq_prefix_len, TILE_SIZE\)",
+         "num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) if IS_CAUSAL else cdiv_fn(seq_len, TILE_SIZE)",
+         path,
+         min_count=2,
+     )
+     text = replace_all_regex(
+         text,
+         r"seq_mask = seq_offset\[None, :\] < context_len \+ query_pos\[:, None\] \+ 1",
+         "seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 if IS_CAUSAL else seq_offset[None, :] < seq_len",
+         path,
+         min_count=2,
+     )
+     return text
+
+
+ def patch_vllm_triton_unified_attention(text: str, path: Path) -> str:
+     if "IS_CAUSAL=causal" in text:
+         return text.replace(
+             '    assert causal, "Only causal attention is supported"\n',
+             "",
+         )
+     text = replace_once(
+         text,
+         '    assert causal, "Only causal attention is supported"\n',
+         "",
+         path,
+     )
+     text = replace_all_regex(
+         text,
+         r"num_tiles = cdiv_fn\(max_seq_prefix_len, TILE_SIZE\)",
+         "num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) if IS_CAUSAL else cdiv_fn(seq_len, TILE_SIZE)",
+         path,
+         min_count=2,
+     )
+     text = replace_all_regex(
+         text,
+         r"seq_mask = seq_offset\[None, :\] <= query_abs_pos",
+         "seq_mask = seq_offset[None, :] <= query_abs_pos if IS_CAUSAL else seq_offset[None, :] < seq_len",
+         path,
+         min_count=2,
+     )
+     text = replace_all_regex(
+         text,
+         r"USE_FP8: tl\.constexpr,  # bool",
+         "USE_FP8: tl.constexpr,  # bool\n    IS_CAUSAL: tl.constexpr = True,  # bool",
+         path,
+         min_count=2,
+     )
+     text = replace_all_regex(
+         text,
+         r"(BLOCK_M=BLOCK_M,\n)(\s*)",
+         r"\1\2IS_CAUSAL=causal,\n\2",
+         path,
+         min_count=2,
+     )
+     return text
+
+
+ def patch_vllm_dflash(text: str, path: Path) -> str:
+     return replace_once(
+         text,
+         '        assert getattr(attn_metadata, "causal", None) is False, (\n',
+         '        assert getattr(attn_metadata, "causal", None) in (True, False, None), (\n',
+         path,
+     )
+
+
+ def patch_vllm_selector(text: str, path: Path) -> str:
+     if "AttentionBackendEnum.TRITON_MLA" in text:
+         return text
+     text = replace_once(
+         text,
+         "from vllm.v1.attention.backends.registry import (\n"
+         "    MAMBA_TYPE_TO_BACKEND_MAP,\n"
+         "    MambaAttentionBackendEnum,\n"
+         ")\n",
+         "from vllm.v1.attention.backends.registry import (\n"
+         "    AttentionBackendEnum,\n"
+         "    MAMBA_TYPE_TO_BACKEND_MAP,\n"
+         "    MambaAttentionBackendEnum,\n"
+         ")\n",
+         path,
+     )
+     new = (
+         "    speculative_config = vllm_config.speculative_config\n"
+         "    hf_config = vllm_config.model_config.hf_config\n"
+         "    architectures = list(getattr(hf_config, \"architectures\", []) or [])\n"
+         "    is_dflash_draft = any(\n"
+         "        str(arch).startswith(\"DFlash\") for arch in architectures\n"
+         "    )\n"
+         "    use_non_causal = (\n"
+         "        speculative_config is not None\n"
+         "        and speculative_config.method == \"dflash\"\n"
+         "        and is_dflash_draft\n"
+         "    )\n"
+         "\n"
+         "    backend = vllm_config.attention_config.backend\n"
+         "    if (\n"
+         "        speculative_config is not None\n"
+         "        and speculative_config.method == \"dflash\"\n"
+         "        and use_mla\n"
+         "        and not is_dflash_draft\n"
+         "        and backend is None\n"
+         "    ):\n"
+         "        backend = AttentionBackendEnum.TRITON_MLA\n"
+     )
+     old_variants = [
+         (
+             "    speculative_config = vllm_config.speculative_config\n"
+             "    use_non_causal = (\n"
+             "        speculative_config is not None and speculative_config.method == \"dflash\"\n"
+             "    )\n"
+         ),
+         (
+             "    speculative_config = vllm_config.speculative_config\n"
+             "    hf_config = vllm_config.model_config.hf_config\n"
+             "    architectures = list(getattr(hf_config, \"architectures\", []) or [])\n"
+             "    use_non_causal = (\n"
+             "        speculative_config is not None\n"
+             "        and speculative_config.method == \"dflash\"\n"
+             "        and any(str(arch).startswith(\"DFlash\") for arch in architectures)\n"
+             "    )\n"
+         ),
+     ]
+     for old in old_variants:
+         if old in text:
+             text = text.replace(old, new, 1)
+             break
+     else:
+         if new not in text:
+             raise RuntimeError(f"Could not find selector speculative block in {path}")
+     return replace_once(
+         text,
+         "        backend=vllm_config.attention_config.backend,\n",
+         "        backend=backend,\n",
+         path,
+     )
+
+
+ def main() -> int:
+     vllm_root = locate_module_file("vllm").parent
+     site_packages = vllm_root.parent
+
+     rocm_aiter_fa = vllm_root / "v1" / "attention" / "backends" / "rocm_aiter_fa.py"
+     rocm_attn = vllm_root / "v1" / "attention" / "backends" / "rocm_attn.py"
+     rocm_aiter_unified = (
+         vllm_root / "v1" / "attention" / "backends" / "rocm_aiter_unified_attn.py"
+     )
+     triton_attn = vllm_root / "v1" / "attention" / "backends" / "triton_attn.py"
+     selector_path = vllm_root / "v1" / "attention" / "selector.py"
+     vllm_triton_ops = (
+         vllm_root / "v1" / "attention" / "ops" / "triton_unified_attention.py"
+     )
+     vllm_dflash = vllm_root / "v1" / "spec_decode" / "dflash.py"
+     aiter_wrapper = first_existing(
+         [
+             site_packages / "aiter" / "ops" / "triton" / "unified_attention.py",
+             site_packages
+             / "aiter"
+             / "ops"
+             / "triton"
+             / "attention"
+             / "unified_attention.py",
+         ]
+     )
+     aiter_kernel = first_existing(
+         [
+             site_packages
+             / "aiter"
+             / "ops"
+             / "triton"
+             / "_triton_kernels"
+             / "unified_attention.py",
+             site_packages
+             / "aiter"
+             / "ops"
+             / "triton"
+             / "_triton_kernels"
+             / "attention"
+             / "unified_attention.py",
+         ]
+     )
+
+     patch_file(rocm_aiter_fa, patch_rocm_aiter_fa)
+     patch_file(
+         rocm_attn,
+         lambda text, path: patch_supports_non_causal(text, path, "ROCM_ATTN"),
+     )
+     patch_file(
+         rocm_aiter_unified,
+         lambda text, path: patch_supports_non_causal(
+             text, path, "ROCM_AITER_UNIFIED_ATTN"
+         ),
+     )
+     patch_file(
+         triton_attn,
+         lambda text, path: patch_supports_non_causal(text, path, "TRITON_ATTN"),
+     )
+     patch_file(selector_path, patch_vllm_selector)
+     patch_file(vllm_triton_ops, patch_vllm_triton_unified_attention)
+     patch_file(vllm_dflash, patch_vllm_dflash)
+     patch_file(aiter_wrapper, patch_aiter_wrapper)
+     patch_file(aiter_kernel, patch_aiter_kernel)
+     print("[done] ROCm DFlash patch applied")
+     return 0
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
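The idempotency the Dockerfile comment relies on comes from a simple contract in the patch helpers: if the replacement text is already present, the input is returned unchanged, so re-running the script inside an already-patched image is a no-op. A self-contained sketch of that contract (this simplified version drops the `path` argument used for error messages):

```python
# Sketch of the "apply once, safe to re-run" contract used by the patch script.
def replace_once(text: str, old: str, new: str) -> str:
    if new in text:
        return text  # already patched: no-op
    if old not in text:
        raise RuntimeError(f"pattern not found: {old[:40]!r}")
    return text.replace(old, new, 1)

src = "causal=True,"
once = replace_once(src, "causal=True,", "causal=attn_metadata.causal,")
twice = replace_once(once, "causal=True,", "causal=attn_metadata.causal,")
assert once == "causal=attn_metadata.causal,"
assert twice == once  # second application leaves the file untouched
```

The regex variant in the script uses the same idea, synthesizing a literal marker from the replacement pattern to detect an already-patched file.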
payload/benchmark_multi_turn.py ADDED
@@ -0,0 +1,213 @@
+ #!/usr/bin/env python3
+ """Multi-turn session benchmark for OpenAI-compatible APIs.
+
+ Runs concurrent multi-turn chat sessions and reports per-turn,
+ per-session, and aggregate throughput metrics.
+ """
+ import argparse
+ import asyncio
+ import json
+ import sys
+ import time
+
+ from openai import AsyncOpenAI
+
+ INITIAL_PROMPTS = [
+     "Write a Python function that implements a lock-free concurrent hash map using compare-and-swap operations. Include proper memory ordering.",
+     "Explain the mathematical foundations of diffusion models in machine learning. Start from the forward process and derive the reverse process.",
+     "Design a distributed consensus protocol for a system with Byzantine fault tolerance. Describe the phases and prove the safety properties.",
+     "Implement a B+ tree in Rust with support for range queries, bulk loading, and concurrent access using optimistic locking.",
+     "Analyze the computational complexity of the Aho-Corasick algorithm and compare it to naive multi-pattern matching. Provide the proof.",
+     "Write a CUDA kernel for flash attention with causal masking that handles variable sequence lengths within a batch.",
+     "Derive the optimal batch size for gradient descent given a fixed compute budget, following the scaling laws from Kaplan et al.",
+     "Design an LSM-tree based key-value store with write-ahead logging, compaction strategies, and bloom filters for read optimization.",
+ ]
+
+ FOLLOW_UP_PROMPTS = [
+     "Can you explain the most complex part of that in more detail?",
+     "What are the main failure modes and how would you handle them?",
+     "Now optimize that for a production environment with 10x the scale.",
+     "Write comprehensive tests for the core logic you described.",
+     "What are the tradeoffs compared to the most common alternative approach?",
+ ]
+
+
+ async def run_session(
+     client: AsyncOpenAI,
+     session_id: int,
+     model: str,
+     turns_per_session: int,
+     max_tokens: int,
+     temperature: float,
+     timeout_seconds: float,
+ ) -> dict:
+     messages = []
+     turn_results = []
+     session_start = time.monotonic()
+     deadline = session_start + timeout_seconds
+
+     initial_prompt = INITIAL_PROMPTS[session_id % len(INITIAL_PROMPTS)]
+
+     for turn_idx in range(turns_per_session):
+         if turn_idx == 0:
+             user_content = initial_prompt
+         else:
+             user_content = FOLLOW_UP_PROMPTS[(turn_idx - 1) % len(FOLLOW_UP_PROMPTS)]
+
+         messages.append({"role": "user", "content": user_content})
+
+         remaining = deadline - time.monotonic()
+         if remaining <= 0:
+             break
+
+         turn_start = time.monotonic()
+         try:
+             response = await asyncio.wait_for(
+                 client.chat.completions.create(
+                     model=model,
+                     messages=messages,
+                     max_tokens=max_tokens,
+                     temperature=temperature,
+                 ),
+                 timeout=remaining,
+             )
+         except Exception as exc:  # includes asyncio.TimeoutError
+             turn_results.append({
+                 "turn": turn_idx + 1,
+                 "error": f"{type(exc).__name__}: {exc}",
+             })
+             break
+
+         turn_wall = time.monotonic() - turn_start
+         usage = response.usage
+         completion_tokens = usage.completion_tokens if usage else 0
+         prompt_tokens = usage.prompt_tokens if usage else 0
+         tok_per_sec = completion_tokens / turn_wall if turn_wall > 0 else 0.0
+
+         turn_results.append({
+             "turn": turn_idx + 1,
+             "prompt_tokens": prompt_tokens,
+             "completion_tokens": completion_tokens,
+             "wall_seconds": round(turn_wall, 3),
+             "tok_per_sec": round(tok_per_sec, 1),
+         })
+
+         assistant_content = response.choices[0].message.content or ""
+         messages.append({"role": "assistant", "content": assistant_content})
+
+     total_completion = sum(
+         t.get("completion_tokens", 0) for t in turn_results
+     )
+     total_wall = time.monotonic() - session_start
+     turns_completed = sum(1 for t in turn_results if "error" not in t)
+     avg_tok_per_sec = total_completion / total_wall if total_wall > 0 else 0.0
+
+     return {
+         "session_id": session_id,
+         "turns": turn_results,
+         "total_completion_tokens": total_completion,
+         "total_wall_seconds": round(total_wall, 3),
+         "avg_tok_per_sec": round(avg_tok_per_sec, 1),
+         "turns_completed": turns_completed,
+     }
+
+
+ async def run_benchmark(args: argparse.Namespace) -> dict:
+     client = AsyncOpenAI(base_url=args.base_url, api_key="unused")
+
+     tasks = [
+         run_session(
+             client=client,
+             session_id=i,
+             model=args.model,
+             turns_per_session=args.turns_per_session,
+             max_tokens=args.max_tokens,
+             temperature=args.temperature,
+             timeout_seconds=args.timeout_seconds,
+         )
+         for i in range(args.sessions)
+     ]
+
+     wall_start = time.monotonic()
+     session_results = await asyncio.gather(*tasks)
+     wall_total = time.monotonic() - wall_start
+
+     total_completion = sum(s["total_completion_tokens"] for s in session_results)
+     turns_completed = sum(s["turns_completed"] for s in session_results)
+     sessions_completed = sum(
+         1 for s in session_results if s["turns_completed"] == args.turns_per_session
+     )
+     per_session_rates = [
+         s["avg_tok_per_sec"]
+         for s in session_results
+         if s["turns_completed"] > 0
+     ]
+     mean_per_session = (
+         sum(per_session_rates) / len(per_session_rates)
+         if per_session_rates
+         else 0.0
+     )
+
+     return {
+         "config": {
+             "sessions": args.sessions,
+             "turns_per_session": args.turns_per_session,
+             "max_tokens": args.max_tokens,
+             "temperature": args.temperature,
+             "model": args.model,
+         },
+         "sessions": session_results,
+         "aggregate": {
+             "total_completion_tokens": total_completion,
+             "total_wall_seconds": round(wall_total, 3),
+             "aggregate_tok_per_sec": round(
+                 total_completion / wall_total if wall_total > 0 else 0.0, 1
+             ),
+             "mean_per_session_tok_per_sec": round(mean_per_session, 1),
+             "sessions_completed": sessions_completed,
+             "turns_completed": turns_completed,
+         },
+     }
+
+
+ def main() -> int:
+     parser = argparse.ArgumentParser(
+         description="Multi-turn session benchmark for OpenAI-compatible APIs"
+     )
+     parser.add_argument("--base-url", default="http://127.0.0.1:8262/v1")
+     parser.add_argument("--model", default="kimi-k2.6-amd-dflash")
+     parser.add_argument("--sessions", type=int, default=4)
+     parser.add_argument("--turns-per-session", type=int, default=5)
+     parser.add_argument("--max-tokens", type=int, default=512)
+     parser.add_argument("--temperature", type=float, default=0)
+     parser.add_argument("--output-json", type=str, default=None)
+     parser.add_argument("--timeout-seconds", type=float, default=3600)
+     args = parser.parse_args()
+
+     result = asyncio.run(run_benchmark(args))
+
+     output = json.dumps(result, indent=2)
+     print(output)
+
+     if args.output_json:
+         with open(args.output_json, "w") as f:
+             f.write(output)
+             f.write("\n")
+         print(f"\nResults written to {args.output_json}", file=sys.stderr)
+
+     agg = result["aggregate"]
+     print(
+         f"\n--- Summary ---\n"
+         f"Sessions: {agg['sessions_completed']}/{args.sessions} completed\n"
+         f"Turns: {agg['turns_completed']}/{args.sessions * args.turns_per_session}\n"
+         f"Aggregate throughput: {agg['aggregate_tok_per_sec']} tok/s\n"
+         f"Mean per-session: {agg['mean_per_session_tok_per_sec']} tok/s\n"
+         f"Wall time: {agg['total_wall_seconds']}s",
+         file=sys.stderr,
+     )
+
+     return 0
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
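The benchmark reports two different throughput numbers on purpose: `aggregate_tok_per_sec` divides all completion tokens by the overall wall clock (server-wide throughput under concurrency), while `mean_per_session_tok_per_sec` averages each session's own rate (what one client experiences). A sketch of the distinction with hypothetical numbers:

```python
# Hypothetical numbers: two sessions that ran concurrently over the same
# 10-second window.
sessions = [
    {"total_completion_tokens": 1000, "total_wall_seconds": 10.0},
    {"total_completion_tokens": 500, "total_wall_seconds": 10.0},
]
wall_total = 10.0  # overall wall clock, NOT the sum of per-session times

aggregate_tok_per_sec = sum(s["total_completion_tokens"] for s in sessions) / wall_total
per_session = [s["total_completion_tokens"] / s["total_wall_seconds"] for s in sessions]
mean_per_session = sum(per_session) / len(per_session)

assert aggregate_tok_per_sec == 150.0  # server-wide throughput
assert mean_per_session == 75.0        # average single-client experience
```

With perfectly overlapping sessions the aggregate is roughly the mean times the concurrency; gaps between the two indicate sessions that straggled or errored early.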
payload/preshard_kimi26.py ADDED
@@ -0,0 +1,102 @@
+ #!/usr/bin/env python3
+ """Pre-shard the Kimi K2.6 checkpoint for TP=8 deployment.
+
+ Loads the model via vLLM and saves it in sharded format so the
+ launcher can use --load-format sharded_state to skip runtime sharding.
+ """
+ import argparse
+ import glob
+ import os
+ import shutil
+ import sys
+ import time
+
+
+ def main() -> int:
+     parser = argparse.ArgumentParser(
+         description="Pre-shard Kimi K2.6 checkpoint for vLLM sharded_state loading"
+     )
+     parser.add_argument(
+         "--model",
+         default="/mnt/nvme5n1p1/hydra/models/Kimi-K2.6",
+     )
+     parser.add_argument(
+         "--output",
+         default="/mnt/nvme5n1p1/hydra/models/Kimi-K2.6-sharded-tp8",
+     )
+     parser.add_argument("--tp", type=int, default=8)
+     parser.add_argument(
+         "--trust-remote-code",
+         action=argparse.BooleanOptionalAction,
+         default=True,
+     )
+     args = parser.parse_args()
+
+     if os.path.exists(args.output):
+         print(
+             f"ERROR: output directory already exists: {args.output}\n"
+             f"Remove it manually if you want to re-shard.",
+             file=sys.stderr,
+         )
+         return 1
+
+     if not os.path.isdir(args.model):
+         print(f"ERROR: model directory not found: {args.model}", file=sys.stderr)
+         return 1
+
+     # Defer heavy import so --help is fast and arg validation runs first.
+     from vllm import LLM
+
+     print(f"Loading model from {args.model} with TP={args.tp} ...")
+     t0 = time.monotonic()
+
+     llm = LLM(
+         model=args.model,
+         tensor_parallel_size=args.tp,
+         trust_remote_code=args.trust_remote_code,
+     )
+
+     t_load = time.monotonic() - t0
+     print(f"Model loaded in {t_load:.1f}s")
+
+     os.makedirs(args.output, exist_ok=True)
+     print(f"Saving sharded state to {args.output} ...")
+     t1 = time.monotonic()
+
+     llm.llm_engine.model_executor.save_sharded_state(path=args.output)
+
+     t_save = time.monotonic() - t1
+     print(f"Sharded state saved in {t_save:.1f}s")
+
+     # Copy tokenizer and trust-remote-code files that vLLM does not shard.
+     copy_names = [
+         "tokenizer.json",
+         "tokenizer_config.json",
+         "special_tokens_map.json",
+         "chat_template.jinja",
+     ]
+     copied = []
+     for name in copy_names:
+         src = os.path.join(args.model, name)
+         if os.path.isfile(src):
+             shutil.copy2(src, os.path.join(args.output, name))
+             copied.append(name)
+
+     for py_file in glob.glob(os.path.join(args.model, "*.py")):
+         basename = os.path.basename(py_file)
+         shutil.copy2(py_file, os.path.join(args.output, basename))
+         copied.append(basename)
+
+     if copied:
+         print(f"Copied auxiliary files: {', '.join(copied)}")
+
+     t_total = time.monotonic() - t0
+     print(
+         f"\nDone. Total time: {t_total:.1f}s (load: {t_load:.1f}s, save: {t_save:.1f}s)\n"
+         f"Use with: --model {args.output} --load-format sharded_state"
+     )
+     return 0
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
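Since the sharded output directory only works if the auxiliary files made it across, a quick post-run sanity check can be useful before pointing vLLM at it. A minimal sketch; `missing_aux_files` is a hypothetical helper (not part of the script), and the names mirror the script's copy list:

```python
import os

# Mirrors the copy list in the pre-shard script above.
AUX_FILES = (
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "chat_template.jinja",
)


def missing_aux_files(output_dir: str, names=AUX_FILES) -> list:
    """Return the auxiliary files not present in the pre-sharded output dir."""
    return [n for n in names if not os.path.isfile(os.path.join(output_dir, n))]
```

Running this against the `--output` directory and warning on a non-empty result catches a partially copied checkpoint before a slow server start fails late.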
serve.sh ADDED
@@ -0,0 +1,88 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ source "$SCRIPT_DIR/configs/production.env"
+
+ CONTAINER_NAME="${CONTAINER_NAME:-kimi26-dflash}"
+ PATCH_SCRIPT="$SCRIPT_DIR/patches/patch_dflash_rocm.py"
+
+ echo "Kimi K2.6 DFlash — 507 tok/s on 8x MI300X"
+ echo "============================================"
+
+ numa_status=$(cat /proc/sys/kernel/numa_balancing 2>/dev/null || echo "unknown")
+ if [[ "$numa_status" != "0" ]]; then
+   echo "WARNING: NUMA balancing is enabled ($numa_status). Disable it:"
+   echo "  sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'"
+   echo ""
+ fi
+
+ docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
+
+ SPEC_CONFIG="{\"method\":\"$SPEC_METHOD\",\"model\":\"$DRAFT_MODEL_DIR\",\"num_speculative_tokens\":$NUM_SPECULATIVE_TOKENS}"
+
+ docker run -d \
+   --name "$CONTAINER_NAME" \
+   --network host \
+   --device=/dev/kfd \
+   --device=/dev/dri \
+   --security-opt seccomp=unconfined \
+   --group-add video \
+   --ipc=host \
+   -e PYTORCH_ROCM_ARCH="$PYTORCH_ROCM_ARCH" \
+   -e AITER_ROCM_ARCH="$AITER_ROCM_ARCH" \
+   -e GPU_ARCHS="$GPU_ARCHS" \
+   -e VLLM_ROCM_USE_AITER="$VLLM_ROCM_USE_AITER" \
+   -e VLLM_ROCM_QUICK_REDUCE_QUANTIZATION="$VLLM_ROCM_QUICK_REDUCE_QUANTIZATION" \
+   -e VLLM_ROCM_USE_AITER_RMSNORM="$VLLM_ROCM_USE_AITER_RMSNORM" \
+   -e HSA_ENABLE_SDMA="$HSA_ENABLE_SDMA" \
+   -e HSA_NO_SCRATCH_RECLAIM="$HSA_NO_SCRATCH_RECLAIM" \
+   -e OMP_NUM_THREADS="$OMP_NUM_THREADS" \
+   -e HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+   -v "$(dirname "$MODEL_DIR"):$(dirname "$MODEL_DIR")" \
+   -v "$SCRIPT_DIR/patches:/patches:ro" \
+   --entrypoint bash \
+   "$IMAGE" \
+   -lc "python3 /patches/patch_dflash_rocm.py && python3 -m vllm.entrypoints.openai.api_server \
+     --model '$MODEL_DIR' \
+     --served-model-name kimi-k2.6-amd-dflash \
+     --host 0.0.0.0 \
+     --port $PORT \
+     --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+     --trust-remote-code \
+     --max-model-len $MAX_MODEL_LEN \
+     --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
+     --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
+     --max-num-seqs $MAX_NUM_SEQS \
+     --mm-encoder-tp-mode data \
+     --block-size $BLOCK_SIZE \
+     --tool-call-parser kimi_k2 \
+     --reasoning-parser kimi_k2 \
+     --enable-auto-tool-choice \
+     --moe-backend $MOE_BACKEND \
+     --optimization-level $OPTIMIZATION_LEVEL \
+     --performance-mode $PERFORMANCE_MODE \
+     --safetensors-load-strategy $SAFETENSORS_LOAD_STRATEGY \
+     --disable-uvicorn-access-log \
+     --no-enable-prefix-caching \
+     --enable-chunked-prefill \
+     --enforce-eager \
+     --speculative-config '$SPEC_CONFIG'"
+
+ echo ""
+ echo "Container '$CONTAINER_NAME' started on port $PORT"
+ echo "Waiting for server ready (model load takes ~5 min)..."
+
+ for i in $(seq 1 360); do
+   if curl -sf "http://127.0.0.1:${PORT}/v1/models" >/dev/null 2>&1; then
+     echo "Server ready at http://127.0.0.1:${PORT}"
+     echo ""
+     echo "Test: curl http://127.0.0.1:${PORT}/v1/chat/completions -H 'Content-Type: application/json' -d '{\"model\":\"kimi-k2.6-amd-dflash\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello\"}],\"max_tokens\":32}'"
+     exit 0
+   fi
+   sleep 5
+ done
+
+ echo "ERROR: Server did not become ready in 30 minutes"
+ docker logs --tail 20 "$CONTAINER_NAME"
+ exit 1