Add benchmark chart
Browse files- .gitignore +1 -0
- README.md +9 -10
- bench/chart/chart.py +125 -0
- bench/results/cyankiwi--gemma-4-31B-it-AWQ-4bit.csv +31 -0
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
bench/chart/benchmark.png
|
README.md
CHANGED
|
@@ -55,14 +55,13 @@ This variant is **text-only**, video/audio weights and encoders have been stripp
|
|
| 55 |
|
| 56 |
## Benchmark
|
| 57 |
|
|
|
|
|
|
|
| 58 |
> [!NOTE]
|
| 59 |
> RTX PRO 6000, `vllm bench` @ 1K input / 200 output tokens. See [bench.sh](/bench/bench.sh).
|
| 60 |
>
|
| 61 |
> Note: We also ran ***⚡Turbo*** benchmark on RTX 5090, and it performed exactly the same because at 16k context, the performance is not limited by the GPU memory.
|
| 62 |
|
| 63 |
-
[CHART HERE]
|
| 64 |
-
|
| 65 |
-
|
| 66 |
| | [Base model](https://huggingface.co/google/gemma-4-31B-it) | [NVIDIA quant](https://huggingface.co/nvidia/Gemma-4-31B-IT-NVFP4) | ***⚡ Turbo*** (this model) |
|
| 67 |
|------------------|------------------------------------------------------------|--------------------------------------------------------------------|---------------------------------------------|
|
| 68 |
| GPU memory | 58.9 GiB | 31 GiB | **18.5 GiB** *(-68% base, -40% nvidia)* |
|
|
@@ -77,13 +76,13 @@ This variant is **text-only**, video/audio weights and encoders have been stripp
|
|
| 77 |
Other quants of similar size use kernel paths (compressed-tensors, Marlin) that don't leverage Blackwell's FP4 tensor cores, resulting in significantly lower prefill and concurrent throughput:
|
| 78 |
|
| 79 |
|
| 80 |
-
| | [prithivMLmods NVFP4
|
| 81 |
-
|------------------|----------------------------------------------------------------------------------
|
| 82 |
-
| GPU memory | 19.6 GiB
|
| 83 |
-
| Prefill | 6647 tok/s
|
| 84 |
-
| Decode (single) | 64.3 tok/s
|
| 85 |
-
| Decode (batched) | 757 tok/s
|
| 86 |
-
| Concurrency | 3.79 req/s
|
| 87 |
|
| 88 |
|
| 89 |
## Usage
|
|
|
|
| 55 |
|
| 56 |
## Benchmark
|
| 57 |
|
| 58 |
+

|
| 59 |
+
|
| 60 |
> [!NOTE]
|
| 61 |
> RTX PRO 6000, `vllm bench` @ 1K input / 200 output tokens. See [bench.sh](/bench/bench.sh).
|
| 62 |
>
|
| 63 |
> Note: We also ran ***⚡Turbo*** benchmark on RTX 5090, and it performed exactly the same because at 16k context, the performance is not limited by the GPU memory.
|
| 64 |
|
|
|
|
|
|
|
|
|
|
| 65 |
| | [Base model](https://huggingface.co/google/gemma-4-31B-it) | [NVIDIA quant](https://huggingface.co/nvidia/Gemma-4-31B-IT-NVFP4) | ***⚡ Turbo*** (this model) |
|
| 66 |
|------------------|------------------------------------------------------------|--------------------------------------------------------------------|---------------------------------------------|
|
| 67 |
| GPU memory | 58.9 GiB | 31 GiB | **18.5 GiB** *(-68% base, -40% nvidia)* |
|
|
|
|
| 76 |
Other quants of similar size use kernel paths (compressed-tensors, Marlin) that don't leverage Blackwell's FP4 tensor cores, resulting in significantly lower prefill and concurrent throughput:
|
| 77 |
|
| 78 |
|
| 79 |
+
| | [prithivMLmods NVFP4](https://huggingface.co/prithivMLmods/gemma-4-31B-it-NVFP4) | [cyankiwi AWQ](https://huggingface.co/cyankiwi/gemma-4-31B-it-AWQ-4bit) | ***⚡ Turbo*** (this model) |
|
| 80 |
+
|------------------|----------------------------------------------------------------------------------|-------------------------------------------------------------------------|----------------------------|
|
| 81 |
+
| GPU memory | 19.6 GiB | 19.6 GiB | **18.5 GiB** |
|
| 82 |
+
| Prefill | 6647 tok/s | 6626 tok/s | **15359 tok/s** |
|
| 83 |
+
| Decode (single) | 64.3 tok/s | 64.4 tok/s | **51 tok/s** |
|
| 84 |
+
| Decode (batched) | 757 tok/s | 757 tok/s | **1244 tok/s** |
|
| 85 |
+
| Concurrency | 3.79 req/s | 3.78 req/s | **6.22 req/s** |
|
| 86 |
|
| 87 |
|
| 88 |
## Usage
|
bench/chart/chart.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Generate P95 E2E latency vs RPS benchmark chart."""
|
| 3 |
+
|
| 4 |
+
import csv
|
| 5 |
+
import os
|
| 6 |
+
import numpy as np
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
import matplotlib.ticker as ticker
|
| 9 |
+
import matplotlib.patheffects as pe
|
| 10 |
+
from scipy.ndimage import uniform_filter1d
|
| 11 |
+
|
| 12 |
+
# Paths are resolved relative to this script so it runs from any working dir.
_CHART_DIR = os.path.dirname(__file__)
RESULTS_DIR = os.path.join(_CHART_DIR, "..", "results")
OUTPUT = os.path.join(_CHART_DIR, "benchmark.png")

# One entry per plotted series. "file" is the CSV basename under RESULTS_DIR
# (without the ".csv" suffix). Optional keys: "glow" adds a halo path-effect,
# "linestyle" overrides the default solid line.
MODELS = [
    {
        "file": "LilaRest--gemma-4-31B-it-NVFP4-turbo",
        "label": "⚡ Turbo (this model)",
        "color": "#58a6ff",
        "linewidth": 3.2,
        "zorder": 10,  # drawn on top of the other curves
        "alpha": 1.0,
        "glow": True,
    },
    {
        "file": "nvidia--Gemma-4-31B-IT-NVFP4",
        "label": "NVIDIA NVFP4",
        "color": "#76b900",
        "linewidth": 2.0,
        "zorder": 5,
        "alpha": 0.85,
    },
    {
        "file": "google--gemma-4-31B-it",
        "label": "Google BF16 (base)",
        "color": "#f97316",
        "linewidth": 2.0,
        "zorder": 4,
        "alpha": 0.85,
    },
]
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def read_csv(filename):
    """Load the (rps, p95_e2e_ms) columns of a results CSV as numpy arrays.

    *filename* is the CSV basename under RESULTS_DIR, without the ".csv"
    suffix. Returns a (rps, e2e) pair of float arrays in file order.
    """
    path = os.path.join(RESULTS_DIR, filename + ".csv")
    with open(path) as f:
        rows = list(csv.DictReader(f))
    rps = np.array([float(row["rps"]) for row in rows])
    e2e = np.array([float(row["p95_e2e_ms"]) for row in rows])
    return rps, e2e
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def smooth(y, size=3):
    """Return *y* filtered by a centered moving average of width *size*.

    Edges are handled by replicating the nearest sample (mode="nearest"),
    so the output has the same length as the input.
    """
    return uniform_filter1d(y, mode="nearest", size=size)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# --- Style: GitHub-dark themed figure ---
_DARK_THEME = {
    "figure.facecolor": "#0d1117",
    "axes.facecolor": "#0d1117",
    "axes.edgecolor": "#30363d",
    "axes.labelcolor": "#8b949e",
    "text.color": "#e6edf3",
    "xtick.color": "#8b949e",
    "ytick.color": "#8b949e",
    "grid.color": "#21262d",
    "legend.facecolor": "#161b22",
    "legend.edgecolor": "#30363d",
    "font.family": "sans-serif",
    "font.size": 12,
}
plt.rcParams.update(_DARK_THEME)

fig, ax = plt.subplots(figsize=(12, 6.5))
|
| 74 |
+
|
| 75 |
+
# Draw one smoothed latency curve per benchmarked model.
for spec in MODELS:
    rates, latency = read_csv(spec["file"])

    kwargs = {
        "label": spec["label"],
        "color": spec["color"],
        "linewidth": spec["linewidth"],
        "zorder": spec["zorder"],
        "alpha": spec["alpha"],
        "linestyle": spec.get("linestyle", "-"),
    }

    if spec.get("glow"):
        # A faint, wide stroke underneath the line reads as a subtle glow.
        kwargs["path_effects"] = [
            pe.withStroke(linewidth=6, foreground=spec["color"], alpha=0.15),
        ]

    ax.plot(rates, smooth(latency), **kwargs)
|
| 92 |
+
|
| 93 |
+
# Latency axis: log scale, millisecond values rendered as whole seconds.
ax.set_yscale("log")
ax.yaxis.set_major_formatter(ticker.FuncFormatter(
    lambda x, _: f"{x/1000:.0f}s"))
ax.set_ylim(2_500, 200_000)
ax.set_ylabel("P95 End-to-End Latency", fontsize=14, labelpad=10)

# Request-rate axis: one tick per RPS across the measured range.
ax.set_xlim(0, 15.5)
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.set_xlabel("Request Rate (RPS)", fontsize=14, labelpad=10)

ax.grid(True, which="major", alpha=0.35)

# Caption badge centered near the top of the axes.
ax.text(0.5, 0.97, "lower is better ↓", transform=ax.transAxes, fontsize=15,
        color="white", ha="center", va="top", alpha=1.0, style="italic",
        bbox=dict(boxstyle="round,pad=0.4", facecolor="#161b22",
                  edgecolor="#30363d", alpha=0.8))

# Dashed 10-second reference line with a small right-aligned label.
ax.axhline(y=10_000, color="#f0883e", linestyle="--", linewidth=1, alpha=0.35)
ax.text(15.3, 11_500, "10s", color="#f0883e", fontsize=10, alpha=0.5,
        ha="right", va="bottom")

# Legend in the upper left; bold the Turbo entry so it stands out.
legend = ax.legend(loc="upper left", fontsize=12, framealpha=0.95,
                   borderpad=0.8, labelspacing=0.6)
for entry in legend.get_texts():
    if "Turbo" in entry.get_text():
        entry.set_fontweight("bold")

plt.tight_layout(pad=1.5)
plt.savefig(OUTPUT, dpi=200, bbox_inches="tight")
print(f"Saved to {OUTPUT}")
|
bench/results/cyankiwi--gemma-4-31B-it-AWQ-4bit.csv
CHANGED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
rps,effective_rps,p95_ttft_ms,p95_tpot_ms,p95_e2e_ms
|
| 2 |
+
0.5,0.45,58.19,16.35,3302.55
|
| 3 |
+
1.0,0.90,60.98,17.21,3478.73
|
| 4 |
+
1.5,1.34,391.80,26.34,5485.43
|
| 5 |
+
2.0,1.75,616.11,34.33,6927.24
|
| 6 |
+
2.5,2.22,449.66,36.03,7351.67
|
| 7 |
+
3.0,2.60,1222.34,35.56,7326.99
|
| 8 |
+
3.5,3.02,735.37,38.73,7776.74
|
| 9 |
+
4.0,3.49,281.78,39.45,7941.53
|
| 10 |
+
4.5,3.78,935.95,37.79,7586.05
|
| 11 |
+
5.0,3.35,6610.82,185.39,37592.90
|
| 12 |
+
5.5,2.82,10223.96,246.77,49863.89
|
| 13 |
+
6.0,2.83,26880.89,254.89,51985.53
|
| 14 |
+
6.5,2.88,31920.09,256.79,53213.68
|
| 15 |
+
7.0,2.91,35328.45,255.08,55262.87
|
| 16 |
+
7.5,2.90,38643.28,256.16,57394.03
|
| 17 |
+
8.0,2.92,41402.93,251.52,59681.00
|
| 18 |
+
8.5,2.94,44821.73,250.68,65312.49
|
| 19 |
+
9.0,2.95,48704.06,252.64,71196.65
|
| 20 |
+
9.5,2.96,51980.99,250.13,77310.91
|
| 21 |
+
10.0,2.96,55519.13,250.40,83422.20
|
| 22 |
+
10.5,2.84,58139.61,255.72,90126.81
|
| 23 |
+
11.0,2.87,69658.86,253.41,92134.56
|
| 24 |
+
11.5,2.89,83025.85,255.71,95146.62
|
| 25 |
+
12.0,2.91,85717.79,253.46,97926.27
|
| 26 |
+
12.5,2.91,89763.07,252.60,101964.48
|
| 27 |
+
13.0,2.92,93397.80,252.92,108076.43
|
| 28 |
+
13.5,2.93,95972.66,252.58,113769.49
|
| 29 |
+
14.0,2.94,98589.56,248.68,118258.77
|
| 30 |
+
14.5,2.96,102418.07,249.98,123505.64
|
| 31 |
+
15.0,2.95,106032.85,249.78,129777.93
|