LilaRest commited on
Commit
09d1ab0
·
1 Parent(s): b08582c

Add benchmark chart

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ bench/chart/benchmark.png
README.md CHANGED
@@ -55,14 +55,13 @@ This variant is **text-only**, video/audio weights and encoders have been stripp
55
 
56
  ## Benchmark
57
 
 
 
58
  > [!NOTE]
59
  > RTX PRO 6000, `vllm bench` @ 1K input / 200 output tokens. See [bench.sh](/bench/bench.sh).
60
  >
61
  > Note: We also ran the ***⚡Turbo*** benchmark on an RTX 5090, and it performed identically: at a 16k context, performance is not limited by GPU memory.
62
 
63
- [CHART HERE]
64
-
65
-
66
  | | [Base model](https://huggingface.co/google/gemma-4-31B-it) | [NVIDIA quant](https://huggingface.co/nvidia/Gemma-4-31B-IT-NVFP4) | ***⚡ Turbo*** (this model) |
67
  |------------------|------------------------------------------------------------|--------------------------------------------------------------------|---------------------------------------------|
68
  | GPU memory | 58.9 GiB | 31 GiB | **18.5 GiB** *(-68% base, -40% nvidia)* |
@@ -77,13 +76,13 @@ This variant is **text-only**, video/audio weights and encoders have been stripp
77
  Other quants of similar size use kernel paths (compressed-tensors, Marlin) that don't leverage Blackwell's FP4 tensor cores, resulting in significantly lower prefill and concurrent throughput:
78
 
79
 
80
- | | [prithivMLmods NVFP4 quant](https://huggingface.co/prithivMLmods/gemma-4-31B-it-NVFP4) | [cyankiwi AWQ quant](https://huggingface.co/cyankiwi/gemma-4-31B-it-AWQ-4bit) | ***⚡ Turbo*** (this model) |
81
- |------------------|----------------------------------------------------------------------------------------|-------------------------------------------------------------------------------|----------------------------|
82
- | GPU memory | 19.6 GiB | 19.6 GiB | **18.5 GiB** |
83
- | Prefill | 6647 tok/s | 6626 tok/s | **15359 tok/s** |
84
- | Decode (single) | 64.3 tok/s | 64.4 tok/s | **51 tok/s** |
85
- | Decode (batched) | 757 tok/s | ??? tok/s | **1244 tok/s** |
86
- | Concurrency | 3.79 req/s | ??? req/s | **6.22 req/s** |
87
 
88
 
89
  ## Usage
 
55
 
56
  ## Benchmark
57
 
58
+ ![Benchmark chart](bench/chart/benchmark.png)
59
+
60
  > [!NOTE]
61
  > RTX PRO 6000, `vllm bench` @ 1K input / 200 output tokens. See [bench.sh](/bench/bench.sh).
62
  >
63
  > Note: We also ran the ***⚡Turbo*** benchmark on an RTX 5090, and it performed identically: at a 16k context, performance is not limited by GPU memory.
64
 
 
 
 
65
  | | [Base model](https://huggingface.co/google/gemma-4-31B-it) | [NVIDIA quant](https://huggingface.co/nvidia/Gemma-4-31B-IT-NVFP4) | ***⚡ Turbo*** (this model) |
66
  |------------------|------------------------------------------------------------|--------------------------------------------------------------------|---------------------------------------------|
67
  | GPU memory | 58.9 GiB | 31 GiB | **18.5 GiB** *(-68% base, -40% nvidia)* |
 
76
  Other quants of similar size use kernel paths (compressed-tensors, Marlin) that don't leverage Blackwell's FP4 tensor cores, resulting in significantly lower prefill and concurrent throughput:
77
 
78
 
79
+ | | [prithivMLmods NVFP4](https://huggingface.co/prithivMLmods/gemma-4-31B-it-NVFP4) | [cyankiwi AWQ](https://huggingface.co/cyankiwi/gemma-4-31B-it-AWQ-4bit) | ***⚡ Turbo*** (this model) |
80
+ |------------------|----------------------------------------------------------------------------------|-------------------------------------------------------------------------|----------------------------|
81
+ | GPU memory | 19.6 GiB | 19.6 GiB | **18.5 GiB** |
82
+ | Prefill | 6647 tok/s | 6626 tok/s | **15359 tok/s** |
83
+ | Decode (single) | 64.3 tok/s | 64.4 tok/s | **51 tok/s** |
84
+ | Decode (batched) | 757 tok/s | 757 tok/s | **1244 tok/s** |
85
+ | Concurrency | 3.79 req/s | 3.78 req/s | **6.22 req/s** |
86
 
87
 
88
  ## Usage
bench/chart/chart.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Generate P95 E2E latency vs RPS benchmark chart."""

import csv
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.patheffects as pe
from scipy.ndimage import uniform_filter1d

# Benchmark result CSVs live in ../results relative to this script.
RESULTS_DIR = os.path.join(os.path.dirname(__file__), "..", "results")
# The rendered chart is written next to this script.
OUTPUT = os.path.join(os.path.dirname(__file__), "benchmark.png")
14
+
15
+ MODELS = [
16
+ {
17
+ "file": "LilaRest--gemma-4-31B-it-NVFP4-turbo",
18
+ "label": "⚡ Turbo (this model)",
19
+ "color": "#58a6ff",
20
+ "linewidth": 3.2,
21
+ "zorder": 10,
22
+ "alpha": 1.0,
23
+ "glow": True,
24
+ },
25
+ {
26
+ "file": "nvidia--Gemma-4-31B-IT-NVFP4",
27
+ "label": "NVIDIA NVFP4",
28
+ "color": "#76b900",
29
+ "linewidth": 2.0,
30
+ "zorder": 5,
31
+ "alpha": 0.85,
32
+ },
33
+ {
34
+ "file": "google--gemma-4-31B-it",
35
+ "label": "Google BF16 (base)",
36
+ "color": "#f97316",
37
+ "linewidth": 2.0,
38
+ "zorder": 4,
39
+ "alpha": 0.85,
40
+ },
41
+ ]
42
+
43
+
44
def read_csv(filename, results_dir=None):
    """Load the (rps, p95_e2e_ms) columns of ``<results_dir>/<filename>.csv``.

    Args:
        filename: CSV basename without the ``.csv`` extension.
        results_dir: Directory containing the CSV.  Defaults to the
            repository's ``bench/results`` directory (``RESULTS_DIR``).

    Returns:
        Two ``np.ndarray``s: request rates and P95 end-to-end latencies
        (milliseconds), in file order.
    """
    directory = RESULTS_DIR if results_dir is None else results_dir
    rps, e2e = [], []
    # newline="" is required by the csv module for correct newline handling;
    # an explicit utf-8 keeps the read independent of the locale encoding.
    with open(os.path.join(directory, filename + ".csv"),
              newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            rps.append(float(row["rps"]))
            e2e.append(float(row["p95_e2e_ms"]))
    return np.array(rps), np.array(e2e)
51
+
52
+
53
def smooth(y, size=3):
    """Box-filter (running mean) of width *size*, edges padded with the
    nearest value so the output has the same length as the input."""
    smoothed = uniform_filter1d(y, size=size, mode="nearest")
    return smoothed
55
+
56
+
57
# --- Style ---
# Dark, GitHub-flavoured theme so the exported PNG blends into the model
# card README when viewed on Hugging Face / GitHub.
plt.rcParams.update({
    "figure.facecolor": "#0d1117",
    "axes.facecolor": "#0d1117",
    "axes.edgecolor": "#30363d",
    "axes.labelcolor": "#8b949e",
    "text.color": "#e6edf3",
    "xtick.color": "#8b949e",
    "ytick.color": "#8b949e",
    "grid.color": "#21262d",
    "legend.facecolor": "#161b22",
    "legend.edgecolor": "#30363d",
    "font.family": "sans-serif",
    "font.size": 12,
})

fig, ax = plt.subplots(figsize=(12, 6.5))

# One smoothed latency-vs-load curve per model.
for m in MODELS:
    rps, e2e = read_csv(m["file"])
    e2e_smooth = smooth(e2e)

    plot_kwargs = dict(
        label=m["label"], color=m["color"],
        linewidth=m["linewidth"], zorder=m["zorder"], alpha=m["alpha"],
        linestyle=m.get("linestyle", "-"),
    )

    # Subtle glow on turbo line (wide, mostly transparent stroke behind it)
    if m.get("glow"):
        plot_kwargs["path_effects"] = [
            pe.withStroke(linewidth=6, foreground=m["color"], alpha=0.15),
        ]

    ax.plot(rps, e2e_smooth, **plot_kwargs)

# Log y-scale: latency spans roughly two orders of magnitude once a model
# saturates, so a linear axis would flatten the low-latency region.
ax.set_yscale("log")
ax.set_xlabel("Request Rate (RPS)", fontsize=14, labelpad=10)
ax.set_ylabel("P95 End-to-End Latency", fontsize=14, labelpad=10)
ax.text(0.5, 0.97, "lower is better ↓", transform=ax.transAxes, fontsize=15,
        color="white", ha="center", va="top", alpha=1.0, style="italic",
        bbox=dict(boxstyle="round,pad=0.4", facecolor="#161b22", edgecolor="#30363d", alpha=0.8))

# Y-axis: clean labels — tick values are milliseconds, rendered as whole seconds
ax.yaxis.set_major_formatter(ticker.FuncFormatter(
    lambda x, _: f"{x/1000:.0f}s"))
ax.set_ylim(2_500, 200_000)

# X-axis: fixed window covering the benchmarked 0.5–15 RPS sweep, 1-RPS ticks
ax.set_xlim(0, 15.5)
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))

ax.grid(True, which="major", alpha=0.35)

# 10s reference threshold — dashed line with a small right-aligned label
ax.axhline(y=10_000, color="#f0883e", linestyle="--", linewidth=1, alpha=0.35)
ax.text(15.3, 11_500, "10s", color="#f0883e", fontsize=10, alpha=0.5,
        ha="right", va="bottom")

# Legend — larger, with padding; the Turbo entry is bolded for emphasis
legend = ax.legend(loc="upper left", fontsize=12, framealpha=0.95,
                   borderpad=0.8, labelspacing=0.6)
for text in legend.get_texts():
    if "Turbo" in text.get_text():
        text.set_fontweight("bold")

plt.tight_layout(pad=1.5)
plt.savefig(OUTPUT, dpi=200, bbox_inches="tight")
print(f"Saved to {OUTPUT}")
bench/results/cyankiwi--gemma-4-31B-it-AWQ-4bit.csv CHANGED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ rps,effective_rps,p95_ttft_ms,p95_tpot_ms,p95_e2e_ms
2
+ 0.5,0.45,58.19,16.35,3302.55
3
+ 1.0,0.90,60.98,17.21,3478.73
4
+ 1.5,1.34,391.80,26.34,5485.43
5
+ 2.0,1.75,616.11,34.33,6927.24
6
+ 2.5,2.22,449.66,36.03,7351.67
7
+ 3.0,2.60,1222.34,35.56,7326.99
8
+ 3.5,3.02,735.37,38.73,7776.74
9
+ 4.0,3.49,281.78,39.45,7941.53
10
+ 4.5,3.78,935.95,37.79,7586.05
11
+ 5.0,3.35,6610.82,185.39,37592.90
12
+ 5.5,2.82,10223.96,246.77,49863.89
13
+ 6.0,2.83,26880.89,254.89,51985.53
14
+ 6.5,2.88,31920.09,256.79,53213.68
15
+ 7.0,2.91,35328.45,255.08,55262.87
16
+ 7.5,2.90,38643.28,256.16,57394.03
17
+ 8.0,2.92,41402.93,251.52,59681.00
18
+ 8.5,2.94,44821.73,250.68,65312.49
19
+ 9.0,2.95,48704.06,252.64,71196.65
20
+ 9.5,2.96,51980.99,250.13,77310.91
21
+ 10.0,2.96,55519.13,250.40,83422.20
22
+ 10.5,2.84,58139.61,255.72,90126.81
23
+ 11.0,2.87,69658.86,253.41,92134.56
24
+ 11.5,2.89,83025.85,255.71,95146.62
25
+ 12.0,2.91,85717.79,253.46,97926.27
26
+ 12.5,2.91,89763.07,252.60,101964.48
27
+ 13.0,2.92,93397.80,252.92,108076.43
28
+ 13.5,2.93,95972.66,252.58,113769.49
29
+ 14.0,2.94,98589.56,248.68,118258.77
30
+ 14.5,2.96,102418.07,249.98,123505.64
31
+ 15.0,2.95,106032.85,249.78,129777.93