Youssofal committed on
Commit
77ba53c
·
verified ·
1 Parent(s): 89123c9

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -1,35 +1,3 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ assistant/tokenizer.json filter=lfs diff=lfs merge=lfs -text
3
+ target/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
README.md ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ license_link: https://ai.google.dev/gemma/docs/gemma_4_license
4
+ base_model:
5
+ - google/gemma-4-31B-it
6
+ - google/gemma-4-31B-it-assistant
7
+ library_name: mlx
8
+ tags:
9
+ - mlx
10
+ - gemma4
11
+ - mtplx
12
+ - speculative-decoding
13
+ - apple-silicon
14
+ - text-generation
15
+ pipeline_tag: text-generation
16
+ ---
17
+
18
+ # Gemma4 MTPLX Optimized Quality
19
+
20
+ This is an **MTPLX pair bundle** for Gemma 4 31B speculative decoding on Apple Silicon.
21
+
22
+ It is not a single vanilla Transformers model directory. The repository contains two MLX-format artifacts:
23
+
24
+ - `target/` - Gemma 4 31B IT target, MLX Q8 affine group-size 64
25
+ - `assistant/` - official Gemma 4 31B assistant drafter, MLX Q8 affine group-size 64
26
+
27
+ Use this pair when target precision and high acceptance are the priority.
28
+
29
+ ## Source
30
+
31
+ - Target source: `google/gemma-4-31B-it`
32
+ - Target revision: `145dc2508c480a64b47242f160d286cff94a2343`
33
+ - Assistant source: `google/gemma-4-31B-it-assistant`
34
+ - Assistant revision: `cffbbd2cea41ea56a0fa5b0487e0d445121fd204`
35
+
36
+ Both artifacts were converted locally to MLX format.
37
+
38
+ ## Quantization
39
+
40
+ Target:
41
+
42
+ ```text
43
+ bits: 8
44
+ group_size: 64
45
+ mode: affine
46
+ ```
47
+
48
+ Assistant:
49
+
50
+ ```text
51
+ bits: 8
52
+ group_size: 64
53
+ mode: affine
54
+ ```
55
+
56
+ ## MTPLX Usage
57
+
58
+ After downloading this repository, point MTPLX at the two subdirectories:
59
+
60
+ ```bash
61
+ mtplx bench gemma-mtp \
62
+ --target-model ./target \
63
+ --assistant-model ./assistant \
64
+ --prompt-suite mtplx/benchmarks/prompts/flappy.jsonl \
65
+ --max-tokens 1000 \
66
+ --draft-block-sizes 6 \
67
+ --allow-unverified-gemma
68
+ ```
69
+
70
+ The Gemma 4 assistant is a separate drafter model. MTPLX uses exact speculative sampling with target verification and residual correction.
71
+
72
+ ## Local Benchmark
73
+
74
+ Prompt: single-file HTML5 Canvas Flappy Bird game, capped at 1000 generated tokens.
75
+
76
+ Sampler:
77
+
78
+ ```text
79
+ temperature: 1.0
80
+ top_p: 0.95
81
+ top_k: 64
82
+ seed: 0
83
+ ```
84
+
85
+ Best observed block size:
86
+
87
+ ```text
88
+ block_size: 6
89
+ acceptance: 833 / 835 = 99.76%
90
+ speedup_vs_ar: 2.49x
91
+ ```
92
+
93
+ Observed MTPLX throughput samples:
94
+
95
+ ```text
96
+ 34.22 tok/s
97
+ 32.88 tok/s
98
+ 33.12 tok/s
99
+ ```
100
+
101
+ The full benchmark sweep output is bundled as `benchmarks/flappy1000-targetq8-assistantq8-sweep.json`.
102
+
103
+ ## Notes
104
+
105
+ This release is optimized for target precision and high draft acceptance. It is not the pair with the highest absolute throughput in tokens per second (TPS); if raw speed matters more, use `Youssofal/Gemma4-MTPLX-Optimized-Speed`.
106
+
107
+ Gemma 4 is released by Google under the Gemma 4 license terms linked above.
assistant/README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ tags:
4
+ - mlx
5
+ pipeline_tag: text-generation
6
+ library_name: mlx
7
+ ---
assistant/config.json ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma4AssistantForCausalLM"
4
+ ],
5
+ "audio_token_id": 258881,
6
+ "backbone_hidden_size": 5376,
7
+ "boa_token_id": 256000,
8
+ "boi_token_id": 255999,
9
+ "centroid_intermediate_top_k": 32,
10
+ "dtype": "bfloat16",
11
+ "eoa_token_id": 258883,
12
+ "eoi_token_id": 258882,
13
+ "eos_token_id": [
14
+ 1,
15
+ 106,
16
+ 50
17
+ ],
18
+ "image_token_id": 258880,
19
+ "model_type": "gemma4_assistant",
20
+ "num_centroids": 2048,
21
+ "quantization": {
22
+ "group_size": 64,
23
+ "bits": 8,
24
+ "mode": "affine"
25
+ },
26
+ "quantization_config": {
27
+ "group_size": 64,
28
+ "bits": 8,
29
+ "mode": "affine"
30
+ },
31
+ "text_config": {
32
+ "_name_or_path": "",
33
+ "architectures": null,
34
+ "attention_bias": false,
35
+ "attention_dropout": 0.0,
36
+ "attention_k_eq_v": true,
37
+ "bos_token_id": 2,
38
+ "chunk_size_feed_forward": 0,
39
+ "dtype": "bfloat16",
40
+ "enable_moe_block": false,
41
+ "eos_token_id": 1,
42
+ "final_logit_softcapping": null,
43
+ "global_head_dim": 512,
44
+ "head_dim": 256,
45
+ "hidden_activation": "gelu_pytorch_tanh",
46
+ "hidden_size": 1024,
47
+ "hidden_size_per_layer_input": 0,
48
+ "id2label": {
49
+ "0": "LABEL_0",
50
+ "1": "LABEL_1"
51
+ },
52
+ "initializer_range": 0.02,
53
+ "intermediate_size": 8192,
54
+ "is_encoder_decoder": false,
55
+ "label2id": {
56
+ "LABEL_0": 0,
57
+ "LABEL_1": 1
58
+ },
59
+ "layer_types": [
60
+ "sliding_attention",
61
+ "sliding_attention",
62
+ "sliding_attention",
63
+ "full_attention"
64
+ ],
65
+ "max_position_embeddings": 262144,
66
+ "model_type": "gemma4_text",
67
+ "moe_intermediate_size": null,
68
+ "num_attention_heads": 32,
69
+ "num_experts": null,
70
+ "num_global_key_value_heads": 4,
71
+ "num_hidden_layers": 4,
72
+ "num_key_value_heads": 16,
73
+ "num_kv_shared_layers": 4,
74
+ "output_attentions": false,
75
+ "output_hidden_states": false,
76
+ "pad_token_id": 0,
77
+ "problem_type": null,
78
+ "return_dict": true,
79
+ "rms_norm_eps": 1e-06,
80
+ "rope_parameters": {
81
+ "full_attention": {
82
+ "partial_rotary_factor": 0.25,
83
+ "rope_theta": 1000000.0,
84
+ "rope_type": "proportional"
85
+ },
86
+ "sliding_attention": {
87
+ "rope_theta": 10000.0,
88
+ "rope_type": "default"
89
+ }
90
+ },
91
+ "sliding_window": 1024,
92
+ "tie_word_embeddings": true,
93
+ "top_k_experts": null,
94
+ "use_bidirectional_attention": null,
95
+ "use_cache": true,
96
+ "use_double_wide_mlp": false,
97
+ "vocab_size": 262144,
98
+ "vocab_size_per_layer_input": 0
99
+ },
100
+ "tie_word_embeddings": true,
101
+ "transformers_version": "5.7.0.dev0",
102
+ "use_ordered_embeddings": false
103
+ }
assistant/generation_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 1,
6
+ 106,
7
+ 50
8
+ ],
9
+ "is_assistant": true,
10
+ "num_assistant_tokens": 6,
11
+ "num_assistant_tokens_schedule": "constant",
12
+ "pad_token_id": 0,
13
+ "temperature": 1.0,
14
+ "top_k": 64,
15
+ "top_p": 0.95,
16
+ "transformers_version": "5.7.0.dev0"
17
+ }
assistant/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9503f95f4259bb000df3a89c5ab44ce94eed935d8e949926db8aa288ab1c3180
3
+ size 498891309
assistant/model.safetensors.index.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 498881032,
4
+ "total_parameters": 469518592
5
+ },
6
+ "weight_map": {
7
+ "model.embed_tokens.biases": "model.safetensors",
8
+ "model.embed_tokens.scales": "model.safetensors",
9
+ "model.embed_tokens.weight": "model.safetensors",
10
+ "model.layers.0.input_layernorm.weight": "model.safetensors",
11
+ "model.layers.0.layer_scalar": "model.safetensors",
12
+ "model.layers.0.mlp.down_proj.biases": "model.safetensors",
13
+ "model.layers.0.mlp.down_proj.scales": "model.safetensors",
14
+ "model.layers.0.mlp.down_proj.weight": "model.safetensors",
15
+ "model.layers.0.mlp.gate_proj.biases": "model.safetensors",
16
+ "model.layers.0.mlp.gate_proj.scales": "model.safetensors",
17
+ "model.layers.0.mlp.gate_proj.weight": "model.safetensors",
18
+ "model.layers.0.mlp.up_proj.biases": "model.safetensors",
19
+ "model.layers.0.mlp.up_proj.scales": "model.safetensors",
20
+ "model.layers.0.mlp.up_proj.weight": "model.safetensors",
21
+ "model.layers.0.post_attention_layernorm.weight": "model.safetensors",
22
+ "model.layers.0.post_feedforward_layernorm.weight": "model.safetensors",
23
+ "model.layers.0.pre_feedforward_layernorm.weight": "model.safetensors",
24
+ "model.layers.0.self_attn.o_proj.biases": "model.safetensors",
25
+ "model.layers.0.self_attn.o_proj.scales": "model.safetensors",
26
+ "model.layers.0.self_attn.o_proj.weight": "model.safetensors",
27
+ "model.layers.0.self_attn.q_norm.weight": "model.safetensors",
28
+ "model.layers.0.self_attn.q_proj.biases": "model.safetensors",
29
+ "model.layers.0.self_attn.q_proj.scales": "model.safetensors",
30
+ "model.layers.0.self_attn.q_proj.weight": "model.safetensors",
31
+ "model.layers.1.input_layernorm.weight": "model.safetensors",
32
+ "model.layers.1.layer_scalar": "model.safetensors",
33
+ "model.layers.1.mlp.down_proj.biases": "model.safetensors",
34
+ "model.layers.1.mlp.down_proj.scales": "model.safetensors",
35
+ "model.layers.1.mlp.down_proj.weight": "model.safetensors",
36
+ "model.layers.1.mlp.gate_proj.biases": "model.safetensors",
37
+ "model.layers.1.mlp.gate_proj.scales": "model.safetensors",
38
+ "model.layers.1.mlp.gate_proj.weight": "model.safetensors",
39
+ "model.layers.1.mlp.up_proj.biases": "model.safetensors",
40
+ "model.layers.1.mlp.up_proj.scales": "model.safetensors",
41
+ "model.layers.1.mlp.up_proj.weight": "model.safetensors",
42
+ "model.layers.1.post_attention_layernorm.weight": "model.safetensors",
43
+ "model.layers.1.post_feedforward_layernorm.weight": "model.safetensors",
44
+ "model.layers.1.pre_feedforward_layernorm.weight": "model.safetensors",
45
+ "model.layers.1.self_attn.o_proj.biases": "model.safetensors",
46
+ "model.layers.1.self_attn.o_proj.scales": "model.safetensors",
47
+ "model.layers.1.self_attn.o_proj.weight": "model.safetensors",
48
+ "model.layers.1.self_attn.q_norm.weight": "model.safetensors",
49
+ "model.layers.1.self_attn.q_proj.biases": "model.safetensors",
50
+ "model.layers.1.self_attn.q_proj.scales": "model.safetensors",
51
+ "model.layers.1.self_attn.q_proj.weight": "model.safetensors",
52
+ "model.layers.2.input_layernorm.weight": "model.safetensors",
53
+ "model.layers.2.layer_scalar": "model.safetensors",
54
+ "model.layers.2.mlp.down_proj.biases": "model.safetensors",
55
+ "model.layers.2.mlp.down_proj.scales": "model.safetensors",
56
+ "model.layers.2.mlp.down_proj.weight": "model.safetensors",
57
+ "model.layers.2.mlp.gate_proj.biases": "model.safetensors",
58
+ "model.layers.2.mlp.gate_proj.scales": "model.safetensors",
59
+ "model.layers.2.mlp.gate_proj.weight": "model.safetensors",
60
+ "model.layers.2.mlp.up_proj.biases": "model.safetensors",
61
+ "model.layers.2.mlp.up_proj.scales": "model.safetensors",
62
+ "model.layers.2.mlp.up_proj.weight": "model.safetensors",
63
+ "model.layers.2.post_attention_layernorm.weight": "model.safetensors",
64
+ "model.layers.2.post_feedforward_layernorm.weight": "model.safetensors",
65
+ "model.layers.2.pre_feedforward_layernorm.weight": "model.safetensors",
66
+ "model.layers.2.self_attn.o_proj.biases": "model.safetensors",
67
+ "model.layers.2.self_attn.o_proj.scales": "model.safetensors",
68
+ "model.layers.2.self_attn.o_proj.weight": "model.safetensors",
69
+ "model.layers.2.self_attn.q_norm.weight": "model.safetensors",
70
+ "model.layers.2.self_attn.q_proj.biases": "model.safetensors",
71
+ "model.layers.2.self_attn.q_proj.scales": "model.safetensors",
72
+ "model.layers.2.self_attn.q_proj.weight": "model.safetensors",
73
+ "model.layers.3.input_layernorm.weight": "model.safetensors",
74
+ "model.layers.3.layer_scalar": "model.safetensors",
75
+ "model.layers.3.mlp.down_proj.biases": "model.safetensors",
76
+ "model.layers.3.mlp.down_proj.scales": "model.safetensors",
77
+ "model.layers.3.mlp.down_proj.weight": "model.safetensors",
78
+ "model.layers.3.mlp.gate_proj.biases": "model.safetensors",
79
+ "model.layers.3.mlp.gate_proj.scales": "model.safetensors",
80
+ "model.layers.3.mlp.gate_proj.weight": "model.safetensors",
81
+ "model.layers.3.mlp.up_proj.biases": "model.safetensors",
82
+ "model.layers.3.mlp.up_proj.scales": "model.safetensors",
83
+ "model.layers.3.mlp.up_proj.weight": "model.safetensors",
84
+ "model.layers.3.post_attention_layernorm.weight": "model.safetensors",
85
+ "model.layers.3.post_feedforward_layernorm.weight": "model.safetensors",
86
+ "model.layers.3.pre_feedforward_layernorm.weight": "model.safetensors",
87
+ "model.layers.3.self_attn.o_proj.biases": "model.safetensors",
88
+ "model.layers.3.self_attn.o_proj.scales": "model.safetensors",
89
+ "model.layers.3.self_attn.o_proj.weight": "model.safetensors",
90
+ "model.layers.3.self_attn.q_norm.weight": "model.safetensors",
91
+ "model.layers.3.self_attn.q_proj.biases": "model.safetensors",
92
+ "model.layers.3.self_attn.q_proj.scales": "model.safetensors",
93
+ "model.layers.3.self_attn.q_proj.weight": "model.safetensors",
94
+ "model.norm.weight": "model.safetensors",
95
+ "post_projection.biases": "model.safetensors",
96
+ "post_projection.scales": "model.safetensors",
97
+ "post_projection.weight": "model.safetensors",
98
+ "pre_projection.biases": "model.safetensors",
99
+ "pre_projection.scales": "model.safetensors",
100
+ "pre_projection.weight": "model.safetensors"
101
+ }
102
+ }
assistant/mtplx_artifact.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "note": "Converted locally with MTPLX Gemma4 assistant classes because stock mlx_lm.convert does not support model_type=gemma4_assistant.",
3
+ "precision_policy": "Q8 affine G64 for all quantizable assistant modules, including tied embedding/LM-head path and projections.",
4
+ "quantization": {
5
+ "bits": 8,
6
+ "group_size": 64,
7
+ "mode": "affine"
8
+ },
9
+ "source_path": "/Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-assistant-google-bf16-mlx",
10
+ "source_repo": "google/gemma-4-31B-it-assistant",
11
+ "source_revision": "cffbbd2cea41ea56a0fa5b0487e0d445121fd204"
12
+ }
assistant/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75a6583c1a418e2bbd79c60d95d28e0f5bf549ad3f2990b5bdb5238c6c2bf70c
3
+ size 32169440
assistant/tokenizer_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [],
17
+ "image_token": "<|image|>",
18
+ "is_local": true,
19
+ "local_files_only": false,
20
+ "mask_token": "<mask>",
21
+ "model_max_length": 1000000000000000019884624838656,
22
+ "model_specific_special_tokens": {
23
+ "audio_token": "<|audio|>",
24
+ "boa_token": "<|audio>",
25
+ "boi_token": "<|image>",
26
+ "eoa_token": "<audio|>",
27
+ "eoc_token": "<channel|>",
28
+ "eoi_token": "<image|>",
29
+ "eot_token": "<turn|>",
30
+ "escape_token": "<|\"|>",
31
+ "etc_token": "<tool_call|>",
32
+ "etd_token": "<tool|>",
33
+ "etr_token": "<tool_response|>",
34
+ "image_token": "<|image|>",
35
+ "soc_token": "<|channel>",
36
+ "sot_token": "<|turn>",
37
+ "stc_token": "<|tool_call>",
38
+ "std_token": "<|tool>",
39
+ "str_token": "<|tool_response>",
40
+ "think_token": "<|think|>"
41
+ },
42
+ "pad_token": "<pad>",
43
+ "padding_side": "left",
44
+ "soc_token": "<|channel>",
45
+ "sot_token": "<|turn>",
46
+ "stc_token": "<|tool_call>",
47
+ "std_token": "<|tool>",
48
+ "str_token": "<|tool_response>",
49
+ "think_token": "<|think|>",
50
+ "tokenizer_class": "GemmaTokenizer",
51
+ "unk_token": "<unk>"
52
+ }
benchmarks/flappy1000-targetq8-assistantq8-sweep.json ADDED
@@ -0,0 +1,2104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "arch_id": "gemma4-assistant-mtp",
3
+ "artifacts": {
4
+ "assistant_dtype": null,
5
+ "assistant_format": "q8-g64-affine",
6
+ "assistant_model": "/Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-assistant-google-q8-g64-mlx",
7
+ "assistant_quantization": {
8
+ "bits": 8,
9
+ "group_size": 64,
10
+ "mode": "affine"
11
+ },
12
+ "disk_ok": true,
13
+ "min_free_gib": 220.0,
14
+ "observed_free_gib": 630.7444229125977,
15
+ "target_dtype": null,
16
+ "target_format": "q8-g64-affine",
17
+ "target_model": "/Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-google-q8-g64-mlx",
18
+ "target_quantization": {
19
+ "bits": 8,
20
+ "group_size": 64,
21
+ "mode": "affine"
22
+ }
23
+ },
24
+ "backend": "gemma4_assistant",
25
+ "benchmark": {
26
+ "draft_block_sizes": [
27
+ 3,
28
+ 4,
29
+ 5,
30
+ 6
31
+ ],
32
+ "draft_sampler": {
33
+ "exactness_note": "Assistant q may differ from target p; MTPLX remains exact because acceptance uses p/q and rejection samples the residual distribution.",
34
+ "inherits_target_sampler": true,
35
+ "temperature": null,
36
+ "top_k": null,
37
+ "top_p": null
38
+ },
39
+ "max_mode": true,
40
+ "max_tokens": 1000,
41
+ "profile": "sustained",
42
+ "prompt_suite": "mtplx/benchmarks/prompts/flappy.jsonl",
43
+ "reasoning": "off",
44
+ "sampler_source": {
45
+ "do_sample": true,
46
+ "local_reference": "/Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-google-q8-g64-mlx/generation_config.json",
47
+ "name": "official Gemma 4 generation_config.json",
48
+ "temperature": 1.0,
49
+ "top_k": 64,
50
+ "top_p": 0.95
51
+ },
52
+ "seed": 0,
53
+ "temperature": 1.0,
54
+ "top_k": 64,
55
+ "top_p": 0.95
56
+ },
57
+ "blockers": [],
58
+ "can_run_now": true,
59
+ "gates": {
60
+ "generated_tokens": 1000,
61
+ "longer_lengths_blocked_until_160_passes": true,
62
+ "median_of_3_min_speedup_vs_ar": 2.0,
63
+ "min_speedup_vs_ar": 2.0,
64
+ "mtp_peak_memory_lte_ar_multiplier": 1.18,
65
+ "mtp_peak_memory_lte_ar_plus_gib": 6
66
+ },
67
+ "official_sources": {
68
+ "assistant": "google/gemma-4-31B-it-assistant",
69
+ "assistant_revision": "cffbbd2cea41ea56a0fa5b0487e0d445121fd204",
70
+ "target": "google/gemma-4-31B-it",
71
+ "target_revision": "145dc2508c480a64b47242f160d286cff94a2343"
72
+ },
73
+ "pair": {
74
+ "assistant_exists": true,
75
+ "assistant_inspection": {
76
+ "architecture": "Gemma4AssistantForCausalLM",
77
+ "architecture_recognized": true,
78
+ "backbone_hidden_size": 5376,
79
+ "compatibility": {
80
+ "arch_id": "gemma4-assistant-mtp",
81
+ "can_run": false,
82
+ "exit_code": 3,
83
+ "message": "Official-style Gemma 4 31B assistant artifact recognized. This is an assistant-backed MTP pair, not a standalone target; MTPLX scaffold is present but QA and the 160-token speed/memory gate are still pending.",
84
+ "mtp_supported": "recognized",
85
+ "recognized": true,
86
+ "recommended_backend": "gemma4_assistant",
87
+ "recommended_profile": "performance-cold",
88
+ "runtime_compatibility": "assistant-pair-qa-pending",
89
+ "runtime_contract": null,
90
+ "runtime_contract_error": null,
91
+ "runtime_contract_path": null,
92
+ "support_level": "architecture-scaffolded-qa-pending",
93
+ "support_notes": "Assistant-backed scaffold for the official dense Gemma 4 31B pair. It remains QA-pending and is not a public runnable backend until 160-token exactness, speed, and memory gates pass.",
94
+ "supported": false,
95
+ "tier": "architecture-compatible-but-unverified",
96
+ "unsafe_force_required": false,
97
+ "unverified_model": true
98
+ },
99
+ "config_exists": true,
100
+ "hidden_size": 1024,
101
+ "layer_types": [
102
+ "sliding_attention",
103
+ "sliding_attention",
104
+ "sliding_attention",
105
+ "full_attention"
106
+ ],
107
+ "model_dir": "/Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-assistant-google-q8-g64-mlx",
108
+ "model_files": [
109
+ "model.safetensors"
110
+ ],
111
+ "model_type": "gemma4_assistant",
112
+ "mtp": {
113
+ "exists": false,
114
+ "expected_tensor_count": 15,
115
+ "extra_keys": [],
116
+ "metadata_only": true,
117
+ "missing_expected_keys": [],
118
+ "mtp_file": "model.safetensors.index.json::embedded",
119
+ "passes_tensor_gate": false,
120
+ "sidecar_format": "bf16",
121
+ "tensor_count": 0,
122
+ "tensors": []
123
+ },
124
+ "mtp_arch": "gemma4-assistant-mtp",
125
+ "mtp_num_hidden_layers": 0,
126
+ "mtp_pattern": null,
127
+ "mtp_supported": "recognized",
128
+ "num_hidden_layers": 4,
129
+ "num_kv_shared_layers": 4,
130
+ "passes_primary_gate": false,
131
+ "quantization": {
132
+ "bits": 8,
133
+ "group_size": 64,
134
+ "mode": "affine"
135
+ },
136
+ "recommended_backend": "gemma4_assistant",
137
+ "recommended_profile": "performance-cold",
138
+ "runtime_compatibility": "assistant-pair-qa-pending",
139
+ "runtime_contract_path": null,
140
+ "sidecars": {
141
+ "preprocessor_config.json": false,
142
+ "processor_config.json": false,
143
+ "video_preprocessor_config.json": false
144
+ },
145
+ "source": "local",
146
+ "support_level": "architecture-scaffolded-qa-pending",
147
+ "support_notes": "Assistant-backed scaffold for the official dense Gemma 4 31B pair. It remains QA-pending and is not a public runnable backend until 160-token exactness, speed, and memory gates pass.",
148
+ "unverified_model": true,
149
+ "use_ordered_embeddings": false,
150
+ "vocab_size": 262144
151
+ },
152
+ "assistant_model": "/Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-assistant-google-q8-g64-mlx",
153
+ "pair_error": null,
154
+ "pair_valid": true,
155
+ "target_exists": true,
156
+ "target_inspection": {
157
+ "architecture": "Gemma4ForConditionalGeneration",
158
+ "architecture_recognized": false,
159
+ "backbone_hidden_size": null,
160
+ "compatibility": {
161
+ "arch_id": null,
162
+ "can_run": false,
163
+ "exit_code": 2,
164
+ "message": "Model has no MTP head. MTPLX requires an MTP-equipped model.",
165
+ "mtp_supported": "no",
166
+ "recognized": false,
167
+ "recommended_backend": null,
168
+ "recommended_profile": null,
169
+ "runtime_compatibility": "unsupported",
170
+ "runtime_contract": null,
171
+ "runtime_contract_error": null,
172
+ "runtime_contract_path": null,
173
+ "support_level": "unsupported",
174
+ "support_notes": null,
175
+ "supported": false,
176
+ "tier": "no-MTP",
177
+ "unsafe_force_required": false,
178
+ "unverified_model": false
179
+ },
180
+ "config_exists": true,
181
+ "hidden_size": 5376,
182
+ "layer_types": [
183
+ "sliding_attention",
184
+ "sliding_attention",
185
+ "sliding_attention",
186
+ "sliding_attention",
187
+ "sliding_attention",
188
+ "full_attention",
189
+ "sliding_attention",
190
+ "sliding_attention",
191
+ "sliding_attention",
192
+ "sliding_attention",
193
+ "sliding_attention",
194
+ "full_attention",
195
+ "sliding_attention",
196
+ "sliding_attention",
197
+ "sliding_attention",
198
+ "sliding_attention",
199
+ "sliding_attention",
200
+ "full_attention",
201
+ "sliding_attention",
202
+ "sliding_attention",
203
+ "sliding_attention",
204
+ "sliding_attention",
205
+ "sliding_attention",
206
+ "full_attention",
207
+ "sliding_attention",
208
+ "sliding_attention",
209
+ "sliding_attention",
210
+ "sliding_attention",
211
+ "sliding_attention",
212
+ "full_attention",
213
+ "sliding_attention",
214
+ "sliding_attention",
215
+ "sliding_attention",
216
+ "sliding_attention",
217
+ "sliding_attention",
218
+ "full_attention",
219
+ "sliding_attention",
220
+ "sliding_attention",
221
+ "sliding_attention",
222
+ "sliding_attention",
223
+ "sliding_attention",
224
+ "full_attention",
225
+ "sliding_attention",
226
+ "sliding_attention",
227
+ "sliding_attention",
228
+ "sliding_attention",
229
+ "sliding_attention",
230
+ "full_attention",
231
+ "sliding_attention",
232
+ "sliding_attention",
233
+ "sliding_attention",
234
+ "sliding_attention",
235
+ "sliding_attention",
236
+ "full_attention",
237
+ "sliding_attention",
238
+ "sliding_attention",
239
+ "sliding_attention",
240
+ "sliding_attention",
241
+ "sliding_attention",
242
+ "full_attention"
243
+ ],
244
+ "model_dir": "/Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-google-q8-g64-mlx",
245
+ "model_files": [
246
+ "model-00001-of-00007.safetensors",
247
+ "model-00002-of-00007.safetensors",
248
+ "model-00003-of-00007.safetensors",
249
+ "model-00004-of-00007.safetensors",
250
+ "model-00005-of-00007.safetensors",
251
+ "model-00006-of-00007.safetensors",
252
+ "model-00007-of-00007.safetensors"
253
+ ],
254
+ "model_type": "gemma4_text",
255
+ "mtp": {
256
+ "exists": false,
257
+ "expected_tensor_count": 15,
258
+ "extra_keys": [],
259
+ "metadata_only": true,
260
+ "missing_expected_keys": [],
261
+ "mtp_file": "model.safetensors.index.json::embedded",
262
+ "passes_tensor_gate": false,
263
+ "sidecar_format": "bf16",
264
+ "tensor_count": 0,
265
+ "tensors": []
266
+ },
267
+ "mtp_arch": null,
268
+ "mtp_num_hidden_layers": 0,
269
+ "mtp_pattern": null,
270
+ "mtp_supported": "no",
271
+ "num_hidden_layers": 60,
272
+ "num_kv_shared_layers": 0,
273
+ "passes_primary_gate": false,
274
+ "quantization": {
275
+ "bits": 8,
276
+ "group_size": 64,
277
+ "mode": "affine"
278
+ },
279
+ "recommended_backend": null,
280
+ "recommended_profile": null,
281
+ "runtime_compatibility": "unsupported",
282
+ "runtime_contract_path": null,
283
+ "sidecars": {
284
+ "preprocessor_config.json": false,
285
+ "processor_config.json": false,
286
+ "video_preprocessor_config.json": false
287
+ },
288
+ "source": "local",
289
+ "support_level": "unsupported",
290
+ "support_notes": null,
291
+ "unverified_model": false,
292
+ "use_ordered_embeddings": null,
293
+ "vocab_size": 262144
294
+ },
295
+ "target_model": "/Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-google-q8-g64-mlx"
296
+ },
297
+ "planned_commands": {
298
+ "assistant_bf16_snapshot": "uv run python -c \"from huggingface_hub import snapshot_download; snapshot_download(repo_id='google/gemma-4-31B-it-assistant', revision='cffbbd2cea41ea56a0fa5b0487e0d445121fd204', repo_type='model', local_dir='/Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-assistant-google-q8-g64-mlx')\"",
299
+ "gate": "mtplx bench gemma-mtp --target-model /Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-google-q8-g64-mlx --assistant-model /Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-assistant-google-q8-g64-mlx --profile sustained --max --prompt-suite mtplx/benchmarks/prompts/flappy.jsonl --max-tokens 1000 --temperature 1.0 --top-p 0.95 --top-k 64 --seed 0 --reasoning off --draft-block-sizes 3,4,5,6 --json --output outputs/gemma4/flappy1000-targetq8-assistantq8-sweep.json",
300
+ "target_flat4_g64": "uv run python -m mlx_lm.convert --hf-path /Users/youssof/Documents/MTPLX/models/.sources/gemma-4-31B-it-145dc2508c48 --mlx-path /Users/youssof/Documents/MTPLX/models/gemma-4-31B-it-google-q8-g64-mlx --quantize --q-bits 4 --q-group-size 64 --q-mode affine",
301
+ "target_revision_download": "uv run python -c \"from huggingface_hub import snapshot_download; snapshot_download(repo_id='google/gemma-4-31B-it', revision='145dc2508c480a64b47242f160d286cff94a2343', repo_type='model', local_dir='/Users/youssof/Documents/MTPLX/models/.sources/gemma-4-31B-it-145dc2508c48')\""
302
+ },
303
+ "qa_pending": true,
304
+ "results": {
305
+ "ar": {
306
+ "active_memory_gib": 31.25938833784312,
307
+ "cache_memory_gib": 1.949219443835318,
308
+ "decode_s": 72.81026608298998,
309
+ "generated_tokens": 1000,
310
+ "mode": "ar",
311
+ "peak_memory_gib": 31.318915149196982,
312
+ "prefill_s": 2.473393041000236,
313
+ "tok_s": 13.73432695411645,
314
+ "token_preview": [
315
+ 9996,
316
+ 625,
317
+ 24731,
318
+ 236761,
319
+ 9996,
320
+ 625,
321
+ 24731,
322
+ 236761,
323
+ 9996,
324
+ 625,
325
+ 24731,
326
+ 236761,
327
+ 9996,
328
+ 625,
329
+ 24731,
330
+ 236761
331
+ ]
332
+ },
333
+ "ar_confirmation": [
334
+ {
335
+ "active_memory_gib": 31.25938833784312,
336
+ "cache_memory_gib": 1.949219443835318,
337
+ "decode_s": 72.81026608298998,
338
+ "generated_tokens": 1000,
339
+ "mode": "ar",
340
+ "peak_memory_gib": 31.318915149196982,
341
+ "prefill_s": 2.473393041000236,
342
+ "tok_s": 13.73432695411645,
343
+ "token_preview": [
344
+ 9996,
345
+ 625,
346
+ 24731,
347
+ 236761,
348
+ 9996,
349
+ 625,
350
+ 24731,
351
+ 236761,
352
+ 9996,
353
+ 625,
354
+ 24731,
355
+ 236761,
356
+ 9996,
357
+ 625,
358
+ 24731,
359
+ 236761
360
+ ]
361
+ },
362
+ {
363
+ "active_memory_gib": 31.749414331279695,
364
+ "cache_memory_gib": 2.0085750371217728,
365
+ "confirmation_repeat": 1,
366
+ "decode_s": 76.81720787500672,
367
+ "generated_tokens": 1000,
368
+ "mode": "ar",
369
+ "peak_memory_gib": 31.80895305145532,
370
+ "prefill_s": 0.23355866699421313,
371
+ "tok_s": 13.017916527598246,
372
+ "token_preview": [
373
+ 9996,
374
+ 625,
375
+ 24731,
376
+ 236761,
377
+ 9996,
378
+ 625,
379
+ 24731,
380
+ 236761,
381
+ 9996,
382
+ 625,
383
+ 24731,
384
+ 236761,
385
+ 9996,
386
+ 625,
387
+ 24731,
388
+ 236761
389
+ ]
390
+ },
391
+ {
392
+ "active_memory_gib": 31.749414331279695,
393
+ "cache_memory_gib": 2.0085750371217728,
394
+ "confirmation_repeat": 2,
395
+ "decode_s": 85.77613120799651,
396
+ "generated_tokens": 1000,
397
+ "mode": "ar",
398
+ "peak_memory_gib": 31.80895305145532,
399
+ "prefill_s": 0.2485222500108648,
400
+ "tok_s": 11.658254877165346,
401
+ "token_preview": [
402
+ 9996,
403
+ 625,
404
+ 24731,
405
+ 236761,
406
+ 9996,
407
+ 625,
408
+ 24731,
409
+ 236761,
410
+ 9996,
411
+ 625,
412
+ 24731,
413
+ 236761,
414
+ 9996,
415
+ 625,
416
+ 24731,
417
+ 236761
418
+ ]
419
+ }
420
+ ],
421
+ "best_block_confirmation": [
422
+ {
423
+ "acceptance": 0.9976047904191617,
424
+ "accepted_drafts": 833,
425
+ "active_memory_gib": 31.81573427375406,
426
+ "block_size": 6,
427
+ "cache_memory_gib": 66.49232691712677,
428
+ "decode_s": 29.219117750006262,
429
+ "draft_sampler": {
430
+ "temperature": 1.0,
431
+ "top_k": 64,
432
+ "top_p": 0.95
433
+ },
434
+ "drafted_tokens": 835,
435
+ "generated_tokens": 1000,
436
+ "mode": "mtp",
437
+ "peak_memory_gib": 31.937621283344924,
438
+ "prefill_s": 0.584632291996968,
439
+ "row_distribution_evals": 0,
440
+ "speedup_vs_ar": 2.491870791785778,
441
+ "target_distribution_modes": {
442
+ "batched_logits": 167
443
+ },
444
+ "target_sampler": {
445
+ "temperature": 1.0,
446
+ "top_k": 64,
447
+ "top_p": 0.95
448
+ },
449
+ "telemetry": {
450
+ "ar_dense_fallback_calls": 0,
451
+ "decode_dense_fallback_calls": 0,
452
+ "dense_fallback_calls_by_phase": {
453
+ "ar_decode": 0,
454
+ "decode_verify": 0,
455
+ "postcommit": 0,
456
+ "prefill": 0,
457
+ "unknown": 0
458
+ },
459
+ "events": [],
460
+ "paged_active_array_calls_by_phase": {
461
+ "ar_decode": 0,
462
+ "decode_verify": 0,
463
+ "postcommit": 0,
464
+ "prefill": 0,
465
+ "unknown": 0
466
+ },
467
+ "paged_attention_bailouts_by_phase_reason": {
468
+ "ar_decode": {
469
+ "batch_not_1": 0,
470
+ "block_size_mismatch": 0,
471
+ "blocks_invalid": 0,
472
+ "dtype_unsupported": 0,
473
+ "empty_cache": 0,
474
+ "head_dim_unsupported": 0,
475
+ "kernel_unavailable": 0,
476
+ "offset_invalid": 0,
477
+ "partitioned_invalid_output": 0,
478
+ "partitioned_unavailable": 0,
479
+ "q_len_gt_max": 0,
480
+ "q_len_invalid": 0,
481
+ "turboquant_unsupported": 0,
482
+ "unknown": 0,
483
+ "unsupported_mask": 0
484
+ },
485
+ "decode_verify": {
486
+ "batch_not_1": 0,
487
+ "block_size_mismatch": 0,
488
+ "blocks_invalid": 0,
489
+ "dtype_unsupported": 0,
490
+ "empty_cache": 0,
491
+ "head_dim_unsupported": 0,
492
+ "kernel_unavailable": 0,
493
+ "offset_invalid": 0,
494
+ "partitioned_invalid_output": 0,
495
+ "partitioned_unavailable": 0,
496
+ "q_len_gt_max": 0,
497
+ "q_len_invalid": 0,
498
+ "turboquant_unsupported": 0,
499
+ "unknown": 0,
500
+ "unsupported_mask": 0
501
+ },
502
+ "postcommit": {
503
+ "batch_not_1": 0,
504
+ "block_size_mismatch": 0,
505
+ "blocks_invalid": 0,
506
+ "dtype_unsupported": 0,
507
+ "empty_cache": 0,
508
+ "head_dim_unsupported": 0,
509
+ "kernel_unavailable": 0,
510
+ "offset_invalid": 0,
511
+ "partitioned_invalid_output": 0,
512
+ "partitioned_unavailable": 0,
513
+ "q_len_gt_max": 0,
514
+ "q_len_invalid": 0,
515
+ "turboquant_unsupported": 0,
516
+ "unknown": 0,
517
+ "unsupported_mask": 0
518
+ },
519
+ "prefill": {
520
+ "batch_not_1": 0,
521
+ "block_size_mismatch": 0,
522
+ "blocks_invalid": 0,
523
+ "dtype_unsupported": 0,
524
+ "empty_cache": 0,
525
+ "head_dim_unsupported": 0,
526
+ "kernel_unavailable": 0,
527
+ "offset_invalid": 0,
528
+ "partitioned_invalid_output": 0,
529
+ "partitioned_unavailable": 0,
530
+ "q_len_gt_max": 0,
531
+ "q_len_invalid": 0,
532
+ "turboquant_unsupported": 0,
533
+ "unknown": 0,
534
+ "unsupported_mask": 0
535
+ },
536
+ "unknown": {
537
+ "batch_not_1": 0,
538
+ "block_size_mismatch": 0,
539
+ "blocks_invalid": 0,
540
+ "dtype_unsupported": 0,
541
+ "empty_cache": 0,
542
+ "head_dim_unsupported": 0,
543
+ "kernel_unavailable": 0,
544
+ "offset_invalid": 0,
545
+ "partitioned_invalid_output": 0,
546
+ "partitioned_unavailable": 0,
547
+ "q_len_gt_max": 0,
548
+ "q_len_invalid": 0,
549
+ "turboquant_unsupported": 0,
550
+ "unknown": 0,
551
+ "unsupported_mask": 0
552
+ }
553
+ },
554
+ "paged_attention_large_q_path": {
555
+ "dense_forbidden": 0,
556
+ "large_q_split_sdpa_fallback": 0,
557
+ "partitioned_paged": 0,
558
+ "tail_paged": 0,
559
+ "unknown": 0
560
+ },
561
+ "postcommit_dense_fallback_calls": 0,
562
+ "prefill_dense_fallback_calls": 0,
563
+ "trace_events": false
564
+ },
565
+ "timing_s": {
566
+ "accept": 0.009512208969681524,
567
+ "draft": 1.5696202969993465,
568
+ "rollback": 9.41639591474086e-05,
569
+ "target_distribution": 27.45037787995534,
570
+ "target_hidden": 0.0,
571
+ "verify": 0.17939917097100988
572
+ },
573
+ "tok_s": 34.22416818179891,
574
+ "token_preview": [
575
+ 9996,
576
+ 625,
577
+ 24731,
578
+ 236761,
579
+ 9996,
580
+ 625,
581
+ 24731,
582
+ 236761,
583
+ 9996,
584
+ 625,
585
+ 24731,
586
+ 236761,
587
+ 9996,
588
+ 625,
589
+ 24731,
590
+ 236761
591
+ ],
592
+ "verify_calls": 167
593
+ },
594
+ {
595
+ "acceptance": 0.9976047904191617,
596
+ "accepted_drafts": 833,
597
+ "active_memory_gib": 31.81573427375406,
598
+ "block_size": 6,
599
+ "cache_memory_gib": 66.46690577454865,
600
+ "confirmation_repeat": 1,
601
+ "decode_s": 30.4154408339964,
602
+ "draft_sampler": {
603
+ "temperature": 1.0,
604
+ "top_k": 64,
605
+ "top_p": 0.95
606
+ },
607
+ "drafted_tokens": 835,
608
+ "generated_tokens": 1000,
609
+ "mode": "mtp",
610
+ "peak_memory_gib": 31.937621283344924,
611
+ "prefill_s": 0.26414108400058467,
612
+ "row_distribution_evals": 0,
613
+ "speedup_vs_ar": 2.5255990302512874,
614
+ "target_distribution_modes": {
615
+ "batched_logits": 167
616
+ },
617
+ "target_sampler": {
618
+ "temperature": 1.0,
619
+ "top_k": 64,
620
+ "top_p": 0.95
621
+ },
622
+ "telemetry": {
623
+ "ar_dense_fallback_calls": 0,
624
+ "decode_dense_fallback_calls": 0,
625
+ "dense_fallback_calls_by_phase": {
626
+ "ar_decode": 0,
627
+ "decode_verify": 0,
628
+ "postcommit": 0,
629
+ "prefill": 0,
630
+ "unknown": 0
631
+ },
632
+ "events": [],
633
+ "paged_active_array_calls_by_phase": {
634
+ "ar_decode": 0,
635
+ "decode_verify": 0,
636
+ "postcommit": 0,
637
+ "prefill": 0,
638
+ "unknown": 0
639
+ },
640
+ "paged_attention_bailouts_by_phase_reason": {
641
+ "ar_decode": {
642
+ "batch_not_1": 0,
643
+ "block_size_mismatch": 0,
644
+ "blocks_invalid": 0,
645
+ "dtype_unsupported": 0,
646
+ "empty_cache": 0,
647
+ "head_dim_unsupported": 0,
648
+ "kernel_unavailable": 0,
649
+ "offset_invalid": 0,
650
+ "partitioned_invalid_output": 0,
651
+ "partitioned_unavailable": 0,
652
+ "q_len_gt_max": 0,
653
+ "q_len_invalid": 0,
654
+ "turboquant_unsupported": 0,
655
+ "unknown": 0,
656
+ "unsupported_mask": 0
657
+ },
658
+ "decode_verify": {
659
+ "batch_not_1": 0,
660
+ "block_size_mismatch": 0,
661
+ "blocks_invalid": 0,
662
+ "dtype_unsupported": 0,
663
+ "empty_cache": 0,
664
+ "head_dim_unsupported": 0,
665
+ "kernel_unavailable": 0,
666
+ "offset_invalid": 0,
667
+ "partitioned_invalid_output": 0,
668
+ "partitioned_unavailable": 0,
669
+ "q_len_gt_max": 0,
670
+ "q_len_invalid": 0,
671
+ "turboquant_unsupported": 0,
672
+ "unknown": 0,
673
+ "unsupported_mask": 0
674
+ },
675
+ "postcommit": {
676
+ "batch_not_1": 0,
677
+ "block_size_mismatch": 0,
678
+ "blocks_invalid": 0,
679
+ "dtype_unsupported": 0,
680
+ "empty_cache": 0,
681
+ "head_dim_unsupported": 0,
682
+ "kernel_unavailable": 0,
683
+ "offset_invalid": 0,
684
+ "partitioned_invalid_output": 0,
685
+ "partitioned_unavailable": 0,
686
+ "q_len_gt_max": 0,
687
+ "q_len_invalid": 0,
688
+ "turboquant_unsupported": 0,
689
+ "unknown": 0,
690
+ "unsupported_mask": 0
691
+ },
692
+ "prefill": {
693
+ "batch_not_1": 0,
694
+ "block_size_mismatch": 0,
695
+ "blocks_invalid": 0,
696
+ "dtype_unsupported": 0,
697
+ "empty_cache": 0,
698
+ "head_dim_unsupported": 0,
699
+ "kernel_unavailable": 0,
700
+ "offset_invalid": 0,
701
+ "partitioned_invalid_output": 0,
702
+ "partitioned_unavailable": 0,
703
+ "q_len_gt_max": 0,
704
+ "q_len_invalid": 0,
705
+ "turboquant_unsupported": 0,
706
+ "unknown": 0,
707
+ "unsupported_mask": 0
708
+ },
709
+ "unknown": {
710
+ "batch_not_1": 0,
711
+ "block_size_mismatch": 0,
712
+ "blocks_invalid": 0,
713
+ "dtype_unsupported": 0,
714
+ "empty_cache": 0,
715
+ "head_dim_unsupported": 0,
716
+ "kernel_unavailable": 0,
717
+ "offset_invalid": 0,
718
+ "partitioned_invalid_output": 0,
719
+ "partitioned_unavailable": 0,
720
+ "q_len_gt_max": 0,
721
+ "q_len_invalid": 0,
722
+ "turboquant_unsupported": 0,
723
+ "unknown": 0,
724
+ "unsupported_mask": 0
725
+ }
726
+ },
727
+ "paged_attention_large_q_path": {
728
+ "dense_forbidden": 0,
729
+ "large_q_split_sdpa_fallback": 0,
730
+ "partitioned_paged": 0,
731
+ "tail_paged": 0,
732
+ "unknown": 0
733
+ },
734
+ "postcommit_dense_fallback_calls": 0,
735
+ "prefill_dense_fallback_calls": 0,
736
+ "trace_events": false
737
+ },
738
+ "timing_s": {
739
+ "accept": 0.009377048874739558,
740
+ "draft": 1.6042620869993698,
741
+ "rollback": 8.99219885468483e-05,
742
+ "target_distribution": 28.613301041928935,
743
+ "target_hidden": 0.0,
744
+ "verify": 0.1785685370414285
745
+ },
746
+ "tok_s": 32.87803735799434,
747
+ "token_preview": [
748
+ 9996,
749
+ 625,
750
+ 24731,
751
+ 236761,
752
+ 9996,
753
+ 625,
754
+ 24731,
755
+ 236761,
756
+ 9996,
757
+ 625,
758
+ 24731,
759
+ 236761,
760
+ 9996,
761
+ 625,
762
+ 24731,
763
+ 236761
764
+ ],
765
+ "verify_calls": 167
766
+ },
767
+ {
768
+ "acceptance": 0.9976047904191617,
769
+ "accepted_drafts": 833,
770
+ "active_memory_gib": 31.81573427375406,
771
+ "block_size": 6,
772
+ "cache_memory_gib": 66.46690577454865,
773
+ "confirmation_repeat": 2,
774
+ "decode_s": 30.19647025001177,
775
+ "draft_sampler": {
776
+ "temperature": 1.0,
777
+ "top_k": 64,
778
+ "top_p": 0.95
779
+ },
780
+ "drafted_tokens": 835,
781
+ "generated_tokens": 1000,
782
+ "mode": "mtp",
783
+ "peak_memory_gib": 31.937621283344924,
784
+ "prefill_s": 0.23273954199976288,
785
+ "row_distribution_evals": 0,
786
+ "speedup_vs_ar": 2.840601252325612,
787
+ "target_distribution_modes": {
788
+ "batched_logits": 167
789
+ },
790
+ "target_sampler": {
791
+ "temperature": 1.0,
792
+ "top_k": 64,
793
+ "top_p": 0.95
794
+ },
795
+ "telemetry": {
796
+ "ar_dense_fallback_calls": 0,
797
+ "decode_dense_fallback_calls": 0,
798
+ "dense_fallback_calls_by_phase": {
799
+ "ar_decode": 0,
800
+ "decode_verify": 0,
801
+ "postcommit": 0,
802
+ "prefill": 0,
803
+ "unknown": 0
804
+ },
805
+ "events": [],
806
+ "paged_active_array_calls_by_phase": {
807
+ "ar_decode": 0,
808
+ "decode_verify": 0,
809
+ "postcommit": 0,
810
+ "prefill": 0,
811
+ "unknown": 0
812
+ },
813
+ "paged_attention_bailouts_by_phase_reason": {
814
+ "ar_decode": {
815
+ "batch_not_1": 0,
816
+ "block_size_mismatch": 0,
817
+ "blocks_invalid": 0,
818
+ "dtype_unsupported": 0,
819
+ "empty_cache": 0,
820
+ "head_dim_unsupported": 0,
821
+ "kernel_unavailable": 0,
822
+ "offset_invalid": 0,
823
+ "partitioned_invalid_output": 0,
824
+ "partitioned_unavailable": 0,
825
+ "q_len_gt_max": 0,
826
+ "q_len_invalid": 0,
827
+ "turboquant_unsupported": 0,
828
+ "unknown": 0,
829
+ "unsupported_mask": 0
830
+ },
831
+ "decode_verify": {
832
+ "batch_not_1": 0,
833
+ "block_size_mismatch": 0,
834
+ "blocks_invalid": 0,
835
+ "dtype_unsupported": 0,
836
+ "empty_cache": 0,
837
+ "head_dim_unsupported": 0,
838
+ "kernel_unavailable": 0,
839
+ "offset_invalid": 0,
840
+ "partitioned_invalid_output": 0,
841
+ "partitioned_unavailable": 0,
842
+ "q_len_gt_max": 0,
843
+ "q_len_invalid": 0,
844
+ "turboquant_unsupported": 0,
845
+ "unknown": 0,
846
+ "unsupported_mask": 0
847
+ },
848
+ "postcommit": {
849
+ "batch_not_1": 0,
850
+ "block_size_mismatch": 0,
851
+ "blocks_invalid": 0,
852
+ "dtype_unsupported": 0,
853
+ "empty_cache": 0,
854
+ "head_dim_unsupported": 0,
855
+ "kernel_unavailable": 0,
856
+ "offset_invalid": 0,
857
+ "partitioned_invalid_output": 0,
858
+ "partitioned_unavailable": 0,
859
+ "q_len_gt_max": 0,
860
+ "q_len_invalid": 0,
861
+ "turboquant_unsupported": 0,
862
+ "unknown": 0,
863
+ "unsupported_mask": 0
864
+ },
865
+ "prefill": {
866
+ "batch_not_1": 0,
867
+ "block_size_mismatch": 0,
868
+ "blocks_invalid": 0,
869
+ "dtype_unsupported": 0,
870
+ "empty_cache": 0,
871
+ "head_dim_unsupported": 0,
872
+ "kernel_unavailable": 0,
873
+ "offset_invalid": 0,
874
+ "partitioned_invalid_output": 0,
875
+ "partitioned_unavailable": 0,
876
+ "q_len_gt_max": 0,
877
+ "q_len_invalid": 0,
878
+ "turboquant_unsupported": 0,
879
+ "unknown": 0,
880
+ "unsupported_mask": 0
881
+ },
882
+ "unknown": {
883
+ "batch_not_1": 0,
884
+ "block_size_mismatch": 0,
885
+ "blocks_invalid": 0,
886
+ "dtype_unsupported": 0,
887
+ "empty_cache": 0,
888
+ "head_dim_unsupported": 0,
889
+ "kernel_unavailable": 0,
890
+ "offset_invalid": 0,
891
+ "partitioned_invalid_output": 0,
892
+ "partitioned_unavailable": 0,
893
+ "q_len_gt_max": 0,
894
+ "q_len_invalid": 0,
895
+ "turboquant_unsupported": 0,
896
+ "unknown": 0,
897
+ "unsupported_mask": 0
898
+ }
899
+ },
900
+ "paged_attention_large_q_path": {
901
+ "dense_forbidden": 0,
902
+ "large_q_split_sdpa_fallback": 0,
903
+ "partitioned_paged": 0,
904
+ "tail_paged": 0,
905
+ "unknown": 0
906
+ },
907
+ "postcommit_dense_fallback_calls": 0,
908
+ "prefill_dense_fallback_calls": 0,
909
+ "trace_events": false
910
+ },
911
+ "timing_s": {
912
+ "accept": 0.00978036891319789,
913
+ "draft": 1.6035626278899144,
914
+ "rollback": 9.504603804089129e-05,
915
+ "target_distribution": 28.391074630984804,
916
+ "target_hidden": 0.0,
917
+ "verify": 0.1814885419298662
918
+ },
919
+ "tok_s": 33.11645340400705,
920
+ "token_preview": [
921
+ 9996,
922
+ 625,
923
+ 24731,
924
+ 236761,
925
+ 9996,
926
+ 625,
927
+ 24731,
928
+ 236761,
929
+ 9996,
930
+ 625,
931
+ 24731,
932
+ 236761,
933
+ 9996,
934
+ 625,
935
+ 24731,
936
+ 236761
937
+ ],
938
+ "verify_calls": 167
939
+ }
940
+ ],
941
+ "best_block_size": 6,
942
+ "best_speedup": 2.491870791785778,
943
+ "blockers": [],
944
+ "draft_sampler": {
945
+ "exactness_note": "Assistant q may differ from target p; MTPLX remains exact because acceptance uses p/q and rejection samples the residual distribution.",
946
+ "inherits_target_sampler": true,
947
+ "temperature": null,
948
+ "top_k": null,
949
+ "top_p": null
950
+ },
951
+ "fan_restore": {
952
+ "after": {
953
+ "actual_max_rpm": 7823,
954
+ "actual_min_rpm": 7234,
955
+ "capacity_max_rpm": 7826,
956
+ "capacity_min_rpm": 7826,
957
+ "fans": [
958
+ {
959
+ "actual_rpm": 7234,
960
+ "max_capacity_rpm": 7826,
961
+ "mode": "auto",
962
+ "raw": {
963
+ "actual_rpm": 7234,
964
+ "index": 0,
965
+ "max_rpm": 7826,
966
+ "min_rpm": 2317,
967
+ "mode": "auto",
968
+ "target_rpm": 7245
969
+ },
970
+ "rpm": 7234,
971
+ "target_rpm": 7245
972
+ },
973
+ {
974
+ "actual_rpm": 7823,
975
+ "max_capacity_rpm": 7826,
976
+ "mode": "auto",
977
+ "raw": {
978
+ "actual_rpm": 7823,
979
+ "index": 1,
980
+ "max_rpm": 7826,
981
+ "min_rpm": 2317,
982
+ "mode": "auto",
983
+ "target_rpm": 7824
984
+ },
985
+ "rpm": 7823,
986
+ "target_rpm": 7824
987
+ }
988
+ ],
989
+ "max_rpm": 7823,
990
+ "min_rpm": 7234,
991
+ "ok": true,
992
+ "raw": {
993
+ "attempts": [
994
+ {
995
+ "command": [
996
+ "/Users/youssof/.mtplx/bin/thermalforge",
997
+ "status"
998
+ ],
999
+ "ok": true,
1000
+ "returncode": 0,
1001
+ "stderr": "",
1002
+ "stdout": "{\n \"fans\" : [\n {\n \"actual_rpm\" : 7234,\n \"index\" : 0,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"auto\",\n \"target_rpm\" : 7245\n },\n {\n \"actual_rpm\" : 7823,\n \"index\" : 1,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"auto\",\n \"target_rpm\" : 7824\n }\n ],\n \"temperatures\" : {\n \"TAOL\" : 34.4,\n \"TB0T\" : 33,\n \"TCDX\" : 67.7,\n \"TCHP\" : 60.9,\n \"TCMb\" : 75.5,\n \"TG0B\" : 33,\n \"TG0H\" : 33,\n \"TG0V\" : 33,\n \"TH0x\" : 40.4,\n \"TMVR\" : 61.7,\n \"TPDX\" : 67.1,\n \"TRDX\" : 68.2,\n \"TS0P\" : 69.2,\n \"Tg0j\" : 65.9,\n \"Tm08\" : 65,\n \"Tp04\" : 65.4,\n \"Tp08\" : 65.1,\n \"Tp0C\" : 65.1,\n \"Tp0G\" : 65.6,\n \"Tp0X\" : 65.6\n }\n}"
1003
+ }
1004
+ ],
1005
+ "detection": {
1006
+ "available": true,
1007
+ "clock_anchor_enabled": false,
1008
+ "clock_anchor_policy": "explicit experimental only; never used for product claims",
1009
+ "instructions": "Install ThermalForge and ensure the thermalforge CLI is on PATH.",
1010
+ "selected": {
1011
+ "kind": "thermalforge",
1012
+ "path": "/Users/youssof/.mtplx/bin/thermalforge",
1013
+ "version": {
1014
+ "command": [
1015
+ "/Users/youssof/.mtplx/bin/thermalforge",
1016
+ "--version"
1017
+ ],
1018
+ "ok": true,
1019
+ "returncode": 0,
1020
+ "stderr": "",
1021
+ "stdout": "0.1.0"
1022
+ }
1023
+ },
1024
+ "tools": [
1025
+ {
1026
+ "kind": "thermalforge",
1027
+ "path": "/Users/youssof/.mtplx/bin/thermalforge",
1028
+ "version": {
1029
+ "command": [
1030
+ "/Users/youssof/.mtplx/bin/thermalforge",
1031
+ "--version"
1032
+ ],
1033
+ "ok": true,
1034
+ "returncode": 0,
1035
+ "stderr": "",
1036
+ "stdout": "0.1.0"
1037
+ }
1038
+ }
1039
+ ]
1040
+ },
1041
+ "ok": true,
1042
+ "status": {
1043
+ "command": [
1044
+ "/Users/youssof/.mtplx/bin/thermalforge",
1045
+ "status"
1046
+ ],
1047
+ "ok": true,
1048
+ "returncode": 0,
1049
+ "stderr": "",
1050
+ "stdout": "{\n \"fans\" : [\n {\n \"actual_rpm\" : 7234,\n \"index\" : 0,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"auto\",\n \"target_rpm\" : 7245\n },\n {\n \"actual_rpm\" : 7823,\n \"index\" : 1,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"auto\",\n \"target_rpm\" : 7824\n }\n ],\n \"temperatures\" : {\n \"TAOL\" : 34.4,\n \"TB0T\" : 33,\n \"TCDX\" : 67.7,\n \"TCHP\" : 60.9,\n \"TCMb\" : 75.5,\n \"TG0B\" : 33,\n \"TG0H\" : 33,\n \"TG0V\" : 33,\n \"TH0x\" : 40.4,\n \"TMVR\" : 61.7,\n \"TPDX\" : 67.1,\n \"TRDX\" : 68.2,\n \"TS0P\" : 69.2,\n \"Tg0j\" : 65.9,\n \"Tm08\" : 65,\n \"Tp04\" : 65.4,\n \"Tp08\" : 65.1,\n \"Tp0C\" : 65.1,\n \"Tp0G\" : 65.6,\n \"Tp0X\" : 65.6\n }\n}"
1051
+ }
1052
+ },
1053
+ "target_max_rpm": 7824,
1054
+ "target_min_rpm": 7245
1055
+ },
1056
+ "message": "fan restore was attempted but not verified",
1057
+ "ok": false,
1058
+ "profile": "silent",
1059
+ "set_result": {
1060
+ "attempts": [
1061
+ {
1062
+ "command": [
1063
+ "sudo",
1064
+ "-n",
1065
+ "/Users/youssof/.mtplx/bin/thermalforge",
1066
+ "auto"
1067
+ ],
1068
+ "ok": true,
1069
+ "returncode": 0,
1070
+ "stderr": "No matching processes were found",
1071
+ "stdout": "Fans reset to Apple defaults"
1072
+ }
1073
+ ],
1074
+ "command": [
1075
+ "sudo",
1076
+ "-n",
1077
+ "/Users/youssof/.mtplx/bin/thermalforge",
1078
+ "auto"
1079
+ ],
1080
+ "detection": {
1081
+ "available": true,
1082
+ "clock_anchor_enabled": false,
1083
+ "clock_anchor_policy": "explicit experimental only; never used for product claims",
1084
+ "instructions": "Install ThermalForge and ensure the thermalforge CLI is on PATH.",
1085
+ "selected": {
1086
+ "kind": "thermalforge",
1087
+ "path": "/Users/youssof/.mtplx/bin/thermalforge",
1088
+ "version": {
1089
+ "command": [
1090
+ "/Users/youssof/.mtplx/bin/thermalforge",
1091
+ "--version"
1092
+ ],
1093
+ "ok": true,
1094
+ "returncode": 0,
1095
+ "stderr": "",
1096
+ "stdout": "0.1.0"
1097
+ }
1098
+ },
1099
+ "tools": [
1100
+ {
1101
+ "kind": "thermalforge",
1102
+ "path": "/Users/youssof/.mtplx/bin/thermalforge",
1103
+ "version": {
1104
+ "command": [
1105
+ "/Users/youssof/.mtplx/bin/thermalforge",
1106
+ "--version"
1107
+ ],
1108
+ "ok": true,
1109
+ "returncode": 0,
1110
+ "stderr": "",
1111
+ "stdout": "0.1.0"
1112
+ }
1113
+ }
1114
+ ]
1115
+ },
1116
+ "dry_run": false,
1117
+ "ok": true,
1118
+ "profile": "silent"
1119
+ }
1120
+ },
1121
+ "fanmax": {
1122
+ "after": {
1123
+ "actual_max_rpm": 7424,
1124
+ "actual_min_rpm": 7357,
1125
+ "capacity_max_rpm": 7826,
1126
+ "capacity_min_rpm": 7826,
1127
+ "fans": [
1128
+ {
1129
+ "actual_rpm": 7357,
1130
+ "max_capacity_rpm": 7826,
1131
+ "mode": "manual",
1132
+ "raw": {
1133
+ "actual_rpm": 7357,
1134
+ "index": 0,
1135
+ "max_rpm": 7826,
1136
+ "min_rpm": 2317,
1137
+ "mode": "manual",
1138
+ "target_rpm": 7826
1139
+ },
1140
+ "rpm": 7357,
1141
+ "target_rpm": 7826
1142
+ },
1143
+ {
1144
+ "actual_rpm": 7424,
1145
+ "max_capacity_rpm": 7826,
1146
+ "mode": "manual",
1147
+ "raw": {
1148
+ "actual_rpm": 7424,
1149
+ "index": 1,
1150
+ "max_rpm": 7826,
1151
+ "min_rpm": 2317,
1152
+ "mode": "manual",
1153
+ "target_rpm": 7826
1154
+ },
1155
+ "rpm": 7424,
1156
+ "target_rpm": 7826
1157
+ }
1158
+ ],
1159
+ "max_rpm": 7424,
1160
+ "min_rpm": 7357,
1161
+ "ok": true,
1162
+ "raw": {
1163
+ "attempts": [
1164
+ {
1165
+ "command": [
1166
+ "/Users/youssof/.mtplx/bin/thermalforge",
1167
+ "status"
1168
+ ],
1169
+ "ok": true,
1170
+ "returncode": 0,
1171
+ "stderr": "",
1172
+ "stdout": "{\n \"fans\" : [\n {\n \"actual_rpm\" : 7357,\n \"index\" : 0,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"manual\",\n \"target_rpm\" : 7826\n },\n {\n \"actual_rpm\" : 7424,\n \"index\" : 1,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"manual\",\n \"target_rpm\" : 7826\n }\n ],\n \"temperatures\" : {\n \"TAOL\" : 34.7,\n \"TB0T\" : 33,\n \"TCDX\" : 66.4,\n \"TCHP\" : 59.5,\n \"TCMb\" : 73.7,\n \"TG0B\" : 33,\n \"TG0H\" : 33,\n \"TG0V\" : 33,\n \"TH0x\" : 39.1,\n \"TMVR\" : 59,\n \"TPDX\" : 64,\n \"TRDX\" : 66.9,\n \"TS0P\" : 66,\n \"Tg0j\" : 64.4,\n \"Tm08\" : 63.9,\n \"Tp04\" : 64.4,\n \"Tp08\" : 64.1,\n \"Tp0C\" : 64.3,\n \"Tp0G\" : 64.8,\n \"Tp0X\" : 64.4\n }\n}"
1173
+ }
1174
+ ],
1175
+ "detection": {
1176
+ "available": true,
1177
+ "clock_anchor_enabled": false,
1178
+ "clock_anchor_policy": "explicit experimental only; never used for product claims",
1179
+ "instructions": "Install ThermalForge and ensure the thermalforge CLI is on PATH.",
1180
+ "selected": {
1181
+ "kind": "thermalforge",
1182
+ "path": "/Users/youssof/.mtplx/bin/thermalforge",
1183
+ "version": {
1184
+ "command": [
1185
+ "/Users/youssof/.mtplx/bin/thermalforge",
1186
+ "--version"
1187
+ ],
1188
+ "ok": true,
1189
+ "returncode": 0,
1190
+ "stderr": "",
1191
+ "stdout": "0.1.0"
1192
+ }
1193
+ },
1194
+ "tools": [
1195
+ {
1196
+ "kind": "thermalforge",
1197
+ "path": "/Users/youssof/.mtplx/bin/thermalforge",
1198
+ "version": {
1199
+ "command": [
1200
+ "/Users/youssof/.mtplx/bin/thermalforge",
1201
+ "--version"
1202
+ ],
1203
+ "ok": true,
1204
+ "returncode": 0,
1205
+ "stderr": "",
1206
+ "stdout": "0.1.0"
1207
+ }
1208
+ }
1209
+ ]
1210
+ },
1211
+ "ok": true,
1212
+ "status": {
1213
+ "command": [
1214
+ "/Users/youssof/.mtplx/bin/thermalforge",
1215
+ "status"
1216
+ ],
1217
+ "ok": true,
1218
+ "returncode": 0,
1219
+ "stderr": "",
1220
+ "stdout": "{\n \"fans\" : [\n {\n \"actual_rpm\" : 7357,\n \"index\" : 0,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"manual\",\n \"target_rpm\" : 7826\n },\n {\n \"actual_rpm\" : 7424,\n \"index\" : 1,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"manual\",\n \"target_rpm\" : 7826\n }\n ],\n \"temperatures\" : {\n \"TAOL\" : 34.7,\n \"TB0T\" : 33,\n \"TCDX\" : 66.4,\n \"TCHP\" : 59.5,\n \"TCMb\" : 73.7,\n \"TG0B\" : 33,\n \"TG0H\" : 33,\n \"TG0V\" : 33,\n \"TH0x\" : 39.1,\n \"TMVR\" : 59,\n \"TPDX\" : 64,\n \"TRDX\" : 66.9,\n \"TS0P\" : 66,\n \"Tg0j\" : 64.4,\n \"Tm08\" : 63.9,\n \"Tp04\" : 64.4,\n \"Tp08\" : 64.1,\n \"Tp0C\" : 64.3,\n \"Tp0G\" : 64.8,\n \"Tp0X\" : 64.4\n }\n}"
1221
+ }
1222
+ },
1223
+ "target_max_rpm": 7826,
1224
+ "target_min_rpm": 7826
1225
+ },
1226
+ "baseline": {
1227
+ "actual_max_rpm": 6020,
1228
+ "actual_min_rpm": 5580,
1229
+ "capacity_max_rpm": 7826,
1230
+ "capacity_min_rpm": 7826,
1231
+ "fans": [
1232
+ {
1233
+ "actual_rpm": 5580,
1234
+ "max_capacity_rpm": 7826,
1235
+ "mode": "auto",
1236
+ "raw": {
1237
+ "actual_rpm": 5580,
1238
+ "index": 0,
1239
+ "max_rpm": 7826,
1240
+ "min_rpm": 2317,
1241
+ "mode": "auto",
1242
+ "target_rpm": 5575
1243
+ },
1244
+ "rpm": 5580,
1245
+ "target_rpm": 5575
1246
+ },
1247
+ {
1248
+ "actual_rpm": 6020,
1249
+ "max_capacity_rpm": 7826,
1250
+ "mode": "auto",
1251
+ "raw": {
1252
+ "actual_rpm": 6020,
1253
+ "index": 1,
1254
+ "max_rpm": 7826,
1255
+ "min_rpm": 2317,
1256
+ "mode": "auto",
1257
+ "target_rpm": 6021
1258
+ },
1259
+ "rpm": 6020,
1260
+ "target_rpm": 6021
1261
+ }
1262
+ ],
1263
+ "max_rpm": 6020,
1264
+ "min_rpm": 5580,
1265
+ "ok": true,
1266
+ "raw": {
1267
+ "attempts": [
1268
+ {
1269
+ "command": [
1270
+ "/Users/youssof/.mtplx/bin/thermalforge",
1271
+ "status"
1272
+ ],
1273
+ "ok": true,
1274
+ "returncode": 0,
1275
+ "stderr": "",
1276
+ "stdout": "{\n \"fans\" : [\n {\n \"actual_rpm\" : 5580,\n \"index\" : 0,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"auto\",\n \"target_rpm\" : 5575\n },\n {\n \"actual_rpm\" : 6020,\n \"index\" : 1,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"auto\",\n \"target_rpm\" : 6021\n }\n ],\n \"temperatures\" : {\n \"TAOL\" : 34.6,\n \"TB0T\" : 33,\n \"TCDX\" : 66.6,\n \"TCHP\" : 59.7,\n \"TCMb\" : 74.3,\n \"TG0B\" : 33,\n \"TG0H\" : 33,\n \"TG0V\" : 33,\n \"TH0x\" : 39.1,\n \"TMVR\" : 59.5,\n \"TPDX\" : 64.9,\n \"TRDX\" : 67.6,\n \"TS0P\" : 67,\n \"Tg0j\" : 64.9,\n \"Tm08\" : 64.5,\n \"Tp04\" : 64.3,\n \"Tp08\" : 64,\n \"Tp0C\" : 63.9,\n \"Tp0G\" : 64.7,\n \"Tp0X\" : 64.5\n }\n}"
1277
+ }
1278
+ ],
1279
+ "detection": {
1280
+ "available": true,
1281
+ "clock_anchor_enabled": false,
1282
+ "clock_anchor_policy": "explicit experimental only; never used for product claims",
1283
+ "instructions": "Install ThermalForge and ensure the thermalforge CLI is on PATH.",
1284
+ "selected": {
1285
+ "kind": "thermalforge",
1286
+ "path": "/Users/youssof/.mtplx/bin/thermalforge",
1287
+ "version": {
1288
+ "command": [
1289
+ "/Users/youssof/.mtplx/bin/thermalforge",
1290
+ "--version"
1291
+ ],
1292
+ "ok": true,
1293
+ "returncode": 0,
1294
+ "stderr": "",
1295
+ "stdout": "0.1.0"
1296
+ }
1297
+ },
1298
+ "tools": [
1299
+ {
1300
+ "kind": "thermalforge",
1301
+ "path": "/Users/youssof/.mtplx/bin/thermalforge",
1302
+ "version": {
1303
+ "command": [
1304
+ "/Users/youssof/.mtplx/bin/thermalforge",
1305
+ "--version"
1306
+ ],
1307
+ "ok": true,
1308
+ "returncode": 0,
1309
+ "stderr": "",
1310
+ "stdout": "0.1.0"
1311
+ }
1312
+ }
1313
+ ]
1314
+ },
1315
+ "ok": true,
1316
+ "status": {
1317
+ "command": [
1318
+ "/Users/youssof/.mtplx/bin/thermalforge",
1319
+ "status"
1320
+ ],
1321
+ "ok": true,
1322
+ "returncode": 0,
1323
+ "stderr": "",
1324
+ "stdout": "{\n \"fans\" : [\n {\n \"actual_rpm\" : 5580,\n \"index\" : 0,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"auto\",\n \"target_rpm\" : 5575\n },\n {\n \"actual_rpm\" : 6020,\n \"index\" : 1,\n \"max_rpm\" : 7826,\n \"min_rpm\" : 2317,\n \"mode\" : \"auto\",\n \"target_rpm\" : 6021\n }\n ],\n \"temperatures\" : {\n \"TAOL\" : 34.6,\n \"TB0T\" : 33,\n \"TCDX\" : 66.6,\n \"TCHP\" : 59.7,\n \"TCMb\" : 74.3,\n \"TG0B\" : 33,\n \"TG0H\" : 33,\n \"TG0V\" : 33,\n \"TH0x\" : 39.1,\n \"TMVR\" : 59.5,\n \"TPDX\" : 64.9,\n \"TRDX\" : 67.6,\n \"TS0P\" : 67,\n \"Tg0j\" : 64.9,\n \"Tm08\" : 64.5,\n \"Tp04\" : 64.3,\n \"Tp08\" : 64,\n \"Tp0C\" : 63.9,\n \"Tp0G\" : 64.7,\n \"Tp0X\" : 64.5\n }\n}"
1325
+ }
1326
+ },
1327
+ "target_max_rpm": 6021,
1328
+ "target_min_rpm": 5575
1329
+ },
1330
+ "message": "fans ramped to max (actual 7357-7424 RPM; target 7826 RPM)",
1331
+ "ok": true,
1332
+ "profile": "max",
1333
+ "set_result": {
1334
+ "attempts": [
1335
+ {
1336
+ "command": [
1337
+ "sudo",
1338
+ "-n",
1339
+ "/Users/youssof/.mtplx/bin/thermalforge",
1340
+ "max"
1341
+ ],
1342
+ "ok": true,
1343
+ "returncode": 0,
1344
+ "stderr": "",
1345
+ "stdout": "Fan 0: 5580 RPM \u2192 max (7826 RPM)\nFan 1: 6020 RPM \u2192 max (7826 RPM)"
1346
+ }
1347
+ ],
1348
+ "command": [
1349
+ "sudo",
1350
+ "-n",
1351
+ "/Users/youssof/.mtplx/bin/thermalforge",
1352
+ "max"
1353
+ ],
1354
+ "detection": {
1355
+ "available": true,
1356
+ "clock_anchor_enabled": false,
1357
+ "clock_anchor_policy": "explicit experimental only; never used for product claims",
1358
+ "instructions": "Install ThermalForge and ensure the thermalforge CLI is on PATH.",
1359
+ "selected": {
1360
+ "kind": "thermalforge",
1361
+ "path": "/Users/youssof/.mtplx/bin/thermalforge",
1362
+ "version": {
1363
+ "command": [
1364
+ "/Users/youssof/.mtplx/bin/thermalforge",
1365
+ "--version"
1366
+ ],
1367
+ "ok": true,
1368
+ "returncode": 0,
1369
+ "stderr": "",
1370
+ "stdout": "0.1.0"
1371
+ }
1372
+ },
1373
+ "tools": [
1374
+ {
1375
+ "kind": "thermalforge",
1376
+ "path": "/Users/youssof/.mtplx/bin/thermalforge",
1377
+ "version": {
1378
+ "command": [
1379
+ "/Users/youssof/.mtplx/bin/thermalforge",
1380
+ "--version"
1381
+ ],
1382
+ "ok": true,
1383
+ "returncode": 0,
1384
+ "stderr": "",
1385
+ "stdout": "0.1.0"
1386
+ }
1387
+ }
1388
+ ]
1389
+ },
1390
+ "dry_run": false,
1391
+ "ok": true,
1392
+ "profile": "max"
1393
+ }
1394
+ },
1395
+ "max_tokens": 1000,
1396
+ "median_confirmation_speedup": 2.5439134852185834,
1397
+ "memory_gate": {
1398
+ "ar_peak_memory_gib": 31.80895305145532,
1399
+ "lte_ar_plus_6_gib": true,
1400
+ "lte_ar_times_1_18": true,
1401
+ "mtp_peak_memory_gib": 31.937621283344924
1402
+ },
1403
+ "mtp_by_block_size": {
1404
+ "3": {
1405
+ "acceptance": 0.9970059880239521,
1406
+ "accepted_drafts": 666,
1407
+ "active_memory_gib": 31.813567525707185,
1408
+ "block_size": 3,
1409
+ "cache_memory_gib": 82.18594096973538,
1410
+ "decode_s": 100.14749887499784,
1411
+ "draft_sampler": {
1412
+ "temperature": 1.0,
1413
+ "top_k": 64,
1414
+ "top_p": 0.95
1415
+ },
1416
+ "drafted_tokens": 668,
1417
+ "generated_tokens": 1000,
1418
+ "mode": "mtp",
1419
+ "peak_memory_gib": 31.90066777355969,
1420
+ "prefill_s": 0.4200237909972202,
1421
+ "row_distribution_evals": 0,
1422
+ "speedup_vs_ar": 0.7270302993175131,
1423
+ "target_distribution_modes": {
1424
+ "batched_logits": 334
1425
+ },
1426
+ "target_sampler": {
1427
+ "temperature": 1.0,
1428
+ "top_k": 64,
1429
+ "top_p": 0.95
1430
+ },
1431
+ "telemetry": {
1432
+ "ar_dense_fallback_calls": 0,
1433
+ "decode_dense_fallback_calls": 0,
1434
+ "dense_fallback_calls_by_phase": {
1435
+ "ar_decode": 0,
1436
+ "decode_verify": 0,
1437
+ "postcommit": 0,
1438
+ "prefill": 0,
1439
+ "unknown": 0
1440
+ },
1441
+ "events": [],
1442
+ "paged_active_array_calls_by_phase": {
1443
+ "ar_decode": 0,
1444
+ "decode_verify": 0,
1445
+ "postcommit": 0,
1446
+ "prefill": 0,
1447
+ "unknown": 0
1448
+ },
1449
+ "paged_attention_bailouts_by_phase_reason": {
1450
+ "ar_decode": {
1451
+ "batch_not_1": 0,
1452
+ "block_size_mismatch": 0,
1453
+ "blocks_invalid": 0,
1454
+ "dtype_unsupported": 0,
1455
+ "empty_cache": 0,
1456
+ "head_dim_unsupported": 0,
1457
+ "kernel_unavailable": 0,
1458
+ "offset_invalid": 0,
1459
+ "partitioned_invalid_output": 0,
1460
+ "partitioned_unavailable": 0,
1461
+ "q_len_gt_max": 0,
1462
+ "q_len_invalid": 0,
1463
+ "turboquant_unsupported": 0,
1464
+ "unknown": 0,
1465
+ "unsupported_mask": 0
1466
+ },
1467
+ "decode_verify": {
1468
+ "batch_not_1": 0,
1469
+ "block_size_mismatch": 0,
1470
+ "blocks_invalid": 0,
1471
+ "dtype_unsupported": 0,
1472
+ "empty_cache": 0,
1473
+ "head_dim_unsupported": 0,
1474
+ "kernel_unavailable": 0,
1475
+ "offset_invalid": 0,
1476
+ "partitioned_invalid_output": 0,
1477
+ "partitioned_unavailable": 0,
1478
+ "q_len_gt_max": 0,
1479
+ "q_len_invalid": 0,
1480
+ "turboquant_unsupported": 0,
1481
+ "unknown": 0,
1482
+ "unsupported_mask": 0
1483
+ },
1484
+ "postcommit": {
1485
+ "batch_not_1": 0,
1486
+ "block_size_mismatch": 0,
1487
+ "blocks_invalid": 0,
1488
+ "dtype_unsupported": 0,
1489
+ "empty_cache": 0,
1490
+ "head_dim_unsupported": 0,
1491
+ "kernel_unavailable": 0,
1492
+ "offset_invalid": 0,
1493
+ "partitioned_invalid_output": 0,
1494
+ "partitioned_unavailable": 0,
1495
+ "q_len_gt_max": 0,
1496
+ "q_len_invalid": 0,
1497
+ "turboquant_unsupported": 0,
1498
+ "unknown": 0,
1499
+ "unsupported_mask": 0
1500
+ },
1501
+ "prefill": {
1502
+ "batch_not_1": 0,
1503
+ "block_size_mismatch": 0,
1504
+ "blocks_invalid": 0,
1505
+ "dtype_unsupported": 0,
1506
+ "empty_cache": 0,
1507
+ "head_dim_unsupported": 0,
1508
+ "kernel_unavailable": 0,
1509
+ "offset_invalid": 0,
1510
+ "partitioned_invalid_output": 0,
1511
+ "partitioned_unavailable": 0,
1512
+ "q_len_gt_max": 0,
1513
+ "q_len_invalid": 0,
1514
+ "turboquant_unsupported": 0,
1515
+ "unknown": 0,
1516
+ "unsupported_mask": 0
1517
+ },
1518
+ "unknown": {
1519
+ "batch_not_1": 0,
1520
+ "block_size_mismatch": 0,
1521
+ "blocks_invalid": 0,
1522
+ "dtype_unsupported": 0,
1523
+ "empty_cache": 0,
1524
+ "head_dim_unsupported": 0,
1525
+ "kernel_unavailable": 0,
1526
+ "offset_invalid": 0,
1527
+ "partitioned_invalid_output": 0,
1528
+ "partitioned_unavailable": 0,
1529
+ "q_len_gt_max": 0,
1530
+ "q_len_invalid": 0,
1531
+ "turboquant_unsupported": 0,
1532
+ "unknown": 0,
1533
+ "unsupported_mask": 0
1534
+ }
1535
+ },
1536
+ "paged_attention_large_q_path": {
1537
+ "dense_forbidden": 0,
1538
+ "large_q_split_sdpa_fallback": 0,
1539
+ "partitioned_paged": 0,
1540
+ "tail_paged": 0,
1541
+ "unknown": 0
1542
+ },
1543
+ "postcommit_dense_fallback_calls": 0,
1544
+ "prefill_dense_fallback_calls": 0,
1545
+ "trace_events": false
1546
+ },
1547
+ "timing_s": {
1548
+ "accept": 0.02364034196943976,
1549
+ "draft": 2.0620414960430935,
1550
+ "rollback": 0.00019853397679980844,
1551
+ "target_distribution": 97.66836021310883,
1552
+ "target_hidden": 0.0,
1553
+ "verify": 0.3714816799911205
1554
+ },
1555
+ "tok_s": 9.985271836375869,
1556
+ "token_preview": [
1557
+ 9996,
1558
+ 625,
1559
+ 24731,
1560
+ 236761,
1561
+ 9996,
1562
+ 625,
1563
+ 24731,
1564
+ 236761,
1565
+ 9996,
1566
+ 625,
1567
+ 24731,
1568
+ 236761,
1569
+ 9996,
1570
+ 625,
1571
+ 24731,
1572
+ 236761
1573
+ ],
1574
+ "verify_calls": 334
1575
+ },
1576
+ "4": {
1577
+ "acceptance": 1.0,
1578
+ "accepted_drafts": 750,
1579
+ "active_memory_gib": 31.815169698558748,
1580
+ "block_size": 4,
1581
+ "cache_memory_gib": 82.184341263026,
1582
+ "decode_s": 88.162400583009,
1583
+ "draft_sampler": {
1584
+ "temperature": 1.0,
1585
+ "top_k": 64,
1586
+ "top_p": 0.95
1587
+ },
1588
+ "drafted_tokens": 750,
1589
+ "generated_tokens": 1000,
1590
+ "mode": "mtp",
1591
+ "peak_memory_gib": 31.913927708752453,
1592
+ "prefill_s": 0.45891812500485685,
1593
+ "row_distribution_evals": 0,
1594
+ "speedup_vs_ar": 0.8258652849911423,
1595
+ "target_distribution_modes": {
1596
+ "batched_logits": 250
1597
+ },
1598
+ "target_sampler": {
1599
+ "temperature": 1.0,
1600
+ "top_k": 64,
1601
+ "top_p": 0.95
1602
+ },
1603
+ "telemetry": {
1604
+ "ar_dense_fallback_calls": 0,
1605
+ "decode_dense_fallback_calls": 0,
1606
+ "dense_fallback_calls_by_phase": {
1607
+ "ar_decode": 0,
1608
+ "decode_verify": 0,
1609
+ "postcommit": 0,
1610
+ "prefill": 0,
1611
+ "unknown": 0
1612
+ },
1613
+ "events": [],
1614
+ "paged_active_array_calls_by_phase": {
1615
+ "ar_decode": 0,
1616
+ "decode_verify": 0,
1617
+ "postcommit": 0,
1618
+ "prefill": 0,
1619
+ "unknown": 0
1620
+ },
1621
+ "paged_attention_bailouts_by_phase_reason": {
1622
+ "ar_decode": {
1623
+ "batch_not_1": 0,
1624
+ "block_size_mismatch": 0,
1625
+ "blocks_invalid": 0,
1626
+ "dtype_unsupported": 0,
1627
+ "empty_cache": 0,
1628
+ "head_dim_unsupported": 0,
1629
+ "kernel_unavailable": 0,
1630
+ "offset_invalid": 0,
1631
+ "partitioned_invalid_output": 0,
1632
+ "partitioned_unavailable": 0,
1633
+ "q_len_gt_max": 0,
1634
+ "q_len_invalid": 0,
1635
+ "turboquant_unsupported": 0,
1636
+ "unknown": 0,
1637
+ "unsupported_mask": 0
1638
+ },
1639
+ "decode_verify": {
1640
+ "batch_not_1": 0,
1641
+ "block_size_mismatch": 0,
1642
+ "blocks_invalid": 0,
1643
+ "dtype_unsupported": 0,
1644
+ "empty_cache": 0,
1645
+ "head_dim_unsupported": 0,
1646
+ "kernel_unavailable": 0,
1647
+ "offset_invalid": 0,
1648
+ "partitioned_invalid_output": 0,
1649
+ "partitioned_unavailable": 0,
1650
+ "q_len_gt_max": 0,
1651
+ "q_len_invalid": 0,
1652
+ "turboquant_unsupported": 0,
1653
+ "unknown": 0,
1654
+ "unsupported_mask": 0
1655
+ },
1656
+ "postcommit": {
1657
+ "batch_not_1": 0,
1658
+ "block_size_mismatch": 0,
1659
+ "blocks_invalid": 0,
1660
+ "dtype_unsupported": 0,
1661
+ "empty_cache": 0,
1662
+ "head_dim_unsupported": 0,
1663
+ "kernel_unavailable": 0,
1664
+ "offset_invalid": 0,
1665
+ "partitioned_invalid_output": 0,
1666
+ "partitioned_unavailable": 0,
1667
+ "q_len_gt_max": 0,
1668
+ "q_len_invalid": 0,
1669
+ "turboquant_unsupported": 0,
1670
+ "unknown": 0,
1671
+ "unsupported_mask": 0
1672
+ },
1673
+ "prefill": {
1674
+ "batch_not_1": 0,
1675
+ "block_size_mismatch": 0,
1676
+ "blocks_invalid": 0,
1677
+ "dtype_unsupported": 0,
1678
+ "empty_cache": 0,
1679
+ "head_dim_unsupported": 0,
1680
+ "kernel_unavailable": 0,
1681
+ "offset_invalid": 0,
1682
+ "partitioned_invalid_output": 0,
1683
+ "partitioned_unavailable": 0,
1684
+ "q_len_gt_max": 0,
1685
+ "q_len_invalid": 0,
1686
+ "turboquant_unsupported": 0,
1687
+ "unknown": 0,
1688
+ "unsupported_mask": 0
1689
+ },
1690
+ "unknown": {
1691
+ "batch_not_1": 0,
1692
+ "block_size_mismatch": 0,
1693
+ "blocks_invalid": 0,
1694
+ "dtype_unsupported": 0,
1695
+ "empty_cache": 0,
1696
+ "head_dim_unsupported": 0,
1697
+ "kernel_unavailable": 0,
1698
+ "offset_invalid": 0,
1699
+ "partitioned_invalid_output": 0,
1700
+ "partitioned_unavailable": 0,
1701
+ "q_len_gt_max": 0,
1702
+ "q_len_invalid": 0,
1703
+ "turboquant_unsupported": 0,
1704
+ "unknown": 0,
1705
+ "unsupported_mask": 0
1706
+ }
1707
+ },
1708
+ "paged_attention_large_q_path": {
1709
+ "dense_forbidden": 0,
1710
+ "large_q_split_sdpa_fallback": 0,
1711
+ "partitioned_paged": 0,
1712
+ "tail_paged": 0,
1713
+ "unknown": 0
1714
+ },
1715
+ "postcommit_dense_fallback_calls": 0,
1716
+ "prefill_dense_fallback_calls": 0,
1717
+ "trace_events": false
1718
+ },
1719
+ "timing_s": {
1720
+ "accept": 0.012636876970645972,
1721
+ "draft": 2.101732614086359,
1722
+ "rollback": 0.00013187690638005733,
1723
+ "target_distribution": 85.75181395103573,
1724
+ "target_hidden": 0.0,
1725
+ "verify": 0.27962454402586445
1726
+ },
1727
+ "tok_s": 11.342703844122909,
1728
+ "token_preview": [
1729
+ 9996,
1730
+ 625,
1731
+ 24731,
1732
+ 236761,
1733
+ 9996,
1734
+ 625,
1735
+ 24731,
1736
+ 236761,
1737
+ 9996,
1738
+ 625,
1739
+ 24731,
1740
+ 236761,
1741
+ 9996,
1742
+ 625,
1743
+ 24731,
1744
+ 236761
1745
+ ],
1746
+ "verify_calls": 250
1747
+ },
1748
+ "5": {
1749
+ "acceptance": 0.99875,
1750
+ "accepted_drafts": 799,
1751
+ "active_memory_gib": 31.814849263988435,
1752
+ "block_size": 5,
1753
+ "cache_memory_gib": 79.50701175443828,
1754
+ "decode_s": 73.88629216598929,
1755
+ "draft_sampler": {
1756
+ "temperature": 1.0,
1757
+ "top_k": 64,
1758
+ "top_p": 0.95
1759
+ },
1760
+ "drafted_tokens": 800,
1761
+ "generated_tokens": 1000,
1762
+ "mode": "mtp",
1763
+ "peak_memory_gib": 31.925461772829294,
1764
+ "prefill_s": 0.5138492080004653,
1765
+ "row_distribution_evals": 0,
1766
+ "speedup_vs_ar": 0.9854367291759348,
1767
+ "target_distribution_modes": {
1768
+ "batched_logits": 200
1769
+ },
1770
+ "target_sampler": {
1771
+ "temperature": 1.0,
1772
+ "top_k": 64,
1773
+ "top_p": 0.95
1774
+ },
1775
+ "telemetry": {
1776
+ "ar_dense_fallback_calls": 0,
1777
+ "decode_dense_fallback_calls": 0,
1778
+ "dense_fallback_calls_by_phase": {
1779
+ "ar_decode": 0,
1780
+ "decode_verify": 0,
1781
+ "postcommit": 0,
1782
+ "prefill": 0,
1783
+ "unknown": 0
1784
+ },
1785
+ "events": [],
1786
+ "paged_active_array_calls_by_phase": {
1787
+ "ar_decode": 0,
1788
+ "decode_verify": 0,
1789
+ "postcommit": 0,
1790
+ "prefill": 0,
1791
+ "unknown": 0
1792
+ },
1793
+ "paged_attention_bailouts_by_phase_reason": {
1794
+ "ar_decode": {
1795
+ "batch_not_1": 0,
1796
+ "block_size_mismatch": 0,
1797
+ "blocks_invalid": 0,
1798
+ "dtype_unsupported": 0,
1799
+ "empty_cache": 0,
1800
+ "head_dim_unsupported": 0,
1801
+ "kernel_unavailable": 0,
1802
+ "offset_invalid": 0,
1803
+ "partitioned_invalid_output": 0,
1804
+ "partitioned_unavailable": 0,
1805
+ "q_len_gt_max": 0,
1806
+ "q_len_invalid": 0,
1807
+ "turboquant_unsupported": 0,
1808
+ "unknown": 0,
1809
+ "unsupported_mask": 0
1810
+ },
1811
+ "decode_verify": {
1812
+ "batch_not_1": 0,
1813
+ "block_size_mismatch": 0,
1814
+ "blocks_invalid": 0,
1815
+ "dtype_unsupported": 0,
1816
+ "empty_cache": 0,
1817
+ "head_dim_unsupported": 0,
1818
+ "kernel_unavailable": 0,
1819
+ "offset_invalid": 0,
1820
+ "partitioned_invalid_output": 0,
1821
+ "partitioned_unavailable": 0,
1822
+ "q_len_gt_max": 0,
1823
+ "q_len_invalid": 0,
1824
+ "turboquant_unsupported": 0,
1825
+ "unknown": 0,
1826
+ "unsupported_mask": 0
1827
+ },
1828
+ "postcommit": {
1829
+ "batch_not_1": 0,
1830
+ "block_size_mismatch": 0,
1831
+ "blocks_invalid": 0,
1832
+ "dtype_unsupported": 0,
1833
+ "empty_cache": 0,
1834
+ "head_dim_unsupported": 0,
1835
+ "kernel_unavailable": 0,
1836
+ "offset_invalid": 0,
1837
+ "partitioned_invalid_output": 0,
1838
+ "partitioned_unavailable": 0,
1839
+ "q_len_gt_max": 0,
1840
+ "q_len_invalid": 0,
1841
+ "turboquant_unsupported": 0,
1842
+ "unknown": 0,
1843
+ "unsupported_mask": 0
1844
+ },
1845
+ "prefill": {
1846
+ "batch_not_1": 0,
1847
+ "block_size_mismatch": 0,
1848
+ "blocks_invalid": 0,
1849
+ "dtype_unsupported": 0,
1850
+ "empty_cache": 0,
1851
+ "head_dim_unsupported": 0,
1852
+ "kernel_unavailable": 0,
1853
+ "offset_invalid": 0,
1854
+ "partitioned_invalid_output": 0,
1855
+ "partitioned_unavailable": 0,
1856
+ "q_len_gt_max": 0,
1857
+ "q_len_invalid": 0,
1858
+ "turboquant_unsupported": 0,
1859
+ "unknown": 0,
1860
+ "unsupported_mask": 0
1861
+ },
1862
+ "unknown": {
1863
+ "batch_not_1": 0,
1864
+ "block_size_mismatch": 0,
1865
+ "blocks_invalid": 0,
1866
+ "dtype_unsupported": 0,
1867
+ "empty_cache": 0,
1868
+ "head_dim_unsupported": 0,
1869
+ "kernel_unavailable": 0,
1870
+ "offset_invalid": 0,
1871
+ "partitioned_invalid_output": 0,
1872
+ "partitioned_unavailable": 0,
1873
+ "q_len_gt_max": 0,
1874
+ "q_len_invalid": 0,
1875
+ "turboquant_unsupported": 0,
1876
+ "unknown": 0,
1877
+ "unsupported_mask": 0
1878
+ }
1879
+ },
1880
+ "paged_attention_large_q_path": {
1881
+ "dense_forbidden": 0,
1882
+ "large_q_split_sdpa_fallback": 0,
1883
+ "partitioned_paged": 0,
1884
+ "tail_paged": 0,
1885
+ "unknown": 0
1886
+ },
1887
+ "postcommit_dense_fallback_calls": 0,
1888
+ "prefill_dense_fallback_calls": 0,
1889
+ "trace_events": false
1890
+ },
1891
+ "timing_s": {
1892
+ "accept": 0.011033208094886504,
1893
+ "draft": 1.9875367559725419,
1894
+ "rollback": 0.0001222430873895064,
1895
+ "target_distribution": 71.65346937690629,
1896
+ "target_hidden": 0.0,
1897
+ "verify": 0.22101375696365722
1898
+ },
1899
+ "tok_s": 13.534310231097393,
1900
+ "token_preview": [
1901
+ 9996,
1902
+ 625,
1903
+ 24731,
1904
+ 236761,
1905
+ 9996,
1906
+ 625,
1907
+ 24731,
1908
+ 236761,
1909
+ 9996,
1910
+ 625,
1911
+ 24731,
1912
+ 236761,
1913
+ 9996,
1914
+ 625,
1915
+ 24731,
1916
+ 236761
1917
+ ],
1918
+ "verify_calls": 200
1919
+ },
1920
+ "6": {
1921
+ "acceptance": 0.9976047904191617,
1922
+ "accepted_drafts": 833,
1923
+ "active_memory_gib": 31.81573427375406,
1924
+ "block_size": 6,
1925
+ "cache_memory_gib": 66.49232691712677,
1926
+ "decode_s": 29.219117750006262,
1927
+ "draft_sampler": {
1928
+ "temperature": 1.0,
1929
+ "top_k": 64,
1930
+ "top_p": 0.95
1931
+ },
1932
+ "drafted_tokens": 835,
1933
+ "generated_tokens": 1000,
1934
+ "mode": "mtp",
1935
+ "peak_memory_gib": 31.937621283344924,
1936
+ "prefill_s": 0.584632291996968,
1937
+ "row_distribution_evals": 0,
1938
+ "speedup_vs_ar": 2.491870791785778,
1939
+ "target_distribution_modes": {
1940
+ "batched_logits": 167
1941
+ },
1942
+ "target_sampler": {
1943
+ "temperature": 1.0,
1944
+ "top_k": 64,
1945
+ "top_p": 0.95
1946
+ },
1947
+ "telemetry": {
1948
+ "ar_dense_fallback_calls": 0,
1949
+ "decode_dense_fallback_calls": 0,
1950
+ "dense_fallback_calls_by_phase": {
1951
+ "ar_decode": 0,
1952
+ "decode_verify": 0,
1953
+ "postcommit": 0,
1954
+ "prefill": 0,
1955
+ "unknown": 0
1956
+ },
1957
+ "events": [],
1958
+ "paged_active_array_calls_by_phase": {
1959
+ "ar_decode": 0,
1960
+ "decode_verify": 0,
1961
+ "postcommit": 0,
1962
+ "prefill": 0,
1963
+ "unknown": 0
1964
+ },
1965
+ "paged_attention_bailouts_by_phase_reason": {
1966
+ "ar_decode": {
1967
+ "batch_not_1": 0,
1968
+ "block_size_mismatch": 0,
1969
+ "blocks_invalid": 0,
1970
+ "dtype_unsupported": 0,
1971
+ "empty_cache": 0,
1972
+ "head_dim_unsupported": 0,
1973
+ "kernel_unavailable": 0,
1974
+ "offset_invalid": 0,
1975
+ "partitioned_invalid_output": 0,
1976
+ "partitioned_unavailable": 0,
1977
+ "q_len_gt_max": 0,
1978
+ "q_len_invalid": 0,
1979
+ "turboquant_unsupported": 0,
1980
+ "unknown": 0,
1981
+ "unsupported_mask": 0
1982
+ },
1983
+ "decode_verify": {
1984
+ "batch_not_1": 0,
1985
+ "block_size_mismatch": 0,
1986
+ "blocks_invalid": 0,
1987
+ "dtype_unsupported": 0,
1988
+ "empty_cache": 0,
1989
+ "head_dim_unsupported": 0,
1990
+ "kernel_unavailable": 0,
1991
+ "offset_invalid": 0,
1992
+ "partitioned_invalid_output": 0,
1993
+ "partitioned_unavailable": 0,
1994
+ "q_len_gt_max": 0,
1995
+ "q_len_invalid": 0,
1996
+ "turboquant_unsupported": 0,
1997
+ "unknown": 0,
1998
+ "unsupported_mask": 0
1999
+ },
2000
+ "postcommit": {
2001
+ "batch_not_1": 0,
2002
+ "block_size_mismatch": 0,
2003
+ "blocks_invalid": 0,
2004
+ "dtype_unsupported": 0,
2005
+ "empty_cache": 0,
2006
+ "head_dim_unsupported": 0,
2007
+ "kernel_unavailable": 0,
2008
+ "offset_invalid": 0,
2009
+ "partitioned_invalid_output": 0,
2010
+ "partitioned_unavailable": 0,
2011
+ "q_len_gt_max": 0,
2012
+ "q_len_invalid": 0,
2013
+ "turboquant_unsupported": 0,
2014
+ "unknown": 0,
2015
+ "unsupported_mask": 0
2016
+ },
2017
+ "prefill": {
2018
+ "batch_not_1": 0,
2019
+ "block_size_mismatch": 0,
2020
+ "blocks_invalid": 0,
2021
+ "dtype_unsupported": 0,
2022
+ "empty_cache": 0,
2023
+ "head_dim_unsupported": 0,
2024
+ "kernel_unavailable": 0,
2025
+ "offset_invalid": 0,
2026
+ "partitioned_invalid_output": 0,
2027
+ "partitioned_unavailable": 0,
2028
+ "q_len_gt_max": 0,
2029
+ "q_len_invalid": 0,
2030
+ "turboquant_unsupported": 0,
2031
+ "unknown": 0,
2032
+ "unsupported_mask": 0
2033
+ },
2034
+ "unknown": {
2035
+ "batch_not_1": 0,
2036
+ "block_size_mismatch": 0,
2037
+ "blocks_invalid": 0,
2038
+ "dtype_unsupported": 0,
2039
+ "empty_cache": 0,
2040
+ "head_dim_unsupported": 0,
2041
+ "kernel_unavailable": 0,
2042
+ "offset_invalid": 0,
2043
+ "partitioned_invalid_output": 0,
2044
+ "partitioned_unavailable": 0,
2045
+ "q_len_gt_max": 0,
2046
+ "q_len_invalid": 0,
2047
+ "turboquant_unsupported": 0,
2048
+ "unknown": 0,
2049
+ "unsupported_mask": 0
2050
+ }
2051
+ },
2052
+ "paged_attention_large_q_path": {
2053
+ "dense_forbidden": 0,
2054
+ "large_q_split_sdpa_fallback": 0,
2055
+ "partitioned_paged": 0,
2056
+ "tail_paged": 0,
2057
+ "unknown": 0
2058
+ },
2059
+ "postcommit_dense_fallback_calls": 0,
2060
+ "prefill_dense_fallback_calls": 0,
2061
+ "trace_events": false
2062
+ },
2063
+ "timing_s": {
2064
+ "accept": 0.009512208969681524,
2065
+ "draft": 1.5696202969993465,
2066
+ "rollback": 9.41639591474086e-05,
2067
+ "target_distribution": 27.45037787995534,
2068
+ "target_hidden": 0.0,
2069
+ "verify": 0.17939917097100988
2070
+ },
2071
+ "tok_s": 34.22416818179891,
2072
+ "token_preview": [
2073
+ 9996,
2074
+ 625,
2075
+ 24731,
2076
+ 236761,
2077
+ 9996,
2078
+ 625,
2079
+ 24731,
2080
+ 236761,
2081
+ 9996,
2082
+ 625,
2083
+ 24731,
2084
+ 236761,
2085
+ 9996,
2086
+ 625,
2087
+ 24731,
2088
+ 236761
2089
+ ],
2090
+ "verify_calls": 167
2091
+ }
2092
+ },
2093
+ "passed": true,
2094
+ "prompt_id": "flappy_html5_canvas_game",
2095
+ "prompt_tokens": 119,
2096
+ "sampler": {
2097
+ "seed": 0,
2098
+ "temperature": 1.0,
2099
+ "top_k": 64,
2100
+ "top_p": 0.95
2101
+ }
2102
+ },
2103
+ "status": "passed"
2104
+ }
mtplx_pair.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "format_version": 1,
3
+ "name": "Gemma4 MTPLX Optimized Quality",
4
+ "variant": "quality",
5
+ "layout": {
6
+ "target": "target",
7
+ "assistant": "assistant"
8
+ },
9
+ "source": {
10
+ "target_repo": "google/gemma-4-31B-it",
11
+ "target_revision": "145dc2508c480a64b47242f160d286cff94a2343",
12
+ "assistant_repo": "google/gemma-4-31B-it-assistant",
13
+ "assistant_revision": "cffbbd2cea41ea56a0fa5b0487e0d445121fd204"
14
+ },
15
+ "target": {
16
+ "role": "verifier",
17
+ "model_type": "gemma4",
18
+ "quantization": {
19
+ "bits": 8,
20
+ "group_size": 64,
21
+ "mode": "affine"
22
+ }
23
+ },
24
+ "assistant": {
25
+ "role": "drafter",
26
+ "model_type": "gemma4_assistant",
27
+ "quantization": {
28
+ "bits": 8,
29
+ "group_size": 64,
30
+ "mode": "affine"
31
+ }
32
+ },
33
+ "benchmark": {
34
+ "prompt_suite": "flappy",
35
+ "max_tokens": 1000,
36
+ "temperature": 1.0,
37
+ "top_p": 0.95,
38
+ "top_k": 64,
39
+ "seed": 0,
40
+ "best_block_size": 6,
41
+ "acceptance": {
42
+ "accepted": 833,
43
+ "drafted": 835,
44
+ "ratio": 0.9976047904191617
45
+ },
46
+ "observed_mtp_tok_s": [
47
+ 34.22416818179891,
48
+ 32.87803735799434,
49
+ 33.11645340400705
50
+ ],
51
+ "speedup_vs_ar": 2.491870791785778
52
+ }
53
+ }
target/README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ library_name: mlx
4
+ tags:
5
+ - mlx
6
+ pipeline_tag: text-generation
7
+ ---
target/config.json ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma4ForConditionalGeneration"
4
+ ],
5
+ "audio_config": null,
6
+ "audio_token_id": 258881,
7
+ "boa_token_id": 256000,
8
+ "boi_token_id": 255999,
9
+ "dtype": "bfloat16",
10
+ "eoa_token_id": 258883,
11
+ "eoa_token_index": 258883,
12
+ "eoi_token_id": 258882,
13
+ "eos_token_id": [
14
+ 1,
15
+ 106,
16
+ 50
17
+ ],
18
+ "image_token_id": 258880,
19
+ "initializer_range": 0.02,
20
+ "model_type": "gemma4",
21
+ "quantization": {
22
+ "group_size": 64,
23
+ "bits": 8,
24
+ "mode": "affine"
25
+ },
26
+ "quantization_config": {
27
+ "group_size": 64,
28
+ "bits": 8,
29
+ "mode": "affine"
30
+ },
31
+ "text_config": {
32
+ "attention_bias": false,
33
+ "attention_dropout": 0.0,
34
+ "attention_k_eq_v": true,
35
+ "bos_token_id": 2,
36
+ "dtype": "bfloat16",
37
+ "enable_moe_block": false,
38
+ "eos_token_id": 1,
39
+ "expert_intermediate_size": null,
40
+ "final_logit_softcapping": 30.0,
41
+ "global_head_dim": 512,
42
+ "head_dim": 256,
43
+ "hidden_activation": "gelu_pytorch_tanh",
44
+ "hidden_size": 5376,
45
+ "hidden_size_per_layer_input": 0,
46
+ "initializer_range": 0.02,
47
+ "intermediate_size": 21504,
48
+ "layer_types": [
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "sliding_attention",
53
+ "sliding_attention",
54
+ "full_attention",
55
+ "sliding_attention",
56
+ "sliding_attention",
57
+ "sliding_attention",
58
+ "sliding_attention",
59
+ "sliding_attention",
60
+ "full_attention",
61
+ "sliding_attention",
62
+ "sliding_attention",
63
+ "sliding_attention",
64
+ "sliding_attention",
65
+ "sliding_attention",
66
+ "full_attention",
67
+ "sliding_attention",
68
+ "sliding_attention",
69
+ "sliding_attention",
70
+ "sliding_attention",
71
+ "sliding_attention",
72
+ "full_attention",
73
+ "sliding_attention",
74
+ "sliding_attention",
75
+ "sliding_attention",
76
+ "sliding_attention",
77
+ "sliding_attention",
78
+ "full_attention",
79
+ "sliding_attention",
80
+ "sliding_attention",
81
+ "sliding_attention",
82
+ "sliding_attention",
83
+ "sliding_attention",
84
+ "full_attention",
85
+ "sliding_attention",
86
+ "sliding_attention",
87
+ "sliding_attention",
88
+ "sliding_attention",
89
+ "sliding_attention",
90
+ "full_attention",
91
+ "sliding_attention",
92
+ "sliding_attention",
93
+ "sliding_attention",
94
+ "sliding_attention",
95
+ "sliding_attention",
96
+ "full_attention",
97
+ "sliding_attention",
98
+ "sliding_attention",
99
+ "sliding_attention",
100
+ "sliding_attention",
101
+ "sliding_attention",
102
+ "full_attention",
103
+ "sliding_attention",
104
+ "sliding_attention",
105
+ "sliding_attention",
106
+ "sliding_attention",
107
+ "sliding_attention",
108
+ "full_attention"
109
+ ],
110
+ "max_position_embeddings": 262144,
111
+ "model_type": "gemma4_text",
112
+ "num_attention_heads": 32,
113
+ "num_experts": null,
114
+ "num_global_key_value_heads": 4,
115
+ "num_hidden_layers": 60,
116
+ "num_key_value_heads": 16,
117
+ "num_kv_shared_layers": 0,
118
+ "pad_token_id": 0,
119
+ "rms_norm_eps": 1e-06,
120
+ "rope_parameters": {
121
+ "full_attention": {
122
+ "partial_rotary_factor": 0.25,
123
+ "rope_theta": 1000000.0,
124
+ "rope_type": "proportional"
125
+ },
126
+ "sliding_attention": {
127
+ "rope_theta": 10000.0,
128
+ "rope_type": "default"
129
+ }
130
+ },
131
+ "sliding_window": 1024,
132
+ "tie_word_embeddings": true,
133
+ "top_k_experts": null,
134
+ "use_bidirectional_attention": "vision",
135
+ "use_cache": true,
136
+ "use_double_wide_mlp": false,
137
+ "vocab_size": 262144,
138
+ "vocab_size_per_layer_input": 262144
139
+ },
140
+ "tie_word_embeddings": true,
141
+ "transformers_version": "5.5.0.dev0",
142
+ "video_token_id": 258884,
143
+ "vision_soft_tokens_per_image": 280
144
+ }
target/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 1,
6
+ 106,
7
+ 50
8
+ ],
9
+ "pad_token_id": 0,
10
+ "temperature": 1.0,
11
+ "top_k": 64,
12
+ "top_p": 0.95,
13
+ "transformers_version": "5.5.0.dev0"
14
+ }
target/model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3fe7c584ab9fb211c3071ac035628ccc6861c4a7e1b59c200f58b33ddb81435
3
+ size 5258667341
target/model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed27cbf128295cb8c5fe462ccee517f5840f034b8310af432707c6b01089d2ce
3
+ size 5328997408
target/model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a6cf32b03f6953bb42d863a4cc28e07289b6ea9fd501dfa9882b19e5539b048
3
+ size 5270505988
target/model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:615bb5fb4e2150da2ef820f2beae5dcf7939fae8b6405c5d8cfb376a16d18ff3
3
+ size 5329041180
target/model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1f2fad40a9535464bc81e522c13021d2478413be623cd07d2db00a2b19fc01d
3
+ size 5346547174
target/model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dbf38950088a46ed5c9baab1921f5aa75ad4a498ee51e25765c951148983761
3
+ size 5270505993
target/model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f0db64eef976c2985704d0aa7cbf4bc852d62a572bc59a02b20e62fe4f72806
3
+ size 813126760
target/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
target/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
3
+ size 32169626
target/tokenizer_config.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": true,
21
+ "local_files_only": false,
22
+ "mask_token": "<mask>",
23
+ "model_max_length": 1000000000000000019884624838656,
24
+ "model_specific_special_tokens": {
25
+ "audio_token": "<|audio|>",
26
+ "boa_token": "<|audio>",
27
+ "boi_token": "<|image>",
28
+ "eoa_token": "<audio|>",
29
+ "eoc_token": "<channel|>",
30
+ "eoi_token": "<image|>",
31
+ "eot_token": "<turn|>",
32
+ "escape_token": "<|\"|>",
33
+ "etc_token": "<tool_call|>",
34
+ "etd_token": "<tool|>",
35
+ "etr_token": "<tool_response|>",
36
+ "image_token": "<|image|>",
37
+ "soc_token": "<|channel>",
38
+ "sot_token": "<|turn>",
39
+ "stc_token": "<|tool_call>",
40
+ "std_token": "<|tool>",
41
+ "str_token": "<|tool_response>",
42
+ "think_token": "<|think|>"
43
+ },
44
+ "pad_token": "<pad>",
45
+ "padding_side": "left",
46
+ "processor_class": "Gemma4Processor",
47
+ "response_schema": {
48
+ "properties": {
49
+ "content": {
50
+ "type": "string"
51
+ },
52
+ "role": {
53
+ "const": "assistant"
54
+ },
55
+ "thinking": {
56
+ "type": "string"
57
+ },
58
+ "tool_calls": {
59
+ "items": {
60
+ "properties": {
61
+ "function": {
62
+ "properties": {
63
+ "arguments": {
64
+ "additionalProperties": {},
65
+ "type": "object",
66
+ "x-parser": "gemma4-tool-call"
67
+ },
68
+ "name": {
69
+ "type": "string"
70
+ }
71
+ },
72
+ "type": "object",
73
+ "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
74
+ },
75
+ "type": {
76
+ "const": "function"
77
+ }
78
+ },
79
+ "type": "object"
80
+ },
81
+ "type": "array",
82
+ "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
83
+ }
84
+ },
85
+ "type": "object",
86
+ "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
87
+ },
88
+ "soc_token": "<|channel>",
89
+ "sot_token": "<|turn>",
90
+ "stc_token": "<|tool_call>",
91
+ "std_token": "<|tool>",
92
+ "str_token": "<|tool_response>",
93
+ "think_token": "<|think|>",
94
+ "tokenizer_class": "GemmaTokenizer",
95
+ "unk_token": "<unk>"
96
+ }