Youssofal committed on
Commit
995f754
·
verified ·
1 Parent(s): d85dd52

Recommend 3-bit draft head for speed lane

Browse files
Files changed (3) hide show
  1. MTPLX_PUBLISH_MANIFEST.json +3 -3
  2. README.md +6 -4
  3. mtplx_runtime.json +5 -0
MTPLX_PUBLISH_MANIFEST.json CHANGED
@@ -78,7 +78,7 @@
78
  "name": "mtplx_runtime.json",
79
  "resolved_source_path": "/Users/youssof/Documents/MTPLX/models/Qwen3.6-27B-MTPLX-Flat4-CyanKiwiMTP/mtplx_runtime.json",
80
  "same_inode_as_source": false,
81
- "size_bytes": 589,
82
  "source_path": "/Users/youssof/Documents/MTPLX/models/Qwen3.6-27B-MTPLX-Flat4-CyanKiwiMTP/mtplx_runtime.json"
83
  },
84
  {
@@ -142,7 +142,7 @@
142
  "name": "README.md",
143
  "resolved_source_path": null,
144
  "same_inode_as_source": false,
145
- "size_bytes": 2651,
146
  "source_path": null
147
  },
148
  {
@@ -156,7 +156,7 @@
156
  ],
157
  "include_hashes": false,
158
  "repo_id": "Youssofal/Qwen3.6-27B-MTPLX-Optimized-Speed",
159
- "size_bytes": 16419069596,
160
  "source_provenance": {
161
  "base_model": "Qwen/Qwen3.6-27B",
162
  "base_model_revision": "6a9e13bd6fc8f0983b9b99948120bc37f49c13e9",
 
78
  "name": "mtplx_runtime.json",
79
  "resolved_source_path": "/Users/youssof/Documents/MTPLX/models/Qwen3.6-27B-MTPLX-Flat4-CyanKiwiMTP/mtplx_runtime.json",
80
  "same_inode_as_source": false,
81
+ "size_bytes": 1025,
82
  "source_path": "/Users/youssof/Documents/MTPLX/models/Qwen3.6-27B-MTPLX-Flat4-CyanKiwiMTP/mtplx_runtime.json"
83
  },
84
  {
 
142
  "name": "README.md",
143
  "resolved_source_path": null,
144
  "same_inode_as_source": false,
145
+ "size_bytes": 2797,
146
  "source_path": null
147
  },
148
  {
 
156
  ],
157
  "include_hashes": false,
158
  "repo_id": "Youssofal/Qwen3.6-27B-MTPLX-Optimized-Speed",
159
+ "size_bytes": 16419070178,
160
  "source_provenance": {
161
  "base_model": "Qwen/Qwen3.6-27B",
162
  "base_model_revision": "6a9e13bd6fc8f0983b9b99948120bc37f49c13e9",
README.md CHANGED
@@ -55,6 +55,7 @@ compressed-tensors checkpoint at the revision listed above.
55
  - architecture: `qwen3-next-mtp`
56
  - maximum MTP depth: `3`
57
  - recommended profile: `performance-cold`
 
58
  - exactness gate: `Phase 0H paged-verifier smoke`
59
  - exactness max absolute diff: `0.0`
60
  - verified hardware: `Apple M5 Max, 128 GB unified memory`
@@ -66,10 +67,11 @@ top-p `0.95`, top-k `20`.
66
  ## Performance Honesty
67
 
68
  This is the speed lane. On the local Apple M5 Max fanmax performance-cold
69
- benchmark, this artifact reached 57.668 tok/s at depth 3 on the long-code
70
- 192-token prompt, with acceptance [94.3%, 90.6%, 77.4%]. It is faster than the
71
- current GDN8+CyanKiwi quality/default artifact on the same lane, while GDN8
72
- remains the conservative quality/default checkpoint.
 
73
 
74
  ## Files
75
 
 
55
  - architecture: `qwen3-next-mtp`
56
  - maximum MTP depth: `3`
57
  - recommended profile: `performance-cold`
58
+ - recommended draft-only LM head: `3-bit affine, group_size=64`
59
  - exactness gate: `Phase 0H paged-verifier smoke`
60
  - exactness max absolute diff: `0.0`
61
  - verified hardware: `Apple M5 Max, 128 GB unified memory`
 
67
  ## Performance Honesty
68
 
69
  This is the speed lane. On the local Apple M5 Max fanmax performance-cold
70
+ benchmark, this artifact reached 60.061 tok/s at depth 3 on the long-code
71
+ 192-token prompt when using its contract-recommended 3-bit draft-only LM head,
72
+ with acceptance [100.0%, 98.0%, 87.8%]. The same flat4+CyanKiwi artifact with
73
+ the older 4-bit draft-only LM head measured 57.668 tok/s on the same lane.
74
+ GDN8 remains the conservative quality/default checkpoint.
75
 
76
  ## Files
77
 
mtplx_runtime.json CHANGED
@@ -6,6 +6,11 @@
6
  "mtp_sidecar": "Qwen3.6-27B-MTPLX-CyanKiwi-Packed-BF16-INT4-v3",
7
  "mtp_depth_max": 3,
8
  "recommended_profile": "performance-cold",
 
 
 
 
 
9
  "sampler": {
10
  "temperature": 0.6,
11
  "top_p": 0.95,
 
6
  "mtp_sidecar": "Qwen3.6-27B-MTPLX-CyanKiwi-Packed-BF16-INT4-v3",
7
  "mtp_depth_max": 3,
8
  "recommended_profile": "performance-cold",
9
+ "recommended_draft_lm_head": {
10
+ "bits": 3,
11
+ "group_size": 64,
12
+ "mode": "affine"
13
+ },
14
  "sampler": {
15
  "temperature": 0.6,
16
  "top_p": 0.95,