koichi12 committed
Commit 96a7806 · verified · 1 Parent(s): 5c4f0d8

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  2. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  3. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  4. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  5. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  6. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  7. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  8. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  9. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  10. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  11. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  12. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  13. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  14. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  15. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  16. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  17. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  18. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  19. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  20. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  22. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  23. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  24. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__init__.py +141 -0
  25. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/aqlm.py +373 -0
  26. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/awq.py +183 -0
  27. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/awq_marlin.py +480 -0
  28. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/awq_triton.py +319 -0
  29. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/base_config.py +141 -0
  30. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/bitsandbytes.py +359 -0
  31. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  32. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-311.pyc +0 -0
  35. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/triton_scaled_mm.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-311.pyc +0 -0
  37. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +617 -0
  38. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +574 -0
  39. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +20 -0
  40. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_24.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-311.pyc +0 -0
  43. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-311.pyc +0 -0
  45. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +352 -0
  49. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +54 -0
  50. .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +159 -0
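The files in this commit are pre-tuned Triton kernel configurations for vLLM's fused mixture-of-experts (MoE) layers. Each filename encodes the expert count E, the intermediate size N, the GPU (device_name), and optionally a quantization dtype; each file maps a token-batch size to the Triton launch parameters that benchmarked fastest for that shape. The sketch below shows how such a file could be located and loaded. It is a minimal illustration under assumed helper names (config_file_name, load_moe_config), not vLLM's actual loader.

import json
from pathlib import Path

# Directory holding the tuning tables added by this commit.
CONFIG_DIR = Path("vllm/model_executor/layers/fused_moe/configs")

def config_file_name(E: int, N: int, device_name: str, dtype: str | None = None) -> str:
    # Filenames follow the pattern seen in the listing above, e.g.
    # "E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json".
    dtype_part = f",dtype={dtype}" if dtype else ""
    return f"E={E},N={N},device_name={device_name}{dtype_part}.json"

def load_moe_config(E: int, N: int, device_name: str, dtype: str | None = None):
    # Returns {batch_size: kernel_params} or None when no tuned table exists,
    # in which case a caller would fall back to default kernel parameters.
    path = CONFIG_DIR / config_file_name(E, N, device_name, dtype)
    if not path.is_file():
        return None
    with open(path) as f:
        return {int(m): params for m, params in json.load(f).items()}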
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json ADDED
@@ -0,0 +1,146 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "24": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5},
  "32": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5},
  "48": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5},
  "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5},
  "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "256": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4},
  "512": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
  "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
  "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}
}
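Inside each table, the keys ("1", "2", ..., "4096") are candidate token-batch sizes, and the values are Triton tile and pipeline parameters (BLOCK_SIZE_M/N/K, GROUP_SIZE_M, num_warps, num_stages). A plausible way to use such a table at run time, shown here as a hedged sketch rather than vLLM's exact selection logic, is to pick the benchmarked batch size closest to the actual token count:

def pick_config(table: dict[int, dict], num_tokens: int) -> dict:
    # Choose the tuned entry whose batch size is nearest to num_tokens.
    best_m = min(table, key=lambda m: abs(m - num_tokens))
    return table[best_m]

For example, with the E=1,N=14336 table above, num_tokens=200 would select the "256" entry.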
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json ADDED
@@ -0,0 +1,218 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
  "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "5120": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "9216": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "13312": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "17408": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "25600": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "33792": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "41984": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "50176": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "58368": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}
}
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json ADDED
@@ -0,0 +1,218 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5},
  "24": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5},
  "48": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
  "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
  "256": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3},
  "512": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "1024": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "1536": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "2048": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "3072": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "4096": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "5120": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "9216": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "13312": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "17408": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "25600": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "33792": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "41984": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "50176": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "58368": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}
}
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json ADDED
@@ -0,0 +1,218 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5},
  "24": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4},
  "32": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5},
  "48": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5},
  "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5},
  "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "256": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5},
  "512": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4},
  "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4},
  "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4},
  "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3},
  "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4},
  "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4},
  "5120": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4},
  "9216": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3},
  "13312": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3},
  "17408": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3},
  "25600": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4},
  "33792": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3},
  "41984": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3},
  "50176": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3},
  "58368": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}
}
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json ADDED
@@ -0,0 +1,218 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "128": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "5120": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "9216": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "13312": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "17408": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "25600": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "33792": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "41984": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "50176": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "58368": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}
}
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json ADDED
@@ -0,0 +1,218 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5},
  "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "24": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5},
  "32": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5},
  "48": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4},
  "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "5120": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "9216": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "13312": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "17408": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "25600": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "33792": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "41984": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "50176": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "58368": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}
}
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json ADDED
@@ -0,0 +1,146 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "96": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4},
  "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4},
  "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4},
  "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}
}
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json ADDED
@@ -0,0 +1,146 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4},
  "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4},
  "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4},
  "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4},
  "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4},
  "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4},
  "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4},
  "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}
}
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json ADDED
@@ -0,0 +1,146 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5},
  "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "96": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "512": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4},
  "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5},
  "1536": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5},
  "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5},
  "4096": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}
}
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json ADDED
@@ -0,0 +1,146 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
  "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
  "32": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5},
  "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "512": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3},
  "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4},
  "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3},
  "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}
}
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json ADDED
@@ -0,0 +1,146 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
  "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
  "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
  "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "96": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4},
  "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
  "512": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5},
  "1536": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3},
  "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4},
  "4096": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}
}
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json ADDED
@@ -0,0 +1,146 @@
{
  "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4},
  "2": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5},
  "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4},
  "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5},
  "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
  "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
  "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
  "48": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3},
  "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
  "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}
}
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json ADDED
@@ -0,0 +1,146 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "2": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "4": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "8": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "16": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "24": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "32": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 2
+     },
+     "48": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2
+     },
+     "64": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "96": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2
+     },
+     "128": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2
+     },
+     "256": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "512": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "1536": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     }
+ }
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json ADDED
@@ -0,0 +1,146 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "2": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "4": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "8": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "16": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "24": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "32": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "48": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "64": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "96": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "128": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "256": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "512": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "1536": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     }
+ }
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json ADDED
@@ -0,0 +1,146 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 5
+     },
+     "2": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 5
+     },
+     "4": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "8": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 5
+     },
+     "16": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "24": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "32": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "48": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "64": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "96": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "128": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "256": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 3
+     },
+     "512": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 5
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "1536": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     }
+ }
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json ADDED
@@ -0,0 +1,146 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "2": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "4": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "8": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "16": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "24": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "32": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "48": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "64": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "96": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "128": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "256": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "512": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "1536": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 4
+     }
+ }
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json ADDED
@@ -0,0 +1,200 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "2": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "4": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "8": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 16,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "16": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 1
+     },
+     "24": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "32": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 1
+     },
+     "48": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 1
+     },
+     "64": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "96": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "128": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "256": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "512": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 32,
+         "kpack": 2
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "1536": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     }
+ }
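The MI300X tables above carry three extra fields beyond the common tile parameters: waves_per_eu, matrix_instr_nonkdim, and kpack, which are ROCm-specific kernel tuning knobs. A small hedged sketch of separating the tile-shape constants from the remaining launch options before handing them to a kernel — the split below is illustrative, not vLLM's code:

TILE_KEYS = {"BLOCK_SIZE_M", "BLOCK_SIZE_N", "BLOCK_SIZE_K", "GROUP_SIZE_M"}

def split_entry(entry: dict) -> tuple[dict, dict]:
    # Tile shapes are typically compile-time constants; everything else
    # (num_warps, num_stages, and the ROCm extras) is a launch option.
    tile = {k: v for k, v in entry.items() if k in TILE_KEYS}
    launch = {k: v for k, v in entry.items() if k not in TILE_KEYS}
    return tile, launch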
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json ADDED
@@ -0,0 +1,173 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "2": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 32,
+         "GROUP_SIZE_M": 2,
+         "num_warps": 4,
+         "num_ctas": 1,
+         "num_stages": 7
+     },
+     "4": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 32,
+         "GROUP_SIZE_M": 128,
+         "num_warps": 2,
+         "num_ctas": 1,
+         "num_stages": 4
+     },
+     "8": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_ctas": 1,
+         "num_stages": 1
+     },
+     "16": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_ctas": 1,
+         "num_stages": 1
+     },
+     "32": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 2,
+         "num_warps": 4,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "48": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 2,
+         "num_warps": 4,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "64": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 2,
+         "num_warps": 4,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "96": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 8,
+         "num_warps": 8,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "128": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 8,
+         "num_warps": 8,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "192": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 8,
+         "num_warps": 8,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "256": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 16,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "512": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 128,
+         "num_warps": 2,
+         "num_ctas": 1,
+         "num_stages": 8
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 8,
+         "num_ctas": 1,
+         "num_stages": 3
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 16,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 16,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 32,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "6144": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 32,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_ctas": 1,
+         "num_stages": 2
+     },
+     "8192": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 16,
+         "num_ctas": 1,
+         "num_stages": 2
+     }
+ }
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json ADDED
@@ -0,0 +1,200 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 16,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "2": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 16,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "4": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 16,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 1
+     },
+     "8": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 16,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "16": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 16,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "24": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 16,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 1
+     },
+     "32": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "48": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "64": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 1
+     },
+     "96": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "128": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 1
+     },
+     "256": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 32,
+         "kpack": 2
+     },
+     "512": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "1536": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     }
+ }
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json ADDED
@@ -0,0 +1,146 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "2": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "4": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "8": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "16": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "24": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "32": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "48": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "64": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "96": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "128": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "256": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "512": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "1536": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     }
+ }
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json ADDED
@@ -0,0 +1,146 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "2": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "4": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "8": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "16": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "24": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "32": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "48": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "64": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "96": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "128": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "256": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "512": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "1536": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     }
+ }
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json ADDED
@@ -0,0 +1,164 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "2": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "4": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "8": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "16": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 1,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "24": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "32": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "48": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "64": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 1,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "96": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "128": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "256": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "512": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "1536": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 256,
+         "BLOCK_SIZE_N": 256,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0
+     }
+ }
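The filenames themselves encode the lookup key: expert count E, intermediate size N, the GPU's device name, and an optional dtype suffix for quantized kernels (compare this fp8_w8a8 file with its unsuffixed sibling below). A one-function sketch of that naming convention — the helper is hypothetical, but it reproduces the scheme of the files in this commit:

from typing import Optional

def moe_config_filename(E: int, N: int, device_name: str,
                        dtype: Optional[str] = None) -> str:
    # Mirrors the naming scheme of the JSON files in this commit.
    suffix = f",dtype={dtype}" if dtype is not None else ""
    return f"E={E},N={N},device_name={device_name}{suffix}.json"

# moe_config_filename(8, 8192, "AMD_Instinct_MI300X", "fp8_w8a8")
# -> "E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json"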
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json ADDED
@@ -0,0 +1,200 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 16,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 1,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 1
+     },
+     "2": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "4": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "8": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 16,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 1,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 1
+     },
+     "16": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "24": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "32": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 2,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "48": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "64": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "96": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "128": {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 4,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "256": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "512": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "1536": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 8,
+         "num_stages": 2,
+         "waves_per_eu": 0,
+         "matrix_instr_nonkdim": 16,
+         "kpack": 2
+     }
+ }
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/__init__.py ADDED
@@ -0,0 +1,141 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ from typing import Dict, List, Type
+
+ from vllm.model_executor.layers.quantization.base_config import (
+     QuantizationConfig)
+
+ QUANTIZATION_METHODS: List[str] = [
+     "aqlm",
+     "awq",
+     "deepspeedfp",
+     "tpu_int8",
+     "fp8",
+     "fbgemm_fp8",
+     "modelopt",
+     # The order of gptq methods is important for config.py iteration over
+     # override_quantization_method(..)
+     "marlin",
+     "gguf",
+     "gptq_marlin_24",
+     "gptq_marlin",
+     "awq_marlin",
+     "gptq",
+     "compressed-tensors",
+     "bitsandbytes",
+     "qqq",
+     "hqq",
+     "experts_int8",
+     "neuron_quant",
+     "ipex",
+     "quark",
+     "moe_wna16"
+ ]
+
+ # Customized quantization methods will be added to this dict.
+ _CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {}
+
+
+ def register_quantization_config(quantization: str):
+     """Register a customized vllm quantization config.
+
+     When a quantization method is not supported by vllm, you can register a customized
+     quantization config to support it.
+
+     Args:
+         quantization (str): The quantization method name.
+
+     Examples:
+         >>> from vllm.model_executor.layers.quantization import register_quantization_config
+         >>> from vllm.model_executor.layers.quantization import get_quantization_config
+         >>> from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+         >>>
+         >>> @register_quantization_config("my_quant")
+         ... class MyQuantConfig(QuantizationConfig):
+         ...     pass
+         >>>
+         >>> get_quantization_config("my_quant")
+         <class 'MyQuantConfig'>
+     """  # noqa: E501
+
+     def _wrapper(quant_config_cls):
+         if quantization in QUANTIZATION_METHODS:
+             raise ValueError(
+                 f"The quantization method `{quantization}` already exists.")
+         if not issubclass(quant_config_cls, QuantizationConfig):
+             raise ValueError("The quantization config must be a subclass of "
+                              "`QuantizationConfig`.")
+         _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls
+         QUANTIZATION_METHODS.append(quantization)
+         return quant_config_cls
+
+     return _wrapper
+
+
+ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
+     if quantization not in QUANTIZATION_METHODS:
+         raise ValueError(f"Invalid quantization method: {quantization}")
+
+     # lazy import to avoid triggering `torch.compile` too early
+     from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig
+
+     from .aqlm import AQLMConfig
+     from .awq import AWQConfig
+     from .awq_marlin import AWQMarlinConfig
+     from .bitsandbytes import BitsAndBytesConfig
+     from .compressed_tensors.compressed_tensors import (  # noqa: E501
+         CompressedTensorsConfig)
+     from .deepspeedfp import DeepSpeedFPConfig
+     from .experts_int8 import ExpertsInt8Config
+     from .fbgemm_fp8 import FBGEMMFp8Config
+     from .fp8 import Fp8Config
+     from .gguf import GGUFConfig
+     from .gptq import GPTQConfig
+     from .gptq_marlin import GPTQMarlinConfig
+     from .gptq_marlin_24 import GPTQMarlin24Config
+     from .hqq_marlin import HQQMarlinConfig
+     from .ipex_quant import IPEXConfig
+     from .marlin import MarlinConfig
+     from .modelopt import ModelOptFp8Config
+     from .moe_wna16 import MoeWNA16Config
+     from .neuron_quant import NeuronQuantConfig
+     from .qqq import QQQConfig
+     from .tpu_int8 import Int8TpuConfig
+
+     method_to_config: Dict[str, Type[QuantizationConfig]] = {
+         "aqlm": AQLMConfig,
+         "awq": AWQConfig,
+         "deepspeedfp": DeepSpeedFPConfig,
+         "tpu_int8": Int8TpuConfig,
+         "fp8": Fp8Config,
+         "fbgemm_fp8": FBGEMMFp8Config,
+         "modelopt": ModelOptFp8Config,
+         # The order of gptq methods is important for config.py iteration over
+         # override_quantization_method(..)
+         "marlin": MarlinConfig,
+         "gguf": GGUFConfig,
+         "gptq_marlin_24": GPTQMarlin24Config,
+         "gptq_marlin": GPTQMarlinConfig,
+         "awq_marlin": AWQMarlinConfig,
+         "gptq": GPTQConfig,
+         "compressed-tensors": CompressedTensorsConfig,
+         "bitsandbytes": BitsAndBytesConfig,
+         "qqq": QQQConfig,
+         "hqq": HQQMarlinConfig,
+         "experts_int8": ExpertsInt8Config,
+         "neuron_quant": NeuronQuantConfig,
+         "ipex": IPEXConfig,
+         "quark": QuarkConfig,
+         "moe_wna16": MoeWNA16Config,
+     }
+     # Update the `method_to_config` with customized quantization methods.
+     method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)
+
+     return method_to_config[quantization]
+
+
+ __all__ = [
+     "QuantizationConfig",
+     "get_quantization_config",
+     "QUANTIZATION_METHODS",
+ ]
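Together, QUANTIZATION_METHODS, register_quantization_config, and get_quantization_config form a small plugin registry whose backend imports are deferred until lookup (the in-function imports above keep module import cheap and avoid triggering `torch.compile` early). A short usage sketch of the lookup side, using only names defined in this file:

from vllm.model_executor.layers.quantization import (
    QUANTIZATION_METHODS, get_quantization_config)

assert "fp8" in QUANTIZATION_METHODS
fp8_config_cls = get_quantization_config("fp8")  # lazily resolves to Fp8Config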
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/aqlm.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ # Supports AQLM compression, see https://github.com/Vahe1994/AQLM
4
+ # and https://arxiv.org/pdf/2401.06118.pdf
5
+
6
+ import math
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch.nn.parameter import Parameter
12
+
13
+ from vllm import _custom_ops as ops
14
+ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
15
+ from vllm.model_executor.layers.quantization.base_config import (
16
+ QuantizationConfig)
17
+ from vllm.model_executor.utils import set_weight_attrs
18
+
19
+
20
+ def get_int_dtype(nbits: int) -> torch.dtype:
21
+ if nbits <= 8:
22
+ return torch.int8
23
+ if nbits <= 16:
24
+ return torch.int16
25
+ if nbits <= 32:
26
+ return torch.int32
27
+ if nbits <= 64:
28
+ return torch.int64
29
+ raise ValueError(f"No dtype available for {nbits}-bit codebooks")
30
+
31
+
32
+ @torch.inference_mode()
33
+ def unpack_int_data(data: torch.IntTensor, nbits: int) -> torch.IntTensor:
34
+ return data.to(torch.int64) % (2**nbits)
35
+
36
+
37
+ def dequantize_weight(codes: torch.Tensor,
38
+ codebooks: torch.Tensor,
39
+ scales: Optional[torch.Tensor] = None) -> torch.Tensor:
40
+ """
41
+ Decode float weights from quantization codes. Differentiable.
42
+ :param codes: tensor of integer quantization codes, shape
43
+ [*dims, num_out_groups, num_in_groups, num_codebooks]
44
+ :param codebooks: tensor of vectors for each quantization code,
45
+ [num_codebooks, codebook_size, out_group_size, in_group_size]
46
+ :param scales: weight will be multiplied by this factor, must be
47
+ broadcastble with
48
+ [*dims, out_groups, num_in_groups, out_group_size, in_group_size]
49
+ :return: reconstructed weight tensor of shape
50
+ [*dims, num_in_groups*group_size]
51
+ """
52
+ num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:]
53
+ num_codebooks, codebook_size, out_group_size, in_group_size = \
54
+ codebooks.shape
55
+ out_features = num_out_groups * out_group_size
56
+ in_features = num_in_groups * in_group_size
57
+ codebook_offsets = torch.arange(
58
+ 0, num_codebooks * codebook_size, codebook_size,
59
+ device=codes.device) # shape: [num_codebooks]
60
+ reconstructed_weight_flat = F.embedding_bag(
61
+ codes.flatten(0, -2) + codebook_offsets,
62
+ codebooks.flatten(0, 1).flatten(-2, -1),
63
+ mode="sum"
64
+ ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size
65
+ # * in_group_size]
66
+
67
+ reconstructed_weight_groupwise = reconstructed_weight_flat.view(
68
+ list(codes.shape[:-3]) +
69
+ [num_out_groups, num_in_groups, out_group_size, in_group_size])
70
+ if scales is not None:
71
+ reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(
72
+ scales)
73
+ return reconstructed_weight_groupwise.swapaxes(
74
+ -3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features])
75
+
76
+
77
+ def dequantize_gemm(
78
+ input: torch.Tensor, # [..., in_features]
79
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
80
+ codebooks: torch.
81
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
82
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
83
+ bias: Optional[torch.Tensor],
84
+ ) -> torch.Tensor:
85
+ dequantized_weight = dequantize_weight(
86
+ unpack_int_data(codes, codebooks.shape[1].bit_length() - 1),
87
+ codebooks,
88
+ scales,
89
+ )
90
+ return F.linear(input, dequantized_weight, bias)
91
+
92
+
93
+ # Generic dequantization, slow but flexible.
94
+ def generic_dequantize_gemm(
95
+ input: torch.Tensor, # [..., in_features]
96
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
97
+ codebooks: torch.
98
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
99
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
100
+ output_partition_sizes: List[int],
101
+ bias: Optional[torch.Tensor],
102
+ ) -> torch.Tensor:
103
+ output_shape = input.shape[:-1] + (scales.shape[0], )
104
+ output = torch.empty(output_shape, dtype=input.dtype, device=input.device)
105
+ num_outputs = len(output_partition_sizes)
106
+
107
+ # break the inputs and codebooks apart then combine the outputs.
108
+ # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big
109
+ # multiply at the end.
110
+ num_codebooks = codebooks.shape[0] // num_outputs
111
+ assert (scales.shape[0] == codes.shape[0])
112
+ assert (sum(output_partition_sizes) == scales.shape[0])
113
+ output_offset = 0
114
+ codebooks_offset = 0
115
+ for output_size in output_partition_sizes:
116
+ shard_output = dequantize_gemm(
117
+ input, codes.narrow(0, output_offset, output_size),
118
+ codebooks.narrow(0, codebooks_offset, num_codebooks),
119
+ scales.narrow(0, output_offset, output_size), None
120
+ if bias is None else bias.narrow(0, output_offset, output_size))
121
+
122
+ output_slice = output.narrow(-1, output_offset, output_size)
123
+ assert (output_slice.shape == shard_output.shape)
124
+ output_slice.copy_(shard_output)
125
+ output_offset += output_size
126
+ codebooks_offset += num_codebooks
127
+ return output
128
+
129
+
130
+ # Optimized dequnantize/decompression kernels, supports 1x16 and 2x8
131
+ # at 6 and 9 times faster than the generic version above, respectively.
132
+ def optimized_dequantize_gemm(
133
+ input: torch.Tensor, # [..., in_features]
134
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
135
+ codebooks: torch.
136
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
137
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
138
+ output_partition_sizes: List[int],
139
+ bias: Optional[torch.Tensor],
140
+ ) -> torch.Tensor:
141
+ weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
142
+
143
+ if bias is None:
144
+ # scaling the output is fastest, so we do that when possible.
145
+ output = F.linear(input, weights, bias)
146
+ orig_shape = output.shape
147
+ flattened_output = output.view(-1, output.size(-1))
148
+ f_scales = scales.view(-1, scales.shape[0])
149
+ b_scales = f_scales.expand(flattened_output.shape[0], -1)
150
+ flattened_output *= b_scales
151
+ return output.view(orig_shape)
152
+ else:
153
+ b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
154
+ -1, weights.shape[1])
155
+ weights *= b_scales
156
+ return F.linear(input, weights, bias)
+
+
+ class AQLMConfig(QuantizationConfig):
+     """Config class for AQLM.
+
+     Reference: https://github.com/Vahe1994/AQLM
+     """
+
+     def __init__(
+         self,
+         in_group_size: int,
+         nbits_per_codebook: int,
+         num_codebooks: int,
+         out_group_size: int,
+     ) -> None:
+         self.in_group_size = in_group_size
+         self.nbits_per_codebook = nbits_per_codebook
+         self.num_codebooks = num_codebooks
+         self.out_group_size = out_group_size
+
+         # out_group_size > 1 is untested, and probably won't work as-is.
+         assert (self.out_group_size == 1)
+         self.pack_factor = (self.in_group_size * self.out_group_size)
+
+     def __repr__(self) -> str:
+         return (f"AQLMConfig(in_group_size={self.in_group_size}, "
+                 f"nbits_per_codebook={self.nbits_per_codebook}, "
+                 f"num_codebooks={self.num_codebooks}, "
+                 f"out_group_size={self.out_group_size})")
+
+     @classmethod
+     def get_name(cls) -> str:
+         return "aqlm"
+
+     @classmethod
+     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+         return [torch.half]
+
+     @classmethod
+     def get_min_capability(cls) -> int:
+         return 60
+
+     @classmethod
+     def get_config_filenames(cls) -> List[str]:
+         return []  # no extra configs.
+
+     @classmethod
+     def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig":
+         in_group_size = cls.get_from_keys(config, ["in_group_size"])
+         nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"])
+         num_codebooks = cls.get_from_keys(config, ["num_codebooks"])
+         out_group_size = cls.get_from_keys(config, ["out_group_size"])
+         return cls(in_group_size, nbits_per_codebook, num_codebooks,
+                    out_group_size)
+
+     def get_quant_method(self, layer: torch.nn.Module,
+                          prefix: str) -> Optional["AQLMLinearMethod"]:
+         if isinstance(layer, LinearBase):
+             return AQLMLinearMethod(self)
+         return None
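+
+ # For reference, a typical AQLM quantization_config block (illustrative
+ # values) that from_config() above consumes:
+ #   {"in_group_size": 8, "nbits_per_codebook": 16,
+ #    "num_codebooks": 1, "out_group_size": 1}
+ # which maps to AQLMConfig(8, 16, 1, 1) with pack_factor = 8.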
+
+
+ class AQLMLinearMethod(LinearMethodBase):
+     """Linear method for AQLM.
+
+     Args:
+         quant_config: The AQLM quantization config.
+     """
+
+     def __init__(self, quant_config: AQLMConfig):
+         self.quant_config = quant_config
+
+     def create_weights(self, layer: torch.nn.Module,
+                        input_size_per_partition: int,
+                        output_partition_sizes: List[int], input_size: int,
+                        output_size: int, params_dtype: torch.dtype,
+                        **extra_weight_attrs):
+         del output_size  # Unused.
+         del input_size  # Unused.
+
+         if params_dtype != torch.half:
+             raise ValueError("Only half is currently supported by aqlm")
+         if input_size_per_partition % self.quant_config.in_group_size != 0:
+             raise ValueError(
+                 "The input size is not aligned with the quantized "
+                 "weight shape. This can be caused by too large "
+                 "tensor parallel size.")
+
+         output_size_per_partition = sum(output_partition_sizes)
+         if output_size_per_partition % self.quant_config.out_group_size != 0:
+             raise ValueError(
+                 "The output size is not aligned with the quantized "
+                 "weight shape. This can be caused by too large "
+                 "tensor parallel size.")
+
+         codes = Parameter(
+             torch.empty(
+                 # There could actually be two pack factors, one along input
+                 # and one along output, but we don't currently support
+                 # out_group_size, and only the one along output needs to be
+                 # marked with "packed_dim" in order for QKVLinear to work.
+                 output_size_per_partition,
+                 input_size_per_partition // self.quant_config.pack_factor,
+                 self.quant_config.num_codebooks,
+                 dtype=get_int_dtype(self.quant_config.nbits_per_codebook),
+             ),
+             requires_grad=False,
+         )
+
+         set_weight_attrs(
+             codes,
+             {
+                 "input_dim": 1,
+                 "output_dim": 0,
+                 "packed_dim": 1,
+                 "pack_factor": self.quant_config.pack_factor,
+             },
+         )
+
+         codebooks = Parameter(
+             torch.empty(
+                 self.quant_config.num_codebooks * len(output_partition_sizes),
+                 2**self.quant_config.nbits_per_codebook,
+                 self.quant_config.out_group_size,
+                 self.quant_config.in_group_size,
+                 dtype=params_dtype,
+             ),
+             requires_grad=False,
+         )
+         set_weight_attrs(
+             codebooks,
+             {
+                 # metadata indicates fixed size concatenated along dim 0
+                 "is_metadata": True,
+                 "output_partition_sizes": output_partition_sizes
+             },
+         )
+
+         scales = Parameter(
+             torch.empty(
+                 (
+                     output_size_per_partition //
+                     self.quant_config.out_group_size,
+                     1,
+                     1,
+                     1,
+                 ),
+                 dtype=params_dtype,
+             ),
+             requires_grad=False,
+         )
+         set_weight_attrs(
+             scales,
+             {
+                 "output_dim": 0,
+                 "packed_dim": 0,
+                 "pack_factor": self.quant_config.out_group_size
+             },
+         )
+
+         layer.register_parameter("codes", codes)
+         set_weight_attrs(codes, extra_weight_attrs)
+         layer.register_parameter("codebooks", codebooks)
+         set_weight_attrs(codebooks, extra_weight_attrs)
+         layer.register_parameter("scales", scales)
+         set_weight_attrs(scales, extra_weight_attrs)
+
+     def apply(
+         self,
+         layer: torch.nn.Module,
+         x: torch.Tensor,
+         bias: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         codebooks = layer.codebooks
+         codes = layer.codes
+         scales = layer.scales
+         output_partition_sizes = getattr(codebooks, "output_partition_sizes",
+                                          [])
+
+         nbooks = codes.shape[2]
+         ingroups = codebooks.shape[3]
+         outgroups = codebooks.shape[2]
+         bits = codebooks.shape[1]
+
+         # We support these formats with dedicated gemm and decompression
+         # kernels.
+         if ingroups == 8 and outgroups == 1 and (
+             (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)):
+
+             # Thresholds determined by timings on an A6000, one GPU.
+             use_gemv = math.prod(x.shape[:-1]) <= 6
+
+             return ops.aqlm_gemm(
+                 x,
+                 codes,
+                 codebooks,
+                 scales,
+                 output_partition_sizes,
+                 bias,
+             ) if use_gemv else optimized_dequantize_gemm(
+                 x,
+                 codes,
+                 codebooks,
+                 scales,
+                 output_partition_sizes,
+                 bias,
+             )
+
+         # Fall back for all unoptimized formats.
+         return generic_dequantize_gemm(
+             x,
+             codes,
+             codebooks,
+             scales,
+             output_partition_sizes,
+             bias,
+         )
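+
+ # Dispatch summary for apply() above (a sketch, not exhaustive):
+ #   - 1x16 (65536 entries, 1 book) or 2x8 (256 entries, 2 books) layouts:
+ #       <= 6 tokens -> ops.aqlm_gemm (custom GEMV-style kernel)
+ #       >  6 tokens -> optimized_dequantize_gemm (decompress, then GEMM)
+ #   - any other AQLM layout -> generic_dequantize_gemm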
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/awq.py ADDED
@@ -0,0 +1,183 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ from typing import Any, Dict, List, Optional
+
+ import torch
+
+ from vllm import _custom_ops as ops
+ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                                UnquantizedLinearMethod)
+ from vllm.model_executor.layers.quantization.base_config import (
+     QuantizationConfig)
+ from vllm.model_executor.parameter import (GroupQuantScaleParameter,
+                                            PackedvLLMParameter)
+
+
+ class AWQConfig(QuantizationConfig):
+     """Config class for AWQ.
+
+     Reference: https://arxiv.org/abs/2306.00978
+     """
+
+     def __init__(
+         self,
+         weight_bits: int,
+         group_size: int,
+         zero_point: bool,
+         modules_to_not_convert: Optional[List[str]] = None,
+     ) -> None:
+         self.weight_bits = weight_bits
+         self.group_size = group_size
+         self.zero_point = zero_point
+         self.modules_to_not_convert = modules_to_not_convert or []
+
+         if self.weight_bits != 4:
+             raise ValueError(
+                 "Currently, only 4-bit weight quantization is supported for "
+                 f"AWQ, but got {self.weight_bits} bits.")
+         self.pack_factor = 32 // self.weight_bits
+
+     def __repr__(self) -> str:
+         return (f"AWQConfig(weight_bits={self.weight_bits}, "
+                 f"group_size={self.group_size}, "
+                 f"zero_point={self.zero_point}, "
+                 f"modules_to_not_convert={self.modules_to_not_convert})")
+
+     def get_name(self) -> str:
+         return "awq"
+
+     def get_supported_act_dtypes(self) -> List[torch.dtype]:
+         return [torch.half]
+
+     @classmethod
+     def get_min_capability(cls) -> int:
+         # The AWQ kernel only supports Turing or newer GPUs.
+         return 75
+
+     @staticmethod
+     def get_config_filenames() -> List[str]:
+         return [
+             "quant_config.json",  # E.g., casperhansen/vicuna-7b-v1.5-awq
+             # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
+             "quantize_config.json",
+         ]
+
+     @classmethod
+     def from_config(cls, config: Dict[str, Any]) -> "AWQConfig":
+         weight_bits = cls.get_from_keys(config, ["w_bit", "bits"])
+         group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
+         zero_point = cls.get_from_keys(config, ["zero_point"])
+         modules_to_not_convert = cls.get_from_keys_or(
+             config, ["modules_to_not_convert"], None)
+         return cls(weight_bits, group_size, zero_point, modules_to_not_convert)
+
+     def get_quant_method(self, layer: torch.nn.Module,
+                          prefix: str) -> Optional["LinearMethodBase"]:
+         if isinstance(layer, LinearBase):
+             if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
+                 return UnquantizedLinearMethod()
+             return AWQLinearMethod(self)
+         return None
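+
+ # For reference, a typical AutoAWQ quantize_config.json (illustrative
+ # values) that from_config() above consumes:
+ #   {"w_bit": 4, "q_group_size": 128, "zero_point": true}
+ # -> AWQConfig(weight_bits=4, group_size=128, zero_point=True), with
+ # pack_factor = 32 // 4 = 8 int4 values per int32 word.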
+
+
+ def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]):
+     return any(module_name in prefix for module_name in modules_to_not_convert)
+
+
+ class AWQLinearMethod(LinearMethodBase):
+     """Linear method for AWQ.
+
+     Args:
+         quant_config: The AWQ quantization config.
+     """
+
+     def __init__(self, quant_config: AWQConfig):
+         self.quant_config = quant_config
+
+     def create_weights(self, layer: torch.nn.Module,
+                        input_size_per_partition: int,
+                        output_partition_sizes: List[int], input_size: int,
+                        output_size: int, params_dtype: torch.dtype,
+                        **extra_weight_attrs):
+         if input_size_per_partition % self.quant_config.group_size != 0:
+             raise ValueError(
+                 "The input size is not aligned with the quantized "
+                 "weight shape. This can be caused by too large "
+                 "tensor parallel size.")
+
+         output_size_per_partition = sum(output_partition_sizes)
+         if output_size_per_partition % self.quant_config.pack_factor != 0:
+             raise ValueError(
+                 "The output size is not aligned with the quantized "
+                 "weight shape. This can be caused by too large "
+                 "tensor parallel size.")
+
+         weight_loader = extra_weight_attrs.get("weight_loader")
+         qweight = PackedvLLMParameter(
+             data=torch.empty(
+                 input_size_per_partition,
+                 output_size_per_partition // self.quant_config.pack_factor,
+                 dtype=torch.int32,
+             ),
+             input_dim=0,
+             output_dim=1,
+             packed_dim=1,
+             packed_factor=self.quant_config.pack_factor,
+             weight_loader=weight_loader)
+
+         qzeros = PackedvLLMParameter(
+             data=torch.empty(
+                 input_size_per_partition // self.quant_config.group_size,
+                 output_size_per_partition // self.quant_config.pack_factor,
+                 dtype=torch.int32,
+             ),
+             input_dim=0,
+             output_dim=1,
+             packed_dim=1,
+             packed_factor=self.quant_config.pack_factor,
+             weight_loader=weight_loader)
+
+         scales = GroupQuantScaleParameter(data=torch.empty(
+             input_size_per_partition // self.quant_config.group_size,
+             output_size_per_partition,
+             dtype=params_dtype,
+         ),
+                                           input_dim=0,
+                                           output_dim=1,
+                                           weight_loader=weight_loader)
+
+         layer.register_parameter("qweight", qweight)
+         layer.register_parameter("qzeros", qzeros)
+         layer.register_parameter("scales", scales)
+
+     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+         layer.qweight = torch.nn.Parameter(layer.qweight.data,
+                                            requires_grad=False)
+         layer.qzeros = torch.nn.Parameter(layer.qzeros.data,
+                                           requires_grad=False)
+         layer.scales = torch.nn.Parameter(layer.scales.data,
+                                           requires_grad=False)
+
+     def apply(self,
+               layer: torch.nn.Module,
+               x: torch.Tensor,
+               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+         qweight = layer.qweight
+         scales = layer.scales
+         qzeros = layer.qzeros
+         pack_factor = self.quant_config.pack_factor
+         out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
+         reshaped_x = x.reshape(-1, x.shape[-1])
+
+         # num_tokens >= threshold
+         FP16_MATMUL_HEURISTIC_CONDITION = x.shape[:-1].numel() >= 256
+
+         if FP16_MATMUL_HEURISTIC_CONDITION:
+             out = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0)
+             out = torch.matmul(reshaped_x, out)
+         else:
+             out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros,
+                                pack_factor)
+         if bias is not None:
+             out.add_(bias)
+         return out.reshape(out_shape)
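+
+ # Heuristic sketch for apply() above: with >= 256 tokens it is cheaper to
+ # dequantize the whole weight to fp16 once and run a dense matmul; below
+ # that, the fused awq_gemm kernel that dequantizes on the fly wins.
+ # Shape example (hypothetical): x [B, 4096], qweight [4096, 512] int32,
+ # pack_factor 8 -> out [B, 512 * 8] = [B, 4096].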
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/awq_marlin.py ADDED
@@ -0,0 +1,480 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ from typing import Any, Callable, Dict, List, Optional
+
+ import torch
+ from torch.nn import Parameter
+
+ import vllm.model_executor.layers.fused_moe  # noqa
+ from vllm import _custom_ops as ops
+ from vllm.logger import init_logger
+ from vllm.model_executor.layers.fused_moe.layer import (
+     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
+ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                                UnquantizedLinearMethod,
+                                                set_weight_attrs)
+ from vllm.model_executor.layers.quantization.awq import is_layer_skipped_awq
+ from vllm.model_executor.layers.quantization.base_config import (
+     QuantizationConfig, QuantizeMethodBase)
+ from vllm.model_executor.layers.quantization.utils import replace_parameter
+ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+     apply_awq_marlin_linear, awq_to_marlin_zero_points, check_marlin_supported,
+     marlin_make_empty_g_idx, marlin_make_workspace, marlin_moe_permute_scales,
+     marlin_permute_scales, moe_awq_to_marlin_zero_points,
+     verify_marlin_supported, verify_marlin_supports_shape)
+ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+ from vllm.model_executor.parameter import (GroupQuantScaleParameter,
+                                            PackedvLLMParameter)
+ from vllm.platforms import current_platform
+ from vllm.scalar_type import scalar_types
+
+ logger = init_logger(__name__)
+
+
+ class AWQMarlinConfig(QuantizationConfig):
+     """Config class for AWQ Marlin."""
+
+     # num_bits -> type
+     TYPE_MAP = {
+         4: scalar_types.uint4,
+         8: scalar_types.uint8,
+     }
+
+     def __init__(self,
+                  weight_bits: int,
+                  group_size: int,
+                  zero_point: bool,
+                  lm_head_quantized: bool,
+                  modules_to_not_convert: Optional[List[str]] = None) -> None:
+         self.pack_factor = 32 // weight_bits  # packed into int32
+         self.group_size = group_size
+         self.zero_point = zero_point
+         self.lm_head_quantized = lm_head_quantized
+         self.weight_bits = weight_bits
+         self.modules_to_not_convert = modules_to_not_convert or []
+
+         if self.weight_bits not in self.TYPE_MAP:
+             raise ValueError(f"Unsupported num_bits = {self.weight_bits}. "
+                              f"Supported num_bits = {self.TYPE_MAP.keys()}")
+
+         self.quant_type = self.TYPE_MAP[self.weight_bits]
+
+         verify_marlin_supported(self.quant_type,
+                                 group_size=self.group_size,
+                                 has_zp=self.zero_point)
+
+     def __repr__(self) -> str:
+         return (f"AWQMarlinConfig(quant_type={self.quant_type}, "
+                 f"group_size={self.group_size}, "
+                 f"zero_point={self.zero_point}, "
+                 f"lm_head_quantized={self.lm_head_quantized}, "
+                 f"modules_to_not_convert={self.modules_to_not_convert})")
+
+     @classmethod
+     def get_name(cls) -> str:
+         return "awq_marlin"
+
+     @classmethod
+     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+         return [torch.half, torch.bfloat16]
+
+     @classmethod
+     def get_min_capability(cls) -> int:
+         return 80
+
+     @classmethod
+     def get_config_filenames(cls) -> List[str]:
+         return ["quantize_config.json"]
+
+     @classmethod
+     def from_config(cls, config: Dict[str, Any]) -> "AWQMarlinConfig":
+         weight_bits = cls.get_from_keys(config, ["bits"])
+         group_size = cls.get_from_keys(config, ["group_size"])
+         zero_point = cls.get_from_keys(config, ["zero_point"])
+         lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
+                                                  default=False)
+         modules_to_not_convert = cls.get_from_keys_or(
+             config, ["modules_to_not_convert"], None)
+         return cls(weight_bits, group_size, zero_point, lm_head_quantized,
+                    modules_to_not_convert)
+
+     @classmethod
+     def override_quantization_method(cls, hf_quant_cfg,
+                                      user_quant) -> Optional[str]:
+         can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg)
+         is_valid_user_quant = (user_quant is None or user_quant == "marlin"
+                                or user_quant == "awq_marlin")
+
+         if can_convert and is_valid_user_quant:
+             msg = ("The model is convertible to {} during runtime."
+                    " Using {} kernel.".format(cls.get_name(), cls.get_name()))
+             logger.info(msg)
+             return cls.get_name()
+
+         if can_convert and user_quant == "awq":
+             logger.info("Detected that the model can run with awq_marlin"
+                         ", however you specified quantization=awq explicitly,"
+                         " so forcing awq. Use quantization=awq_marlin for"
+                         " faster inference")
+             return None
+
+     def get_quant_method(self, layer: torch.nn.Module,
+                          prefix: str) -> Optional["QuantizeMethodBase"]:
+         if (isinstance(layer, LinearBase) or
+             (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
+             if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
+                 return UnquantizedLinearMethod()
+             return AWQMarlinLinearMethod(self)
+         elif isinstance(layer, FusedMoE):
+             return AWQMoEMethod(self)
+         return None
+
+     @classmethod
+     def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]):
+         # Extract data from quant config.
+         quant_method = quant_config.get("quant_method", "").lower()
+         num_bits = quant_config.get("bits")
+         group_size = quant_config.get("group_size")
+         zero_point = quant_config.get("zero_point")
+
+         if not current_platform.is_cuda():
+             return False
+
+         if quant_method != "awq":
+             return False
+
+         # If we cannot find the info needed in the config, cannot convert.
+         if (num_bits is None or group_size is None or zero_point is None):
+             return False
+
+         if num_bits not in cls.TYPE_MAP:
+             return False
+
+         return check_marlin_supported(quant_type=cls.TYPE_MAP[num_bits],
+                                       group_size=group_size,
+                                       has_zp=zero_point)
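+
+ # Compatibility sketch: an AutoAWQ config such as
+ #   {"quant_method": "awq", "bits": 4, "group_size": 128, "zero_point": true}
+ # passes is_awq_marlin_compatible() on supported CUDA devices, so
+ # quantization="awq" checkpoints are upgraded to the awq_marlin kernels at
+ # load time unless the user pins quantization="awq" explicitly.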
+
+
+ class AWQMarlinLinearMethod(LinearMethodBase):
+     """Linear method for AWQ Marlin.
+
+     Args:
+         quant_config: The AWQ Marlin quantization config.
+     """
+
+     def __init__(self, quant_config: AWQMarlinConfig) -> None:
+         self.quant_config = quant_config
+
+     def create_weights(
+         self,
+         layer: torch.nn.Module,
+         input_size_per_partition: int,
+         output_partition_sizes: List[int],
+         input_size: int,
+         output_size: int,
+         params_dtype: torch.dtype,
+         **extra_weight_attrs,
+     ) -> None:
+         del output_size
+         output_size_per_partition = sum(output_partition_sizes)
+         weight_loader = extra_weight_attrs.get("weight_loader")
+
+         # Normalize group_size
+         if self.quant_config.group_size != -1:
+             group_size = self.quant_config.group_size
+         else:
+             group_size = input_size
+
+         verify_marlin_supports_shape(
+             output_size_per_partition=output_size_per_partition,
+             input_size_per_partition=input_size_per_partition,
+             input_size=input_size,
+             group_size=group_size)
+
+         qweight = PackedvLLMParameter(
+             data=torch.empty(
+                 input_size_per_partition,
+                 output_size_per_partition // self.quant_config.pack_factor,
+                 dtype=torch.int32,
+             ),
+             input_dim=0,
+             output_dim=1,
+             packed_dim=1,
+             packed_factor=self.quant_config.pack_factor,
+             weight_loader=weight_loader)
+
+         num_groups = input_size_per_partition // group_size
+
+         qzeros = PackedvLLMParameter(
+             data=torch.empty(
+                 num_groups,
+                 output_size_per_partition // self.quant_config.pack_factor,
+                 dtype=torch.int32,
+             ),
+             input_dim=0,
+             output_dim=1,
+             packed_dim=1,
+             packed_factor=self.quant_config.pack_factor,
+             weight_loader=weight_loader)
+
+         scales = GroupQuantScaleParameter(data=torch.empty(
+             num_groups,
+             output_size_per_partition,
+             dtype=params_dtype,
+         ),
+                                           input_dim=0,
+                                           output_dim=1,
+                                           weight_loader=weight_loader)
+
+         layer.register_parameter("qweight", qweight)
+         layer.register_parameter("qzeros", qzeros)
+         layer.register_parameter("scales", scales)
+
+         layer.input_size_per_partition = input_size_per_partition
+         layer.output_size_per_partition = output_size_per_partition
+         layer.num_groups = num_groups
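+
+     # Parameter shape sketch (hypothetical 4-bit layer, group_size=128,
+     # input_size_per_partition = output_size_per_partition = 4096):
+     #   qweight: [4096, 512] int32  (8 nibbles per int32 word)
+     #   qzeros:  [32, 512]   int32  (num_groups = 4096 // 128 = 32)
+     #   scales:  [32, 4096]  fp16/bf16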
+
+     # TODO: Update these docs.
+     # Checkpoints are serialized in the AutoAWQ format, which is different
+     # from the marlin format. This function is called after the weights are
+     # loaded. Here, we handle the repacking.
+     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+         device = layer.qweight.device
+         layer.qweight = torch.nn.Parameter(layer.qweight.data,
+                                            requires_grad=False)
+         layer.qzeros = torch.nn.Parameter(layer.qzeros.data,
+                                           requires_grad=False)
+         layer.scales = torch.nn.Parameter(layer.scales.data,
+                                           requires_grad=False)
+
+         # Allocate marlin workspace.
+         layer.workspace = marlin_make_workspace(
+             layer.output_size_per_partition, device)
+
+         # Repack weights from AWQ format to marlin format.
+         marlin_qweight = ops.awq_marlin_repack(
+             layer.qweight,
+             size_k=layer.input_size_per_partition,
+             size_n=layer.output_size_per_partition,
+             num_bits=self.quant_config.quant_type.size_bits)
+         replace_parameter(layer, "qweight", marlin_qweight)
+
+         # Permute scales from AWQ format to marlin format.
+         marlin_scales = marlin_permute_scales(
+             layer.scales,
+             size_k=layer.input_size_per_partition,
+             size_n=layer.output_size_per_partition,
+             group_size=self.quant_config.group_size)
+         replace_parameter(layer, "scales", marlin_scales)
+
+         # Permute zero-points from AWQ format to marlin format.
+         marlin_zp = awq_to_marlin_zero_points(
+             layer.qzeros,
+             size_k=layer.num_groups,
+             size_n=layer.output_size_per_partition,
+             num_bits=self.quant_config.quant_type.size_bits)
+         replace_parameter(layer, "qzeros", marlin_zp)
+
+         # Not used.
+         layer.g_idx = marlin_make_empty_g_idx(device)
+         layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+
+     def apply(
+         self,
+         layer: torch.nn.Module,
+         x: torch.Tensor,
+         bias: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         return apply_awq_marlin_linear(
+             input=x,
+             weight=layer.qweight,
+             weight_scale=layer.scales,
+             weight_zp=layer.qzeros,
+             g_idx=layer.g_idx,
+             g_idx_sort_indices=layer.g_idx_sort_indices,
+             workspace=layer.workspace,
+             quant_type=self.quant_config.quant_type,
+             output_size_per_partition=layer.output_size_per_partition,
+             input_size_per_partition=layer.input_size_per_partition,
+             bias=bias)
+
+
+ class AWQMoEMethod(FusedMoEMethodBase):
+
+     def __init__(self, quant_config: AWQMarlinConfig):
+         self.quant_config = quant_config
+
+     def create_weights(self, layer: torch.nn.Module, num_experts: int,
+                        hidden_size: int, intermediate_size_per_partition: int,
+                        params_dtype: torch.dtype, **extra_weight_attrs):
+         extra_weight_attrs.update({
+             "is_transposed":
+             True,
+             "quant_method":
+             FusedMoeWeightScaleSupported.GROUP.value,
+         })
+
+         w13_qweight = Parameter(
+             torch.empty(num_experts,
+                         hidden_size,
+                         2 * intermediate_size_per_partition //
+                         self.quant_config.pack_factor,
+                         dtype=torch.int32),
+             requires_grad=False)
+         layer.register_parameter("w13_qweight", w13_qweight)
+         set_weight_attrs(w13_qweight, extra_weight_attrs)
+
+         w2_qweight = Parameter(torch.empty(num_experts,
+                                            intermediate_size_per_partition,
+                                            hidden_size //
+                                            self.quant_config.pack_factor,
+                                            dtype=torch.int32),
+                                requires_grad=False)
+         layer.register_parameter("w2_qweight", w2_qweight)
+         set_weight_attrs(w2_qweight, extra_weight_attrs)
+
+         num_groups_w13 = hidden_size // self.quant_config.group_size
+         num_groups_w2 = (intermediate_size_per_partition //
+                          self.quant_config.group_size)
+
+         # WEIGHT_SCALES
+         # Allocate 2 scales for w1 and w3 respectively.
+         w13_scales = Parameter(torch.empty(num_experts,
+                                            num_groups_w13,
+                                            intermediate_size_per_partition * 2,
+                                            dtype=params_dtype),
+                                requires_grad=False)
+         layer.register_parameter("w13_scales", w13_scales)
+         set_weight_attrs(w13_scales, extra_weight_attrs)
+
+         w2_scales = Parameter(torch.empty(num_experts,
+                                           num_groups_w2,
+                                           hidden_size,
+                                           dtype=params_dtype),
+                               requires_grad=False)
+         layer.register_parameter("w2_scales", w2_scales)
+         set_weight_attrs(w2_scales, extra_weight_attrs)
+
+         # WEIGHT_ZERO_POINT
+         # Allocate 2 zero points for w1 and w3 respectively.
+         w13_qzeros = Parameter(
+             torch.empty(num_experts,
+                         num_groups_w13,
+                         2 * intermediate_size_per_partition //
+                         self.quant_config.pack_factor,
+                         dtype=torch.int32),
+             requires_grad=False)
+         layer.register_parameter("w13_qzeros", w13_qzeros)
+         set_weight_attrs(w13_qzeros, extra_weight_attrs)
+
+         w2_qzeros = Parameter(torch.empty(num_experts,
+                                           num_groups_w2,
+                                           hidden_size //
+                                           self.quant_config.pack_factor,
+                                           dtype=torch.int32),
+                               requires_grad=False)
+         layer.register_parameter("w2_qzeros", w2_qzeros)
+         set_weight_attrs(w2_qzeros, extra_weight_attrs)
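+
+     # MoE weight shape sketch (hypothetical: num_experts=8, hidden=4096,
+     # intermediate per partition=14336, 4-bit, group_size=128):
+     #   w13_qweight: [8, 4096, 3584]   w2_qweight: [8, 14336, 512]
+     #   w13_scales:  [8, 32, 28672]    w2_scales:  [8, 112, 4096]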
+
+     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+         num_experts = layer.w13_qweight.shape[0]
+         device = layer.w13_qweight.device
+
+         layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+             torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+             requires_grad=False,
+         )
+         layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+             torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+             requires_grad=False,
+         )
+
+         marlin_w13_qweight = ops.awq_marlin_moe_repack(
+             layer.w13_qweight,
+             layer.w13_g_idx_sort_indices,
+             size_k=layer.w13_qweight.shape[1],
+             size_n=layer.w13_qweight.shape[2] * self.quant_config.pack_factor,
+             num_bits=self.quant_config.weight_bits,
+         )
+         replace_parameter(layer, "w13_qweight", marlin_w13_qweight)
+
+         marlin_w2_qweight = ops.awq_marlin_moe_repack(
+             layer.w2_qweight,
+             layer.w2_g_idx_sort_indices,
+             size_k=layer.w2_qweight.shape[1],
+             size_n=layer.w2_qweight.shape[2] * self.quant_config.pack_factor,
+             num_bits=self.quant_config.weight_bits,
+         )
+         replace_parameter(layer, "w2_qweight", marlin_w2_qweight)
+
+         # Why does this take the intermediate size for size_k?
+         marlin_w13_scales = marlin_moe_permute_scales(
+             s=layer.w13_scales,
+             size_k=layer.intermediate_size_per_partition,
+             size_n=layer.w13_scales.shape[2],
+             group_size=self.quant_config.group_size,
+         )
+
+         replace_parameter(layer, "w13_scales", marlin_w13_scales)
+
+         marlin_w2_scales = marlin_moe_permute_scales(
+             s=layer.w2_scales,
+             size_k=layer.intermediate_size_per_partition,
+             size_n=layer.w2_scales.shape[2],
+             group_size=self.quant_config.group_size,
+         )
+         replace_parameter(layer, "w2_scales", marlin_w2_scales)
+
+         marlin_w13_zp = moe_awq_to_marlin_zero_points(
+             layer.w13_qzeros,
+             size_k=layer.w13_qzeros.shape[1],
+             size_n=layer.w13_qzeros.shape[2] * self.quant_config.pack_factor,
+             num_bits=self.quant_config.weight_bits)
+         replace_parameter(layer, "w13_qzeros", marlin_w13_zp)
+
+         marlin_w2_zp = moe_awq_to_marlin_zero_points(
+             layer.w2_qzeros,
+             size_k=layer.w2_qzeros.shape[1],
+             size_n=layer.w2_qzeros.shape[2] * self.quant_config.pack_factor,
+             num_bits=self.quant_config.weight_bits)
+         replace_parameter(layer, "w2_qzeros", marlin_w2_zp)
+
+     def apply(
+         self,
+         layer: torch.nn.Module,
+         x: torch.Tensor,
+         router_logits: torch.Tensor,
+         top_k: int,
+         renormalize: bool,
+         use_grouped_topk: bool = False,
+         topk_group: Optional[int] = None,
+         num_expert_group: Optional[int] = None,
+         custom_routing_function: Optional[Callable] = None,
+         scoring_func: str = "softmax",
+         e_score_correction_bias: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         topk_weights, topk_ids = FusedMoE.select_experts(
+             hidden_states=x,
+             router_logits=router_logits,
+             use_grouped_topk=use_grouped_topk,
+             top_k=top_k,
+             renormalize=renormalize,
+             topk_group=topk_group,
+             num_expert_group=num_expert_group,
+             custom_routing_function=custom_routing_function,
+             scoring_func=scoring_func,
+             e_score_correction_bias=e_score_correction_bias)
+
+         return torch.ops.vllm.fused_marlin_moe(
+             x,
+             layer.w13_qweight,
+             layer.w2_qweight,
+             layer.w13_scales,
+             layer.w2_scales,
+             router_logits,
+             topk_weights,
+             topk_ids,
+             w1_zeros=layer.w13_qzeros,
+             w2_zeros=layer.w2_qzeros,
+             num_bits=self.quant_config.weight_bits,
+         )
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/awq_triton.py ADDED
@@ -0,0 +1,319 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ import torch
+ import triton
+ import triton.language as tl
+
+ AWQ_TRITON_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
+
+
+ @triton.jit
+ def awq_dequantize_kernel(
+         qweight_ptr,  # quantized matrix
+         scales_ptr,  # scales, per group
+         zeros_ptr,  # zeros, per group
+         group_size,  # Should always be one of the supported group sizes
+         result_ptr,  # Output matrix
+         num_cols,  # input num cols in qweight
+         num_rows,  # input num rows in qweight
+         BLOCK_SIZE_X: tl.constexpr,
+         BLOCK_SIZE_Y: tl.constexpr):
+     # Set up the program ids.
+     pid_x = tl.program_id(axis=0)
+     pid_y = tl.program_id(axis=1)
+
+     # Compute offsets and masks for qweight_ptr.
+     offsets_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y)
+     offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X)
+     offsets = num_cols * offsets_y[:, None] + offsets_x[None, :]
+
+     masks_y = offsets_y < num_rows
+     masks_x = offsets_x < num_cols
+
+     masks = masks_y[:, None] & masks_x[None, :]
+
+     # Compute offsets and masks for the result output ptr.
+     result_offsets_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y)
+     result_offsets_x = pid_x * BLOCK_SIZE_X * 8 + tl.arange(
+         0, BLOCK_SIZE_X * 8)
+     result_offsets = (8 * num_cols * result_offsets_y[:, None] +
+                       result_offsets_x[None, :])
+
+     result_masks_y = result_offsets_y < num_rows
+     result_masks_x = result_offsets_x < num_cols * 8
+     result_masks = result_masks_y[:, None] & result_masks_x[None, :]
+
+     # Load the weights.
+     iweights = tl.load(qweight_ptr + offsets, masks, 0.0)
+     iweights = tl.interleave(iweights, iweights)
+     iweights = tl.interleave(iweights, iweights)
+     iweights = tl.interleave(iweights, iweights)
+
+     # Create the reverse AWQ order as a tensor: [0, 4, 1, 5, 2, 6, 3, 7],
+     # which maps given indices to the correct order.
+     reverse_awq_order_tensor = ((tl.arange(0, 2) * 4)[None, :] +
+                                 tl.arange(0, 4)[:, None]).reshape(8)
+
+     # Use this to compute a set of shifts that can be used to unpack and
+     # reorder the values in iweights and zeros.
+     shifts = reverse_awq_order_tensor * 4
+     shifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_Y * BLOCK_SIZE_X, 8))
+     shifts = tl.reshape(shifts, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
+
+     # Unpack and reorder: shift out the correct 4-bit value and mask.
+     iweights = (iweights >> shifts) & 0xF
+
+     # Compute zero offsets and masks.
+     zero_offsets_y = pid_y * BLOCK_SIZE_Y // group_size + tl.arange(0, 1)
+     zero_offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X)
+     zero_offsets = num_cols * zero_offsets_y[:, None] + zero_offsets_x[None, :]
+
+     zero_masks_y = zero_offsets_y < num_rows // group_size
+     zero_masks_x = zero_offsets_x < num_cols
+     zero_masks = zero_masks_y[:, None] & zero_masks_x[None, :]
+
+     # Load the zeros.
+     zeros = tl.load(zeros_ptr + zero_offsets, zero_masks, 0.0)
+     zeros = tl.interleave(zeros, zeros)
+     zeros = tl.interleave(zeros, zeros)
+     zeros = tl.interleave(zeros, zeros)
+     zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
+
+     # Unpack and reorder: shift out the correct 4-bit value and mask.
+     zeros = (zeros >> shifts) & 0xF
+
+     # Compute scale offsets and masks.
+     scale_offsets_y = pid_y * BLOCK_SIZE_Y // group_size + tl.arange(0, 1)
+     scale_offsets_x = (pid_x * BLOCK_SIZE_X * 8 +
+                        tl.arange(0, BLOCK_SIZE_X * 8))
+     scale_offsets = (num_cols * 8 * scale_offsets_y[:, None] +
+                      scale_offsets_x[None, :])
+     scale_masks_y = scale_offsets_y < num_rows // group_size
+     scale_masks_x = scale_offsets_x < num_cols * 8
+     scale_masks = scale_masks_y[:, None] & scale_masks_x[None, :]
+
+     # Load the scales.
+     scales = tl.load(scales_ptr + scale_offsets, scale_masks, 0.0)
+     scales = tl.broadcast_to(scales, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
+
+     # Dequantize.
+     iweights = (iweights - zeros) * scales
+     iweights = iweights.to(result_ptr.type.element_ty)
+
+     # Finally, store.
+     tl.store(result_ptr + result_offsets, iweights, result_masks)
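+
+ # Reference (non-Triton) sketch of the unpack/reorder performed above,
+ # assuming one int32 word packs eight 4-bit values in AWQ's interleaved
+ # order (hypothetical helper, not part of this module):
+ #
+ #   AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
+ #   def unpack_awq_word(word: int) -> list[int]:
+ #       return [(word >> (4 * i)) & 0xF for i in AWQ_REVERSE_ORDER]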
+
+
+ @triton.jit
+ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
+                     group_size, BLOCK_SIZE_M: tl.constexpr,
+                     BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
+                     SPLIT_K: tl.constexpr):
+     pid = tl.program_id(axis=0)
+     pid_z = tl.program_id(1)
+
+     # NOTE: This doesn't work in TRITON_INTERPRET=1 mode. Use below instead.
+     # num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
+     num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+
+     pid_m = pid // num_pid_n
+     pid_n = pid % num_pid_n
+
+     accumulator_dtype = c_ptr.type.element_ty
+
+     # NOTE: This doesn't work in TRITON_INTERPRET=1 mode. Use below instead.
+     # accumulator = tl.arange(0, BLOCK_SIZE_N)
+     # accumulator = tl.broadcast_to(accumulator[None, :],
+     #                               (BLOCK_SIZE_M, BLOCK_SIZE_N))
+     # accumulator = accumulator & 0x0
+     # accumulator = accumulator.to(accumulator_dtype)
+     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N),
+                            dtype=accumulator_dtype)
+
+     # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
+     # that will map given indices to the correct order.
+     reverse_awq_order_tensor = ((tl.arange(0, 2) * 4)[None, :] +
+                                 tl.arange(0, 4)[:, None]).reshape(8)
+
+     # Create the necessary shifts to use to unpack.
+     shifts = reverse_awq_order_tensor * 4
+     shifts = tl.broadcast_to(shifts[None, :],
+                              (BLOCK_SIZE_K * (BLOCK_SIZE_N // 8), 8))
+     shifts = tl.reshape(shifts, (BLOCK_SIZE_K, BLOCK_SIZE_N))
+
+     # Offsets and masks.
+     offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+     masks_am = offsets_am < M
+
+     offsets_bn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8)
+     masks_bn = offsets_bn < N // 8
+
+     offsets_zn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8)
+     masks_zn = offsets_zn < N // 8
+
+     offsets_sn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+     masks_sn = offsets_sn < N
+
+     offsets_k = pid_z * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
+     offsets_a = K * offsets_am[:, None] + offsets_k[None, :]
+     offsets_b = (N // 8) * offsets_k[:, None] + offsets_bn[None, :]
+
+     a_ptrs = a_ptr + offsets_a
+     b_ptrs = b_ptr + offsets_b
+
+     # NOTE: Use this in TRITON_INTERPRET=1 mode instead of tl.cdiv:
+     # block_offset = BLOCK_SIZE_K * SPLIT_K
+     # for k in range(0, (K + block_offset - 1) // (block_offset)):
+     for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)):
+         masks_k = offsets_k < K
+         masks_a = masks_am[:, None] & masks_k[None, :]
+         a = tl.load(a_ptrs, mask=masks_a, other=0.0)
+
+         masks_b = masks_k[:, None] & masks_bn[None, :]
+         b = tl.load(b_ptrs, mask=masks_b, other=0.0)
+         b = tl.interleave(b, b)
+         b = tl.interleave(b, b)
+         b = tl.interleave(b, b)
+
+         # Dequantize b.
+         offsets_szk = (
+             (BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K) // group_size +
+             tl.arange(0, 1))
+         offsets_z = (N // 8) * offsets_szk[:, None] + offsets_zn[None, :]
+         masks_zk = offsets_szk < K // group_size
+         masks_z = masks_zk[:, None] & masks_zn[None, :]
+         zeros_ptrs = zeros_ptr + offsets_z
+         zeros = tl.load(zeros_ptrs, mask=masks_z, other=0.0)
+         zeros = tl.interleave(zeros, zeros)
+         zeros = tl.interleave(zeros, zeros)
+         zeros = tl.interleave(zeros, zeros)
+         zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N))
+
+         offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :]
+         masks_sk = offsets_szk < K // group_size
+         masks_s = masks_sk[:, None] & masks_sn[None, :]
+         scales_ptrs = scales_ptr + offsets_s
+         scales = tl.load(scales_ptrs, mask=masks_s, other=0.0)
+         scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N))
+
+         b = (b >> shifts) & 0xF
+         zeros = (zeros >> shifts) & 0xF
+         b = (b - zeros) * scales
+         b = b.to(c_ptr.type.element_ty)
+
+         # Accumulate results.
+         accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype)
+
+         offsets_k += BLOCK_SIZE_K * SPLIT_K
+         a_ptrs += BLOCK_SIZE_K * SPLIT_K
+         b_ptrs += BLOCK_SIZE_K * SPLIT_K * (N // 8)
+
+     c = accumulator.to(c_ptr.type.element_ty)
+     offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+     c_ptrs = c_ptr + pid_z * N * M + N * offs_cm[:, None] + offs_cn[None, :]
+     c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+     tl.store(c_ptrs, c, mask=c_mask)
+
+
+ # qweights - [K     , M // 8], int32
+ # scales   - [K // G, M     ], float16
+ # zeros    - [K // G, M // 8], int32
+ def awq_dequantize_triton(qweight: torch.Tensor,
+                           scales: torch.Tensor,
+                           zeros: torch.Tensor,
+                           block_size_x: int = 32,
+                           block_size_y: int = 32) -> torch.Tensor:
+     K = qweight.shape[0]
+     M = scales.shape[1]
+     group_size = qweight.shape[0] // scales.shape[0]
+
+     assert K > 0 and M > 0
+     assert scales.shape[0] == K // group_size and scales.shape[1] == M
+     assert zeros.shape[0] == K // group_size and zeros.shape[1] == M // 8
+     assert group_size <= K
+     assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K
+
+     # Result tensor:
+     # number of rows = same as input tensor
+     # number of cols = 8 x input tensor num cols
+     result = torch.empty(qweight.shape[0],
+                          qweight.shape[1] * 8,
+                          device=qweight.device,
+                          dtype=scales.dtype)
+
+     Y = qweight.shape[0]  # num rows
+     X = qweight.shape[1]  # num cols
+
+     grid = lambda META: (
+         triton.cdiv(X, META['BLOCK_SIZE_X']),
+         triton.cdiv(Y, META['BLOCK_SIZE_Y']),
+     )
+     awq_dequantize_kernel[grid](qweight,
+                                 scales,
+                                 zeros,
+                                 group_size,
+                                 result,
+                                 X,
+                                 Y,
+                                 BLOCK_SIZE_X=block_size_x,
+                                 BLOCK_SIZE_Y=block_size_y)
+
+     return result
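+
+ # Shape example (hypothetical): qweight [4096, 512] int32 with
+ # scales [32, 4096] -> result [4096, 4096] in scales.dtype;
+ # group_size is inferred as 4096 // 32 = 128.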
+
+
+ # input         - [M, K]
+ # qweight       - [K, N // 8]
+ # qzeros        - [K // G, N // 8]
+ # scales        - [K // G, N]
+ # split_k_iters - parallelism along the K dimension, int, power of 2.
+ def awq_gemm_triton(input: torch.Tensor,
+                     qweight: torch.Tensor,
+                     scales: torch.Tensor,
+                     qzeros: torch.Tensor,
+                     split_k_iters: int,
+                     block_size_m: int = 32,
+                     block_size_n: int = 32,
+                     block_size_k: int = 32) -> torch.Tensor:
+     M, K = input.shape
+     N = qweight.shape[1] * 8
+     group_size = qweight.shape[0] // qzeros.shape[0]
+
+     assert N > 0 and K > 0 and M > 0
+     assert qweight.shape[0] == K and qweight.shape[1] == N // 8
+     assert qzeros.shape[0] == K // group_size and qzeros.shape[1] == N // 8
+     assert scales.shape[0] == K // group_size and scales.shape[1] == N
+     assert split_k_iters & (split_k_iters - 1) == 0 and split_k_iters != 0
+     assert split_k_iters <= 32
+     assert group_size <= K
+     assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K
+
+     grid = lambda META: (
+         triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
+             N, META['BLOCK_SIZE_N']),
+         split_k_iters,
+     )
+
+     result = torch.zeros((split_k_iters, M, N),
+                          dtype=scales.dtype,
+                          device=input.device)
+
+     # A = input, B = qweight, C = result
+     # A = M x K, B = K x N, C = M x N
+     awq_gemm_kernel[grid](input,
+                           qweight,
+                           result,
+                           qzeros,
+                           scales,
+                           M,
+                           N,
+                           K,
+                           group_size,
+                           BLOCK_SIZE_M=block_size_m,
+                           BLOCK_SIZE_N=block_size_n,
+                           BLOCK_SIZE_K=block_size_k,
+                           SPLIT_K=split_k_iters)
+
+     # Reduce the split-K partial sums.
+     result = result.sum(0)
+
+     return result
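+
+ # Usage sketch (hypothetical sizes; split_k_iters must be a power of two):
+ #   x = torch.randn(16, 4096, dtype=torch.float16, device="cuda")
+ #   out = awq_gemm_triton(x, qweight, scales, qzeros, split_k_iters=8)
+ # with qweight [4096, 512] int32, scales [32, 4096] fp16 and
+ # qzeros [32, 512] int32 (group_size 128), out is [16, 4096].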
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/base_config.py ADDED
@@ -0,0 +1,141 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ import inspect
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List, Mapping, Optional, Type
+
+ import torch
+ from torch import nn
+
+
+ class QuantizeMethodBase(ABC):
+     """Base class for different quantized methods."""
+
+     @abstractmethod
+     def create_weights(self, layer: torch.nn.Module, *weight_args,
+                        **extra_weight_attrs):
+         """Create weights for a layer.
+
+         The weights will be set as attributes of the layer."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def apply(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor:
+         """Apply the weights in layer to the input tensor.
+
+         Expects create_weights to have been called before on the layer."""
+         raise NotImplementedError
+
+     # Not required functions
+     def embedding(self, layer: torch.nn.Module, *args,
+                   **kwargs) -> torch.Tensor:
+         """Gather embeddings in the layer based on indices in the input tensor.
+
+         Expects create_weights to have been called before on the layer."""
+         raise NotImplementedError
+
+     def process_weights_after_loading(self, layer: nn.Module) -> None:
+         """Process the weight after loading.
+
+         This can be used, for example, to transpose weights for computation.
+         """
+         return
+
+
+ def method_has_implemented_embedding(
+         method_class: Type[QuantizeMethodBase]) -> bool:
+     """
+     Not all quant methods have embedding implemented, so we need to check that
+     it exists for our given method. We check this by making sure the function
+     has been changed from the base implementation.
+     """
+     base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding",
+                                             None)
+     class_embedding = inspect.getattr_static(method_class, "embedding", None)
+
+     return (class_embedding is not None
+             and class_embedding is not base_embedding)
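+
+ # e.g. method_has_implemented_embedding(AWQLinearMethod) -> False, since
+ # AWQLinearMethod inherits the base `embedding` unchanged; a method class
+ # that overrides `embedding` would return True.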
+
+
+ class QuantizationConfig(ABC):
+     """Base class for quantization configs."""
+     packed_modules_mapping: Mapping[str, List[str]] = dict()
+
+     @abstractmethod
+     def get_name(self) -> str:
+         """Name of the quantization method."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def get_supported_act_dtypes(self) -> List[torch.dtype]:
+         """List of supported activation dtypes."""
+         raise NotImplementedError
+
+     @classmethod
+     @abstractmethod
+     def get_min_capability(cls) -> int:
+         """Minimum GPU capability to support the quantization method.
+
+         E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
+         This requirement is due to the custom CUDA kernels used by the
+         quantization method.
+         """
+         raise NotImplementedError
+
+     @staticmethod
+     @abstractmethod
+     def get_config_filenames() -> List[str]:
+         """List of filenames to search for in the model directory."""
+         raise NotImplementedError
+
+     @classmethod
+     @abstractmethod
+     def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig":
+         """Create a config class from the model's quantization config."""
+         raise NotImplementedError
+
+     @classmethod
+     def override_quantization_method(cls, hf_quant_cfg,
+                                      user_quant) -> Optional[str]:
+         """
+         Detects if this quantization method can support a given checkpoint
+         format by overriding the user-specified quantization method --
+         this method should only be overwritten by subclasses in exceptional
+         circumstances.
+         """
+         return None
+
+     @staticmethod
+     def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any:
+         """Get a value from the model's quantization config."""
+         for key in keys:
+             if key in config:
+                 return config[key]
+         raise ValueError(f"Cannot find any of {keys} in the model's "
+                          "quantization config.")
+
+     @staticmethod
+     def get_from_keys_or(config: Dict[str, Any], keys: List[str],
+                          default: Any) -> Any:
+         """Get an optional value from the model's quantization config."""
+         try:
+             return QuantizationConfig.get_from_keys(config, keys)
+         except ValueError:
+             return default
+
+     @abstractmethod
+     def get_quant_method(self, layer: torch.nn.Module,
+                          prefix: str) -> Optional[QuantizeMethodBase]:
+         """Get the quantize method to use for the quantized layer.
+
+         Args:
+             layer: The layer for the quant method.
+             prefix: The full name of the layer in the state dict.
+         Returns:
+             The quantize method. None if the given layer doesn't support
+             the quant method.
+         """
+         raise NotImplementedError
+
+     def get_cache_scale(self, name: str) -> Optional[str]:
+         return None
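+
+ # Minimal subclass sketch (hypothetical "nullquant" method) showing the
+ # abstract surface a new quantization config must implement:
+ #
+ #   class NullQuantConfig(QuantizationConfig):
+ #       def get_name(self) -> str:
+ #           return "nullquant"
+ #       def get_supported_act_dtypes(self) -> List[torch.dtype]:
+ #           return [torch.half]
+ #       @classmethod
+ #       def get_min_capability(cls) -> int:
+ #           return 70
+ #       @staticmethod
+ #       def get_config_filenames() -> List[str]:
+ #           return []
+ #       @classmethod
+ #       def from_config(cls, config: Dict[str, Any]) -> "NullQuantConfig":
+ #           return cls()
+ #       def get_quant_method(self, layer: torch.nn.Module,
+ #                            prefix: str) -> Optional[QuantizeMethodBase]:
+ #           return None  # fall back to the unquantized path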
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/bitsandbytes.py ADDED
@@ -0,0 +1,359 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ from typing import Any, Dict, List, Optional
+
+ import torch
+
+ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                                UnquantizedLinearMethod,
+                                                set_weight_attrs)
+ from vllm.model_executor.layers.quantization.base_config import (
+     QuantizationConfig)
+
+
+ class BitsAndBytesConfig(QuantizationConfig):
+     """Config class for BitsAndBytes Quantization.
+
+     Reference: https://arxiv.org/abs/2305.14314
+     """
+
+     def __init__(
+         self,
+         load_in_8bit: bool = False,
+         load_in_4bit: bool = True,
+         bnb_4bit_compute_dtype: str = "float32",
+         bnb_4bit_quant_storage: str = "uint8",
+         bnb_4bit_quant_type: str = "fp4",
+         bnb_4bit_use_double_quant: bool = False,
+         llm_int8_enable_fp32_cpu_offload: bool = False,
+         llm_int8_has_fp16_weight: bool = False,
+         llm_int8_skip_modules: Optional[List[str]] = None,
+         llm_int8_threshold: float = 6.0,
+     ) -> None:
+
+         self.load_in_8bit = load_in_8bit
+         self.load_in_4bit = load_in_4bit
+         self.bnb_4bit_compute_dtype = bnb_4bit_compute_dtype
+         self.bnb_4bit_quant_storage = bnb_4bit_quant_storage
+         self.bnb_4bit_quant_type = bnb_4bit_quant_type
+         self.bnb_4bit_use_double_quant = bnb_4bit_use_double_quant
+         self.llm_int8_enable_fp32_cpu_offload = llm_int8_enable_fp32_cpu_offload
+         self.llm_int8_has_fp16_weight = llm_int8_has_fp16_weight
+         self.llm_int8_skip_modules = llm_int8_skip_modules or []
+         self.llm_int8_threshold = llm_int8_threshold
+
+         if self.bnb_4bit_quant_storage not in ["uint8"]:
+             raise ValueError("Unsupported bnb_4bit_quant_storage: "
+                              f"{self.bnb_4bit_quant_storage}")
+
+     def __repr__(self) -> str:
+         return (f"BitsAndBytesConfig(load_in_8bit={self.load_in_8bit}, "
+                 f"load_in_4bit={self.load_in_4bit}, "
+                 f"bnb_4bit_compute_dtype={self.bnb_4bit_compute_dtype}, "
+                 f"bnb_4bit_quant_storage={self.bnb_4bit_quant_storage}, "
+                 f"bnb_4bit_quant_type={self.bnb_4bit_quant_type}, "
+                 f"llm_int8_skip_modules={self.llm_int8_skip_modules})")
+
+     @classmethod
+     def get_name(cls) -> str:
+         return "bitsandbytes"
+
+     @classmethod
+     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+         return [torch.float32, torch.float16, torch.bfloat16]
+
+     @classmethod
+     def get_min_capability(cls) -> int:
+         return 70
+
+     @staticmethod
+     def get_config_filenames() -> List[str]:
+         return [
+             "adapter_config.json",
+         ]
+
+     @classmethod
+     def from_config(cls, config: Dict[str, Any]) -> "BitsAndBytesConfig":
+
+         def get_safe_value(config, keys, default_value=None):
+             try:
+                 value = cls.get_from_keys(config, keys)
+                 return value if value is not None else default_value
+             except ValueError:
+                 return default_value
+
+         load_in_8bit = get_safe_value(config, ["load_in_8bit"],
+                                       default_value=False)
+         load_in_4bit = get_safe_value(config, ["load_in_4bit"],
+                                       default_value=True)
+         bnb_4bit_compute_dtype = get_safe_value(config,
+                                                 ["bnb_4bit_compute_dtype"],
+                                                 default_value="float32")
+         bnb_4bit_quant_storage = get_safe_value(config,
+                                                 ["bnb_4bit_quant_storage"],
+                                                 default_value="uint8")
+         bnb_4bit_quant_type = get_safe_value(config, ["bnb_4bit_quant_type"],
+                                              default_value="fp4")
+         bnb_4bit_use_double_quant = get_safe_value(
+             config, ["bnb_4bit_use_double_quant"], default_value=False)
+         llm_int8_enable_fp32_cpu_offload = get_safe_value(
+             config, ["llm_int8_enable_fp32_cpu_offload"], default_value=False)
+         llm_int8_has_fp16_weight = get_safe_value(config,
+                                                   ["llm_int8_has_fp16_weight"],
+                                                   default_value=False)
+         llm_int8_skip_modules = get_safe_value(config,
+                                                ["llm_int8_skip_modules"],
+                                                default_value=[])
+         llm_int8_threshold = get_safe_value(config, ["llm_int8_threshold"],
+                                             default_value=6.0)
+
+         return cls(
+             load_in_8bit=load_in_8bit,
+             load_in_4bit=load_in_4bit,
+             bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
+             bnb_4bit_quant_storage=bnb_4bit_quant_storage,
+             bnb_4bit_quant_type=bnb_4bit_quant_type,
+             bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
+             llm_int8_enable_fp32_cpu_offload=llm_int8_enable_fp32_cpu_offload,
+             llm_int8_has_fp16_weight=llm_int8_has_fp16_weight,
+             llm_int8_skip_modules=llm_int8_skip_modules,
+             llm_int8_threshold=llm_int8_threshold)
+
+     def get_quant_method(self, layer: torch.nn.Module,
+                          prefix: str) -> Optional["LinearMethodBase"]:
+         if isinstance(layer, LinearBase):
+             if is_layer_skipped_bnb(prefix, self.llm_int8_skip_modules):
+                 return UnquantizedLinearMethod()
+             return BitsAndBytesLinearMethod(self)
+         return None
+
+
+ def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: List[str]):
+     # Split the prefix into its dot-separated components
+     components = prefix.split('.')
+
+     # Check if any of the skip modules exactly matches any component
+     return any(module_name in components
+                for module_name in llm_int8_skip_modules)
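+
+ # e.g. is_layer_skipped_bnb("model.layers.0.lm_head", ["lm_head"]) -> True,
+ # while "model.lm_head_proj" is not skipped: unlike the substring match in
+ # is_layer_skipped_awq, only an exact dot-separated component counts here.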
+
+
+ class BitsAndBytesLinearMethod(LinearMethodBase):
+     """Linear method for BitsAndBytes.
+
+     Args:
+         quant_config: The BitsAndBytes quantization config.
+     """
+
+     def __init__(self, quant_config: BitsAndBytesConfig):
+         try:
+             import bitsandbytes
+             if bitsandbytes.__version__ < "0.45.0":
+                 raise ImportError("bitsandbytes version is too old. Please "
+                                   "install bitsandbytes>=0.45.0.")
+         except ImportError as err:
+             raise ImportError("Please install bitsandbytes>=0.45.0 via "
+                               "`pip install bitsandbytes>=0.45.0` to use "
+                               "the bitsandbytes quantizer.") from err
+
+         self.quant_config = quant_config
+
+     def create_weights(self, layer: torch.nn.Module,
+                        input_size_per_partition: int,
+                        output_partition_sizes: List[int], input_size: int,
+                        output_size: int, params_dtype: torch.dtype,
+                        **extra_weight_attrs):
+         from bitsandbytes.nn import Int8Params
+
+         def calculate_quant_ratio(dtype):
+             if dtype.is_floating_point:
+                 return torch.finfo(dtype).bits // torch.iinfo(torch.uint8).bits
+             else:
+                 return torch.iinfo(dtype).bits // torch.iinfo(torch.uint8).bits
+
+         def create_qweight_for_8bit():
+             qweight = Int8Params(
+                 data=torch.empty(sum(output_partition_sizes),
+                                  input_size_per_partition,
+                                  dtype=torch.int8),
+                 has_fp16_weights=self.quant_config.llm_int8_has_fp16_weight,
+                 requires_grad=False)
+             set_weight_attrs(
+                 qweight, {
+                     "input_dim": 0,
+                     "output_dim": 0,
+                     "pack_factor": 1,
+                     "use_bitsandbytes_8bit": True,
+                     "generation": 0
+                 })
+             return qweight
+
+         def create_qweight_for_4bit():
+             quant_ratio = calculate_quant_ratio(params_dtype)
+
+             total_size = input_size_per_partition * sum(output_partition_sizes)
+             if total_size % quant_ratio != 0:
+                 raise ValueError(
+                     "The input size is not aligned with the quantized "
+                     "weight shape.")
+
+             qweight = torch.nn.Parameter(torch.empty(total_size // quant_ratio,
+                                                      1,
+                                                      dtype=torch.uint8),
+                                          requires_grad=False)
+             set_weight_attrs(
+                 qweight, {
+                     "input_dim": 0,
+                     "output_dim": 0,
+                     "pack_factor": quant_ratio,
+                     "use_bitsandbytes_4bit": True
+                 })
+             return qweight
+
+         if self.quant_config.load_in_8bit:
+             qweight = create_qweight_for_8bit()
+         else:
+             qweight = create_qweight_for_4bit()
+         # Enable parameters to have the same name as in the BNB
+         # checkpoint format.
+         layer.register_parameter("weight", qweight)
+         set_weight_attrs(qweight, extra_weight_attrs)
220
+
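For a 16-bit `params_dtype`, `calculate_quant_ratio` returns 16 // 8 = 2, so the flat 4-bit buffer packs two original elements per `uint8`. A quick sketch of the shape arithmetic (illustrative sizes, not from the commit):

```python
import torch

params_dtype = torch.float16
quant_ratio = torch.finfo(params_dtype).bits // torch.iinfo(torch.uint8).bits
assert quant_ratio == 2

# e.g. a fused QKV projection: input 4096, three output shards of 4096
input_size_per_partition = 4096
output_partition_sizes = [4096, 4096, 4096]
total_size = input_size_per_partition * sum(output_partition_sizes)

# The packed 4-bit weight is a flat (total_size // 2, 1) uint8 tensor.
qweight = torch.empty(total_size // quant_ratio, 1, dtype=torch.uint8)
assert qweight.shape == (total_size // 2, 1)
```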
    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:

        if self.quant_config.load_in_8bit:
            return self._apply_8bit_weight(layer, x, bias)
        else:
            return self._apply_4bit_weight(layer, x, bias)

    def _apply_8bit_weight(
            self,
            layer: torch.nn.Module,
            x: torch.Tensor,
            bias: Optional[torch.Tensor] = None) -> torch.Tensor:

        # only load the bitsandbytes module when needed
        from bitsandbytes import MatmulLtState, matmul

        original_type = x.dtype
        original_shape = x.shape
        reshape_after_matmul = False
        if x.ndim > 2:
            x = x.reshape(-1, x.size(-1))
            reshape_after_matmul = True
        bf_x = x.to(torch.bfloat16)

        qweight = layer.weight
        offsets = qweight.bnb_shard_offsets
        quant_states = qweight.bnb_quant_state
        matmul_states = qweight.matmul_state
        generation = qweight.generation

        out_dim_0 = x.shape[0]
        out_dim_1 = sum(
            [quant_state[1].shape[0] for quant_state in quant_states.items()])
        out = torch.empty(out_dim_0,
                          out_dim_1,
                          dtype=torch.float16,
                          device=x.device)

        current_index = 0
        for i in range(len(quant_states)):
            output_size = quant_states[i].shape[0]

            # in profile_run or the first generation of inference,
            # create new matmul_states
            if generation == 0 or generation == 1:
                matmul_states[i] = MatmulLtState()
                matmul_states[i].CB = qweight[offsets[i]:offsets[i + 1]]
                matmul_states[i].SCB = quant_states[i].to(x.device)
                matmul_states[i].threshold = (
                    self.quant_config.llm_int8_threshold)
                matmul_states[i].has_fp16_weights = (
                    self.quant_config.llm_int8_has_fp16_weight)
                matmul_states[i].is_training = False
                if matmul_states[i].threshold > 0.0 and not matmul_states[
                        i].has_fp16_weights:
                    matmul_states[i].use_pool = True

            new_x = bf_x.unsqueeze(0)

            out[:, current_index:current_index + output_size] = matmul(
                new_x,
                qweight[offsets[i]:offsets[i + 1]],
                state=matmul_states[i])

            current_index += output_size

            # only update the matmul_states if it is not profile_run
            if (generation > 0
                    and not self.quant_config.llm_int8_has_fp16_weight
                    and matmul_states[i].CB is not None
                    and matmul_states[i].CxB is not None):
                del matmul_states[i].CB
                qweight[offsets[i]:offsets[i + 1]] = matmul_states[i].CxB

        out = out.to(original_type)

        if reshape_after_matmul:
            out = out.view(*original_shape[:-1], out.size(-1))

        if bias is not None:
            out += bias

        qweight.generation += 1

        return out

    def _apply_4bit_weight(
            self,
            layer: torch.nn.Module,
            x: torch.Tensor,
            bias: Optional[torch.Tensor] = None) -> torch.Tensor:

        # only load the bitsandbytes module when needed
        from bitsandbytes import matmul_4bit

        original_type = x.dtype
        original_shape = x.shape
        reshape_after_matmul = False
        if x.ndim > 2:
            x = x.reshape(-1, x.size(-1))
            reshape_after_matmul = True
        bf_x = x.to(torch.bfloat16)

        qweight = layer.weight
        quant_states = qweight.bnb_quant_state
        offsets = qweight.bnb_shard_offsets

        out_dim_0 = x.shape[0]
        out_dim_1 = sum(
            [quant_state[1].shape[0] for quant_state in quant_states.items()])
        out = torch.empty(out_dim_0,
                          out_dim_1,
                          dtype=torch.bfloat16,
                          device=x.device)

        current_index = 0
        for i in range(len(quant_states)):
            output_size = quant_states[i].shape[0]
            # It is more efficient to use out kwarg like
            # matmul_4bit(..., out = ...). Infeasible now due to the bug
            # https://github.com/TimDettmers/bitsandbytes/issues/1235.
            # Need to change after the bug is fixed.
            out[:, current_index:current_index + output_size] = matmul_4bit(
                bf_x, qweight[offsets[i]:offsets[i + 1]].t(), quant_states[i])

            current_index += output_size

        out = out.to(original_type)

        if reshape_after_matmul:
            out = out.view(*original_shape[:-1], out.size(-1))

        if bias is not None:
            out += bias

        return out
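Both apply paths flatten higher-rank activations to 2-D before the sharded matmuls and restore the shape afterwards. A standalone sketch of that reshape pattern (a plain matmul stands in for the quantized kernel; illustrative only):

```python
import torch

def matmul_flatten_last_dim(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    """Flatten leading dims to 2-D, matmul, then restore the shape -- the
    same pattern used by _apply_8bit_weight / _apply_4bit_weight above."""
    original_shape = x.shape
    if x.ndim > 2:
        x = x.reshape(-1, x.size(-1))
    out = x @ w.t()  # stand-in for the quantized matmul
    return out.view(*original_shape[:-1], out.size(-1))

x = torch.randn(2, 5, 8)   # (batch, seq, hidden)
w = torch.randn(16, 8)     # (out_features, in_features)
assert matmul_flatten_last_dim(x, w).shape == (2, 5, 16)
```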
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (231 Bytes)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-311.pyc ADDED
Binary file (27.4 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-311.pyc ADDED
Binary file (26.5 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/triton_scaled_mm.cpython-311.pyc ADDED
Binary file (10 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-311.pyc ADDED
Binary file (8.58 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py ADDED
@@ -0,0 +1,617 @@
# SPDX-License-Identifier: Apache-2.0

from contextlib import suppress
from typing import Any, Dict, List, Literal, Optional, Tuple, cast

import torch
from compressed_tensors.config import (CompressionFormat,
                                       SparsityCompressionConfig,
                                       SparsityStructure)
from compressed_tensors.quantization import (QuantizationArgs,
                                             QuantizationStrategy,
                                             QuantizationType)
from pydantic import BaseModel

from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                               UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
    QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (  # noqa: E501
    CompressedTensorsMoEMethod)
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
    W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensors24,
    CompressedTensorsScheme, CompressedTensorsW4A16Sparse24,
    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    find_matched_target, is_activation_quantization_format,
    should_ignore_layer)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.platforms import current_platform

logger = init_logger(__name__)

__all__ = ["CompressedTensorsLinearMethod"]

SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config"
QUANTIZATION_SCHEME_MAP_TYPE = Dict[str, Optional[Dict[str, QuantizationArgs]]]


class CompressedTensorsConfig(QuantizationConfig):

    def __init__(
        self,
        target_scheme_map: Dict[str, Any],
        ignore: List[str],
        quant_format: str,
        sparsity_scheme_map: Dict[str, SparsityCompressionConfig],
        sparsity_ignore_list: List[str],
        kv_cache_scheme: Optional[Dict[str, Any]] = None,
        config: Optional[Dict[str, Any]] = None,
    ):

        self.ignore = ignore
        self.quant_format = quant_format
        # Map from [target -> scheme]
        self.target_scheme_map = target_scheme_map
        self.kv_cache_scheme = kv_cache_scheme
        self.sparsity_scheme_map = sparsity_scheme_map
        self.sparsity_ignore_list = sparsity_ignore_list
        self.config = config

    def get_linear_method(self) -> "CompressedTensorsLinearMethod":
        return CompressedTensorsLinearMethod(self)

    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        return [torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        return 70

    def get_name(self) -> str:
        return "compressed_tensors"

    def get_quant_method(
        self,
        layer: torch.nn.Module,
        prefix: str,
    ) -> Optional["QuantizeMethodBase"]:
        from vllm.attention.layer import Attention  # Avoid circular import

        # Check if the layer is skipped for quantization.
        # TODO (@robertgshaw2): support module names
        if should_ignore_layer(prefix,
                               ignore=self.ignore,
                               fused_mapping=self.packed_modules_mapping):
            return UnquantizedLinearMethod()
        if isinstance(layer, LinearBase):
            scheme = self.get_scheme(layer=layer, layer_name=prefix)
            if scheme is None:
                return UnquantizedLinearMethod()
            layer.scheme = scheme
            return CompressedTensorsLinearMethod(self)
        if isinstance(layer, Attention):
            return CompressedTensorsKVCacheMethod(self)
        if isinstance(layer, FusedMoE):
            return CompressedTensorsMoEMethod.get_moe_method(self)
        return None

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
        ignore: List[str] = cast(List[str], config.get("ignore", []))
        quant_format = cast(str, config.get("format"))
        target_scheme_map = cls._quantization_scheme_map_from_config(
            config=config)
        sparsity_scheme_map, sparsity_ignore_list = cls._parse_sparsity_config(
            config=config)

        return cls(
            target_scheme_map=target_scheme_map,
            ignore=ignore,
            quant_format=quant_format,
            sparsity_scheme_map=sparsity_scheme_map,
            sparsity_ignore_list=sparsity_ignore_list,
            config=config,
        )

    @classmethod
    def _parse_sparsity_config(
        cls, config: Dict[str, Any]
    ) -> Tuple[Dict[str, SparsityCompressionConfig], List[str]]:
        """
        :param config: The `quantization_config` dictionary from config.json
        :return: A tuple with two elements
            1. A dictionary mapping target layer names to their corresponding
                sparsity_config
            2. A list of layer names to ignore for sparsity
        """
        if not (sparsity_config := config.get(SPARSITY_CONFIG_NAME)):
            return dict(), []

        sparsity_config = SparsityCompressionConfig.model_validate(
            sparsity_config)
        sparse_scheme_map: Dict[str, SparsityCompressionConfig] = {
            target: sparsity_config
            for target in sparsity_config.targets or list()
        }
        sparsity_ignore_list = sparsity_config.ignore or list()
        return sparse_scheme_map, sparsity_ignore_list

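For orientation, a hypothetical `sparsity_config` entry of the shape this parser consumes (field names follow compressed-tensors' `SparsityCompressionConfig`; values invented):

```python
# Hypothetical sparsity_config fragment from a checkpoint's config.json.
sparsity_config = {
    "format": "sparse-24-bitmask",
    "sparsity_structure": "2:4",
    "targets": ["Linear"],
    "ignore": ["lm_head"],
}
# _parse_sparsity_config would map each target to the validated config:
#   ({"Linear": <SparsityCompressionConfig>}, ["lm_head"])
```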
    @classmethod
    def _quantization_scheme_map_from_config(
            cls, config: Dict[str, Any]) -> QUANTIZATION_SCHEME_MAP_TYPE:
        """
        :param config: The `quantization_config` dictionary from config.json
        :return: A dictionary mapping target layer names to their corresponding
            quantization_args for weights and input activations
        """
        target_scheme_map: Dict[str, Any] = dict()
        quant_format = cast(str, config.get("format"))

        # The quant_config has multiple config_groups, each containing
        # an input_activations key with details about how the activations are
        # quantized, a weights key indicating how the weights are quantized,
        # and a list of targets under the `targets` key, dictating which
        # layers are impacted by the quantization details. The quantization
        # details follow the structure defined by the QuantizationArgs
        # pydantic model, which is used to verify the structure of the
        # quant_config and also store the details for later use.

        config_groups = config.get("config_groups", dict())
        for _, quant_config in config_groups.items():
            targets = quant_config.get("targets")
            for target in targets:
                target_scheme_map[target] = {}
                target_scheme_map[target][
                    "weights"] = QuantizationArgs.model_validate(
                        quant_config.get("weights"))

                target_scheme_map[target]["input_activations"] = None
                if is_activation_quantization_format(quant_format):
                    input_activations = quant_config.get("input_activations")
                    # The only case where we have activation quant supported
                    # but no input_activations provided in the config
                    # should be w8a16fp8. w8a16fp8 can also run for cases
                    # where there is an input_quant but it is ignored.
                    if not input_activations:
                        assert target_scheme_map[target][
                            "weights"].type == QuantizationType.FLOAT
                    else:
                        target_scheme_map[target][
                            "input_activations"] = QuantizationArgs.model_validate(  # noqa: E501
                                quant_config.get("input_activations"))
        return target_scheme_map

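A hypothetical `quantization_config` fragment of the shape this method walks (structure only; the values are invented for illustration):

```python
# Hypothetical config.json fragment for _quantization_scheme_map_from_config.
quantization_config = {
    "format": "float-quantized",
    "config_groups": {
        "group_0": {
            "targets": ["Linear"],
            "weights": {"num_bits": 8, "type": "float", "strategy": "tensor",
                        "symmetric": True, "dynamic": False},
            "input_activations": {"num_bits": 8, "type": "float",
                                  "strategy": "tensor", "dynamic": True},
        }
    },
}
# Result: {"Linear": {"weights": <QuantizationArgs>,
#                     "input_activations": <QuantizationArgs>}}
```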
    @classmethod
    def get_config_filenames(cls) -> List[str]:
        return []

    def _check_scheme_supported(self,
                                min_capability: int,
                                error: bool = True) -> bool:
        capability_tuple = current_platform.get_device_capability()

        if capability_tuple is not None:
            capability = capability_tuple.to_int()
            supported = capability >= min_capability
            if error and not supported:
                raise RuntimeError(
                    "Quantization scheme is not supported for "
                    f"the current GPU. Min capability: {min_capability}. "
                    f"Current capability: {capability}.")
            return supported
        else:
            return False

    def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
                               input_quant: BaseModel) -> bool:
        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
        weight_strategy = (
            weight_quant.strategy == QuantizationStrategy.TENSOR.value
            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
        is_tensor = (weight_strategy and input_quant.strategy
                     == QuantizationStrategy.TENSOR.value)
        is_static = not weight_quant.dynamic and not input_quant.dynamic

        # Both symmetric and asymmetric input quantization supported.
        # Only symmetric weight quantization supported.
        return is_8_bits and is_tensor and weight_quant.symmetric and is_static

    def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
                               input_quant: BaseModel) -> bool:
        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
        weight_strategy = (
            weight_quant.strategy == QuantizationStrategy.TENSOR.value
            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
        is_token = (weight_strategy and input_quant.strategy
                    == QuantizationStrategy.TOKEN.value)
        is_dynamic = not weight_quant.dynamic and input_quant.dynamic

        # Both symmetric and asymmetric input quantization supported.
        # Only symmetric weight quantization supported.
        return is_8_bits and is_token and weight_quant.symmetric and is_dynamic

    def _is_fp8_w8a8(self, weight_quant: BaseModel,
                     input_quant: BaseModel) -> bool:
        # Confirm weights and activations quantized.
        if weight_quant is None or input_quant is None:
            return False

        # Confirm weight scheme is supported.
        is_floating_point = (weight_quant.type == QuantizationType.FLOAT
                             and input_quant.type == QuantizationType.FLOAT)
        is_symmetric_weight = weight_quant.symmetric
        is_static_weight = not weight_quant.dynamic
        is_per_tensor_or_channel_weight = (weight_quant.strategy in [
            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL
        ])
        if not (is_floating_point and is_symmetric_weight and is_static_weight
                and is_per_tensor_or_channel_weight):
            return False

        # Dynamic quantization is always supported if weights supported.
        if input_quant.dynamic:
            return True

        # Confirm activation scheme is supported.
        is_symmetric_activation = input_quant.symmetric
        is_per_tensor_activation = (
            input_quant.strategy == QuantizationStrategy.TENSOR)
        return is_symmetric_activation and is_per_tensor_activation

    def _is_fp8_w8a16(self, weight_quant: BaseModel,
                      input_quant: BaseModel) -> bool:
        # Confirm weights quantized.
        if weight_quant is None:
            return False

        # Confirm we have floating points.
        if weight_quant.type != QuantizationType.FLOAT:
            return False

        # Confirm weight scheme is supported.
        is_symmetric_weight = weight_quant.symmetric
        is_static_weight = not weight_quant.dynamic
        is_per_tensor_or_channel_weight = (weight_quant.strategy in [
            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL
        ])
        if not (is_symmetric_weight and is_static_weight  # noqa: SIM103
                and is_per_tensor_or_channel_weight):
            return False

        # All conditions satisfied.
        return True

    def _is_wNa16_group_channel(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
        input_quant_none = input_quant is None
        is_symmetric = weight_quant.symmetric
        is_channel_group = (
            weight_quant.strategy == QuantizationStrategy.CHANNEL.value
            or weight_quant.strategy == QuantizationStrategy.GROUP.value)
        is_static = not weight_quant.dynamic

        return (is_channel_group and input_quant_none and is_symmetric
                and is_static)

    def _get_scheme_from_parts(
            self, weight_quant: BaseModel,
            input_quant: BaseModel) -> "CompressedTensorsScheme":

        # Detect if mixed precision
        if self._is_wNa16_group_channel(weight_quant, input_quant):
            if (self.quant_format == CompressionFormat.marlin_24.value
                    and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS):
                return CompressedTensorsW4A16Sparse24(
                    strategy=weight_quant.strategy,
                    num_bits=weight_quant.num_bits,
                    group_size=weight_quant.group_size)
            if (self.quant_format == CompressionFormat.pack_quantized.value
                    and weight_quant.num_bits in WNA16_SUPPORTED_BITS):
                return CompressedTensorsWNA16(
                    num_bits=weight_quant.num_bits,
                    strategy=weight_quant.strategy,
                    group_size=weight_quant.group_size,
                    actorder=weight_quant.actorder)

        if is_activation_quantization_format(self.quant_format):
            if self._is_fp8_w8a8(weight_quant, input_quant):
                is_fp8_w8a8_supported = self._check_scheme_supported(
                    CompressedTensorsW8A8Fp8.get_min_capability(), error=False)
                if is_fp8_w8a8_supported:
                    return CompressedTensorsW8A8Fp8(
                        strategy=weight_quant.strategy,
                        is_static_input_scheme=(input_quant
                                                and not input_quant.dynamic))
                else:
                    # note: input_quant will be present for converted models;
                    # will be ignored during inference post loading
                    return CompressedTensorsW8A16Fp8(
                        strategy=weight_quant.strategy,
                        is_static_input_scheme=not input_quant.dynamic)

            # note: input_quant can be None
            if self._is_fp8_w8a16(weight_quant, input_quant):
                is_static_input_scheme = (input_quant
                                          and not input_quant.dynamic)
                return CompressedTensorsW8A16Fp8(
                    strategy=weight_quant.strategy,
                    is_static_input_scheme=is_static_input_scheme)

            if self._is_static_tensor_w8a8(weight_quant, input_quant):
                return CompressedTensorsW8A8Int8(
                    strategy=weight_quant.strategy,
                    is_static_input_scheme=True,
                    input_symmetric=input_quant.symmetric)

            if self._is_dynamic_token_w8a8(weight_quant, input_quant):
                return CompressedTensorsW8A8Int8(
                    strategy=weight_quant.strategy,
                    is_static_input_scheme=False,
                    input_symmetric=input_quant.symmetric)

        raise NotImplementedError(
            "No compressed-tensors compatible scheme was found.")

    def get_scheme(self,
                   layer: torch.nn.Module,
                   layer_name: Optional[str] = None
                   ) -> Optional["CompressedTensorsScheme"]:
        """
        compressed-tensors supports non-uniform quantization in the
        following way:

        targets of config_groups: There can be N config_groups, each with
        a quantization scheme. Each config_group has a list of targets
        which can be a full layer_name, a regex for a layer_name, or
        an nn.Module name.

        Detect whether a layer_name is found in any target and
        use the quantization scheme corresponding to the matched target
        to select the CompressedTensorsScheme used for inference.
        """

        # Find the "target" in the compressed-tensors config
        # that our layer conforms to.
        # TODO (@robertgshaw): add compressed-tensors as dep
        # so we do not have to re-write these functions
        # need to make accelerate optional in ct to do this

        # Will be empty for models with only sparsity
        weight_quant = input_quant = None
        if self.target_scheme_map:
            matched_target = find_matched_target(
                layer_name=layer_name,
                module=layer,
                targets=self.target_scheme_map.keys(),
                fused_mapping=self.packed_modules_mapping)

            scheme_dict = self.target_scheme_map[matched_target]
            weight_quant = scheme_dict.get("weights")
            input_quant = scheme_dict.get("input_activations")

        # Find the sparsity scheme of the layer
        # assume that fused layers inherit the first component's
        # sparsity scheme
        sparsity_targets = (self.sparsity_scheme_map.keys() -
                            set(self.sparsity_ignore_list))
        sparsity_scheme: Optional[SparsityCompressionConfig] = None
        with suppress(ValueError):
            matched_target = find_matched_target(
                layer_name=layer_name,
                module=layer,
                targets=sparsity_targets,
                fused_mapping=self.packed_modules_mapping)
            sparsity_scheme = self.sparsity_scheme_map[matched_target]

        if self.supports_cutlass_24(weight_quant=weight_quant,
                                    input_quant=input_quant,
                                    sparsity_scheme=sparsity_scheme):
            # FIXME(tlrmchlsmth): layers using W16A16 CUTLASS 2:4 sparse kernels
            # currently produce bad output in some cases
            if weight_quant is None:
                logger.warning_once(
                    "CompressedTensors24 scheme is disabled for the w16a16 "
                    "case. Falling back to UnquantizedLinearMethod")
                return None
            # Have a valid sparsity scheme
            # Validate layer is supported by Cutlass 2:4 Kernel
            model_compression_config = (None if sparsity_scheme is None
                                        or sparsity_scheme.format == "dense"
                                        else self.config)

            scheme = CompressedTensors24(
                quantized=weight_quant is not None or input_quant is not None,
                weight_quant=weight_quant,
                input_quant=input_quant,
                model_compression_config=model_compression_config,
            )
        elif weight_quant is None:
            logger.warning_once("Acceleration for non-quantized schemes is "
                                "not supported by Compressed Tensors. "
                                "Falling back to UnquantizedLinearMethod")
            return None

        else:
            # Find the quant_scheme
            scheme = self._get_scheme_from_parts(  # type: ignore
                weight_quant=weight_quant,
                input_quant=input_quant,
            )

        # Raise error if device does not support the scheme
        # (e.g. fp8 needs ada lovelace)
        self._check_scheme_supported(scheme.get_min_capability())
        logger.debug("Using scheme: %s for %s", scheme.__class__.__name__,
                     layer_name)
        return scheme

    def get_cache_scale(self, name: str) -> Optional[str]:
        """
        Check whether the param name matches the format for k/v cache scales
        in compressed-tensors. If this is the case, return its equivalent
        param name expected by vLLM.

        :param name: param name
        :return: matching param name for KV cache scale in vLLM
        """
        if name.endswith(".output_scale") and ".k_proj" in name:
            return name.replace(".k_proj.output_scale", ".attn.k_scale")
        if name.endswith(".output_scale") and ".v_proj" in name:
            return name.replace(".v_proj.output_scale", ".attn.v_scale")
        # If no matches, return None
        return None

+
466
+ @staticmethod
467
+ def supports_cutlass_24(
468
+ weight_quant: Optional[QuantizationArgs],
469
+ input_quant: Optional[QuantizationArgs],
470
+ sparsity_scheme: Optional[SparsityCompressionConfig] = None
471
+ ) -> bool:
472
+ """
473
+ Check if the layer is supported by the Cutlass 2:4 Kernel
474
+ Conditions:
475
+ - Overarching condition: Sparsity Structure is 2:4
476
+ - Unquantized cases are supported
477
+ - Weight only quantization is not-supported
478
+ - Supported weight quantization strategies are TENSOR and CHANNEL
479
+ - Supported input quantization strategies are TENSOR and TOKEN
480
+ - Only 8 bit quantization is supported
481
+
482
+ :return: True if the layer is supported by the Cutlass 2:4 Kernel
483
+ False otherwise
484
+ """
485
+ if sparsity_scheme is None:
486
+ return False
487
+
488
+ is_valid_sparsity_structure: bool = (
489
+ sparsity_scheme.sparsity_structure ==
490
+ SparsityStructure.TWO_FOUR.value)
491
+
492
+ valid_compressors = {
493
+ CompressionFormat.dense.value,
494
+ CompressionFormat.sparse_24_bitmask.value
495
+ }
496
+
497
+ is_valid_sparsity = (is_valid_sparsity_structure
498
+ and sparsity_scheme.format in valid_compressors)
499
+
500
+ if not is_valid_sparsity:
501
+ return False
502
+
503
+ # Unquantized cases are supported
504
+ if weight_quant is None and input_quant is None:
505
+ return True
506
+
507
+ # Weight only quantization is not-supported
508
+ if weight_quant is not None and input_quant is None:
509
+ return False
510
+
511
+ supported_weight_quant_strategies = [
512
+ QuantizationStrategy.TENSOR.value,
513
+ QuantizationStrategy.CHANNEL.value
514
+ ]
515
+
516
+ assert weight_quant is not None
517
+ assert input_quant is not None
518
+ if weight_quant.strategy not in supported_weight_quant_strategies:
519
+ return False
520
+
521
+ supported_input_quant_strategies = [
522
+ QuantizationStrategy.TENSOR.value, QuantizationStrategy.TOKEN.value
523
+ ]
524
+
525
+ if input_quant.strategy not in supported_input_quant_strategies:
526
+ return False
527
+
528
+ return weight_quant.num_bits == input_quant.num_bits == 8
529
+
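Taken together, the cascade reduces to a small decision table; a hedged summary (mirrors the checks above, stand-in values only):

```python
# Decision summary for supports_cutlass_24:
#
#   sparsity scheme          weights        activations    -> result
#   none / not 2:4           any            any            -> False
#   2:4 (dense/bitmask)      None           None           -> True
#   2:4 (dense/bitmask)      8-bit          None           -> False (weight-only)
#   2:4 (dense/bitmask)      8-bit tensor   8-bit tensor   -> True
#   2:4 (dense/bitmask)      4-bit          4-bit          -> False (8-bit only)
```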

class CompressedTensorsLinearMethod(LinearMethodBase):

    def __init__(self, quantization_config: CompressedTensorsConfig):
        self.quantization_config = quantization_config

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        layer.scheme.process_weights_after_loading(layer)

    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
                       output_partition_sizes: List[int], input_size: int,
                       output_size: int, params_dtype: torch.dtype,
                       **extra_weight_attrs):
        """
        Use the CompressedTensorsScheme associated with each layer to create
        the necessary parameters for the layer. See LinearMethodBase for param
        details.
        """
        weight_loader = extra_weight_attrs.get("weight_loader")
        layer.scheme.create_weights(
            layer=layer,
            input_size=input_size,
            input_size_per_partition=input_size_per_partition,
            output_partition_sizes=output_partition_sizes,
            output_size=output_size,
            params_dtype=params_dtype,
            weight_loader=weight_loader)

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None):
        """
        Use the output of create_weights and the CompressedTensorsScheme
        associated with the layer to apply the forward pass with the
        layer input. See LinearMethodBase for param details.
        """

        scheme = layer.scheme
        if scheme is None:
            raise ValueError("A scheme must be defined for each layer")
        return scheme.apply_weights(layer, x, bias=bias)


class CompressedTensorsKVCacheMethod(BaseKVCacheMethod):
    """
    Supports loading kv-cache scaling factors from compressed-tensors
    checkpoints.
    """

    def __init__(self, quant_config: CompressedTensorsConfig):
        self.validate_kv_cache_scheme(quant_config.kv_cache_scheme)
        super().__init__(quant_config)

    @staticmethod
    def validate_kv_cache_scheme(kv_cache_scheme: Optional[Dict[str, Any]]):
        """
        Validator for the kv cache scheme. Useful for controlling the
        kv cache quantization schemes that are supported in vLLM.
        :param kv_cache_scheme: the compressed-tensors kv cache scheme
        """
        if kv_cache_scheme is None:
            return

        type_ = kv_cache_scheme.get("type")
        num_bits = kv_cache_scheme.get("num_bits")

        if type_ != "float" and num_bits != 8:
            raise NotImplementedError(
                "Currently supported kv cache quantization is "
                "num_bits=8, type=float, however "
                f"received num_bits={num_bits}, type={type_}")

        strategy = kv_cache_scheme.get("strategy")
        if strategy != "tensor":
            raise NotImplementedError(
                "Only support per-tensor scaling factor "
                "for compressed-tensors KV cache. "
                f"Expected strategy: tensor, found strategy: {strategy}")

        is_symmetric = kv_cache_scheme.get("symmetric")
        if not is_symmetric:
            raise NotImplementedError(
                "Only support symmetric scaling factor "
                "for compressed-tensors KV cache. "
                f"However found symmetric: {is_symmetric}")
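For reference, a kv_cache_scheme that would pass this validator looks like the following (hypothetical checkpoint snippet, not from the commit):

```python
# Hypothetical kv_cache_scheme satisfying validate_kv_cache_scheme above.
kv_cache_scheme = {
    "type": "float",       # fp8 kv cache
    "num_bits": 8,
    "strategy": "tensor",  # one scale per tensor
    "symmetric": True,
}
```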
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py ADDED
@@ -0,0 +1,574 @@
# SPDX-License-Identifier: Apache-2.0

import enum
from enum import Enum
from typing import Callable, List, Optional

import torch
from compressed_tensors import CompressionFormat
from compressed_tensors.quantization import QuantizationStrategy

import vllm.model_executor.layers.fused_moe  # noqa
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                  FusedMoeWeightScaleSupported)
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
    WNA16_SUPPORTED_BITS)
from vllm.model_executor.layers.quantization.utils import replace_parameter
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform

logger = init_logger(__name__)


class GPTQMarlinState(Enum):
    REPACK = enum.auto()
    READY = enum.auto()


__all__ = [
    "CompressedTensorsMoEMethod", "CompressedTensorsW8A8Fp8MoEMethod",
    "CompressedTensorsWNA16MoEMethod"
]


class CompressedTensorsMoEMethod(FusedMoEMethodBase):

    @staticmethod
    def get_moe_method(
        quant_config: "CompressedTensorsConfig"  # type: ignore # noqa E501
    ) -> "CompressedTensorsMoEMethod":
        # TODO: @dsikka: refactor this to use schemes as other kernels
        # are supported + check if the layer is being ignored.
        weight_quant = quant_config.target_scheme_map["Linear"].get("weights")
        input_quant = quant_config.target_scheme_map["Linear"].get(
            "input_activations")

        if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
            return CompressedTensorsWNA16MoEMethod(quant_config)
        elif quant_config._is_fp8_w8a8(weight_quant, input_quant):
            return CompressedTensorsW8A8Fp8MoEMethod(quant_config)
        else:
            raise RuntimeError(
                f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}")


class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):

    def __init__(
        self,
        quant_config: "CompressedTensorsConfig"  # type: ignore # noqa E501
    ):
        self.quant_config = quant_config
        self.weight_quant = self.quant_config.target_scheme_map["Linear"].get(
            "weights")
        self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
            "input_activations")

        if not (self.weight_quant.strategy == QuantizationStrategy.TENSOR
                and self.input_quant.strategy == QuantizationStrategy.TENSOR):
            raise ValueError(
                "For FP8 Fused MoE layers, only per-tensor scales "
                "for weights and activations are supported. Found "
                f"{self.weight_quant}, {self.input_quant}")

        self.static_input_scales = not self.input_quant.dynamic

    def create_weights(self, layer: torch.nn.Module, num_experts: int,
                       hidden_size: int, intermediate_size_per_partition: int,
                       params_dtype: torch.dtype, **extra_weight_attrs):

        params_dtype = torch.float8_e4m3fn

        # WEIGHTS
        w13_weight = torch.nn.Parameter(torch.empty(
            num_experts,
            2 * intermediate_size_per_partition,
            hidden_size,
            dtype=params_dtype),
                                        requires_grad=False)
        layer.register_parameter("w13_weight", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)

        w2_weight = torch.nn.Parameter(torch.empty(
            num_experts,
            hidden_size,
            intermediate_size_per_partition,
            dtype=params_dtype),
                                       requires_grad=False)
        layer.register_parameter("w2_weight", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)

        # WEIGHT_SCALES
        # Allocate 2 scales for w1 and w3 respectively.
        # They will be combined to a single scale after weight loading.
        w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
                                                         2,
                                                         dtype=torch.float32),
                                              requires_grad=False)
        layer.register_parameter("w13_weight_scale", w13_weight_scale)

        w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
                                                        dtype=torch.float32),
                                             requires_grad=False)
        layer.register_parameter("w2_weight_scale", w2_weight_scale)
        # Add the quantization method used (per tensor/grouped/channel)
        # to ensure the weight scales are loaded in properly
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
        set_weight_attrs(w2_weight_scale, extra_weight_attrs)

        # INPUT_SCALES
        if self.static_input_scales:
            w13_input_scale = torch.nn.Parameter(torch.ones(
                num_experts, dtype=torch.float32),
                                                 requires_grad=False)
            layer.register_parameter("w13_input_scale", w13_input_scale)
            set_weight_attrs(w13_input_scale, extra_weight_attrs)

            w2_input_scale = torch.nn.Parameter(torch.ones(
                num_experts, dtype=torch.float32),
                                                requires_grad=False)
            layer.register_parameter("w2_input_scale", w2_input_scale)
            set_weight_attrs(w2_input_scale, extra_weight_attrs)
        else:
            layer.w13_input_scale = None
            layer.w2_input_scale = None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # Fp8 moe kernels require a single activation scale.
        # We take the max of all the scales in case they differ.
        if self.static_input_scales:
            if (layer.w13_input_scale is None or layer.w2_input_scale is None):
                raise ValueError(
                    "QuantConfig has static quantization, but found "
                    "activation scales are None.")
            if (not all_close_1d(layer.w13_input_scale)
                    or not all_close_1d(layer.w2_input_scale)):
                logger.warning_once(
                    "Found input_scales that are not equal for "
                    "fp8 MoE layer. Using the maximum across experts "
                    "for each layer.")
                layer.w13_input_scale = torch.nn.Parameter(
                    layer.w13_input_scale.max(), requires_grad=False)
                layer.w2_input_scale = torch.nn.Parameter(
                    layer.w2_input_scale.max(), requires_grad=False)

        # If rocm, normalize the weights and scales to e4m3fnuz
        if current_platform.is_rocm():
            # Normalize the weights and scales
            w13_weight, w13_weight_scale, w13_input_scale = \
                normalize_e4m3fn_to_e4m3fnuz(
                    layer.w13_weight, layer.w13_weight_scale,
                    layer.w13_input_scale)
            w2_weight, w2_weight_scale, w2_input_scale = \
                normalize_e4m3fn_to_e4m3fnuz(
                    layer.w2_weight, layer.w2_weight_scale,
                    layer.w2_input_scale)
            # Reset the parameter
            layer.w13_weight = torch.nn.Parameter(w13_weight,
                                                  requires_grad=False)
            layer.w13_weight_scale = torch.nn.Parameter(w13_weight_scale,
                                                        requires_grad=False)
            if w13_input_scale is not None:
                layer.w13_input_scale = torch.nn.Parameter(w13_input_scale,
                                                           requires_grad=False)
            layer.w2_weight = torch.nn.Parameter(w2_weight,
                                                 requires_grad=False)
            layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale,
                                                       requires_grad=False)
            if w2_input_scale is not None:
                layer.w2_input_scale = torch.nn.Parameter(w2_input_scale,
                                                          requires_grad=False)

        # Fp8 moe kernel needs single weight scale for w13 per expert.
        # We take the max then dequant and requant each expert.
        assert layer.w13_weight_scale is not None
        shard_size = layer.intermediate_size_per_partition
        max_w13_scales = layer.w13_weight_scale.max(dim=1).values
        for expert_id in range(layer.num_experts):
            start = 0
            for shard_id in range(2):
                dq_weight = per_tensor_dequantize(
                    layer.w13_weight[expert_id][start:start + shard_size, :],
                    layer.w13_weight_scale[expert_id][shard_id])
                layer.w13_weight[expert_id][
                    start:start + shard_size, :], _ = ops.scaled_fp8_quant(
                        dq_weight, max_w13_scales[expert_id])
                start += shard_size

        layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales,
                                                    requires_grad=False)

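The dequantize-then-requantize step above collapses the separate w1/w3 scales into one shared maximum per expert. A sketch of the same idea with plain tensors, int8 standing in for fp8 (illustrative only, not the vLLM kernel):

```python
import torch

def requant_to_shared_scale(w_q: torch.Tensor, scales: torch.Tensor):
    """Requantize per-shard quantized weights to one shared (max) scale."""
    s_max = scales.max()
    w_dq = w_q.float() * scales.view(-1, 1, 1)  # per-shard dequantize
    w_rq = torch.round(w_dq / s_max)            # requantize with max scale
    return w_rq, s_max

w_q = torch.randint(-127, 127, (2, 4, 4))       # two shards (w1, w3)
scales = torch.tensor([0.02, 0.04])
w_rq, s_max = requant_to_shared_scale(w_q, scales)
# Values quantized with the smaller scale lose a little precision, but the
# fused kernel now only needs a single scale per expert.
```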
    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        from vllm.model_executor.layers.fused_moe import fused_experts

        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias)

        return fused_experts(x,
                             layer.w13_weight,
                             layer.w2_weight,
                             topk_weights=topk_weights,
                             topk_ids=topk_ids,
                             inplace=True,
                             use_fp8_w8a8=True,
                             w1_scale=layer.w13_weight_scale,
                             w2_scale=layer.w2_weight_scale,
                             a1_scale=layer.w13_input_scale,
                             a2_scale=layer.w2_input_scale)


class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):

    def __init__(
        self,
        quant_config: "CompressedTensorsConfig"  # type: ignore # noqa E501
    ):
        self.quant_config = quant_config
        # TODO: @dsikka: refactor this to use schemes as other kernels
        # are supported + check if the layer is being ignored.
        config = self.quant_config.target_scheme_map["Linear"].get("weights")
        self.num_bits = config.num_bits
        self.packed_factor = 32 // config.num_bits
        self.strategy = config.strategy
        self.group_size = config.group_size
        self.actorder = config.actorder
        assert config.symmetric, (
            "Only symmetric quantization is supported for MoE")

        if not (self.quant_config.quant_format
                == CompressionFormat.pack_quantized.value
                and self.num_bits in WNA16_SUPPORTED_BITS):
            raise ValueError("For Fused MoE layers, only "
                             f"{CompressionFormat.pack_quantized.value} "
                             "is supported for the following bits: "
                             f"{WNA16_SUPPORTED_BITS}")

    def create_weights(self, layer: torch.nn.Module, num_experts: int,
                       hidden_size: int, intermediate_size_per_partition: int,
                       params_dtype: torch.dtype, **extra_weight_attrs):

        assert params_dtype == torch.float16, (
            "float16 is required for MoE compressed models. Set dtype=torch.float16"  # noqa: E501
        )

        intermediate_size_full = extra_weight_attrs.pop(
            "intermediate_size_full")

        # Will transpose the loaded weight along the
        # intermediate and hidden dim sizes. Will
        # shard for TP along the transposed dims
        extra_weight_attrs.update({
            "is_transposed": True,
            "quant_method": self.strategy
        })
        w13_weight = torch.nn.Parameter(torch.empty(
            num_experts,
            hidden_size // self.packed_factor,
            2 * intermediate_size_per_partition,
            dtype=torch.int32),
                                        requires_grad=False)
        layer.register_parameter("w13_weight_packed", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)

        w2_weight = torch.nn.Parameter(torch.empty(
            num_experts,
            intermediate_size_per_partition // self.packed_factor,
            hidden_size,
            dtype=torch.int32),
                                       requires_grad=False)
        layer.register_parameter("w2_weight_packed", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)

        # In the case where we have actorder/g_idx,
        # we do not partition the w2 scales
        load_full_w2 = self.actorder and self.group_size != -1
        w2_scales_size = (intermediate_size_full
                          if load_full_w2 else intermediate_size_per_partition)

        self.is_k_full = (not self.actorder) or (
            intermediate_size_per_partition == intermediate_size_full)

        if self.strategy == "channel":
            num_groups_w2 = num_groups_w13 = 1
            self.group_size = -1
        else:
            num_groups_w2 = w2_scales_size // self.group_size
            num_groups_w13 = hidden_size // self.group_size

        w13_scale = torch.nn.Parameter(torch.ones(
            num_experts,
            num_groups_w13,
            2 * intermediate_size_per_partition,
            dtype=params_dtype),
                                       requires_grad=False)
        layer.register_parameter("w13_weight_scale", w13_scale)
        set_weight_attrs(w13_scale, extra_weight_attrs)

        w2_scale = torch.nn.Parameter(torch.ones(num_experts,
                                                 num_groups_w2,
                                                 hidden_size,
                                                 dtype=params_dtype),
                                      requires_grad=False)
        layer.register_parameter("w2_weight_scale", w2_scale)
        set_weight_attrs(w2_scale, extra_weight_attrs)
        set_weight_attrs(w2_scale, {"load_full_w2": load_full_w2})

        w2_weight_shape = torch.nn.Parameter(torch.empty(num_experts, 2),
                                             requires_grad=False)
        layer.register_parameter("w2_weight_shape", w2_weight_shape)
        set_weight_attrs(w2_weight_shape, extra_weight_attrs)
        w13_weight_shape = torch.nn.Parameter(torch.empty(num_experts, 2),
                                              requires_grad=False)

        layer.register_parameter("w13_weight_shape", w13_weight_shape)
        set_weight_attrs(w13_weight_shape, extra_weight_attrs)

        w13_g_idx = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_g_idx", w13_g_idx)
        set_weight_attrs(w13_g_idx, extra_weight_attrs)

        w2_g_idx = torch.nn.Parameter(
            torch.empty(
                num_experts,
                intermediate_size_per_partition,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_g_idx", w2_g_idx)
        set_weight_attrs(w2_g_idx, extra_weight_attrs)

        w13_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_g_idx_sort_indices",
                                 w13_g_idx_sort_indices)
        set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)

        w2_g_idx_sort_indices = torch.nn.Parameter(
            torch.empty(
                num_experts,
                intermediate_size_per_partition,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_g_idx_sort_indices",
                                 w2_g_idx_sort_indices)
        set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)

        layer.a13_scale = None
        layer.a2_scale = None
        layer.marlin_state = GPTQMarlinState.REPACK

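With `packed_factor = 32 // num_bits`, each int32 in the packed weight holds `packed_factor` quantized values along the packed dimension. A quick check of the shape arithmetic (illustrative sizes, not from the commit):

```python
# Shape arithmetic for the packed WNA16 MoE weights above.
num_bits = 4
packed_factor = 32 // num_bits  # 8 quantized values per int32
num_experts, hidden_size, intermediate = 8, 4096, 14336

w13_packed_shape = (num_experts, hidden_size // packed_factor,
                    2 * intermediate)
w2_packed_shape = (num_experts, intermediate // packed_factor, hidden_size)
assert w13_packed_shape == (8, 512, 28672)
assert w2_packed_shape == (8, 1792, 4096)
```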
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

        def replace_tensor(name, new_t):
            # It is important to use resize_() here since it ensures
            # the same buffer is reused
            getattr(layer, name).resize_(new_t.shape)
            getattr(layer, name).copy_(new_t)
            del new_t

        def get_scale_perms(num_bits: int):
            scale_perm: List[int] = []
            for i in range(8):
                scale_perm.extend([i + 8 * j for j in range(8)])
            scale_perm_single: List[int] = []
            for i in range(4):
                scale_perm_single.extend(
                    [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
            return scale_perm, scale_perm_single

        def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
                                  group_size: int, num_bits: int):
            scale_perm, scale_perm_single = get_scale_perms(num_bits)
            if group_size < size_k and group_size != -1:
                s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
            else:
                s = s.reshape((-1, len(scale_perm_single)))[:,
                                                            scale_perm_single]
            s = s.reshape((-1, size_n)).contiguous()
            return s

        def marlin_moe_permute_scales(s: torch.Tensor, size_k: int,
                                      size_n: int, group_size: int,
                                      num_bits: int):
            num_experts = s.shape[0]
            output = torch.empty((num_experts, s.shape[1], s.shape[2]),
                                 device=s.device,
                                 dtype=s.dtype)
            for e in range(num_experts):
                output[e] = marlin_permute_scales(s[e], size_k, size_n,
                                                  group_size, num_bits)
            return output

        size_k2 = layer.w2_weight_packed.shape[2]
        size_k13 = layer.w13_weight_packed.shape[2]

        num_experts = layer.w13_weight_g_idx.shape[0]
        device = layer.w13_weight_g_idx.device

        # when running models with grouped act order,
        # resort to g_idx values provided in checkpoint
        if self.actorder == "group":
            w13_g_idx_sort_indices = torch.empty_like(layer.w13_weight_g_idx)
            w2_g_idx_sort_indices = torch.empty_like(layer.w2_weight_g_idx)
            w13_sorted_g_idx = torch.empty_like(layer.w13_weight_g_idx)
            w2_sorted_g_idx = torch.empty_like(layer.w2_weight_g_idx)

            for e in range(num_experts):
                w13_g_idx_sort_indices[e] = torch.argsort(
                    layer.w13_weight_g_idx[e]).to(torch.int32)
                w2_g_idx_sort_indices[e] = torch.argsort(
                    layer.w2_weight_g_idx[e]).to(torch.int32)
                w13_sorted_g_idx[e] = layer.w13_weight_g_idx[e][
                    w13_g_idx_sort_indices[e]]
                w2_sorted_g_idx[e] = layer.w2_weight_g_idx[e][
                    w2_g_idx_sort_indices[e]]

            replace_parameter(layer, "w13_weight_g_idx", w13_sorted_g_idx)
            replace_parameter(layer, "w2_weight_g_idx", w2_sorted_g_idx)
            replace_parameter(layer, "w13_g_idx_sort_indices",
                              w13_g_idx_sort_indices)
            replace_parameter(layer, "w2_g_idx_sort_indices",
                              w2_g_idx_sort_indices)

        else:
            layer.w13_weight_g_idx = torch.nn.Parameter(
                torch.empty((num_experts, 0), dtype=torch.int32,
                            device=device),
                requires_grad=False,
            )
            layer.w2_weight_g_idx = torch.nn.Parameter(
                torch.empty((num_experts, 0), dtype=torch.int32,
                            device=device),
                requires_grad=False,
            )
            layer.w13_g_idx_sort_indices = torch.nn.Parameter(
                torch.empty((num_experts, 0), dtype=torch.int32,
                            device=device),
                requires_grad=False,
            )
            layer.w2_g_idx_sort_indices = torch.nn.Parameter(
                torch.empty((num_experts, 0), dtype=torch.int32,
                            device=device),
                requires_grad=False,
            )

        marlin_w13_qweight = ops.gptq_marlin_moe_repack(
            layer.w13_weight_packed,
            layer.w13_g_idx_sort_indices,
            layer.w13_weight_packed.shape[1] * self.packed_factor,
            layer.w13_weight_packed.shape[2],
            self.num_bits,
        )
        replace_tensor("w13_weight_packed", marlin_w13_qweight)
        marlin_w2_qweight = ops.gptq_marlin_moe_repack(
            layer.w2_weight_packed,
            layer.w2_g_idx_sort_indices,
            layer.w2_weight_packed.shape[1] * self.packed_factor,
            layer.w2_weight_packed.shape[2],
            self.num_bits,
        )
        replace_tensor("w2_weight_packed", marlin_w2_qweight)
        # Repack scales
        marlin_w13_scales = marlin_moe_permute_scales(
            layer.w13_weight_scale,
            size_k13,
            layer.w13_weight_scale.shape[2],
            self.group_size,
            self.num_bits,
        )
        replace_tensor("w13_weight_scale", marlin_w13_scales)
        marlin_w2_scales = marlin_moe_permute_scales(
            layer.w2_weight_scale,
            layer.w2_weight_scale.shape[1] * self.packed_factor,
            size_k2,
            self.group_size,
            self.num_bits,
        )
        replace_tensor("w2_weight_scale", marlin_w2_scales)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:

        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias)

        return torch.ops.vllm.fused_marlin_moe(
            x,
            layer.w13_weight_packed,
            layer.w2_weight_packed,
            layer.w13_weight_scale,
            layer.w2_weight_scale,
            router_logits,
            topk_weights,
            topk_ids,
            g_idx1=layer.w13_weight_g_idx,
            g_idx2=layer.w2_weight_g_idx,
            sort_indices1=layer.w13_g_idx_sort_indices,
            sort_indices2=layer.w2_g_idx_sort_indices,
            num_bits=self.num_bits,
            is_k_full=self.is_k_full)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py ADDED
@@ -0,0 +1,20 @@
# SPDX-License-Identifier: Apache-2.0

from .compressed_tensors_scheme import CompressedTensorsScheme
from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS,
                                          CompressedTensorsW4A16Sparse24)
from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS,
                                       CompressedTensorsWNA16)

from .compressed_tensors_24 import CompressedTensors24  # isort: skip

__all__ = [
    "CompressedTensorsScheme", "CompressedTensorsWNA16",
    "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24",
    "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8",
    "WNA16_SUPPORTED_BITS", "W4A16SPARSE24_SUPPORTED_BITS",
    "CompressedTensors24"
]
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.08 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_24.cpython-311.pyc ADDED
Binary file (15.6 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-311.pyc ADDED
Binary file (2.62 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-311.pyc ADDED
Binary file (7.12 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-311.pyc ADDED
Binary file (6.12 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-311.pyc ADDED
Binary file (6.8 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-311.pyc ADDED
Binary file (5.8 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-311.pyc ADDED
Binary file (7.32 kB)
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py ADDED
@@ -0,0 +1,352 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import torch
+from compressed_tensors import CompressionFormat, ModelCompressor
+from compressed_tensors.quantization import (QuantizationArgs,
+                                             QuantizationStrategy,
+                                             QuantizationType)
+from compressed_tensors.utils import combine_shards
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear)
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise, sparse_cutlass_supported)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           ChannelQuantScaleParameter,
+                                           ModelWeightParameter,
+                                           PerTensorScaleParameter)
+
+__all__ = ["CompressedTensors24"]
+
+
+class CompressedTensors24(CompressedTensorsScheme):
+
+    def __init__(
+        self,
+        quantized: bool = False,
+        weight_quant: Optional[QuantizationArgs] = None,
+        input_quant: Optional[QuantizationArgs] = None,
+        model_compression_config: Optional[Dict[str, Any]] = None,
+    ):
+        self.quantized = quantized
+        self.weight_quant = weight_quant
+        self.input_quant = input_quant
+        self.model_compressor = (
+            ModelCompressor.from_compression_config(model_compression_config)
+            if model_compression_config is not None else None)
+        self.do_sparse_decompress = (
+            self.model_compressor is not None
+            and self.model_compressor.sparsity_config.format
+            == CompressionFormat.sparse_24_bitmask.value)
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # Only cutlass 3.x kernels are implemented so far
+        return 90
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size: int,
+        output_partition_sizes: List[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
+        if not sparse_cutlass_supported():
+            raise ValueError(
+                "Sparse CUTLASS not supported. vLLM must be built with "
+                "CUDA 12.2 or later to use this feature")
+
+        self.output_dtype = params_dtype
+        layer.logical_widths = output_partition_sizes
+        layer.input_size = input_size
+        layer.input_size_per_partition = input_size_per_partition
+        self.weights_dtype: torch.dtype = self._get_params_dtype(params_dtype)
+
+        # parameter to store uncompressed weight
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=self.weights_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        if self.do_sparse_decompress:
+            assert all(partition_size % 8 == 0
+                       for partition_size in output_partition_sizes), (
+                           "All partitions must be divisible by 8 for "
+                           "2:4 sparse compressed models")
+
+            shape = BasevLLMParameter(
+                data=torch.empty(2, 1, dtype=torch.int64),
+                weight_loader=weight_loader,
+            )
+            compressed_weight = ModelWeightParameter(
+                data=torch.empty(
+                    sum(output_partition_sizes),
+                    input_size_per_partition // 2,
+                    dtype=self.weights_dtype,
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+
+            bitmask = ModelWeightParameter(
+                data=torch.empty(
+                    sum(output_partition_sizes),
+                    input_size_per_partition // 8,
+                    dtype=torch.uint8,
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+
+            layer.register_parameter("shape", shape)
+            layer.register_parameter("compressed", compressed_weight)
+            layer.register_parameter("bitmask", bitmask)
+
+        # Check if quantized, not just 2:4 Sparse
+        if self.quantized:
+            if (self.weight_quant and self.weight_quant.strategy
+                    == QuantizationStrategy.CHANNEL.value):
+                weight_scale = ChannelQuantScaleParameter(
+                    data=torch.empty((sum(output_partition_sizes), 1),
+                                     dtype=torch.float32),
+                    output_dim=0,
+                    weight_loader=weight_loader,
+                )
+            else:
+                assert (self.weight_quant and self.weight_quant.strategy
+                        == QuantizationStrategy.TENSOR.value)
+                weight_scale = PerTensorScaleParameter(
+                    data=torch.empty(len(output_partition_sizes),
+                                     dtype=torch.float32),
+                    weight_loader=weight_loader,
+                )
+
+            layer.register_parameter("weight_scale", weight_scale)
+
+            # input quant will be non-None
+            if self.input_quant and not self.input_quant.dynamic:
+                # register input quant scale
+                assert (self.input_quant.strategy ==
+                        QuantizationStrategy.TENSOR.value)
+                input_scale = BasevLLMParameter(
+                    data=torch.empty(1, dtype=torch.float32),
+                    weight_loader=weight_loader,
+                )
+
+                layer.register_parameter("input_scale", input_scale)
+
+        else:
+            # for sparse-only, pass in 1 for weight/input scales
+            weight_scale = torch.nn.Parameter(data=torch.ones(
+                1, dtype=torch.float32),
+                                              requires_grad=False)
+            input_scale = torch.nn.Parameter(data=torch.ones(
+                1, dtype=torch.float32),
+                                             requires_grad=False)
+            layer.register_parameter("input_scale", input_scale)
+            layer.register_parameter("weight_scale", weight_scale)
+
+        layer.register_parameter("weight", weight)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """
+        Compress weights after loading. Store compressed weight and meta
+        tensor.
+
+        :post-condition: layer.weight and layer.meta are set to the
+            compressed weight and meta tensor in the format expected by
+            the Cutlass kernels
+        :param layer: The layer with the weights to be processed
+
+        """
+        if self.do_sparse_decompress:
+            layer.weight.data = self._decompress_bitmask_compressed_weight(
+                compressed=layer.compressed,
+                bitmask=layer.bitmask,
+                layer=layer,
+            )
+
+            # compressed and bitmask tensors
+            # are no longer needed after decompression
+            del layer.compressed
+            del layer.bitmask
+
+        # torch.compile workaround
+        if hasattr(layer, "input_scale"):
+            layer.input_scale = torch.nn.Parameter(layer.input_scale.data,
+                                                   requires_grad=False)
+
+        if self.weight_quant:
+            if self.weight_quant.strategy == QuantizationStrategy.TENSOR.value:
+                layer.weight_scale = torch.nn.Parameter(
+                    convert_to_channelwise(
+                        weight_scale=layer.weight_scale,
+                        logical_widths=layer.logical_widths,
+                    ),
+                    requires_grad=False,
+                )
+            else:
+                # torch.compile workaround
+                layer.weight_scale = torch.nn.Parameter(
+                    layer.weight_scale.data, requires_grad=False)
+
+        w_compressed, meta = ops.cutlass_sparse_compress(layer.weight.data)
+        layer.weight = torch.nn.Parameter(w_compressed, requires_grad=False)
+        layer.meta = torch.nn.Parameter(meta, requires_grad=False)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Returns the output tensor for the layer with 2:4
+        sparse compressed weights, given the input tensor
+        and bias
+
+        :param layer: The layer with 2:4 sparse compressed
+            weights to be used for the computation
+        :param x: The input tensor to the layer
+        :param bias: The bias to be added to the output tensor
+        :return: The output tensor of the layer
+        """
+        if self.quantized:
+            scale = None
+            if hasattr(layer, "input_scale"):
+                scale = layer.input_scale
+
+            if self.weights_dtype == torch.int8:
+                ops_output = ops.scaled_int8_quant(x, scale=scale)
+                q_input = ops_output[0]
+                input_scale = ops_output[1]
+            else:
+                assert self.weights_dtype == torch.float8_e4m3fn
+                if scale is not None:
+                    q_input, input_scale = ops.scaled_fp8_quant(x, scale=scale)
+                else:
+                    q_input, input_scale = ops.scaled_fp8_quant(
+                        x, use_per_token_if_dynamic=True)
+
+        else:
+            # Not quantized; nothing to do with the input scales, use as is
+            input_scale = layer.input_scale
+            q_input = x
+
+        out = ops.cutlass_scaled_sparse_mm(
+            a=q_input,
+            bt_nzs=layer.weight,
+            bt_meta=layer.meta,
+            scale_a=input_scale,
+            scale_b=layer.weight_scale,
+            out_dtype=self.output_dtype,
+            bias=bias,
+        )
+        assert out.is_contiguous()
+        return out
+
+    def _get_params_dtype(self, params_dtype: torch.dtype) -> torch.dtype:
+        if not self.quantized:
+            return params_dtype
+
+        assert self.weight_quant is not None
+        assert self.input_quant is not None
+
+        is_8_bits = self.weight_quant.num_bits == self.input_quant.num_bits == 8
+
+        if not is_8_bits:
+            raise ValueError("Cutlass only supports 8-bit quantization")
+
+        if (self.weight_quant.type == QuantizationType.FLOAT
+                and self.input_quant.type == QuantizationType.FLOAT):
+            return torch.float8_e4m3fn
+
+        if (self.weight_quant.type == QuantizationType.INT
+                and self.input_quant.type == QuantizationType.INT):
+            return torch.int8
+
+        raise ValueError("Quantization type not supported by Cutlass")
+
+    def _decompress_bitmask_compressed_weight(
+        self,
+        compressed: torch.Tensor,
+        bitmask: torch.Tensor,
+        layer: torch.nn.Module,
+    ) -> torch.Tensor:
+        """
+        Decompress a compressed 2:4 sparse weight tensor using the bitmask and
+        return the result.
+
+        This function also supports sharded decompression.
+
+        :param compressed: The 2:4 sparse weight tensor compressed using the
+            sparse-24-bitmask compressor. This is different from
+            `cutlass_sparse_compress` which uses a different scheme (2 bits
+            for every nonzero element that represent the coordinate within
+            the block of 4). The bitmask compression here uses a bitmask to
+            indicate the positions of non-zero elements.
+        :param bitmask: The 2:4 bitmask associated with the compressed
+            weights, representing the positions of non-zero elements in the
+            compressed tensor.
+        :param layer: The layer whose weights need to be processed after
+            loading.
+        :return: The decompressed 2:4 sparse weight tensor.
+        """
+
+        sparsity_compressor = self.model_compressor.sparsity_compressor
+
+        def _process_split(
+            bitmask_compressed_weight: torch.Tensor,
+            shape,
+            bitmask: torch.Tensor,
+        ) -> torch.Tensor:
+            weight_data = dict(
+                compressed=bitmask_compressed_weight,
+                shape=shape,
+                bitmask=bitmask,
+            )
+            return sparsity_compressor.decompress_weight(weight_data)
+
+        split_weights: List[torch.Tensor] = []
+        split_bitmask: List[torch.Tensor] = []
+        split_shape: List[Tuple[int, int]] = []
+
+        if isinstance(layer, (QKVParallelLinear, MergedColumnParallelLinear)):
+            split_weights = torch.split(compressed, layer.logical_widths)
+            split_bitmask = torch.split(bitmask, layer.logical_widths)
+            split_shape = [(out, layer.input_size_per_partition)
+                           for out in layer.logical_widths]
+
+        if split_weights:
+            decompressed_shards = [
+                _process_split(compressed_weight, shape, bitmask)
+                for compressed_weight, shape, bitmask in zip(
+                    split_weights, split_shape, split_bitmask)
+            ]
+            decompressed = combine_shards(decompressed_shards)
+        else:
+            decompressed = sparsity_compressor.decompress_weight(
+                dict(
+                    compressed=compressed,
+                    shape=(
+                        layer.logical_widths[0],
+                        layer.input_size_per_partition,
+                    ),
+                    bitmask=bitmask,
+                ))
+        return decompressed
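For reference, the 2:4 ("semi-structured") sparsity these CUTLASS kernels consume means every contiguous group of four values along the reduction dimension holds at most two non-zeros, which is why the compressed weight above stores input_size_per_partition // 2 values and the bitmask uses one bit per element (input_size_per_partition // 8 bytes). A small sketch that checks this invariant on a dense tensor (is_24_sparse is a hypothetical helper, not part of vLLM):

    import torch

    def is_24_sparse(weight: torch.Tensor) -> bool:
        # View the reduction dimension as blocks of 4 and count non-zeros;
        # 2:4 sparsity allows at most two non-zeros per block.
        assert weight.shape[-1] % 4 == 0
        blocks = weight.reshape(*weight.shape[:-1], -1, 4)
        return bool((blocks != 0).sum(dim=-1).le(2).all())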
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py ADDED
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC, abstractmethod
+from typing import Optional
+
+import torch
+
+__all__ = ["CompressedTensorsScheme"]
+
+
+class CompressedTensorsScheme(ABC):
+    """
+    Abstract class used to describe the weight creation and forward pass
+    of different quantization schemes supported by CompressedTensors.
+    """
+
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        """
+        Get minimum device capability.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function
+        are the layer and scheme-specific keyword arguments.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]):
+        """
+        Run the forward pass for the particular scheme. This is where
+        scheme-specific dequant/quant steps/kernels should be applied.
+
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
+        :param x: input to the layer
+        :param bias: bias parameter
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
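As a shape reference for this interface, a no-op scheme could look like the following minimal sketch (PassthroughScheme is hypothetical, an unquantized dense fallback, not part of vLLM; real schemes register vLLM parameter classes and wire up the weight_loader):

    import torch
    import torch.nn.functional as F
    from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
        CompressedTensorsScheme)

    class PassthroughScheme(CompressedTensorsScheme):

        @classmethod
        def get_min_capability(cls) -> int:
            return 70  # assumption: no special kernel requirements

        def create_weights(self, layer, output_partition_sizes,
                           input_size_per_partition, params_dtype, **kwargs):
            # Plain dense weight; no packing or scales.
            weight = torch.nn.Parameter(
                torch.empty(sum(output_partition_sizes),
                            input_size_per_partition,
                            dtype=params_dtype),
                requires_grad=False)
            layer.register_parameter("weight", weight)

        def process_weights_after_loading(self, layer):
            pass  # dense weights need no repacking

        def apply_weights(self, layer, x, bias=None):
            return F.linear(x, layer.weight, bias)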
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py ADDED
@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, List, Optional
+
+import torch
+from torch.nn import Parameter
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
+    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           ChannelQuantScaleParameter,
+                                           GroupQuantScaleParameter,
+                                           PackedvLLMParameter)
+from vllm.scalar_type import scalar_types
+
+__all__ = ["CompressedTensorsW4A16Sparse24"]
+W4A16SPARSE24_SUPPORTED_TYPES_MAP = {
+    4: scalar_types.uint4b8,
+}
+W4A16SPARSE24_SUPPORTED_BITS = list(W4A16SPARSE24_SUPPORTED_TYPES_MAP.keys())
+
+
+class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
+
+    def __init__(self,
+                 strategy: str,
+                 num_bits: int,
+                 group_size: Optional[int] = None):
+        self.strategy = strategy
+        self.group_size = group_size
+        self.tile_size = 16
+
+        if num_bits not in W4A16SPARSE24_SUPPORTED_TYPES_MAP:
+            raise ValueError(
+                f"Unsupported num_bits = {num_bits}. "
+                f"Supported num_bits = {W4A16SPARSE24_SUPPORTED_BITS}")
+
+        self.quant_type = W4A16SPARSE24_SUPPORTED_TYPES_MAP[num_bits]
+
+        if self.strategy == "group" and self.group_size is None:
+            raise ValueError(
+                "group_size must be given when using strategy group")
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # ampere + up
+        return 80
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # required by torch.compile to be torch.nn.Parameter
+        layer.weight_packed = Parameter(layer.weight_packed.data,
+                                        requires_grad=False)
+        layer.scale_packed = Parameter(layer.scale_packed.data,
+                                       requires_grad=False)
+        layer.meta = Parameter(layer.meta.data, requires_grad=False)
+
+    def create_weights(self, layer: torch.nn.Module, input_size: int,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+
+        assert params_dtype == torch.float16, (
+            "float16 is required for marlin24 compressed models. Set dtype=torch.float16"  # noqa: E501
+        )
+
+        pack_factor = 32 // self.quant_type.size_bits
+        output_size_per_partition = sum(output_partition_sizes)
+
+        qweight = PackedvLLMParameter(data=torch.empty(
+            input_size_per_partition // self.tile_size // 2,
+            output_size_per_partition * self.tile_size // pack_factor,
+            dtype=torch.int32,
+        ),
+                                      input_dim=0,
+                                      output_dim=1,
+                                      packed_dim=1,
+                                      packed_factor=pack_factor,
+                                      marlin_tile_size=self.tile_size,
+                                      weight_loader=weight_loader)
+
+        input_groups = (1 if self.group_size is None else
+                        input_size_per_partition // self.group_size)
+
+        weight_scale_args = {
+            "data":
+            torch.empty(
+                input_groups,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            "weight_loader":
+            weight_loader
+        }
+
+        if self.group_size is not None:
+            scales = GroupQuantScaleParameter(output_dim=1,
+                                              input_dim=0,
+                                              **weight_scale_args)
+        else:
+            scales = ChannelQuantScaleParameter(output_dim=1,
+                                                **weight_scale_args)
+
+        weight_shape = BasevLLMParameter(data=torch.empty(2,
+                                                          dtype=torch.int64),
+                                         weight_loader=weight_loader)
+
+        meta = PackedvLLMParameter(data=torch.empty(
+            input_size_per_partition // 8 // 2 // 2,
+            output_size_per_partition * 2,
+            dtype=torch.int16,
+        ),
+                                   input_dim=0,
+                                   output_dim=1,
+                                   packed_dim=1,
+                                   packed_factor=1,
+                                   marlin_tile_size=2,
+                                   weight_loader=weight_loader)
+
+        layer.register_parameter("weight_packed", qweight)
+        layer.register_parameter("weight_shape", weight_shape)
+        layer.register_parameter("scale_packed", scales)
+        layer.register_parameter("meta", meta)
+
+        max_workspace_size = (
+            output_size_per_partition //
+            GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL
+
+        workspace = Parameter(torch.zeros(max_workspace_size,
+                                          dtype=torch.int),
+                              requires_grad=False)
+        layer.workspace = workspace
+
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+
+        qweight = layer.weight_packed
+        meta = layer.meta
+        scales = layer.scale_packed
+        workspace = layer.workspace
+
+        x_2d = x.view(-1, x.shape[-1])
+
+        size_m = x_2d.shape[0]
+        size_k = x_2d.shape[1]
+        size_n = scales.shape[1]
+
+        output_2d = ops.gptq_marlin_24_gemm(x_2d, qweight, meta, scales,
+                                            workspace, self.quant_type,
+                                            size_m, size_n, size_k)
+
+        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], ))
+
+        if bias is not None:
+            output.add_(bias)  # In-place add
+
+        return output
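The parameter shapes above follow directly from the packing: with 4-bit weights, pack_factor = 32 // 4 = 8 values per int32 word, the Marlin tile size is 16, and 2:4 sparsity halves the stored K dimension, so qweight holds (K / 16 / 2) x (N * 16 / 8) int32 words. A quick worked check with illustrative sizes (the K and N values here are hypothetical):

    # Hypothetical partition sizes for illustration.
    K, N = 4096, 4096
    tile_size, num_bits = 16, 4
    pack_factor = 32 // num_bits                    # 8 int4 values per int32
    qweight_shape = (K // tile_size // 2,           # 128 rows: 2:4 halves K
                     N * tile_size // pack_factor)  # 8192 columns
    meta_shape = (K // 8 // 2 // 2, N * 2)          # int16 sparsity metadata
    print(qweight_shape, meta_shape)                # (128, 8192) (128, 8192)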