drawais committed on
Commit
572b359
·
verified ·
1 Parent(s): 677604d

Initial upload of Phi-4-mini-instruct-AWQ-INT4

Browse files
Files changed (11) hide show
  1. .gitattributes +1 -0
  2. LICENSE +21 -0
  3. NOTICE +6 -0
  4. README.md +57 -0
  5. chat_template.jinja +1 -0
  6. config.json +183 -0
  7. generation_config.json +10 -0
  8. model.safetensors +3 -0
  9. recipe.yaml +23 -0
  10. tokenizer.json +3 -0
  11. tokenizer_config.json +13 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
NOTICE ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ NOTICE
2
+
3
+ This artifact is a derivative work of microsoft/Phi-4-mini-instruct, distributed under the MIT License.
4
+ The full license text is in the LICENSE file at the root of this repository.
5
+
6
+ Source model: https://huggingface.co/microsoft/Phi-4-mini-instruct
README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ license_link: https://opensource.org/license/mit
4
+ base_model: microsoft/Phi-4-mini-instruct
5
+ tags:
6
+ - quantized
7
+ - 4-bit
8
+ - int4
9
+ - awq
10
+ language:
11
+ - en
12
+ library_name: transformers
13
+ pipeline_tag: text-generation
14
+ ---
15
+
16
+ # Phi-4-mini-instruct-AWQ-INT4
17
+
18
+ INT4 weight-only quantization of [`microsoft/Phi-4-mini-instruct`](https://huggingface.co/microsoft/Phi-4-mini-instruct).
19
+
20
+ Microsoft Phi-4-mini-instruct in INT4. About 2.9 GB on disk. Fits on a 4 GB consumer GPU with limited KV-cache headroom.
21
+
22
+ | Property | Value |
23
+ |---|---|
24
+ | Base model | [microsoft/Phi-4-mini-instruct](https://huggingface.co/microsoft/Phi-4-mini-instruct) |
25
+ | Quantization | INT4 weight-only |
26
+ | Approx. on-disk size | ~2.9 GB |
27
+ | License | MIT License |
28
+ | Languages | English |
29
+
30
+ ## Load (vLLM)
31
+
32
+ ```bash
33
+ vllm serve drawais/Phi-4-mini-instruct-AWQ-INT4 \
34
+ --max-model-len 32768 \
35
+ --gpu-memory-utilization 0.94
36
+ ```
37
+
38
+ ```python
39
+ from vllm import LLM, SamplingParams
40
+ llm = LLM(model="drawais/Phi-4-mini-instruct-AWQ-INT4", max_model_len=32768)
41
+ print(llm.generate(["Hello!"], SamplingParams(max_tokens=128))[0].outputs[0].text)
42
+ ```
43
+
44
+ ## Footprint
45
+
46
+ ~2.9 GB on disk. Recommended VRAM: at least 4 GB — ~2.9 GB for the weights plus headroom for the KV cache (more for long contexts).
47
+
48
+ ## License & attribution
49
+
50
+ This artifact is a derivative work of [`microsoft/Phi-4-mini-instruct`](https://huggingface.co/microsoft/Phi-4-mini-instruct),
51
+ released by its original authors under the **MIT License**.
52
+
53
+ This artifact is distributed under the same license. The full license text is
54
+ included in [`LICENSE`](LICENSE), and required attribution is in [`NOTICE`](NOTICE).
55
+
56
+ License text: https://opensource.org/license/mit
57
+ Source model: https://huggingface.co/microsoft/Phi-4-mini-instruct
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}
config.json ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Phi3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_phi3.Phi3Config",
9
+ "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM",
10
+ "AutoTokenizer": "Xenova/gpt-4o"
11
+ },
12
+ "bos_token_id": 199999,
13
+ "dtype": "bfloat16",
14
+ "embd_pdrop": 0.0,
15
+ "eos_token_id": 199999,
16
+ "full_attn_mod": 1,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 3072,
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 8192,
21
+ "interpolate_factor": 1,
22
+ "lm_head_bias": false,
23
+ "max_position_embeddings": 131072,
24
+ "mlp_bias": false,
25
+ "model_type": "phi3",
26
+ "num_attention_heads": 24,
27
+ "num_hidden_layers": 32,
28
+ "num_key_value_heads": 8,
29
+ "original_max_position_embeddings": 4096,
30
+ "pad_token_id": 199999,
31
+ "partial_rotary_factor": 0.75,
32
+ "quantization_config": {
33
+ "config_groups": {
34
+ "group_0": {
35
+ "format": "pack-quantized",
36
+ "input_activations": null,
37
+ "output_activations": null,
38
+ "targets": [
39
+ "Linear"
40
+ ],
41
+ "weights": {
42
+ "actorder": null,
43
+ "block_structure": null,
44
+ "dynamic": false,
45
+ "group_size": 128,
46
+ "num_bits": 4,
47
+ "observer": "memoryless_minmax",
48
+ "observer_kwargs": {},
49
+ "scale_dtype": null,
50
+ "strategy": "group",
51
+ "symmetric": true,
52
+ "type": "int",
53
+ "zp_dtype": null
54
+ }
55
+ }
56
+ },
57
+ "format": "pack-quantized",
58
+ "global_compression_ratio": null,
59
+ "ignore": [
60
+ "lm_head"
61
+ ],
62
+ "kv_cache_scheme": null,
63
+ "quant_method": "compressed-tensors",
64
+ "quantization_status": "compressed",
65
+ "sparsity_config": {},
66
+ "transform_config": {},
67
+ "version": "0.15.1.a20260428"
68
+ },
69
+ "resid_pdrop": 0.0,
70
+ "rms_norm_eps": 1e-05,
71
+ "rope_parameters": {
72
+ "long_factor": [
73
+ 1,
74
+ 1.118320672,
75
+ 1.250641126,
76
+ 1.398617824,
77
+ 1.564103225,
78
+ 1.74916897,
79
+ 1.956131817,
80
+ 2.187582649,
81
+ 2.446418898,
82
+ 2.735880826,
83
+ 3.059592084,
84
+ 3.421605075,
85
+ 3.826451687,
86
+ 4.279200023,
87
+ 4.785517845,
88
+ 5.351743533,
89
+ 5.984965424,
90
+ 6.693110555,
91
+ 7.485043894,
92
+ 8.370679318,
93
+ 9.36110372,
94
+ 10.4687158,
95
+ 11.70738129,
96
+ 13.09260651,
97
+ 14.64173252,
98
+ 16.37415215,
99
+ 18.31155283,
100
+ 20.47818807,
101
+ 22.90118105,
102
+ 25.61086418,
103
+ 28.64115884,
104
+ 32.03,
105
+ 32.1,
106
+ 32.13,
107
+ 32.23,
108
+ 32.6,
109
+ 32.61,
110
+ 32.64,
111
+ 32.66,
112
+ 32.7,
113
+ 32.71,
114
+ 32.93,
115
+ 32.97,
116
+ 33.28,
117
+ 33.49,
118
+ 33.5,
119
+ 44.16,
120
+ 47.77
121
+ ],
122
+ "original_max_position_embeddings": 4096,
123
+ "partial_rotary_factor": 0.75,
124
+ "rope_theta": 10000.0,
125
+ "rope_type": "longrope",
126
+ "short_factor": [
127
+ 1.0,
128
+ 1.0,
129
+ 1.0,
130
+ 1.0,
131
+ 1.0,
132
+ 1.0,
133
+ 1.0,
134
+ 1.0,
135
+ 1.0,
136
+ 1.0,
137
+ 1.0,
138
+ 1.0,
139
+ 1.0,
140
+ 1.0,
141
+ 1.0,
142
+ 1.0,
143
+ 1.0,
144
+ 1.0,
145
+ 1.0,
146
+ 1.0,
147
+ 1.0,
148
+ 1.0,
149
+ 1.0,
150
+ 1.0,
151
+ 1.0,
152
+ 1.0,
153
+ 1.0,
154
+ 1.0,
155
+ 1.0,
156
+ 1.0,
157
+ 1.0,
158
+ 1.0,
159
+ 1.0,
160
+ 1.0,
161
+ 1.0,
162
+ 1.0,
163
+ 1.0,
164
+ 1.0,
165
+ 1.0,
166
+ 1.0,
167
+ 1.0,
168
+ 1.0,
169
+ 1.0,
170
+ 1.0,
171
+ 1.0,
172
+ 1.0,
173
+ 1.0,
174
+ 1.0
175
+ ],
176
+ "type": "longrope"
177
+ },
178
+ "sliding_window": 262144,
179
+ "tie_word_embeddings": true,
180
+ "transformers_version": "5.8.0.dev0",
181
+ "use_cache": true,
182
+ "vocab_size": 200064
183
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 199999,
4
+ "eos_token_id": [
5
+ 200020,
6
+ 199999
7
+ ],
8
+ "pad_token_id": 199999,
9
+ "transformers_version": "5.8.0.dev0"
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44ccf7b76f5a35d7aee0e5b92f50b8794894b7155c33247ebb2680e420a87f3b
3
+ size 2890590792
recipe.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_stage:
2
+ default_modifiers:
3
+ AWQModifier:
4
+ mappings:
5
+ - smooth_layer: re:.*input_layernorm$
6
+ balance_layers: ['re:.*qkv_proj$']
7
+ activation_hook_target: null
8
+ - smooth_layer: re:.*qkv_proj$
9
+ balance_layers: ['re:.*o_proj$']
10
+ activation_hook_target: null
11
+ - smooth_layer: re:.*post_attention_layernorm$
12
+ balance_layers: ['re:.*gate_up_proj$']
13
+ activation_hook_target: null
14
+ - smooth_layer: re:.*gate_up_proj$
15
+ balance_layers: ['re:.*down_proj$']
16
+ activation_hook_target: null
17
+ duo_scaling: true
18
+ n_grid: 20
19
+ QuantizationModifier:
20
+ targets: [Linear]
21
+ ignore: [lm_head, 're:.*embed.*', 're:.*router.*', 're:.*\.gate$']
22
+ scheme: W4A16
23
+ bypass_divisibility_checks: false
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3e90cb29c535164fb1d13876eb66385b5e57e6c1165d35117f3bb16ce114bbe
3
+ size 15524575
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|endoftext|>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|endoftext|>",
7
+ "is_local": false,
8
+ "local_files_only": false,
9
+ "model_max_length": 131072,
10
+ "pad_token": "<|endoftext|>",
11
+ "tokenizer_class": "TokenizersBackend",
12
+ "unk_token": "<|endoftext|>"
13
+ }