drawais committed on
Commit
b85c59f
·
verified ·
1 Parent(s): 216c477

Initial upload of Phi-4-NVFP4

Browse files
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
NOTICE ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ NOTICE
2
+
3
+ This artifact is a derivative work of microsoft/phi-4, distributed under the MIT License.
4
+ The full license text is in the LICENSE file at the root of this repository.
5
+
6
+ Source model: https://huggingface.co/microsoft/phi-4
README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ license_link: https://opensource.org/license/mit
4
+ base_model: microsoft/phi-4
5
+ tags:
6
+ - quantized
7
+ - 4-bit
8
+ - fp4
9
+ - nvfp4
10
+ language:
11
+ - en
12
+ library_name: transformers
13
+ pipeline_tag: text-generation
14
+ ---
15
+
16
+ # Phi-4-NVFP4
17
+
18
+ NVFP4 (4-bit floating-point) quantization of [`microsoft/phi-4`](https://huggingface.co/microsoft/phi-4).
19
+
20
+ Microsoft Phi-4 in NVFP4 W4A4. Native vLLM compressed-tensors. About 9.7 GB on disk.
21
+
22
+ | Property | Value |
23
+ |---|---|
24
+ | Base model | [microsoft/phi-4](https://huggingface.co/microsoft/phi-4) |
25
+ | Quantization | NVFP4 (FP4, W4A4) |
26
+ | Approx. on-disk size | ~9.7 GB |
27
+ | License | MIT License |
28
+ | Languages | English |
29
+
30
+ ## Load (vLLM)
31
+
32
+ ```bash
33
+ vllm serve drawais/Phi-4-NVFP4 \
34
+ --max-model-len 32768 \
35
+ --gpu-memory-utilization 0.94
36
+ ```
37
+
38
+ ```python
39
+ from vllm import LLM, SamplingParams
40
+ llm = LLM(model="drawais/Phi-4-NVFP4", max_model_len=32768)
41
+ print(llm.generate(["Hello!"], SamplingParams(max_tokens=128))[0].outputs[0].text)
42
+ ```
43
+
44
+ ## Footprint
45
+
46
+ ~9.7 GB on disk. Recommended VRAM: at least ~10 GB for the weights, plus headroom for KV cache and activations.
47
+
48
+ ## License & attribution
49
+
50
+ This artifact is a derivative work of [`microsoft/phi-4`](https://huggingface.co/microsoft/phi-4),
51
+ released by its original authors under the **MIT License**.
52
+
53
+ This artifact is distributed under the same license. The full license text is
54
+ included in [`LICENSE`](LICENSE), and required attribution is in [`NOTICE`](NOTICE).
55
+
56
+ License text: https://opensource.org/license/mit
57
+ Source model: https://huggingface.co/microsoft/phi-4
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>' + message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}
config.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Phi3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 100257,
8
+ "dtype": "bfloat16",
9
+ "embd_pdrop": 0.0,
10
+ "eos_token_id": 100265,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 5120,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 17920,
15
+ "max_position_embeddings": 16384,
16
+ "model_type": "phi3",
17
+ "num_attention_heads": 40,
18
+ "num_hidden_layers": 40,
19
+ "num_key_value_heads": 10,
20
+ "original_max_position_embeddings": 16384,
21
+ "pad_token_id": 100349,
22
+ "quantization_config": {
23
+ "config_groups": {
24
+ "group_0": {
25
+ "format": "nvfp4-pack-quantized",
26
+ "input_activations": {
27
+ "actorder": null,
28
+ "block_structure": null,
29
+ "dynamic": "local",
30
+ "group_size": 16,
31
+ "num_bits": 4,
32
+ "observer": "static_minmax",
33
+ "observer_kwargs": {},
34
+ "scale_dtype": "torch.float8_e4m3fn",
35
+ "strategy": "tensor_group",
36
+ "symmetric": true,
37
+ "type": "float",
38
+ "zp_dtype": null
39
+ },
40
+ "output_activations": null,
41
+ "targets": [
42
+ "Linear"
43
+ ],
44
+ "weights": {
45
+ "actorder": null,
46
+ "block_structure": null,
47
+ "dynamic": false,
48
+ "group_size": 16,
49
+ "num_bits": 4,
50
+ "observer": "memoryless_minmax",
51
+ "observer_kwargs": {},
52
+ "scale_dtype": "torch.float8_e4m3fn",
53
+ "strategy": "tensor_group",
54
+ "symmetric": true,
55
+ "type": "float",
56
+ "zp_dtype": null
57
+ }
58
+ }
59
+ },
60
+ "format": "nvfp4-pack-quantized",
61
+ "global_compression_ratio": null,
62
+ "ignore": [
63
+ "lm_head"
64
+ ],
65
+ "kv_cache_scheme": null,
66
+ "quant_method": "compressed-tensors",
67
+ "quantization_status": "compressed",
68
+ "sparsity_config": {},
69
+ "transform_config": {},
70
+ "version": "0.15.1.a20260428"
71
+ },
72
+ "resid_pdrop": 0.0,
73
+ "rms_norm_eps": 1e-05,
74
+ "rope_parameters": {
75
+ "partial_rotary_factor": 1.0,
76
+ "rope_theta": 250000,
77
+ "rope_type": "default"
78
+ },
79
+ "sliding_window": null,
80
+ "tie_word_embeddings": false,
81
+ "transformers_version": "5.8.0.dev0",
82
+ "use_cache": true,
83
+ "vocab_size": 100352
84
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 100257,
4
+ "eos_token_id": 100265,
5
+ "pad_token_id": 100349,
6
+ "transformers_version": "5.8.0.dev0"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56d99adbc01934fd87dcc0bb6ec336a3855f618d31b3105c365887ead5be81a6
3
+ size 9723835440
recipe.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ default_stage:
2
+ default_modifiers:
3
+ QuantizationModifier:
4
+ targets: [Linear]
5
+ ignore: [lm_head, 're:.*embed.*', 're:.*router.*']
6
+ scheme: NVFP4
7
+ bypass_divisibility_checks: false
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|endoftext|>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "is_local": false,
8
+ "local_files_only": false,
9
+ "model_max_length": 16384,
10
+ "pad_token": "<|dummy_85|>",
11
+ "tokenizer_class": "TokenizersBackend"
12
+ }