drawais committed
Commit e313e8a · verified · 1 Parent(s): 0107cd0

Initial upload of DeepSeek-R1-Distill-Llama-8B-AWQ-INT4

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 DeepSeek
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
NOTICE ADDED
@@ -0,0 +1,6 @@
+ NOTICE
+
+ This artifact is a derivative work of deepseek-ai/DeepSeek-R1-Distill-Llama-8B, distributed under the MIT License.
+ The full license text is in the LICENSE file at the root of this repository.
+
+ Source model: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B
README.md ADDED
@@ -0,0 +1,57 @@
+ ---
+ license: mit
+ license_link: https://opensource.org/license/mit
+ base_model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+ tags:
+ - quantized
+ - 4-bit
+ - int4
+ - awq
+ language:
+ - en
+ library_name: transformers
+ pipeline_tag: text-generation
+ ---
+
+ # DeepSeek-R1-Distill-Llama-8B-AWQ-INT4
+
+ AWQ INT4 weight-only quantization of [`deepseek-ai/DeepSeek-R1-Distill-Llama-8B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B).
+
+ DeepSeek-R1 reasoning distilled into Llama 8B, then quantized to INT4. About 5.7 GB on disk; runs on an 8 GB consumer GPU at moderate context lengths.
+
+ | Property | Value |
+ |---|---|
+ | Base model | [deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) |
+ | Quantization | AWQ INT4 weight-only (W4A16, symmetric, group size 128) |
+ | Approx. on-disk size | ~5.7 GB |
+ | License | MIT License |
+ | Languages | English |
+
+ ## Load (vLLM)
+
+ ```bash
+ vllm serve drawais/DeepSeek-R1-Distill-Llama-8B-AWQ-INT4 \
+   --max-model-len 32768 \
+   --gpu-memory-utilization 0.94
+ ```
+
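+ Once the server is up, any OpenAI-compatible client can query it. A minimal sketch, assuming vLLM's default local endpoint `http://localhost:8000/v1`:
+
+ ```python
+ # Sketch: query the vLLM server via its OpenAI-compatible API.
+ # The api_key value is a placeholder; vLLM ignores it unless a key is configured.
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+ resp = client.chat.completions.create(
+     model="drawais/DeepSeek-R1-Distill-Llama-8B-AWQ-INT4",
+     messages=[{"role": "user", "content": "Explain AWQ in one sentence."}],
+     max_tokens=128,
+ )
+ print(resp.choices[0].message.content)
+ ```
+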
+ For offline, in-process use:
+
+ ```python
+ from vllm import LLM, SamplingParams
+
+ llm = LLM(model="drawais/DeepSeek-R1-Distill-Llama-8B-AWQ-INT4", max_model_len=32768)
+ print(llm.generate(["Hello!"], SamplingParams(max_tokens=128))[0].outputs[0].text)
+ ```
+
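+ ## Load (Transformers)
+
+ The checkpoint is stored in the compressed-tensors format, so it should also load through Transformers with the `compressed-tensors` package installed. A sketch, assuming recent library versions:
+
+ ```python
+ # Sketch: load via Transformers (requires: pip install compressed-tensors).
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_id = "drawais/DeepSeek-R1-Distill-Llama-8B-AWQ-INT4"
+ tok = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+
+ inputs = tok.apply_chat_template(
+     [{"role": "user", "content": "Hello!"}],
+     add_generation_prompt=True,
+     return_tensors="pt",
+ ).to(model.device)
+ out = model.generate(inputs, max_new_tokens=128, do_sample=True,
+                      temperature=0.6, top_p=0.95)  # the card's shipped defaults
+ print(tok.decode(out[0][inputs.shape[-1]:], skip_special_tokens=True))
+ ```
+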
+ ## Footprint
+
+ ~5.7 GB of weights on disk. Budget VRAM for the weights plus the KV cache: with this model's shapes, a 16-bit cache costs 128 KiB per token of context, so an 8 GB card favors shorter contexts. A worked estimate follows.
+
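+ The estimate below uses the shape values from `config.json` and assumes a 16-bit KV cache:
+
+ ```python
+ # KV-cache cost per token, from config.json (16-bit cache assumed).
+ layers, kv_heads, head_dim, bytes_per_elem = 32, 8, 128, 2
+
+ kv_per_token = 2 * layers * kv_heads * head_dim * bytes_per_elem  # K and V
+ print(kv_per_token)                  # 131072 bytes = 128 KiB per token
+ print(kv_per_token * 32768 / 2**30)  # ~4.0 GiB at the full 32k context
+ ```
+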
+ ## License & attribution
+
+ This artifact is a derivative work of [`deepseek-ai/DeepSeek-R1-Distill-Llama-8B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B),
+ released by its original authors under the **MIT License**.
+
+ This artifact is distributed under the same license. The full license text is
+ included in [`LICENSE`](LICENSE), and required attribution is in [`NOTICE`](NOTICE).
+
+ License text: https://opensource.org/license/mit
+ Source model: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B
chat_template.jinja ADDED
@@ -0,0 +1 @@
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\n'}}{% endif %}
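The template takes the last system message as a preamble, wraps user turns in `<|User|>`, strips any `<think>...</think>` prefix from stored assistant turns, renders tool calls and outputs with the `<|tool▁...|>` markers, and appends `<|Assistant|><think>\n` when a generation prompt is requested. A minimal sketch of rendering it through the tokenizer:

```python
# Sketch: render the chat template above without tokenizing.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("drawais/DeepSeek-R1-Distill-Llama-8B-AWQ-INT4")
prompt = tok.apply_chat_template(
    [
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "What is 17 * 3?"},
    ],
    tokenize=False,
    add_generation_prompt=True,  # appends "<|Assistant|><think>\n"
)
print(prompt)
```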
config.json ADDED
@@ -0,0 +1,73 @@
+ {
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 128000,
+   "dtype": "bfloat16",
+   "eos_token_id": 128001,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "max_position_embeddings": 131072,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 8,
+   "pad_token_id": null,
+   "pretraining_tp": 1,
+   "quantization_config": {
+     "config_groups": {
+       "group_0": {
+         "format": "pack-quantized",
+         "input_activations": null,
+         "output_activations": null,
+         "targets": [
+           "Linear"
+         ],
+         "weights": {
+           "actorder": null,
+           "block_structure": null,
+           "dynamic": false,
+           "group_size": 128,
+           "num_bits": 4,
+           "observer": "memoryless_minmax",
+           "observer_kwargs": {},
+           "scale_dtype": null,
+           "strategy": "group",
+           "symmetric": true,
+           "type": "int",
+           "zp_dtype": null
+         }
+       }
+     },
+     "format": "pack-quantized",
+     "global_compression_ratio": null,
+     "ignore": [
+       "lm_head"
+     ],
+     "kv_cache_scheme": null,
+     "quant_method": "compressed-tensors",
+     "quantization_status": "compressed",
+     "sparsity_config": {},
+     "transform_config": {},
+     "version": "0.15.1.a20260428"
+   },
+   "rms_norm_eps": 1e-05,
+   "rope_parameters": {
+     "factor": 8.0,
+     "high_freq_factor": 4.0,
+     "low_freq_factor": 1.0,
+     "original_max_position_embeddings": 8192,
+     "rope_theta": 500000.0,
+     "rope_type": "llama3"
+   },
+   "tie_word_embeddings": false,
+   "transformers_version": "5.8.0.dev0",
+   "use_cache": true,
+   "vocab_size": 128256
+ }
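The `quantization_config` block says every `Linear` weight except `lm_head` is stored as packed INT4 in compressed-tensors format, quantized symmetrically in groups of 128 along the input dimension with one scale per group. For intuition, a minimal NumPy sketch of that scheme (not the compressed-tensors implementation):

```python
# Sketch of group-wise symmetric INT4 quantization as described by
# quantization_config (strategy=group, group_size=128, symmetric, 4-bit).
import numpy as np

def quantize_int4_grouped(w: np.ndarray, group_size: int = 128):
    """Quantize a [out, in] weight matrix; returns int4-range codes and scales."""
    out_dim, in_dim = w.shape
    g = w.reshape(out_dim, in_dim // group_size, group_size)
    scales = np.abs(g).max(axis=-1, keepdims=True) / 7.0  # int4 max is 7
    scales = np.maximum(scales, 1e-8)                     # guard all-zero groups
    q = np.clip(np.round(g / scales), -8, 7).astype(np.int8)
    return q.reshape(out_dim, in_dim), scales.squeeze(-1)

def dequantize_int4_grouped(q: np.ndarray, scales: np.ndarray, group_size: int = 128):
    out_dim, in_dim = q.shape
    g = q.reshape(out_dim, in_dim // group_size, group_size).astype(np.float32)
    return (g * scales[..., None]).reshape(out_dim, in_dim)

w = np.random.randn(16, 256).astype(np.float32)
q, s = quantize_int4_grouped(w)
print("max reconstruction error:", np.abs(w - dequantize_int4_grouped(q, s)).max())
```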
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 128000,
+   "do_sample": true,
+   "eos_token_id": 128001,
+   "temperature": 0.6,
+   "top_p": 0.95,
+   "transformers_version": "5.8.0.dev0"
+ }
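These are the sampling defaults shipped with the checkpoint (sampling on, temperature 0.6, top-p 0.95). With vLLM they map directly onto `SamplingParams`; a one-liner sketch (the `max_tokens` value is an arbitrary choice):

```python
# Mirror the shipped generation defaults in vLLM's sampling parameters.
from vllm import SamplingParams

params = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=512)
```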
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9ce8cbb6a0d871f4e04040bb0f52b8176790b6741377793b9a4f12765f61836
+ size 5700679568
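This is a Git LFS pointer: the real ~5.7 GB weights file is fetched by LFS, and the pointer records its SHA-256 digest and byte size. A sketch for verifying a local download against it (the local path is an assumption):

```python
# Verify a downloaded model.safetensors against the LFS pointer above.
import hashlib
from pathlib import Path

EXPECTED_SHA256 = "c9ce8cbb6a0d871f4e04040bb0f52b8176790b6741377793b9a4f12765f61836"
EXPECTED_SIZE = 5_700_679_568

path = Path("model.safetensors")  # assumed local path
assert path.stat().st_size == EXPECTED_SIZE, "size mismatch"

h = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == EXPECTED_SHA256, "hash mismatch"
print("model.safetensors matches the LFS pointer")
```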
recipe.yaml ADDED
@@ -0,0 +1,23 @@
+ default_stage:
+   default_modifiers:
+     AWQModifier:
+       mappings:
+       - smooth_layer: re:.*input_layernorm$
+         balance_layers: ['re:.*q_proj$', 're:.*k_proj$', 're:.*v_proj$']
+         activation_hook_target: null
+       - smooth_layer: re:.*v_proj$
+         balance_layers: ['re:.*o_proj$']
+         activation_hook_target: null
+       - smooth_layer: re:.*post_attention_layernorm$
+         balance_layers: ['re:.*gate_proj$', 're:.*up_proj$']
+         activation_hook_target: null
+       - smooth_layer: re:.*up_proj$
+         balance_layers: ['re:.*down_proj$']
+         activation_hook_target: null
+       duo_scaling: true
+       n_grid: 20
+     QuantizationModifier:
+       targets: [Linear]
+       ignore: [lm_head, 're:.*embed.*', 're:.*router.*', 're:.*\.gate$']
+       scheme: W4A16
+       bypass_divisibility_checks: false
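This is an llm-compressor recipe: the `AWQModifier` searches per-channel smoothing scales between each norm or projection and the layers it feeds (duo scaling, 20 grid points), then the `QuantizationModifier` applies the W4A16 scheme to every `Linear` layer except embeddings, `lm_head`, and router/gate layers. A hedged sketch of applying such a recipe with llm-compressor's one-shot flow; the entry point and arguments vary across versions, and the calibration dataset named here is an assumption:

```python
# Sketch: apply the recipe above with llm-compressor's one-shot flow.
# API details differ between llm-compressor versions; treat as an outline.
from llmcompressor import oneshot

oneshot(
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    recipe="recipe.yaml",                 # the recipe shown above
    dataset="open_platypus",              # calibration data (assumption)
    max_seq_length=2048,
    num_calibration_samples=512,
    output_dir="DeepSeek-R1-Distill-Llama-8B-AWQ-INT4",
)
```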
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01198a3edd0ca190a83e4b45098d4c267ca39d7a1ea571cd5f43af20b4227438
+ size 17208730
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "add_prefix_space": null,
+   "backend": "tokenizers",
+   "bos_token": "<|begin▁of▁sentence|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|end▁of▁sentence|>",
+   "is_local": false,
+   "local_files_only": false,
+   "model_max_length": 16384,
+   "pad_token": "<|end▁of▁sentence|>",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": null,
+   "use_default_system_prompt": false
+ }