darkmaniac7 commited on
Commit
11595f5
·
verified ·
1 Parent(s): ea71244

Upload folder using huggingface_hub

Browse files
Files changed (9) hide show
  1. .gitattributes +2 -0
  2. README.md +66 -0
  3. config.json +21 -0
  4. config_cpu.json +9 -0
  5. draft_config_cpu.json +9 -0
  6. llm.mnn +3 -0
  7. llm.mnn.weight +3 -0
  8. llm_config.json +28 -0
  9. tokenizer.txt +0 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ llm.mnn filter=lfs diff=lfs merge=lfs -text
37
+ llm.mnn.weight filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TokForge Acceleration Pack — Draft Model
2
+
3
+ **Qwen3-0.6B (MNN)** draft model for speculative decoding in [TokForge](https://tokforge.ai).
4
+
5
+ ## What is this?
6
+
7
+ This is a curated, ready-to-use 0.6B parameter draft model packaged for TokForge's speculative decoding pipeline. Speculative decoding uses a small, fast "draft" model to propose candidate tokens, which are then verified in parallel by the larger "target" model. This amortizes the cost of target model inference and delivers significant speed improvements.
8
+
9
+ ## Performance
10
+
11
+ Tested on Snapdragon 8 Elite 2 (SM8850 / Adreno 840):
12
+
13
+ | Target Model | Baseline (tok/s) | With Spec-Decode (tok/s) | Improvement |
14
+ |---|---|---|---|
15
+ | Qwen3-8B (OpenCL) | 11.41 | 19.60 | **+72%** |
16
+ | Qwen3.5-9B (CPU) | 9.55 | 16.67 | **+75%** |
17
+
18
+ Improvements vary by device and target model. Typical improvement: **+18% to +75%** decode speed on flagship SoCs (SM8850, SM8650).
19
+
20
+ ## Compatible Target Models
21
+
22
+ - Qwen3-4B (MNN, OpenCL or CPU)
23
+ - Qwen3-8B (MNN, OpenCL)
24
+ - Qwen3-14B (MNN, OpenCL, 24GB+ RAM devices)
25
+ - Qwen3.5-9B (MNN, CPU)
26
+
27
+ ## Installation
28
+
29
+ ### Automatic (Recommended)
30
+
31
+ In TokForge: **Settings > Advanced > Speculative Decoding > Download Acceleration Pack**
32
+
33
+ The app will download and configure the draft model automatically.
34
+
35
+ ### Manual
36
+
37
+ 1. Download all files from this repository
38
+ 2. Copy them to: `<device>/Android/data/dev.tokforge/files/models/spec-decode-draft/`
39
+ 3. Enable speculative decoding in TokForge Settings > Advanced
40
+
41
+ ## Files
42
+
43
+ | File | Size | Description |
44
+ |---|---|---|
45
+ | `llm.mnn` | ~450 KB | MNN model graph |
46
+ | `llm.mnn.weight` | ~430 MB | Model weights |
47
+ | `tokenizer.txt` | ~3 MB | Tokenizer vocabulary |
48
+ | `config.json` | <1 KB | Model configuration |
49
+ | `llm_config.json` | ~5 KB | LLM inference configuration |
50
+ | `config_cpu.json` | <1 KB | CPU backend configuration for draft inference |
51
+ | `draft_config_cpu.json` | <1 KB | Draft-specific CPU configuration |
52
+
53
+ ## Technical Details
54
+
55
+ - **Architecture**: Qwen3-0.6B (28 transformer layers, 1024 hidden dim)
56
+ - **Format**: MNN (Mobile Neural Network by Alibaba)
57
+ - **Backend**: CPU with single-thread, low-precision inference for minimal draft overhead (~21ms/token)
58
+ - **Optimal draft length**: d=2 for 8B targets, d=3 for 9B targets
59
+
60
+ ## Source
61
+
62
+ Based on [taobao-mnn/Qwen3-0.6B-MNN](https://huggingface.co/taobao-mnn/Qwen3-0.6B-MNN), repackaged with TokForge-specific draft configurations.
63
+
64
+ ## License
65
+
66
+ This model inherits the license from [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B). See the original model card for license terms.
config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "llm_model": "llm.mnn",
3
+ "llm_weight": "llm.mnn.weight",
4
+ "backend_type": "cpu",
5
+ "thread_num": 4,
6
+ "precision": "low",
7
+ "memory": "low",
8
+ "sampler_type": "mixed",
9
+ "mixed_samplers": [
10
+ "penalty",
11
+ "topK",
12
+ "topP",
13
+ "min_p",
14
+ "temperature"
15
+ ],
16
+ "penalty": 1.1,
17
+ "temperature": 0.6,
18
+ "topP": 0.95,
19
+ "topK": 20,
20
+ "min_p": 0
21
+ }
config_cpu.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "llm_model": "llm.mnn",
3
+ "llm_weight": "llm.mnn.weight",
4
+ "backend_type": "cpu",
5
+ "thread_num": 1,
6
+ "precision": "low",
7
+ "power": "high",
8
+ "sampler_type": "greedy"
9
+ }
draft_config_cpu.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "llm_model": "llm.mnn",
3
+ "llm_weight": "llm.mnn.weight",
4
+ "backend_type": "cpu",
5
+ "thread_num": 1,
6
+ "precision": "low",
7
+ "power": "high",
8
+ "sampler_type": "greedy"
9
+ }
llm.mnn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d426c65a5159c938ccc237cdfbd982137f276804f27b414ca0ecf3fc0a660f8c
3
+ size 461520
llm.mnn.weight ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:953afb7e0165818add34a7a6caf0af5d0ed9428da102eb22c9b98ee9da292e9f
3
+ size 450810338
llm_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "hidden_size": 1024,
3
+ "layer_nums": 28,
4
+ "attention_mask": "float",
5
+ "key_value_shape": [
6
+ 2,
7
+ 1,
8
+ 0,
9
+ 8,
10
+ 128
11
+ ],
12
+ "bos": "",
13
+ "system_prompt_template": "<|im_start|>system\n%s<|im_end|>\n",
14
+ "user_prompt_template": "<|im_start|>user\n%s<|im_end|>\n",
15
+ "assistant_prompt_template": "<|im_start|>assistant\n%s<|im_end|>\n",
16
+ "is_visual": false,
17
+ "jinja": {
18
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
19
+ "eos": "<|im_end|>"
20
+ },
21
+ "tie_embeddings": [
22
+ 275780066,
23
+ 431362530,
24
+ 19447808,
25
+ 8,
26
+ 64
27
+ ]
28
+ }
tokenizer.txt ADDED
The diff for this file is too large to render. See raw diff