courage17340 commited on
Commit
e0ee936
·
1 Parent(s): 344a634

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. LICENSE +27 -0
  3. README.md +618 -0
  4. THIRD_PARTY_NOTICES.md +43 -0
  5. chat_template.jinja +112 -0
  6. config.json +192 -0
  7. configuration_deepseek.py +214 -0
  8. configuration_kimi_k25.py +123 -0
  9. docs/deploy_guidance.md +94 -0
  10. figures/demo_video.mp4 +3 -0
  11. figures/kimi-logo.png +0 -0
  12. generation_config.json +4 -0
  13. kimi_k25_processor.py +165 -0
  14. kimi_k25_vision_processing.py +251 -0
  15. media_utils.py +368 -0
  16. model-00001-of-000064.safetensors +3 -0
  17. model-00002-of-000064.safetensors +3 -0
  18. model-00003-of-000064.safetensors +3 -0
  19. model-00004-of-000064.safetensors +3 -0
  20. model-00005-of-000064.safetensors +3 -0
  21. model-00006-of-000064.safetensors +3 -0
  22. model-00007-of-000064.safetensors +3 -0
  23. model-00008-of-000064.safetensors +3 -0
  24. model-00009-of-000064.safetensors +3 -0
  25. model-00010-of-000064.safetensors +3 -0
  26. model-00011-of-000064.safetensors +3 -0
  27. model-00012-of-000064.safetensors +3 -0
  28. model-00013-of-000064.safetensors +3 -0
  29. model-00014-of-000064.safetensors +3 -0
  30. model-00015-of-000064.safetensors +3 -0
  31. model-00016-of-000064.safetensors +3 -0
  32. model-00017-of-000064.safetensors +3 -0
  33. model-00018-of-000064.safetensors +3 -0
  34. model-00019-of-000064.safetensors +3 -0
  35. model-00020-of-000064.safetensors +3 -0
  36. model-00021-of-000064.safetensors +3 -0
  37. model-00022-of-000064.safetensors +3 -0
  38. model-00023-of-000064.safetensors +3 -0
  39. model-00024-of-000064.safetensors +3 -0
  40. model-00025-of-000064.safetensors +3 -0
  41. model-00026-of-000064.safetensors +3 -0
  42. model-00027-of-000064.safetensors +3 -0
  43. model-00028-of-000064.safetensors +3 -0
  44. model-00029-of-000064.safetensors +3 -0
  45. model-00030-of-000064.safetensors +3 -0
  46. model-00031-of-000064.safetensors +3 -0
  47. model-00032-of-000064.safetensors +3 -0
  48. model-00033-of-000064.safetensors +3 -0
  49. model-00034-of-000064.safetensors +3 -0
  50. model-00035-of-000064.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
37
+ figures/demo_video.mp4 filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Modified MIT License
2
+
3
+ Copyright (c) 2026 Moonshot AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the “Software”), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ Our only modification part is that, if the Software (or any derivative works
24
+ thereof) is used for any of your commercial products or services that have
25
+ more than 100 million monthly active users, or more than 20 million US dollars
26
+ (or equivalent in other currencies) in monthly revenue, you shall prominently
27
+ display "Kimi K2.6" on the user interface of such product or service.
README.md ADDED
@@ -0,0 +1,618 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - compressed-tensors
4
+ license: other
5
+ license_name: modified-mit
6
+ library_name: transformers
7
+ pipeline_tag: image-text-to-text
8
+ ---
9
+ <div align="center">
10
+ <picture>
11
+ <img src="figures/kimi-logo.png" width="30%" alt="Kimi K2.6">
12
+ </picture>
13
+ </div>
14
+ <hr>
15
+ <div align="center" style="line-height:1">
16
+ <a href="https://www.kimi.com" target="_blank"><img alt="Chat" src="https://img.shields.io/badge/🤖%20Chat-Kimi%20K2.6-ff6b6b?color=1783ff&logoColor=white"/></a>
17
+ <a href="https://www.moonshot.ai" target="_blank"><img alt="Homepage" src="https://img.shields.io/badge/Homepage-Moonshot%20AI-white?logo=Kimi&logoColor=white"/></a>
18
+ </div>
19
+
20
+ <div align="center" style="line-height: 1;">
21
+ <a href="https://huggingface.co/moonshotai" target="_blank"><img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Moonshot%20AI-ffc107?color=ffc107&logoColor=white"/></a>
22
+ <a href="https://twitter.com/kimi_moonshot" target="_blank"><img alt="Twitter Follow" src="https://img.shields.io/badge/Twitter-Kimi.ai-white?logo=x&logoColor=white"/></a>
23
+ <a href="https://discord.gg/TYU2fdJykW" target="_blank"><img alt="Discord" src="https://img.shields.io/badge/Discord-Kimi.ai-white?logo=discord&logoColor=white"/></a>
24
+ </div>
25
+ <div align="center" style="line-height: 1;">
26
+ <a href="https://huggingface.co/moonshotai/Kimi-K2.6/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/License-Modified_MIT-f5de53?&color=f5de53"/></a>
27
+ </div>
28
+
29
+
30
+ ## 1. Model Introduction
31
+
32
+ Kimi K2.6 is an open-source, native multimodal agentic model that advances practical capabilities in long-horizon coding, coding-driven design, proactive autonomous execution, and swarm-based task orchestration.
33
+
34
+ ### Key Features
35
+ - **Long-Horizon Coding**: K2.6 achieves significant improvements on complex, end-to-end coding tasks, generalizing robustly across programming languages (Rust, Go, Python) and domains spanning front-end, DevOps, and performance optimization.
36
+ - **Coding-Driven Design**: K2.6 is capable of transforming simple prompts and visual inputs into production-ready interfaces and lightweight full-stack workflows, generating structured layouts, interactive elements, and rich animations with deliberate aesthetic precision.
37
+ - **Elevated Agent Swarm**: Scaling horizontally to 300 sub-agents executing 4,000 coordinated steps, K2.6 can dynamically decompose tasks into parallel, domain-specialized subtasks, delivering end-to-end outputs from documents to websites to spreadsheets in a single autonomous run.
38
+ - **Proactive & Open Orchestration**: For autonomous tasks, K2.6 demonstrates strong performance in powering persistent, 24/7 background agents that proactively manage schedules, execute code, and orchestrate cross-platform operations without human oversight.
39
+
40
+ ## 2. Model Summary
41
+
42
+ <div align="center">
43
+
44
+
45
+ | | |
46
+ |:---:|:---:|
47
+ | **Architecture** | Mixture-of-Experts (MoE) |
48
+ | **Total Parameters** | 1T |
49
+ | **Activated Parameters** | 32B |
50
+ | **Number of Layers** (Dense layer included) | 61 |
51
+ | **Number of Dense Layers** | 1 |
52
+ | **Attention Hidden Dimension** | 7168 |
53
+ | **MoE Hidden Dimension** (per Expert) | 2048 |
54
+ | **Number of Attention Heads** | 64 |
55
+ | **Number of Experts** | 384 |
56
+ | **Selected Experts per Token** | 8 |
57
+ | **Number of Shared Experts** | 1 |
58
+ | **Vocabulary Size** | 160K |
59
+ | **Context Length** | 256K |
60
+ | **Attention Mechanism** | MLA |
61
+ | **Activation Function** | SwiGLU |
62
+ | **Vision Encoder** | MoonViT |
63
+ | **Parameters of Vision Encoder** | 400M |
64
+ </div>
65
+
66
+ ## 3. Evaluation Results
67
+
68
+ <div align="center">
69
+ <table>
70
+ <thead>
71
+ <tr>
72
+ <th align="center">Benchmark</th>
73
+ <th align="center"><sup>Kimi K2.6</sup></th>
74
+ <th align="center"><sup>GPT-5.4 <br><sup>(xhigh)</sup></sup></th>
75
+ <th align="center"><sup>Claude Opus 4.6 <br><sup>(max effort)</sup></sup></th>
76
+ <th align="center"><sup>Gemini 3.1 Pro<br><sup>(thinking high)</sup></sup></th>
77
+ <th align="center"><sup>Kimi K2.5</sup></th>
78
+ </tr>
79
+ </thead>
80
+ <tbody>
81
+ <tr>
82
+ <td align="center" colspan=6><strong>Agentic</strong></td>
83
+ </tr>
84
+ <tr>
85
+ <td align="center" style="vertical-align: middle">HLE-Full<br>(w/ tools)</td>
86
+ <td align="center" style="vertical-align: middle">54.0</td>
87
+ <td align="center" style="vertical-align: middle">52.1</td>
88
+ <td align="center" style="vertical-align: middle">53.0</td>
89
+ <td align="center" style="vertical-align: middle">51.4</td>
90
+ <td align="center" style="vertical-align: middle">50.2</td>
91
+ </tr>
92
+ <tr>
93
+ <td align="center" style="vertical-align: middle">BrowseComp</td>
94
+ <td align="center" style="vertical-align: middle">83.2</td>
95
+ <td align="center" style="vertical-align: middle" rowspan="2">82.7</td>
96
+ <td align="center" style="vertical-align: middle" rowspan="2">83.7</td>
97
+ <td align="center" style="vertical-align: middle" rowspan="2">85.9</td>
98
+ <td align="center" style="vertical-align: middle">74.9</td>
99
+ </tr>
100
+ <tr>
101
+ <td align="center" style="vertical-align: middle">BrowseComp<br>(Agent Swarm)</td>
102
+ <td align="center" style="vertical-align: middle">86.3</td>
103
+ <td align="center" style="vertical-align: middle">78.4</td>
104
+ </tr>
105
+ <tr>
106
+ <td align="center" style="vertical-align: middle">DeepSearchQA<br>(f1-score)</td>
107
+ <td align="center" style="vertical-align: middle">92.5</td>
108
+ <td align="center" style="vertical-align: middle">78.6</td>
109
+ <td align="center" style="vertical-align: middle">91.3</td>
110
+ <td align="center" style="vertical-align: middle">81.9</td>
111
+ <td align="center" style="vertical-align: middle">89.0</td>
112
+ </tr>
113
+ <tr>
114
+ <td align="center" style="vertical-align: middle">DeepSearchQA<br>(accuracy)</td>
115
+ <td align="center" style="vertical-align: middle">83.0</td>
116
+ <td align="center" style="vertical-align: middle">63.7</td>
117
+ <td align="center" style="vertical-align: middle">80.6</td>
118
+ <td align="center" style="vertical-align: middle">60.2</td>
119
+ <td align="center" style="vertical-align: middle">77.1</td>
120
+ </tr>
121
+ <tr>
122
+ <td align="center" style="vertical-align: middle">WideSearch<br> (item-f1)</td>
123
+ <td align="center" style="vertical-align: middle">80.8</td>
124
+ <td align="center" style="vertical-align: middle">-</td>
125
+ <td align="center" style="vertical-align: middle">-</td>
126
+ <td align="center" style="vertical-align: middle">-</td>
127
+ <td align="center" style="vertical-align: middle">72.7</td>
128
+ </tr>
129
+ <tr>
130
+ <td align="center" style="vertical-align: middle">Toolathlon</td>
131
+ <td align="center" style="vertical-align: middle">50.0</td>
132
+ <td align="center" style="vertical-align: middle">54.6</td>
133
+ <td align="center" style="vertical-align: middle">47.2</td>
134
+ <td align="center" style="vertical-align: middle">48.8</td>
135
+ <td align="center" style="vertical-align: middle">27.8</td>
136
+ </tr>
137
+ <tr>
138
+ <td align="center" style="vertical-align: middle">MCPMark</td>
139
+ <td align="center" style="vertical-align: middle">55.9</td>
140
+ <td align="center" style="vertical-align: middle">62.5*</td>
141
+ <td align="center" style="vertical-align: middle">56.7*</td>
142
+ <td align="center" style="vertical-align: middle">55.9*</td>
143
+ <td align="center" style="vertical-align: middle">29.5</td>
144
+ </tr>
145
+ <tr>
146
+ <td align="center" style="vertical-align: middle">Claw Eval (pass^3)</td>
147
+ <td align="center" style="vertical-align: middle">62.3</td>
148
+ <td align="center" style="vertical-align: middle">60.3</td>
149
+ <td align="center" style="vertical-align: middle">70.4</td>
150
+ <td align="center" style="vertical-align: middle">57.8</td>
151
+ <td align="center" style="vertical-align: middle">52.3</td>
152
+ </tr>
153
+ <tr>
154
+ <td align="center" style="vertical-align: middle">Claw Eval (pass@3)</td>
155
+ <td align="center" style="vertical-align: middle">80.9</td>
156
+ <td align="center" style="vertical-align: middle">78.4</td>
157
+ <td align="center" style="vertical-align: middle">82.4</td>
158
+ <td align="center" style="vertical-align: middle">82.9</td>
159
+ <td align="center" style="vertical-align: middle">75.4</td>
160
+ </tr>
161
+ <tr>
162
+ <td align="center" style="vertical-align: middle">APEX-Agents</td>
163
+ <td align="center" style="vertical-align: middle">27.9</td>
164
+ <td align="center" style="vertical-align: middle">33.3</td>
165
+ <td align="center" style="vertical-align: middle">33.0</td>
166
+ <td align="center" style="vertical-align: middle">32.0</td>
167
+ <td align="center" style="vertical-align: middle">11.5</td>
168
+ </tr>
169
+ <tr>
170
+ <td align="center" style="vertical-align: middle">OSWorld-Verified</td>
171
+ <td align="center" style="vertical-align: middle">73.1</td>
172
+ <td align="center" style="vertical-align: middle">75.0</td>
173
+ <td align="center" style="vertical-align: middle">72.7</td>
174
+ <td align="center" style="vertical-align: middle">-</td>
175
+ <td align="center" style="vertical-align: middle">63.3</td>
176
+ </tr>
177
+ <tr>
178
+ <td align="center" colspan=6><strong>Coding</strong></td>
179
+ </tr>
180
+ <tr>
181
+ <td align="center" style="vertical-align: middle">Terminal-Bench 2.0<br>(Terminus-2)</td>
182
+ <td align="center" style="vertical-align: middle">66.7</td>
183
+ <td align="center" style="vertical-align: middle">65.4*</td>
184
+ <td align="center" style="vertical-align: middle">65.4</td>
185
+ <td align="center" style="vertical-align: middle">68.5</td>
186
+ <td align="center" style="vertical-align: middle">50.8</td>
187
+ </tr>
188
+ <tr>
189
+ <td align="center" style="vertical-align: middle">SWE-Bench Pro</td>
190
+ <td align="center" style="vertical-align: middle">58.6</td>
191
+ <td align="center" style="vertical-align: middle">57.7</td>
192
+ <td align="center" style="vertical-align: middle">53.4</td>
193
+ <td align="center" style="vertical-align: middle">54.2</td>
194
+ <td align="center" style="vertical-align: middle">50.7</td>
195
+ </tr>
196
+ <tr>
197
+ <td align="center" style="vertical-align: middle">SWE-Bench Multilingual</td>
198
+ <td align="center" style="vertical-align: middle">76.7</td>
199
+ <td align="center" style="vertical-align: middle">-</td>
200
+ <td align="center" style="vertical-align: middle">77.8</td>
201
+ <td align="center" style="vertical-align: middle">76.9*</td>
202
+ <td align="center" style="vertical-align: middle">73.0</td>
203
+ </tr>
204
+ <tr>
205
+ <td align="center" style="vertical-align: middle">SWE-Bench Verified</td>
206
+ <td align="center" style="vertical-align: middle">80.2</td>
207
+ <td align="center" style="vertical-align: middle">-</td>
208
+ <td align="center" style="vertical-align: middle">80.8</td>
209
+ <td align="center" style="vertical-align: middle">80.6</td>
210
+ <td align="center" style="vertical-align: middle">76.8</td>
211
+ </tr>
212
+ <tr>
213
+ <td align="center" style="vertical-align: middle">SciCode</td>
214
+ <td align="center" style="vertical-align: middle">52.2</td>
215
+ <td align="center" style="vertical-align: middle">56.6</td>
216
+ <td align="center" style="vertical-align: middle">51.9</td>
217
+ <td align="center" style="vertical-align: middle">58.9</td>
218
+ <td align="center" style="vertical-align: middle">48.7</td>
219
+ </tr>
220
+ <tr>
221
+ <td align="center" style="vertical-align: middle">OJBench (python)</td>
222
+ <td align="center" style="vertical-align: middle">60.6</td>
223
+ <td align="center" style="vertical-align: middle">-</td>
224
+ <td align="center" style="vertical-align: middle">60.3</td>
225
+ <td align="center" style="vertical-align: middle">70.7</td>
226
+ <td align="center" style="vertical-align: middle">54.7</td>
227
+ </tr>
228
+ <tr>
229
+ <td align="center" style="vertical-align: middle">LiveCodeBench (v6)</td>
230
+ <td align="center" style="vertical-align: middle">89.6</td>
231
+ <td align="center" style="vertical-align: middle">-</td>
232
+ <td align="center" style="vertical-align: middle">88.8</td>
233
+ <td align="center" style="vertical-align: middle">91.7</td>
234
+ <td align="center" style="vertical-align: middle">85.0</td>
235
+ </tr>
236
+ <tr>
237
+ <td align="center" colspan=6><strong>Reasoning &amp; Knowledge</strong></td>
238
+ </tr>
239
+ <tr>
240
+ <td align="center" style="vertical-align: middle">HLE-Full</td>
241
+ <td align="center" style="vertical-align: middle">34.7</td>
242
+ <td align="center" style="vertical-align: middle">39.8</td>
243
+ <td align="center" style="vertical-align: middle">40.0</td>
244
+ <td align="center" style="vertical-align: middle">44.4</td>
245
+ <td align="center" style="vertical-align: middle">30.1</td>
246
+ </tr>
247
+ <tr>
248
+ <td align="center" style="vertical-align: middle">AIME 2026</td>
249
+ <td align="center" style="vertical-align: middle">96.4</td>
250
+ <td align="center" style="vertical-align: middle">99.2</td>
251
+ <td align="center" style="vertical-align: middle">96.7</td>
252
+ <td align="center" style="vertical-align: middle">98.3</td>
253
+ <td align="center" style="vertical-align: middle">95.8</td>
254
+ </tr>
255
+ <tr>
256
+ <td align="center" style="vertical-align: middle">HMMT 2026 (Feb)</td>
257
+ <td align="center" style="vertical-align: middle">92.7</td>
258
+ <td align="center" style="vertical-align: middle">97.7</td>
259
+ <td align="center" style="vertical-align: middle">96.2</td>
260
+ <td align="center" style="vertical-align: middle">94.7</td>
261
+ <td align="center" style="vertical-align: middle">87.1</td>
262
+ </tr>
263
+ <tr>
264
+ <td align="center" style="vertical-align: middle">IMO-AnswerBench</td>
265
+ <td align="center" style="vertical-align: middle">86.0</td>
266
+ <td align="center" style="vertical-align: middle">91.4</td>
267
+ <td align="center" style="vertical-align: middle">75.3</td>
268
+ <td align="center" style="vertical-align: middle">91.0*</td>
269
+ <td align="center" style="vertical-align: middle">81.8</td>
270
+ </tr>
271
+ <tr>
272
+ <td align="center" style="vertical-align: middle">GPQA-Diamond</td>
273
+ <td align="center" style="vertical-align: middle">90.5</td>
274
+ <td align="center" style="vertical-align: middle">92.8</td>
275
+ <td align="center" style="vertical-align: middle">91.3</td>
276
+ <td align="center" style="vertical-align: middle">94.3</td>
277
+ <td align="center" style="vertical-align: middle">87.6</td>
278
+ </tr>
279
+ <tr>
280
+ <td align="center" colspan=6><strong>Vision</strong></td>
281
+ </tr>
282
+ <tr>
283
+ <td align="center" style="vertical-align: middle">MMMU-Pro</td>
284
+ <td align="center" style="vertical-align: middle">79.4</td>
285
+ <td align="center" style="vertical-align: middle">81.2</td>
286
+ <td align="center" style="vertical-align: middle">73.9</td>
287
+ <td align="center" style="vertical-align: middle">83.0*</td>
288
+ <td align="center" style="vertical-align: middle">78.5</td>
289
+ </tr>
290
+ <tr>
291
+ <td align="center" style="vertical-align: middle">MMMU-Pro (w/ python)</td>
292
+ <td align="center" style="vertical-align: middle">80.1</td>
293
+ <td align="center" style="vertical-align: middle">82.1</td>
294
+ <td align="center" style="vertical-align: middle">77.3</td>
295
+ <td align="center" style="vertical-align: middle">85.3*</td>
296
+ <td align="center" style="vertical-align: middle">77.7</td>
297
+ </tr>
298
+ <tr>
299
+ <td align="center" style="vertical-align: middle">CharXiv (RQ)</td>
300
+ <td align="center" style="vertical-align: middle">80.4</td>
301
+ <td align="center" style="vertical-align: middle">82.8*</td>
302
+ <td align="center" style="vertical-align: middle">69.1</td>
303
+ <td align="center" style="vertical-align: middle">80.2*</td>
304
+ <td align="center" style="vertical-align: middle">77.5</td>
305
+ </tr>
306
+ <tr>
307
+ <td align="center" style="vertical-align: middle">CharXiv (RQ) (w/ python)</td>
308
+ <td align="center" style="vertical-align: middle">86.7</td>
309
+ <td align="center" style="vertical-align: middle">90.0*</td>
310
+ <td align="center" style="vertical-align: middle">84.7</td>
311
+ <td align="center" style="vertical-align: middle">89.9*</td>
312
+ <td align="center" style="vertical-align: middle">78.7</td>
313
+ </tr>
314
+ <tr>
315
+ <td align="center" style="vertical-align: middle">MathVision</td>
316
+ <td align="center" style="vertical-align: middle">87.4</td>
317
+ <td align="center" style="vertical-align: middle">92.0*</td>
318
+ <td align="center" style="vertical-align: middle">71.2*</td>
319
+ <td align="center" style="vertical-align: middle">89.8*</td>
320
+ <td align="center" style="vertical-align: middle">84.2</td>
321
+ </tr>
322
+ <tr>
323
+ <td align="center" style="vertical-align: middle">MathVision (w/ python)</td>
324
+ <td align="center" style="vertical-align: middle">93.2</td>
325
+ <td align="center" style="vertical-align: middle">96.1*</td>
326
+ <td align="center" style="vertical-align: middle">84.6*</td>
327
+ <td align="center" style="vertical-align: middle">95.7*</td>
328
+ <td align="center" style="vertical-align: middle">85.0</td>
329
+ </tr>
330
+ <tr>
331
+ <td align="center" style="vertical-align: middle">BabyVision</td>
332
+ <td align="center" style="vertical-align: middle">39.8</td>
333
+ <td align="center" style="vertical-align: middle">49.7</td>
334
+ <td align="center" style="vertical-align: middle">14.8</td>
335
+ <td align="center" style="vertical-align: middle">51.6</td>
336
+ <td align="center" style="vertical-align: middle">36.5</td>
337
+ </tr>
338
+ <tr>
339
+ <td align="center" style="vertical-align: middle">BabyVision (w/ python)</td>
340
+ <td align="center" style="vertical-align: middle">68.5</td>
341
+ <td align="center" style="vertical-align: middle">80.2*</td>
342
+ <td align="center" style="vertical-align: middle">38.4*</td>
343
+ <td align="center" style="vertical-align: middle">68.3*</td>
344
+ <td align="center" style="vertical-align: middle">40.5</td>
345
+ </tr>
346
+ <tr>
347
+ <td align="center" style="vertical-align: middle">V* (w/ python)</td>
348
+ <td align="center" style="vertical-align: middle">96.9</td>
349
+ <td align="center" style="vertical-align: middle">98.4*</td>
350
+ <td align="center" style="vertical-align: middle">86.4*</td>
351
+ <td align="center" style="vertical-align: middle">96.9*</td>
352
+ <td align="center" style="vertical-align: middle">86.9</td>
353
+ </tr>
354
+ </tbody>
355
+ </table>
356
+ </div>
357
+
358
+ <details>
359
+ <summary><b>Footnotes</b></summary>
360
+
361
+ 1. **General Testing Details**
362
+ - We report results for Kimi K2.6 and Kimi K2.5 with thinking mode enabled, Claude Opus 4.6 with max effort, GPT-5.4 with xhigh reasoning effort, and Gemini 3.1 Pro with a high thinking level.
363
+ - Unless otherwise specified, all Kimi K2.6 experiments were conducted with temperature = 1.0, top-p = 1.0, and a context length of 262,144 tokens.
364
+ - Benchmarks without publicly available scores were re-evaluated under the same conditions used for Kimi K2.6 and are marked with an asterisk (`*`). Except where noted with an asterisk, all other results are cited from official reports.
365
+ 2. **Reasoning Benchmarks**
366
+ - IMO-AnswerBench scores for GPT-5.4 and Claude 4.6 were obtained from [z.ai/blog/glm-5.1](https://z.ai/blog/glm-5.1).
367
+ - Humanity's Last Exam (HLE) and other reasoning tasks were evaluated with a maximum generation length of 98,304 tokens. By default, we report results on the HLE full set. For the text-only subset, Kimi K2.6 achieves 36.4% accuracy without tools and 55.5% with tools.
368
+ 3. **Tool-Augmented / Agentic Tasks**
369
+ - Kimi K2.6 was equipped with search, code-interpreter, and web-browsing tools for HLE with tools, BrowseComp, DeepSearchQA, and WideSearch.
370
+ - For HLE-Full with tools, the maximum generation length is 262,144 tokens with a per-step limit of 49,152 tokens. We employ a simple context management strategy: once the context window exceeds the threshold, only the most recent round of tool-related messages is retained.
371
+ - For BrowseComp, we report scores obtained with context management using the same discard-all strategy as Kimi K2.5 and DeepSeek-V3.2.
372
+ - For DeepSearchQA, no context management was applied to Kimi K2.6 tests, and tasks exceeding the supported context length were directly counted as failed. Scores for Claude Opus 4.6, GPT-5.4, and Gemini 3.1 Pro on DeepSearchQA are cited from the [Claude Opus 4.7 System Card](https://cdn.sanity.io/files/4zrzovbb/website/037f06850df7fbe871e206dad004c3db5fd50340.pdf).
373
+ - For WideSearch, we report results under the "hide tool result" context management setting. Once the context window exceeds the threshold, only the most recent round of tool-related messages is retained.
374
+ - The test system prompts are identical to those used in the [Kimi K2.5 technical report](https://arxiv.org/pdf/2602.02276).
375
+ - Claw Eval was conducted using version 1.1 with max-tokens-per-step = 16384.
376
+ - For APEX-Agents, we evaluate 452 tasks from the public 480-task release, as done by [Artificial Analysis](https://artificialanalysis.ai/evaluations/apex-agents-aa) (excluding Investment Banking Worlds 244 and 246, which have external runtime dependencies).
377
+ 4. **Coding Tasks**
378
+ - Terminal-Bench 2.0 scores were obtained with the default agent framework (Terminus-2) and the provided JSON parser, operating in preserve thinking mode.
379
+ - For the SWE-Bench series of evaluations (including Verified, Multilingual, and Pro), we used an in-house evaluation framework adapted from SWE-agent. This framework includes a minimal set of tools—bash tool, createfile tool, insert tool, view tool, strreplace tool, and submit tool.
380
+ - All reported scores for coding tasks are averaged over 10 independent runs.
381
+ 5. **Vision Benchmarks**
382
+ - Max-tokens = 98,304, averaged over three runs (avg@3).
383
+ - Settings with Python tool use max-tokens-per-step = 64k and max-steps = 50 for multi-step reasoning.
384
+ - MMMU-Pro follows the official protocol, preserving input order and prepending images.
385
+
386
+ </details>
387
+
388
+
389
+ ## 4. Native INT4 Quantization
390
+ Kimi-K2.6 adopts the same native int4 quantization method as [Kimi-K2-Thinking](https://huggingface.co/moonshotai/Kimi-K2-Thinking#4-native-int4-quantization).
391
+
392
+ ## 5. Deployment
393
+
394
+ > [!Note]
395
+ > You can access Kimi-K2.6's API on https://platform.moonshot.ai and we provide OpenAI/Anthropic-compatible API for you. To verify the deployment is correct, we also provide the [Kimi Vendor Verifier](https://kimi.com/blog/kimi-vendor-verifier.html).
396
+ Currently, Kimi-K2.6 is recommended to run on the following inference engines:
397
+ * vLLM
398
+ * SGLang
399
+ * KTransformers
400
+
401
+ Kimi-K2.6 has the same architecture as Kimi-K2.5, and the deployment method can be directly reused.
402
+
403
+ The minimum version requirement for `transformers` is `4.57.1`.
404
+
405
+ Deployment examples can be found in the [Model Deployment Guide](docs/deploy_guidance.md).
406
+
407
+
408
+ ---
409
+ ## 6. Model Usage
410
+
411
+ The usage demos below demonstrate how to call our official API.
412
+
413
+ For third-party APIs deployed with vLLM or SGLang, please note that:
414
+ > [!Note]
415
+ > - Chat with video content is an experimental feature and is only supported in our official API for now.
416
+ >
417
+ > - The recommended `temperature` will be `1.0` for Thinking mode and `0.6` for Instant mode.
418
+ >
419
+ > - The recommended `top_p` is `0.95`.
420
+ >
421
+ > - To use instant mode, you need to pass `{'chat_template_kwargs': {"thinking": False}}` in `extra_body`.
422
+
423
+ ### Chat Completion
424
+
425
+ This is a simple chat completion script which shows how to call K2.6 API in Thinking and Instant modes.
426
+
427
+ ```python
428
+ import openai
429
+ import base64
430
+ import requests
431
+ def simple_chat(client: openai.OpenAI, model_name: str):
432
+ messages = [
433
+ {'role': 'system', 'content': 'You are Kimi, an AI assistant created by Moonshot AI.'},
434
+ {
435
+ 'role': 'user',
436
+ 'content': [
437
+ {'type': 'text', 'text': 'which one is bigger, 9.11 or 9.9? think carefully.'}
438
+ ],
439
+ },
440
+ ]
441
+ response = client.chat.completions.create(
442
+ model=model_name, messages=messages, stream=False, max_tokens=4096
443
+ )
444
+ print('====== Below is reasoning content in Thinking Mode ======')
445
+ print(f'reasoning content: {response.choices[0].message.reasoning}')
446
+ print('====== Below is response in Thinking Mode ======')
447
+ print(f'response: {response.choices[0].message.content}')
448
+
449
+ # To use instant mode, pass {"thinking": {"type": "disabled"}}
450
+ response = client.chat.completions.create(
451
+ model=model_name,
452
+ messages=messages,
453
+ stream=False,
454
+ max_tokens=4096,
455
+ extra_body={'thinking': {'type': 'disabled'}}, # this is for official API
456
+ # extra_body= {'chat_template_kwargs': {"thinking": False}} # this is for vLLM/SGLang
457
+ )
458
+ print('====== Below is response in Instant Mode ======')
459
+ print(f'response: {response.choices[0].message.content}')
460
+ ```
461
+
462
+
463
+ ### Chat Completion with visual content
464
+
465
+ K2.6 supports Image and Video input.
466
+
467
+ The following example demonstrates how to call K2.6 API with image input:
468
+
469
+ ```python
470
+ import openai
471
+ import base64
472
+ import requests
473
+
474
+ def chat_with_image(client: openai.OpenAI, model_name: str):
475
+ url = 'https://huggingface.co/moonshotai/Kimi-K2.6/resolve/main/figures/kimi-logo.png'
476
+ image_base64 = base64.b64encode(requests.get(url).content).decode()
477
+ messages = [
478
+ {
479
+ 'role': 'user',
480
+ 'content': [
481
+ {'type': 'text', 'text': 'Describe this image in detail.'},
482
+ {
483
+ 'type': 'image_url',
484
+ 'image_url': {'url': f'data:image/png;base64,{image_base64}'},
485
+ },
486
+ ],
487
+ }
488
+ ]
489
+
490
+ response = client.chat.completions.create(
491
+ model=model_name, messages=messages, stream=False, max_tokens=8192
492
+ )
493
+ print('====== Below is reasoning content in Thinking Mode ======')
494
+ print(f'reasoning content: {response.choices[0].message.reasoning}')
495
+ print('====== Below is response in Thinking Mode ======')
496
+ print(f'response: {response.choices[0].message.content}')
497
+
498
+ # Also supports instant mode if you pass {"thinking": {"type": "disabled"}}
499
+ response = client.chat.completions.create(
500
+ model=model_name,
501
+ messages=messages,
502
+ stream=False,
503
+ max_tokens=4096,
504
+ extra_body={'thinking': {'type': 'disabled'}}, # this is for official API
505
+ # extra_body= {'chat_template_kwargs': {"thinking": False}} # this is for vLLM/SGLang
506
+ )
507
+ print('====== Below is response in Instant Mode ======')
508
+ print(f'response: {response.choices[0].message.content}')
509
+
510
+ return response.choices[0].message.content
511
+ ```
512
+
513
+ The following example demonstrates how to call K2.6 API with video input:
514
+
515
+ ```python
516
+ import openai
517
+ import base64
518
+ import requests
519
+
520
+ def chat_with_video(client: openai.OpenAI, model_name:str):
521
+ url = 'https://huggingface.co/moonshotai/Kimi-K2.6/resolve/main/figures/demo_video.mp4'
522
+ video_base64 = base64.b64encode(requests.get(url).content).decode()
523
+ messages = [
524
+ {
525
+ "role": "user",
526
+ "content": [
527
+ {"type": "text","text": "Describe the video in detail."},
528
+ {
529
+ "type": "video_url",
530
+ "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
531
+ },
532
+ ],
533
+ }
534
+ ]
535
+
536
+ response = client.chat.completions.create(model=model_name, messages=messages)
537
+ print('====== Below is reasoning content in Thinking Mode ======')
538
+ print(f'reasoning content: {response.choices[0].message.reasoning}')
539
+ print('====== Below is response in Thinking Mode ======')
540
+ print(f'response: {response.choices[0].message.content}')
541
+
542
+ # Also supports instant mode if you pass {"thinking": {"type": "disabled"}}
543
+ response = client.chat.completions.create(
544
+ model=model_name,
545
+ messages=messages,
546
+ stream=False,
547
+ max_tokens=4096,
548
+ extra_body={'thinking': {'type': 'disabled'}}, # this is for official API
549
+ # extra_body= {'chat_template_kwargs': {"thinking": False}} # this is for vLLM/SGLang
550
+ )
551
+ print('====== Below is response in Instant Mode ======')
552
+ print(f'response: {response.choices[0].message.content}')
553
+ return response.choices[0].message.content
554
+ ```
555
+ ### Preserve Thinking
556
+ Kimi K2.6 supports `preserve_thinking` mode, which enables it to retain full reasoning content across multi-turn interactions and enhances performance in coding agent scenarios.
557
+
558
+ This feature is disabled by default. The following example demonstrates how to call K2.6 API in `preserve_thinking` mode:
559
+
560
+ ```python
561
+ def chat_with_preserve_thinking(client: openai.OpenAI, model_name: str):
562
+ messages = [
563
+ {
564
+ "role": "user",
565
+ "content": "Tell me three random numbers."
566
+ },
567
+ {
568
+ "role": "assistant",
569
+ "reasoning_content": "I'll start by listing five numbers: 473, 921, 235, 215, 222, and I'll tell you the first three.",
570
+ "content": "473, 921, 235"
571
+ },
572
+ {
573
+ "role": "user",
574
+ "content": "What are the other two numbers you have in mind?"
575
+ }
576
+ ]
577
+
578
+ response = client.chat.completions.create(
579
+ model=model_name,
580
+ messages=messages,
581
+ stream=False,
582
+ max_tokens=4096,
583
+ extra_body={'thinking': {'type': 'enabled', 'keep': 'all'}}, # this is for official API
584
+ # extra_body={"chat_template_kwargs": {"thinking":True, "preserve_thinking": True}}, # this is for vLLM/SGLang
585
+ # We recommend enabling preserve_thinking only in think mode.
586
+ )
587
+ # the assistant should mention 215 and 222 that appear in the prior reasoning content
588
+ print(f"response: {response.choices[0].message.reasoning}")
589
+ return response.choices[0].message.content
590
+
591
+ ```
592
+
593
+ ### Interleaved Thinking and Multi-Step Tool Call
594
+
595
+ K2.6 shares the same design of Interleaved Thinking and Multi-Step Tool Call as K2 Thinking. For usage example, please refer to the [K2 Thinking documentation](https://platform.moonshot.ai/docs/guide/use-kimi-k2-thinking-model#complete-example).
596
+
597
+ ### Coding Agent Framework
598
+
599
+ Kimi K2.6 works best with Kimi Code CLI as its agent framework — give it a try at https://www.kimi.com/code.
600
+
601
+
602
+ ---
603
+
604
+ ## 7. License
605
+
606
+ Both the code repository and the model weights are released under the [Modified MIT License](LICENSE).
607
+
608
+ ---
609
+
610
+ ## 8. Third Party Notices
611
+
612
+ See [THIRD PARTY NOTICES](THIRD_PARTY_NOTICES.md)
613
+
614
+ ---
615
+
616
+ ## 9. Contact Us
617
+
618
+ If you have any questions, please reach out at [support@moonshot.ai](mailto:support@moonshot.ai).
THIRD_PARTY_NOTICES.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # THIRD_PARTY_NOTICES
2
+
3
+ This file lists third-party software contained in Kimi-K2.6 along with their licenses, in compliance with the redistribution clauses of those licenses.
4
+
5
+ ---
6
+
7
+ ## 1. DeepSeek-V3
8
+
9
+ Our model architecture is DeepSeek-V3-like. Some of the modeling code is copied from the source repository.
10
+
11
+ - **Source Repository**
12
+ https://huggingface.co/deepseek-ai/DeepSeek-V3
13
+
14
+ - **Files / Directories Used**
15
+ - configuration_deepseek.py
16
+ - modeling_deepseek.py
17
+
18
+ - **License Type**
19
+ MIT License
20
+
21
+ - **Copyright Notice**
22
+ Copyright (c) 2023 DeepSeek
23
+
24
+ - **Full License Text**
25
+ ```
26
+ MIT License
27
+ Copyright (c) 2023 DeepSeek
28
+ Permission is hereby granted, free of charge, to any person obtaining a copy
29
+ of this software and associated documentation files (the "Software"), to deal
30
+ in the Software without restriction, including without limitation the rights
31
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
32
+ copies of the Software, and to permit persons to whom the Software is
33
+ furnished to do so, subject to the following conditions:
34
+ The above copyright notice and this permission notice shall be included in all
35
+ copies or substantial portions of the Software.
36
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
37
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
38
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
39
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
40
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
41
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42
+ SOFTWARE.
43
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro render_content(msg) -%}
2
+ {%- set c = msg.get('content') -%}
3
+ {%- if c is string -%}
4
+ {{ c }}
5
+ {%- elif c is not none -%}
6
+ {% for content in c -%}
7
+ {% if content['type'] == 'image' or content['type'] == 'image_url' -%}
8
+ <|media_begin|>image<|media_content|><|media_pad|><|media_end|>
9
+ {% elif content['type'] == 'video' or content['type']== 'video_url'-%}
10
+ <|kimi_k25_video_placeholder|>
11
+ {% else -%}
12
+ {{ content['text'] }}
13
+ {%- endif -%}
14
+ {%- endfor -%}
15
+ {%- endif -%}
16
+ {%- endmacro -%}
17
+
18
+ {% macro set_roles(message) -%}
19
+ {%- set role_name = message.get('name') or message['role'] -%}
20
+ {%- if message['role'] == 'user' -%}
21
+ <|im_user|>{{role_name}}<|im_middle|>
22
+ {%- elif message['role'] == 'assistant' -%}
23
+ <|im_assistant|>{{role_name}}<|im_middle|>
24
+ {%- else -%}
25
+ <|im_system|>{{role_name}}<|im_middle|>
26
+ {%- endif -%}
27
+ {%- endmacro -%}
28
+
29
+
30
+ {%- macro render_toolcalls(message) -%}
31
+ <|tool_calls_section_begin|>
32
+ {%- for tool_call in message['tool_calls'] -%}
33
+ {%- set formatted_id = tool_call['id'] -%}
34
+ <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
35
+ {%- endfor -%}
36
+ <|tool_calls_section_end|>
37
+ {%- endmacro -%}
38
+
39
+
40
+ {%- set preserve_thinking = preserve_thinking | default(false) -%}
41
+ {# Find last non-tool-call assistant message. If preserve_thinking, keep -1 so hist is empty and all msgs use suffix (retain reasoning). #}
42
+ {%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
43
+ {%- if not preserve_thinking -%}
44
+ {%- for idx in range(messages|length-1, -1, -1) -%}
45
+ {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
46
+ {%- set ns.last_non_tool_call_assistant_msg = idx -%}
47
+ {%- break -%}
48
+ {%- endif -%}
49
+ {%- endfor -%}
50
+ {%- endif -%}
51
+
52
+ {# split all messages into history & suffix, reasoning_content in suffix should be preserved.#}
53
+ {%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
54
+ {%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
55
+
56
+ {%- if tools -%}
57
+ {%- if tools_ts_str -%}
58
+ <|im_system|>tool_declare<|im_middle|>{{ tools_ts_str }}<|im_end|>
59
+ {%- else -%}
60
+ <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
61
+ {%- endif -%}
62
+ {%- endif -%}
63
+
64
+
65
+ {%- for message in hist_msgs -%}
66
+ {{set_roles(message)}}
67
+ {%- if message['role'] == 'assistant' -%}
68
+ <think></think>{{render_content(message)}}
69
+ {%- if message.get('tool_calls') -%}
70
+ {{render_toolcalls(message)}}
71
+ {%- endif -%}
72
+ {%- elif message['role'] == 'tool' -%}
73
+ {%- set tool_call_id = message.tool_call_id -%}
74
+ ## Return of {{ tool_call_id }}
75
+ {{render_content(message)}}
76
+ {%- elif message['content'] is not none -%}
77
+ {{render_content(message)}}
78
+ {%- endif -%}
79
+ <|im_end|>
80
+ {%- endfor -%}
81
+
82
+ {%- for message in suffix_msgs -%}
83
+ {{set_roles(message)}}
84
+ {%- if message['role'] == 'assistant' -%}
85
+ {%- if thinking is defined and thinking is false and preserve_thinking is false -%}
86
+ <think></think>{{render_content(message)}}
87
+ {%- else -%}
88
+ {%- set rc = message.get('reasoning', message.get('reasoning_content', '')) -%}
89
+ <think>{{rc}}</think>{{render_content(message)}}
90
+ {%- endif -%}
91
+ {%- if message.get('tool_calls') -%}
92
+ {{render_toolcalls(message)}}
93
+ {%- endif -%}
94
+ {%- elif message['role'] == 'tool' -%}
95
+ {%- set tool_call_id = message.tool_call_id -%}
96
+ ## Return of {{ tool_call_id }}
97
+ {{render_content(message)}}
98
+ {%- elif message['content'] is not none -%}
99
+ {{render_content(message)}}
100
+ {%- endif -%}
101
+ <|im_end|>
102
+ {%- endfor -%}
103
+
104
+
105
+ {%- if add_generation_prompt -%}
106
+ <|im_assistant|>assistant<|im_middle|>
107
+ {%- if thinking is defined and thinking is false -%}
108
+ <think></think>
109
+ {%- else -%}
110
+ <think>
111
+ {%- endif -%}
112
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "KimiK25ForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_kimi_k25.KimiK25Config",
7
+ "AutoModel": "modeling_kimi_k25.KimiK25ForConditionalGeneration",
8
+ "AutoModelForCausalLM": "modeling_kimi_k25.KimiK25ForConditionalGeneration"
9
+ },
10
+ "bos_token_id": 163584,
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 163586,
13
+ "ignore_index": -100,
14
+ "media_placeholder_token_id": 163605,
15
+ "model_type": "kimi_k25",
16
+ "pad_token_id": 163839,
17
+ "text_config": {
18
+ "_name_or_path": "",
19
+ "add_cross_attention": false,
20
+ "architectures": [
21
+ "DeepseekV3ForCausalLM"
22
+ ],
23
+ "attention_bias": false,
24
+ "attention_dropout": 0.0,
25
+ "auto_map": {
26
+ "AutoConfig": "configuration_deepseek.DeepseekV3Config",
27
+ "AutoModel": "modeling_deepseek.DeepseekV3Model",
28
+ "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
29
+ },
30
+ "aux_loss_alpha": 0.001,
31
+ "bad_words_ids": null,
32
+ "begin_suppress_tokens": null,
33
+ "bos_token_id": 163584,
34
+ "chunk_size_feed_forward": 0,
35
+ "cross_attention_hidden_size": null,
36
+ "decoder_start_token_id": null,
37
+ "diversity_penalty": 0.0,
38
+ "do_sample": false,
39
+ "dtype": "bfloat16",
40
+ "early_stopping": false,
41
+ "encoder_no_repeat_ngram_size": 0,
42
+ "eos_token_id": 163586,
43
+ "ep_size": 1,
44
+ "exponential_decay_length_penalty": null,
45
+ "finetuning_task": null,
46
+ "first_k_dense_replace": 1,
47
+ "forced_bos_token_id": null,
48
+ "forced_eos_token_id": null,
49
+ "hidden_act": "silu",
50
+ "hidden_size": 7168,
51
+ "id2label": {
52
+ "0": "LABEL_0",
53
+ "1": "LABEL_1"
54
+ },
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 18432,
57
+ "is_decoder": false,
58
+ "is_encoder_decoder": false,
59
+ "kv_lora_rank": 512,
60
+ "label2id": {
61
+ "LABEL_0": 0,
62
+ "LABEL_1": 1
63
+ },
64
+ "length_penalty": 1.0,
65
+ "max_length": 20,
66
+ "max_position_embeddings": 262144,
67
+ "min_length": 0,
68
+ "model_type": "kimi_k2",
69
+ "moe_intermediate_size": 2048,
70
+ "moe_layer_freq": 1,
71
+ "n_group": 1,
72
+ "n_routed_experts": 384,
73
+ "n_shared_experts": 1,
74
+ "no_repeat_ngram_size": 0,
75
+ "norm_topk_prob": true,
76
+ "num_attention_heads": 64,
77
+ "num_beam_groups": 1,
78
+ "num_beams": 1,
79
+ "num_experts_per_tok": 8,
80
+ "num_hidden_layers": 61,
81
+ "num_key_value_heads": 64,
82
+ "num_nextn_predict_layers": 0,
83
+ "num_return_sequences": 1,
84
+ "output_attentions": false,
85
+ "output_hidden_states": false,
86
+ "output_scores": false,
87
+ "pad_token_id": 163839,
88
+ "prefix": null,
89
+ "pretraining_tp": 1,
90
+ "problem_type": null,
91
+ "pruned_heads": {},
92
+ "q_lora_rank": 1536,
93
+ "qk_nope_head_dim": 128,
94
+ "qk_rope_head_dim": 64,
95
+ "quantization_config": {
96
+ "config_groups": {
97
+ "group_0": {
98
+ "input_activations": null,
99
+ "output_activations": null,
100
+ "targets": [
101
+ "Linear"
102
+ ],
103
+ "weights": {
104
+ "actorder": null,
105
+ "block_structure": null,
106
+ "dynamic": false,
107
+ "group_size": 32,
108
+ "num_bits": 4,
109
+ "observer": "minmax",
110
+ "observer_kwargs": {},
111
+ "strategy": "group",
112
+ "symmetric": true,
113
+ "type": "int"
114
+ }
115
+ }
116
+ },
117
+ "format": "pack-quantized",
118
+ "ignore": [
119
+ "lm_head",
120
+ "re:.*self_attn.*",
121
+ "re:.*shared_experts.*",
122
+ "re:.*mlp\\.(gate|up|gate_up|down)_proj.*"
123
+ ],
124
+ "kv_cache_scheme": null,
125
+ "quant_method": "compressed-tensors",
126
+ "quantization_status": "compressed"
127
+ },
128
+ "remove_invalid_values": false,
129
+ "repetition_penalty": 1.0,
130
+ "return_dict": true,
131
+ "return_dict_in_generate": false,
132
+ "rms_norm_eps": 1e-05,
133
+ "rope_scaling": {
134
+ "beta_fast": 32.0,
135
+ "beta_slow": 1.0,
136
+ "factor": 64.0,
137
+ "mscale": 1.0,
138
+ "mscale_all_dim": 1.0,
139
+ "original_max_position_embeddings": 4096,
140
+ "type": "yarn"
141
+ },
142
+ "rope_theta": 50000.0,
143
+ "routed_scaling_factor": 2.827,
144
+ "scoring_func": "sigmoid",
145
+ "sep_token_id": null,
146
+ "seq_aux": true,
147
+ "suppress_tokens": null,
148
+ "task_specific_params": null,
149
+ "temperature": 1.0,
150
+ "tf_legacy_loss": false,
151
+ "tie_encoder_decoder": false,
152
+ "tie_word_embeddings": false,
153
+ "tokenizer_class": null,
154
+ "top_k": 50,
155
+ "top_p": 1.0,
156
+ "topk_group": 1,
157
+ "topk_method": "noaux_tc",
158
+ "torchscript": false,
159
+ "transformers_version": "4.56.2",
160
+ "typical_p": 1.0,
161
+ "use_bfloat16": false,
162
+ "use_cache": true,
163
+ "v_head_dim": 128,
164
+ "vocab_size": 163840
165
+ },
166
+ "tie_word_embeddings": false,
167
+ "use_unified_vision_chunk": true,
168
+ "video_placeholder": "<|kimi_k25_video_placeholder|>",
169
+ "vision_config": {
170
+ "_attn_implementation": "flash_attention_2",
171
+ "init_pos_emb_height": 64,
172
+ "init_pos_emb_time": 4,
173
+ "init_pos_emb_width": 64,
174
+ "merge_kernel_size": [
175
+ 2,
176
+ 2
177
+ ],
178
+ "merge_type": "sd2_tpool",
179
+ "mm_hidden_size": 1152,
180
+ "mm_projector_type": "patchmerger",
181
+ "patch_size": 14,
182
+ "pos_emb_type": "divided_fixed",
183
+ "projector_hidden_act": "gelu",
184
+ "projector_ln_eps": 1e-05,
185
+ "text_hidden_size": 7168,
186
+ "video_attn_type": "spatial_temporal",
187
+ "vt_hidden_size": 1152,
188
+ "vt_intermediate_size": 4304,
189
+ "vt_num_attention_heads": 16,
190
+ "vt_num_hidden_layers": 27
191
+ }
192
+ }
configuration_deepseek.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copy from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.utils import logging
5
+
6
+ logger = logging.get_logger(__name__)
7
+
8
+ DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
9
+
10
+
11
+ class DeepseekV3Config(PretrainedConfig):
12
+ r"""
13
+ This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
14
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
15
+ defaults will yield a similar configuration to that of the DeepSeek-V3.
16
+
17
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
18
+ documentation from [`PretrainedConfig`] for more information.
19
+
20
+
21
+ Args:
22
+ vocab_size (`int`, *optional*, defaults to 129280):
23
+ Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
24
+ `inputs_ids` passed when calling [`DeepseekV3Model`]
25
+ hidden_size (`int`, *optional*, defaults to 4096):
26
+ Dimension of the hidden representations.
27
+ intermediate_size (`int`, *optional*, defaults to 11008):
28
+ Dimension of the MLP representations.
29
+ moe_intermediate_size (`int`, *optional*, defaults to 1407):
30
+ Dimension of the MoE representations.
31
+ num_hidden_layers (`int`, *optional*, defaults to 32):
32
+ Number of hidden layers in the Transformer decoder.
33
+ num_nextn_predict_layers (`int`, *optional*, defaults to 1):
34
+ Number of nextn predict layers in the DeepSeekV3 Model.
35
+ num_attention_heads (`int`, *optional*, defaults to 32):
36
+ Number of attention heads for each attention layer in the Transformer decoder.
37
+ n_shared_experts (`int`, *optional*, defaults to None):
38
+ Number of shared experts, None means dense model.
39
+ n_routed_experts (`int`, *optional*, defaults to None):
40
+ Number of routed experts, None means dense model.
41
+ routed_scaling_factor (`float`, *optional*, defaults to 1.0):
42
+ Scaling factor or routed experts.
43
+ topk_method (`str`, *optional*, defaults to `greedy`):
44
+ Topk method used in routed gate.
45
+ n_group (`int`, *optional*, defaults to None):
46
+ Number of groups for routed experts.
47
+ topk_group (`int`, *optional*, defaults to None):
48
+ Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
49
+ num_experts_per_tok (`int`, *optional*, defaults to None):
50
+ Number of selected experts, None means dense model.
51
+ moe_layer_freq (`int`, *optional*, defaults to 1):
52
+ The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
53
+ first_k_dense_replace (`int`, *optional*, defaults to 0):
54
+ Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
55
+ \--k dense layers--/
56
+ norm_topk_prob (`bool`, *optional*, defaults to False):
57
+ Whether to normalize the weights of the routed experts.
58
+ scoring_func (`str`, *optional*, defaults to 'softmax'):
59
+ Method of computing expert weights.
60
+ aux_loss_alpha (`float`, *optional*, defaults to 0.001):
61
+ Auxiliary loss weight coefficient.
62
+ seq_aux = (`bool`, *optional*, defaults to True):
63
+ Whether to compute the auxiliary loss for each individual sample.
64
+ num_key_value_heads (`int`, *optional*):
65
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
66
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
67
+ `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
68
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
69
+ by meanpooling all the original heads within that group. For more details checkout [this
70
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
71
+ `num_attention_heads`.
72
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
73
+ The non-linear activation function (function or string) in the decoder.
74
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
75
+ The maximum sequence length that this model might ever be used with.
76
+ initializer_range (`float`, *optional*, defaults to 0.02):
77
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
78
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
79
+ The epsilon used by the rms normalization layers.
80
+ use_cache (`bool`, *optional*, defaults to `True`):
81
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
82
+ relevant if `config.is_decoder=True`.
83
+ pad_token_id (`int`, *optional*):
84
+ Padding token id.
85
+ bos_token_id (`int`, *optional*, defaults to 1):
86
+ Beginning of stream token id.
87
+ eos_token_id (`int`, *optional*, defaults to 2):
88
+ End of stream token id.
89
+ pretraining_tp (`int`, *optional*, defaults to 1):
90
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
91
+ document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
92
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
93
+ issue](https://github.com/pytorch/pytorch/issues/76232).
94
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
95
+ Whether to tie weight embeddings
96
+ rope_theta (`float`, *optional*, defaults to 10000.0):
97
+ The base period of the RoPE embeddings.
98
+ rope_scaling (`Dict`, *optional*):
99
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
100
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
101
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
102
+ `max_position_embeddings` to the expected new maximum.
103
+ attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
104
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
105
+ attention_dropout (`float`, *optional*, defaults to 0.0):
106
+ The dropout ratio for the attention probabilities.
107
+
108
+ ```python
109
+ >>> from transformers import DeepseekV3Model, DeepseekV3Config
110
+
111
+ >>> # Initializing a Deepseek-V3 style configuration
112
+ >>> configuration = DeepseekV3Config()
113
+
114
+ >>> # Accessing the model configuration
115
+ >>> configuration = model.config
116
+ ```"""
117
+
118
+ model_type = "deepseek_v3"
119
+ keys_to_ignore_at_inference = ["past_key_values"]
120
+
121
+ def __init__(
122
+ self,
123
+ vocab_size=129280,
124
+ hidden_size=7168,
125
+ intermediate_size=18432,
126
+ moe_intermediate_size=2048,
127
+ num_hidden_layers=61,
128
+ num_nextn_predict_layers=1,
129
+ num_attention_heads=128,
130
+ num_key_value_heads=128,
131
+ n_shared_experts=1,
132
+ n_routed_experts=256,
133
+ ep_size=1,
134
+ routed_scaling_factor=2.5,
135
+ kv_lora_rank=512,
136
+ q_lora_rank=1536,
137
+ qk_rope_head_dim=64,
138
+ v_head_dim=128,
139
+ qk_nope_head_dim=128,
140
+ topk_method='noaux_tc',
141
+ n_group=8,
142
+ topk_group=4,
143
+ num_experts_per_tok=8,
144
+ moe_layer_freq=1,
145
+ first_k_dense_replace=3,
146
+ norm_topk_prob=True,
147
+ scoring_func='sigmoid',
148
+ aux_loss_alpha=0.001,
149
+ seq_aux=True,
150
+ hidden_act="silu",
151
+ max_position_embeddings=4096,
152
+ initializer_range=0.02,
153
+ rms_norm_eps=1e-6,
154
+ use_cache=True,
155
+ pad_token_id=None,
156
+ bos_token_id=0,
157
+ eos_token_id=1,
158
+ pretraining_tp=1,
159
+ tie_word_embeddings=False,
160
+ rope_theta=10000.0,
161
+ rope_scaling=None,
162
+ attention_bias=False,
163
+ attention_dropout=0.0,
164
+ **kwargs,
165
+ ):
166
+ self.vocab_size = vocab_size
167
+ self.max_position_embeddings = max_position_embeddings
168
+ self.hidden_size = hidden_size
169
+ self.intermediate_size = intermediate_size
170
+ self.moe_intermediate_size = moe_intermediate_size
171
+ self.num_hidden_layers = num_hidden_layers
172
+ self.num_nextn_predict_layers = num_nextn_predict_layers
173
+ self.num_attention_heads = num_attention_heads
174
+ self.n_shared_experts = n_shared_experts
175
+ self.n_routed_experts = n_routed_experts
176
+ self.ep_size = ep_size
177
+ self.routed_scaling_factor = routed_scaling_factor
178
+ self.kv_lora_rank = kv_lora_rank
179
+ self.q_lora_rank = q_lora_rank
180
+ self.qk_rope_head_dim = qk_rope_head_dim
181
+ self.v_head_dim = v_head_dim
182
+ self.qk_nope_head_dim = qk_nope_head_dim
183
+ self.topk_method = topk_method
184
+ self.n_group = n_group
185
+ self.topk_group = topk_group
186
+ self.num_experts_per_tok = num_experts_per_tok
187
+ self.moe_layer_freq = moe_layer_freq
188
+ self.first_k_dense_replace = first_k_dense_replace
189
+ self.norm_topk_prob = norm_topk_prob
190
+ self.scoring_func = scoring_func
191
+ self.aux_loss_alpha = aux_loss_alpha
192
+ self.seq_aux = seq_aux
193
+ # for backward compatibility
194
+ if num_key_value_heads is None:
195
+ num_key_value_heads = num_attention_heads
196
+
197
+ self.num_key_value_heads = num_key_value_heads
198
+ self.hidden_act = hidden_act
199
+ self.initializer_range = initializer_range
200
+ self.rms_norm_eps = rms_norm_eps
201
+ self.pretraining_tp = pretraining_tp
202
+ self.use_cache = use_cache
203
+ self.rope_theta = rope_theta
204
+ self.rope_scaling = rope_scaling
205
+ self.attention_bias = attention_bias
206
+ self.attention_dropout = attention_dropout
207
+
208
+ super().__init__(
209
+ pad_token_id=pad_token_id,
210
+ bos_token_id=bos_token_id,
211
+ eos_token_id=eos_token_id,
212
+ tie_word_embeddings=tie_word_embeddings,
213
+ **kwargs,
214
+ )
configuration_kimi_k25.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.configuration_utils import PretrainedConfig
2
+
3
+ try:
4
+ from configuration_deepseek import DeepseekV3Config
5
+ except ImportError:
6
+ from .configuration_deepseek import DeepseekV3Config
7
+
8
+
9
+ class KimiK25VisionConfig(PretrainedConfig):
10
+
11
+ def __init__(
12
+ self,
13
+ patch_size: int = 14,
14
+ init_pos_emb_height: int = 64,
15
+ init_pos_emb_width: int = 64,
16
+ init_pos_emb_time: int = 4,
17
+ pos_emb_type: str = 'divided_fixed',
18
+ vt_num_attention_heads: int = 16,
19
+ vt_num_hidden_layers: int = 27,
20
+ vt_hidden_size: int = 1152,
21
+ vt_intermediate_size: int = 4304,
22
+ merge_kernel_size: tuple = (2, 2),
23
+ video_attn_type: str = 'spatial_temporal',
24
+ merge_type: str = 'sd2_tpool',
25
+ _attn_implementation: str = 'flash_attention_2',
26
+ # MM Projector parameters
27
+ mm_projector_type: str = 'patchmerger',
28
+ mm_hidden_size: int | None = None,
29
+ projector_hidden_act: str = "gelu",
30
+ projector_ln_eps: float = 1e-5,
31
+ # Other parameters
32
+ ignore_index: int = -100,
33
+ media_placeholder_token_id: int = 163605,
34
+ pad_token_id: int = 0,
35
+ use_unified_vision_chunk: bool = True,
36
+ video_placeholder="<|kimi_k25_video_placeholder|>",
37
+ text_hidden_size=7168,
38
+ **vision_config_kwargs):
39
+
40
+ self.patch_size = patch_size
41
+ self.init_pos_emb_height = init_pos_emb_height
42
+ self.init_pos_emb_width = init_pos_emb_width
43
+ self.init_pos_emb_time = init_pos_emb_time
44
+ self.pos_emb_type = pos_emb_type
45
+ self.vt_num_attention_heads = vt_num_attention_heads
46
+ self.vt_num_hidden_layers = vt_num_hidden_layers
47
+ self.vt_hidden_size = vt_hidden_size
48
+ self.vt_intermediate_size = vt_intermediate_size
49
+ self.merge_kernel_size = merge_kernel_size
50
+ self.video_attn_type = video_attn_type
51
+ self.merge_type = merge_type
52
+ self._attn_implementation = _attn_implementation
53
+
54
+ # MM Projector config
55
+ self.mm_projector_type = mm_projector_type
56
+ self.mm_hidden_size = mm_hidden_size if mm_hidden_size is not None else vt_hidden_size
57
+ self.projector_hidden_act = projector_hidden_act
58
+ self.projector_ln_eps = projector_ln_eps
59
+ self.text_hidden_size = text_hidden_size
60
+
61
+
62
+ class KimiK25Config(PretrainedConfig):
63
+ """Kimi-K2.5 model configuration.
64
+
65
+ Args:
66
+ text_config (dict | DeepseekV3Config): Configuration for the text model.
67
+
68
+ Vision Tower Parameters (from MoonViT3dConfig):
69
+ patch_size (int): Patch size for vision tower.
70
+ init_pos_emb_height (int): Initial position embedding height.
71
+ init_pos_emb_width (int): Initial position embedding width.
72
+ init_pos_emb_time (int): Initial position embedding time dimension.
73
+ pos_emb_type (str): Type of position embedding.
74
+ vt_num_attention_heads (int): Number of attention heads in vision tower.
75
+ vt_num_hidden_layers (int): Number of hidden layers in vision tower.
76
+ vt_hidden_size (int): Hidden size of vision tower.
77
+ vt_intermediate_size (int): Intermediate size in vision tower FFN.
78
+ merge_kernel_size (tuple): Kernel size for patch merging.
79
+ video_attn_type (str): Type of video attention.
80
+ merge_type (str): Type of merge operation.
81
+ _attn_implementation (str): Attention implementation type.
82
+
83
+ MM Projector Parameters (from MultiModalProjectorConfig):
84
+ mm_projector_type (str): Type of multimodal projector.
85
+ mm_hidden_size (int): Hidden size from vision tower (should match vt_hidden_size).
86
+ projector_hidden_act (str): Activation function for projector.
87
+ projector_ln_eps (float): Layer norm epsilon for projector.
88
+
89
+ Other Parameters:
90
+ ignore_index (int): The ignore index for the loss function.
91
+ media_placeholder_token_id (int): The token ID to use for media placeholders.
92
+ pad_token_id (int): The token ID to use for padding.
93
+ """
94
+
95
+ model_type = "kimi_k25"
96
+
97
+ def __init__(
98
+ self,
99
+ text_config: dict | DeepseekV3Config = None,
100
+ vision_config: dict | KimiK25VisionConfig = None,
101
+ # Other parameters
102
+ ignore_index: int = -100,
103
+ media_placeholder_token_id: int = 163605,
104
+ pad_token_id: int = 0,
105
+ use_unified_vision_chunk: bool = True,
106
+ video_placeholder="<|kimi_k25_video_placeholder|>",
107
+ **kwargs,
108
+ ):
109
+ if isinstance(text_config, dict):
110
+ text_config = DeepseekV3Config(**text_config)
111
+ if isinstance(vision_config, dict):
112
+ vision_config = KimiK25VisionConfig(**vision_config)
113
+ self.text_config = text_config
114
+ self.vision_config = vision_config
115
+ # Other config
116
+ self.ignore_index = ignore_index
117
+ self.media_placeholder_token_id = media_placeholder_token_id
118
+ self.use_unified_vision_chunk = use_unified_vision_chunk
119
+ self.video_placeholder = video_placeholder
120
+ if getattr(self.text_config, "quantization_config", None) is not None:
121
+ self.quantization_config = self.text_config.quantization_config
122
+
123
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
docs/deploy_guidance.md ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Kimi-K2.6 Deployment Guide
2
+
3
+ > [!Note]
4
+ > This guide only provides some examples of deployment commands for Kimi-K2.6, which may not be the optimal configuration. Since inference engines are still being updated frequently, please continue to follow the guidance from their homepage if you want to achieve better inference performance.
5
+
6
+ > [!Note]
7
+ > Kimi-K2.6 has the same architecture as Kimi-K2.5, and the deployment method can be directly reused.
8
+ ## vLLM Deployment
9
+
10
+ You can refer to https://recipes.vllm.ai/moonshotai/Kimi-K2.5 for the newest deployment guide.
11
+
12
+ This model is available in nightly vLLM wheel:
13
+ ```
14
+ uv pip install -U vllm \
15
+ --torch-backend=auto \
16
+ --extra-index-url https://wheels.vllm.ai/nightly
17
+ ```
18
+
19
+ Nightly wheels may be unstable and are considered experimental. For stable production use, we recommend vLLM 0.19.1, which has been manually verified.
20
+
21
+ Here is an example of serving this model on a single H200 node with TP8 via vLLM:
22
+ ```bash
23
+ vllm serve $MODEL_PATH -tp 8 --mm-encoder-tp-mode data --trust-remote-code --tool-call-parser kimi_k2 --reasoning-parser kimi_k2
24
+ ```
25
+ **Key notes**
26
+ - `--tool-call-parser kimi_k2`: Required for enabling tool calling
27
+ - `--reasoning-parser kimi_k2`: Kimi-K2.6 enables thinking mode by default. Make sure to pass this for correct reasoning processing.
28
+
29
+ ## SGLang Deployment
30
+
31
+ You can refer to https://cookbook.sglang.io/autoregressive/Moonshotai/Kimi-K2.5 for the newest deployment guide.
32
+
33
+ This model is available in SGLang latest main:
34
+
35
+ ```
36
+ pip install "sglang @ git+https://github.com/sgl-project/sglang.git#subdirectory=python"
37
+ pip install nvidia-cudnn-cu12==9.16.0.29
38
+ ```
39
+
40
+ Similarly, here is an example of running it with TP8 on a single H200 node via SGLang:
41
+ ``` bash
42
+ sglang serve --model-path $MODEL_PATH --tp 8 --trust-remote-code --tool-call-parser kimi_k2 --reasoning-parser kimi_k2
43
+ ```
44
+ **Key parameter notes:**
45
+ - `--tool-call-parser kimi_k2`: Required when enabling tool usage.
46
+ - `--reasoning-parser kimi_k2`: Required for correctly processing reasoning content.
47
+
48
+ ## KTransformers Deployment
49
+ ### KTransformers+SGLang Inference Deployment
50
+ Launch with KTransformers + SGLang for CPU+GPU heterogeneous inference:
51
+
52
+ ```
53
+ python -m sglang.launch_server \
54
+ --host 0.0.0.0 \
55
+ --port 31245 \
56
+ --model /path/to/kimi-k2.6 \
57
+ --kt-weight-path /path/to/kimi-k2.6 \
58
+ --kt-cpuinfer 96 \
59
+ --kt-threadpool-count 2 \
60
+ --kt-num-gpu-experts 30 \
61
+ --kt-method RAWINT4 \
62
+ --kt-gpu-prefill-token-threshold 400 \
63
+ --trust-remote-code \
64
+ --mem-fraction-static 0.94 \
65
+ --served-model-name Kimi-K2.6 \
66
+ --enable-mixed-chunk \
67
+ --tensor-parallel-size 4 \
68
+ --enable-p2p-check \
69
+ --disable-shared-experts-fusion \
70
+ --chunked-prefill-size 32658 \
71
+ --max-total-tokens 50000 \
72
+ --attention-backend flashinfer
73
+ ```
74
+
75
+ Achieves 640.12 tokens/s Prefill and 24.51 tokens/s Decode (48-way concurrency) on 8× NVIDIA L20 + 2× Intel 6454S.
76
+
77
+ More details: https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/Kimi-K2.5.md .
78
+
79
+ ### KTransformers+LLaMA-Factory Fine-tuning Deployment
80
+
81
+ You can use below command to run LoRA SFT with KT+llamafactory.
82
+
83
+ ```
84
+ # For LoRA SFT
85
+ USE_KT=1 llamafactory-cli train examples/train_lora/kimik2_lora_sft_kt.yaml
86
+ # For Chat with model after LoRA SFT
87
+ llamafactory-cli chat examples/inference/kimik2_lora_sft_kt.yaml
88
+ # For API with model after LoRA SFT
89
+ llamafactory-cli api examples/inference/kimik2_lora_sft_kt.yaml
90
+ ```
91
+
92
+ This achieves end-to-end LoRA SFT Throughput: 44.55 token/s on 2× NVIDIA 4090 + Intel 8488C with 1.97T RAM and 200G swap memory.
93
+
94
+ More details refer to https://github.com/kvcache-ai/ktransformers/blob/main/doc/en/SFT_Installation_Guide_KimiK2.5.md .
figures/demo_video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09b4d925aa0a7c712feef50765355f0625d8f6d46ea302fd98db9609e9070047
3
+ size 270100
figures/kimi-logo.png ADDED
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_length": 262144,
3
+ "eos_token_id": 163586
4
+ }
kimi_k25_processor.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.feature_extraction_utils import BatchFeature
2
+ from transformers.processing_utils import ProcessorMixin
3
+ from transformers.utils import logging
4
+
5
+ logger = logging.get_logger(__name__)
6
+
7
+
8
+ class KimiK25Processor(ProcessorMixin):
9
+ r"""
10
+ Constructs a KimiK25 processor which wraps a KimiK25 image processor and a tokenizer into a single processor.
11
+
12
+ [`KimiK25Processor`] offers all the functionalities of [`KimiK25ImageProcessor`] and [`TikTokenTokenizer`]. See the
13
+ [`~KimiK25Processor.__call__`] and [`~KimiK25Processor.decode`] for more information.
14
+
15
+ Args:
16
+ image_processor ([`KimiK25ImageProcessor`], *optional*):
17
+ The image processor is a required input.
18
+ tokenizer ([`TikTokenTokenizer`], *optional*):
19
+ The tokenizer is a required input.
20
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
21
+ in a chat into a tokenizable string.
22
+ """
23
+
24
+ attributes = ["image_processor", "tokenizer"]
25
+ valid_kwargs = ["chat_template"]
26
+ image_processor_class = "AutoImageProcessor"
27
+ tokenizer_class = "AutoTokenizer"
28
+
29
+ def __init__(
30
+ self,
31
+ image_processor=None,
32
+ tokenizer=None,
33
+ chat_template=None,
34
+ **kwargs,
35
+ ):
36
+ super().__init__(image_processor,
37
+ tokenizer,
38
+ chat_template=chat_template)
39
+ self.media_processor = image_processor
40
+ # A special temporal placeholder to be replaced by actual video placeholders
41
+ self.video_placeholder = "<|kimi_k25_video_placeholder|>"
42
+
43
+ def update_raw_text(self, text: str, video_prompts: list[str]) -> str:
44
+ # replace video prompt in text with video chunk prompts
45
+ video_count = text.count(self.video_placeholder)
46
+ if video_count == 0:
47
+ return text
48
+ assert video_count == len(video_prompts)
49
+ text_parts = text.split(self.video_placeholder)
50
+ assert len(text_parts) == len(video_prompts) + 1
51
+ text = "".join([
52
+ text_parts[i] + video_prompts[i] for i in range(len(video_prompts))
53
+ ])
54
+ text += text_parts[-1]
55
+ return text
56
+
57
+ def preprocess_medias(self, medias: list[dict]) -> list[dict]:
58
+ updated_medias = []
59
+ video_prompts = []
60
+ for media in medias:
61
+ if media['type'] == 'image':
62
+ updated_medias.append(media)
63
+ elif media['type'] == 'video':
64
+ video_chunks = self.media_processor.split_video_chunks(
65
+ media['video'])
66
+ updated_medias.extend(video_chunks)
67
+ video_prompts.append("".join(
68
+ [vc['prompt'] for vc in video_chunks]))
69
+ else:
70
+ raise ValueError(f"unsupported media type: {media['type']}")
71
+ return updated_medias, video_prompts
72
+
73
+ def __call__(self,
74
+ messages: list[dict] = None,
75
+ medias: list[dict] = None,
76
+ text: str = None,
77
+ return_tensors: str = "pt",
78
+ **kwargs) -> BatchFeature:
79
+ """
80
+ Process multimodal inputs for Kimi-K2.5 model.
81
+
82
+ This processor accepts ordered messages and extracts both media and text in a single pass.
83
+ text will be automatically updated if video input detected in messages
84
+
85
+ Args:
86
+ messages: List of message dicts with 'role' and 'content' fields.
87
+ If provided, medias and text will be extracted automatically.
88
+ medias: Pre-extracted list of media dicts. If None, extracted from messages.
89
+ text: Pre-formatted text string. If None, generated via apply_chat_template.
90
+ return_tensors: Format of returned tensors ('pt', 'np', 'tf'). Default: 'pt'.
91
+ **kwargs: Additional arguments passed to tokenizer.apply_chat_template.
92
+
93
+ Returns:
94
+ BatchFeature with fields: input_ids, attention_mask, pixel_values, grid_thws.
95
+ """
96
+ if messages is None and (medias is None or text is None):
97
+ raise ValueError(
98
+ "Provide either 'messages' or both 'medias' and 'text'")
99
+
100
+ if medias is not None and text is not None:
101
+ updated_medias, video_prompts = self.preprocess_medias(medias)
102
+ preprocessed = self.media_processor.preprocess(
103
+ updated_medias, return_tensors=return_tensors)
104
+ text = self.update_raw_text(text, video_prompts)
105
+ text_inputs = self.tokenizer(text, return_tensors=return_tensors)
106
+ return BatchFeature(data={**text_inputs, **preprocessed.data})
107
+
108
+ if medias is None:
109
+ medias = self._extract_medias_from_messages(messages)
110
+ updated_medias, video_prompts = self.preprocess_medias(medias)
111
+ preprocessed = self.media_processor.preprocess(
112
+ updated_medias, return_tensors=return_tensors)
113
+
114
+ # Generate text if not provided
115
+ if text is None:
116
+ text = self.tokenizer.apply_chat_template(messages, **kwargs)
117
+
118
+ text = self.update_raw_text(text, video_prompts)
119
+
120
+ text_inputs = self.tokenizer(text, return_tensors=return_tensors)
121
+ return BatchFeature(data={**text_inputs, **preprocessed.data})
122
+
123
+ @staticmethod
124
+ def _extract_medias_from_messages(messages: list[dict]) -> list[dict]:
125
+ """
126
+ Extract media items from messages in a single pass.
127
+
128
+ This is an optimized version that processes messages only once.
129
+ Kept as internal method since external callers should use __call__.
130
+ """
131
+ medias = []
132
+ for msg in messages:
133
+ if msg['role'] != 'user' or not msg.get('content'):
134
+ continue
135
+
136
+ for content_part in msg['content']:
137
+ if not isinstance(content_part, dict):
138
+ continue
139
+
140
+ content_type = content_part.get('type')
141
+ if content_type in ['video_url', 'video']:
142
+ medias.append({
143
+ 'type': 'video',
144
+ 'video': content_part['video_url']['url'],
145
+ 'first_frame_timestamp': 0.0
146
+ })
147
+ elif content_type in ['image_url', 'image']:
148
+ medias.append({
149
+ 'type': 'image',
150
+ 'image': content_part['image_url'],
151
+ })
152
+ return medias
153
+
154
+ def apply_chat_template(self, messages, **kwargs):
155
+ return self.tokenizer.apply_chat_template(messages, **kwargs)
156
+
157
+ def batch_decode(self, *args, **kwargs):
158
+ return self.tokenizer.batch_decode(*args, **kwargs)
159
+
160
+ def decode(self, *args, **kwargs):
161
+ return self.tokenizer.decode(*args, **kwargs)
162
+
163
+ @property
164
+ def model_input_names(self):
165
+ return ['input_ids', 'attention_mask', 'pixel_values', 'grid_thws']
kimi_k25_vision_processing.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image processor class for Kimi-K2.5.
2
+ """
3
+
4
+ import json
5
+ from typing import Any, Dict, Optional, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ from PIL import Image
10
+ from transformers.image_processing_utils import (BaseImageProcessor,
11
+ BatchFeature)
12
+ from transformers.utils import TensorType
13
+
14
+ from .media_utils import (MediaInput, VideoChunkInput, _to_tensor,
15
+ ensure_media_type, get_video_meta, image_to_np,
16
+ navit_patchify, navit_resize_image,
17
+ navit_resize_video, normalize,
18
+ real_sample_fps_and_max_num_frames, timestamp_as_str)
19
+
20
+ try:
21
+ from mecord import VideoReader
22
+ except ImportError:
23
+ VideoReader = None
24
+
25
+
26
def resampling(video_bytes: bytes,
               sample_indices: list[int],
               key_indices=None,
               frame_time_info=None,
               num_threads=4) -> list[Image.Image]:
    """Decode the frames at `sample_indices` from an in-memory video.

    Args:
        video_bytes: Raw encoded video bytes.
        sample_indices: Frame indices to extract.
        key_indices: Optional key-frame indices to accelerate seeking.
        frame_time_info: Optional precomputed frame timing metadata.
        num_threads: Decoder thread count.

    Returns:
        The requested frames converted to PIL images.
        (Original annotation said ``-> str``; it actually returns a list.)
    """
    video = VideoReader(video_bytes,
                        num_threads=num_threads,
                        frame_time_info=frame_time_info,
                        key_indices=key_indices)
    # extract target frames
    frames = video[sample_indices]
    frames = [Image.fromarray(frame) for frame in frames]
    return frames
39
+
40
+
41
class KimiK25VisionProcessor(BaseImageProcessor):
    """Vision processor for Kimi-K2.5: resizes/pads images and video chunks
    and patchifies them into model-ready tensors."""

    model_type = "kimi_k25"

    def __init__(
        self,
        media_proc_cfg: dict,
        **kwargs,
    ):
        """Store the media-processing config dict.

        Args:
            media_proc_cfg: Dict of preprocessing knobs (patch sizes, limits,
                sampling fps, normalization stats, ...). Keys are read lazily
                by the methods below.
        """
        super().__init__(**kwargs)
        self.media_proc_cfg = media_proc_cfg
        # Frames per video chunk == temporal merge kernel size.
        self.num_frames_per_chunk = media_proc_cfg[
            'temporal_merge_kernel_size']

    def media_tokens_calculator(self, media: MediaInput):
        """Return the number of model tokens this media item will occupy."""
        media = ensure_media_type(media)
        ret = self.get_resize_config(media)
        return ret['num_tokens']

    @classmethod
    def make_chunk_prompt(cls, timestamp_text: str) -> str:
        """Build the per-chunk text prompt: timestamp + media markers."""
        return f"{timestamp_text}<|media_begin|>video<|media_content|><|media_pad|><|media_end|>"

    def split_video_chunks(self,
                           video_url: str | bytes) -> list[VideoChunkInput]:
        """Sample a video at the configured fps and group frames into chunks
        of `temporal_merge_kernel_size`, each with a timestamped prompt.

        (Return annotation corrected: each element is a ``VideoChunkInput``,
        not a bare frame list.)
        """
        # video_url should be base64 str or bytes
        video_spec = get_video_meta(video_url)
        # Never sample faster than the source plays.
        sample_fps = min(self.media_proc_cfg['sample_fps'], video_spec.fps)
        sampled_nframes = max(
            round(video_spec.num_frames * sample_fps / video_spec.fps), 1)
        # Evenly spaced frame indices over the whole clip.
        frame_inds = np.linspace(0, video_spec.num_frames - 1,
                                 sampled_nframes).round().astype(int)
        frame_inds = frame_inds.tolist()
        sampled_frame_ids = []
        temporal_merge_kernel_size = self.media_proc_cfg[
            "temporal_merge_kernel_size"]
        num_chunks = 0
        chunk_timestamp = []
        for i in range(0, len(frame_inds), temporal_merge_kernel_size):
            sampled_frame_ids.extend(frame_inds[i:i +
                                                temporal_merge_kernel_size])
            # Chunk timestamp = time of the chunk's first sampled frame.
            start_time = frame_inds[i] / float(video_spec.fps)
            timestamp_text = timestamp_as_str(
                start_time, self.media_proc_cfg["timestamp_mode"])
            chunk_timestamp.append(timestamp_text)
            num_chunks += 1

        # NOTE(review): if `video_url` is a base64 data URI, it is passed to
        # `resampling` undecoded (get_video_meta decodes its own copy) —
        # presumably VideoReader accepts both forms; confirm.
        sampled_frames = resampling(video_url, sampled_frame_ids)
        chunks = []
        for chunk_id in range(num_chunks):
            chunk = sampled_frames[chunk_id *
                                   temporal_merge_kernel_size:(chunk_id + 1) *
                                   temporal_merge_kernel_size]
            chunks.append(
                VideoChunkInput(type="video_chunk",
                                video_chunk=chunk,
                                prompt=self.make_chunk_prompt(
                                    chunk_timestamp[chunk_id])))
        return chunks

    def get_resize_config(self, media_input: MediaInput) -> dict:
        """Compute the resize/pad plan (and token count) for one media item.

        Raises:
            ValueError: For types other than 'image'/'video_chunk'.
        """
        if media_input['type'] == 'image':
            w, h = media_input['image'].size
            ret = navit_resize_image(
                w, h, self.media_proc_cfg['patch_size'],
                self.media_proc_cfg['merge_kernel_size'],
                self.media_proc_cfg['in_patch_limit'],
                self.media_proc_cfg['patch_limit_on_one_side'],
                self.media_proc_cfg['fixed_output_tokens'])
            return ret
        elif media_input['type'] == 'video_chunk':
            # All frames of a chunk share the first frame's dimensions.
            frame = media_input['video_chunk'][0]
            width, height = frame.size
            num_frames = len(media_input["video_chunk"])
            fps = 1.0

            sample_fps, max_num_frames_each_video = real_sample_fps_and_max_num_frames(
                media_input["type"],
                self.media_proc_cfg['sample_fps'],
                self.media_proc_cfg['max_num_frames_each_video'],
            )

            # Per-frame limit falls back to the global patch limit.
            in_patch_limit_each_frame = self.media_proc_cfg[
                'in_patch_limit_each_frame']
            if in_patch_limit_each_frame is None:
                in_patch_limit_each_frame = self.media_proc_cfg[
                    'in_patch_limit']

            ret = navit_resize_video(
                width,
                height,
                num_frames,
                fps,
                sample_fps,
                self.media_proc_cfg['patch_size'],
                self.media_proc_cfg['merge_kernel_size'],
                in_patch_limit_each_frame,
                self.media_proc_cfg['patch_limit_on_one_side'],
                self.media_proc_cfg['in_patch_limit_video'],
                max_num_frames_each_video,
                self.media_proc_cfg['fixed_output_tokens'],
            )
            return ret
        else:
            raise ValueError("Unsupported type: {}".format(
                media_input['type']))

    def resize_image(self, image: Image.Image, new_width: int, new_height: int,
                     pad_width: int, pad_height: int) -> np.ndarray:
        """Resize to (new_width, new_height) and zero-pad on the right/bottom."""
        image_np = image_to_np(image, (new_width, new_height), "resize")
        image_np = np.pad(
            image_np,
            ((0, pad_height), (0, pad_width), (0, 0)),
            mode="constant",
            constant_values=0,
        )
        return image_np

    def preprocess(
        self,
        medias: list[MediaInput],
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        """
        Preprocess atomic vision inputs (images / video chunks) into
        model-ready tensors.

        Args:
            medias: List of MediaInput.
            return_tensors: Desired output format ('pt', 'np', 'tf', or None).

        Returns:
            BatchFeature containing 'pixel_values' and 'grid_thws' tensors
            (empty dict when `medias` is empty).
        """
        if not isinstance(medias, list):
            medias = [medias]
        if medias:
            pixel_values = []
            for item in medias:
                item = ensure_media_type(item)
                resize_config = self.get_resize_config(item)
                new_width, new_height, pad_width, pad_height = resize_config[
                    'new_width'], resize_config['new_height'], resize_config[
                        'pad_width'], resize_config['pad_height']
                if item['type'] == 'image':
                    image = item['image']
                    image_np = self.resize_image(image, new_width, new_height,
                                                 pad_width, pad_height)
                    # Images get a singleton temporal axis.
                    pixel_values.append(np.expand_dims(image_np, axis=0))
                elif item['type'] == 'video_chunk':
                    pixels = []
                    for frame in item['video_chunk']:
                        frame_np = self.resize_image(frame, new_width,
                                                     new_height, pad_width,
                                                     pad_height)
                        pixels.append(frame_np)
                    pixel_values.append(np.stack(pixels, axis=0))
                else:
                    raise ValueError("Unsupported type: {}".format(
                        item['type']))
            normalized_pixel_values = []
            # Precompute 1/std so normalization is a multiply, not a divide.
            image_std_inv = 1.0 / np.array(self.media_proc_cfg['image_std'])
            image_mean = np.array(self.media_proc_cfg['image_mean'])
            for pixels in pixel_values:
                pixels = normalize(pixels, image_mean, image_std_inv)
                pixels_and_thw = navit_patchify(
                    pixels,
                    self.media_proc_cfg['patch_size'],
                )
                normalized_pixel_values.append(pixels_and_thw)

            # Flatten all patches into one tensor; one grid row per media.
            pixel_values = torch.cat([
                _to_tensor(pixel_value['pixel_values'])
                for pixel_value in normalized_pixel_values
            ])
            grid_thws = torch.cat([
                _to_tensor(pixel_value['grid_thw'],
                           dtype=torch.int64).unsqueeze(0)
                for pixel_value in normalized_pixel_values
            ])

            data = {
                'pixel_values': pixel_values,
                'grid_thws': grid_thws,
            }

        else:
            data = {}

        return BatchFeature(data=data, tensor_type=return_tensors)

    def __repr__(self):
        return f"KimiK25VisionProcessor(media_proc_cfg={self.media_proc_cfg})"

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the processor config, dropping the non-serializable
        media_processor entry if the base class injected one."""
        output = super().to_dict()
        output["media_proc_cfg"] = self.media_proc_cfg
        if "media_processor" in output:
            del output["media_processor"]
        return output

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any], **kwargs):
        """Rebuild the processor from a serialized config dict."""
        config = config_dict.copy()
        media_proc_cfg = config.pop("media_proc_cfg", {})
        return cls(media_proc_cfg=media_proc_cfg, **config, **kwargs)

    def to_json_string(self):
        """JSON-serialize the config, converting array-likes via tolist()."""
        dictionary = self.to_dict()
        for key, value in dictionary.items():
            if hasattr(value, 'tolist'):
                dictionary[key] = value.tolist()
        return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
media_utils.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import math
4
+ import os
5
+ from datetime import datetime, timezone
6
+ from typing import List, Literal, Optional, TypedDict
7
+
8
+ import numpy as np
9
+ from PIL import Image
10
+ from pydantic import BaseModel, Field
11
+
12
+ try:
13
+ from mecord import VideoReader
14
+ except ImportError:
15
+ VideoReader = None
16
+
17
+
18
class VideoSpec(BaseModel):
    """Lightweight metadata describing a decoded video stream."""

    # Fixed discriminator. The original `media_type: str = Literal['video']`
    # stored the *typing object* as the default value instead of the string
    # 'video'; annotate with the Literal and default to the actual string.
    media_type: Literal['video'] = 'video'
    height: int = Field(..., gt=0, description="video frame height")
    width: int = Field(..., gt=0, description="video frame width")
    num_frames: int = Field(..., gt=0, description="num frames")
    fps: float = Field(..., gt=0, description="average fps")

    # optional, help to accelerate video reading
    # (annotated Optional to match the None defaults)
    key_indices: Optional[list[int]] = Field(None, description="key indices")
    frame_time_info: Optional[dict] = Field(None,
                                            description="frame time info")
28
+
29
+
30
class ImageInput(TypedDict):
    # A single still image wrapped for the media pipeline.
    type: Literal['image']
    image: Image.Image  # PIL image (raw inputs are coerced by ensure_media_type)
33
+
34
+
35
class _VideoChunkInputRequired(TypedDict):
    # Required keys of a video chunk.
    type: Literal['video_chunk']
    video_chunk: List[Image.Image]


class VideoChunkInput(_VideoChunkInputRequired, total=False):
    """One temporal chunk of a sampled video.

    ``prompt`` is optional. TypedDict fields cannot carry default values
    (the original ``prompt: Optional[str] = None`` is rejected by type
    checkers), so it is declared in a non-total section instead.
    """

    prompt: Optional[str]
39
+
40
+
41
+ MediaInput = ImageInput | VideoChunkInput
42
+
43
+
44
def get_video_meta(video_src: bytes | str | os.PathLike,
                   accurate: bool = True) -> VideoSpec:
    """Probe a video and return its dimensions and timing metadata.

    Args:
        video_src: Raw video bytes, a file path, or a base64 data URI
            (``data:video/mp4;base64,...``).
        accurate: Passed to ``VideoReader`` as ``auto_init``; presumably
            forces full initialization for exact frame counts — TODO confirm.

    Returns:
        A ``VideoSpec`` (annotation corrected from ``dict``) with size,
        frame count, average fps, and optional seek-acceleration metadata.

    Raises:
        AssertionError: If the stream reports non-positive dimensions,
            frame count, or fps.
    """
    if isinstance(video_src, os.PathLike):
        video_src = str(video_src)
    # if b64 string, decode to bytes
    if isinstance(video_src,
                  str) and video_src.startswith('data:video/mp4;base64,'):
        video_src = base64.b64decode(video_src.split(',')[1])
    video = VideoReader(video_src, auto_init=accurate, num_threads=1)
    assert video.num_frames > 0, "Invalid video format."
    assert video.original_width > 0 and video.original_height > 0, (
        "Invalid video format.")
    assert video.avg_fps > 0, "Invalid video format."
    return VideoSpec(media_type='video',
                     height=video.original_height,
                     width=video.original_width,
                     num_frames=video.num_frames,
                     fps=video.avg_fps,
                     key_indices=video.key_indices,
                     frame_time_info=video.frame_time_info)
65
+
66
+
67
def timestamp_as_str(timestamp: float,
                     timestamp_mode: str = "hh:mm:ss.fff") -> str:
    """Format a second-based timestamp, e.g. 3661.5 -> '01:01:01.500'.

    Supported modes: "hh:mm:ss.fff", "mm:ss.fff", "mm:ss".

    Raises:
        ValueError: If `timestamp_mode` is not one of the supported modes.
    """
    moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    millis = f".{int((timestamp % 1) * 1000):03d}"
    if timestamp_mode == "hh:mm:ss.fff":
        return moment.strftime("%H:%M:%S") + millis
    if timestamp_mode == "mm:ss.fff":
        return moment.strftime("%M:%S") + millis
    if timestamp_mode == "mm:ss":
        return moment.strftime("%M:%S")
    raise ValueError(f"Invalid timestamp mode: {timestamp_mode}")
83
+
84
+
85
+ def navit_resize_image(
86
+ width: int,
87
+ height: int,
88
+ patch_size: int,
89
+ merge_kernel_size: int,
90
+ in_patch_limit: int,
91
+ patch_limit_on_one_side: int,
92
+ fixed_output_tokens: int | None,
93
+ ):
94
+ # Apply the patch limits.
95
+ s1 = math.sqrt(
96
+ in_patch_limit /
97
+ (max(1.0, width // patch_size) * max(1.0, height // patch_size)))
98
+ s2 = patch_limit_on_one_side * patch_size / width
99
+ s3 = patch_limit_on_one_side * patch_size / height
100
+ scale = min(1.0, s1, s2, s3)
101
+ new_w, new_h = max(1, int(width * scale)), max(1, int(height * scale))
102
+ new_w = min(new_w, patch_limit_on_one_side * patch_size)
103
+ new_h = min(new_h, patch_limit_on_one_side * patch_size)
104
+
105
+ # Calculate the padding to make the height and width divisible by the merge kernel size and patch size.
106
+ factor = merge_kernel_size * patch_size
107
+
108
+ pad_height = (factor - new_h % factor) % factor
109
+ pad_width = (factor - new_w % factor) % factor
110
+
111
+ if fixed_output_tokens is not None:
112
+ num_tokens = fixed_output_tokens
113
+ else:
114
+ # Calculate new dimensions after padding and patching
115
+ token_height = (new_h + pad_height) // factor
116
+ token_width = (new_w + pad_width) // factor
117
+
118
+ assert token_height * merge_kernel_size <= patch_limit_on_one_side, (
119
+ f"token_height {token_height} * merge_kernel_size {merge_kernel_size} > patch_limit_on_one_side {patch_limit_on_one_side}"
120
+ )
121
+ assert token_width * merge_kernel_size <= patch_limit_on_one_side, (
122
+ f"token_width {token_width} * merge_kernel_size {merge_kernel_size} > patch_limit_on_one_side {patch_limit_on_one_side}"
123
+ )
124
+
125
+ num_tokens = token_height * token_width
126
+ return {
127
+ "num_tokens": num_tokens,
128
+ "new_width": new_w,
129
+ "new_height": new_h,
130
+ "pad_width": pad_width,
131
+ "pad_height": pad_height,
132
+ "sampled_nframes": 1,
133
+ }
134
+
135
+
136
def navit_resize_video(
    width: int,
    height: int,
    nframes: int,
    avg_fps: float,
    sample_fps: float,
    patch_size: int,
    merge_kernel_size: int,
    in_patch_limit_each_frame: int,
    patch_limit_on_one_side: int,
    in_patch_limit_total: int | None,
    max_num_frames_each_video: int | None,
    fixed_output_tokens_each_frame: int | None,
):
    """Plan the resize for a video: choose the sampled frame count, then
    size each frame via `navit_resize_image` under the (possibly
    total-budget-reduced) per-frame patch limit."""
    # Never sample faster than the source actually plays.
    effective_fps = min(sample_fps, avg_fps)
    # Calculate the number of frames to sample based on target FPS.
    sampled_nframes = max(round(nframes * effective_fps / avg_fps), 1)
    if max_num_frames_each_video is not None:
        sampled_nframes = min(sampled_nframes, max_num_frames_each_video)

    # A total patch budget further caps the per-frame budget.
    if in_patch_limit_total is not None:
        in_patch_limit_each_frame = min(
            round(in_patch_limit_total / sampled_nframes),
            in_patch_limit_each_frame)

    plan = navit_resize_image(
        width,
        height,
        patch_size,
        merge_kernel_size,
        in_patch_limit_each_frame,
        patch_limit_on_one_side,
        fixed_output_tokens_each_frame,
    )
    plan["sampled_nframes"] = sampled_nframes
    return plan
172
+
173
+
174
+ def real_sample_fps_and_max_num_frames(
175
+ type_name: Literal["video", "video_chunk"],
176
+ sample_fps: float,
177
+ max_num_frames_each_video: int | None,
178
+ ) -> tuple[int, int | None]:
179
+ if type_name == "video":
180
+ return sample_fps, max_num_frames_each_video
181
+ elif type_name == "video_chunk":
182
+ max_num_frames_each_video = None
183
+ sample_fps = math.inf
184
+ return sample_fps, max_num_frames_each_video
185
+ else:
186
+ return math.inf, None
187
+
188
+
189
def _to_pil(data: str | bytes):
    """Coerce an image reference into an RGB PIL image.

    Accepts an existing PIL image, raw bytes, a ``data:`` URI with a
    base64 payload, or a filesystem path string.

    Raises:
        ValueError: For any other input type.
    """
    if isinstance(data, Image.Image):
        return data.convert("RGB")
    if isinstance(data, bytes):
        return Image.open(io.BytesIO(data)).convert("RGB")
    if isinstance(data, str):
        if not data.startswith("data:"):
            # Treat the string as a filesystem path.
            return Image.open(data).convert("RGB")
        payload = base64.b64decode(data.split(",")[1])
        return Image.open(io.BytesIO(payload)).convert("RGB")
    raise ValueError(f"Unsupported data type: {type(data)}")
204
+
205
+
206
def ensure_media_type(media: MediaInput) -> MediaInput:
    """Normalize `media` in place so its payload is PIL RGB image(s).

    Raises:
        ValueError: For media types other than 'image'/'video_chunk'.
    """
    kind = media['type']
    if kind == 'image':
        media['image'] = _to_pil(media['image'])
    elif kind == 'video_chunk':
        media['video_chunk'] = [_to_pil(f) for f in media['video_chunk']]
    else:
        raise ValueError(f"Unsupported media type: {media['type']}")
    return media
217
+
218
+
219
def image_to_np(
    image: Image.Image,
    resize_to: tuple[int, int] | None = None,
    mode: str = "resize",
    raise_error_for_ill_resize: bool = True,
) -> np.ndarray:
    """Convert an image to a numpy array, optionally resizing first.

    Args:
        image: The PIL image to convert.
        resize_to: Target (width, height); None skips resizing.
        mode: "resize" (plain bicubic resize), "rescale_and_pad_to_center"
            (shrink to fit, zero-pad symmetrically), or
            "rescale_and_pad_to_rightbottom" (shrink to fit, zero-pad on the
            right/bottom only).
        raise_error_for_ill_resize: Whether to raise when rescaling would
            collapse a side to zero pixels; otherwise an all-black canvas of
            the target size is returned.

    Returns:
        A numpy array of shape (height, width, 3).

    Raises:
        ValueError: For an unknown `mode` or an ill-sized rescale.
    """
    assert isinstance(image, Image.Image), "image must be a PIL Image"
    if resize_to is not None:
        if mode == "resize":
            image = image.resize(resize_to, resample=Image.Resampling.BICUBIC)

        elif mode == "rescale_and_pad_to_center":
            # Shrink only (scale capped at 1.0) while preserving aspect ratio.
            scale = min(resize_to[0] / image.width,
                        resize_to[1] / image.height, 1.0)
            new_width = round(image.width * scale)
            new_height = round(image.height * scale)
            if new_width == 0 or new_height == 0:
                if raise_error_for_ill_resize:
                    raise ValueError(
                        f"Invalid resize to: {resize_to}, from image size: {image.size}"
                    )
                else:
                    return np.zeros((resize_to[1], resize_to[0], 3),
                                    dtype=np.uint8)

            image = image.resize((new_width, new_height),
                                 resample=Image.Resampling.BICUBIC)
            # Split the leftover space as evenly as possible on both sides.
            padding_left = (resize_to[0] - new_width) // 2
            padding_right = resize_to[0] - new_width - padding_left
            padding_top = (resize_to[1] - new_height) // 2
            padding_bottom = resize_to[1] - new_height - padding_top
            image = np.asarray(image)
            image = np.pad(
                image,
                ((padding_top, padding_bottom), (padding_left, padding_right),
                 (0, 0)),
                mode="constant",
                constant_values=0,
            )
            assert image.shape == (resize_to[1], resize_to[0], 3)

        elif mode == "rescale_and_pad_to_rightbottom":
            # Same shrink-to-fit, but the image stays anchored at the
            # top-left and all padding goes right/bottom.
            scale = min(resize_to[0] / image.width,
                        resize_to[1] / image.height, 1.0)
            new_width = round(image.width * scale)
            new_height = round(image.height * scale)
            if new_width == 0 or new_height == 0:
                if raise_error_for_ill_resize:
                    raise ValueError(
                        f"Invalid resize to: {resize_to}, from image size: {image.size}"
                    )
                else:
                    return np.zeros((resize_to[1], resize_to[0], 3),
                                    dtype=np.uint8)

            image = image.resize((new_width, new_height),
                                 resample=Image.Resampling.BICUBIC)
            padding_right = resize_to[0] - new_width
            padding_bottom = resize_to[1] - new_height
            image = np.asarray(image)
            image = np.pad(
                image,
                ((0, padding_bottom), (0, padding_right), (0, 0)),
                mode="constant",
                constant_values=0,
            )
            assert image.shape == (resize_to[1], resize_to[0], 3)

        else:
            raise ValueError(f"Invalid mode: {mode}")

    # Padding branches already produced an ndarray; plain resize (or no
    # resize) still holds a PIL image here.
    if isinstance(image, Image.Image):
        return np.asarray(image)
    else:
        return image
305
+
306
+
307
def navit_patchify(pixel_values: np.ndarray,
                   patch_size: int) -> dict[str, np.ndarray]:
    """Cut a frame stack into square patches (navit layout).

    Args:
        pixel_values: Array of shape (t, h, w, c) with c == 3.
        patch_size: Side length of each square patch.

    Returns:
        Dict with:
        - "pixel_values": (t * h//patch_size * w//patch_size, c, patch_size, patch_size)
        - "grid_thw": np.ndarray [t, h//patch_size, w//patch_size]
    """
    nframes, height, width, channels = pixel_values.shape
    assert channels == 3, "pixel_values must have 3 channels"

    grid_h = height // patch_size
    grid_w = width // patch_size

    # (T, gh, ps, gw, ps, C) -> (T, gh, gw, C, ps, ps) -> flat patch list.
    tiled = pixel_values.reshape(nframes, grid_h, patch_size, grid_w,
                                 patch_size, channels)
    patches = tiled.transpose(0, 1, 3, 5, 2, 4).reshape(
        -1, channels, patch_size, patch_size)

    return {
        "pixel_values": patches,
        "grid_thw": np.array([nframes, grid_h, grid_w]),
    }
330
+
331
+
332
def normalize(x: np.ndarray,
              mean,
              std_inv,
              pixels_dtype: np.dtype = np.float32) -> np.ndarray:
    """Scale uint8 pixels to [0, 1] and standardize channel-wise.

    Args:
        x: Pixel array of shape (..., 3), uint8 in [0, 255].
        mean: Per-channel mean of the already-scaled pixels.
        std_inv: Per-channel reciprocal of the std (multiply, not divide).
        pixels_dtype: Output dtype.

    Returns:
        Normalized array of shape (..., 3) with dtype `pixels_dtype`.
    """
    out = (x / 255.0).astype(pixels_dtype)
    # In-place ops keep the result in `pixels_dtype` even when mean/std_inv
    # are float64 arrays.
    out -= mean
    out *= std_inv
    return out
350
+
351
+
352
+ def _to_tensor(data, **kwargs):
353
+ import torch
354
+
355
+ if isinstance(data, np.ndarray):
356
+ return torch.from_numpy(data).to(**kwargs)
357
+ elif isinstance(data, torch.Tensor):
358
+ return data.to(**kwargs)
359
+ elif isinstance(data, list):
360
+ return [_to_tensor(item, **kwargs) for item in data]
361
+ elif isinstance(data, tuple):
362
+ return tuple(_to_tensor(item, **kwargs) for item in data)
363
+ elif isinstance(data, dict):
364
+ return {k: _to_tensor(v, **kwargs) for k, v in data.items()}
365
+ elif data is None:
366
+ return None
367
+ else:
368
+ raise ValueError(f"Unsupported data type: {type(data)}")
model-00001-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb6e037206c1936876c33f348bc644fd6f9f4d7ac973f8906359977c1eaebd43
3
+ size 995001888
model-00002-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a609fccff2406505ae575850a51470474e8bb3eb825ddd208f84aae31cfb4960
3
+ size 9809047464
model-00003-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f79ae8d220ecbba674c1224bbb36e2ad826ccc365765a89fab57555af47a8540
3
+ size 9809047464
model-00004-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efa041ff6fd295b3a5b8bc4fe7db502d32e6f887d650353425c3074388ecaa33
3
+ size 9809047464
model-00005-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f772c9bf434aea41b6f6dc05eb32da4e8259548a3145d79e26607b4a9d1753b
3
+ size 9809047464
model-00006-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cf4c4593e5a7e5b4ababc8991245ab53121be414558099dc12dd01ab00eb920
3
+ size 9809047464
model-00007-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66775118dec5d4f549aa19d0c8cd07dcc42c48d4d9dd6f289f772ec5d692a3af
3
+ size 9809047464
model-00008-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdca9b399dce8e82e0cf5bf9c59f600ea9f6001c56b84c78d9364cb8f55f48a5
3
+ size 9809047464
model-00009-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f8c2dcde7c51a3eca91bb892eacd8bcbcd7724842254a0000bb1c45faf92eb2
3
+ size 9809047464
model-00010-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:311ffdf698fb27c03697598cd2c652a1ec210d9a7351d5c2de1b14d605635195
3
+ size 9809047464
model-00011-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2a93273e323843ad067e00dc859f63d90f62bd2e81ef0a77ffe49a9f39a3607
3
+ size 9809050936
model-00012-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa3200710275f7e8d60cdb91d1d623477fda55d04bda5e32be29b5809383f4ca
3
+ size 9809050936
model-00013-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8925783dd26187bbff4f9030aeff392c3fe3d931409daa341d673bc93393965d
3
+ size 9809050936
model-00014-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee309d11f1c4194db36e961293d6e7b9d3c3c8227da5064ffc32b806ac03cc91
3
+ size 9809050936
model-00015-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4859b4dfe8eb7998bbda5e453f9336b139881d403b0a8c128b98842c9c27800d
3
+ size 9809050936
model-00016-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4345e4706a01e63aef6a43025bc101ebbfb8e8c235ea44604fb9c5ad7038b4cc
3
+ size 9809050936
model-00017-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0b6e9d578c7f0eaa3e91aa917fb3cbdcaf5637caa3ad97e9dc002bcb35feb22
3
+ size 9809050936
model-00018-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bef116d936999f347b83403ee13fe479f51214a87592409121614793d7e65e7
3
+ size 9809050936
model-00019-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71b624faee1df5cc99bad259b018befb0bdbbdd160af3d130487bf1165d48b9a
3
+ size 9809050936
model-00020-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c6ee52ccb57bd2fdba6077a18faaf3ea6da1b3536597af4fef6c5cf713bec34
3
+ size 9809050936
model-00021-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb99455eb90ddd9ab343538496c3e90b40868c0029a344d79b0b94f73820dd1f
3
+ size 9809050936
model-00022-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f98118e5b5e700ad57b510f3c80183fc2e5e3dcd1badf43a959ce63140ae4e61
3
+ size 9809050936
model-00023-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a4f4fb4557c2954dc07ffd8d81af93587ab784e6810eca093fef6b0fab03d3c
3
+ size 9809050936
model-00024-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fa6f9aba1f02a552a66d0bcf2808fdc9192425b5554684b637d7e9d9312fb3d
3
+ size 9809050936
model-00025-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98a0dc92227ed3beb2a1491558d2fa631dca256a5c28e1895b1bf28ae4c9731b
3
+ size 9809050936
model-00026-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b6e19204d6f74af7ff2c78c67a33991f5ee4105e0262d99eda6ba6bc0362630
3
+ size 9809050936
model-00027-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8476d21ab258330c16020124d634d2cd1f22a5a7859d3dd778132dea0271e810
3
+ size 9809050936
model-00028-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee16b8cf5c8a9efb0c5fd89c282622e5cfbf4f49fb28d5b3638240434dadef7a
3
+ size 9809050936
model-00029-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87e0e7b5caf5b39314caf4158b666d0b0c9fe00a96dd09ea545563f4a6f408ed
3
+ size 9809050936
model-00030-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f8d186de5354913a9aab43b26395c97bdf27b695586260bbd6492ba8ab795dc
3
+ size 9809050936
model-00031-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:696da932f44c248e96ca8b616424ee87e0609fb32ef640bd086f296665ad4ad8
3
+ size 9809050936
model-00032-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eee21740e4d75ba175965f322dd8d2747411fedc1b59b9a0559630d7c94d06c6
3
+ size 9809050936
model-00033-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc9a7b5737151ca68fed23cb16308aff1527d8932be59d507318795025a9034e
3
+ size 9809050936
model-00034-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:783faa4cbeac1c7336070596c5a3f603bd49193c1ab6da74eb29d6fed935d905
3
+ size 9809050936
model-00035-of-000064.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0256b0e5c83716c66871f0b63b0b87fe92aab595355aae583d5a674a691411e2
3
+ size 9809050936