haijunlv committed on
Commit
de4f18d
·
verified ·
1 Parent(s): a3167aa
Files changed (45) hide show
  1. .gitattributes +3 -0
  2. LICENSE +201 -0
  3. README.md +290 -0
  4. chat_template.jinja +159 -0
  5. config.json +153 -0
  6. configuration_interns2_preview.py +434 -0
  7. figs/efficiency.jpg +3 -0
  8. figs/title.png +3 -0
  9. generation_config.json +13 -0
  10. merges.txt +0 -0
  11. model-00001-of-00022.safetensors +3 -0
  12. model-00002-of-00022.safetensors +3 -0
  13. model-00003-of-00022.safetensors +3 -0
  14. model-00004-of-00022.safetensors +3 -0
  15. model-00005-of-00022.safetensors +3 -0
  16. model-00006-of-00022.safetensors +3 -0
  17. model-00007-of-00022.safetensors +3 -0
  18. model-00008-of-00022.safetensors +3 -0
  19. model-00009-of-00022.safetensors +3 -0
  20. model-00010-of-00022.safetensors +3 -0
  21. model-00011-of-00022.safetensors +3 -0
  22. model-00012-of-00022.safetensors +3 -0
  23. model-00013-of-00022.safetensors +3 -0
  24. model-00014-of-00022.safetensors +3 -0
  25. model-00015-of-00022.safetensors +3 -0
  26. model-00016-of-00022.safetensors +3 -0
  27. model-00017-of-00022.safetensors +3 -0
  28. model-00018-of-00022.safetensors +3 -0
  29. model-00019-of-00022.safetensors +3 -0
  30. model-00020-of-00022.safetensors +3 -0
  31. model-00021-of-00022.safetensors +3 -0
  32. model-00022-of-00022.safetensors +3 -0
  33. model.safetensors.index.json +0 -0
  34. modeling_interns2_preview.py +0 -0
  35. preprocessor_config.json +24 -0
  36. processing_interns2_preview.py +423 -0
  37. special_tokens_map.json +45 -0
  38. tokenization_interns1.py +1009 -0
  39. tokenizer.json +3 -0
  40. tokenizer_PROT.model +3 -0
  41. tokenizer_SMILES.model +3 -0
  42. tokenizer_XNA.model +3 -0
  43. tokenizer_config.json +521 -0
  44. video_preprocessor_config.json +21 -0
  45. vocab.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ figs/efficiency.jpg filter=lfs diff=lfs merge=lfs -text
38
+ figs/title.png filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2025-2026 Shanghai AI Laboratory
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ license_link: https://huggingface.co/internlm/Intern-S2-Preview/blob/main/LICENSE
5
+ pipeline_tag: image-text-to-text
6
+ ---
7
+
8
+ ## Intern-S2-Preview
9
+
10
+ <div align="center">
11
+ <img src="./figs/title.png" />
12
+
13
+ <div>&nbsp;</div>
14
+
15
+ [💻Github Repo](https://github.com/InternLM/Intern-S1) • [🤗Model Collections](https://huggingface.co/collections/internlm/intern-s2) • [💬Online Chat](https://chat.intern-ai.org.cn/)
16
+
17
+ </div>
18
+
19
+ <p align="center">
20
+ 👋 join us on <a href="https://discord.gg/xa29JuW87d" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/intern-s1.jpg" target="_blank">WeChat</a>
21
+ </p>
22
+
23
+
24
+
25
+ ## Introduction
26
+
27
+ We introduce **Intern-S2-Preview**, an efficient 35B scientific multimodal foundation model. Beyond conventional parameter and data scaling, Intern-S2-Preview explores **task scaling**: increasing the difficulty, diversity, and coverage of scientific tasks to further unlock model capabilities.
28
+
29
+ By extending professional scientific tasks into a full-chain training pipeline from pre-training to reinforcement learning, Intern-S2-Preview achieves performance comparable to the trillion-scale Intern-S1-Pro on multiple core professional scientific tasks, while using only 35B parameters. At the same time, it maintains strong general reasoning, multimodal understanding, coding, and agent capabilities.
30
+
31
+ ### Features
32
+
33
+ - **Scientific task scaling with full-chain training.** Intern-S2-Preview scales hundreds of professional scientific tasks from pre-training to RL, enabling strong performance across multiple specialized domains at only 35B parameters. It further strengthens spatial modeling for small-molecule structures and introduces real-valued prediction modules, making it the first open-source model with both material crystal structure generation capability and strong general capabilities.
34
+
35
+ - **Enhanced agent capabilities for scientific workflows.** Intern-S2-Preview significantly improves agentic abilities over the previous generation, achieving strong results on multiple scientific agent benchmarks.
36
+
37
+ - **Efficient RL reasoning with MTP and CoT compression.** During RL, Intern-S2-Preview adopts shared-weight MTP with KL loss to reduce the mismatch between training and inference behavior, substantially improving MTP accept rate and token generation speed. It also introduces CoT compression techniques to shorten responses while preserving strong reasoning capability, achieving improvements in both performance and efficiency.
38
+
39
+ <figure>
40
+ <img src="./figs/efficiency.jpg" alt="efficient RL reasoning with MTP and CoT compression">
41
+ <figcaption>Fig1: Reasoning Efficiency on Complex Math Benchmarks. Accuracy vs. Average Response Length. Intern-S2-Preview (red star) significantly outperforms trillion-scale Intern-S1-Pro (red circle), and achieves higher accuracy with better token efficiency among medium-size models.</figcaption>
42
+ </figure>
43
+
44
+ ### Performance
45
+
46
+ We evaluate Intern-S2-Preview on various benchmarks, including general datasets and scientific datasets. We report the performance comparison with recent VLMs and LLMs below.
47
+
48
+ ![performance](./figs/performance.jpeg)
49
+
50
+
51
+ > **Note**: <u>Underline</u> indicates the best performance among open-source models; **bold** indicates the best performance among all models.
52
+
53
+ We use [OpenCompass](https://github.com/open-compass/OpenCompass/) and [VLMEvalKit](https://github.com/open-compass/vlmevalkit) to evaluate all models.
54
+
55
+
56
+ ## Quick Start
57
+
58
+ ### Sampling Parameters
59
+
60
+ We recommend using the following hyperparameters to ensure better results:
61
+
62
+ ```python
63
+ top_p = 0.95
64
+ top_k = 50
65
+ min_p = 0.0
66
+ temperature = 0.8
67
+ ```
68
+
69
+ ### Serving
70
+
71
+ Intern-S2-Preview can be deployed using any of the following LLM inference frameworks:
72
+
73
+ - LMDeploy
74
+ - vLLM
75
+ - SGLang
76
+
77
+ Detailed deployment examples for these frameworks are available in the [Model Deployment Guide](./deployment_guide.md).
78
+
79
+
80
+ ## Advanced Usage
81
+
82
+ ### Tool Calling
83
+
84
+ Tool Calling lets the model extend its capabilities by invoking external tools and APIs. The example below shows how to use it to fetch the latest weather forecast via an OpenAI-compatible API (served by the LMDeploy API server).
85
+
86
+ ```python
87
+
88
+
89
+ from openai import OpenAI
90
+ import json
91
+
92
+
93
+ def get_current_temperature(location: str, unit: str = "celsius"):
94
+ """Get current temperature at a location.
95
+
96
+ Args:
97
+ location: The location to get the temperature for, in the format "City, State, Country".
98
+ unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"])
99
+
100
+ Returns:
101
+ the temperature, the location, and the unit in a dict
102
+ """
103
+ return {
104
+ "temperature": 26.1,
105
+ "location": location,
106
+ "unit": unit,
107
+ }
108
+
109
+
110
+ def get_temperature_date(location: str, date: str, unit: str = "celsius"):
111
+ """Get temperature at a location and date.
112
+
113
+ Args:
114
+ location: The location to get the temperature for, in the format "City, State, Country".
115
+ date: The date to get the temperature for, in the format "Year-Month-Day".
116
+ unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"])
117
+
118
+ Returns:
119
+ the temperature, the location, the date and the unit in a dict
120
+ """
121
+ return {
122
+ "temperature": 25.9,
123
+ "location": location,
124
+ "date": date,
125
+ "unit": unit,
126
+ }
127
+
128
+ def get_function_by_name(name):
129
+ if name == "get_current_temperature":
130
+ return get_current_temperature
131
+ if name == "get_temperature_date":
132
+ return get_temperature_date
133
+
134
+ tools = [{
135
+ 'type': 'function',
136
+ 'function': {
137
+ 'name': 'get_current_temperature',
138
+ 'description': 'Get current temperature at a location.',
139
+ 'parameters': {
140
+ 'type': 'object',
141
+ 'properties': {
142
+ 'location': {
143
+ 'type': 'string',
144
+ 'description': 'The location to get the temperature for, in the format \'City, State, Country\'.'
145
+ },
146
+ 'unit': {
147
+ 'type': 'string',
148
+ 'enum': [
149
+ 'celsius',
150
+ 'fahrenheit'
151
+ ],
152
+ 'description': 'The unit to return the temperature in. Defaults to \'celsius\'.'
153
+ }
154
+ },
155
+ 'required': [
156
+ 'location'
157
+ ]
158
+ }
159
+ }
160
+ }, {
161
+ 'type': 'function',
162
+ 'function': {
163
+ 'name': 'get_temperature_date',
164
+ 'description': 'Get temperature at a location and date.',
165
+ 'parameters': {
166
+ 'type': 'object',
167
+ 'properties': {
168
+ 'location': {
169
+ 'type': 'string',
170
+ 'description': 'The location to get the temperature for, in the format \'City, State, Country\'.'
171
+ },
172
+ 'date': {
173
+ 'type': 'string',
174
+ 'description': 'The date to get the temperature for, in the format \'Year-Month-Day\'.'
175
+ },
176
+ 'unit': {
177
+ 'type': 'string',
178
+ 'enum': [
179
+ 'celsius',
180
+ 'fahrenheit'
181
+ ],
182
+ 'description': 'The unit to return the temperature in. Defaults to \'celsius\'.'
183
+ }
184
+ },
185
+ 'required': [
186
+ 'location',
187
+ 'date'
188
+ ]
189
+ }
190
+ }
191
+ }]
192
+
193
+
194
+
195
+ messages = [
196
+ {'role': 'user', 'content': 'Today is 2024-11-14, What\'s the temperature in San Francisco now? How about tomorrow?'}
197
+ ]
198
+
199
+ openai_api_key = "EMPTY"
200
+ openai_api_base = "http://0.0.0.0:23333/v1"
201
+ client = OpenAI(
202
+ api_key=openai_api_key,
203
+ base_url=openai_api_base,
204
+ )
205
+ model_name = client.models.list().data[0].id
206
+ response = client.chat.completions.create(
207
+ model=model_name,
208
+ messages=messages,
209
+ max_tokens=32768,
210
+ temperature=0.8,
211
+ top_p=0.95,
212
+ extra_body=dict(spaces_between_special_tokens=False),
213
+ tools=tools)
214
+ print(response.choices[0].message)
215
+ messages.append(response.choices[0].message)
216
+
217
+ for tool_call in response.choices[0].message.tool_calls:
218
+ tool_call_args = json.loads(tool_call.function.arguments)
219
+ tool_call_result = get_function_by_name(tool_call.function.name)(**tool_call_args)
220
+ tool_call_result = json.dumps(tool_call_result, ensure_ascii=False)
221
+ messages.append({
222
+ 'role': 'tool',
223
+ 'name': tool_call.function.name,
224
+ 'content': tool_call_result,
225
+ 'tool_call_id': tool_call.id
226
+ })
227
+
228
+ response = client.chat.completions.create(
229
+ model=model_name,
230
+ messages=messages,
231
+ temperature=0.8,
232
+ top_p=0.95,
233
+ extra_body=dict(spaces_between_special_tokens=False),
234
+ tools=tools)
235
+ print(response.choices[0].message)
236
+ ```
237
+
238
+ ### Switching Between Thinking and Non-Thinking Modes
239
+
240
+ Intern-S2-Preview enables thinking mode by default, enhancing the model's reasoning capabilities to generate higher-quality responses. This feature can be disabled by setting `enable_thinking=False` in `tokenizer.apply_chat_template`:
241
+
242
+ ```python
243
+ text = tokenizer.apply_chat_template(
244
+ messages,
245
+ tokenize=False,
246
+ add_generation_prompt=True,
247
+ enable_thinking=False # think mode indicator
248
+ )
249
+ ```
250
+
251
+ When serving Intern-S2-Preview models, you can dynamically control the thinking mode by adjusting the `enable_thinking` parameter in your requests.
252
+
253
+ ```python
254
+ from openai import OpenAI
255
+ import json
256
+
257
+ messages = [
258
+ {
259
+ 'role': 'user',
260
+ 'content': 'who are you'
261
+ }, {
262
+ 'role': 'assistant',
263
+ 'content': 'I am an AI'
264
+ }, {
265
+ 'role': 'user',
266
+ 'content': 'AGI is?'
267
+ }]
268
+
269
+ openai_api_key = "EMPTY"
270
+ openai_api_base = "http://0.0.0.0:23333/v1"
271
+ client = OpenAI(
272
+ api_key=openai_api_key,
273
+ base_url=openai_api_base,
274
+ )
275
+ model_name = client.models.list().data[0].id
276
+
277
+ response = client.chat.completions.create(
278
+ model=model_name,
279
+ messages=messages,
280
+ temperature=0.8,
281
+ top_p=0.95,
282
+ max_tokens=2048,
283
+ extra_body={
284
+ "chat_template_kwargs": {"enable_thinking": False}
285
+ }
286
+ )
287
+ print(json.dumps(response.model_dump(), indent=2, ensure_ascii=False))
288
+ ```
289
+
290
+ > Note: We do not recommend disabling thinking mode for agentic tasks.
chat_template.jinja ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- set image_count = namespace(value=0) %}
2
+ {%- set video_count = namespace(value=0) %}
3
+ {%- macro render_content(content, do_vision_count, is_system_content=false) %}
4
+ {%- if content is string %}
5
+ {{- content }}
6
+ {%- elif content is iterable and content is not mapping %}
7
+ {%- for item in content %}
8
+ {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
9
+ {%- if is_system_content %}
10
+ {{- raise_exception('System message cannot contain images.') }}
11
+ {%- endif %}
12
+ {%- if do_vision_count %}
13
+ {%- set image_count.value = image_count.value + 1 %}
14
+ {%- endif %}
15
+ {%- if add_vision_id %}
16
+ {{- 'Picture ' ~ image_count.value ~ ': ' }}
17
+ {%- endif %}
18
+ {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
19
+ {%- elif 'video' in item or item.type == 'video' %}
20
+ {%- if is_system_content %}
21
+ {{- raise_exception('System message cannot contain videos.') }}
22
+ {%- endif %}
23
+ {%- if do_vision_count %}
24
+ {%- set video_count.value = video_count.value + 1 %}
25
+ {%- endif %}
26
+ {%- if add_vision_id %}
27
+ {{- 'Video ' ~ video_count.value ~ ': ' }}
28
+ {%- endif %}
29
+ {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
30
+ {%- elif 'time_series' in item or item.type == 'time_series' %}
31
+ {%- if is_system_content %}
32
+ {{- raise_exception('System message cannot contain time series.') }}
33
+ {%- endif %}
34
+ {{- '<|ts|><TS_CONTEXT><|/ts|>' }}
35
+ {%- elif 'text' in item %}
36
+ {{- item.text }}
37
+ {%- else %}
38
+ {{- raise_exception('Unexpected item type in content.') }}
39
+ {%- endif %}
40
+ {%- endfor %}
41
+ {%- elif content is none or content is undefined %}
42
+ {{- '' }}
43
+ {%- else %}
44
+ {{- raise_exception('Unexpected content type.') }}
45
+ {%- endif %}
46
+ {%- endmacro %}
47
+ {%- if not messages %}
48
+ {{- raise_exception('No messages provided.') }}
49
+ {%- endif %}
50
+ {%- if tools and tools is iterable and tools is not mapping %}
51
+ {{- '<|im_start|>system\n' }}
52
+ {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
53
+ {%- for tool in tools %}
54
+ {{- "\n" }}
55
+ {{- tool | tojson }}
56
+ {%- endfor %}
57
+ {{- "\n</tools>" }}
58
+ {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
59
+ {%- if messages[0].role == 'system' %}
60
+ {%- set content = render_content(messages[0].content, false, true)|trim %}
61
+ {%- if content %}
62
+ {{- '\n\n' + content }}
63
+ {%- endif %}
64
+ {%- endif %}
65
+ {{- '<|im_end|>\n' }}
66
+ {%- else %}
67
+ {%- if messages[0].role == 'system' %}
68
+ {%- set content = render_content(messages[0].content, false, true)|trim %}
69
+ {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
70
+ {%- endif %}
71
+ {%- endif %}
72
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
73
+ {%- for message in messages[::-1] %}
74
+ {%- set index = (messages|length - 1) - loop.index0 %}
75
+ {%- if ns.multi_step_tool and message.role == "user" %}
76
+ {%- set content = render_content(message.content, false)|trim %}
77
+ {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
78
+ {%- set ns.multi_step_tool = false %}
79
+ {%- set ns.last_query_index = index %}
80
+ {%- endif %}
81
+ {%- endif %}
82
+ {%- endfor %}
83
+ {%- if ns.multi_step_tool %}
84
+ {{- raise_exception('No user query found in messages.') }}
85
+ {%- endif %}
86
+ {%- for message in messages %}
87
+ {%- set content = render_content(message.content, true)|trim %}
88
+ {%- if message.role == "system" %}
89
+ {%- if not loop.first %}
90
+ {{- raise_exception('System message must be at the beginning.') }}
91
+ {%- endif %}
92
+ {%- elif message.role == "user" %}
93
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
94
+ {%- elif message.role == "assistant" %}
95
+ {%- set reasoning_content = '' %}
96
+ {%- if message.reasoning_content is string %}
97
+ {%- set reasoning_content = message.reasoning_content %}
98
+ {%- else %}
99
+ {%- if '</think>' in content %}
100
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
101
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
102
+ {%- endif %}
103
+ {%- endif %}
104
+ {%- set reasoning_content = reasoning_content|trim %}
105
+ {%- if loop.index0 > ns.last_query_index %}
106
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
107
+ {%- else %}
108
+ {{- '<|im_start|>' + message.role + '\n' + content }}
109
+ {%- endif %}
110
+ {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
111
+ {%- for tool_call in message.tool_calls %}
112
+ {%- if tool_call.function is defined %}
113
+ {%- set tool_call = tool_call.function %}
114
+ {%- endif %}
115
+ {%- if loop.first %}
116
+ {%- if content|trim %}
117
+ {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
118
+ {%- else %}
119
+ {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
120
+ {%- endif %}
121
+ {%- else %}
122
+ {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
123
+ {%- endif %}
124
+ {%- if tool_call.arguments is defined %}
125
+ {%- for args_name, args_value in tool_call.arguments|items %}
126
+ {{- '<parameter=' + args_name + '>\n' }}
127
+ {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
128
+ {{- args_value }}
129
+ {{- '\n</parameter>\n' }}
130
+ {%- endfor %}
131
+ {%- endif %}
132
+ {{- '</function>\n</tool_call>' }}
133
+ {%- endfor %}
134
+ {%- endif %}
135
+ {{- '<|im_end|>\n' }}
136
+ {%- elif message.role == "tool" %}
137
+ {%- if loop.previtem and loop.previtem.role != "tool" %}
138
+ {{- '<|im_start|>user' }}
139
+ {%- endif %}
140
+ {{- '\n<tool_response>\n' }}
141
+ {{- content }}
142
+ {{- '\n</tool_response>' }}
143
+ {%- if not loop.last and loop.nextitem.role != "tool" %}
144
+ {{- '<|im_end|>\n' }}
145
+ {%- elif loop.last %}
146
+ {{- '<|im_end|>\n' }}
147
+ {%- endif %}
148
+ {%- else %}
149
+ {{- raise_exception('Unexpected message role.') }}
150
+ {%- endif %}
151
+ {%- endfor %}
152
+ {%- if add_generation_prompt %}
153
+ {{- '<|im_start|>assistant\n' }}
154
+ {%- if enable_thinking is defined and enable_thinking is false %}
155
+ {{- '<think>\n\n</think>\n\n' }}
156
+ {%- else %}
157
+ {{- '<think>\n' }}
158
+ {%- endif %}
159
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "intern_s2_preview",
3
+ "architectures": [
4
+ "InternS2PreviewForConditionalGeneration"
5
+ ],
6
+ "transformers_version": "5.2.0",
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_interns2_preview.InternS2PreviewConfig",
9
+ "AutoModelForCausalLM": "modeling_interns2_preview.InternS2PreviewForCausalLM",
10
+ "AutoModel": "modeling_interns2_preview.InternS2PreviewModel",
11
+ "AutoModelForImageTextToText": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration",
12
+ "AutoModelForMultimodalLM": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration"
13
+ },
14
+ "image_token_id": 248056,
15
+ "text_config": {
16
+ "model_type": "qwen3_5_moe_text",
17
+ "attention_bias": false,
18
+ "attention_dropout": 0.0,
19
+ "attn_output_gate": true,
20
+ "dtype": "bfloat16",
21
+ "eos_token_id": 248044,
22
+ "full_attention_interval": 4,
23
+ "head_dim": 256,
24
+ "hidden_act": "silu",
25
+ "hidden_size": 2048,
26
+ "initializer_range": 0.02,
27
+ "layer_types": [
28
+ "linear_attention",
29
+ "linear_attention",
30
+ "linear_attention",
31
+ "full_attention",
32
+ "linear_attention",
33
+ "linear_attention",
34
+ "linear_attention",
35
+ "full_attention",
36
+ "linear_attention",
37
+ "linear_attention",
38
+ "linear_attention",
39
+ "full_attention",
40
+ "linear_attention",
41
+ "linear_attention",
42
+ "linear_attention",
43
+ "full_attention",
44
+ "linear_attention",
45
+ "linear_attention",
46
+ "linear_attention",
47
+ "full_attention",
48
+ "linear_attention",
49
+ "linear_attention",
50
+ "linear_attention",
51
+ "full_attention",
52
+ "linear_attention",
53
+ "linear_attention",
54
+ "linear_attention",
55
+ "full_attention",
56
+ "linear_attention",
57
+ "linear_attention",
58
+ "linear_attention",
59
+ "full_attention",
60
+ "linear_attention",
61
+ "linear_attention",
62
+ "linear_attention",
63
+ "full_attention",
64
+ "linear_attention",
65
+ "linear_attention",
66
+ "linear_attention",
67
+ "full_attention"
68
+ ],
69
+ "linear_conv_kernel_dim": 4,
70
+ "linear_key_head_dim": 128,
71
+ "linear_num_key_heads": 16,
72
+ "linear_num_value_heads": 32,
73
+ "linear_value_head_dim": 128,
74
+ "max_position_embeddings": 262144,
75
+ "mlp_only_layers": [],
76
+ "moe_intermediate_size": 512,
77
+ "mtp_num_hidden_layers": 1,
78
+ "mtp_use_dedicated_embeddings": false,
79
+ "num_attention_heads": 16,
80
+ "num_experts": 256,
81
+ "num_experts_per_tok": 8,
82
+ "num_hidden_layers": 40,
83
+ "num_key_value_heads": 2,
84
+ "rms_norm_eps": 1e-06,
85
+ "router_aux_loss_coef": 0.001,
86
+ "shared_expert_intermediate_size": 512,
87
+ "use_cache": true,
88
+ "vocab_size": 251392,
89
+ "mamba_ssm_dtype": "float32",
90
+ "rope_parameters": {
91
+ "mrope_interleaved": true,
92
+ "mrope_section": [
93
+ 11,
94
+ 11,
95
+ 10
96
+ ],
97
+ "rope_type": "default",
98
+ "rope_theta": 10000000,
99
+ "partial_rotary_factor": 0.25
100
+ },
101
+ "pad_token_id": null,
102
+ "bos_token_id": null,
103
+ "tie_word_embeddings": false,
104
+ "output_router_logits": false,
105
+ "partial_rotary_factor": 0.25
106
+ },
107
+ "tie_word_embeddings": false,
108
+ "video_token_id": 248057,
109
+ "vision_config": {
110
+ "model_type": "intern_s2_preview",
111
+ "deepstack_visual_indexes": [],
112
+ "depth": 27,
113
+ "hidden_act": "gelu_pytorch_tanh",
114
+ "hidden_size": 1152,
115
+ "in_channels": 3,
116
+ "initializer_range": 0.02,
117
+ "intermediate_size": 4304,
118
+ "num_heads": 16,
119
+ "num_position_embeddings": 2304,
120
+ "out_hidden_size": 2048,
121
+ "patch_size": 16,
122
+ "spatial_merge_size": 2,
123
+ "temporal_patch_size": 2
124
+ },
125
+ "vision_end_token_id": 248054,
126
+ "vision_start_token_id": 248053,
127
+ "ts_config": {
128
+ "model_type": "interns2_preview_time_series",
129
+ "auto_map": {
130
+ "AutoConfig": "configuration_interns2_preview.InternS2PreviewTimeSeriesConfig",
131
+ "AutoModel": "modeling_interns2_preview.InternS2PreviewTimeSeriesModel"
132
+ },
133
+ "activation_dropout": 0.0,
134
+ "activation_function": "gelu",
135
+ "attention_dropout": 0.0,
136
+ "d_model": 768,
137
+ "dropout": 0.0,
138
+ "encoder_attention_heads": 8,
139
+ "encoder_ffn_dim": 3072,
140
+ "encoder_layerdrop": 0.0,
141
+ "encoder_layers": 17,
142
+ "max_source_positions": 1500,
143
+ "num_mel_bins": 80,
144
+ "out_hidden_size": 2048,
145
+ "scale_embedding": false,
146
+ "ts_adapt_in_dim": 256,
147
+ "ts_adapt_out_dim": 1024,
148
+ "ts_hidden_dim": 1024
149
+ },
150
+ "ts_token_id": 248093,
151
+ "ts_start_id": 248091,
152
+ "ts_end_id": 248092
153
+ }
configuration_interns2_preview.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/interns2_preview/modular_interns2_preview.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_interns2_preview.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
21
+ from transformers.modeling_rope_utils import RopeParameters
22
+
23
+
24
class InternS2PreviewVisionConfig(PreTrainedConfig):
    """Hyperparameters of the vision sub-model of InternS2-Preview.

    This config is nested under the ``vision_config`` key of the composite
    configuration (see ``base_config_key``). Every argument is stored
    unchanged as an attribute of the same name; how each value is consumed is
    decided by the modeling code, which is not visible from this file.

    Args:
        depth (`int`, *optional*, defaults to 27):
            Presumably the number of vision encoder blocks -- confirm against the modeling code.
        hidden_size (`int`, *optional*, defaults to 1152):
            Presumably the width of the vision encoder hidden states.
        hidden_act (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            Name of the activation function.
        intermediate_size (`int`, *optional*, defaults to 4304):
            Presumably the feed-forward width inside each vision block.
        num_heads (`int`, *optional*, defaults to 16):
            Presumably the number of attention heads per vision block.
        in_channels (`int`, *optional*, defaults to 3):
            Presumably the number of input image channels.
        patch_size (`int`, *optional*, defaults to 16):
            Presumably the spatial patch edge length in pixels.
        spatial_merge_size (`int`, *optional*, defaults to 2):
            Spatial merge factor (exact semantics live in the modeling code).
        temporal_patch_size (`int`, *optional*, defaults to 2):
            Temporal patch size (exact semantics live in the modeling code).
        out_hidden_size (`int`, *optional*, defaults to 3584):
            Presumably the width the vision features are projected to. NOTE(review): the shipped
            `config.json` overrides this default with 2048.
        num_position_embeddings (`int`, *optional*, defaults to 2304):
            Presumably the size of the learned position-embedding table.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Presumably the standard deviation used for weight initialization.
        **kwargs:
            Forwarded to [`PreTrainedConfig`].
    """

    model_type = "intern_s2_preview"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=27,
        hidden_size=1152,
        hidden_act="gelu_pytorch_tanh",
        intermediate_size=4304,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        out_hidden_size=3584,
        num_position_embeddings=2304,
        initializer_range=0.02,
        **kwargs,
    ):
        # The base class consumes the generic kwargs first; the vision-specific
        # fields are attached afterwards, exactly as in the original ordering.
        super().__init__(**kwargs)

        # Encoder geometry.
        self.depth = depth
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.out_hidden_size = out_hidden_size
        self.hidden_act = hidden_act

        # Patch / position handling.
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.spatial_merge_size = spatial_merge_size
        self.num_position_embeddings = num_position_embeddings

        # Initialization.
        self.initializer_range = initializer_range
58
+
59
+
60
class InternS2PreviewTextConfig(PreTrainedConfig):
    r"""
    Configuration of the [`InternS2PreviewTextModel`] language backbone, a Qwen3.5-MoE style model that mixes
    linear-attention and full-attention layers. According to the original authors, instantiating it with the
    defaults yields a configuration similar to
    [Qwen/Qwen3.5-35B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-Instruct).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 248320):
            Vocabulary size of the model, i.e. the number of different tokens representable by `inputs_ids`.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            Number of key/value heads used for Grouped Query Attention. With
            `num_key_value_heads=num_attention_heads` the model uses Multi Head Attention (MHA), with
            `num_key_value_heads=1` Multi Query Attention (MQA), otherwise GQA. When converting a multi-head
            checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling
            all the original heads within that group. For more details see [this
            paper](https://arxiv.org/pdf/2305.13245.pdf).
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            The non-linear activation function.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary with the RoPE configuration. It should contain a value for `rope_theta` and optionally
            scaling parameters for use with longer `max_position_embeddings`. The keys `mrope_section` and
            `mrope_interleaved` are excluded from the base-class RoPE validation.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        head_dim (`int`, *optional*, defaults to 256):
            Projection weights dimension in multi-head attention.
        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
            Kernel size of the convolution used in linear attention layers.
        linear_key_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each key head in linear attention.
        linear_value_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each value head in linear attention.
        linear_num_key_heads (`int`, *optional*, defaults to 16):
            Number of key heads used in linear attention layers.
        linear_num_value_heads (`int`, *optional*, defaults to 32):
            Number of value heads used in linear attention layers.
        moe_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the routed expert.
        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
            Intermediate size of the shared expert.
        num_experts_per_tok (`int`, *optional*, defaults to 8):
            Number of selected experts.
        num_experts (`int`, *optional*, defaults to 256):
            Number of routed experts.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        layer_types (`list[str]`, *optional*):
            Type of each layer (`"linear_attention"` or `"full_attention"`). When omitted, it is derived from the
            `full_attention_interval` keyword argument (default 4): every `full_attention_interval`-th layer is
            `"full_attention"`, all others are `"linear_attention"`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*):
            End of stream token id.
        **kwargs:
            Forwarded to [`PreTrainedConfig`]. `partial_rotary_factor` defaults to 0.25 when not supplied.

    ```python
    >>> from transformers import InternS2PreviewTextModel, InternS2PreviewTextConfig

    >>> # Initializing a Qwen3.5-MoE style configuration
    >>> configuration = InternS2PreviewTextConfig()

    >>> # Initializing a model from the Qwen3.5-35B-A3B style configuration
    >>> model = InternS2PreviewTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # NOTE: `model_type` deliberately stays `qwen3_5_moe_text`. Per the original authors, transformers keys some
    # hardcoded weight-renaming logic on `model_type`, so reusing the upstream value keeps weight loading through
    # `AutoModelForCausalLM.from_pretrained` correct.
    model_type = "qwen3_5_moe_text"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Tensor-parallel sharding plan, keyed by module-name pattern.
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
        "layers.*.mlp.experts.down_proj": "rowwise",
        "layers.*.mlp.shared_expert.gate_proj": "colwise",
        "layers.*.mlp.shared_expert.up_proj": "colwise",
        "layers.*.mlp.shared_expert.down_proj": "rowwise",
    }
    # Pipeline-parallel plan: module name -> (input names, output names).
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=248320,
        hidden_size=2048,
        num_hidden_layers=40,
        num_attention_heads=16,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
        linear_conv_kernel_dim=4,
        linear_key_head_dim=128,
        linear_value_head_dim=128,
        linear_num_key_heads=16,
        linear_num_value_heads=32,
        moe_intermediate_size=512,
        shared_expert_intermediate_size=512,
        num_experts_per_tok=8,
        num_experts=256,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        layer_types=None,
        pad_token_id: int | None = None,
        bos_token_id: int | None = None,
        eos_token_id: int | None = None,
        **kwargs,
    ):
        # Adjust the kwargs that the base class will see: skip the mrope-only
        # keys during RoPE validation and provide the partial-rotary default.
        kwargs["ignore_keys_at_rope_validation"] = {"mrope_section", "mrope_interleaved"}
        kwargs.setdefault("partial_rotary_factor", 0.25)  # assign default for BC

        # Special tokens and embedding tying.
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.tie_word_embeddings = tie_word_embeddings

        # Core transformer dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.rope_parameters = rope_parameters

        # Per-layer attention kind. Without an explicit list, every
        # `full_attention_interval`-th layer (1-based) uses full attention.
        if layer_types is None:
            interval = kwargs.get("full_attention_interval", 4)
            layer_types = [
                "full_attention" if (layer_idx + 1) % interval == 0 else "linear_attention"
                for layer_idx in range(num_hidden_layers)
            ]
        self.layer_types = layer_types
        layer_type_validation(self.layer_types, self.num_hidden_layers)

        # Linear-attention geometry.
        self.linear_conv_kernel_dim = linear_conv_kernel_dim
        self.linear_key_head_dim = linear_key_head_dim
        self.linear_value_head_dim = linear_value_head_dim
        self.linear_num_key_heads = linear_num_key_heads
        self.linear_num_value_heads = linear_num_value_heads

        # Mixture-of-experts settings.
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        # The base initializer runs last, after all attributes above are set.
        super().__init__(**kwargs)
258
+
259
+
260
class InternS2PreviewTimeSeriesConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternS2PreviewTimeSeriesModel`]. It is used to
    instantiate a InternS2PreviewTimeSeries model according to the specified arguments, defining the model
    architecture. It is nested under the `ts_config` key of the composite configuration.

    Every argument is stored unchanged as an attribute of the same name. The encoder-related argument names mirror
    those of a Whisper-style encoder; their exact use is decided by the modeling code, which is not part of this file.

    Args:
        activation_dropout (`float`, *optional*, defaults to 0.0):
            Presumably the dropout ratio applied after activations inside the encoder feed-forward blocks.
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Name of the activation function.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Presumably the dropout ratio for attention probabilities.
        d_model (`int`, *optional*, defaults to 768):
            Presumably the encoder hidden width.
        dropout (`float`, *optional*, defaults to 0.0):
            Presumably the general dropout ratio.
        encoder_attention_heads (`int`, *optional*, defaults to 8):
            Presumably the number of attention heads per encoder layer.
        encoder_ffn_dim (`int`, *optional*, defaults to 3072):
            Presumably the feed-forward width per encoder layer.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            Presumably the LayerDrop probability for the encoder.
        encoder_layers (`int`, *optional*, defaults to 17):
            Presumably the number of encoder layers.
        max_source_positions (`int`, *optional*, defaults to 1500):
            Presumably the maximum encoder input length.
        num_mel_bins (`int`, *optional*, defaults to 80):
            Name inherited from Whisper-style configs; its role for time-series input should be confirmed against
            the modeling code.
        out_hidden_size (`int`, *optional*, defaults to 2048):
            Presumably the width the time-series features are projected to.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Presumably whether embeddings are scaled.
        ts_adapt_in_dim (`int`, *optional*, defaults to 256):
            The input dimension of the time series adapter.
        ts_adapt_out_dim (`int`, *optional*, defaults to 1024):
            The output dimension of the time series adapter. Must equal `ts_hidden_dim`.
        ts_hidden_dim (`int`, *optional*, defaults to 1024):
            The hidden dimension of the time series model.
        **super_kwargs:
            Additional keyword arguments passed along to the base class [`PreTrainedConfig`]. NOTE(review): the
            previous docstring listed `ts_cnn_channels`, `ts_cnn_kernel_sizes`, `ts_cnn_strides`,
            `ts_cnn_paddings`, `ts_concat_subsampling_in_channels` and `ts_concat_subsampling_concat_size`; none of
            them is a parameter of this class, so if supplied they are only forwarded through `**super_kwargs`.

    Raises:
        AssertionError: If `ts_adapt_out_dim` differs from `ts_hidden_dim`.
    """

    model_type = "interns2_preview_time_series"
    base_config_key = "ts_config"

    def __init__(
        self,
        activation_dropout: float = 0.0,
        activation_function: str = "gelu",
        attention_dropout: float = 0.0,
        d_model: int = 768,
        dropout: float = 0.0,
        encoder_attention_heads: int = 8,
        encoder_ffn_dim: int = 3072,
        encoder_layerdrop: float = 0.0,
        encoder_layers: int = 17,
        max_source_positions: int = 1500,
        num_mel_bins: int = 80,
        out_hidden_size: int = 2048,
        scale_embedding: bool = False,
        ts_adapt_in_dim: int = 256,
        ts_adapt_out_dim: int = 1024,
        ts_hidden_dim: int = 1024,
        **super_kwargs,
    ):
        super().__init__(**super_kwargs)

        # Set after the base initializer so that these remote-code entry points
        # always win over any `auto_map` forwarded through `super_kwargs`.
        self.auto_map = {
            "AutoConfig": "configuration_interns2_preview.InternS2PreviewTimeSeriesConfig",
            "AutoModel": "modeling_interns2_preview.InternS2PreviewTimeSeriesModel",
        }
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.attention_dropout = attention_dropout
        self.d_model = d_model
        self.dropout = dropout
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layerdrop = encoder_layerdrop
        self.encoder_layers = encoder_layers
        self.max_source_positions = max_source_positions
        self.num_mel_bins = num_mel_bins
        self.out_hidden_size = out_hidden_size
        self.scale_embedding = scale_embedding
        self.ts_adapt_in_dim = ts_adapt_in_dim
        self.ts_adapt_out_dim = ts_adapt_out_dim
        self.ts_hidden_dim = ts_hidden_dim

        # Explicit raise instead of `assert`: an `assert` is removed under
        # `python -O`, which would silently accept an inconsistent config. The
        # exception type and message are kept identical to the original check.
        if self.ts_adapt_out_dim != self.ts_hidden_dim:
            raise AssertionError("ts_adapt_out_dim should be equal to ts_hidden_dim")
335
+
336
+
337
class InternS2PreviewConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternS2PreviewModel`]. It is a composite
    configuration that bundles a text backbone config, a vision config and a time-series config together with the
    special token ids that mark image, video and time-series content. According to the original authors,
    instantiating it with the defaults yields a configuration similar to
    [Qwen/Qwen3.5-35B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-Instruct).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Each sub-config argument may be `None` (the sub-config class is instantiated with its defaults), a `dict`
    (expanded as keyword arguments of the sub-config class) or an already constructed config object (used as is).

    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `InternS2PreviewTextConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `InternS2PreviewVisionConfig`):
            The config object or dictionary of the vision backbone.
        image_token_id (`int`, *optional*, defaults to 248056):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 248057):
            The video token index to encode the video prompt.
        vision_start_token_id (`int`, *optional*, defaults to 248053):
            The start token index to encode the image prompt.
        vision_end_token_id (`int`, *optional*, defaults to 248054):
            The end token index to encode the image prompt.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie the word embeddings.
        ts_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `InternS2PreviewTimeSeriesConfig`):
            The config object or dictionary of the time-series encoder.
        ts_token_id (`int`, *optional*, defaults to 248093):
            Presumably the placeholder token index for time-series content -- confirm against the processor.
        ts_start_id (`int`, *optional*, defaults to 248091):
            Presumably the token index that opens a time-series span.
        ts_end_id (`int`, *optional*, defaults to 248092):
            Presumably the token index that closes a time-series span.
        **kwargs:
            Forwarded to [`PreTrainedConfig`].

    ```python
    >>> from transformers import InternS2PreviewForConditionalGeneration, InternS2PreviewConfig

    >>> # Initializing a Qwen3.5-MoE style configuration
    >>> configuration = InternS2PreviewConfig()

    >>> # Initializing a model from the Qwen3.5-35B-A3B style configuration
    >>> model = InternS2PreviewForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "intern_s2_preview"
    sub_configs = {
        "vision_config": InternS2PreviewVisionConfig,
        "text_config": InternS2PreviewTextConfig,
        "ts_config": InternS2PreviewTimeSeriesConfig,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=248056,
        video_token_id=248057,
        vision_start_token_id=248053,
        vision_end_token_id=248054,
        tie_word_embeddings=False,
        ts_config=None,
        ts_token_id=248093,
        ts_start_id=248091,
        ts_end_id=248092,
        **kwargs,
    ):
        # Each sub-config accepts a dict, None, or a ready-made config object.
        # The final `else` branches were missing originally, so passing a
        # config instance left the attribute unset.
        if isinstance(ts_config, dict):
            self.ts_config = self.sub_configs["ts_config"](**ts_config)
        elif ts_config is None:
            self.ts_config = self.sub_configs["ts_config"]()
        else:
            self.ts_config = ts_config

        self.ts_token_id = ts_token_id
        self.ts_start_id = ts_start_id
        self.ts_end_id = ts_end_id

        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            self.text_config = self.sub_configs["text_config"]()
        else:
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id
        self.tie_word_embeddings = tie_word_embeddings
        super().__init__(**kwargs)

        # Assigned after the base initializer so the remote-code entry points
        # and architecture name override anything forwarded through `kwargs`.
        self.auto_map = {
            "AutoConfig": "configuration_interns2_preview.InternS2PreviewConfig",
            "AutoModelForCausalLM": "modeling_interns2_preview.InternS2PreviewForCausalLM",
            "AutoModel": "modeling_interns2_preview.InternS2PreviewModel",
            "AutoModelForImageTextToText": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration",
            "AutoModelForMultimodalLM": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration",
        }
        self.architectures = ["InternS2PreviewForConditionalGeneration"]
432
+
433
+
434
# Public API of this module. The vision and time-series configs are listed as
# well: both are sub-configs of `InternS2PreviewConfig`, and the time-series
# config advertises its own `AutoConfig` entry point in `auto_map`.
__all__ = [
    "InternS2PreviewConfig",
    "InternS2PreviewTextConfig",
    "InternS2PreviewTimeSeriesConfig",
    "InternS2PreviewVisionConfig",
]
figs/efficiency.jpg ADDED

Git LFS Details

  • SHA256: 2d7b1336523b6fe067a513fab92964c30c7a28a682a0debed4402041092bd8de
  • Pointer size: 131 Bytes
  • Size of remote file: 182 kB
figs/title.png ADDED

Git LFS Details

  • SHA256: 1e0080637b1009715c78ad8fb9b00f2355282b79e9e332100b8943f1a17eb33c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 248044,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 248046,
6
+ 248044
7
+ ],
8
+ "pad_token_id": 248044,
9
+ "temperature": 1.0,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.57.0.dev0"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:995a45fa3692bd35364745a7b96acdaa631a6c3ffb8a4102edfcbf39229ac5af
3
+ size 3920272400
model-00002-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c629deac2d23507b1b324096a42f61129f815f2e52621f82e5365fe585580fa
3
+ size 3357898400
model-00003-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:063764d67002b7e6bb21c0937dbe4ed90a0355cab570c878a33411ded935bea6
3
+ size 3370808760
model-00004-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50dd6754c2c07aa914817ead2ec53699d8f8b9f1405665d814566281c1951043
3
+ size 3357898400
model-00005-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7bcd4ca7a6eb1c349170c00858a52e526268b0b305590124e20543750ba1669
3
+ size 3370808760
model-00006-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b0c9c45273dac6f35d61ddf45f4b27a5e81203adda07d677f53680f46c89c3
3
+ size 3357898360
model-00007-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aa57c51a4742dc646e060a9047b8daf12911f69dd4cc7ff98dfb1d32912a24f
3
+ size 3370808792
model-00008-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37c11f3eb529963a1a9c7511a4aeddb45d6182363a42f9dd6b144e0c3f9114b6
3
+ size 3357898432
model-00009-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78728fb1a8ca22a82f6f293cd3bda71e712cd17dd5b55ad8746d7e317aacafa4
3
+ size 3370808792
model-00010-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:455404e35f8e0c488eb08196fe27568a13f9f4b36268039d8f37572149a500ef
3
+ size 3357898432
model-00011-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef470a402662cf45b36d8b12e7dcea6192ff9de7612bd60829410e40e9a4084e
3
+ size 3370808792
model-00012-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9490b4f09f2420deeb254209fa68cdc9339860560d26037ef3b56bc1733b39eb
3
+ size 3357898432
model-00013-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb516812b38f42c775f7b6681496129d49f88f3d6874d7ff101580fe0073ab2
3
+ size 3370808792
model-00014-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:779b5ee835558166adb60e1e187bf44c1a862b693f94a4a4fec2d59a463be0d9
3
+ size 3357898432
model-00015-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18575d53b0c3708ffbbb14a2e9008a1fa0ec426887ab547ba765e3325436dbdd
3
+ size 3370808792
model-00016-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e24ce8453b89b8fb6a390358080250d6a6dd60b3190984d7ac9e920711ae33c
3
+ size 3357898432
model-00017-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8020324419f3ad5f03812594d6ee152c5a869c71077230649321aaf5a76bd9b
3
+ size 3370808792
model-00018-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f3f9808fd16522e225e30bf1654c366de9827dd9261cbf5bf2bdf6a5b944232
3
+ size 3357898432
model-00019-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64ca6cea07284e2d43ec4fb32e78941b9e505178d3e870891449bf0b73e2f281
3
+ size 3370808792
model-00020-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd30d6f3db1c81b3ce74e994aa0604781218d8f307203130d768d967428f5b5b
3
+ size 3357898432
model-00021-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b725bd593e946e8242f9e8585f4f7b53c4f6ba1795244f45b9a0ca458714b232
3
+ size 4288914488
model-00022-of-00022.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b609eeea1d0fcbdd1a5fd9d2979f5c4b331caa4fef49432c096795db90ae1ddd
3
+ size 833287792
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_interns2_preview.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "size": {
3
+ "longest_edge": 16777216,
4
+ "shortest_edge": 65536
5
+ },
6
+ "patch_size": 16,
7
+ "temporal_patch_size": 2,
8
+ "merge_size": 2,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "Qwen3VLProcessor",
20
+ "image_processor_type": "Qwen2VLImageProcessorFast",
21
+ "auto_map": {
22
+ "AutoProcessor": "processing_interns2_preview.InternS2PreviewProcessor"
23
+ }
24
+ }
processing_interns2_preview.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/interns2_preview/modular_interns2_preview.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_interns2_preview.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ import importlib
21
+ import os
22
+
23
+ import numpy as np
24
+
25
+ from transformers.feature_extraction_utils import BatchFeature
26
+ from transformers.image_utils import ImageInput
27
+ from transformers.processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
28
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
29
+ from transformers.utils import auto_docstring, logging
30
+ from transformers.video_utils import VideoInput
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
class InternS2PreviewProcessorKwargs(ProcessingKwargs, total=False):
    # Per-modality default keyword arguments. `InternS2PreviewProcessor.__call__` passes this
    # class to `self._merge_kwargs`, and `_get_num_multimodal_tokens` reads `_defaults` directly.
    _defaults = {
        "text_kwargs": {
            "padding": False,
            "return_token_type_ids": False,
            # `__call__` pops this flag before tokenizing; when truthy it adds
            # `mm_token_type_ids` marking the image-token positions.
            "return_mm_token_type_ids": False,
        },
        # `__call__` always reads `video_metadata` from the video processor output to build the
        # per-frame timestamps, so metadata is requested by default.
        "videos_kwargs": {"return_metadata": True},
        # NOTE(review): no time-series defaults are defined; presumably a placeholder -- confirm.
        "time_series_kwargs": {},
    }
46
+
47
+
48
+ @auto_docstring
49
+ class InternS2PreviewProcessor(ProcessorMixin):
50
def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
    """
    Build the processor and cache the special tokens (and their ids) used to expand prompts.

    Every special token is taken from the tokenizer attribute of the same name when the tokenizer
    defines it, and otherwise falls back to the literal default given below. Every id is taken from
    the tokenizer's matching `*_id` attribute when that attribute is set, and is otherwise resolved
    with `tokenizer.convert_tokens_to_ids`.

    Args:
        image_processor (*optional*):
            Forwarded to `ProcessorMixin.__init__`.
        tokenizer (*optional*):
            Forwarded to `ProcessorMixin.__init__`; also the source of the special tokens and ids.
        video_processor (*optional*):
            Forwarded to `ProcessorMixin.__init__`.
        chat_template (*optional*):
            Forwarded to `ProcessorMixin.__init__`.
        **kwargs:
            Accepted for signature compatibility; not used by this method.
    """

    def special_token(attribute, default):
        # The literal default is used only when the tokenizer does not define the attribute.
        return getattr(tokenizer, attribute, default)

    def special_token_id(attribute, token):
        # Test against None instead of truthiness so that a legitimate id of 0 is kept as-is.
        token_id = getattr(tokenizer, attribute, None)
        return tokenizer.convert_tokens_to_ids(token) if token_id is None else token_id

    # The image/video tokens are assigned before the base initializer runs (original ordering).
    self.image_token = special_token("image_token", "<|image_pad|>")
    self.video_token = special_token("video_token", "<|video_pad|>")
    self.image_token_id = special_token_id("image_token_id", self.image_token)
    self.video_token_id = special_token_id("video_token_id", self.video_token)
    super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)

    # Delimiters wrapped around each expanded video frame in `__call__`.
    self.vision_start_token = special_token("vision_start_token", "<|vision_start|>")
    self.vision_end_token = special_token("vision_end_token", "<|vision_end|>")
    self.vision_start_token_id = special_token_id("vision_start_token_id", self.vision_start_token)
    self.vision_end_token_id = special_token_id("vision_end_token_id", self.vision_end_token)

    # Time-series placeholder and its delimiters.
    self.ts_token = special_token("ts_token", "<TS_CONTEXT>")
    self.ts_start_token = special_token("ts_start_token", "<|ts|>")
    self.ts_end_token = special_token("ts_end_token", "<|/ts|>")
    self.ts_start_token_id = special_token_id("ts_start_token_id", self.ts_start_token)
    self.ts_end_token_id = special_token_id("ts_end_token_id", self.ts_end_token)
    self.ts_token_id = special_token_id("ts_token_id", self.ts_token)
98
+
99
+ @auto_docstring
100
def __call__(
    self,
    images: ImageInput = None,
    text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
    videos: VideoInput = None,
    time_series_paths: list[str] = None,
    time_series_sampling_rates: list[int] = None,
    **kwargs: Unpack[InternS2PreviewProcessorKwargs],
) -> BatchFeature:
    r"""
    Returns:
        [`BatchFeature`]: A [`BatchFeature`] with the following fields:

        - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
        - **ts_values** -- List of time series values to be fed to a model. Returned when `time_series_paths` is not `None`.
        - **ts_sr** -- List of time series sampling rates to be fed to a model. Returned when `time_series_paths` is not `None`.
        - **ts_lens** -- List of time series lengths to be fed to a model. Returned when `time_series_paths` is not `None`.
        - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
          `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
          `None`).
        - **mm_token_type_ids** -- Marks image-token positions with `1`. Returned when `return_mm_token_type_ids=True`.
        - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
        - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
        - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
    """
    output_kwargs = self._merge_kwargs(
        InternS2PreviewProcessorKwargs,
        tokenizer_init_kwargs=self.tokenizer.init_kwargs,
        **kwargs,
    )
    if images is not None:
        image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
        image_grid_thw = image_inputs["image_grid_thw"]
    else:
        image_inputs = {}
        image_grid_thw = None

    if videos is not None:
        videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
        video_grid_thw = videos_inputs["video_grid_thw"]
        # If user has not requested video metadata, pop it
        if not kwargs.get("return_metadata"):
            video_metadata = videos_inputs.pop("video_metadata")
        else:
            video_metadata = videos_inputs["video_metadata"]
    else:
        videos_inputs = {}
        video_grid_thw = None

    if not isinstance(text, list):
        text = [text]

    text = text.copy()  # below lines change text in-place

    if time_series_paths is not None:
        # Explicit exceptions instead of `assert`, which is stripped when Python runs with -O.
        if time_series_sampling_rates is None:
            raise ValueError(
                "If time_series_paths is provided, time_series_sampling_rates must also be provided."
            )
        if len(time_series_paths) != len(time_series_sampling_rates):
            raise ValueError("The number of time series paths must match the number of sampling rates.")
        time_series_inputs = self.time_series_processor(
            ts_paths=time_series_paths, sampling_rates=time_series_sampling_rates
        )
        # `num_ts_tokens` is only needed for placeholder expansion; it is not part of the output.
        num_ts_tokens = time_series_inputs.pop("num_ts_tokens")
        if len(num_ts_tokens) != len(text):
            raise ValueError("The number of time series signals must match the number of text prompts.")
        for i in range(len(text)):
            if f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}" in text[i]:
                # Only the first delimited placeholder of the prompt is expanded.
                ts_placeholder = self.ts_start_token + self.ts_token * num_ts_tokens[i] + self.ts_end_token
                text[i] = text[i].replace(
                    f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}", ts_placeholder, 1
                )
            elif self.ts_token in text[i]:
                # Bare placeholder without delimiters: every occurrence is expanded.
                text[i] = text[i].replace(self.ts_token, self.ts_token * num_ts_tokens[i])
    else:
        time_series_inputs = {}

    if image_grid_thw is not None:
        merge_length = self.image_processor.merge_size**2
        index = 0
        for i in range(len(text)):
            while self.image_token in text[i]:
                num_image_tokens = image_grid_thw[index].prod() // merge_length
                # A temporary marker keeps the `while` condition from re-matching expanded tokens.
                text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                index += 1
            text[i] = text[i].replace("<|placeholder|>", self.image_token)

    if video_grid_thw is not None:
        merge_length = self.video_processor.merge_size**2
        index = 0
        for i in range(len(text)):
            while self.video_token in text[i]:
                metadata = video_metadata[index]
                if metadata.fps is None:
                    logger.warning_once(
                        "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
                        "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
                        "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
                    )
                metadata.fps = 24 if metadata.fps is None else metadata.fps

                # if timestamps are not provided, calculate them
                curr_timestamp = self._calculate_timestamps(
                    metadata.frames_indices,
                    metadata.fps,
                    self.video_processor.temporal_patch_size,
                )

                # Each temporal patch becomes "<T seconds>" followed by its delimited frame tokens.
                video_placeholder = ""
                frame_seqlen = video_grid_thw[index][1:].prod() // merge_length
                for frame_idx in range(video_grid_thw[index][0]):
                    curr_time = curr_timestamp[frame_idx]
                    video_placeholder += f"<{curr_time:.1f} seconds>"
                    video_placeholder += (
                        self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
                    )
                if f"{self.vision_start_token}{self.video_token}{self.vision_end_token}" in text[i]:
                    text[i] = text[i].replace(
                        f"{self.vision_start_token}{self.video_token}{self.vision_end_token}", video_placeholder, 1
                    )
                else:
                    # vllm may input video token directly
                    text[i] = text[i].replace(self.video_token, video_placeholder, 1)
                index += 1

            text[i] = text[i].replace("<|placeholder|>", self.video_token)

    return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
    return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
    text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
    self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video", "ts"])

    if return_mm_token_type_ids:
        array_ids = np.array(text_inputs["input_ids"])
        mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
        mm_token_type_ids[array_ids == self.image_token_id] = 1
        text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()

    return BatchFeature(
        data={**text_inputs, **image_inputs, **videos_inputs, **time_series_inputs}, tensor_type=return_tensors
    )
244
+
245
def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
    """
    Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
    Args:
        image_sizes (`list[list[int]]`, *optional*):
            The input sizes formatted as (height, width) per each image.
        video_sizes (`list[list[int]]`, *optional*):
            The input sizes formatted as (num_frames, height, width) per each video.
    Returns:
        `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
        input modalities, along with other useful data.
    """

    vision_data = {}
    if image_sizes is not None:
        # Merge into a fresh dict so the class-level `_defaults` are never mutated by `kwargs`.
        images_kwargs = {**InternS2PreviewProcessorKwargs._defaults.get("images_kwargs", {}), **kwargs}
        merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size

        num_image_patches = [
            self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
            for image_size in image_sizes
        ]
        num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
        vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})

    if video_sizes is not None:
        videos_kwargs = {**InternS2PreviewProcessorKwargs._defaults.get("videos_kwargs", {}), **kwargs}
        # Resolved independently of the image branch so a video-only call does not hit an
        # unbound `merge_size`; `__call__` likewise uses the video processor's merge size.
        merge_size = videos_kwargs.get("merge_size", None) or self.video_processor.merge_size
        num_video_patches = [
            self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
            for video_size in video_sizes
        ]
        num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
        vision_data["num_video_tokens"] = num_video_tokens

    return MultiModalData(**vision_data)
282
+
283
def post_process_image_text_to_text(
    self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
):
    """
    Decode generated token ids back into text.

    Args:
        generated_outputs (`torch.Tensor` or `np.ndarray`):
            Output of the model's `generate` call, shaped `(batch_size, sequence_length)` or
            `(sequence_length,)`.
        skip_special_tokens (`bool`, *optional*, defaults to `True`):
            Forwarded to the tokenizer's `batch_decode`; controls whether special tokens are removed.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded to the tokenizer's `batch_decode`; controls tokenization-space clean-up.
        **kwargs:
            Any further keyword arguments for the tokenizer's `batch_decode`.

    Returns:
        `list[str]`: The decoded text.
    """
    decode_options = {
        "skip_special_tokens": skip_special_tokens,
        "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
        **kwargs,
    }
    decoded = self.tokenizer.batch_decode(generated_outputs, **decode_options)
    return decoded
309
+
310
+ def _calculate_timestamps(self, indices: list[int] | np.ndarray, video_fps: float, merge_size: int = 2):
311
+ if not isinstance(indices, list):
312
+ indices = indices.tolist()
313
+ if len(indices) % merge_size != 0:
314
+ indices.extend(indices[-1] for _ in range(merge_size - len(indices) % merge_size))
315
+ timestamps = [idx / video_fps for idx in indices]
316
+ # @JJJYmmm frames are merged by self.merge_size, \
317
+ # so we need to average the timestamps between the first/last frame within the temporal patch
318
+ timestamps = [
319
+ (timestamps[i] + timestamps[i + merge_size - 1]) / 2 for i in range(0, len(timestamps), merge_size)
320
+ ]
321
+ return timestamps
322
+
323
def time_series_preprocessor(self, conversation):
    """
    Collect the time-series file references found in the user turns of a chat conversation.

    Args:
        conversation:
            Either a single conversation (a sequence of message dicts with `"role"` and
            `"content"`) or a batch of such conversations.

    Returns:
        `dict`: `{"time_series_paths": ..., "time_series_sampling_rates": ...}`. Both values are
        lists of equal length, or `None` when no time-series entry was found. Each path is paired
        with the `"sampling_rate"` of the same content entry (`None` when that entry has none).
    """
    is_batched = (
        isinstance(conversation, (list, tuple))
        and len(conversation) > 0  # an empty conversation has no first element to inspect
        and (isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content"))
    )
    conversations = conversation if is_batched else [conversation]

    batch_time_series = []
    batch_time_series_metadata = []
    for messages in conversations:
        for message in messages:
            if message["role"] != "user":
                continue
            content = message["content"]
            if not isinstance(content, (list, tuple)):
                # Plain-string content cannot carry structured time-series entries.
                continue
            for item in content:
                if not isinstance(item, dict):
                    continue
                # Path and rate are read from the same entry so they can never get misaligned
                # when some time-series entry lacks "data".
                if item.get("type") == "time_series" and "data" in item:
                    batch_time_series.append(item["data"])
                    batch_time_series_metadata.append(item.get("sampling_rate", None))

    return {
        "time_series_paths": batch_time_series or None,
        "time_series_sampling_rates": batch_time_series_metadata or None,
    }
355
+
356
def time_series_processor(
    self,
    ts_paths: list[str],
    sampling_rates: list[float],
    do_normalize=True,
    do_truncate=True,
) -> BatchFeature:
    """
    Load time-series files and compute how many placeholder tokens each one needs.

    Args:
        ts_paths (`list[str]`):
            Files to load. Supported extensions: `.wav`, `.mp3`, `.flac` (read with `soundfile`),
            `.csv` (read with `pandas`, no header row) and `.npy` (read with `numpy`).
        sampling_rates (`list[float]`):
            One sampling rate per path. For audio files the rate reported by `soundfile` replaces
            it. A rate of `None` or `0` is replaced by `length / 4`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Standardize each series along axis 0 (zero mean, unit variance).
        do_truncate (`bool`, *optional*, defaults to `True`):
            Keep at most the first 240000 steps of each series.

    Returns:
        [`BatchFeature`]: with `ts_values` (list of `[T, C]` arrays), `ts_sr`, `ts_lens` and
        `num_ts_tokens`.

    Raises:
        ValueError: If the two input lists differ in length or a file extension is unsupported.
    """
    if len(ts_paths) != len(sampling_rates):
        raise ValueError("ts_paths and sampling_rates must have the same length")

    ts_values = []
    ts_sr = []
    ts_lens = []

    for ts_path, sr in zip(ts_paths, sampling_rates):
        ext = os.path.splitext(ts_path)[-1].lower()
        if ext in [".wav", ".mp3", ".flac"]:
            # Optional dependency, imported only when an audio file is actually read.
            sf = importlib.import_module("soundfile")
            ts_input, sr = sf.read(ts_path)  # ts_input: np.ndarray, shape [T] or [T, C]
        elif ext == ".csv":
            # Optional dependency, imported only when a CSV file is actually read.
            pd = importlib.import_module("pandas")
            df = pd.read_csv(ts_path, header=None)
            ts_input = df.values  # [T, C]
        elif ext == ".npy":
            ts_input = np.load(ts_path)  # [T, C]
        else:
            raise ValueError(f"Unsupported file format: {ext}")

        if not isinstance(ts_input, np.ndarray):
            ts_input = np.array(ts_input, dtype=np.float32)

        if do_normalize:
            # Statistics are taken over the full series, before any truncation.
            mean = ts_input.mean(axis=0, keepdims=True)
            std = ts_input.std(axis=0, keepdims=True)
            ts_input = (ts_input - mean) / (std + 1e-8)

        if do_truncate and len(ts_input) > 240000:
            ts_input = ts_input[:240000]  # truncate to 240k to avoid oom

        if ts_input.ndim == 1:
            ts_input = ts_input[:, None]  # [T,C]

        ts_len = ts_input.shape[0]

        if sr is None or sr == 0:  # if no sr provided
            sr = ts_len / 4

        ts_values.append(ts_input)
        ts_sr.append(sr)
        ts_lens.append(ts_len)

    ts_lens = np.array(ts_lens)
    ts_sr = np.array(ts_sr)
    num_ts_tokens = self._get_num_ts_tokens(sampling_rates=ts_sr, ts_lens=ts_lens)
    return BatchFeature(
        data={"ts_values": ts_values, "ts_sr": ts_sr, "ts_lens": ts_lens, "num_ts_tokens": num_ts_tokens}
    )
414
+
415
+ def _get_num_ts_tokens(self, sampling_rates, ts_lens):
416
+ strides = np.floor(160 / ((1 + np.exp(-sampling_rates / 100)) ** 6))
417
+ patch_sizes = strides * 2
418
+ embed_lengths = (np.ceil((ts_lens - patch_sizes) / strides) + 1).astype(np.int64)
419
+ num_ts_tokens = [(embed_length // 2 + 1) // 2 for embed_length in embed_lengths]
420
+ return num_ts_tokens
421
+
422
+
423
+ __all__ = ["InternS2PreviewProcessor"]
special_tokens_map.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "audio_bos_token": "<|audio_start|>",
18
+ "audio_eos_token": "<|audio_end|>",
19
+ "audio_token": "<|audio_pad|>",
20
+ "bos_token": {
21
+ "content": "<|im_start|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "eos_token": {
28
+ "content": "<|im_end|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ },
34
+ "image_token": "<|image_pad|>",
35
+ "pad_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ },
42
+ "video_token": "<|video_pad|>",
43
+ "vision_bos_token": "<|vision_start|>",
44
+ "vision_eos_token": "<|vision_end|>"
45
+ }
tokenization_interns1.py ADDED
@@ -0,0 +1,1009 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 The Intern team and Shanghai AI Lab team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for InternS1."""
16
+
17
+ import json
18
+ import os
19
+ import unicodedata
20
+ from abc import ABC, abstractmethod
21
+ from typing import Optional, Union
22
+ from functools import lru_cache
23
+
24
+ import regex as re
25
+ import sentencepiece as spm
26
+
27
+ from transformers.tokenization_utils_base import AddedToken, TextInput
28
+ from transformers.utils import logging
29
+ from packaging import version
30
+ import transformers
31
+ if version.parse(transformers.__version__) >= version.parse("5.0.0"):
32
+ from transformers.tokenization_python import PreTrainedTokenizer
33
+ else:
34
+ from transformers.tokenization_utils import PreTrainedTokenizer
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
# RDKit is an optional dependency. `RDKIT_AVAILABLE` is read by `SmilesCheckModule.check_legitimacy`
# to choose between RDKit parsing and the rule-based fallback.
try:
    from rdkit import Chem, RDLogger

    # Turn off RDKit's own logging for the categories named below.
    RDLogger.DisableLog("rdApp.error")
    RDLogger.DisableLog("rdApp.*")
    RDKIT_AVAILABLE = True
except ImportError:
    logger.warning_once(
        "If tokenization with SMILES formula is of necessity, please 'pip install RDKit' for better tokenization quality."
    )
    RDKIT_AVAILABLE = False

# Logical resource name -> file name of each tokenizer asset.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "sp_model_SMILES": "tokenizer_SMILES.model",
    "sp_model_PROT": "tokenizer_PROT.model",
    "sp_model_XNA": "tokenizer_XNA.model",
}

# Pre-tokenization pattern. It relies on `\p{L}` / `\p{N}` Unicode property classes, which the
# third-party `regex` package (imported above as `re`) supports.
PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
59
+
60
+
61
class InternS1CheckModuleMixin(ABC):
    """
    Shared machinery for the sequence auto-detection modules.

    A subclass supplies a regex (`_build_regex`) whose first capture group is a candidate sequence
    and a validator (`check_legitimacy`). `re_split` then cuts text around every accepted candidate
    and wraps it in the subclass's pair of `auto_detect_token` markers. Candidates shorter than
    `min_length` are never wrapped.
    """

    def __init__(self, *, min_length: int):
        # `min_length` must be stored first: subclasses read it inside `_build_regex`.
        self.min_length = min_length
        self.REGEX = self._build_regex()
        # Markers of every detector; text enclosed by any such pair is passed through untouched.
        self.all_auto_detect_token_start = ["<SMILES_AUTO_DETECT>", "<PROT_AUTO_DETECT>", "<XNA_AUTO_DETECT>"]
        self.all_auto_detect_token_end = ["</SMILES_AUTO_DETECT>", "</PROT_AUTO_DETECT>", "</XNA_AUTO_DETECT>"]
        # Subclasses overwrite these two after calling this initializer.
        self.auto_detect_token = []
        self.truncation = False

    @abstractmethod
    def _build_regex(self):
        """Return the compiled pattern; capture group 1 must be the candidate sequence."""

    @abstractmethod
    def check_legitimacy(self, candidate: str) -> bool:
        """Return whether `candidate` should be treated as a real sequence."""

    def re_split(self, texts: Union[str, list[str]]) -> list[str]:
        """
        Split `texts` into pieces, wrapping every accepted candidate in the detector's markers.

        Args:
            texts (`str` or `list[str]`): One string, or a list of already-split segments.

        Returns:
            `list[str]`: The segments in order, with `[start_marker, candidate, end_marker]`
            inserted for every accepted candidate.
        """
        segments = [texts] if isinstance(texts, str) else texts

        output = []
        open_spans = 0  # > 0 while inside a region some detector has already marked

        for segment in segments:
            if segment in self.all_auto_detect_token_start:
                output.append(segment)
                open_spans += 1
                continue
            if segment in self.all_auto_detect_token_end:
                output.append(segment)
                open_spans = max(0, open_spans - 1)
                continue
            if open_spans > 0:
                output.append(segment)
                continue

            pieces = []
            cursor = 0
            for found in self.REGEX.finditer(segment):
                candidate = found.group(1)
                if len(candidate) < self.min_length:
                    continue

                start, end = found.span(1)
                if not self.check_legitimacy(candidate):
                    continue

                if not self.truncation:
                    # Without truncation a candidate glued to an ASCII letter on either side is
                    # rejected (`bytes.isalpha` is only true for single-byte ASCII letters).
                    if start > 0 and segment[start - 1].encode("UTF-8").isalpha():
                        continue
                    if end < len(segment) and segment[end].encode("UTF-8").isalpha():
                        continue

                # A candidate starting exactly at the cursor (start of the segment, or directly
                # after the previous accepted candidate) is left unwrapped.
                if start <= cursor:
                    continue

                pieces.append(segment[cursor:start])
                pieces.extend([self.auto_detect_token[0], candidate, self.auto_detect_token[1]])
                cursor = end

            if cursor < len(segment):
                pieces.append(segment[cursor:])

            output.extend(pieces)

        return output
139
+
140
+
141
class XnaCheckModule(InternS1CheckModuleMixin):
    """
    Auto-detection of XNA sequences.

    A candidate is any run of at least `min_length` characters drawn from `A`, `T`, `C`, `G`, `U`;
    every such run is accepted without further validation.
    """

    def __init__(self, *, min_length: int = 27):
        super().__init__(min_length=min_length)
        # Candidates may be cut out of a longer alphabetic run.
        self.truncation = True
        self.auto_detect_token = ["<XNA_AUTO_DETECT>", "</XNA_AUTO_DETECT>"]

    def _build_regex(self):
        pattern = rf"([ATCGU]{{{self.min_length},}})"
        return re.compile(pattern)

    def check_legitimacy(self, candidate: str):
        # The regex alone decides; nothing else is checked.
        return True
157
+
158
+
159
class ProtCheckModule(InternS1CheckModuleMixin):
    """
    Auto-detection of protein sequences.

    A candidate is any run of at least `min_length` uppercase ASCII letters, unless the run consists
    solely of `A`, `T`, `C`, `G`, `U` (the alphabet matched by the XNA detector).
    """

    def __init__(self, *, min_length: int = 27):
        super().__init__(min_length=min_length)
        # Candidates may be cut out of a longer alphabetic run.
        self.truncation = True
        self.auto_detect_token = ["<PROT_AUTO_DETECT>", "</PROT_AUTO_DETECT>"]
        self._xna_pattern = re.compile(r"^[ATCGU]+$")

    def _build_regex(self):
        pattern = rf"([A-Z]{{{self.min_length},}})"
        return re.compile(pattern)

    def check_legitimacy(self, candidate: str):
        # Reject runs made up only of the XNA alphabet.
        return self._xna_pattern.match(candidate) is None
178
+
179
+
180
# Character tables for the rule-based SMILES checks in `SmilesCheckModule`.
# fmt: off
# Bond characters: `check_rings_and_brackets` rejects a candidate that starts or ends with one,
# or that contains two of them in a row.
bonds = ["-", "=", "#", ":", "/", "\\", ".", "$"]
organic_symbols = ["B", "C", "N", "O", "P", "S", "F", "Cl", "Br", "I"]
other_allows = bonds + ["[", "]", "(", ")", ";"]
# Lowercase atom symbols; consulted by `check_brackets` together with `elements`.
aromatic_symbols = ["b", "c", "n", "o", "s", "p"]
# Element symbols from H to Og, in periodic-table order.
elements = [
    "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
    "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca",
    "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr",
    "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
    "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
    "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
    "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
    "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th",
    "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
    "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
    "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
]
# fmt: on
200
+
201
+
202
+ class SmilesCheckModule(InternS1CheckModuleMixin):
203
+ """
204
+ SMILES molecular sequence auto-detection module.
205
+
206
+ Automatically detects and validates SMILES strings in text using regex patterns
207
+ or chemical syntax rules. Uses RDKit for precise validation when available,
208
+ otherwise falls back to rule-based validation.
209
+ """
210
+
211
+ def __init__(self, *, min_length: int = 10):
212
+ super().__init__(min_length=min_length)
213
+ self.auto_detect_token = ["<SMILES_AUTO_DETECT>", "</SMILES_AUTO_DETECT>"]
214
+ self._SQ_BRACKET_BAN_1 = re.compile(r"(?:[A-GI-Z]|[a-z]){3,}")
215
+ self._SQ_BRACKET_BAN_2 = re.compile(r"\d{4,}")
216
+
217
+ def _build_regex(self):
218
+ # fmt: off
219
+ _two_letter_elements = [
220
+ 'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'Ba', 'Be', 'Bh', 'Bi', 'Bk', 'Br', 'Ca', 'Cd',
221
+ 'Ce', 'Cf', 'Cl', 'Cm', 'Cn', 'Co', 'Cr', 'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'Fe',
222
+ 'Fl', 'Fm', 'Fr', 'Ga', 'Gd', 'Ge', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'In', 'Ir', 'Kr', 'La', 'Li',
223
+ 'Lr', 'Lu', 'Lv', 'Mc', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'Na', 'Nb', 'Nd', 'Ne', 'Nh', 'Ni', 'No',
224
+ 'Np', 'Og', 'Os', 'Pa', 'Pb', 'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg',
225
+ 'Rh', 'Rn', 'Ru', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Tc', 'Te', 'Th',
226
+ 'Ti', 'Tl', 'Tm', 'Ts', 'Xe', 'Yb', 'Zn', 'Zr'
227
+ ]
228
+ _single_letter_elements = [
229
+ "B", "C", "F", "H", "I", "K", "N", "O", "P", "S", "U", "V", "W", "Y", 'b', 'c', 'n', 'o', 'p', 's'
230
+ ]
231
+ # fmt: on
232
+ all_elements_sorted = sorted(_two_letter_elements + _single_letter_elements, key=lambda x: (-len(x), x))
233
+ elements_pattern_str = "|".join(all_elements_sorted)
234
+
235
+ bracket_atom_pattern_str = r"\[[^\]]+\]"
236
+ other_single_chars_pattern_str = r"[\(\)\.=\-#@\d\$\%\*:\+\-\/\\]"
237
+ smiles_unit_pattern = (
238
+ r"(?:"
239
+ + bracket_atom_pattern_str
240
+ + r"|"
241
+ + elements_pattern_str
242
+ + r"|"
243
+ + other_single_chars_pattern_str
244
+ + r")"
245
+ )
246
+ core_sequence_pattern = rf"(?>{smiles_unit_pattern}){{10,}}"
247
+ constrained_core_sequence_pattern = rf"(?![:.=]){core_sequence_pattern}(?<![:.=])"
248
+
249
+ final_regex_str = rf"({constrained_core_sequence_pattern})"
250
+
251
+ COMPILED_REGEX = re.compile(final_regex_str)
252
+ return COMPILED_REGEX
253
+
254
+ def check_legitimacy_slow(self, candidate: str) -> bool:
255
+ """Check legitimacy with RDKit"""
256
+ if sum(1 for char in candidate if char.encode("UTF-8").isalpha()) < 5:
257
+ return False
258
+
259
+ mol = Chem.MolFromSmiles(candidate)
260
+ if mol is None:
261
+ return False
262
+ else:
263
+ return True
264
+
265
+ def check_legitimacy_fast(self, candidate: str) -> bool:
266
+ """Check legitimacy with hard rules"""
267
+ if sum(1 for char in candidate if char.encode("UTF-8").isalpha()) < 5:
268
+ return False
269
+
270
+ if not self.check_rings_and_brackets(candidate):
271
+ return False
272
+ else:
273
+ return True
274
+
275
+ def check_legitimacy(self, candidate: str) -> bool:
276
+ if RDKIT_AVAILABLE:
277
+ return self.check_legitimacy_slow(candidate)
278
+ else:
279
+ return self.check_legitimacy_fast(candidate)
280
+
281
+ def check_brackets(self, text):
282
+ matches = re.findall(r"\[([^\[\]]*)\]", text)
283
+ for part in matches:
284
+ if "(" in part or ")" in part:
285
+ return False
286
+ if len(part) == 0:
287
+ return False
288
+ if part[0] in elements or part[0] in aromatic_symbols or part[:2] in elements:
289
+ return True
290
+ return True
291
+
292
+ def check_rings_and_brackets(self, text):
293
+ rings = {}
294
+ left_sq_bracket, right_sq_bracket = 0, 0
295
+ left_pt_bracket, right_pt_bracket = 0, 0
296
+ all_lower = True
297
+ digits_cnt = 0
298
+ pos = 0
299
+ while pos < len(text):
300
+ step = 0
301
+ c = text[pos]
302
+ if ord(c) >= 65 and ord(c) <= 90:
303
+ all_lower = False
304
+ if (pos == len(text) - 1 or pos == 0) and c in bonds:
305
+ return False
306
+ if pos > 0 and text[pos - 1] in bonds and text[pos] in bonds:
307
+ return False
308
+ if c == "[":
309
+ step = 1
310
+ left_sq_bracket += 1
311
+ if left_sq_bracket > right_sq_bracket + 1:
312
+ return False
313
+ if pos == len(text) - 1:
314
+ return False
315
+ if "]" not in text[pos + 1 :]:
316
+ return False
317
+ bracket_span = text[pos + 1 : text.find("]")]
318
+
319
+ if self._SQ_BRACKET_BAN_1.search(bracket_span) or self._SQ_BRACKET_BAN_2.search(bracket_span):
320
+ return False
321
+
322
+ matches = re.findall(r"\d+", bracket_span)
323
+ if len(matches) > 2:
324
+ return False
325
+ if c == "]":
326
+ step = 1
327
+ right_sq_bracket += 1
328
+ if right_sq_bracket > left_sq_bracket:
329
+ return False
330
+
331
+ if c == "(":
332
+ step = 1
333
+ left_pt_bracket += 1
334
+ if c == ")":
335
+ step = 1
336
+ right_pt_bracket += 1
337
+ if right_pt_bracket > left_pt_bracket:
338
+ return False
339
+
340
+ if left_sq_bracket == right_sq_bracket:
341
+ if c.isdigit():
342
+ digits_cnt += 1
343
+ step = 1
344
+ if (
345
+ pos == 0
346
+ or (pos == 1 and text[pos - 1] != "%")
347
+ or (pos > 1 and text[pos - 1] != "%" and text[pos - 2] != "%")
348
+ ):
349
+ if c in rings:
350
+ if rings[c] == "unclosed":
351
+ rings[c] = "closed"
352
+ else:
353
+ rings[c] = "unclosed"
354
+ else:
355
+ rings[c] = "unclosed"
356
+ if c == "%":
357
+ if pos >= len(text) - 2 or not text[pos + 1].isdigit() or not text[pos + 2].isdigit():
358
+ return False
359
+ step = 3
360
+ digits_cnt += 1
361
+ num = text[pos + 1 : pos + 3]
362
+ if num in rings:
363
+ if rings[num] == "unclosed":
364
+ rings[num] = "closed"
365
+ else:
366
+ rings[num] = "unclosed"
367
+ else:
368
+ rings[num] = "unclosed"
369
+ if step == 0:
370
+ if (
371
+ pos < len(text) - 1
372
+ and text[pos : pos + 2] in organic_symbols + aromatic_symbols + other_allows
373
+ ):
374
+ step = 2
375
+ elif c in organic_symbols + aromatic_symbols + other_allows:
376
+ step = 1
377
+ else:
378
+ return False
379
+
380
+ if step == 0:
381
+ step = 1
382
+ pos += step
383
+
384
+ if left_sq_bracket != right_sq_bracket or any(v == "unclosed" for v in rings.values()):
385
+ return False
386
+ if all_lower and digits_cnt < 2:
387
+ return False
388
+ return self.check_brackets(text)
389
+
390
+
391
@lru_cache
# Adapted from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
def bytes_to_unicode():
    """
    Build the reversible byte -> unicode character table used by byte-level BPE.

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with
    the same code point. Every other byte (whitespace, control characters, ...)
    is mapped, in ascending byte order, to the code points 256, 257, ... so that
    no byte is ever represented by a character the BPE code cannot handle.

    Returns:
        A dict with 256 entries mapping each byte value to a one-character string.
    """
    kept_as_is = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    table = {byte: chr(byte) for byte in kept_as_is}
    shifted = 0
    for byte in range(2**8):
        if byte not in table:
            table[byte] = chr(2**8 + shifted)
            shifted += 1
    return table
415
+
416
+
417
# Adapted from transformers.models.gpt2.tokenization_gpt2.get_pairs
def get_pairs(word):
    """
    Return the set of adjacent symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    An empty word raises ``IndexError``.
    """
    word[0]  # an empty word is an error, exactly as when reading the first symbol
    return set(zip(word, word[1:]))
430
+
431
+
432
+ # @requires(backends=("sentencepiece",))
433
+ class InternS1Tokenizer(PreTrainedTokenizer):
434
+ """
435
+ Construct an InternS1 tokenizer. Based on byte-level Byte-Pair-Encoding.
436
+
437
+ Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
438
+ be encoded differently whether it is at the beginning of the sentence (without space) or not:
439
+
440
+ ```python
441
+ >>> from transformers import AutoTokenizer
442
+
443
+ >>> tokenizer = AutoTokenizer.from_pretrained("InternS1Tokenizer", trust_remote_code=True)
444
+ >>> tokenizer("Hello world")["input_ids"]
445
+ [9707, 1879]
446
+
447
+ >>> tokenizer(" Hello world")["input_ids"]
448
+ [21927, 1879]
449
+ ```
450
+ This is expected.
451
+
452
+ Include custom extension to support better domain-specific text tokenization, leveraging a separately trained tokenizer model.
453
+
454
+ ```python
455
+ >>> from transformers import AutoTokenizer
456
+
457
+ >>> tokenizer = AutoTokenizer.from_pretrained("InternS1Tokenizer", trust_remote_code=True)
458
+ >>> tokenizer.tokenize("Describe <SMILES>C1=CC=C(C=C1)C=O</SMILES> and CC1=CC=CC=C1C=O")
459
+ ["Describe ", "<SMILES>", "C1=CC=C(C=C1)C=O", "</SMILES>", " and ", "<SMILES_AUTO_DETECT>",
460
+ "CC1=CC=CC=C1C=O", "</SMILES_AUTO_DETECT>"]
461
+ >>> token_ids = tokenizer("Describe <SMILES>C1=CC=C(C=C1)C=O</SMILES> and CC1=CC=CC=C1C=O")["input_ids"]
462
+ >>> token_ids
463
+ [74785, 220, 151925, 151854, 151860, 151698, 151707, 151860, 151690, 151726, 151926, 323, 220, 151672, 151860, 151701, 151860, 151854, 151726]
464
+
465
+ >>> tokenizer.convert_ids_to_tokens(token_ids)
466
+ ['Describe', 'Ġ', '<SMILES>', 'C', '1', '=CC=C(', 'C=C', '1', ')C', '=O', '</SMILES>', 'Ġand', 'Ġ', 'CC', '1', '=CC=CC=C', '1', 'C', '=O']
467
+ ```
468
+
469
+ Users should refer to this superclass [`PreTrainedTokenizer`] for more information regarding those overloaded methods
470
+
471
+ Args:
472
+ vocab_file (`str`):
473
+ Path to the vocabulary file.
474
+ merges_file (`str`):
475
+ Path to the merges file.
476
+ errors (`str`, *optional*, defaults to `"replace"`):
477
+ Paradigm to follow when decoding bytes to UTF-8. See
478
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
479
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
480
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
481
+ token instead.
482
+ bos_token (`str`, *optional*):
483
+ The beginning of sequence token. Not applicable for this tokenizer.
484
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
485
+ The end of sequence token.
486
+ pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
487
+ The token used for padding, for example when batching sequences of different lengths.
488
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
489
+ Whether or not the model should cleanup the spaces that were added when splitting the input text during the
490
+ tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
491
+ split_special_tokens (`bool`, *optional*, defaults to `False`):
492
+ Whether or not the special tokens should be split during the tokenization process. The default behavior is
493
+ to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
494
+ ['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will give `['<',
495
+ '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
496
+ """
497
+
498
+ vocab_files_names = VOCAB_FILES_NAMES
499
+ model_input_names = ["input_ids", "attention_mask"]
500
+
501
+ def __init__(
502
+ self,
503
+ vocab_file,
504
+ merges_file,
505
+ errors="replace",
506
+ unk_token="<|endoftext|>",
507
+ bos_token=None,
508
+ eos_token="<|endoftext|>",
509
+ pad_token="<|endoftext|>",
510
+ clean_up_tokenization_spaces=False,
511
+ split_special_tokens=False,
512
+ special_tokens_pattern="none",
513
+ **kwargs,
514
+ ):
515
+ bos_token = (
516
+ AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
517
+ if isinstance(bos_token, str)
518
+ else bos_token
519
+ )
520
+ eos_token = (
521
+ AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
522
+ if isinstance(eos_token, str)
523
+ else eos_token
524
+ )
525
+ unk_token = (
526
+ AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
527
+ if isinstance(unk_token, str)
528
+ else unk_token
529
+ )
530
+ pad_token = (
531
+ AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
532
+ if isinstance(pad_token, str)
533
+ else pad_token
534
+ )
535
+
536
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
537
+ self.encoder = json.load(vocab_handle)
538
+ self.decoder = {v: k for k, v in self.encoder.items()}
539
+ self.errors = errors # how to handle errors in decoding
540
+ self.byte_encoder = bytes_to_unicode()
541
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
542
+ bpe_merges = []
543
+ with open(merges_file, encoding="utf-8") as merges_handle:
544
+ for i, line in enumerate(merges_handle):
545
+ line = line.strip()
546
+ if (i == 0 and line.startswith("#version:")) or not line:
547
+ continue
548
+ bpe_merges.append(tuple(line.split()))
549
+ self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
550
+ # NOTE: the cache can grow without bound and will get really large for long running processes
551
+ # (esp. for texts of language that do not use space between word, e.g. Chinese); technically
552
+ # not a memory leak but appears as one.
553
+ # GPT2Tokenizer has the same problem, so let's be consistent.
554
+ self.cache = {}
555
+
556
+ self.pat = re.compile(PRETOKENIZE_REGEX)
557
+
558
+ if kwargs.get("add_prefix_space", False):
559
+ logger.warning_once(
560
+ f"{self.__class__.__name} does not support `add_prefix_space`, setting it to True has no effect."
561
+ )
562
+
563
+ super().__init__(
564
+ vocab_file=vocab_file,
565
+ merges_file=merges_file,
566
+ errors=errors,
567
+ unk_token=unk_token,
568
+ bos_token=bos_token,
569
+ eos_token=eos_token,
570
+ pad_token=pad_token,
571
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
572
+ split_special_tokens=split_special_tokens,
573
+ special_tokens_pattern=special_tokens_pattern,
574
+ **kwargs,
575
+ )
576
+
577
+ self.prepare_extra_tokenizers(vocab_file)
578
+
579
+ @property
580
+ def vocab_size(self) -> int:
581
+ return len(self.encoder)
582
+
583
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
584
+ def get_vocab(self):
585
+ return dict(self.encoder, **self.added_tokens_encoder)
586
+
587
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
588
+ def bpe(self, token):
589
+ if token in self.cache:
590
+ return self.cache[token]
591
+ word = tuple(token)
592
+ pairs = get_pairs(word)
593
+
594
+ if not pairs:
595
+ return token
596
+
597
+ while True:
598
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
599
+ if bigram not in self.bpe_ranks:
600
+ break
601
+ first, second = bigram
602
+ new_word = []
603
+ i = 0
604
+ while i < len(word):
605
+ try:
606
+ j = word.index(first, i)
607
+ except ValueError:
608
+ new_word.extend(word[i:])
609
+ break
610
+ else:
611
+ new_word.extend(word[i:j])
612
+ i = j
613
+
614
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
615
+ new_word.append(first + second)
616
+ i += 2
617
+ else:
618
+ new_word.append(word[i])
619
+ i += 1
620
+ new_word = tuple(new_word)
621
+ word = new_word
622
+ if len(word) == 1:
623
+ break
624
+ else:
625
+ pairs = get_pairs(word)
626
+ word = " ".join(word)
627
+ self.cache[token] = word
628
+ return word
629
+
630
+ def prepare_extra_tokenizers(self, vocab_file: str) -> None:
631
+ """
632
+ Prepare domain-specific tokenizers.
633
+
634
+ Define variables/maps here which guide domain-specific tokenization later.
635
+ """
636
+ # Load extra tokenizers with SentencePiece model
637
+ dir_name = os.path.dirname(vocab_file)
638
+
639
+ self.sp_model_SMILES = spm.SentencePieceProcessor()
640
+ self.sp_model_SMILES.Load(os.path.join(dir_name, "tokenizer_SMILES.model"))
641
+ self.sp_model_SMILES.offset = self.init_kwargs["offset_SMILES"]
642
+
643
+ self.sp_model_PROT = spm.SentencePieceProcessor()
644
+ self.sp_model_PROT.Load(os.path.join(dir_name, "tokenizer_PROT.model"))
645
+ self.sp_model_PROT.offset = self.init_kwargs["offset_PROT"]
646
+
647
+ self.sp_model_XNA = spm.SentencePieceProcessor()
648
+ self.sp_model_XNA.Load(os.path.join(dir_name, "tokenizer_XNA.model"))
649
+ self.sp_model_XNA.offset = self.init_kwargs["offset_XNA"]
650
+
651
+ base_mapping = {
652
+ "SMILES": self.sp_model_SMILES,
653
+ "protein": self.sp_model_PROT,
654
+ "dna": self.sp_model_XNA,
655
+ "rna": self.sp_model_XNA,
656
+ }
657
+ auto_detect_mapping = {
658
+ "SMILES": self.sp_model_SMILES,
659
+ "PROT": self.sp_model_PROT,
660
+ "XNA": self.sp_model_XNA,
661
+ }
662
+ # Guiding tokens of domain-specific tokenization
663
+ self.ex_begin_mapping = {f"<{key}>": value for key, value in base_mapping.items()}
664
+ self.ex_end_mapping = {f"</{key}>": value for key, value in base_mapping.items()}
665
+ # Transient markers for auto-detection, these tokens will not be assigned token ids
666
+ self.ex_auto_begin_mapping = {f"<{key}_AUTO_DETECT>": value for key, value in auto_detect_mapping.items()}
667
+ self.ex_auto_end_mapping = {f"</{key}_AUTO_DETECT>": value for key, value in auto_detect_mapping.items()}
668
+ # Token markers to prevent unwanted auto-detection
669
+ self.ex_protect_begin_tokens = ["<MOLFORMULA>"]
670
+ self.ex_protect_end_tokens = ["</MOLFORMULA>"]
671
+ # For simplicity
672
+ self.ex_protect_tokens = self.ex_protect_begin_tokens + self.ex_protect_end_tokens
673
+ self.ex_all_begin_mapping = self.ex_begin_mapping | self.ex_auto_begin_mapping
674
+ self.ex_all_end_mapping = self.ex_end_mapping | self.ex_auto_end_mapping
675
+
676
+ # Update encoder & decoder with extra tokenizers
677
+ for tokenizer_name, sp_model in [
678
+ ("SMILES", self.sp_model_SMILES),
679
+ ("PROT", self.sp_model_PROT),
680
+ ("XNA", self.sp_model_XNA),
681
+ ]:
682
+ self.decoder.update(
683
+ {i + sp_model.offset: sp_model.id_to_piece(i) for i in range(sp_model.get_piece_size())}
684
+ )
685
+ # Not really used, only to fill holes in encoder, to keep methods like `add_tokens` working
686
+ self.encoder.update(
687
+ {
688
+ f"<|{tokenizer_name}_{sp_model.id_to_piece(i)}|>": i + sp_model.offset
689
+ for i in range(sp_model.get_piece_size())
690
+ }
691
+ )
692
+
693
+ # protect-tokens should keep complete temporarily to guide later tokenization
694
+ # it will be segmented later
695
+ for token in self.ex_protect_tokens:
696
+ self.tokens_trie.add(token)
697
+
698
+ self._unk_token = "<unk>" # Fall-back
699
+ self.check_module_list = [SmilesCheckModule(), ProtCheckModule(), XnaCheckModule()]
700
+
701
+ def _pop_logical_sp_token(self, extra_tokenizer_stack: list, mapping_name: str) -> None:
702
+ """Switch tokenizer when it comes to an end sp token"""
703
+ extra_tokenizer = extra_tokenizer_stack.pop()
704
+ if extra_tokenizer != self.ex_all_end_mapping[mapping_name]:
705
+ logger.warning_once(
706
+ f"Encounter incorrect nesting of extra tokenizer: {self.ex_all_end_mapping[mapping_name]} and {extra_tokenizer}"
707
+ )
708
+ logger.warning_once("This may lead to unexpected behaviour of the tokenizer, please check your input.")
709
+
710
    def tokenize(self, text: TextInput, **kwargs) -> list[str]:
        """
        Converts a string into a sequence of tokens, using the tokenizer.

        The text is first split on added/special tokens, then unmarked segments are
        scanned by the auto-detection modules, and finally every segment is
        tokenized with either the BPE tokenizer or the domain-specific tokenizer
        that is active at that point (selected by begin/end marker tokens).

        Args:
            text: TextInput
            **kwargs: ``split_special_tokens`` is consumed here; the rest is passed
                to ``prepare_for_tokenization``.

        Returns:
            The list of token strings.
        """
        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)

        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        if hasattr(self, "do_lower_case") and self.do_lower_case:
            # convert non-special tokens to lowercase. Might be super slow as well?
            escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
            escaped_special_toks += [
                re.escape(s_tok.content)
                for s_tok in (self._added_tokens_decoder.values())
                if not s_tok.special and s_tok.normalized
            ]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

        if split_special_tokens:
            no_split_token = []
            tokens = [text]
        else:
            no_split_token = self._added_tokens_encoder.keys()  # don't split on any of the added tokens
            # "This is something<special_token_1>  else"
            tokens = self.tokens_trie.split(text)

        # ["This is something", "<special_token_1>", "  else"]
        # Apply each added token's lstrip/rstrip/single_word options to its neighbours.
        for i, token in enumerate(tokens):
            if token in no_split_token:
                tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None)
                left = tokens[i - 1] if i > 0 else None
                right = tokens[i + 1] if i < len(tokens) - 1 else None
                if isinstance(tok_extended, AddedToken):
                    if tok_extended.rstrip and right:
                        # A bit counter-intuitive but we strip the left of the string
                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
                        tokens[i + 1] = right.lstrip()
                    # Strip white spaces on the left
                    if tok_extended.lstrip and left:
                        tokens[i - 1] = left.rstrip()  # Opposite here
                    # A single_word token glued to neighbouring text is merged back into that text.
                    if tok_extended.single_word and left and left[-1] != " ":
                        tokens[i - 1] += token
                        tokens[i] = ""
                    elif tok_extended.single_word and right and right[0] != " ":
                        tokens[i + 1] = token + tokens[i + 1]
                        tokens[i] = ""
                else:
                    raise ValueError(
                        f"{tok_extended} cannot be tokenized because it was not properly added"
                        f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
                    )

        # ["This is something", "<special_token_1>", "else"]
        tokenized_text = []

        # Codes for automatically detecting domain-specific content
        # All parts that have been marked by domain-specific or protection tokens will not be subject to auto detection
        # See transformers/tests/models/intern_s1/test_tokenization_intern_s1.py::test_auto_detection() for more details
        new_tokens = []
        # Depth counter: > 0 while inside an explicitly marked or protected region.
        not_split_flag = 0
        for token in tokens:
            if not token:
                continue
            if token in no_split_token or token in self.ex_protect_tokens:
                new_tokens.append(token)
                if token in self.ex_begin_mapping or token in self.ex_protect_begin_tokens:
                    not_split_flag += 1  # In case nested sp tokens
                elif token in self.ex_end_mapping or token in self.ex_protect_end_tokens:
                    not_split_flag = max(0, not_split_flag - 1)
            else:
                if not_split_flag:
                    new_tokens.append(token)
                else:
                    # Each module receives the previous module's output; the final
                    # result is extended into `new_tokens`, so it is presumably a
                    # list of segments - verify against `re_split`.
                    for check_module in self.check_module_list:
                        token = check_module.re_split(token)

                    new_tokens.extend(token)
        tokens = new_tokens

        # Use stack to maintain which tokenizer should be used, considering the possibility of nested extra tokenizer
        extra_tokenizer_stack = []
        for token in tokens:
            # Need to skip eventual empty (fully stripped) tokens
            if not token:
                continue
            # protect-tokens are not assigned token ids, should be segmented here
            if token in self.ex_protect_tokens:
                tokenized_text.extend(self._tokenize(token))
            # push tokenizer to stack when encountering begin token
            elif token in self.ex_all_begin_mapping:
                tokenized_text.append(token)
                extra_tokenizer_stack.append(self.ex_all_begin_mapping[token])
            # pop tokenizer from stack when encountering end token
            elif token in self.ex_all_end_mapping:
                tokenized_text.append(token)
                # An end token met with an empty stack is kept but otherwise ignored.
                if extra_tokenizer_stack:
                    self._pop_logical_sp_token(extra_tokenizer_stack, token)
            # other special tokens
            elif token in no_split_token:
                tokenized_text.append(token)
            else:
                tokenized_text.extend(self._tokenize(token, extra_tokenizer_stack=extra_tokenizer_stack))

        # ["This", " is", " something", "<special_token_1>", "else"]
        return tokenized_text
821
+
822
+ def _tokenize(self, text, **kwargs):
823
+ """
824
+ Modified from `transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize`.
825
+
826
+ This adaptation supports domain-specific tokenizers.
827
+ """
828
+ extra_tokenizer_stack = kwargs.pop("extra_tokenizer_stack", False)
829
+ if extra_tokenizer_stack:
830
+ tokenized_text = extra_tokenizer_stack[-1].encode(text, out_type=str)
831
+ tokenized_id = extra_tokenizer_stack[-1].encode(text, out_type=int)
832
+ final_tokenized_text = []
833
+ for text_piece, id_piece in zip(tokenized_text, tokenized_id):
834
+ if id_piece == 0:
835
+ final_tokenized_text.extend(self._bpe_tokenize(text_piece))
836
+ else:
837
+ final_tokenized_text.append(text_piece)
838
+ return final_tokenized_text
839
+ else:
840
+ return self._bpe_tokenize(text)
841
+
842
+ def _bpe_tokenize(self, text, **kwargs):
843
+ text = text.replace(
844
+ "▁", " "
845
+ ) # This discrepancy stems from differing whitespace treatment in SentencePiece versus BPE tokenization.
846
+ bpe_tokens = []
847
+ for token in re.findall(self.pat, text):
848
+ token = "".join(
849
+ self.byte_encoder[b] for b in token.encode("utf-8")
850
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
851
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
852
+ return bpe_tokens
853
+
854
+ def convert_tokens_to_ids(self, tokens: Union[str, list[str]]) -> Union[int, list[int]]:
855
+ """
856
+ Modified from `transformers.tokenization_utils.PreTrainedTokenzier.convert_tokens_to_ids`.
857
+
858
+ Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
859
+ vocabulary.
860
+
861
+ This adaptation supports domain-specific tokenizers.
862
+
863
+ Args:
864
+ tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
865
+
866
+ Returns:
867
+ `int` or `List[int]`: The token id or list of token ids.
868
+ """
869
+ if tokens is None:
870
+ return None
871
+
872
+ if isinstance(tokens, str):
873
+ return self._convert_token_to_id_with_added_voc(tokens)
874
+
875
+ ids = []
876
+ extra_tokenizer_stack = []
877
+
878
+ for token in tokens:
879
+ if token not in self.ex_auto_begin_mapping and token not in self.ex_auto_end_mapping:
880
+ ids.append(
881
+ self._convert_token_to_id_with_added_voc(token, extra_tokenizer_stack=extra_tokenizer_stack)
882
+ )
883
+ if token in self.ex_all_begin_mapping:
884
+ extra_tokenizer_stack.append(self.ex_all_begin_mapping[token])
885
+ elif token in self.ex_all_end_mapping:
886
+ if extra_tokenizer_stack:
887
+ self._pop_logical_sp_token(extra_tokenizer_stack, token)
888
+ return ids
889
+
890
+ def _convert_token_to_id_with_added_voc(self, token, **kwargs):
891
+ """
892
+ Modified from `transformers.tokenization_utils.PreTrainedTokenzier._convert_token_to_id_with_added_voc`.
893
+
894
+ This adaptation supports domain-specific tokenizers.
895
+ """
896
+ if token is None:
897
+ return None
898
+
899
+ if token in self._added_tokens_encoder:
900
+ return self._added_tokens_encoder[token]
901
+ return self._convert_token_to_id(token, **kwargs)
902
+
903
+ def _convert_token_to_id(self, token, **kwargs):
904
+ """
905
+ Modified from `transformers.tokenization_utils.PreTrainedTokenzier._convert_token_to_id`.
906
+
907
+ Converts a token (str) in an id using the vocab.
908
+
909
+ Fall back to original tokenizer once OOV.
910
+ """
911
+ extra_tokenizer_stack = kwargs.pop("extra_tokenizer_stack", False)
912
+ if extra_tokenizer_stack:
913
+ token_id = extra_tokenizer_stack[-1].piece_to_id(token)
914
+ if token_id == extra_tokenizer_stack[-1].unk_id():
915
+ return self.encoder.get(token, self.encoder.get(self._unk_token))
916
+ else:
917
+ return token_id + extra_tokenizer_stack[-1].offset
918
+ else:
919
+ return self.encoder.get(token, self.encoder.get(self._unk_token))
920
+
921
+ # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
922
+ def _convert_id_to_token(self, index):
923
+ """Converts an index (integer) in a token (str) using the vocab."""
924
+ return self.decoder.get(index)
925
+
926
+ def convert_tokens_to_string(self, tokens):
927
+ """Converts a sequence of tokens (string) in a single string."""
928
+ text = "".join(tokens)
929
+ text = text.replace(
930
+ "▁", "Ġ"
931
+ ) # This discrepancy stems from differing whitespace treatment in SentencePiece versus BPE tokenization.
932
+ text = text.replace("\n", "Ċ")
933
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
934
+ return text
935
+
936
+ def decode(
937
+ self,
938
+ token_ids,
939
+ skip_special_tokens: bool = False,
940
+ clean_up_tokenization_spaces: Optional[bool] = False,
941
+ spaces_between_special_tokens: bool = False,
942
+ **kwargs,
943
+ ) -> str:
944
+ # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
945
+ # and cannot be configured elsewhere, but it should default to False for InternS1Tokenizer
946
+ return super().decode(
947
+ token_ids,
948
+ skip_special_tokens=skip_special_tokens,
949
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
950
+ spaces_between_special_tokens=spaces_between_special_tokens,
951
+ **kwargs,
952
+ )
953
+
954
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
955
+ """
956
+ Modified from `transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary` to support saving custom extension.
957
+ """
958
+ if not os.path.isdir(save_directory):
959
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
960
+ return
961
+ vocab_file = os.path.join(
962
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
963
+ )
964
+ merge_file = os.path.join(
965
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
966
+ )
967
+ sp_model_smiles = os.path.join(
968
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["sp_model_SMILES"]
969
+ )
970
+ sp_model_prot = os.path.join(
971
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["sp_model_PROT"]
972
+ )
973
+ sp_model_xna = os.path.join(
974
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["sp_model_XNA"]
975
+ )
976
+
977
+ with open(vocab_file, "w", encoding="utf-8") as f:
978
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
979
+
980
+ index = 0
981
+ with open(merge_file, "w", encoding="utf-8") as writer:
982
+ writer.write("#version: 0.2\n")
983
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
984
+ if index != token_index:
985
+ logger.warning(
986
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
987
+ " Please check that the tokenizer is not corrupted!"
988
+ )
989
+ index = token_index
990
+ writer.write(" ".join(bpe_tokens) + "\n")
991
+ index += 1
992
+
993
+ with open(sp_model_smiles, "wb") as f:
994
+ f.write(self.sp_model_SMILES.serialized_model_proto())
995
+
996
+ with open(sp_model_prot, "wb") as f:
997
+ f.write(self.sp_model_PROT.serialized_model_proto())
998
+
999
+ with open(sp_model_xna, "wb") as f:
1000
+ f.write(self.sp_model_XNA.serialized_model_proto())
1001
+
1002
+ return vocab_file, merge_file
1003
+
1004
+ def prepare_for_tokenization(self, text, **kwargs):
1005
+ text = unicodedata.normalize("NFC", text)
1006
+ return (text, kwargs)
1007
+
1008
+
1009
# Names exported by `from <module> import *`.
__all__ = ["InternS1Tokenizer"]
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f9e4d4901a92b997e463c1f46055088b6cca5ca61a6522d1b9f64c4bb81cb42
3
+ size 12807982
tokenizer_PROT.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1144f52f86f3ca5a29940d69b037e508c05a89e6eedbe42bea641e226b20dbe0
3
+ size 12118
tokenizer_SMILES.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba1c97da0353ccbffd368ae78e311ccbc762aa5ba74f9aff8bf2ab363c4d37d
3
+ size 14775
tokenizer_XNA.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58fc8bfb2af3dfe936a13dad8a9cb28dab7850b70b358db19605d867c133fb35
3
+ size 15451
tokenizer_config.json ADDED
@@ -0,0 +1,521 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "248044": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "248045": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "248046": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "248047": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "248048": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "248049": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "248050": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "248051": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "248052": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "248053": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "248054": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "248055": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "248056": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "248057": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "248058": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "248059": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "248060": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "248061": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "248062": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "248063": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "248064": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "248065": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "248066": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "248067": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "248068": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "248069": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "248070": {
214
+ "content": "<|audio_start|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "248071": {
222
+ "content": "<|audio_end|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "248072": {
230
+ "content": "<tts_pad>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "248073": {
238
+ "content": "<tts_text_bos>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "248074": {
246
+ "content": "<tts_text_eod>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "248075": {
254
+ "content": "<tts_text_bos_single>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "248076": {
262
+ "content": "<|audio_pad|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "248077": {
270
+ "content": "<IMG_CONTEXT>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "248078": {
278
+ "content": "<img>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "248079": {
286
+ "content": "</img>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "248080": {
294
+ "content": "<quad>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "248081": {
302
+ "content": "</quad>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "248082": {
310
+ "content": "<ref>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "248083": {
318
+ "content": "</ref>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "248084": {
326
+ "content": "<box>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "248085": {
334
+ "content": "</box>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "248086": {
342
+ "content": "<|action_start|>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "248087": {
350
+ "content": "<|action_end|>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "248088": {
358
+ "content": "<|interpreter|>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "248089": {
366
+ "content": "<|plugin|>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "248090": {
374
+ "content": "<video>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "248091": {
382
+ "content": "<|ts|>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "248092": {
390
+ "content": "<|/ts|>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "248093": {
398
+ "content": "<TS_CONTEXT>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "248094": {
406
+ "content": "<SMILES>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": false
412
+ },
413
+ "248095": {
414
+ "content": "</SMILES>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": false
420
+ },
421
+ "248096": {
422
+ "content": "<protein>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": false
428
+ },
429
+ "248097": {
430
+ "content": "</protein>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": false
436
+ },
437
+ "248098": {
438
+ "content": "<dna>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": false
444
+ },
445
+ "248099": {
446
+ "content": "</dna>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": false
452
+ },
453
+ "248100": {
454
+ "content": "<rna>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": false
460
+ },
461
+ "248101": {
462
+ "content": "</rna>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": false
468
+ }
469
+ },
470
+ "additional_special_tokens": [
471
+ "<|im_start|>",
472
+ "<|im_end|>",
473
+ "<|object_ref_start|>",
474
+ "<|object_ref_end|>",
475
+ "<|box_start|>",
476
+ "<|box_end|>",
477
+ "<|quad_start|>",
478
+ "<|quad_end|>",
479
+ "<|vision_start|>",
480
+ "<|vision_end|>",
481
+ "<|vision_pad|>",
482
+ "<|image_pad|>",
483
+ "<|video_pad|>"
484
+ ],
485
+ "audio_bos_token": "<|audio_start|>",
486
+ "audio_eos_token": "<|audio_end|>",
487
+ "audio_token": "<|audio_pad|>",
488
+ "auto_map": {
489
+ "AutoTokenizer": [
490
+ "tokenization_interns1.InternS1Tokenizer",
491
+ null
492
+ ]
493
+ },
494
+ "bos_token": "<|im_start|>",
495
+ "clean_up_tokenization_spaces": false,
496
+ "eos_token": "<|im_end|>",
497
+ "errors": "replace",
498
+ "extra_special_tokens": {
499
+ "audio_bos_token": "<|audio_start|>",
500
+ "audio_eos_token": "<|audio_end|>",
501
+ "audio_token": "<|audio_pad|>",
502
+ "image_token": "<|image_pad|>",
503
+ "video_token": "<|video_pad|>",
504
+ "vision_bos_token": "<|vision_start|>",
505
+ "vision_eos_token": "<|vision_end|>"
506
+ },
507
+ "image_token": "<|image_pad|>",
508
+ "model_max_length": 262144,
509
+ "offset_PROT": 249126,
510
+ "offset_SMILES": 248102,
511
+ "offset_XNA": 250150,
512
+ "pad_token": "<|endoftext|>",
513
+ "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
514
+ "special_tokens_pattern": "none",
515
+ "split_special_tokens": false,
516
+ "tokenizer_class": "InternS1Tokenizer",
517
+ "unk_token": null,
518
+ "video_token": "<|video_pad|>",
519
+ "vision_bos_token": "<|vision_start|>",
520
+ "vision_eos_token": "<|vision_end|>"
521
+ }
video_preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "size": {
3
+ "longest_edge": 25165824,
4
+ "shortest_edge": 4096
5
+ },
6
+ "patch_size": 16,
7
+ "temporal_patch_size": 2,
8
+ "merge_size": 2,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "Qwen3VLProcessor",
20
+ "video_processor_type": "Qwen3VLVideoProcessor"
21
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff