haijunlv committed on
Commit
ff2baaa
·
verified ·
1 Parent(s): bab9cd5
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2025-2026 Shanghai AI Laboratory
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
202
+
README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ pipeline_tag: image-text-to-text
5
+ ---
6
+
7
+ # InternS2Preview
8
+
9
+ ![20260408-154223.jpg](https://picui.ogmua.cn/s1/2026/04/08/69d60695b0db7.webp)
chat_template.jinja ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- set image_count = namespace(value=0) %}
2
+ {%- set video_count = namespace(value=0) %}
3
+ {%- macro render_content(content, do_vision_count, is_system_content=false) %}
4
+ {%- if content is string %}
5
+ {{- content }}
6
+ {%- elif content is iterable and content is not mapping %}
7
+ {%- for item in content %}
8
+ {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
9
+ {%- if is_system_content %}
10
+ {{- raise_exception('System message cannot contain images.') }}
11
+ {%- endif %}
12
+ {%- if do_vision_count %}
13
+ {%- set image_count.value = image_count.value + 1 %}
14
+ {%- endif %}
15
+ {%- if add_vision_id %}
16
+ {{- 'Picture ' ~ image_count.value ~ ': ' }}
17
+ {%- endif %}
18
+ {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
19
+ {%- elif 'video' in item or item.type == 'video' %}
20
+ {%- if is_system_content %}
21
+ {{- raise_exception('System message cannot contain videos.') }}
22
+ {%- endif %}
23
+ {%- if do_vision_count %}
24
+ {%- set video_count.value = video_count.value + 1 %}
25
+ {%- endif %}
26
+ {%- if add_vision_id %}
27
+ {{- 'Video ' ~ video_count.value ~ ': ' }}
28
+ {%- endif %}
29
+ {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
30
+ {%- elif 'time_series' in item or item.type == 'time_series' %}
31
+ {%- if is_system_content %}
32
+ {{- raise_exception('System message cannot contain time series.') }}
33
+ {%- endif %}
34
+ {{- '<|ts|><TS_CONTEXT><|/ts|>' }}
35
+ {%- elif 'text' in item %}
36
+ {{- item.text }}
37
+ {%- else %}
38
+ {{- raise_exception('Unexpected item type in content.') }}
39
+ {%- endif %}
40
+ {%- endfor %}
41
+ {%- elif content is none or content is undefined %}
42
+ {{- '' }}
43
+ {%- else %}
44
+ {{- raise_exception('Unexpected content type.') }}
45
+ {%- endif %}
46
+ {%- endmacro %}
47
+ {%- if not messages %}
48
+ {{- raise_exception('No messages provided.') }}
49
+ {%- endif %}
50
+ {%- if tools and tools is iterable and tools is not mapping %}
51
+ {{- '<|im_start|>system\n' }}
52
+ {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
53
+ {%- for tool in tools %}
54
+ {{- "\n" }}
55
+ {{- tool | tojson }}
56
+ {%- endfor %}
57
+ {{- "\n</tools>" }}
58
+ {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
59
+ {%- if messages[0].role == 'system' %}
60
+ {%- set content = render_content(messages[0].content, false, true)|trim %}
61
+ {%- if content %}
62
+ {{- '\n\n' + content }}
63
+ {%- endif %}
64
+ {%- endif %}
65
+ {{- '<|im_end|>\n' }}
66
+ {%- else %}
67
+ {%- if messages[0].role == 'system' %}
68
+ {%- set content = render_content(messages[0].content, false, true)|trim %}
69
+ {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
70
+ {%- endif %}
71
+ {%- endif %}
72
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
73
+ {%- for message in messages[::-1] %}
74
+ {%- set index = (messages|length - 1) - loop.index0 %}
75
+ {%- if ns.multi_step_tool and message.role == "user" %}
76
+ {%- set content = render_content(message.content, false)|trim %}
77
+ {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
78
+ {%- set ns.multi_step_tool = false %}
79
+ {%- set ns.last_query_index = index %}
80
+ {%- endif %}
81
+ {%- endif %}
82
+ {%- endfor %}
83
+ {%- if ns.multi_step_tool %}
84
+ {{- raise_exception('No user query found in messages.') }}
85
+ {%- endif %}
86
+ {%- for message in messages %}
87
+ {%- set content = render_content(message.content, true)|trim %}
88
+ {%- if message.role == "system" %}
89
+ {%- if not loop.first %}
90
+ {{- raise_exception('System message must be at the beginning.') }}
91
+ {%- endif %}
92
+ {%- elif message.role == "user" %}
93
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
94
+ {%- elif message.role == "assistant" %}
95
+ {%- set reasoning_content = '' %}
96
+ {%- if message.reasoning_content is string %}
97
+ {%- set reasoning_content = message.reasoning_content %}
98
+ {%- else %}
99
+ {%- if '</think>' in content %}
100
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
101
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
102
+ {%- endif %}
103
+ {%- endif %}
104
+ {%- set reasoning_content = reasoning_content|trim %}
105
+ {%- if loop.index0 > ns.last_query_index %}
106
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
107
+ {%- else %}
108
+ {{- '<|im_start|>' + message.role + '\n' + content }}
109
+ {%- endif %}
110
+ {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
111
+ {%- for tool_call in message.tool_calls %}
112
+ {%- if tool_call.function is defined %}
113
+ {%- set tool_call = tool_call.function %}
114
+ {%- endif %}
115
+ {%- if loop.first %}
116
+ {%- if content|trim %}
117
+ {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
118
+ {%- else %}
119
+ {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
120
+ {%- endif %}
121
+ {%- else %}
122
+ {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
123
+ {%- endif %}
124
+ {%- if tool_call.arguments is defined %}
125
+ {%- for args_name, args_value in tool_call.arguments|items %}
126
+ {{- '<parameter=' + args_name + '>\n' }}
127
+ {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
128
+ {{- args_value }}
129
+ {{- '\n</parameter>\n' }}
130
+ {%- endfor %}
131
+ {%- endif %}
132
+ {{- '</function>\n</tool_call>' }}
133
+ {%- endfor %}
134
+ {%- endif %}
135
+ {{- '<|im_end|>\n' }}
136
+ {%- elif message.role == "tool" %}
137
+ {%- if loop.previtem and loop.previtem.role != "tool" %}
138
+ {{- '<|im_start|>user' }}
139
+ {%- endif %}
140
+ {{- '\n<tool_response>\n' }}
141
+ {{- content }}
142
+ {{- '\n</tool_response>' }}
143
+ {%- if not loop.last and loop.nextitem.role != "tool" %}
144
+ {{- '<|im_end|>\n' }}
145
+ {%- elif loop.last %}
146
+ {{- '<|im_end|>\n' }}
147
+ {%- endif %}
148
+ {%- else %}
149
+ {{- raise_exception('Unexpected message role.') }}
150
+ {%- endif %}
151
+ {%- endfor %}
152
+ {%- if add_generation_prompt %}
153
+ {{- '<|im_start|>assistant\n' }}
154
+ {%- if enable_thinking is defined and enable_thinking is false %}
155
+ {{- '<think>\n\n</think>\n\n' }}
156
+ {%- else %}
157
+ {{- '<think>\n' }}
158
+ {%- endif %}
159
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,675 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "intern_s2_preview",
3
+ "architectures": [
4
+ "InternS2PreviewForConditionalGeneration"
5
+ ],
6
+ "transformers_version": "5.2.0",
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_interns2_preview.InternS2PreviewConfig",
9
+ "AutoModelForCausalLM": "modeling_interns2_preview.InternS2PreviewForCausalLM",
10
+ "AutoModel": "modeling_interns2_preview.InternS2PreviewModel",
11
+ "AutoModelForImageTextToText": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration",
12
+ "AutoModelForMultimodalLM": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration"
13
+ },
14
+ "image_token_id": 248056,
15
+ "text_config": {
16
+ "model_type": "qwen3_5_moe_text",
17
+ "attention_bias": false,
18
+ "attention_dropout": 0.0,
19
+ "attn_output_gate": true,
20
+ "dtype": "bfloat16",
21
+ "eos_token_id": 248044,
22
+ "full_attention_interval": 4,
23
+ "head_dim": 256,
24
+ "hidden_act": "silu",
25
+ "hidden_size": 2048,
26
+ "initializer_range": 0.02,
27
+ "layer_types": [
28
+ "linear_attention",
29
+ "linear_attention",
30
+ "linear_attention",
31
+ "full_attention",
32
+ "linear_attention",
33
+ "linear_attention",
34
+ "linear_attention",
35
+ "full_attention",
36
+ "linear_attention",
37
+ "linear_attention",
38
+ "linear_attention",
39
+ "full_attention",
40
+ "linear_attention",
41
+ "linear_attention",
42
+ "linear_attention",
43
+ "full_attention",
44
+ "linear_attention",
45
+ "linear_attention",
46
+ "linear_attention",
47
+ "full_attention",
48
+ "linear_attention",
49
+ "linear_attention",
50
+ "linear_attention",
51
+ "full_attention",
52
+ "linear_attention",
53
+ "linear_attention",
54
+ "linear_attention",
55
+ "full_attention",
56
+ "linear_attention",
57
+ "linear_attention",
58
+ "linear_attention",
59
+ "full_attention",
60
+ "linear_attention",
61
+ "linear_attention",
62
+ "linear_attention",
63
+ "full_attention",
64
+ "linear_attention",
65
+ "linear_attention",
66
+ "linear_attention",
67
+ "full_attention"
68
+ ],
69
+ "linear_conv_kernel_dim": 4,
70
+ "linear_key_head_dim": 128,
71
+ "linear_num_key_heads": 16,
72
+ "linear_num_value_heads": 32,
73
+ "linear_value_head_dim": 128,
74
+ "max_position_embeddings": 262144,
75
+ "mlp_only_layers": [],
76
+ "moe_intermediate_size": 512,
77
+ "mtp_num_hidden_layers": 1,
78
+ "mtp_use_dedicated_embeddings": false,
79
+ "num_attention_heads": 16,
80
+ "num_experts": 256,
81
+ "num_experts_per_tok": 8,
82
+ "num_hidden_layers": 40,
83
+ "num_key_value_heads": 2,
84
+ "rms_norm_eps": 1e-06,
85
+ "router_aux_loss_coef": 0.001,
86
+ "shared_expert_intermediate_size": 512,
87
+ "use_cache": true,
88
+ "vocab_size": 251392,
89
+ "mamba_ssm_dtype": "float32",
90
+ "rope_parameters": {
91
+ "mrope_interleaved": true,
92
+ "mrope_section": [
93
+ 11,
94
+ 11,
95
+ 10
96
+ ],
97
+ "rope_type": "default",
98
+ "rope_theta": 10000000,
99
+ "partial_rotary_factor": 0.25
100
+ },
101
+ "pad_token_id": null,
102
+ "bos_token_id": null,
103
+ "tie_word_embeddings": false,
104
+ "output_router_logits": false,
105
+ "partial_rotary_factor": 0.25
106
+ },
107
+ "tie_word_embeddings": false,
108
+ "video_token_id": 248057,
109
+ "vision_config": {
110
+ "model_type": "intern_s2_preview",
111
+ "deepstack_visual_indexes": [],
112
+ "depth": 27,
113
+ "hidden_act": "gelu_pytorch_tanh",
114
+ "hidden_size": 1152,
115
+ "in_channels": 3,
116
+ "initializer_range": 0.02,
117
+ "intermediate_size": 4304,
118
+ "num_heads": 16,
119
+ "num_position_embeddings": 2304,
120
+ "out_hidden_size": 2048,
121
+ "patch_size": 16,
122
+ "spatial_merge_size": 2,
123
+ "temporal_patch_size": 2
124
+ },
125
+ "vision_end_token_id": 248054,
126
+ "vision_start_token_id": 248053,
127
+ "ts_config": {
128
+ "model_type": "interns2_preview_time_series",
129
+ "auto_map": {
130
+ "AutoConfig": "configuration_interns2_preview.InternS2PreviewTimeSeriesConfig",
131
+ "AutoModel": "modeling_interns2_preview.InternS2PreviewTimeSeriesModel"
132
+ },
133
+ "activation_dropout": 0.0,
134
+ "activation_function": "gelu",
135
+ "attention_dropout": 0.0,
136
+ "d_model": 768,
137
+ "dropout": 0.0,
138
+ "encoder_attention_heads": 8,
139
+ "encoder_ffn_dim": 3072,
140
+ "encoder_layerdrop": 0.0,
141
+ "encoder_layers": 17,
142
+ "max_source_positions": 1500,
143
+ "num_mel_bins": 80,
144
+ "out_hidden_size": 2048,
145
+ "scale_embedding": false,
146
+ "ts_adapt_in_dim": 256,
147
+ "ts_adapt_out_dim": 1024,
148
+ "ts_hidden_dim": 1024
149
+ },
150
+ "ts_token_id": 248093,
151
+ "ts_start_id": 248091,
152
+ "ts_end_id": 248092,
153
+ "quantization_config": {
154
+ "activation_scheme": "dynamic",
155
+ "fmt": "e4m3",
156
+ "quant_method": "fp8",
157
+ "scale_fmt": "ue8m0",
158
+ "weight_block_size": [
159
+ 128,
160
+ 128
161
+ ],
162
+ "modules_to_not_convert": [
163
+ "lm_head",
164
+ "model.language_model.embed_tokens",
165
+ "model.language_model.layers.0.input_layernorm",
166
+ "model.language_model.layers.0.linear_attn",
167
+ "model.language_model.layers.0.linear_attn.conv1d",
168
+ "model.language_model.layers.0.linear_attn.in_proj_a",
169
+ "model.language_model.layers.0.linear_attn.in_proj_b",
170
+ "model.language_model.layers.0.linear_attn.norm",
171
+ "model.language_model.layers.0.mlp.gate",
172
+ "model.language_model.layers.0.mlp.shared_expert_gate",
173
+ "model.language_model.layers.0.post_attention_layernorm",
174
+ "model.language_model.layers.1.input_layernorm",
175
+ "model.language_model.layers.1.linear_attn",
176
+ "model.language_model.layers.1.linear_attn.conv1d",
177
+ "model.language_model.layers.1.linear_attn.in_proj_a",
178
+ "model.language_model.layers.1.linear_attn.in_proj_b",
179
+ "model.language_model.layers.1.linear_attn.norm",
180
+ "model.language_model.layers.1.mlp.gate",
181
+ "model.language_model.layers.1.mlp.shared_expert_gate",
182
+ "model.language_model.layers.1.post_attention_layernorm",
183
+ "model.language_model.layers.10.input_layernorm",
184
+ "model.language_model.layers.10.linear_attn",
185
+ "model.language_model.layers.10.linear_attn.conv1d",
186
+ "model.language_model.layers.10.linear_attn.in_proj_a",
187
+ "model.language_model.layers.10.linear_attn.in_proj_b",
188
+ "model.language_model.layers.10.linear_attn.norm",
189
+ "model.language_model.layers.10.mlp.gate",
190
+ "model.language_model.layers.10.mlp.shared_expert_gate",
191
+ "model.language_model.layers.10.post_attention_layernorm",
192
+ "model.language_model.layers.11.input_layernorm",
193
+ "model.language_model.layers.11.mlp.gate",
194
+ "model.language_model.layers.11.mlp.shared_expert_gate",
195
+ "model.language_model.layers.11.post_attention_layernorm",
196
+ "model.language_model.layers.11.self_attn.k_norm",
197
+ "model.language_model.layers.11.self_attn.q_norm",
198
+ "model.language_model.layers.12.input_layernorm",
199
+ "model.language_model.layers.12.linear_attn",
200
+ "model.language_model.layers.12.linear_attn.conv1d",
201
+ "model.language_model.layers.12.linear_attn.in_proj_a",
202
+ "model.language_model.layers.12.linear_attn.in_proj_b",
203
+ "model.language_model.layers.12.linear_attn.norm",
204
+ "model.language_model.layers.12.mlp.gate",
205
+ "model.language_model.layers.12.mlp.shared_expert_gate",
206
+ "model.language_model.layers.12.post_attention_layernorm",
207
+ "model.language_model.layers.13.input_layernorm",
208
+ "model.language_model.layers.13.linear_attn",
209
+ "model.language_model.layers.13.linear_attn.conv1d",
210
+ "model.language_model.layers.13.linear_attn.in_proj_a",
211
+ "model.language_model.layers.13.linear_attn.in_proj_b",
212
+ "model.language_model.layers.13.linear_attn.norm",
213
+ "model.language_model.layers.13.mlp.gate",
214
+ "model.language_model.layers.13.mlp.shared_expert_gate",
215
+ "model.language_model.layers.13.post_attention_layernorm",
216
+ "model.language_model.layers.14.input_layernorm",
217
+ "model.language_model.layers.14.linear_attn",
218
+ "model.language_model.layers.14.linear_attn.conv1d",
219
+ "model.language_model.layers.14.linear_attn.in_proj_a",
220
+ "model.language_model.layers.14.linear_attn.in_proj_b",
221
+ "model.language_model.layers.14.linear_attn.norm",
222
+ "model.language_model.layers.14.mlp.gate",
223
+ "model.language_model.layers.14.mlp.shared_expert_gate",
224
+ "model.language_model.layers.14.post_attention_layernorm",
225
+ "model.language_model.layers.15.input_layernorm",
226
+ "model.language_model.layers.15.mlp.gate",
227
+ "model.language_model.layers.15.mlp.shared_expert_gate",
228
+ "model.language_model.layers.15.post_attention_layernorm",
229
+ "model.language_model.layers.15.self_attn.k_norm",
230
+ "model.language_model.layers.15.self_attn.q_norm",
231
+ "model.language_model.layers.16.input_layernorm",
232
+ "model.language_model.layers.16.linear_attn",
233
+ "model.language_model.layers.16.linear_attn.conv1d",
234
+ "model.language_model.layers.16.linear_attn.in_proj_a",
235
+ "model.language_model.layers.16.linear_attn.in_proj_b",
236
+ "model.language_model.layers.16.linear_attn.norm",
237
+ "model.language_model.layers.16.mlp.gate",
238
+ "model.language_model.layers.16.mlp.shared_expert_gate",
239
+ "model.language_model.layers.16.post_attention_layernorm",
240
+ "model.language_model.layers.17.input_layernorm",
241
+ "model.language_model.layers.17.linear_attn",
242
+ "model.language_model.layers.17.linear_attn.conv1d",
243
+ "model.language_model.layers.17.linear_attn.in_proj_a",
244
+ "model.language_model.layers.17.linear_attn.in_proj_b",
245
+ "model.language_model.layers.17.linear_attn.norm",
246
+ "model.language_model.layers.17.mlp.gate",
247
+ "model.language_model.layers.17.mlp.shared_expert_gate",
248
+ "model.language_model.layers.17.post_attention_layernorm",
249
+ "model.language_model.layers.18.input_layernorm",
250
+ "model.language_model.layers.18.linear_attn",
251
+ "model.language_model.layers.18.linear_attn.conv1d",
252
+ "model.language_model.layers.18.linear_attn.in_proj_a",
253
+ "model.language_model.layers.18.linear_attn.in_proj_b",
254
+ "model.language_model.layers.18.linear_attn.norm",
255
+ "model.language_model.layers.18.mlp.gate",
256
+ "model.language_model.layers.18.mlp.shared_expert_gate",
257
+ "model.language_model.layers.18.post_attention_layernorm",
258
+ "model.language_model.layers.19.input_layernorm",
259
+ "model.language_model.layers.19.mlp.gate",
260
+ "model.language_model.layers.19.mlp.shared_expert_gate",
261
+ "model.language_model.layers.19.post_attention_layernorm",
262
+ "model.language_model.layers.19.self_attn.k_norm",
263
+ "model.language_model.layers.19.self_attn.q_norm",
264
+ "model.language_model.layers.2.input_layernorm",
265
+ "model.language_model.layers.2.linear_attn",
266
+ "model.language_model.layers.2.linear_attn.conv1d",
267
+ "model.language_model.layers.2.linear_attn.in_proj_a",
268
+ "model.language_model.layers.2.linear_attn.in_proj_b",
269
+ "model.language_model.layers.2.linear_attn.norm",
270
+ "model.language_model.layers.2.mlp.gate",
271
+ "model.language_model.layers.2.mlp.shared_expert_gate",
272
+ "model.language_model.layers.2.post_attention_layernorm",
273
+ "model.language_model.layers.20.input_layernorm",
274
+ "model.language_model.layers.20.linear_attn",
275
+ "model.language_model.layers.20.linear_attn.conv1d",
276
+ "model.language_model.layers.20.linear_attn.in_proj_a",
277
+ "model.language_model.layers.20.linear_attn.in_proj_b",
278
+ "model.language_model.layers.20.linear_attn.norm",
279
+ "model.language_model.layers.20.mlp.gate",
280
+ "model.language_model.layers.20.mlp.shared_expert_gate",
281
+ "model.language_model.layers.20.post_attention_layernorm",
282
+ "model.language_model.layers.21.input_layernorm",
283
+ "model.language_model.layers.21.linear_attn",
284
+ "model.language_model.layers.21.linear_attn.conv1d",
285
+ "model.language_model.layers.21.linear_attn.in_proj_a",
286
+ "model.language_model.layers.21.linear_attn.in_proj_b",
287
+ "model.language_model.layers.21.linear_attn.norm",
288
+ "model.language_model.layers.21.mlp.gate",
289
+ "model.language_model.layers.21.mlp.shared_expert_gate",
290
+ "model.language_model.layers.21.post_attention_layernorm",
291
+ "model.language_model.layers.22.input_layernorm",
292
+ "model.language_model.layers.22.linear_attn",
293
+ "model.language_model.layers.22.linear_attn.conv1d",
294
+ "model.language_model.layers.22.linear_attn.in_proj_a",
295
+ "model.language_model.layers.22.linear_attn.in_proj_b",
296
+ "model.language_model.layers.22.linear_attn.norm",
297
+ "model.language_model.layers.22.mlp.gate",
298
+ "model.language_model.layers.22.mlp.shared_expert_gate",
299
+ "model.language_model.layers.22.post_attention_layernorm",
300
+ "model.language_model.layers.23.input_layernorm",
301
+ "model.language_model.layers.23.mlp.gate",
302
+ "model.language_model.layers.23.mlp.shared_expert_gate",
303
+ "model.language_model.layers.23.post_attention_layernorm",
304
+ "model.language_model.layers.23.self_attn.k_norm",
305
+ "model.language_model.layers.23.self_attn.q_norm",
306
+ "model.language_model.layers.24.input_layernorm",
307
+ "model.language_model.layers.24.linear_attn",
308
+ "model.language_model.layers.24.linear_attn.conv1d",
309
+ "model.language_model.layers.24.linear_attn.in_proj_a",
310
+ "model.language_model.layers.24.linear_attn.in_proj_b",
311
+ "model.language_model.layers.24.linear_attn.norm",
312
+ "model.language_model.layers.24.mlp.gate",
313
+ "model.language_model.layers.24.mlp.shared_expert_gate",
314
+ "model.language_model.layers.24.post_attention_layernorm",
315
+ "model.language_model.layers.25.input_layernorm",
316
+ "model.language_model.layers.25.linear_attn",
317
+ "model.language_model.layers.25.linear_attn.conv1d",
318
+ "model.language_model.layers.25.linear_attn.in_proj_a",
319
+ "model.language_model.layers.25.linear_attn.in_proj_b",
320
+ "model.language_model.layers.25.linear_attn.norm",
321
+ "model.language_model.layers.25.mlp.gate",
322
+ "model.language_model.layers.25.mlp.shared_expert_gate",
323
+ "model.language_model.layers.25.post_attention_layernorm",
324
+ "model.language_model.layers.26.input_layernorm",
325
+ "model.language_model.layers.26.linear_attn",
326
+ "model.language_model.layers.26.linear_attn.conv1d",
327
+ "model.language_model.layers.26.linear_attn.in_proj_a",
328
+ "model.language_model.layers.26.linear_attn.in_proj_b",
329
+ "model.language_model.layers.26.linear_attn.norm",
330
+ "model.language_model.layers.26.mlp.gate",
331
+ "model.language_model.layers.26.mlp.shared_expert_gate",
332
+ "model.language_model.layers.26.post_attention_layernorm",
333
+ "model.language_model.layers.27.input_layernorm",
334
+ "model.language_model.layers.27.mlp.gate",
335
+ "model.language_model.layers.27.mlp.shared_expert_gate",
336
+ "model.language_model.layers.27.post_attention_layernorm",
337
+ "model.language_model.layers.27.self_attn.k_norm",
338
+ "model.language_model.layers.27.self_attn.q_norm",
339
+ "model.language_model.layers.28.input_layernorm",
340
+ "model.language_model.layers.28.linear_attn",
341
+ "model.language_model.layers.28.linear_attn.conv1d",
342
+ "model.language_model.layers.28.linear_attn.in_proj_a",
343
+ "model.language_model.layers.28.linear_attn.in_proj_b",
344
+ "model.language_model.layers.28.linear_attn.norm",
345
+ "model.language_model.layers.28.mlp.gate",
346
+ "model.language_model.layers.28.mlp.shared_expert_gate",
347
+ "model.language_model.layers.28.post_attention_layernorm",
348
+ "model.language_model.layers.29.input_layernorm",
349
+ "model.language_model.layers.29.linear_attn",
350
+ "model.language_model.layers.29.linear_attn.conv1d",
351
+ "model.language_model.layers.29.linear_attn.in_proj_a",
352
+ "model.language_model.layers.29.linear_attn.in_proj_b",
353
+ "model.language_model.layers.29.linear_attn.norm",
354
+ "model.language_model.layers.29.mlp.gate",
355
+ "model.language_model.layers.29.mlp.shared_expert_gate",
356
+ "model.language_model.layers.29.post_attention_layernorm",
357
+ "model.language_model.layers.3.input_layernorm",
358
+ "model.language_model.layers.3.mlp.gate",
359
+ "model.language_model.layers.3.mlp.shared_expert_gate",
360
+ "model.language_model.layers.3.post_attention_layernorm",
361
+ "model.language_model.layers.3.self_attn.k_norm",
362
+ "model.language_model.layers.3.self_attn.q_norm",
363
+ "model.language_model.layers.30.input_layernorm",
364
+ "model.language_model.layers.30.linear_attn",
365
+ "model.language_model.layers.30.linear_attn.conv1d",
366
+ "model.language_model.layers.30.linear_attn.in_proj_a",
367
+ "model.language_model.layers.30.linear_attn.in_proj_b",
368
+ "model.language_model.layers.30.linear_attn.norm",
369
+ "model.language_model.layers.30.mlp.gate",
370
+ "model.language_model.layers.30.mlp.shared_expert_gate",
371
+ "model.language_model.layers.30.post_attention_layernorm",
372
+ "model.language_model.layers.31.input_layernorm",
373
+ "model.language_model.layers.31.mlp.gate",
374
+ "model.language_model.layers.31.mlp.shared_expert_gate",
375
+ "model.language_model.layers.31.post_attention_layernorm",
376
+ "model.language_model.layers.31.self_attn.k_norm",
377
+ "model.language_model.layers.31.self_attn.q_norm",
378
+ "model.language_model.layers.32.input_layernorm",
379
+ "model.language_model.layers.32.linear_attn",
380
+ "model.language_model.layers.32.linear_attn.conv1d",
381
+ "model.language_model.layers.32.linear_attn.in_proj_a",
382
+ "model.language_model.layers.32.linear_attn.in_proj_b",
383
+ "model.language_model.layers.32.linear_attn.norm",
384
+ "model.language_model.layers.32.mlp.gate",
385
+ "model.language_model.layers.32.mlp.shared_expert_gate",
386
+ "model.language_model.layers.32.post_attention_layernorm",
387
+ "model.language_model.layers.33.input_layernorm",
388
+ "model.language_model.layers.33.linear_attn",
389
+ "model.language_model.layers.33.linear_attn.conv1d",
390
+ "model.language_model.layers.33.linear_attn.in_proj_a",
391
+ "model.language_model.layers.33.linear_attn.in_proj_b",
392
+ "model.language_model.layers.33.linear_attn.norm",
393
+ "model.language_model.layers.33.mlp.gate",
394
+ "model.language_model.layers.33.mlp.shared_expert_gate",
395
+ "model.language_model.layers.33.post_attention_layernorm",
396
+ "model.language_model.layers.34.input_layernorm",
397
+ "model.language_model.layers.34.linear_attn",
398
+ "model.language_model.layers.34.linear_attn.conv1d",
399
+ "model.language_model.layers.34.linear_attn.in_proj_a",
400
+ "model.language_model.layers.34.linear_attn.in_proj_b",
401
+ "model.language_model.layers.34.linear_attn.norm",
402
+ "model.language_model.layers.34.mlp.gate",
403
+ "model.language_model.layers.34.mlp.shared_expert_gate",
404
+ "model.language_model.layers.34.post_attention_layernorm",
405
+ "model.language_model.layers.35.input_layernorm",
406
+ "model.language_model.layers.35.mlp.gate",
407
+ "model.language_model.layers.35.mlp.shared_expert_gate",
408
+ "model.language_model.layers.35.post_attention_layernorm",
409
+ "model.language_model.layers.35.self_attn.k_norm",
410
+ "model.language_model.layers.35.self_attn.q_norm",
411
+ "model.language_model.layers.36.input_layernorm",
412
+ "model.language_model.layers.36.linear_attn",
413
+ "model.language_model.layers.36.linear_attn.conv1d",
414
+ "model.language_model.layers.36.linear_attn.in_proj_a",
415
+ "model.language_model.layers.36.linear_attn.in_proj_b",
416
+ "model.language_model.layers.36.linear_attn.norm",
417
+ "model.language_model.layers.36.mlp.gate",
418
+ "model.language_model.layers.36.mlp.shared_expert_gate",
419
+ "model.language_model.layers.36.post_attention_layernorm",
420
+ "model.language_model.layers.37.input_layernorm",
421
+ "model.language_model.layers.37.linear_attn",
422
+ "model.language_model.layers.37.linear_attn.conv1d",
423
+ "model.language_model.layers.37.linear_attn.in_proj_a",
424
+ "model.language_model.layers.37.linear_attn.in_proj_b",
425
+ "model.language_model.layers.37.linear_attn.norm",
426
+ "model.language_model.layers.37.mlp.gate",
427
+ "model.language_model.layers.37.mlp.shared_expert_gate",
428
+ "model.language_model.layers.37.post_attention_layernorm",
429
+ "model.language_model.layers.38.input_layernorm",
430
+ "model.language_model.layers.38.linear_attn",
431
+ "model.language_model.layers.38.linear_attn.conv1d",
432
+ "model.language_model.layers.38.linear_attn.in_proj_a",
433
+ "model.language_model.layers.38.linear_attn.in_proj_b",
434
+ "model.language_model.layers.38.linear_attn.norm",
435
+ "model.language_model.layers.38.mlp.gate",
436
+ "model.language_model.layers.38.mlp.shared_expert_gate",
437
+ "model.language_model.layers.38.post_attention_layernorm",
438
+ "model.language_model.layers.39.input_layernorm",
439
+ "model.language_model.layers.39.mlp.gate",
440
+ "model.language_model.layers.39.mlp.shared_expert_gate",
441
+ "model.language_model.layers.39.post_attention_layernorm",
442
+ "model.language_model.layers.39.self_attn.k_norm",
443
+ "model.language_model.layers.39.self_attn.q_norm",
444
+ "model.language_model.layers.4.input_layernorm",
445
+ "model.language_model.layers.4.linear_attn",
446
+ "model.language_model.layers.4.linear_attn.conv1d",
447
+ "model.language_model.layers.4.linear_attn.in_proj_a",
448
+ "model.language_model.layers.4.linear_attn.in_proj_b",
449
+ "model.language_model.layers.4.linear_attn.norm",
450
+ "model.language_model.layers.4.mlp.gate",
451
+ "model.language_model.layers.4.mlp.shared_expert_gate",
452
+ "model.language_model.layers.4.post_attention_layernorm",
453
+ "model.language_model.layers.5.input_layernorm",
454
+ "model.language_model.layers.5.linear_attn",
455
+ "model.language_model.layers.5.linear_attn.conv1d",
456
+ "model.language_model.layers.5.linear_attn.in_proj_a",
457
+ "model.language_model.layers.5.linear_attn.in_proj_b",
458
+ "model.language_model.layers.5.linear_attn.norm",
459
+ "model.language_model.layers.5.mlp.gate",
460
+ "model.language_model.layers.5.mlp.shared_expert_gate",
461
+ "model.language_model.layers.5.post_attention_layernorm",
462
+ "model.language_model.layers.6.input_layernorm",
463
+ "model.language_model.layers.6.linear_attn",
464
+ "model.language_model.layers.6.linear_attn.conv1d",
465
+ "model.language_model.layers.6.linear_attn.in_proj_a",
466
+ "model.language_model.layers.6.linear_attn.in_proj_b",
467
+ "model.language_model.layers.6.linear_attn.norm",
468
+ "model.language_model.layers.6.mlp.gate",
469
+ "model.language_model.layers.6.mlp.shared_expert_gate",
470
+ "model.language_model.layers.6.post_attention_layernorm",
471
+ "model.language_model.layers.7.input_layernorm",
472
+ "model.language_model.layers.7.mlp.gate",
473
+ "model.language_model.layers.7.mlp.shared_expert_gate",
474
+ "model.language_model.layers.7.post_attention_layernorm",
475
+ "model.language_model.layers.7.self_attn.k_norm",
476
+ "model.language_model.layers.7.self_attn.q_norm",
477
+ "model.language_model.layers.8.input_layernorm",
478
+ "model.language_model.layers.8.linear_attn",
479
+ "model.language_model.layers.8.linear_attn.conv1d",
480
+ "model.language_model.layers.8.linear_attn.in_proj_a",
481
+ "model.language_model.layers.8.linear_attn.in_proj_b",
482
+ "model.language_model.layers.8.linear_attn.norm",
483
+ "model.language_model.layers.8.mlp.gate",
484
+ "model.language_model.layers.8.mlp.shared_expert_gate",
485
+ "model.language_model.layers.8.post_attention_layernorm",
486
+ "model.language_model.layers.9.input_layernorm",
487
+ "model.language_model.layers.9.linear_attn",
488
+ "model.language_model.layers.9.linear_attn.conv1d",
489
+ "model.language_model.layers.9.linear_attn.in_proj_a",
490
+ "model.language_model.layers.9.linear_attn.in_proj_b",
491
+ "model.language_model.layers.9.linear_attn.norm",
492
+ "model.language_model.layers.9.mlp.gate",
493
+ "model.language_model.layers.9.mlp.shared_expert_gate",
494
+ "model.language_model.layers.9.post_attention_layernorm",
495
+ "model.language_model.norm",
496
+ "model.visual.blocks.0.attn.proj",
497
+ "model.visual.blocks.0.attn.qkv",
498
+ "model.visual.blocks.0.mlp.linear_fc1",
499
+ "model.visual.blocks.0.mlp.linear_fc2",
500
+ "model.visual.blocks.0.norm1",
501
+ "model.visual.blocks.0.norm2",
502
+ "model.visual.blocks.1.attn.proj",
503
+ "model.visual.blocks.1.attn.qkv",
504
+ "model.visual.blocks.1.mlp.linear_fc1",
505
+ "model.visual.blocks.1.mlp.linear_fc2",
506
+ "model.visual.blocks.1.norm1",
507
+ "model.visual.blocks.1.norm2",
508
+ "model.visual.blocks.10.attn.proj",
509
+ "model.visual.blocks.10.attn.qkv",
510
+ "model.visual.blocks.10.mlp.linear_fc1",
511
+ "model.visual.blocks.10.mlp.linear_fc2",
512
+ "model.visual.blocks.10.norm1",
513
+ "model.visual.blocks.10.norm2",
514
+ "model.visual.blocks.11.attn.proj",
515
+ "model.visual.blocks.11.attn.qkv",
516
+ "model.visual.blocks.11.mlp.linear_fc1",
517
+ "model.visual.blocks.11.mlp.linear_fc2",
518
+ "model.visual.blocks.11.norm1",
519
+ "model.visual.blocks.11.norm2",
520
+ "model.visual.blocks.12.attn.proj",
521
+ "model.visual.blocks.12.attn.qkv",
522
+ "model.visual.blocks.12.mlp.linear_fc1",
523
+ "model.visual.blocks.12.mlp.linear_fc2",
524
+ "model.visual.blocks.12.norm1",
525
+ "model.visual.blocks.12.norm2",
526
+ "model.visual.blocks.13.attn.proj",
527
+ "model.visual.blocks.13.attn.qkv",
528
+ "model.visual.blocks.13.mlp.linear_fc1",
529
+ "model.visual.blocks.13.mlp.linear_fc2",
530
+ "model.visual.blocks.13.norm1",
531
+ "model.visual.blocks.13.norm2",
532
+ "model.visual.blocks.14.attn.proj",
533
+ "model.visual.blocks.14.attn.qkv",
534
+ "model.visual.blocks.14.mlp.linear_fc1",
535
+ "model.visual.blocks.14.mlp.linear_fc2",
536
+ "model.visual.blocks.14.norm1",
537
+ "model.visual.blocks.14.norm2",
538
+ "model.visual.blocks.15.attn.proj",
539
+ "model.visual.blocks.15.attn.qkv",
540
+ "model.visual.blocks.15.mlp.linear_fc1",
541
+ "model.visual.blocks.15.mlp.linear_fc2",
542
+ "model.visual.blocks.15.norm1",
543
+ "model.visual.blocks.15.norm2",
544
+ "model.visual.blocks.16.attn.proj",
545
+ "model.visual.blocks.16.attn.qkv",
546
+ "model.visual.blocks.16.mlp.linear_fc1",
547
+ "model.visual.blocks.16.mlp.linear_fc2",
548
+ "model.visual.blocks.16.norm1",
549
+ "model.visual.blocks.16.norm2",
550
+ "model.visual.blocks.17.attn.proj",
551
+ "model.visual.blocks.17.attn.qkv",
552
+ "model.visual.blocks.17.mlp.linear_fc1",
553
+ "model.visual.blocks.17.mlp.linear_fc2",
554
+ "model.visual.blocks.17.norm1",
555
+ "model.visual.blocks.17.norm2",
556
+ "model.visual.blocks.18.attn.proj",
557
+ "model.visual.blocks.18.attn.qkv",
558
+ "model.visual.blocks.18.mlp.linear_fc1",
559
+ "model.visual.blocks.18.mlp.linear_fc2",
560
+ "model.visual.blocks.18.norm1",
561
+ "model.visual.blocks.18.norm2",
562
+ "model.visual.blocks.19.attn.proj",
563
+ "model.visual.blocks.19.attn.qkv",
564
+ "model.visual.blocks.19.mlp.linear_fc1",
565
+ "model.visual.blocks.19.mlp.linear_fc2",
566
+ "model.visual.blocks.19.norm1",
567
+ "model.visual.blocks.19.norm2",
568
+ "model.visual.blocks.2.attn.proj",
569
+ "model.visual.blocks.2.attn.qkv",
570
+ "model.visual.blocks.2.mlp.linear_fc1",
571
+ "model.visual.blocks.2.mlp.linear_fc2",
572
+ "model.visual.blocks.2.norm1",
573
+ "model.visual.blocks.2.norm2",
574
+ "model.visual.blocks.20.attn.proj",
575
+ "model.visual.blocks.20.attn.qkv",
576
+ "model.visual.blocks.20.mlp.linear_fc1",
577
+ "model.visual.blocks.20.mlp.linear_fc2",
578
+ "model.visual.blocks.20.norm1",
579
+ "model.visual.blocks.20.norm2",
580
+ "model.visual.blocks.21.attn.proj",
581
+ "model.visual.blocks.21.attn.qkv",
582
+ "model.visual.blocks.21.mlp.linear_fc1",
583
+ "model.visual.blocks.21.mlp.linear_fc2",
584
+ "model.visual.blocks.21.norm1",
585
+ "model.visual.blocks.21.norm2",
586
+ "model.visual.blocks.22.attn.proj",
587
+ "model.visual.blocks.22.attn.qkv",
588
+ "model.visual.blocks.22.mlp.linear_fc1",
589
+ "model.visual.blocks.22.mlp.linear_fc2",
590
+ "model.visual.blocks.22.norm1",
591
+ "model.visual.blocks.22.norm2",
592
+ "model.visual.blocks.23.attn.proj",
593
+ "model.visual.blocks.23.attn.qkv",
594
+ "model.visual.blocks.23.mlp.linear_fc1",
595
+ "model.visual.blocks.23.mlp.linear_fc2",
596
+ "model.visual.blocks.23.norm1",
597
+ "model.visual.blocks.23.norm2",
598
+ "model.visual.blocks.24.attn.proj",
599
+ "model.visual.blocks.24.attn.qkv",
600
+ "model.visual.blocks.24.mlp.linear_fc1",
601
+ "model.visual.blocks.24.mlp.linear_fc2",
602
+ "model.visual.blocks.24.norm1",
603
+ "model.visual.blocks.24.norm2",
604
+ "model.visual.blocks.25.attn.proj",
605
+ "model.visual.blocks.25.attn.qkv",
606
+ "model.visual.blocks.25.mlp.linear_fc1",
607
+ "model.visual.blocks.25.mlp.linear_fc2",
608
+ "model.visual.blocks.25.norm1",
609
+ "model.visual.blocks.25.norm2",
610
+ "model.visual.blocks.26.attn.proj",
611
+ "model.visual.blocks.26.attn.qkv",
612
+ "model.visual.blocks.26.mlp.linear_fc1",
613
+ "model.visual.blocks.26.mlp.linear_fc2",
614
+ "model.visual.blocks.26.norm1",
615
+ "model.visual.blocks.26.norm2",
616
+ "model.visual.blocks.3.attn.proj",
617
+ "model.visual.blocks.3.attn.qkv",
618
+ "model.visual.blocks.3.mlp.linear_fc1",
619
+ "model.visual.blocks.3.mlp.linear_fc2",
620
+ "model.visual.blocks.3.norm1",
621
+ "model.visual.blocks.3.norm2",
622
+ "model.visual.blocks.4.attn.proj",
623
+ "model.visual.blocks.4.attn.qkv",
624
+ "model.visual.blocks.4.mlp.linear_fc1",
625
+ "model.visual.blocks.4.mlp.linear_fc2",
626
+ "model.visual.blocks.4.norm1",
627
+ "model.visual.blocks.4.norm2",
628
+ "model.visual.blocks.5.attn.proj",
629
+ "model.visual.blocks.5.attn.qkv",
630
+ "model.visual.blocks.5.mlp.linear_fc1",
631
+ "model.visual.blocks.5.mlp.linear_fc2",
632
+ "model.visual.blocks.5.norm1",
633
+ "model.visual.blocks.5.norm2",
634
+ "model.visual.blocks.6.attn.proj",
635
+ "model.visual.blocks.6.attn.qkv",
636
+ "model.visual.blocks.6.mlp.linear_fc1",
637
+ "model.visual.blocks.6.mlp.linear_fc2",
638
+ "model.visual.blocks.6.norm1",
639
+ "model.visual.blocks.6.norm2",
640
+ "model.visual.blocks.7.attn.proj",
641
+ "model.visual.blocks.7.attn.qkv",
642
+ "model.visual.blocks.7.mlp.linear_fc1",
643
+ "model.visual.blocks.7.mlp.linear_fc2",
644
+ "model.visual.blocks.7.norm1",
645
+ "model.visual.blocks.7.norm2",
646
+ "model.visual.blocks.8.attn.proj",
647
+ "model.visual.blocks.8.attn.qkv",
648
+ "model.visual.blocks.8.mlp.linear_fc1",
649
+ "model.visual.blocks.8.mlp.linear_fc2",
650
+ "model.visual.blocks.8.norm1",
651
+ "model.visual.blocks.8.norm2",
652
+ "model.visual.blocks.9.attn.proj",
653
+ "model.visual.blocks.9.attn.qkv",
654
+ "model.visual.blocks.9.mlp.linear_fc1",
655
+ "model.visual.blocks.9.mlp.linear_fc2",
656
+ "model.visual.blocks.9.norm1",
657
+ "model.visual.blocks.9.norm2",
658
+ "model.visual.merger.linear_fc1",
659
+ "model.visual.merger.linear_fc2",
660
+ "model.visual.merger.norm",
661
+ "model.visual.patch_embed.proj",
662
+ "model.visual.pos_embed",
663
+ "mtp.fc",
664
+ "mtp.layers.0.input_layernorm",
665
+ "mtp.layers.0.mlp.gate",
666
+ "mtp.layers.0.mlp.shared_expert_gate",
667
+ "mtp.layers.0.post_attention_layernorm",
668
+ "mtp.layers.0.self_attn.k_norm",
669
+ "mtp.layers.0.self_attn.q_norm",
670
+ "mtp.norm",
671
+ "mtp.pre_fc_norm_embedding",
672
+ "mtp.pre_fc_norm_hidden"
673
+ ]
674
+ }
675
+ }
configuration_interns2_preview.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/interns2_preview/modular_interns2_preview.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_interns2_preview.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # Copyright 2026 HuggingFace Inc. team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
21
+ from transformers.modeling_rope_utils import RopeParameters
22
+
23
+
24
+ class InternS2PreviewVisionConfig(PreTrainedConfig):
25
+ model_type = "intern_s2_preview"
26
+ base_config_key = "vision_config"
27
+
28
+ def __init__(
29
+ self,
30
+ depth=27,
31
+ hidden_size=1152,
32
+ hidden_act="gelu_pytorch_tanh",
33
+ intermediate_size=4304,
34
+ num_heads=16,
35
+ in_channels=3,
36
+ patch_size=16,
37
+ spatial_merge_size=2,
38
+ temporal_patch_size=2,
39
+ out_hidden_size=3584,
40
+ num_position_embeddings=2304,
41
+ initializer_range=0.02,
42
+ **kwargs,
43
+ ):
44
+ super().__init__(**kwargs)
45
+
46
+ self.depth = depth
47
+ self.hidden_size = hidden_size
48
+ self.hidden_act = hidden_act
49
+ self.intermediate_size = intermediate_size
50
+ self.num_heads = num_heads
51
+ self.in_channels = in_channels
52
+ self.patch_size = patch_size
53
+ self.spatial_merge_size = spatial_merge_size
54
+ self.temporal_patch_size = temporal_patch_size
55
+ self.out_hidden_size = out_hidden_size
56
+ self.num_position_embeddings = num_position_embeddings
57
+ self.initializer_range = initializer_range
58
+
59
+
60
+ class InternS2PreviewTextConfig(PreTrainedConfig):
61
+ r"""
62
+ This is the configuration class to store the configuration of an [`InternS2PreviewTextModel`]. It is used to instantiate a
63
+ Qwen3.5-MoE model according to the specified arguments, defining the model architecture.
64
+ Instantiating a configuration with the defaults will yield a similar configuration to that of
65
+ Qwen3.5-35B-A3B-Instruct [Qwen/Qwen3.5-35B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-Instruct).
66
+
67
+ Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
68
+ documentation from [`PreTrainedConfig`] for more information.
69
+
70
+
71
+ Args:
72
+ vocab_size (`int`, *optional*, defaults to 248320):
73
+ Vocabulary size of the model. Defines the number of different tokens that can be represented by the
74
+ `input_ids`.
75
+ hidden_size (`int`, *optional*, defaults to 2048):
76
+ Dimension of the hidden representations.
77
+ num_hidden_layers (`int`, *optional*, defaults to 40):
78
+ Number of hidden layers in the Transformer decoder.
79
+ num_attention_heads (`int`, *optional*, defaults to 16):
80
+ Number of attention heads for each attention layer in the Transformer decoder.
81
+ num_key_value_heads (`int`, *optional*, defaults to 2):
82
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
83
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
84
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
85
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
86
+ by meanpooling all the original heads within that group. For more details checkout [this
87
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `2`.
88
+ hidden_act (`str`, *optional*, defaults to `"silu"`):
89
+ The non-linear activation function in the decoder.
90
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
91
+ The maximum sequence length that this model might ever be used with.
92
+ initializer_range (`float`, *optional*, defaults to 0.02):
93
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
94
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
95
+ The epsilon used by the rms normalization layers.
96
+ use_cache (`bool`, *optional*, defaults to `True`):
97
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
98
+ relevant if `config.is_decoder=True`.
99
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
100
+ Whether the model's input and output word embeddings should be tied.
101
+ rope_parameters (`RopeParameters`, *optional*):
102
+ Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
103
+ a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
104
+ with longer `max_position_embeddings`.
105
+ attention_bias (`bool`, *optional*, defaults to `False`):
106
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
107
+ attention_dropout (`float`, *optional*, defaults to 0.0):
108
+ The dropout ratio for the attention probabilities.
109
+ head_dim (`int`, *optional*, defaults to 256):
110
+ Projection weights dimension in multi-head attention.
111
+ linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
112
+ Kernel size of the convolution used in linear attention layers.
113
+ linear_key_head_dim (`int`, *optional*, defaults to 128):
114
+ Dimension of each key head in linear attention.
115
+ linear_value_head_dim (`int`, *optional*, defaults to 128):
116
+ Dimension of each value head in linear attention.
117
+ linear_num_key_heads (`int`, *optional*, defaults to 16):
118
+ Number of key heads used in linear attention layers.
119
+ linear_num_value_heads (`int`, *optional*, defaults to 32):
120
+ Number of value heads used in linear attention layers.
121
+ moe_intermediate_size (`int`, *optional*, defaults to 512):
122
+ Intermediate size of the routed expert.
123
+ shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
124
+ Intermediate size of the shared expert.
125
+ num_experts_per_tok (`int`, *optional*, defaults to 8):
126
+ Number of selected experts.
127
+ num_experts (`int`, *optional*, defaults to 256):
128
+ Number of routed experts.
129
+ output_router_logits (`bool`, *optional*, defaults to `False`):
130
+ Whether or not the router logits should be returned by the model. Enabling this will also
131
+ allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
132
+ router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
133
+ The aux loss factor for the total loss.
134
+ layer_types (`list[str]`, *optional*):
135
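+ Type of each layer, either `"linear_attention"` or `"full_attention"`. If not provided, every `full_attention_interval`-th layer (default 4) is `"full_attention"` and all others are `"linear_attention"`.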
136
+ pad_token_id (`int`, *optional*):
137
+ Padding token id.
138
+ bos_token_id (`int`, *optional*):
139
+ Beginning of stream token id.
140
+ eos_token_id (`int`, *optional*):
141
+ End of stream token id.
142
+
143
+ ```python
144
+ >>> from transformers import InternS2PreviewTextModel, InternS2PreviewTextConfig
145
+
146
+ >>> # Initializing a Qwen3.5-MoE style configuration
147
+ >>> configuration = InternS2PreviewTextConfig()
148
+
149
+ >>> # Initializing a model from the Qwen3.5-35B-A3B style configuration
150
+ >>> model = InternS2PreviewTextModel(configuration)
151
+
152
+ >>> # Accessing the model configuration
153
+ >>> configuration = model.config
154
+ ```
155
+ """
156
+
157
+ # NOTE: `model_type` is kept as `qwen3_5_moe_text` because transformers hardcodes weight-renaming logic keyed
158
+ # on model_type (e.g. `model_dtype`); reusing the parent's value ensures correct weight loading via
159
+ # `AutoModelForCausalLM.from_pretrained`.
160
+ model_type = "qwen3_5_moe_text"
161
+ keys_to_ignore_at_inference = ["past_key_values"]
162
+
163
+ base_model_tp_plan = {
164
+ "layers.*.self_attn.q_proj": "colwise",
165
+ "layers.*.self_attn.k_proj": "colwise",
166
+ "layers.*.self_attn.v_proj": "colwise",
167
+ "layers.*.self_attn.o_proj": "rowwise",
168
+ "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
169
+ "layers.*.mlp.experts.down_proj": "rowwise",
170
+ "layers.*.mlp.shared_expert.gate_proj": "colwise",
171
+ "layers.*.mlp.shared_expert.up_proj": "colwise",
172
+ "layers.*.mlp.shared_expert.down_proj": "rowwise",
173
+ }
174
+ base_model_pp_plan = {
175
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
176
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
177
+ "norm": (["hidden_states"], ["hidden_states"]),
178
+ }
179
+ base_config_key = "text_config"
180
+
181
+ def __init__(
182
+ self,
183
+ vocab_size=248320,
184
+ hidden_size=2048,
185
+ num_hidden_layers=40,
186
+ num_attention_heads=16,
187
+ num_key_value_heads=2,
188
+ hidden_act="silu",
189
+ max_position_embeddings=32768,
190
+ initializer_range=0.02,
191
+ rms_norm_eps=1e-6,
192
+ use_cache=True,
193
+ tie_word_embeddings=False,
194
+ rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
195
+ attention_bias=False,
196
+ attention_dropout=0.0,
197
+ head_dim=256,
198
+ linear_conv_kernel_dim=4,
199
+ linear_key_head_dim=128,
200
+ linear_value_head_dim=128,
201
+ linear_num_key_heads=16,
202
+ linear_num_value_heads=32,
203
+ moe_intermediate_size=512,
204
+ shared_expert_intermediate_size=512,
205
+ num_experts_per_tok=8,
206
+ num_experts=256,
207
+ output_router_logits=False,
208
+ router_aux_loss_coef=0.001,
209
+ layer_types=None,
210
+ pad_token_id: int | None = None,
211
+ bos_token_id: int | None = None,
212
+ eos_token_id: int | None = None,
213
+ **kwargs,
214
+ ):
215
+ kwargs["ignore_keys_at_rope_validation"] = {"mrope_section", "mrope_interleaved"}
216
+ self.pad_token_id = pad_token_id
217
+ self.bos_token_id = bos_token_id
218
+ self.eos_token_id = eos_token_id
219
+ self.tie_word_embeddings = tie_word_embeddings
220
+ self.vocab_size = vocab_size
221
+ self.max_position_embeddings = max_position_embeddings
222
+ self.hidden_size = hidden_size
223
+ self.num_hidden_layers = num_hidden_layers
224
+ self.num_attention_heads = num_attention_heads
225
+ self.num_key_value_heads = num_key_value_heads
226
+ self.hidden_act = hidden_act
227
+ self.initializer_range = initializer_range
228
+ self.rms_norm_eps = rms_norm_eps
229
+ self.use_cache = use_cache
230
+ self.attention_bias = attention_bias
231
+ self.attention_dropout = attention_dropout
232
+ self.head_dim = head_dim
233
+ self.rope_parameters = rope_parameters
234
+ kwargs.setdefault("partial_rotary_factor", 0.25) # assign default for BC
235
+
236
+ self.layer_types = layer_types
237
+ if self.layer_types is None:
238
+ interval_pattern = kwargs.get("full_attention_interval", 4)
239
+ self.layer_types = [
240
+ "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
241
+ for i in range(self.num_hidden_layers)
242
+ ]
243
+ layer_type_validation(self.layer_types, self.num_hidden_layers)
244
+
245
+ # linear attention part
246
+ self.linear_conv_kernel_dim = linear_conv_kernel_dim
247
+ self.linear_key_head_dim = linear_key_head_dim
248
+ self.linear_value_head_dim = linear_value_head_dim
249
+ self.linear_num_key_heads = linear_num_key_heads
250
+ self.linear_num_value_heads = linear_num_value_heads
251
+ self.moe_intermediate_size = moe_intermediate_size
252
+ self.shared_expert_intermediate_size = shared_expert_intermediate_size
253
+ self.num_experts_per_tok = num_experts_per_tok
254
+ self.num_experts = num_experts
255
+ self.output_router_logits = output_router_logits
256
+ self.router_aux_loss_coef = router_aux_loss_coef
257
+ super().__init__(**kwargs)
258
+
259
+
260
class InternS2PreviewTimeSeriesConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternS2PreviewTimeSeriesModel`]. It is used to
    instantiate a InternS2PreviewTimeSeries model according to the specified arguments, defining the model architecture.

    The encoder hyper-parameters below use Whisper-style names. Their exact semantics are defined by the modeling
    code, which is not part of this file -- NOTE(review): confirm against `modeling_interns2_preview.py`.

    Args:
        activation_dropout (`float`, *optional*, defaults to 0.0):
            Stored as `activation_dropout`.
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Stored as `activation_function`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Stored as `attention_dropout`.
        d_model (`int`, *optional*, defaults to 768):
            Stored as `d_model`.
        dropout (`float`, *optional*, defaults to 0.0):
            Stored as `dropout`.
        encoder_attention_heads (`int`, *optional*, defaults to 8):
            Stored as `encoder_attention_heads`.
        encoder_ffn_dim (`int`, *optional*, defaults to 3072):
            Stored as `encoder_ffn_dim`.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            Stored as `encoder_layerdrop`.
        encoder_layers (`int`, *optional*, defaults to 17):
            Stored as `encoder_layers`.
        max_source_positions (`int`, *optional*, defaults to 1500):
            Stored as `max_source_positions`.
        num_mel_bins (`int`, *optional*, defaults to 80):
            Stored as `num_mel_bins`.
        out_hidden_size (`int`, *optional*, defaults to 2048):
            Stored as `out_hidden_size`.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Stored as `scale_embedding`.
        ts_adapt_in_dim (`int`, *optional*, defaults to 256):
            The input dimension of the time series adapter.
        ts_adapt_out_dim (`int`, *optional*, defaults to 1024):
            The output dimension of the time series adapter. Must be equal to `ts_hidden_dim`.
        ts_hidden_dim (`int`, *optional*, defaults to 1024):
            The hidden dimension of the time series model.
        **super_kwargs:
            Additional keyword arguments passed along to the base class `PreTrainedConfig`.

    Raises:
        AssertionError: If `ts_adapt_out_dim` differs from `ts_hidden_dim`.
    """

    model_type = "interns2_preview_time_series"
    base_config_key = "ts_config"

    def __init__(
        self,
        activation_dropout: float = 0.0,
        activation_function: str = "gelu",
        attention_dropout: float = 0.0,
        d_model: int = 768,
        dropout: float = 0.0,
        encoder_attention_heads: int = 8,
        encoder_ffn_dim: int = 3072,
        encoder_layerdrop: float = 0.0,
        encoder_layers: int = 17,
        max_source_positions: int = 1500,
        num_mel_bins: int = 80,
        out_hidden_size: int = 2048,
        scale_embedding: bool = False,
        ts_adapt_in_dim: int = 256,
        ts_adapt_out_dim: int = 1024,
        ts_hidden_dim: int = 1024,
        **super_kwargs,
    ):
        super().__init__(**super_kwargs)

        # Always point the auto classes at the remote-code modules shipped with the checkpoint.
        self.auto_map = {
            "AutoConfig": "configuration_interns2_preview.InternS2PreviewTimeSeriesConfig",
            "AutoModel": "modeling_interns2_preview.InternS2PreviewTimeSeriesModel",
        }
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.attention_dropout = attention_dropout
        self.d_model = d_model
        self.dropout = dropout
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layerdrop = encoder_layerdrop
        self.encoder_layers = encoder_layers
        self.max_source_positions = max_source_positions
        self.num_mel_bins = num_mel_bins
        self.out_hidden_size = out_hidden_size
        self.scale_embedding = scale_embedding
        self.ts_adapt_in_dim = ts_adapt_in_dim
        self.ts_adapt_out_dim = ts_adapt_out_dim
        self.ts_hidden_dim = ts_hidden_dim

        # Explicit check instead of a bare `assert`, so the validation is not stripped under `python -O`.
        # The exception type stays `AssertionError` to remain backward compatible with the previous `assert`.
        if self.ts_adapt_out_dim != self.ts_hidden_dim:
            raise AssertionError("ts_adapt_out_dim should be equal to ts_hidden_dim")
335
+
336
+
337
class InternS2PreviewConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternS2PreviewModel`]. It is used to instantiate a
    Qwen3.5-MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen3.5-35B-A3B-Instruct [Qwen/Qwen3.5-35B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-Instruct).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `InternS2PreviewTextConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `InternS2PreviewVisionConfig`):
            The config object or dictionary of the vision backbone.
        image_token_id (`int`, *optional*, defaults to 248056):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 248057):
            The video token index to encode the video prompt.
        vision_start_token_id (`int`, *optional*, defaults to 248053):
            The start token index to encode the image prompt.
        vision_end_token_id (`int`, *optional*, defaults to 248054):
            The end token index to encode the image prompt.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie the word embeddings.
        ts_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `InternS2PreviewTimeSeriesConfig`):
            The config object or dictionary of the time series encoder.
        ts_token_id (`int`, *optional*, defaults to 248093):
            The time series placeholder token index (presumably the id of the processor's `<TS_CONTEXT>` token;
            confirm against the tokenizer).
        ts_start_id (`int`, *optional*, defaults to 248091):
            The token index marking the start of a time series span (presumably `<|ts|>`; confirm against the
            tokenizer).
        ts_end_id (`int`, *optional*, defaults to 248092):
            The token index marking the end of a time series span (presumably `<|/ts|>`; confirm against the
            tokenizer).

    ```python
    >>> from transformers import InternS2PreviewForConditionalGeneration, InternS2PreviewConfig

    >>> # Initializing a Qwen3.5-MoE style configuration
    >>> configuration = InternS2PreviewConfig()

    >>> # Initializing a model from the Qwen3.5-35B-A3B style configuration
    >>> model = InternS2PreviewForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "intern_s2_preview"
    sub_configs = {
        "vision_config": InternS2PreviewVisionConfig,
        "text_config": InternS2PreviewTextConfig,
        "ts_config": InternS2PreviewTimeSeriesConfig,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=248056,
        video_token_id=248057,
        vision_start_token_id=248053,
        vision_end_token_id=248054,
        tie_word_embeddings=False,
        ts_config=None,
        ts_token_id=248093,
        ts_start_id=248091,
        ts_end_id=248092,
        **kwargs,
    ):
        # Each sub-config may arrive as a dict (e.g. from `config.json`), as `None` (use the defaults), or as an
        # already-built config object. The last case used to be silently dropped, leaving the attribute unset.
        if isinstance(ts_config, dict):
            self.ts_config = self.sub_configs["ts_config"](**ts_config)
        elif ts_config is None:
            self.ts_config = self.sub_configs["ts_config"]()
        else:
            self.ts_config = ts_config

        self.ts_token_id = ts_token_id
        self.ts_start_id = ts_start_id
        self.ts_end_id = ts_end_id

        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            self.text_config = self.sub_configs["text_config"]()
        else:
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id
        self.tie_word_embeddings = tie_word_embeddings
        super().__init__(**kwargs)

        # Set after `super().__init__` so these values win over any `auto_map` / `architectures` passed in `kwargs`.
        self.auto_map = {
            "AutoConfig": "configuration_interns2_preview.InternS2PreviewConfig",
            "AutoModelForCausalLM": "modeling_interns2_preview.InternS2PreviewForCausalLM",
            "AutoModel": "modeling_interns2_preview.InternS2PreviewModel",
            "AutoModelForImageTextToText": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration",
            "AutoModelForMultimodalLM": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration",
        }
        self.architectures = ["InternS2PreviewForConditionalGeneration"]
432
+
433
+
434
# Public API of the configuration module. The time-series and vision configs are exported as well because
# they are referenced by `InternS2PreviewConfig.sub_configs` (and the time-series config by its own `auto_map`).
__all__ = [
    "InternS2PreviewConfig",
    "InternS2PreviewTextConfig",
    "InternS2PreviewTimeSeriesConfig",
    "InternS2PreviewVisionConfig",
]
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 248044,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 248046,
6
+ 248044
7
+ ],
8
+ "pad_token_id": 248044,
9
+ "temperature": 1.0,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.57.0.dev0"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:411ce6803b1825ad1851e7f458e19fd3fbfd44da79c2bf55d87063768f6c80fb
3
+ size 4011102104
model-00002-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f1a57e47ea36d5926acf1e121d7455d22244e3ee013487d9df38f6f56d8957f
3
+ size 4215610696
model-00003-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a528b83780fddd55d8510cbbf3585ae703bba1aa6216e32c941449f9c8c3f168
3
+ size 4173305288
model-00004-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:529e38f8c54102fc65c7e1049ee546127ef816de38b95c352f48aa340746f1b4
3
+ size 4242878584
model-00005-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6a23fa2b0a4e9bc713c93be269667b962422583aa0b6cfd05109abf79318179
3
+ size 4207197648
model-00006-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b724b333f9d922f7bff275772ead32345b28caf5219a3bbfeb228a54e5516e47
3
+ size 4215606640
model-00007-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4cff7c2282adda3319cf16e2ca3028987a64964efa076a3005b7966f846096c
3
+ size 4173305392
model-00008-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e31ee1978470119600616ef6c76134f899eaf73b40bbda0584d9f41646b2264a
3
+ size 4242878584
model-00009-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a217307099d5cd6dcc98c4c28e5133b8082c4f816a33bf9bea2f18fed94dec5
3
+ size 4294632712
model-00010-of-00010.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4aa79671edee0347642ce6bffb97e1e4b64b86cbd448ecb5d99076c918ea4136
3
+ size 737714464
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_interns2_preview.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "size": {
3
+ "longest_edge": 16777216,
4
+ "shortest_edge": 65536
5
+ },
6
+ "patch_size": 16,
7
+ "temporal_patch_size": 2,
8
+ "merge_size": 2,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "Qwen3VLProcessor",
20
+ "image_processor_type": "Qwen2VLImageProcessorFast",
21
+ "auto_map": {
22
+ "AutoProcessor": "processing_interns2_preview.InternS2PreviewProcessor"
23
+ }
24
+ }
processing_interns2_preview.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/interns2_preview/modular_interns2_preview.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_interns2_preview.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ import importlib
21
+ import os
22
+
23
+ import numpy as np
24
+
25
+ from transformers.feature_extraction_utils import BatchFeature
26
+ from transformers.image_utils import ImageInput
27
+ from transformers.processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
28
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
29
+ from transformers.utils import auto_docstring, logging
30
+ from transformers.video_utils import VideoInput
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
class InternS2PreviewProcessorKwargs(ProcessingKwargs, total=False):
    # Per-modality default keyword arguments; the processor hands this class to `_merge_kwargs` in `__call__`.
    #   text_kwargs        -> no padding, no token-type ids, no multimodal token-type ids by default
    #   videos_kwargs      -> video metadata is requested by default
    #   time_series_kwargs -> no defaults
    _defaults = dict(
        text_kwargs=dict(
            padding=False,
            return_token_type_ids=False,
            return_mm_token_type_ids=False,
        ),
        videos_kwargs=dict(return_metadata=True),
        time_series_kwargs=dict(),
    )
46
+
47
+
48
@auto_docstring
class InternS2PreviewProcessor(ProcessorMixin):
    # Processor that expands image / video / time-series placeholder tokens in the prompt to the number of
    # tokens each input occupies, then tokenizes the prompt and bundles all model inputs in a `BatchFeature`.
    # No class / `__init__` docstring is written by hand here; presumably `@auto_docstring` supplies them.

    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
        def token_text(attr, default):
            # Prefer the special-token string declared on the tokenizer; otherwise use the hard-coded default.
            return getattr(tokenizer, attr) if hasattr(tokenizer, attr) else default

        def token_id(attr, token):
            # Prefer the id stored on the tokenizer; otherwise look the token string up in the vocabulary.
            return getattr(tokenizer, attr, None) or tokenizer.convert_tokens_to_ids(token)

        self.image_token = token_text("image_token", "<|image_pad|>")
        self.video_token = token_text("video_token", "<|video_pad|>")
        self.image_token_id = token_id("image_token_id", self.image_token)
        self.video_token_id = token_id("video_token_id", self.video_token)
        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
        self.vision_start_token = token_text("vision_start_token", "<|vision_start|>")
        self.vision_end_token = token_text("vision_end_token", "<|vision_end|>")
        self.vision_start_token_id = token_id("vision_start_token_id", self.vision_start_token)
        self.vision_end_token_id = token_id("vision_end_token_id", self.vision_end_token)
        self.ts_token = token_text("ts_token", "<TS_CONTEXT>")
        self.ts_start_token = token_text("ts_start_token", "<|ts|>")
        self.ts_end_token = token_text("ts_end_token", "<|/ts|>")
        self.ts_start_token_id = token_id("ts_start_token_id", self.ts_start_token)
        self.ts_end_token_id = token_id("ts_end_token_id", self.ts_end_token)
        self.ts_token_id = token_id("ts_token_id", self.ts_token)

    @auto_docstring
    def __call__(
        self,
        images: ImageInput = None,
        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
        videos: VideoInput = None,
        time_series_paths: list[str] = None,
        time_series_sampling_rates: list[int] = None,
        **kwargs: Unpack[InternS2PreviewProcessorKwargs],
    ) -> BatchFeature:
        r"""
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **ts_values** -- List of time series values to be fed to a model. Returned when `time_series_paths` is not `None`.
            - **ts_sr** -- List of time series sampling rates to be fed to a model. Returned when `time_series_paths` is not `None`.
            - **ts_lens** -- List of time series lengths to be fed to a model. Returned when `time_series_paths` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            InternS2PreviewProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        if images is not None:
            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
            image_grid_thw = image_inputs["image_grid_thw"]
        else:
            image_inputs = {}
            image_grid_thw = None

        if videos is not None:
            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
            video_grid_thw = videos_inputs["video_grid_thw"]
            # If user has not requested video metadata, pop it
            if not kwargs.get("return_metadata"):
                video_metadata = videos_inputs.pop("video_metadata")
            else:
                video_metadata = videos_inputs["video_metadata"]
        else:
            videos_inputs = {}
            video_grid_thw = None

        if not isinstance(text, list):
            text = [text]

        text = text.copy()  # below lines change text in-place

        if time_series_paths is not None:
            # Explicit checks (not bare `assert`) so they survive `python -O`; the exception type is kept as
            # `AssertionError` for backward compatibility.
            if time_series_sampling_rates is None:
                raise AssertionError(
                    "If time_series_paths is provided, time_series_sampling_rates must also be provided."
                )
            if len(time_series_paths) != len(time_series_sampling_rates):
                raise AssertionError("The number of time series signals must match the number of sampling rates.")
            time_series_inputs = self.time_series_processor(
                ts_paths=time_series_paths, sampling_rates=time_series_sampling_rates
            )
            # `num_ts_tokens` is only needed to expand the prompt; it is not returned to the caller.
            num_ts_tokens = time_series_inputs.pop("num_ts_tokens")
            if len(num_ts_tokens) != len(text):
                raise AssertionError("The number of time series signals must match the number of text prompts.")
            wrapped_ts_token = f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}"
            for i in range(len(text)):
                if wrapped_ts_token in text[i]:
                    # Only the first wrapped placeholder is expanded.
                    ts_placeholder = self.ts_start_token + self.ts_token * num_ts_tokens[i] + self.ts_end_token
                    text[i] = text[i].replace(wrapped_ts_token, ts_placeholder, 1)
                elif self.ts_token in text[i]:
                    # Bare placeholder(s): every occurrence is expanded.
                    text[i] = text[i].replace(self.ts_token, self.ts_token * num_ts_tokens[i])
        else:
            time_series_inputs = {}

        if image_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                # Expand one image token at a time into a temporary marker so that already-expanded tokens are
                # not matched again by the `while` condition.
                while self.image_token in text[i]:
                    num_image_tokens = image_grid_thw[index].prod() // merge_length
                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        if video_grid_thw is not None:
            merge_length = self.video_processor.merge_size**2
            index = 0
            wrapped_video_token = f"{self.vision_start_token}{self.video_token}{self.vision_end_token}"
            for i in range(len(text)):
                while self.video_token in text[i]:
                    metadata = video_metadata[index]
                    if metadata.fps is None:
                        logger.warning_once(
                            "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
                            "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
                            "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
                        )
                        metadata.fps = 24 if metadata.fps is None else metadata.fps

                    # if timestamps are not provided, calculate them
                    curr_timestamp = self._calculate_timestamps(
                        metadata.frames_indices,
                        metadata.fps,
                        self.video_processor.temporal_patch_size,
                    )

                    # One "<t seconds>" header plus one vision span per temporal patch.
                    video_placeholder = ""
                    frame_seqlen = video_grid_thw[index][1:].prod() // merge_length
                    for frame_idx in range(video_grid_thw[index][0]):
                        curr_time = curr_timestamp[frame_idx]
                        video_placeholder += f"<{curr_time:.1f} seconds>"
                        video_placeholder += (
                            self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
                        )
                    if wrapped_video_token in text[i]:
                        text[i] = text[i].replace(wrapped_video_token, video_placeholder, 1)
                    else:
                        # vllm may input video token directly
                        text[i] = text[i].replace(self.video_token, video_placeholder, 1)
                    index += 1

                text[i] = text[i].replace("<|placeholder|>", self.video_token)

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video", "ts"])

        if return_mm_token_type_ids:
            # Only image tokens are marked with type id 1; everything else stays 0.
            array_ids = np.array(text_inputs["input_ids"])
            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
            mm_token_type_ids[array_ids == self.image_token_id] = 1
            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()

        return BatchFeature(
            data={**text_inputs, **image_inputs, **videos_inputs, **time_series_inputs}, tensor_type=return_tensors
        )

    def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
        """
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
            video_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (num_frames, height, width) per each video.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        """

        vision_data = {}
        if image_sizes is not None:
            # Merge into a fresh dict so the class-level `_defaults` are never mutated.
            images_kwargs = {**InternS2PreviewProcessorKwargs._defaults.get("images_kwargs", {}), **kwargs}
            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size

            num_image_patches = [
                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                for image_size in image_sizes
            ]
            num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})

        if video_sizes is not None:
            videos_kwargs = {**InternS2PreviewProcessorKwargs._defaults.get("videos_kwargs", {}), **kwargs}
            # Resolved here rather than reused from the image branch, which left `merge_size` undefined
            # (NameError) whenever only `video_sizes` was given.
            video_merge_size = videos_kwargs.get("merge_size", None) or self.video_processor.merge_size
            num_video_patches = [
                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
                for video_size in video_sizes
            ]
            num_video_tokens = [(num_patches // video_merge_size**2) for num_patches in num_video_patches]
            vision_data["num_video_tokens"] = num_video_tokens

        return MultiModalData(**vision_data)

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
    ):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def _calculate_timestamps(self, indices: list[int] | np.ndarray, video_fps: float, merge_size: int = 2):
        """
        Convert sampled frame indices into one timestamp (in seconds) per temporal patch.

        Args:
            indices (`list[int]` or `np.ndarray`):
                Indices of the sampled frames. The input is not modified.
            video_fps (`float`):
                Frames per second used to turn an index into a time.
            merge_size (`int`, *optional*, defaults to 2):
                Number of consecutive frames that form one temporal patch.

        Returns:
            `list[float]`: For each group of `merge_size` frames, the mean of the first and last frame time.
        """
        # Work on a private copy: padding below must not leak into the caller's list (e.g. video metadata).
        indices = list(indices) if isinstance(indices, list) else indices.tolist()
        remainder = len(indices) % merge_size
        if remainder != 0:
            # Repeat the last index so the frames split evenly into temporal patches.
            indices.extend([indices[-1]] * (merge_size - remainder))
        timestamps = [idx / video_fps for idx in indices]
        # @JJJYmmm frames are merged by self.merge_size,
        # so we need to average the timestamps between the first/last frame within the temporal patch
        return [(timestamps[i] + timestamps[i + merge_size - 1]) / 2 for i in range(0, len(timestamps), merge_size)]

    def time_series_preprocessor(self, conversation):
        """
        Collect time series file references from the user messages of one conversation or a batch of them.

        Args:
            conversation:
                A single conversation (list of message dicts) or a batch of conversations. Each message is read
                through `message["role"]` and `message["content"]`.

        Returns:
            `dict`: `{"time_series_paths": ..., "time_series_sampling_rates": ...}`. Each value is a list with one
            entry per `{"type": "time_series", "data": ...}` content item, or `None` when no such item was found.
            A missing `"sampling_rate"` is reported as `None`.
        """
        is_batched = (
            isinstance(conversation, (list, tuple))
            and len(conversation) > 0
            and (isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content"))
        )
        conversations = conversation if is_batched else [conversation]

        batch_time_series = []
        batch_time_series_metadata = []
        for single_conversation in conversations:
            for message in single_conversation:
                if message["role"] != "user":
                    continue
                content_items = message["content"]
                if isinstance(content_items, str):
                    # Plain-text message: it cannot carry a time series entry.
                    continue
                for content in content_items:
                    # Path and rate are taken from the SAME item so they can never get misaligned, even when
                    # another time series entry lacks a "data" field.
                    if content.get("type") == "time_series" and "data" in content:
                        batch_time_series.append(content["data"])
                        batch_time_series_metadata.append(content.get("sampling_rate", None))

        return {
            "time_series_paths": batch_time_series or None,
            "time_series_sampling_rates": batch_time_series_metadata or None,
        }

    def time_series_processor(
        self,
        ts_paths: list[str],
        sampling_rates: list[float],
        do_normalize=True,
        do_truncate=True,
        max_length: int = 240000,
    ) -> BatchFeature:
        """
        Load time series files and compute the per-series lengths, sampling rates and token counts.

        Args:
            ts_paths (`list[str]`):
                Paths to `.wav` / `.mp3` / `.flac` (read with `soundfile`), `.csv` (read with `pandas`, no header
                row) or `.npy` files.
            sampling_rates (`list[float]`):
                One sampling rate per path. For audio files the rate returned by `soundfile.read` replaces it.
                `None` or `0` falls back to `length / 4`.
            do_normalize (`bool`, *optional*, defaults to `True`):
                Whether to standardize each series along axis 0 (mean 0, std 1).
            do_truncate (`bool`, *optional*, defaults to `True`):
                Whether to keep only the first `max_length` steps.
            max_length (`int`, *optional*, defaults to 240000):
                Truncation length used when `do_truncate` is set (the default guards against OOM).

        Returns:
            [`BatchFeature`]: With keys `ts_values`, `ts_sr`, `ts_lens` and `num_ts_tokens`.

        Raises:
            AssertionError: If `ts_paths` and `sampling_rates` differ in length.
            ValueError: If a file extension is not supported.
        """
        if len(ts_paths) != len(sampling_rates):
            raise AssertionError("ts_paths and sampling_rates must have the same length")

        ts_values = []
        ts_sr = []
        ts_lens = []

        for ts_path, sr in zip(ts_paths, sampling_rates):
            ext = os.path.splitext(ts_path)[-1].lower()
            # Optional dependencies are imported lazily so that, e.g., `.npy` inputs need neither of them.
            if ext in [".wav", ".mp3", ".flac"]:
                sf = importlib.import_module("soundfile")
                ts_input, sr = sf.read(ts_path)  # ts_input: np.ndarray, shape [T] or [T, C]
            elif ext == ".csv":
                pd = importlib.import_module("pandas")
                df = pd.read_csv(ts_path, header=None)
                ts_input = df.values  # [T, C]
            elif ext == ".npy":
                ts_input = np.load(ts_path)  # [T, C]
            else:
                raise ValueError(f"Unsupported file format: {ext}")

            if not isinstance(ts_input, np.ndarray):
                ts_input = np.array(ts_input, dtype=np.float32)

            if do_normalize:
                # Statistics are computed on the full series, i.e. before any truncation.
                mean = ts_input.mean(axis=0, keepdims=True)
                std = ts_input.std(axis=0, keepdims=True)
                ts_input = (ts_input - mean) / (std + 1e-8)

            if do_truncate and len(ts_input) > max_length:
                ts_input = ts_input[:max_length]  # truncate (240k by default) to avoid oom

            if ts_input.ndim == 1:
                ts_input = ts_input[:, None]  # [T,C]

            ts_len = ts_input.shape[0]

            if sr is None or sr == 0:  # if no sr provided
                sr = ts_len / 4

            ts_values.append(ts_input)
            ts_sr.append(sr)
            ts_lens.append(ts_len)

        ts_lens = np.array(ts_lens)
        ts_sr = np.array(ts_sr)
        num_ts_tokens = self._get_num_ts_tokens(sampling_rates=ts_sr, ts_lens=ts_lens)
        return BatchFeature(
            data={"ts_values": ts_values, "ts_sr": ts_sr, "ts_lens": ts_lens, "num_ts_tokens": num_ts_tokens}
        )

    def _get_num_ts_tokens(self, sampling_rates, ts_lens):
        """
        Compute how many placeholder tokens each time series expands to.

        Args:
            sampling_rates (`np.ndarray`): Sampling rate per series.
            ts_lens (`np.ndarray`): Number of time steps per series.

        Returns:
            `list`: One token count per series.
        """
        # NOTE(review): this formula presumably mirrors the patching / subsampling done by the time series
        # encoder in the modeling file -- keep the two in sync.
        strides = np.floor(160 / ((1 + np.exp(-sampling_rates / 100)) ** 6))
        patch_sizes = strides * 2
        embed_lengths = (np.ceil((ts_lens - patch_sizes) / strides) + 1).astype(np.int64)
        num_ts_tokens = [(embed_length // 2 + 1) // 2 for embed_length in embed_lengths]
        return num_ts_tokens
421
+
422
+
423
+ __all__ = ["InternS2PreviewProcessor"]
special_tokens_map.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "audio_bos_token": "<|audio_start|>",
18
+ "audio_eos_token": "<|audio_end|>",
19
+ "audio_token": "<|audio_pad|>",
20
+ "bos_token": {
21
+ "content": "<|im_start|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "eos_token": {
28
+ "content": "<|im_end|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ },
34
+ "image_token": "<|image_pad|>",
35
+ "pad_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ },
42
+ "video_token": "<|video_pad|>",
43
+ "vision_bos_token": "<|vision_start|>",
44
+ "vision_eos_token": "<|vision_end|>"
45
+ }
tokenization_interns1.py ADDED
@@ -0,0 +1,1009 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 The Intern team and Shanghai AI Lab team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for InternS1."""
16
+
17
+ import json
18
+ import os
19
+ import unicodedata
20
+ from abc import ABC, abstractmethod
21
+ from typing import Optional, Union
22
+ from functools import lru_cache
23
+
24
+ import regex as re
25
+ import sentencepiece as spm
26
+
27
+ from transformers.tokenization_utils_base import AddedToken, TextInput
28
+ from transformers.utils import logging
29
+ from packaging import version
30
+ import transformers
31
+ if version.parse(transformers.__version__) >= version.parse("5.0.0"):
32
+ from transformers.tokenization_python import PreTrainedTokenizer
33
+ else:
34
+ from transformers.tokenization_utils import PreTrainedTokenizer
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+ try:
39
+ from rdkit import Chem, RDLogger
40
+
41
+ RDLogger.DisableLog("rdApp.error")
42
+ RDLogger.DisableLog("rdApp.*")
43
+ RDKIT_AVAILABLE = True
44
+ except ImportError:
45
+ logger.warning_once(
46
+ "If tokenization with SMILES formula is of necessity, please 'pip install RDKit' for better tokenization quality."
47
+ )
48
+ RDKIT_AVAILABLE = False
49
+
50
+ VOCAB_FILES_NAMES = {
51
+ "vocab_file": "vocab.json",
52
+ "merges_file": "merges.txt",
53
+ "sp_model_SMILES": "tokenizer_SMILES.model",
54
+ "sp_model_PROT": "tokenizer_PROT.model",
55
+ "sp_model_XNA": "tokenizer_XNA.model",
56
+ }
57
+
58
+ PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
59
+
60
+
61
class InternS1CheckModuleMixin(ABC):
    """
    Base class for modules that auto-detect domain-specific spans inside plain text.

    Subclasses provide the candidate regex (`_build_regex`) and a validity check
    (`check_legitimacy`). Candidates shorter than `min_length` are never wrapped.
    """

    def __init__(self, *, min_length: int):
        self.min_length = min_length
        # `_build_regex` may read `self.min_length`, so it runs only after the assignment above.
        self.REGEX = self._build_regex()
        self.all_auto_detect_token_start = ["<SMILES_AUTO_DETECT>", "<PROT_AUTO_DETECT>", "<XNA_AUTO_DETECT>"]
        self.all_auto_detect_token_end = ["</SMILES_AUTO_DETECT>", "</PROT_AUTO_DETECT>", "</XNA_AUTO_DETECT>"]
        # Subclasses overwrite this with their own [opening, closing] marker pair.
        self.auto_detect_token = []
        # When False, a candidate that touches an ASCII letter on either side is rejected.
        self.truncation = False

    @abstractmethod
    def _build_regex(self):
        """Return a compiled pattern whose group 1 captures a candidate span."""

    @abstractmethod
    def check_legitimacy(self, candidate: str) -> bool:
        """Return True when `candidate` should be wrapped in the auto-detect markers."""

    def _wrap_candidates(self, text: str) -> list[str]:
        """Split one string into plain segments and marker-wrapped accepted candidates."""
        segments = []
        cursor = 0
        for found in self.REGEX.finditer(text):
            candidate = found.group(1)
            if len(candidate) < self.min_length:
                continue

            begin, end = found.span(1)
            if not self.check_legitimacy(candidate):
                continue

            if not self.truncation:
                # bytes.isalpha() is True only for ASCII letters.
                if begin > 0 and text[begin - 1].encode("UTF-8").isalpha():
                    continue
                if end < len(text) and text[end].encode("UTF-8").isalpha():
                    continue

            if begin > cursor:
                segments.append(text[cursor:begin])
            segments.extend([self.auto_detect_token[0], candidate, self.auto_detect_token[1]])
            cursor = end

        if cursor < len(text):
            segments.append(text[cursor:])
        return segments

    def re_split(self, texts: Union[str, list[str]]) -> list[str]:
        """
        Wrap every accepted candidate found in `texts` with this module's auto-detect markers.

        Args:
            texts: A single string or a list of strings (typically the output of another module).

        Returns:
            A flat list of strings. Auto-detect markers already present in the input, and the
            pieces enclosed by them, are passed through unchanged.
        """
        pieces = [texts] if isinstance(texts, str) else texts

        output = []
        protected_depth = 0  # > 0 while inside a span already marked by some module
        for piece in pieces:
            if piece in self.all_auto_detect_token_start:
                protected_depth += 1
                output.append(piece)
            elif piece in self.all_auto_detect_token_end:
                protected_depth = max(0, protected_depth - 1)
                output.append(piece)
            elif protected_depth > 0:
                output.append(piece)
            else:
                output.extend(self._wrap_candidates(piece))
        return output
139
+
140
+
141
class XnaCheckModule(InternS1CheckModuleMixin):
    """
    Auto-detection module for nucleotide sequences.

    A candidate is any run of the letters A, T, C, G and U that is at least `min_length` long;
    every such run is accepted.
    """

    def __init__(self, *, min_length: int = 27):
        super().__init__(min_length=min_length)
        self.truncation = True
        self.auto_detect_token = ["<XNA_AUTO_DETECT>", "</XNA_AUTO_DETECT>"]

    def _build_regex(self):
        # Doubled braces are literal braces in the f-string, giving e.g. "([ATCGU]{27,})".
        return re.compile(rf"([ATCGU]{{{self.min_length},}})")

    def check_legitimacy(self, candidate: str):
        # The regex alone decides; there is no further validation.
        return True
157
+
158
+
159
class ProtCheckModule(InternS1CheckModuleMixin):
    """
    Auto-detection module for protein sequences.

    A candidate is any run of upper-case ASCII letters at least `min_length` long, unless it
    consists solely of the letters A, T, C, G and U.
    """

    def __init__(self, *, min_length: int = 27):
        super().__init__(min_length=min_length)
        self._xna_pattern = re.compile(r"^[ATCGU]+$")
        self.truncation = True
        self.auto_detect_token = ["<PROT_AUTO_DETECT>", "</PROT_AUTO_DETECT>"]

    def _build_regex(self):
        # Doubled braces are literal braces in the f-string, giving e.g. "([A-Z]{27,})".
        return re.compile(rf"([A-Z]{{{self.min_length},}})")

    def check_legitimacy(self, candidate: str):
        # Reject candidates made only of A/T/C/G/U letters.
        return self._xna_pattern.match(candidate) is None
178
+
179
+
180
+ # fmt: off
181
+ bonds = ["-", "=", "#", ":", "/", "\\", ".", "$"]
182
+ organic_symbols = ["B", "C", "N", "O", "P", "S", "F", "Cl", "Br", "I"]
183
+ other_allows = bonds + ["[", "]", "(", ")", ";"]
184
+ aromatic_symbols = ["b", "c", "n", "o", "s", "p"]
185
+ elements = [
186
+ "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
187
+ "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca",
188
+ "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
189
+ "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr",
190
+ "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
191
+ "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
192
+ "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
193
+ "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
194
+ "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th",
195
+ "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
196
+ "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
197
+ "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
198
+ ]
199
+ # fmt: on
200
+
201
+
202
class SmilesCheckModule(InternS1CheckModuleMixin):
    """
    SMILES molecular sequence auto-detection module.

    Candidates are found with a regex built from element symbols, bracket atoms and SMILES
    punctuation. Each candidate is then validated with RDKit when it is importable, and with
    the rule-based `check_rings_and_brackets` otherwise.
    """

    def __init__(self, *, min_length: int = 10):
        super().__init__(min_length=min_length)
        self.auto_detect_token = ["<SMILES_AUTO_DETECT>", "</SMILES_AUTO_DETECT>"]
        # Bracket contents containing 3+ consecutive letters other than "H" are rejected ...
        self._SQ_BRACKET_BAN_1 = re.compile(r"(?:[A-GI-Z]|[a-z]){3,}")
        # ... as are bracket contents containing a run of 4+ digits.
        self._SQ_BRACKET_BAN_2 = re.compile(r"\d{4,}")

    def _build_regex(self):
        """Compile the candidate pattern: at least 10 consecutive SMILES units."""
        # fmt: off
        _two_letter_elements = [
            'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'Ba', 'Be', 'Bh', 'Bi', 'Bk', 'Br', 'Ca', 'Cd',
            'Ce', 'Cf', 'Cl', 'Cm', 'Cn', 'Co', 'Cr', 'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'Fe',
            'Fl', 'Fm', 'Fr', 'Ga', 'Gd', 'Ge', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'In', 'Ir', 'Kr', 'La', 'Li',
            'Lr', 'Lu', 'Lv', 'Mc', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'Na', 'Nb', 'Nd', 'Ne', 'Nh', 'Ni', 'No',
            'Np', 'Og', 'Os', 'Pa', 'Pb', 'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg',
            'Rh', 'Rn', 'Ru', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Tc', 'Te', 'Th',
            'Ti', 'Tl', 'Tm', 'Ts', 'Xe', 'Yb', 'Zn', 'Zr'
        ]
        _single_letter_elements = [
            "B", "C", "F", "H", "I", "K", "N", "O", "P", "S", "U", "V", "W", "Y", 'b', 'c', 'n', 'o', 'p', 's'
        ]
        # fmt: on
        # Longest symbols first so that e.g. "Cl" is tried before "C" in the alternation.
        all_elements_sorted = sorted(_two_letter_elements + _single_letter_elements, key=lambda x: (-len(x), x))
        elements_pattern_str = "|".join(all_elements_sorted)

        bracket_atom_pattern_str = r"\[[^\]]+\]"
        other_single_chars_pattern_str = r"[\(\)\.=\-#@\d\$\%\*:\+\-\/\\]"
        smiles_unit_pattern = (
            r"(?:"
            + bracket_atom_pattern_str
            + r"|"
            + elements_pattern_str
            + r"|"
            + other_single_chars_pattern_str
            + r")"
        )
        # Atomic group: once a unit has matched, the engine does not backtrack into it.
        core_sequence_pattern = rf"(?>{smiles_unit_pattern}){{10,}}"
        # A candidate may neither start nor end with ":", "." or "=".
        constrained_core_sequence_pattern = rf"(?![:.=]){core_sequence_pattern}(?<![:.=])"

        final_regex_str = rf"({constrained_core_sequence_pattern})"

        COMPILED_REGEX = re.compile(final_regex_str)
        return COMPILED_REGEX

    def check_legitimacy_slow(self, candidate: str) -> bool:
        """Check legitimacy with RDKit"""
        # Require at least five ASCII letters (bytes.isalpha() is ASCII-only).
        if sum(1 for char in candidate if char.encode("UTF-8").isalpha()) < 5:
            return False

        return Chem.MolFromSmiles(candidate) is not None

    def check_legitimacy_fast(self, candidate: str) -> bool:
        """Check legitimacy with hard rules"""
        # Require at least five ASCII letters (bytes.isalpha() is ASCII-only).
        if sum(1 for char in candidate if char.encode("UTF-8").isalpha()) < 5:
            return False

        return bool(self.check_rings_and_brackets(candidate))

    def check_legitimacy(self, candidate: str) -> bool:
        """Validate `candidate` with RDKit when available, else with the rule-based check."""
        if RDKIT_AVAILABLE:
            return self.check_legitimacy_slow(candidate)
        else:
            return self.check_legitimacy_fast(candidate)

    def check_brackets(self, text):
        """
        Inspect the contents of every `[...]` group in `text`.

        Returns False if a group is empty or contains a parenthesis. Returns True as soon as a
        group starts with a known element or aromatic symbol, and also when the loop finishes.
        """
        matches = re.findall(r"\[([^\[\]]*)\]", text)
        for part in matches:
            if "(" in part or ")" in part:
                return False
            if len(part) == 0:
                return False
            # NOTE(review): this returns on the first group that starts with an element symbol,
            # so later groups are not inspected -- confirm this short-circuit is intended.
            if part[0] in elements or part[0] in aromatic_symbols or part[:2] in elements:
                return True
        return True

    def check_rings_and_brackets(self, text):
        """
        Rule-based SMILES plausibility check used when RDKit is not available.

        Scans `text` once and rejects it (returns False) when bonds are misplaced or doubled,
        square brackets are nested/unbalanced/suspicious, a ")" has no matching "(", a symbol
        outside square brackets is not an allowed atom or punctuation mark, or a ring-closure
        label is left open. Text without upper-case letters needs at least two ring digits.
        Anything that survives is handed to `check_brackets`.
        """
        allowed_symbols = organic_symbols + aromatic_symbols + other_allows

        rings = {}

        def toggle_ring(label):
            # A ring label opens on first sight, closes on the next, and may then be reused.
            rings[label] = "closed" if rings.get(label) == "unclosed" else "unclosed"

        left_sq_bracket, right_sq_bracket = 0, 0
        left_pt_bracket, right_pt_bracket = 0, 0
        all_lower = True
        digits_cnt = 0
        pos = 0
        while pos < len(text):
            step = 0
            c = text[pos]
            if ord(c) >= 65 and ord(c) <= 90:
                all_lower = False
            # A bond may not be the first or last character, nor directly follow another bond.
            if (pos == len(text) - 1 or pos == 0) and c in bonds:
                return False
            if pos > 0 and text[pos - 1] in bonds and text[pos] in bonds:
                return False
            if c == "[":
                step = 1
                left_sq_bracket += 1
                if left_sq_bracket > right_sq_bracket + 1:
                    return False
                if pos == len(text) - 1:
                    return False
                # Look for the closing bracket of THIS group: searching from the start of the
                # string would find an earlier "]" and produce an empty span for every group
                # after the first, silently skipping the checks below.
                closing = text.find("]", pos + 1)
                if closing == -1:
                    return False
                bracket_span = text[pos + 1 : closing]

                if self._SQ_BRACKET_BAN_1.search(bracket_span) or self._SQ_BRACKET_BAN_2.search(bracket_span):
                    return False

                matches = re.findall(r"\d+", bracket_span)
                if len(matches) > 2:
                    return False
            if c == "]":
                step = 1
                right_sq_bracket += 1
                if right_sq_bracket > left_sq_bracket:
                    return False

            if c == "(":
                step = 1
                left_pt_bracket += 1
            if c == ")":
                step = 1
                right_pt_bracket += 1
                if right_pt_bracket > left_pt_bracket:
                    return False

            # Ring labels and atom symbols are only interpreted outside square brackets.
            if left_sq_bracket == right_sq_bracket:
                if c.isdigit():
                    digits_cnt += 1
                    step = 1
                    # Digits belonging to a preceding "%nn" label are not single-digit labels.
                    if (
                        pos == 0
                        or (pos == 1 and text[pos - 1] != "%")
                        or (pos > 1 and text[pos - 1] != "%" and text[pos - 2] != "%")
                    ):
                        toggle_ring(c)
                if c == "%":
                    # "%" must be followed by exactly two digits forming the ring label.
                    if pos >= len(text) - 2 or not text[pos + 1].isdigit() or not text[pos + 2].isdigit():
                        return False
                    step = 3
                    digits_cnt += 1
                    toggle_ring(text[pos + 1 : pos + 3])
                if step == 0:
                    # Prefer a two-character symbol (e.g. "Cl") over a one-character one.
                    if pos < len(text) - 1 and text[pos : pos + 2] in allowed_symbols:
                        step = 2
                    elif c in allowed_symbols:
                        step = 1
                    else:
                        return False

            # Characters inside square brackets are skipped one at a time.
            if step == 0:
                step = 1
            pos += step

        if left_sq_bracket != right_sq_bracket or any(v == "unclosed" for v in rings.values()):
            return False
        if all_lower and digits_cnt < 2:
            return False
        return self.check_brackets(text)
389
+
390
+
391
@lru_cache
# Adapted from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
def bytes_to_unicode():
    """
    Build the byte -> unicode-character table used by byte-level BPE.

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the same code
    point. Every other byte is mapped, in increasing byte order, to the code points 256, 257, ...
    so that no byte is represented by one of the excluded characters. The result has exactly
    256 entries and is cached after the first call.
    """
    kept_as_is = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    table = {byte: chr(byte) for byte in kept_as_is}
    shifted = 0
    for byte in range(2**8):
        if byte not in table:
            table[byte] = chr(2**8 + shifted)
            shifted += 1
    return table
415
+
416
+
417
# Adapted from transformers.models.gpt2.tokenization_gpt2.get_pairs
def get_pairs(word):
    """
    Return the set of adjacent symbol pairs in a word.

    Args:
        word: A sequence of symbols (tuple, list or str); symbols may be variable-length strings.

    Returns:
        A set of `(left, right)` tuples, one per adjacent pair. Words with fewer than two
        symbols -- including an empty word, which previously raised `IndexError` -- yield an
        empty set.
    """
    return set(zip(word, word[1:]))
430
+
431
+
432
+ # @requires(backends=("sentencepiece",))
433
+ class InternS1Tokenizer(PreTrainedTokenizer):
434
+ """
435
+ Construct an InternS1 tokenizer. Based on byte-level Byte-Pair-Encoding.
436
+
437
+ Like GPT2Tokenizer, this tokenizer has been trained to treat spaces as parts of the tokens, so a word will
438
+ be encoded differently whether it is at the beginning of the sentence (without space) or not:
439
+
440
+ ```python
441
+ >>> from transformers import AutoTokenizer
442
+
443
+ >>> tokenizer = AutoTokenizer.from_pretrained("InternS1Tokenizer", trust_remote_code=True)
444
+ >>> tokenizer("Hello world")["input_ids"]
445
+ [9707, 1879]
446
+
447
+ >>> tokenizer(" Hello world")["input_ids"]
448
+ [21927, 1879]
449
+ ```
450
+ This is expected.
451
+
452
+ Include custom extension to support better domain-specific text tokenization, leveraging a separately trained tokenizer model.
453
+
454
+ ```python
455
+ >>> from transformers import AutoTokenizer
456
+
457
+ >>> tokenizer = AutoTokenizer.from_pretrained("InternS1Tokenizer", trust_remote_code=True)
458
+ >>> tokenizer.tokenize("Describe <SMILES>C1=CC=C(C=C1)C=O</SMILES> and CC1=CC=CC=C1C=O")
459
+ ["Describe ", "<SMILES>", "C1=CC=C(C=C1)C=O", "</SMILES>", " and ", "<SMILES_AUTO_DETECT>",
460
+ "CC1=CC=CC=C1C=O", "</SMILES_AUTO_DETECT>"]
461
+ >>> token_ids = tokenizer("Describe <SMILES>C1=CC=C(C=C1)C=O</SMILES> and CC1=CC=CC=C1C=O")["input_ids"]
462
+ >>> token_ids
463
+ [74785, 220, 151925, 151854, 151860, 151698, 151707, 151860, 151690, 151726, 151926, 323, 220, 151672, 151860, 151701, 151860, 151854, 151726]
464
+
465
+ >>> tokenizer.convert_ids_to_tokens(token_ids)
466
+ ['Describe', 'Ġ', '<SMILES>', 'C', '1', '=CC=C(', 'C=C', '1', ')C', '=O', '</SMILES>', 'Ġand', 'Ġ', 'CC', '1', '=CC=CC=C', '1', 'C', '=O']
467
+ ```
468
+
469
+ Users should refer to this superclass [`PreTrainedTokenizer`] for more information regarding those overloaded methods
470
+
471
+ Args:
472
+ vocab_file (`str`):
473
+ Path to the vocabulary file.
474
+ merges_file (`str`):
475
+ Path to the merges file.
476
+ errors (`str`, *optional*, defaults to `"replace"`):
477
+ Paradigm to follow when decoding bytes to UTF-8. See
478
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
479
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
480
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
481
+ token instead.
482
+ bos_token (`str`, *optional*):
483
+ The beginning of sequence token. Not applicable for this tokenizer.
484
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
485
+ The end of sequence token.
486
+ pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
487
+ The token used for padding, for example when batching sequences of different lengths.
488
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
489
+ Whether or not the model should cleanup the spaces that were added when splitting the input text during the
490
+ tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
491
+ split_special_tokens (`bool`, *optional*, defaults to `False`):
492
+ Whether or not the special tokens should be split during the tokenization process. The default behavior is
493
+ to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
494
+ ['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will give `['<',
495
+ '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
496
+ """
497
+
498
+ vocab_files_names = VOCAB_FILES_NAMES
499
+ model_input_names = ["input_ids", "attention_mask"]
500
+
501
def __init__(
    self,
    vocab_file,
    merges_file,
    errors="replace",
    unk_token="<|endoftext|>",
    bos_token=None,
    eos_token="<|endoftext|>",
    pad_token="<|endoftext|>",
    clean_up_tokenization_spaces=False,
    split_special_tokens=False,
    special_tokens_pattern="none",
    **kwargs,
):
    """
    Load the BPE vocabulary and merges, initialise the base tokenizer, then attach the
    domain-specific tokenizers.

    Args:
        vocab_file: Path to the JSON vocabulary (token -> id).
        merges_file: Path to the BPE merges file, one merge per line.
        errors: Error-handling scheme stored on the instance for byte decoding.
        unk_token, bos_token, eos_token, pad_token: Special tokens; plain strings are wrapped
            in `AddedToken` with no stripping and no normalization.
        clean_up_tokenization_spaces, split_special_tokens, special_tokens_pattern, **kwargs:
            Forwarded to the `PreTrainedTokenizer` constructor.
    """

    def _as_special(token):
        # Leave None and ready-made AddedToken instances untouched.
        if isinstance(token, str):
            return AddedToken(token, lstrip=False, rstrip=False, special=True, normalized=False)
        return token

    bos_token = _as_special(bos_token)
    eos_token = _as_special(eos_token)
    unk_token = _as_special(unk_token)
    pad_token = _as_special(pad_token)

    with open(vocab_file, encoding="utf-8") as vocab_handle:
        self.encoder = json.load(vocab_handle)
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_merges = []
    with open(merges_file, encoding="utf-8") as merges_handle:
        for i, line in enumerate(merges_handle):
            line = line.strip()
            # Skip the optional "#version:" header and blank lines.
            if (i == 0 and line.startswith("#version:")) or not line:
                continue
            bpe_merges.append(tuple(line.split()))
    # Earlier merges get lower ranks, i.e. higher priority in `bpe`.
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    # NOTE: the cache can grow without bound and will get really large for long running processes
    # (esp. for texts of language that do not use space between word, e.g. Chinese); technically
    # not a memory leak but appears as one.
    # GPT2Tokenizer has the same problem, so let's be consistent.
    self.cache = {}

    self.pat = re.compile(PRETOKENIZE_REGEX)

    if kwargs.get("add_prefix_space", False):
        # `__name__`, not `__name`: the latter is name-mangled inside a class body and raised
        # AttributeError instead of emitting this warning.
        logger.warning_once(
            f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
        )

    super().__init__(
        vocab_file=vocab_file,
        merges_file=merges_file,
        errors=errors,
        unk_token=unk_token,
        bos_token=bos_token,
        eos_token=eos_token,
        pad_token=pad_token,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        split_special_tokens=split_special_tokens,
        special_tokens_pattern=special_tokens_pattern,
        **kwargs,
    )

    # Runs after the base constructor because it reads `self.init_kwargs` and `self.tokens_trie`.
    self.prepare_extra_tokenizers(vocab_file)
578
+
579
@property
def vocab_size(self) -> int:
    """Number of entries currently held in `self.encoder`."""
    size = len(self.encoder)
    return size
582
+
583
# Adapted from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
def get_vocab(self):
    """Return a new dict merging `self.encoder` with `self.added_tokens_encoder` (added tokens win)."""
    vocab = dict(self.encoder)
    vocab.update(self.added_tokens_encoder)
    return vocab
586
+
587
# Adapted from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
def bpe(self, token):
    """
    Apply the learned BPE merges to a single pre-token.

    Returns the resulting symbols joined by single spaces. Results are memoised in `self.cache`,
    except for tokens that contain no symbol pair, which are returned unchanged and uncached.
    """
    try:
        return self.cache[token]
    except KeyError:
        pass

    symbols = tuple(token)
    candidates = get_pairs(symbols)
    if not candidates:
        return token

    unranked = float("inf")
    while True:
        # Pick the adjacent pair with the best (lowest) merge rank.
        best = min(candidates, key=lambda pair: self.bpe_ranks.get(pair, unranked))
        if best not in self.bpe_ranks:
            break

        left, right = best
        merged = []
        pos, size = 0, len(symbols)
        # Left-to-right scan that fuses every non-overlapping occurrence of (left, right).
        while pos < size:
            if pos + 1 < size and symbols[pos] == left and symbols[pos + 1] == right:
                merged.append(left + right)
                pos += 2
            else:
                merged.append(symbols[pos])
                pos += 1

        symbols = tuple(merged)
        if len(symbols) == 1:
            break
        candidates = get_pairs(symbols)

    result = " ".join(symbols)
    self.cache[token] = result
    return result
629
+
630
def prepare_extra_tokenizers(self, vocab_file: str) -> None:
    """
    Load the SentencePiece models stored next to `vocab_file` and build the lookup tables that
    steer domain-specific tokenization.

    Each model's id offset is read from `self.init_kwargs` ("offset_SMILES", "offset_PROT",
    "offset_XNA"). The pieces of every model are also merged into `self.decoder` and
    `self.encoder` at `piece id + offset`.
    """
    model_dir = os.path.dirname(vocab_file)

    # Load extra tokenizers with SentencePiece model
    for suffix in ("SMILES", "PROT", "XNA"):
        processor = spm.SentencePieceProcessor()
        setattr(self, f"sp_model_{suffix}", processor)
        processor.Load(os.path.join(model_dir, f"tokenizer_{suffix}.model"))
        processor.offset = self.init_kwargs[f"offset_{suffix}"]

    explicit_models = {
        "SMILES": self.sp_model_SMILES,
        "protein": self.sp_model_PROT,
        "dna": self.sp_model_XNA,
        "rna": self.sp_model_XNA,
    }
    detected_models = {
        "SMILES": self.sp_model_SMILES,
        "PROT": self.sp_model_PROT,
        "XNA": self.sp_model_XNA,
    }

    # Guiding tokens of domain-specific tokenization
    self.ex_begin_mapping = {f"<{tag}>": model for tag, model in explicit_models.items()}
    self.ex_end_mapping = {f"</{tag}>": model for tag, model in explicit_models.items()}
    # Transient markers for auto-detection, these tokens will not be assigned token ids
    self.ex_auto_begin_mapping = {f"<{tag}_AUTO_DETECT>": model for tag, model in detected_models.items()}
    self.ex_auto_end_mapping = {f"</{tag}_AUTO_DETECT>": model for tag, model in detected_models.items()}
    # Token markers to prevent unwanted auto-detection
    self.ex_protect_begin_tokens = ["<MOLFORMULA>"]
    self.ex_protect_end_tokens = ["</MOLFORMULA>"]
    # Convenience unions of the tables above
    self.ex_protect_tokens = self.ex_protect_begin_tokens + self.ex_protect_end_tokens
    self.ex_all_begin_mapping = {**self.ex_begin_mapping, **self.ex_auto_begin_mapping}
    self.ex_all_end_mapping = {**self.ex_end_mapping, **self.ex_auto_end_mapping}

    # Update encoder & decoder with extra tokenizers
    for suffix, model in detected_models.items():
        for piece_id in range(model.get_piece_size()):
            piece = model.id_to_piece(piece_id)
            global_id = piece_id + model.offset
            self.decoder[global_id] = piece
            # Not really used, only to fill holes in encoder, to keep methods like `add_tokens` working
            self.encoder[f"<|{suffix}_{piece}|>"] = global_id

    # protect-tokens should keep complete temporarily to guide later tokenization
    # it will be segmented later
    for protect_token in self.ex_protect_tokens:
        self.tokens_trie.add(protect_token)

    self._unk_token = "<unk>"  # Fall-back
    self.check_module_list = [SmilesCheckModule(), ProtCheckModule(), XnaCheckModule()]
700
+
701
def _pop_logical_sp_token(self, extra_tokenizer_stack: list, mapping_name: str) -> None:
    """
    Pop the active extra tokenizer when a closing marker is met.

    Logs a warning (once) when the popped tokenizer is not the one registered for
    `mapping_name` in `self.ex_all_end_mapping`, i.e. when markers are nested incorrectly.
    """
    popped = extra_tokenizer_stack.pop()
    expected = self.ex_all_end_mapping[mapping_name]
    if popped != expected:
        logger.warning_once(f"Encounter incorrect nesting of extra tokenizer: {expected} and {popped}")
        logger.warning_once("This may lead to unexpected behaviour of the tokenizer, please check your input.")
709
+
710
+ def tokenize(self, text: TextInput, **kwargs) -> list[str]:
711
+ """
712
+ Converts a string into a sequence of tokens, using the tokenizer.
713
+
714
+ It will switch to domain-specific tokenizer once encountering extra/logical sp tokens.
715
+
716
+ Args:
717
+ text: TextInput
718
+ """
719
+ split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
720
+
721
+ text, kwargs = self.prepare_for_tokenization(text, **kwargs)
722
+
723
+ if hasattr(self, "do_lower_case") and self.do_lower_case:
724
+ # convert non-special tokens to lowercase. Might be super slow as well?
725
+ escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
726
+ escaped_special_toks += [
727
+ re.escape(s_tok.content)
728
+ for s_tok in (self._added_tokens_decoder.values())
729
+ if not s_tok.special and s_tok.normalized
730
+ ]
731
+ pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
732
+ text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
733
+
734
+ if split_special_tokens:
735
+ no_split_token = []
736
+ tokens = [text]
737
+ else:
738
+ no_split_token = self._added_tokens_encoder.keys() # don't split on any of the added tokens
739
+ # "This is something<special_token_1> else"
740
+ tokens = self.tokens_trie.split(text)
741
+
742
+ # ["This is something", "<special_token_1>", " else"]
743
+ for i, token in enumerate(tokens):
744
+ if token in no_split_token:
745
+ tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None)
746
+ left = tokens[i - 1] if i > 0 else None
747
+ right = tokens[i + 1] if i < len(tokens) - 1 else None
748
+ if isinstance(tok_extended, AddedToken):
749
+ if tok_extended.rstrip and right:
750
+ # A bit counter-intuitive but we strip the left of the string
751
+ # since tok_extended.rstrip means the special token is eating all white spaces on its right
752
+ tokens[i + 1] = right.lstrip()
753
+ # Strip white spaces on the left
754
+ if tok_extended.lstrip and left:
755
+ tokens[i - 1] = left.rstrip() # Opposite here
756
+ if tok_extended.single_word and left and left[-1] != " ":
757
+ tokens[i - 1] += token
758
+ tokens[i] = ""
759
+ elif tok_extended.single_word and right and right[0] != " ":
760
+ tokens[i + 1] = token + tokens[i + 1]
761
+ tokens[i] = ""
762
+ else:
763
+ raise ValueError(
764
+ f"{tok_extended} cannot be tokenized because it was not properly added"
765
+ f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
766
+ )
767
+
768
+ # ["This is something", "<special_token_1>", "else"]
769
+ tokenized_text = []
770
+
771
+ # Codes for automatically detecting domain-specific content
772
+ # All parts that have been marked by domain-specific or protection tokens will not be subject to auto detection
773
+ # See transformers/tests/models/intern_s1/test_tokenization_intern_s1.py::test_auto_detection() for more details
774
+ new_tokens = []
775
+ not_split_flag = 0
776
+ for token in tokens:
777
+ if not token:
778
+ continue
779
+ if token in no_split_token or token in self.ex_protect_tokens:
780
+ new_tokens.append(token)
781
+ if token in self.ex_begin_mapping or token in self.ex_protect_begin_tokens:
782
+ not_split_flag += 1 # In case nested sp tokens
783
+ elif token in self.ex_end_mapping or token in self.ex_protect_end_tokens:
784
+ not_split_flag = max(0, not_split_flag - 1)
785
+ else:
786
+ if not_split_flag:
787
+ new_tokens.append(token)
788
+ else:
789
+ for check_module in self.check_module_list:
790
+ token = check_module.re_split(token)
791
+
792
+ new_tokens.extend(token)
793
+ tokens = new_tokens
794
+
795
+ # Use stack to maintain which tokenizer should be used, considering the possibility of nested extra tokenizer
796
+ extra_tokenizer_stack = []
797
+ for token in tokens:
798
+ # Need to skip eventual empty (fully stripped) tokens
799
+ if not token:
800
+ continue
801
+ # protect-tokens are not assigned token ids, should be segmented here
802
+ if token in self.ex_protect_tokens:
803
+ tokenized_text.extend(self._tokenize(token))
804
+ # push tokenizer to stack when encountering begin token
805
+ elif token in self.ex_all_begin_mapping:
806
+ tokenized_text.append(token)
807
+ extra_tokenizer_stack.append(self.ex_all_begin_mapping[token])
808
+ # pop tokenizer from stack when encountering end token
809
+ elif token in self.ex_all_end_mapping:
810
+ tokenized_text.append(token)
811
+ if extra_tokenizer_stack:
812
+ self._pop_logical_sp_token(extra_tokenizer_stack, token)
813
+ # other special tokens
814
+ elif token in no_split_token:
815
+ tokenized_text.append(token)
816
+ else:
817
+ tokenized_text.extend(self._tokenize(token, extra_tokenizer_stack=extra_tokenizer_stack))
818
+
819
+ # ["This", " is", " something", "<special_token_1>", "else"]
820
+ return tokenized_text
821
+
822
+ def _tokenize(self, text, **kwargs):
823
+ """
824
+ Modified from `transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize`.
825
+
826
+ This adaptation supports domain-specific tokenizers.
827
+ """
828
+ extra_tokenizer_stack = kwargs.pop("extra_tokenizer_stack", False)
829
+ if extra_tokenizer_stack:
830
+ tokenized_text = extra_tokenizer_stack[-1].encode(text, out_type=str)
831
+ tokenized_id = extra_tokenizer_stack[-1].encode(text, out_type=int)
832
+ final_tokenized_text = []
833
+ for text_piece, id_piece in zip(tokenized_text, tokenized_id):
834
+ if id_piece == 0:
835
+ final_tokenized_text.extend(self._bpe_tokenize(text_piece))
836
+ else:
837
+ final_tokenized_text.append(text_piece)
838
+ return final_tokenized_text
839
+ else:
840
+ return self._bpe_tokenize(text)
841
+
842
+ def _bpe_tokenize(self, text, **kwargs):
843
+ text = text.replace(
844
+ "▁", " "
845
+ ) # This discrepancy stems from differing whitespace treatment in SentencePiece versus BPE tokenization.
846
+ bpe_tokens = []
847
+ for token in re.findall(self.pat, text):
848
+ token = "".join(
849
+ self.byte_encoder[b] for b in token.encode("utf-8")
850
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
851
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
852
+ return bpe_tokens
853
+
854
def convert_tokens_to_ids(self, tokens: Union[str, list[str]]) -> Union[int, list[int]]:
    """
    Modified from `transformers.tokenization_utils.PreTrainedTokenizer.convert_tokens_to_ids`.

    Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using the
    vocabulary.

    This adaptation supports domain-specific tokenizers.

    Args:
        tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

    Returns:
        `int` or `List[int]`: The token id or list of token ids. `None` is returned when `tokens` is `None`.
    """
    if tokens is None:
        return None

    if isinstance(tokens, str):
        # A lone token is looked up without any domain-specific tokenizer context.
        return self._convert_token_to_id_with_added_voc(tokens)

    ids = []
    # Tokenizers selected by the begin tokens seen so far; handed to every per-token lookup.
    extra_tokenizer_stack = []

    for token in tokens:
        # Tokens listed in the auto begin/end mappings contribute no id to the output.
        if token not in self.ex_auto_begin_mapping and token not in self.ex_auto_end_mapping:
            ids.append(
                self._convert_token_to_id_with_added_voc(token, extra_tokenizer_stack=extra_tokenizer_stack)
            )
        # The stack is updated after the lookup, so a begin/end token itself is converted with the
        # stack state that was in effect before it.
        # NOTE(review): nesting was reconstructed from a flattened listing; confirm that this `if` is a
        # sibling of the one above (auto begin/end tokens must still push/pop the stack).
        if token in self.ex_all_begin_mapping:
            extra_tokenizer_stack.append(self.ex_all_begin_mapping[token])
        elif token in self.ex_all_end_mapping:
            # An end token with nothing on the stack is ignored.
            if extra_tokenizer_stack:
                self._pop_logical_sp_token(extra_tokenizer_stack, token)
    return ids
889
+
890
+ def _convert_token_to_id_with_added_voc(self, token, **kwargs):
891
+ """
892
+ Modified from `transformers.tokenization_utils.PreTrainedTokenzier._convert_token_to_id_with_added_voc`.
893
+
894
+ This adaptation supports domain-specific tokenizers.
895
+ """
896
+ if token is None:
897
+ return None
898
+
899
+ if token in self._added_tokens_encoder:
900
+ return self._added_tokens_encoder[token]
901
+ return self._convert_token_to_id(token, **kwargs)
902
+
903
+ def _convert_token_to_id(self, token, **kwargs):
904
+ """
905
+ Modified from `transformers.tokenization_utils.PreTrainedTokenzier._convert_token_to_id`.
906
+
907
+ Converts a token (str) in an id using the vocab.
908
+
909
+ Fall back to original tokenizer once OOV.
910
+ """
911
+ extra_tokenizer_stack = kwargs.pop("extra_tokenizer_stack", False)
912
+ if extra_tokenizer_stack:
913
+ token_id = extra_tokenizer_stack[-1].piece_to_id(token)
914
+ if token_id == extra_tokenizer_stack[-1].unk_id():
915
+ return self.encoder.get(token, self.encoder.get(self._unk_token))
916
+ else:
917
+ return token_id + extra_tokenizer_stack[-1].offset
918
+ else:
919
+ return self.encoder.get(token, self.encoder.get(self._unk_token))
920
+
921
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
def _convert_id_to_token(self, index):
    """Converts an index (integer) in a token (str) using the vocab."""
    # `.get` is called without a default; assuming `self.decoder` is a dict, an index that is
    # missing from it yields None instead of raising.
    return self.decoder.get(index)
925
+
926
def convert_tokens_to_string(self, tokens):
    """Joins a sequence of tokens (string) and decodes the result into a single string."""
    # "▁" is rewritten to "Ġ" because SentencePiece and BPE tokenization treat whitespace
    # differently; a raw newline is rewritten to "Ċ" before the per-character byte lookup.
    joined = "".join(tokens).replace("▁", "Ġ").replace("\n", "Ċ")
    raw_bytes = bytearray(self.byte_decoder[ch] for ch in joined)
    return raw_bytes.decode("utf-8", errors=self.errors)
935
+
936
def decode(
    self,
    token_ids,
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: Optional[bool] = False,
    spaces_between_special_tokens: bool = False,
    **kwargs,
) -> str:
    """
    Decode `token_ids` by delegating to the parent class's `decode`.

    This override performs no decoding work of its own; it exists to pin the defaults of the
    keyword arguments (all `False` here) and forwards every argument, including `**kwargs`,
    unchanged.
    """
    # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
    # and cannot be configured elsewhere, but it should default to False for InternS1Tokenizer
    return super().decode(
        token_ids,
        skip_special_tokens=skip_special_tokens,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        spaces_between_special_tokens=spaces_between_special_tokens,
        **kwargs,
    )
953
+
954
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
    """
    Modified from `transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary` to support saving custom extension.

    Writes five files into `save_directory`: the vocabulary JSON, the BPE merges file and the serialized
    SentencePiece models for SMILES, PROT and XNA.

    Args:
        save_directory (`str`): Existing directory that receives the files.
        filename_prefix (`str`, *optional*): When given, `"<prefix>-"` is prepended to every file name.

    Returns:
        The paths of the vocabulary file and of the merges file. When `save_directory` is not a
        directory, an error is logged and `None` is returned.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return

    prefix = filename_prefix + "-" if filename_prefix else ""

    def target(key):
        # Full path for the file registered under `key` in VOCAB_FILES_NAMES.
        return os.path.join(save_directory, prefix + VOCAB_FILES_NAMES[key])

    # Resolve every path up front, before anything is written.
    vocab_file = target("vocab_file")
    merge_file = target("merges_file")
    sp_model_targets = (
        (target("sp_model_SMILES"), "sp_model_SMILES"),
        (target("sp_model_PROT"), "sp_model_PROT"),
        (target("sp_model_XNA"), "sp_model_XNA"),
    )

    with open(vocab_file, "w", encoding="utf-8") as vocab_handle:
        vocab_handle.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

    expected_rank = 0
    with open(merge_file, "w", encoding="utf-8") as merges_handle:
        merges_handle.write("#version: 0.2\n")
        for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
            if token_index != expected_rank:
                # A gap in the ranks is reported, then counting resumes from the rank actually found.
                logger.warning(
                    f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                    " Please check that the tokenizer is not corrupted!"
                )
                expected_rank = token_index
            merges_handle.write(" ".join(bpe_tokens) + "\n")
            expected_rank += 1

    # The model attribute is only read once its destination file has been opened.
    for model_path, attr_name in sp_model_targets:
        with open(model_path, "wb") as model_handle:
            model_handle.write(getattr(self, attr_name).serialized_model_proto())

    return vocab_file, merge_file
1003
+
1004
def prepare_for_tokenization(self, text, **kwargs):
    """Apply Unicode NFC normalization to `text` and hand `kwargs` back untouched as `(text, kwargs)`."""
    normalized = unicodedata.normalize("NFC", text)
    return normalized, kwargs
1007
+
1008
+
1009
+ __all__ = ["InternS1Tokenizer"]
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f9e4d4901a92b997e463c1f46055088b6cca5ca61a6522d1b9f64c4bb81cb42
3
+ size 12807982
tokenizer_PROT.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1144f52f86f3ca5a29940d69b037e508c05a89e6eedbe42bea641e226b20dbe0
3
+ size 12118
tokenizer_SMILES.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba1c97da0353ccbffd368ae78e311ccbc762aa5ba74f9aff8bf2ab363c4d37d
3
+ size 14775
tokenizer_XNA.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58fc8bfb2af3dfe936a13dad8a9cb28dab7850b70b358db19605d867c133fb35
3
+ size 15451
tokenizer_config.json ADDED
@@ -0,0 +1,521 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "248044": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "248045": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "248046": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "248047": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "248048": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "248049": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "248050": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "248051": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "248052": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "248053": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "248054": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "248055": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "248056": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "248057": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "248058": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "248059": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "248060": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "248061": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "248062": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "248063": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "248064": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "248065": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "248066": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "248067": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "248068": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "248069": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "248070": {
214
+ "content": "<|audio_start|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "248071": {
222
+ "content": "<|audio_end|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "248072": {
230
+ "content": "<tts_pad>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "248073": {
238
+ "content": "<tts_text_bos>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "248074": {
246
+ "content": "<tts_text_eod>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "248075": {
254
+ "content": "<tts_text_bos_single>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "248076": {
262
+ "content": "<|audio_pad|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "248077": {
270
+ "content": "<IMG_CONTEXT>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "248078": {
278
+ "content": "<img>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "248079": {
286
+ "content": "</img>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "248080": {
294
+ "content": "<quad>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "248081": {
302
+ "content": "</quad>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "248082": {
310
+ "content": "<ref>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "248083": {
318
+ "content": "</ref>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "248084": {
326
+ "content": "<box>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "248085": {
334
+ "content": "</box>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "248086": {
342
+ "content": "<|action_start|>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "248087": {
350
+ "content": "<|action_end|>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "248088": {
358
+ "content": "<|interpreter|>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "248089": {
366
+ "content": "<|plugin|>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "248090": {
374
+ "content": "<video>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "248091": {
382
+ "content": "<|ts|>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "248092": {
390
+ "content": "<|/ts|>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "248093": {
398
+ "content": "<TS_CONTEXT>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "248094": {
406
+ "content": "<SMILES>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": false
412
+ },
413
+ "248095": {
414
+ "content": "</SMILES>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": false
420
+ },
421
+ "248096": {
422
+ "content": "<protein>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": false
428
+ },
429
+ "248097": {
430
+ "content": "</protein>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": false
436
+ },
437
+ "248098": {
438
+ "content": "<dna>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": false
444
+ },
445
+ "248099": {
446
+ "content": "</dna>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": false
452
+ },
453
+ "248100": {
454
+ "content": "<rna>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": false
460
+ },
461
+ "248101": {
462
+ "content": "</rna>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": false
468
+ }
469
+ },
470
+ "additional_special_tokens": [
471
+ "<|im_start|>",
472
+ "<|im_end|>",
473
+ "<|object_ref_start|>",
474
+ "<|object_ref_end|>",
475
+ "<|box_start|>",
476
+ "<|box_end|>",
477
+ "<|quad_start|>",
478
+ "<|quad_end|>",
479
+ "<|vision_start|>",
480
+ "<|vision_end|>",
481
+ "<|vision_pad|>",
482
+ "<|image_pad|>",
483
+ "<|video_pad|>"
484
+ ],
485
+ "audio_bos_token": "<|audio_start|>",
486
+ "audio_eos_token": "<|audio_end|>",
487
+ "audio_token": "<|audio_pad|>",
488
+ "auto_map": {
489
+ "AutoTokenizer": [
490
+ "tokenization_interns1.InternS1Tokenizer",
491
+ null
492
+ ]
493
+ },
494
+ "bos_token": "<|im_start|>",
495
+ "clean_up_tokenization_spaces": false,
496
+ "eos_token": "<|im_end|>",
497
+ "errors": "replace",
498
+ "extra_special_tokens": {
499
+ "audio_bos_token": "<|audio_start|>",
500
+ "audio_eos_token": "<|audio_end|>",
501
+ "audio_token": "<|audio_pad|>",
502
+ "image_token": "<|image_pad|>",
503
+ "video_token": "<|video_pad|>",
504
+ "vision_bos_token": "<|vision_start|>",
505
+ "vision_eos_token": "<|vision_end|>"
506
+ },
507
+ "image_token": "<|image_pad|>",
508
+ "model_max_length": 262144,
509
+ "offset_PROT": 249126,
510
+ "offset_SMILES": 248102,
511
+ "offset_XNA": 250150,
512
+ "pad_token": "<|endoftext|>",
513
+ "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
514
+ "special_tokens_pattern": "none",
515
+ "split_special_tokens": false,
516
+ "tokenizer_class": "InternS1Tokenizer",
517
+ "unk_token": null,
518
+ "video_token": "<|video_pad|>",
519
+ "vision_bos_token": "<|vision_start|>",
520
+ "vision_eos_token": "<|vision_end|>"
521
+ }
video_preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "size": {
3
+ "longest_edge": 25165824,
4
+ "shortest_edge": 4096
5
+ },
6
+ "patch_size": 16,
7
+ "temporal_patch_size": 2,
8
+ "merge_size": 2,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "Qwen3VLProcessor",
20
+ "video_processor_type": "Qwen3VLVideoProcessor"
21
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff