0092638_seism.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2b94653c6964b630038897a27cb6d276ff866d9ecd1f6419358b9407f0df62e
3
- size 72128
 
 
 
 
README.md CHANGED
@@ -12,7 +12,7 @@ library_name: transformers
12
 
13
  <div>&nbsp;</div>
14
 
15
- [💻Github Repo](https://github.com/InternLM/Intern-S1) • [🤗Model Collections](https://huggingface.co/collections/internlm/intern-s1-6882e325e8ac1c58ba108aa5) • [📜Technical Report](https://huggingface.co/papers/2603.25040) • [💬Online Chat](https://chat.intern-ai.org.cn/)
16
 
17
  </div>
18
 
@@ -60,9 +60,6 @@ temperature = 0.8
60
 
61
  ### Serving
62
 
63
- > [!IMPORTANT]
64
- > Running a trillion-parameter model using the native Hugging Face forward method is challenging. We strongly recommend using an LLM inference engine (such as LMDeploy, vLLM, or SGLang) to host Intern-S1-Pro and accessing the model via API.
65
-
66
  Intern-S1-Pro can be deployed using any of the following LLM inference frameworks:
67
 
68
  - LMDeploy
@@ -71,6 +68,8 @@ Intern-S1-Pro can be deployed using any of the following LLM inference framework
71
 
72
  Detailed deployment examples for these frameworks are available in the [Model Deployment Guide](./deployment_guide.md).
73
 
 
 
74
 
75
  ## Advanced Usage
76
 
@@ -247,7 +246,7 @@ text = tokenizer.apply_chat_template(
247
  )
248
  ```
249
 
250
- When serving Intern-S1-Pro models, you can dynamically control the thinking mode by adjusting the `enable_thinking` parameter in your requests.
251
 
252
  ```python
253
  from openai import OpenAI
@@ -286,122 +285,6 @@ response = client.chat.completions.create(
286
  print(json.dumps(response.model_dump(), indent=2, ensure_ascii=False))
287
  ```
288
 
289
- ### Time Series Demo
290
-
291
- Time series inference is currently only supported in LMDeploy. To get started, download and deploy Intern-S1-Pro with LMDeploy (>=v0.12.1) by following the [Model Deployment Guide](./deployment_guide.md).
292
- Below is an example of detecting earthquake events from a time series signal file. Additional data types and functionalities are also supported.
293
-
294
- ```
295
- from openai import OpenAI
296
- from lmdeploy.vl.time_series_utils import encode_time_series_base64
297
-
298
- openai_api_key = "EMPTY"
299
- openai_api_base = "http://0.0.0.0:8000/v1"
300
- client = OpenAI(
301
- api_key=openai_api_key,
302
- base_url=openai_api_base,
303
- )
304
- model_name = client.models.list().data[0].id
305
-
306
-
307
- def send_base64(file_path: str, sampling_rate: int = 100):
308
- """base64-encoded time-series data."""
309
-
310
- # encode_time_series_base64 accepts local file paths and http urls,
311
- # encoding time-series data (.npy, .csv, .wav, .mp3, .flac, etc.) into base64 strings.
312
- base64_ts = encode_time_series_base64(file_path)
313
-
314
- messages = [
315
- {
316
- "role": "user",
317
- "content": [
318
- {
319
- "type": "text",
320
- "text": "Please determine whether an Earthquake event has occurred in the provided time-series data. If so, please specify the starting time point indices of the P-wave and S-wave in the event."
321
- },
322
- {
323
- "type": "time_series_url",
324
- "time_series_url": {
325
- "url": f"data:time_series/npy;base64,{base64_ts}",
326
- "sampling_rate": sampling_rate
327
- },
328
- },
329
- ],
330
- }
331
- ]
332
-
333
- return client.chat.completions.create(
334
- model=model_name,
335
- messages=messages,
336
- temperature=0,
337
- max_tokens=200,
338
- )
339
-
340
-
341
- def send_http_url(url: str, sampling_rate: int = 100):
342
- """http(s) url pointing to the time-series data."""
343
- messages = [
344
- {
345
- "role": "user",
346
- "content": [
347
- {
348
- "type": "text",
349
- "text": "Please determine whether an Earthquake event has occurred in the provided time-series data. If so, please specify the starting time point indices of the P-wave and S-wave in the event."
350
- },
351
- {
352
- "type": "time_series_url",
353
- "time_series_url": {
354
- "url": url,
355
- "sampling_rate": sampling_rate
356
- },
357
- },
358
- ],
359
- }
360
- ]
361
-
362
- return client.chat.completions.create(
363
- model=model_name,
364
- messages=messages,
365
- temperature=0,
366
- max_tokens=200,
367
- )
368
-
369
-
370
- def send_file_url(file_path: str, sampling_rate: int = 100):
371
- """file url pointing to the time-series data."""
372
- messages = [
373
- {
374
- "role": "user",
375
- "content": [
376
- {
377
- "type": "text",
378
- "text": "Please determine whether an Earthquake event has occurred in the provided time-series data. If so, please specify the starting time point indices of the P-wave and S-wave in the event."
379
- },
380
- {
381
- "type": "time_series_url",
382
- "time_series_url": {
383
- "url": f"file://{file_path}",
384
- "sampling_rate": sampling_rate
385
- },
386
- },
387
- ],
388
- }
389
- ]
390
-
391
- return client.chat.completions.create(
392
- model=model_name,
393
- messages=messages,
394
- temperature=0,
395
- max_tokens=200,
396
- )
397
-
398
- response = send_base64("./0092638_seism.npy")
399
- # response = send_http_url("https://huggingface.co/internlm/Intern-S1-Pro/raw/main/0092638_seism.npy")
400
- # response = send_file_url("./0092638_seism.npy")
401
-
402
- print(response.choices[0].message)
403
- ```
404
-
405
  ## Citation
406
 
407
  If you find this work useful, please consider citing it.
@@ -417,15 +300,3 @@ If you find this work useful, feel free to give us a cite.
417
  url={https://arxiv.org/abs/2508.15763},
418
  }
419
  ```
420
-
421
- ```
422
- @misc{zou2026interns1proscientificmultimodalfoundation,
423
- title={Intern-S1-Pro: Scientific Multimodal Foundation Model at Trillion Scale},
424
- author={Yicheng Zou and Dongsheng Zhu and Lin Zhu and Tong Zhu and Yunhua Zhou and Peiheng Zhou and Xinyu Zhou and Dongzhan Zhou and Zhiwang Zhou and Yuhao Zhou and Bowen Zhou and Zhanping Zhong and Zhijie Zhong and Haiteng Zhao and Penghao Zhao and Xiaomeng Zhao and Zhiyuan Zhao and Yechen Zhang and Jin Zhang and Wenwei Zhang and Hongjie Zhang and Zhuo Zhang and Wenlong Zhang and Bo Zhang and Chao Zhang and Chen Zhang and Yuhang Zang and Fei Yuan and Jiakang Yuan and Jiashuo Yu and Jinhui Yin and Haochen Ye and Qian Yao and Bowen Yang and Danni Yang and Kaichen Yang and Ziang Yan and Jun Xu and Yicheng Xu and Wanghan Xu and Xuenan Xu and Chao Xu and Ruiliang Xu and Shuhao Xing and Long Xing and Xinchen Xie and Ling-I Wu and Zijian Wu and Zhenyu Wu and Lijun Wu and Yue Wu and Jianyu Wu and Wen Wu and Fan Wu and Xilin Wei and Qi Wei and Bingli Wang and Rui Wang and Ziyi Wang and Zun Wang and Yi Wang and Haomin Wang and Yizhou Wang and Lintao Wang and Yiheng Wang and Longjiang Wang and Bin Wang and Jian Tong and Zhongbo Tian and Huanze Tang and Chen Tang and Shixiang Tang and Yu Sun and Qiushi Sun and Xuerui Su and Qisheng Su and Chenlin Su and Demin Song and Jin Shi and Fukai Shang and Yuchen Ren and Pengli Ren and Xiaoye Qu and Yuan Qu and Jiantao Qiu and Yu Qiao and Runyu Peng and Tianshuo Peng and Jiahui Peng and Qizhi Pei and Zhuoshi Pan and Linke Ouyang and Wenchang Ning and Yichuan Ma and Zerun Ma and Ningsheng Ma and Runyuan Ma and Chengqi Lyu and Haijun Lv and Han Lv and Lindong Lu and Kuikun Liu and Jiangning Liu and Yuhong Liu and Kai Liu and Hongwei Liu and Zhoumianze Liu and Mengjie Liu and Ziyu Liu and Wenran Liu and Yang Liu and Liwei Liu and Kaiwen Liu and Junyao Lin and Junming Lin and Tianyang Lin and Dahua Lin and Jianze Liang and Linyang Li and Peiji Li and Zonglin Li and Zehao Li and Pengze Li and Guoyan Li and Lingkai Kong and Linglin Jing and Zhenjiang Jin and Feifei Jiang and Qian Jiang and Junhao Huang and Zixian Huang and Haian Huang and 
Zhouqi Hua and Han Hu and Linfeng Hou and Yinan He and Conghui He and Tianyao He and Xu Guo and Qipeng Guo and Aijia Guo and Yuzhe Gu and Lixin Gu and Jingyang Gong and Qiming Ge and Jiaye Ge and Songyang Gao and Jianfei Gao and Xinyu Fang and Caihua fan and Yue Fan and Yanhui Duan and Zichen Ding and Shengyuan Ding and Xuanlang Dai and Erfei Cui and Ganqu Cui and Pei Chu and Tao Chu and Guangran Cheng and Yu Cheng and Kai Chen and Yongkang Chen and Chiyu Chen and Guanzhou Chen and Qiaosheng Chen and Sitao Chen and Xin Chen and Haojiong Chen and Yicheng Chen and Weihan Cao and Yuhang Cao and Qinglong Cao and Lei Bai},
425
- year={2026},
426
- eprint={2603.25040},
427
- archivePrefix={arXiv},
428
- primaryClass={cs.LG},
429
- url={https://arxiv.org/abs/2603.25040},
430
- }
431
- ```
 
12
 
13
  <div>&nbsp;</div>
14
 
15
+ [💻Github Repo](https://github.com/InternLM/Intern-S1) • [🤗Model Collections](https://huggingface.co/collections/internlm/intern-s1-6882e325e8ac1c58ba108aa5) • [📜Technical Report](https://arxiv.org/abs/2508.15763) • [💬Online Chat](https://chat.intern-ai.org.cn/)
16
 
17
  </div>
18
 
 
60
 
61
  ### Serving
62
 
 
 
 
63
  Intern-S1-Pro can be deployed using any of the following LLM inference frameworks:
64
 
65
  - LMDeploy
 
68
 
69
  Detailed deployment examples for these frameworks are available in the [Model Deployment Guide](./deployment_guide.md).
70
 
71
+ > Deployment support for the time-series module is still being optimized and will be released soon.
72
+
73
 
74
  ## Advanced Usage
75
 
 
246
  )
247
  ```
248
 
249
+ When serving Intern-S1-Pro models, you can dynamically control the thinking mode by adjusting the `enable_thinking` parameter in your requests.
250
 
251
  ```python
252
  from openai import OpenAI
 
285
  print(json.dumps(response.model_dump(), indent=2, ensure_ascii=False))
286
  ```
287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  ## Citation
289
 
290
  If you find this work useful, please consider citing it.
 
300
  url={https://arxiv.org/abs/2508.15763},
301
  }
302
  ```
 
 
 
 
 
 
 
 
 
 
 
 
chat_template.jinja CHANGED
@@ -17,8 +17,6 @@
17
  {{- 'Video ' + video_count.value|string + ': <|vision_start|><|video_pad|><|vision_end|>'-}}
18
  {%- elif 'text' in item %}
19
  {{- item.text }}
20
- {%- elif 'time_series' in item or item.type == 'time_series' %}
21
- {{- '<|ts|><TS_CONTEXT><|/ts|>'-}}
22
  {%- endif %}
23
  {%- endfor %}
24
  {%- endif %}
 
17
  {{- 'Video ' + video_count.value|string + ': <|vision_start|><|video_pad|><|vision_end|>'-}}
18
  {%- elif 'text' in item %}
19
  {{- item.text }}
 
 
20
  {%- endif %}
21
  {%- endfor %}
22
  {%- endif %}
config.json CHANGED
@@ -58,37 +58,6 @@
58
  },
59
  "vision_end_token_id": 151653,
60
  "vision_start_token_id": 151652,
61
- "ts_config": {
62
- "auto_map": {
63
- "AutoConfig": "configuration_interns1_pro.InternS1ProTimeSeriesConfig",
64
- "AutoModel": "modeling_interns1_pro.InternS1ProTimeSeriesModel"
65
- },
66
- "activation_dropout": 0.0,
67
- "activation_function": "gelu",
68
- "architectures": [
69
- "InternS1TimeSeriesModel"
70
- ],
71
- "attention_dropout": 0.0,
72
- "d_model": 768,
73
- "dropout": 0.0,
74
- "dtype": "bfloat16",
75
- "encoder_attention_heads": 8,
76
- "encoder_ffn_dim": 3072,
77
- "encoder_layerdrop": 0.0,
78
- "encoder_layers": 17,
79
- "model_type": "interns1_pro_time_series",
80
- "max_source_positions": 1500,
81
- "num_mel_bins": 80,
82
- "out_hidden_size": 4096,
83
- "scale_embedding": false,
84
- "ts_adapt_in_dim": 256,
85
- "ts_adapt_out_dim": 1024,
86
- "use_cache": true,
87
- "attn_implementation": "eager"
88
- },
89
- "ts_end_id": 151684,
90
- "ts_start_id": 151683,
91
- "ts_token_id": 151685,
92
  "auto_map": {
93
  "AutoConfig": "configuration_interns1_pro.InternS1ProConfig",
94
  "AutoModel": "modeling_interns1_pro.InternS1ProModel",
@@ -172,7 +141,6 @@
172
  "model.visual.blocks.17.mlp.linear_fc1",
173
  "model.visual.blocks.4.norm2",
174
  "model.visual.blocks.17.attn.qkv",
175
- "model.time_series",
176
  "model.language_model.layers.83.self_attn.k_norm",
177
  "model.language_model.layers.47.post_attention_layernorm",
178
  "model.language_model.layers.59.input_layernorm",
 
58
  },
59
  "vision_end_token_id": 151653,
60
  "vision_start_token_id": 151652,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  "auto_map": {
62
  "AutoConfig": "configuration_interns1_pro.InternS1ProConfig",
63
  "AutoModel": "modeling_interns1_pro.InternS1ProModel",
 
141
  "model.visual.blocks.17.mlp.linear_fc1",
142
  "model.visual.blocks.4.norm2",
143
  "model.visual.blocks.17.attn.qkv",
 
144
  "model.language_model.layers.83.self_attn.k_norm",
145
  "model.language_model.layers.47.post_attention_layernorm",
146
  "model.language_model.layers.59.input_layernorm",
configuration_interns1_pro.py CHANGED
@@ -15,7 +15,6 @@
15
 
16
  from transformers.configuration_utils import PretrainedConfig
17
  from transformers.modeling_rope_utils import rope_config_validation
18
- from transformers import WhisperConfig
19
 
20
 
21
  class InternS1ProTextConfig(PretrainedConfig):
@@ -139,61 +138,20 @@ class InternS1ProVisionConfig(PretrainedConfig):
139
  self.num_position_embeddings = num_position_embeddings
140
  self.initializer_range = initializer_range
141
 
142
- class InternS1ProTimeSeriesConfig(WhisperConfig):
143
-
144
- model_type = "interns1_pro_time_series"
145
- base_config_key = "ts_config"
146
-
147
- def __init__(
148
- self,
149
- ts_adapt_in_dim: int=256,
150
- ts_adapt_out_dim: int=1024,
151
- ts_hidden_dim: int=1024,
152
- ts_cnn_channels: list[int]=[1, 32, 64, 128, 128],
153
- ts_cnn_kernel_sizes: list[int]=[3, 5, 5, 5],
154
- ts_cnn_strides: list[int]=[2, 4, 4, 5],
155
- ts_cnn_paddings: list[int]=[1, 2, 2, 2],
156
- ts_concat_subsampling_in_channels: int=128,
157
- ts_concat_subsampling_concat_size: int=2,
158
- use_flash_attn: bool=False,
159
- **kwargs
160
- ):
161
- super().__init__(**kwargs)
162
-
163
- self.ts_cnn_channels = ts_cnn_channels
164
- self.ts_cnn_kernel_sizes = ts_cnn_kernel_sizes
165
- self.ts_cnn_strides = ts_cnn_strides
166
- self.ts_cnn_paddings = ts_cnn_paddings
167
- self.ts_concat_subsampling_in_channels = ts_concat_subsampling_in_channels
168
- self.ts_concat_subsampling_concat_size = ts_concat_subsampling_concat_size
169
-
170
- self.ts_adapt_in_dim = ts_adapt_in_dim
171
- self.ts_adapt_out_dim = ts_adapt_out_dim
172
-
173
- self.ts_hidden_dim = ts_hidden_dim
174
- self.use_flash_attn = use_flash_attn
175
-
176
- assert self.ts_adapt_out_dim == self.ts_hidden_dim, "ts_adapt_out_dim should be equal to ts_hidden_dim"
177
- assert self.ts_concat_subsampling_in_channels == self.ts_cnn_channels[-1], "ts_concat_subsampling_in_channels should be equal to the out_channel of the last cnn layer"
178
-
179
 
180
  class InternS1ProConfig(PretrainedConfig):
181
  model_type = "interns1_pro"
182
- sub_configs = {"vision_config": InternS1ProVisionConfig, "text_config": InternS1ProTextConfig, 'ts_config':InternS1ProTimeSeriesConfig}
183
  keys_to_ignore_at_inference = ["past_key_values"]
184
 
185
  def __init__(
186
  self,
187
  text_config=None,
188
  vision_config=None,
189
- ts_config=None,
190
  image_token_id=151655,
191
  video_token_id=151656,
192
  vision_start_token_id=151652,
193
  vision_end_token_id=151653,
194
- ts_token_id=151685,
195
- ts_start_id=151683,
196
- ts_end_id=151684,
197
  tie_word_embeddings=False,
198
  **kwargs,
199
  ):
@@ -207,19 +165,11 @@ class InternS1ProConfig(PretrainedConfig):
207
  elif text_config is None:
208
  self.text_config = self.sub_configs["text_config"]()
209
 
210
- if isinstance(ts_config, dict):
211
- self.ts_config = self.sub_configs["ts_config"](**ts_config)
212
- elif ts_config is None:
213
- self.ts_config = self.sub_configs["ts_config"]()
214
-
215
  self.image_token_id = image_token_id
216
  self.video_token_id = video_token_id
217
  self.vision_start_token_id = vision_start_token_id
218
  self.vision_end_token_id = vision_end_token_id
219
- self.ts_token_id = ts_token_id
220
- self.ts_start_id = ts_start_id
221
- self.ts_end_id = ts_end_id
222
  super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
223
 
224
 
225
- __all__ = ["InternS1ProConfig", "InternS1ProTextConfig", "InternS1ProVisionConfig", "InternS1ProTimeSeriesConfig"]
 
15
 
16
  from transformers.configuration_utils import PretrainedConfig
17
  from transformers.modeling_rope_utils import rope_config_validation
 
18
 
19
 
20
  class InternS1ProTextConfig(PretrainedConfig):
 
138
  self.num_position_embeddings = num_position_embeddings
139
  self.initializer_range = initializer_range
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  class InternS1ProConfig(PretrainedConfig):
143
  model_type = "interns1_pro"
144
+ sub_configs = {"vision_config": InternS1ProVisionConfig, "text_config": InternS1ProTextConfig}
145
  keys_to_ignore_at_inference = ["past_key_values"]
146
 
147
  def __init__(
148
  self,
149
  text_config=None,
150
  vision_config=None,
 
151
  image_token_id=151655,
152
  video_token_id=151656,
153
  vision_start_token_id=151652,
154
  vision_end_token_id=151653,
 
 
 
155
  tie_word_embeddings=False,
156
  **kwargs,
157
  ):
 
165
  elif text_config is None:
166
  self.text_config = self.sub_configs["text_config"]()
167
 
 
 
 
 
 
168
  self.image_token_id = image_token_id
169
  self.video_token_id = video_token_id
170
  self.vision_start_token_id = vision_start_token_id
171
  self.vision_end_token_id = vision_end_token_id
 
 
 
172
  super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
173
 
174
 
175
+ __all__ = ["InternS1ProConfig", "InternS1ProTextConfig", "InternS1ProVisionConfig"]
deployment_guide.md CHANGED
@@ -1,6 +1,6 @@
1
  # Intern-S1-Pro Deployment Guide
2
 
3
- The Intern-S1-Pro release is a 1T parameter model stored in FP8 format. Deployment requires at least **two 8-GPU H200** nodes, with either of the following configurations:
4
 
5
  - Tensor Parallelism (TP)
6
  - Data Parallelism (DP) + Expert Parallelism (EP)
@@ -9,7 +9,7 @@ The Intern-S1-Pro release is a 1T parameter model stored in FP8 format. Deployme
9
 
10
  ## LMDeploy
11
 
12
- Required version `lmdeploy>=0.12.1`
13
 
14
  - Tensor Parallelism
15
 
@@ -59,7 +59,25 @@ lmdeploy serve api_server \
59
 
60
  ## vLLM
61
 
62
- You can use the vLLM nightly-built docker image `vllm/vllm-openai:nightly` to deploy. Refer to [using-docker](https://docs.vllm.ai/en/latest/deployment/docker/?h=docker) for more.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  ```bash
65
  # node 0
@@ -108,8 +126,6 @@ vllm serve internlm/Intern-S1-Pro \
108
 
109
  ## SGLang
110
 
111
- You can use the docker image `lmsysorg/sglang:dev` to deploy. Refer to [using-docker](https://docs.sglang.io/get_started/install.html#method-3-using-docker) for more.
112
-
113
  - Tensor Parallelism + Expert Parallelism
114
 
115
  ```bash
 
1
  # Intern-S1-Pro Deployment Guide
2
 
3
+ The Intern-S1-Pro release is a 1T parameter model stored in FP8 format. Deployment requires at least two 8-GPU H200 nodes, with either of the following configurations:
4
 
5
  - Tensor Parallelism (TP)
6
  - Data Parallelism (DP) + Expert Parallelism (EP)
 
9
 
10
  ## LMDeploy
11
 
12
+ Required version `lmdeploy>=0.12.0`
13
 
14
  - Tensor Parallelism
15
 
 
59
 
60
  ## vLLM
61
 
62
+ - Tensor Parallelism + Expert Parallelism
63
+
64
+ ```bash
65
+ # start ray on node 0 and node 1
66
+
67
+ # node 0
68
+ export VLLM_ENGINE_READY_TIMEOUT_S=10000
69
+ vllm serve internlm/Intern-S1-Pro \
70
+ --tensor-parallel-size 16 \
71
+ --enable-expert-parallel \
72
+ --distributed-executor-backend ray \
73
+ --max-model-len 65536 \
74
+ --trust-remote-code \
75
+ --reasoning-parser deepseek_r1 \
76
+ --enable-auto-tool-choice \
77
+ --tool-call-parser hermes
78
+ ```
79
+
80
+ - Data Parallelism + Expert Parallelism
81
 
82
  ```bash
83
  # node 0
 
126
 
127
  ## SGLang
128
 
 
 
129
  - Tensor Parallelism + Expert Parallelism
130
 
131
  ```bash
model-time_series-00001-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6fab87c45c01a8695f97b5801bee2771ac6e874561ac773983397d958f1e7a00
3
- size 291982664
 
 
 
 
model-time_series-00002-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4150fadfb90bd9561c422b37ecc83fd5a30966f1e555bc9305b9fd5d2c914b0d
3
- size 10240128
 
 
 
 
model.safetensors.index.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6aa1acb6e462542ccb55d50c9ba2097df081b6fd69b8ac5aaed1f0b30b14678e
3
- size 32236540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7de640c8e6f374c36de64b925b2c107896731ef642283e490e69125ec5c4eac1
3
+ size 32204741
modeling_interns1_pro.py CHANGED
@@ -34,10 +34,8 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
34
  from transformers.processing_utils import Unpack
35
  from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling
36
  from transformers.utils.generic import OutputRecorder, check_model_inputs
37
- from .configuration_interns1_pro import InternS1ProConfig, InternS1ProTextConfig, InternS1ProVisionConfig, InternS1ProTimeSeriesConfig
38
- from transformers.models.whisper.modeling_whisper import WhisperEncoderLayer
39
- from transformers import WhisperPreTrainedModel
40
- import math
41
 
42
  @use_kernel_forward_from_hub("RMSNorm")
43
  class Qwen3VLMoeTextRMSNorm(nn.Module):
@@ -441,7 +439,7 @@ class InternS1ProPreTrainedModel(PreTrainedModel):
441
  config: InternS1ProConfig
442
  base_model_prefix = "model"
443
  supports_gradient_checkpointing = True
444
- _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock",'WhisperEncoderLayer']
445
  _skip_keys_device_placement = ["past_key_values"]
446
  _supports_flash_attn = True
447
  _supports_sdpa = True
@@ -988,7 +986,7 @@ class InternS1ProTextModel(InternS1ProPreTrainedModel):
988
  # Initialize weights and apply final processing
989
  self.post_init()
990
 
991
- @check_model_inputs
992
  @auto_docstring
993
  def forward(
994
  self,
@@ -1059,442 +1057,6 @@ class InternS1ProTextModel(InternS1ProPreTrainedModel):
1059
  )
1060
 
1061
 
1062
- class InternS1ProTimeSeriesEncoder(WhisperPreTrainedModel):
1063
- def __init__(self, config: InternS1ProTimeSeriesConfig):
1064
- super().__init__(config)
1065
- self.config = config
1066
- self.dropout = config.dropout
1067
- self.layerdrop = config.encoder_layerdrop
1068
-
1069
- self.embed_dim = config.d_model
1070
- self.num_mel_bins = config.num_mel_bins
1071
- self.padding_idx = config.pad_token_id
1072
- self.max_source_positions = config.max_source_positions
1073
- self.embed_scale = math.sqrt(self.embed_dim) if config.scale_embedding else 1.0
1074
-
1075
- self.conv1 = nn.Conv1d(self.num_mel_bins, self.embed_dim, kernel_size=3, padding=1)
1076
- self.conv2 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=3, stride=2, padding=1)
1077
- self.embed_positions = nn.Embedding(self.max_source_positions, self.embed_dim)
1078
-
1079
- self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
1080
- self.layer_norm = nn.LayerNorm(config.d_model)
1081
-
1082
- self.gradient_checkpointing = False
1083
- self.post_init()
1084
-
1085
- self.mask_type = None
1086
- self.chunk_length = None
1087
-
1088
- self.adapt_in = nn.Linear(config.ts_adapt_in_dim, 80)
1089
- self.adapt_out = nn.Linear(self.embed_dim, config.ts_adapt_out_dim)
1090
-
1091
- def _freeze_parameters(self):
1092
- for param in self.parameters():
1093
- param.requires_grad = False
1094
- self._requires_grad = False
1095
-
1096
- def get_input_embeddings(self) -> nn.Module:
1097
- return self.conv1
1098
-
1099
- def set_input_embeddings(self, value: nn.Module):
1100
- self.conv1 = value
1101
-
1102
- def define_masktype(self, masktype, chunk_length=None):
1103
- self.mask_type = masktype
1104
- self.chunk_length = chunk_length
1105
-
1106
- def _make_causal_mask(self,
1107
- input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
1108
- ):
1109
- """
1110
- Make causal mask used for bi-directional self-attention.
1111
- """
1112
- bsz, tgt_len = input_ids_shape
1113
- mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
1114
- mask_cond = torch.arange(mask.size(-1), device=device)
1115
- mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
1116
- mask = mask.to(dtype)
1117
-
1118
- if past_key_values_length > 0:
1119
- mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
1120
- return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
1121
-
1122
- # Copied from transformers.models.bart.modeling_bart._expand_mask
1123
- def _expand_mask(self, mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
1124
- """
1125
- Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
1126
- """
1127
- # print(mask.size())
1128
- bsz, src_len = mask.size()
1129
- tgt_len = tgt_len if tgt_len is not None else src_len
1130
-
1131
- expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
1132
-
1133
- inverted_mask = 1.0 - expanded_mask
1134
-
1135
- return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
1136
-
1137
-
1138
- def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
1139
- # create causal mask
1140
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1141
- combined_attention_mask = None
1142
-
1143
- if input_shape[-1] > 1:
1144
- combined_attention_mask = self._make_causal_mask(
1145
- input_shape,
1146
- inputs_embeds.dtype,
1147
- device=inputs_embeds.device,
1148
- past_key_values_length=past_key_values_length,
1149
- )
1150
-
1151
- if attention_mask is not None:
1152
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1153
- expanded_attn_mask = self._expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
1154
- combined_attention_mask = (
1155
- expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
1156
- )
1157
- return combined_attention_mask
1158
-
1159
- def prepare_chunk_attention_mask(self, attention_mask, input_shape, inputs_embeds):
1160
-
1161
- block_size = round(self.chunk_length / 4 * 2)
1162
- matrix_size = input_shape[1]
1163
-
1164
- matrix = torch.ones(matrix_size, matrix_size)
1165
-
1166
- num_full_blocks = round(matrix_size // block_size)
1167
- remainder = matrix_size % block_size
1168
- for i in range(num_full_blocks):
1169
- row_start = i * block_size
1170
- col_start = i * block_size
1171
- matrix[row_start:row_start + block_size, col_start:col_start + block_size] = torch.zeros(block_size, block_size)
1172
-
1173
- if remainder > 0:
1174
- last_row_start = num_full_blocks * block_size
1175
- last_col_start = num_full_blocks * block_size
1176
- matrix[last_row_start:last_row_start + remainder, last_col_start:last_col_start + remainder] = torch.zeros(remainder, remainder)
1177
-
1178
- matrix = matrix * -65504
1179
- matrix = matrix.unsqueeze(0).unsqueeze(0).repeat(input_shape[0], 1, 1, 1)
1180
- attention_mask = matrix.to(inputs_embeds.device)
1181
- return attention_mask
1182
-
1183
- def forward(
1184
- self,
1185
- input_features,
1186
- attention_mask=None,
1187
- head_mask=None,
1188
- output_attentions=None,
1189
- output_hidden_states=None,
1190
- return_dict=None,
1191
- ):
1192
- # (N, T, C) -> (T, N, C) -> (N, C, T)
1193
- input_features = input_features.permute(1, 0, 2)
1194
- input_features = self.adapt_in(input_features)
1195
- input_features = input_features.permute(1, 2, 0)
1196
-
1197
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1198
- output_hidden_states = (
1199
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1200
- )
1201
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1202
-
1203
- # (N, C, T) -> (N, C, T//2)
1204
- inputs_embeds = nn.functional.gelu(self.conv1(input_features))
1205
- inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
1206
-
1207
- # (N, C, T) -> (N, T, C)
1208
- inputs_embeds = inputs_embeds.permute(0, 2, 1) # torch.Size([1, 100, 768])
1209
- embed_pos = self.embed_positions.weight # torch.Size([1500, 768])
1210
-
1211
- if inputs_embeds.shape[1] > embed_pos.shape[0]:
1212
- target_len = inputs_embeds.shape[1]
1213
- padding = [0, 0, 0, target_len-embed_pos.shape[0]]
1214
-
1215
- embed_pos = nn.functional.pad(embed_pos, pad=padding, mode='constant', value=0)
1216
- hidden_states = inputs_embeds[:, :embed_pos.shape[0], :] + embed_pos
1217
- else:
1218
- hidden_states = inputs_embeds + embed_pos[:inputs_embeds.shape[1], :]
1219
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
1220
-
1221
- encoder_states = () if output_hidden_states else None
1222
- all_attentions = () if output_attentions else None
1223
-
1224
- input_shape = inputs_embeds.size()[:-1]
1225
- past_key_values_length = 0
1226
- attention_mask = None
1227
- if self.mask_type == 'chunk':
1228
- attention_mask = self.prepare_chunk_attention_mask(attention_mask, input_shape, inputs_embeds)
1229
- else:
1230
- attention_mask = self._prepare_decoder_attention_mask(
1231
- attention_mask, input_shape, inputs_embeds, past_key_values_length
1232
- )
1233
-
1234
- if head_mask is not None:
1235
- assert head_mask.size()[0] == (
1236
- len(self.layers)
1237
- ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
1238
-
1239
- for idx, encoder_layer in enumerate(self.layers):
1240
- if output_hidden_states:
1241
- encoder_states = encoder_states + (self.layer_norm(hidden_states),)
1242
- # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
1243
- to_drop = False
1244
- if self.training:
1245
- dropout_probability = torch.rand([])
1246
- if dropout_probability < self.layerdrop: # skip the layer
1247
- to_drop = True
1248
-
1249
- if to_drop:
1250
- layer_outputs = (None, None)
1251
- else:
1252
- if self.gradient_checkpointing and self.training:
1253
-
1254
- def create_custom_forward(module):
1255
- def custom_forward(*inputs):
1256
- return module(*inputs, output_attentions)
1257
-
1258
- return custom_forward
1259
-
1260
- layer_outputs = torch.utils.checkpoint.checkpoint(
1261
- create_custom_forward(encoder_layer),
1262
- hidden_states,
1263
- attention_mask,
1264
- (head_mask[idx] if head_mask is not None else None),
1265
- )
1266
- else:
1267
- layer_outputs = encoder_layer(
1268
- hidden_states,
1269
- attention_mask,
1270
- layer_head_mask=(head_mask[idx] if head_mask is not None else None),
1271
- output_attentions=output_attentions,
1272
- )
1273
-
1274
- hidden_states = layer_outputs[0]
1275
-
1276
- if output_attentions:
1277
- all_attentions = all_attentions + (layer_outputs[1],)
1278
-
1279
- # (N, T, C) -> (T, N, C)
1280
- hidden_states = hidden_states.permute(1, 0, 2)
1281
- hidden_states = self.layer_norm(hidden_states)
1282
- hidden_states = self.adapt_out(hidden_states)
1283
-
1284
- # (T, N, C) -> (N, T, C)
1285
- hidden_states = hidden_states.permute(1, 0, 2)
1286
- if output_hidden_states:
1287
- encoder_states = encoder_states + (hidden_states,)
1288
-
1289
- if not return_dict:
1290
- return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
1291
- return ModelOutput(
1292
- last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
1293
- )
1294
-
1295
-
1296
class InternS1ProTimeSeriesConcatSubsampling(nn.Module):
    """Shorten a sequence by stacking groups of adjacent frames along the feature axis.

    Every `concat_size` consecutive frames of a `(batch, time, channels)` tensor are
    merged into one frame of `channels * concat_size` features. Trailing frames that do
    not fill a complete group are dropped.

    Args:
        in_channels: Feature size of each incoming frame.
        concat_size: Number of consecutive frames merged into one output frame.
    """

    def __init__(self, in_channels: int, concat_size: int):
        super().__init__()
        if concat_size < 1:
            raise ValueError(f"concat_size must be >= 1, got {concat_size}")
        self.in_channels = in_channels
        self.concat_size = concat_size
        self.out_channels = in_channels * concat_size

    def forward(self, ts_signals: torch.Tensor, ts_lens: torch.Tensor):
        """Merge frames and rescale the valid lengths.

        Args:
            ts_signals: Tensor indexed as `(batch, time, channels)`.
            ts_lens: Valid length of every sequence, measured in input frames.

        Returns:
            A tuple `(merged_signals, merged_lens)` where the time axis and the lengths
            are both divided by `concat_size` (floor division).
        """
        # Drop the incomplete trailing group so every strided slice has equal length.
        remainder = ts_signals.shape[1] % self.concat_size
        if remainder != 0:
            ts_signals = ts_signals[:, :-remainder, :]
        # Slice k holds the k-th frame of every group; for concat_size == 2 these are
        # exactly the even and odd frames.
        groups = [ts_signals[:, offset::self.concat_size, :] for offset in range(self.concat_size)]
        ts_signals = torch.cat(groups, dim=2)
        ts_lens = ts_lens // self.concat_size
        return ts_signals, ts_lens
1310
-
1311
-
1312
class InternS1ProTimeSeriesFixPositionalEncoding(nn.Module):
    """Fixed (non-learned) sinusoidal positional encoding.

    A table of shape `(max_len, 1, d_model)` is precomputed and stored in the persistent
    buffer `pe`: even feature indices hold sines, odd feature indices hold cosines.

    Args:
        d_model: Feature size of the encoded sequence (odd values are supported).
        max_len: Number of positions in the precomputed table.
    """

    def __init__(self, d_model, max_len=20000):
        super().__init__()
        pe = torch.zeros(max_len, d_model, dtype=torch.float)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns, so the
        # cosine table is trimmed to fit instead of raising a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(1)  # (max_len, 1, d_model)
        self.register_buffer('pe', pe, persistent=True)

    def forward(self, x):
        """Add the positional table to `x`, indexed as `(seq_len, batch_size, d_model)`.

        The addition already allocates a new tensor, so the input is never modified.
        """
        return x + self.pe[:x.size(0), :]
1327
-
1328
-
1329
class InternS1ProTimeSeriesMultiChannelAdaptiveSubsampling(nn.Module):
    """Turn raw time-series signals into patch embeddings.

    Each signal is cut into overlapping patches whose size depends on its sampling
    rate, every patch is encoded by a 1-D convolution followed by a transformer
    encoder, and the resulting patch sequence is shortened by a factor of two.

    Args:
        hidden_dim: Width of the convolution output and of the transformer encoder.
        nhead: Number of attention heads of the transformer encoder layer.
        num_encoder_layers: Number of stacked transformer encoder layers.
    """

    def __init__(self, hidden_dim=128, nhead=8, num_encoder_layers=1):
        super().__init__()
        self.conv = nn.Conv1d(in_channels=1, out_channels=hidden_dim, kernel_size=5, stride=1, padding=2)
        encoder_layers = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.pos_encoder = InternS1ProTimeSeriesFixPositionalEncoding(d_model=hidden_dim)
        # Follow hidden_dim instead of a hard-coded 128 so that the subsampler's
        # channel bookkeeping stays correct for non-default widths.
        self.subsampling = InternS1ProTimeSeriesConcatSubsampling(hidden_dim, 2)

    def forward(self, inputs, input_lens, sr):
        """Return `(features, feature_lens)` as produced by `forward_patch`."""
        outputs, output_lens = self.forward_patch(inputs, input_lens, sr)
        return outputs, output_lens

    def forward_patch(self, inputs, input_lens, sr):
        """Patch, encode, pad and subsample every signal of the batch.

        Args:
            inputs: Indexable collection of signals; each item is indexed as
                `(seq_len, num_channel)` or is 1-D (a channel axis is then appended).
            input_lens: Valid length of each signal.
            sr: Tensor with the sampling rate of each signal.

        Returns:
            A tuple `(outputs, output_lens)`: the zero-padded batch of patch embeddings
            after frame concatenation, and the matching valid lengths.
        """
        sr = sr.float()
        # Patch stride as a function of sampling rate: floor(160 / (1 + exp(-sr/100))**6).
        # Patches are twice as long as the stride, i.e. neighbours overlap by half.
        strides = torch.floor(160 / ((1 + torch.exp(-sr / 100)) ** 6))
        patch_sizes = strides * 2
        patched_outputs = []
        output_lens = []

        for i in range(len(inputs)):
            seq = inputs[i]  # [seq_len, num_channel]
            ps = patch_sizes[i].item()
            st = strides[i].item()
            le = input_lens[i]

            # Number of patches needed to cover the signal, and the zero padding that
            # makes the last patch complete.
            output_len = torch.ceil((le - ps) / st) + 1
            pad_len = ((output_len - 1) * st + ps - le).long().item()
            if seq.ndim == 1:
                seq = seq.unsqueeze(-1)
            seq = nn.functional.pad(seq, (0, 0, 0, pad_len), "constant", 0)
            assert output_len > 0, (seq.shape, ps, st, le, output_len)
            output_lens.append(output_len)
            # Row p of `indices` lists the sample positions belonging to patch p.
            indices = (torch.arange(0, output_len * st, st).unsqueeze(1) + torch.arange(ps)).long()
            patched = seq[indices]

            output = self.forward_encoder(patched)  # [num_patch, D]
            patched_outputs.append(output)

        outputs = nn.utils.rnn.pad_sequence(patched_outputs, batch_first=True)
        output_lens = torch.tensor(output_lens).squeeze().to(outputs.device).long()
        # squeeze() collapses a single-item batch to a scalar; restore the batch axis.
        if output_lens.ndim == 0:
            output_lens = output_lens.unsqueeze(0)

        outputs, output_lens = self.subsampling(outputs.clone(), output_lens.clone())
        return outputs, output_lens

    def forward_encoder(self, x):
        """Encode patches of shape `(num_patch, patch_len, C)` into `(num_patch, D)`."""
        num_patch, patch_len, C = x.shape
        # conv1: treat each channel as an independent sample fed to the convolution.
        # NOTE(review): this is a plain reshape, not a permute, so for C > 1 time and
        # channel samples are interleaved rather than separated per channel. Left
        # unchanged to stay consistent with already-trained weights; confirm intent.
        x = x.reshape(num_patch * C, 1, patch_len)
        x = nn.functional.relu(self.conv(x))  # [B*C, D1, L]
        x = x.permute(2, 0, 1)  # [L, B*C, D1]

        x = self.pos_encoder(x)  # [L, B*C, D1]
        # Cast to whatever dtype the transformer weights use instead of assuming
        # bfloat16, so the module also runs when kept in float32/float16.
        encoder_dtype = next(self.transformer_encoder.parameters()).dtype
        x = self.transformer_encoder(x.to(encoder_dtype))
        x = x.mean(0)  # average over positions inside the patch

        x = x.reshape(num_patch, C, -1)

        return x.mean(1)  # average over the channel groups
1393
-
1394
class InternS1ProTimeSeriesProjector(nn.Module):
    """Map time-series features to the output hidden size.

    Applies layer normalization, a linear layer, the activation selected by
    `config.activation_function`, and a second linear layer.
    """

    def __init__(self, config: InternS1ProTimeSeriesConfig):
        super().__init__()
        in_dim = config.ts_hidden_dim
        out_dim = config.out_hidden_size
        # Sub-modules are created in the same order as before so that random
        # initialization consumes the RNG identically.
        self.layer_norm = nn.LayerNorm(in_dim)
        self.linear_1 = nn.Linear(in_dim, out_dim)
        self.act = ACT2FN[config.activation_function]
        self.linear_2 = nn.Linear(out_dim, out_dim)

    def forward(self, ts_features):
        """Return the projected features for `ts_features`."""
        normed = self.layer_norm(ts_features)
        activated = self.act(self.linear_1(normed))
        return self.linear_2(activated)
1408
-
1409
class InternS1ProTimeSeriesModel(InternS1ProPreTrainedModel):
    """Time-series tower: patch embedding, encoder and projector.

    Raw signals are embedded by `encoder_embed`, encoded by `encoder`, and projected by
    `projector`; a padding mask for the projected sequence is returned alongside.
    """

    main_input_name = 'time_series_signals'
    _supports_flash_attn_2 = False
    config_class = InternS1ProTimeSeriesConfig
    _no_split_modules = ['WhisperEncoderLayer']

    def __init__(self, config: InternS1ProTimeSeriesConfig):
        super().__init__(config)
        self.config = config
        self.encoder_embed = InternS1ProTimeSeriesMultiChannelAdaptiveSubsampling()
        self.encoder = InternS1ProTimeSeriesEncoder(config)
        self.projector = InternS1ProTimeSeriesProjector(config)

    def get_input_embeddings(self):
        """Return the module that embeds raw signals."""
        return self.encoder_embed

    def make_pad_mask(self, lengths: torch.Tensor) -> torch.Tensor:
        """
        Args:
          lengths:
            A 1-D tensor containing sentence lengths. The mask width is
            `lengths.max()`.
        Returns:
          Return a 2-D bool tensor, where masked positions
          are filled with `True` and non-masked positions are
          filled with `False`.

        >>> lengths = torch.tensor([1, 3, 2, 5])
        >>> make_pad_mask(lengths)
        tensor([[False,  True,  True,  True,  True],
                [False, False, False,  True,  True],
                [False, False,  True,  True,  True],
                [False, False, False, False, False]])
        """
        assert lengths.ndim == 1, lengths.ndim
        max_len = lengths.max()
        n = lengths.size(0)
        seq_range = torch.arange(0, max_len, device=lengths.device)
        expanded_range = seq_range.unsqueeze(0).expand(n, max_len)
        # A position is padding when its index is not below the sequence length.
        return expanded_range >= lengths.unsqueeze(-1)

    def forward(
        self,
        time_series_signals: Optional[torch.FloatTensor] = None,
        ts_lens: Optional[torch.Tensor] = None,
        sr: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        time_series_embeds: Optional[torch.FloatTensor] = None,
    ):
        """Encode time series and return `(ts_embeds, ts_pad_mask)`.

        Args:
            time_series_signals: Either a list of 2-D tensors or one 3-D tensor of raw
                signals. Ignored when usable `time_series_embeds` are given.
            ts_lens: Valid length of each item. Always required: it is consumed by the
                embedding step and is needed to build the padding mask.
            sr: Sampling rate of each signal, forwarded to the embedding step.
            output_hidden_states: Forwarded to the encoder; defaults to the config value.
            return_dict: Forwarded to the encoder; defaults to the config value.
            time_series_embeds: Precomputed 3-D embeddings whose last dimension equals
                `config.ts_adapt_in_dim`; when valid, the embedding step is skipped.

        Returns:
            `ts_embeds`, the projected encoder output, and `ts_pad_mask`, a bool mask
            that is `True` at padded positions.

        Raises:
            ValueError: If neither input is given, if `ts_lens` is missing, or if the
                signals have an unsupported layout.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if time_series_signals is None and time_series_embeds is None:
            raise ValueError('You have to specify time_series_signals or time_series_embeds')
        if ts_lens is None:
            raise ValueError('You have to specify ts_lens')

        has_usable_embeds = (
            time_series_embeds is not None
            and len(time_series_embeds.shape) == 3
            and time_series_embeds.shape[-1] == self.config.ts_adapt_in_dim
        )
        if not has_usable_embeds:
            is_signal_list = (
                isinstance(time_series_signals, list)
                and len(time_series_signals) > 0
                and len(time_series_signals[0].shape) == 2
            )
            is_signal_batch = isinstance(time_series_signals, torch.Tensor) and len(time_series_signals.shape) == 3
            if not (is_signal_list or is_signal_batch):
                # Describe the offending input without assuming it can be indexed.
                try:
                    bad_shape = time_series_signals[0].shape
                except (TypeError, IndexError, AttributeError):
                    bad_shape = None
                raise ValueError(f'wrong time_series_signals size: {bad_shape}')
            time_series_embeds, ts_lens = self.encoder_embed(time_series_signals, ts_lens, sr)

        # Example shapes from the original author:
        # [B, 64000, 1] -> [B, 200, 256] -> [B, 100, 1024]
        encoder_outputs = self.encoder(
            input_features=time_series_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # ts_lens after encoder (the encoder halves the length, rounding up)
        ts_lens = (ts_lens + 1) // 2
        assert torch.all(ts_lens > 0), f"The length of time_series_embeds is so small. ts_lens: {ts_lens}"

        ts_pad_mask = self.make_pad_mask(ts_lens)
        # Index 0 is the last hidden state for both the tuple (return_dict=False) and
        # the ModelOutput form, whereas attribute access only works for the latter.
        last_hidden_state = encoder_outputs[0]
        ts_embeds = self.projector(last_hidden_state)

        return ts_embeds, ts_pad_mask
1496
-
1497
-
1498
  @dataclass
1499
  @auto_docstring(
1500
  custom_intro="""
@@ -1556,13 +1118,12 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
1556
  # Reference: fix gemma3 grad acc #37208
1557
  accepts_loss_kwargs = False
1558
  config: InternS1ProConfig
1559
- _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock",'WhisperEncoderLayer']
1560
 
1561
  def __init__(self, config):
1562
  super().__init__(config)
1563
  self.visual = InternS1ProVisionModel._from_config(config.vision_config)
1564
  self.language_model = InternS1ProTextModel._from_config(config.text_config)
1565
- self.time_series = InternS1ProTimeSeriesModel._from_config(config.ts_config)
1566
 
1567
  # Initialize weights and apply final processing
1568
  self.post_init()
@@ -1609,15 +1170,6 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
1609
  split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
1610
  image_embeds = torch.split(image_embeds, split_sizes)
1611
  return image_embeds
1612
-
1613
def get_ts_feature(self, ts_values, ts_lens, sr):
    """Run the time-series sub-model and return `(ts_embeds, ts_pad_mask)`.

    Hidden states are never requested and dictionary-style encoder output is
    always enabled.
    """
    tower_kwargs = {
        "time_series_signals": ts_values,
        "ts_lens": ts_lens,
        "sr": sr,
        "output_hidden_states": False,
        "return_dict": True,
    }
    ts_embeds, ts_pad_mask = self.time_series(**tower_kwargs)
    return ts_embeds, ts_pad_mask
1621
 
1622
  def get_placeholder_mask(
1623
  self,
@@ -1660,7 +1212,7 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
1660
  return special_image_mask, special_video_mask
1661
 
1662
  @auto_docstring
1663
- @check_model_inputs
1664
  def forward(
1665
  self,
1666
  input_ids: torch.LongTensor = None,
@@ -1673,9 +1225,6 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
1673
  image_grid_thw: Optional[torch.LongTensor] = None,
1674
  video_grid_thw: Optional[torch.LongTensor] = None,
1675
  cache_position: Optional[torch.LongTensor] = None,
1676
- ts_values: Union[torch.FloatTensor, list[torch.FloatTensor]] = None,
1677
- ts_lens: Union[torch.Tensor, list[torch.Tensor]] = None,
1678
- ts_sr: Union[torch.FloatTensor, list[torch.FloatTensor]] = None,
1679
  **kwargs: Unpack[TransformersKwargs],
1680
  ) -> Union[tuple, Qwen3VLMoeModelOutputWithPast]:
1681
  r"""
@@ -1683,12 +1232,6 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
1683
  The temporal, height and width of feature shape of each image in LLM.
1684
  video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1685
  The temporal, height and width of feature shape of each video in LLM.
1686
- ts_values (`torch.FloatTensor` of shape `(batch_size, seq_len, num_channels)`, *optional*):
1687
- The tensors corresponding to the input time series signals.
1688
- ts_lens (`torch.Tensor` of shape `(batch_size,)`, *optional*):
1689
- The valid lengths of each time series signal in the batch.
1690
- ts_sr (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
1691
- The sampling rates of each time series signal in the batch.
1692
  """
1693
  if (input_ids is None) ^ (inputs_embeds is not None):
1694
  raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
@@ -1715,27 +1258,6 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
1715
  )
1716
  inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
1717
 
1718
- if pixel_values is None and pixel_values_videos is None and ts_values is not None:
1719
- ts_features, ts_pad_mask = self.get_ts_feature(ts_values, ts_lens, ts_sr) # [B, T, C], [B, T]
1720
- ts_features = ts_features[~ts_pad_mask].to(inputs_embeds.device, inputs_embeds.dtype) # [num_valid_ts_tokens, C]
1721
- B, N, C = inputs_embeds.shape
1722
- input_ids = input_ids.reshape(B * N)
1723
- inputs_embeds = inputs_embeds.reshape(B * N, C)
1724
- # replace ts_token in inputs_embeds and attention_mask
1725
- ts_placeholder = (input_ids == self.config.ts_token_id)
1726
- n_ts_placeholders = ts_placeholder.sum().item()
1727
- n_ts_tokens = ts_features.size(0)
1728
- assert n_ts_placeholders == n_ts_tokens, f"[ERROR]: Mismatch: <TS_CONTEXT> tokens={n_ts_placeholders}, ts_embeds_valid={n_ts_tokens}"
1729
-
1730
- try:
1731
- inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + ts_features
1732
- except Exception as e:
1733
- print(f'warning: {e}, inputs_embeds[selected].shape={inputs_embeds[ts_placeholder].shape}, ts_embeds_valid.shape={ts_features.shape}')
1734
- inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + n_ts_tokens[:n_ts_placeholders]
1735
-
1736
- inputs_embeds = inputs_embeds.reshape(B, N, C)
1737
- # input_ids = input_ids.reshape(B, N)
1738
-
1739
  if position_ids is None:
1740
  batch_size, seq_length = inputs_embeds.shape[:2]
1741
  if cache_position is not None:
@@ -1874,8 +1396,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
1874
  def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
1875
  return self.model.get_image_features(pixel_values, image_grid_thw)
1876
 
1877
def get_ts_feature(self, ts_values, ts_lens, sr):
    """Forward to `self.model.get_ts_feature` and return its result unchanged."""
    base_model = self.model
    return base_model.get_ts_feature(ts_values, ts_lens, sr)
1879
  # Make modules available through conditional class for BC
1880
  @property
1881
  def language_model(self):
@@ -1884,11 +1404,8 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
1884
  @property
1885
  def visual(self):
1886
  return self.model.visual
1887
-
1888
def time_series(self):
    # Returns the `time_series` attribute of the wrapped `self.model`.
    # NOTE(review): the neighbouring accessors `language_model` and `visual` are
    # declared with `@property`, while this one is a plain method and must be
    # invoked as `obj.time_series()`. Confirm whether `@property` was intended
    # before changing it, since that would alter how callers use it.
    return self.model.time_series
1890
 
1891
- @check_model_inputs
1892
  def forward(
1893
  self,
1894
  input_ids: torch.LongTensor = None,
@@ -1901,9 +1418,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
1901
  pixel_values_videos: Optional[torch.FloatTensor] = None,
1902
  image_grid_thw: Optional[torch.LongTensor] = None,
1903
  video_grid_thw: Optional[torch.LongTensor] = None,
1904
- ts_values: Optional[Union[torch.FloatTensor, list[torch.FloatTensor]]] = None,
1905
- ts_lens: Optional[Union[torch.Tensor, list[torch.Tensor]]] = None,
1906
- ts_sr: Optional[Union[torch.FloatTensor, list[torch.FloatTensor]]] = None,
1907
  cache_position: Optional[torch.LongTensor] = None,
1908
  logits_to_keep: Union[int, torch.Tensor] = 0,
1909
  **kwargs: Unpack[TransformersKwargs],
@@ -1970,9 +1484,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
1970
  past_key_values=past_key_values,
1971
  inputs_embeds=inputs_embeds,
1972
  cache_position=cache_position,
1973
- ts_values=ts_values,
1974
- ts_lens=ts_lens,
1975
- ts_sr=ts_sr,
1976
  **kwargs,
1977
  )
1978
 
@@ -2019,9 +1530,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
2019
  pixel_values_videos=None,
2020
  image_grid_thw=None,
2021
  video_grid_thw=None,
2022
- ts_values=None,
2023
- ts_lens=None,
2024
- ts_sr=None,
2025
  **kwargs,
2026
  ):
2027
  # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -2038,9 +1546,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
2038
  image_grid_thw=image_grid_thw,
2039
  video_grid_thw=video_grid_thw,
2040
  use_cache=use_cache,
2041
- ts_values=ts_values,
2042
- ts_lens=ts_lens,
2043
- ts_sr=ts_sr,
2044
  **kwargs,
2045
  )
2046
 
@@ -2049,9 +1554,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
2049
  if cache_position[0] != 0:
2050
  model_inputs["pixel_values"] = None
2051
  model_inputs["pixel_values_videos"] = None
2052
- model_inputs["ts_values"] = None
2053
- model_inputs["ts_lens"] = None
2054
- model_inputs["ts_sr"] = None
2055
 
2056
  return model_inputs
2057
 
@@ -2195,7 +1697,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
2195
 
2196
  __all__ = [
2197
  "InternS1ProVisionModel",
2198
- "InternS1ProTimeSeriesModel",
2199
  "InternS1ProForConditionalGeneration",
2200
  "InternS1ProModel",
2201
  "InternS1ProPreTrainedModel",
 
34
  from transformers.processing_utils import Unpack
35
  from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling
36
  from transformers.utils.generic import OutputRecorder, check_model_inputs
37
+ from .configuration_interns1_pro import InternS1ProConfig, InternS1ProTextConfig, InternS1ProVisionConfig
38
+
 
 
39
 
40
  @use_kernel_forward_from_hub("RMSNorm")
41
  class Qwen3VLMoeTextRMSNorm(nn.Module):
 
439
  config: InternS1ProConfig
440
  base_model_prefix = "model"
441
  supports_gradient_checkpointing = True
442
+ _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
443
  _skip_keys_device_placement = ["past_key_values"]
444
  _supports_flash_attn = True
445
  _supports_sdpa = True
 
986
  # Initialize weights and apply final processing
987
  self.post_init()
988
 
989
+ @check_model_inputs()
990
  @auto_docstring
991
  def forward(
992
  self,
 
1057
  )
1058
 
1059
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1060
  @dataclass
1061
  @auto_docstring(
1062
  custom_intro="""
 
1118
  # Reference: fix gemma3 grad acc #37208
1119
  accepts_loss_kwargs = False
1120
  config: InternS1ProConfig
1121
+ _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
1122
 
1123
  def __init__(self, config):
1124
  super().__init__(config)
1125
  self.visual = InternS1ProVisionModel._from_config(config.vision_config)
1126
  self.language_model = InternS1ProTextModel._from_config(config.text_config)
 
1127
 
1128
  # Initialize weights and apply final processing
1129
  self.post_init()
 
1170
  split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
1171
  image_embeds = torch.split(image_embeds, split_sizes)
1172
  return image_embeds
 
 
 
 
 
 
 
 
 
1173
 
1174
  def get_placeholder_mask(
1175
  self,
 
1212
  return special_image_mask, special_video_mask
1213
 
1214
  @auto_docstring
1215
+ @check_model_inputs()
1216
  def forward(
1217
  self,
1218
  input_ids: torch.LongTensor = None,
 
1225
  image_grid_thw: Optional[torch.LongTensor] = None,
1226
  video_grid_thw: Optional[torch.LongTensor] = None,
1227
  cache_position: Optional[torch.LongTensor] = None,
 
 
 
1228
  **kwargs: Unpack[TransformersKwargs],
1229
  ) -> Union[tuple, Qwen3VLMoeModelOutputWithPast]:
1230
  r"""
 
1232
  The temporal, height and width of feature shape of each image in LLM.
1233
  video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1234
  The temporal, height and width of feature shape of each video in LLM.
 
 
 
 
 
 
1235
  """
1236
  if (input_ids is None) ^ (inputs_embeds is not None):
1237
  raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
1258
  )
1259
  inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
1260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1261
  if position_ids is None:
1262
  batch_size, seq_length = inputs_embeds.shape[:2]
1263
  if cache_position is not None:
 
1396
  def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
1397
  return self.model.get_image_features(pixel_values, image_grid_thw)
1398
 
 
 
1399
  # Make modules available through conditional class for BC
1400
  @property
1401
  def language_model(self):
 
1404
  @property
1405
  def visual(self):
1406
  return self.model.visual
 
 
 
1407
 
1408
+ @check_model_inputs()
1409
  def forward(
1410
  self,
1411
  input_ids: torch.LongTensor = None,
 
1418
  pixel_values_videos: Optional[torch.FloatTensor] = None,
1419
  image_grid_thw: Optional[torch.LongTensor] = None,
1420
  video_grid_thw: Optional[torch.LongTensor] = None,
 
 
 
1421
  cache_position: Optional[torch.LongTensor] = None,
1422
  logits_to_keep: Union[int, torch.Tensor] = 0,
1423
  **kwargs: Unpack[TransformersKwargs],
 
1484
  past_key_values=past_key_values,
1485
  inputs_embeds=inputs_embeds,
1486
  cache_position=cache_position,
 
 
 
1487
  **kwargs,
1488
  )
1489
 
 
1530
  pixel_values_videos=None,
1531
  image_grid_thw=None,
1532
  video_grid_thw=None,
 
 
 
1533
  **kwargs,
1534
  ):
1535
  # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
 
1546
  image_grid_thw=image_grid_thw,
1547
  video_grid_thw=video_grid_thw,
1548
  use_cache=use_cache,
 
 
 
1549
  **kwargs,
1550
  )
1551
 
 
1554
  if cache_position[0] != 0:
1555
  model_inputs["pixel_values"] = None
1556
  model_inputs["pixel_values_videos"] = None
 
 
 
1557
 
1558
  return model_inputs
1559
 
 
1697
 
1698
  __all__ = [
1699
  "InternS1ProVisionModel",
 
1700
  "InternS1ProForConditionalGeneration",
1701
  "InternS1ProModel",
1702
  "InternS1ProPreTrainedModel",
processing_interns1_pro.py CHANGED
@@ -18,7 +18,7 @@
18
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
  # See the License for the specific language governing permissions and
20
  # limitations under the License.
21
- from typing import Union,Optional
22
 
23
  import numpy as np
24
 
@@ -28,7 +28,6 @@ from transformers.processing_utils import MultiModalData, ProcessingKwargs, Proc
28
  from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
29
  from transformers.utils import logging
30
  from transformers.video_utils import VideoInput
31
- import os
32
 
33
 
34
  logger = logging.get_logger(__name__)
@@ -42,7 +41,6 @@ class InternS1ProProcessorKwargs(ProcessingKwargs, total=False):
42
  "return_mm_token_type_ids": False,
43
  },
44
  "videos_kwargs": {"return_metadata": True},
45
- "time_series_kwargs": {},
46
  }
47
 
48
 
@@ -70,7 +68,6 @@ class InternS1ProProcessor(ProcessorMixin):
70
  def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
71
  self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
72
  self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
73
- self.ts_token = "<TS_CONTEXT>" if not hasattr(tokenizer, "ts_token") else tokenizer.ts_token
74
  self.image_token_id = (
75
  tokenizer.image_token_id
76
  if getattr(tokenizer, "image_token_id", None)
@@ -81,11 +78,6 @@ class InternS1ProProcessor(ProcessorMixin):
81
  if getattr(tokenizer, "video_token_id", None)
82
  else tokenizer.convert_tokens_to_ids(self.video_token)
83
  )
84
- self.ts_token_id = (
85
- tokenizer.ts_token_id
86
- if getattr(tokenizer, "ts_token_id", None)
87
- else tokenizer.convert_tokens_to_ids(self.ts_token)
88
- )
89
  super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
90
  self.vision_start_token = (
91
  "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
@@ -103,132 +95,12 @@ class InternS1ProProcessor(ProcessorMixin):
103
  if getattr(tokenizer, "vision_end_token_id", None)
104
  else tokenizer.convert_tokens_to_ids(self.vision_end_token)
105
  )
106
- self.ts_start_token = (
107
- "<|ts|>" if not hasattr(tokenizer, "ts_start_token") else tokenizer.ts_start_token
108
- )
109
- self.ts_end_token = (
110
- "<|/ts|>" if not hasattr(tokenizer, "ts_end_token") else tokenizer.ts_end_token
111
- )
112
- self.ts_start_token_id = (
113
- tokenizer.ts_start_token_id
114
- if getattr(tokenizer, "ts_start_token_id", None)
115
- else tokenizer.convert_tokens_to_ids(self.ts_start_token)
116
- )
117
- self.ts_end_token_id = (
118
- tokenizer.ts_end_token_id
119
- if getattr(tokenizer, "ts_end_token_id", None)
120
- else tokenizer.convert_tokens_to_ids(self.ts_end_token)
121
- )
122
-
123
def time_series_preprocessor(self, conversation):
    """Collect time-series file references from the user turns of a chat.

    Args:
        conversation: One conversation (a sequence of messages) or a batch of
            conversations. Each message is expected to expose `"role"` and
            `"content"` keys, where content is a list of typed items.

    Returns:
        A dict with `"time_series_paths"` and `"time_series_sampling_rates"`. Both
        are lists of equal length, or `None` when no time series was found. A
        sampling rate is `None` when the item does not declare one.
    """
    is_batched = (
        isinstance(conversation, (list, tuple))
        and len(conversation) > 0
        and (isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content"))
    )
    conversations = conversation if is_batched else [conversation]

    batch_time_series = []
    batch_time_series_metadata = []
    for single_conversation in conversations:
        for message in single_conversation:
            if message["role"] != "user":
                continue
            contents = message["content"]
            # Plain-text content carries no structured items to inspect.
            if not isinstance(contents, (list, tuple)):
                continue
            for item in contents:
                if not isinstance(item, dict):
                    continue
                # Path and rate are taken from the same item in a single pass, so an
                # item without "data" can no longer shift rates onto the wrong path.
                if item.get("type") == "time_series" and "data" in item:
                    batch_time_series.append(item["data"])
                    batch_time_series_metadata.append(item.get("sampling_rate", None))

    return {
        "time_series_paths": batch_time_series if batch_time_series else None,
        "time_series_sampling_rates": batch_time_series_metadata if batch_time_series_metadata else None,
    }
152
-
153
def time_series_processor(
    self,
    ts_paths: list[str],
    sampling_rates: list[float],
    do_normalize=True,
    do_truncate=True,
) -> BatchFeature:
    """Load time-series files and package them as model inputs.

    Supported extensions are `.wav`, `.mp3` and `.flac` (read with `soundfile`),
    `.csv` (read with `pandas`, no header row) and `.npy`.

    Args:
        ts_paths: Paths of the files to load.
        sampling_rates: Sampling rate of each file. For audio files the rate reported
            by `soundfile` replaces the given value. A rate of `None` or `0` falls
            back to `length / 4`.
        do_normalize: Standardize each signal along its first axis.
        do_truncate: Keep at most the first 240000 samples of each signal.

    Returns:
        A `BatchFeature` holding `ts_values` (list of arrays indexed `[T, C]`),
        `ts_sr`, `ts_lens` and `num_ts_tokens`.

    Raises:
        ImportError: If an audio file is given and `soundfile` is not installed.
        ValueError: If a file has an unsupported extension.
    """
    assert len(ts_paths) == len(sampling_rates), "ts_paths and sampling_rates must have the same length"

    max_samples = 240000  # truncate to 240k to avoid oom

    ts_values = []
    ts_sr = []
    ts_lens = []

    for ts_path, sr in zip(ts_paths, sampling_rates):
        ext = os.path.splitext(ts_path)[-1].lower()
        if ext in (".wav", ".mp3", ".flac"):
            try:
                import soundfile as sf
            except ImportError as err:
                raise ImportError("Please install soundfile to process audio files.") from err
            ts_input, sr = sf.read(ts_path)  # ts_input: np.ndarray, shape [T] or [T, C]
        elif ext == ".csv":
            import pandas as pd  # optional dependency, only needed for CSV input

            ts_input = pd.read_csv(ts_path, header=None).values  # [T, C]
        elif ext == ".npy":
            ts_input = np.load(ts_path)  # [T, C]
        else:
            raise ValueError(f"Unsupported file format: {ext}")

        if not isinstance(ts_input, np.ndarray):
            ts_input = np.array(ts_input, dtype=np.float32)

        if do_normalize:
            # Statistics are taken over the full signal, before any truncation.
            mean = ts_input.mean(axis=0, keepdims=True)
            std = ts_input.std(axis=0, keepdims=True)
            ts_input = (ts_input - mean) / (std + 1e-8)

        if do_truncate and len(ts_input) > max_samples:
            ts_input = ts_input[:max_samples]

        if ts_input.ndim == 1:
            ts_input = ts_input[:, None]  # [T, C]

        ts_len = ts_input.shape[0]

        if sr is None or sr == 0:  # if no sr provided
            sr = ts_len / 4

        ts_values.append(ts_input)
        ts_sr.append(sr)
        ts_lens.append(ts_len)

    ts_lens = np.array(ts_lens)
    ts_sr = np.array(ts_sr)
    num_ts_tokens = self._get_num_ts_tokens(sampling_rates=ts_sr, ts_lens=ts_lens)
    return BatchFeature(
        data={
            "ts_values": ts_values,
            "ts_sr": ts_sr,
            "ts_lens": ts_lens,
            "num_ts_tokens": num_ts_tokens,
        }
    )
217
-
218
- def _get_num_ts_tokens(self,sampling_rates,ts_lens):
219
- strides = np.floor(160/((1+np.exp(-sampling_rates/100))**6))
220
- patch_sizes = strides * 2
221
- embed_lengths = (np.ceil((ts_lens - patch_sizes) / strides) + 1).astype(np.int64)
222
- num_ts_tokens=[(embed_length // 2 + 1) // 2 for embed_length in embed_lengths]
223
- return num_ts_tokens
224
 
225
  def __call__(
226
  self,
227
  images: ImageInput = None,
228
  text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
229
  videos: VideoInput = None,
230
- time_series_paths: Optional[list[str]]=None,
231
- time_series_sampling_rates: Optional[list[float]]=None,
232
  **kwargs: Unpack[InternS1ProProcessorKwargs],
233
  ) -> BatchFeature:
234
  """
@@ -248,7 +120,6 @@ class InternS1ProProcessor(ProcessorMixin):
248
  videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
249
  The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
250
  tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
251
- time_series_signals (`list[np.ndarray]`, `list[torch.Tensor]`):
252
  return_tensors (`str` or [`~utils.TensorType`], *optional*):
253
  If set, will return tensors of a particular framework. Acceptable values are:
254
  - `'pt'`: Return PyTorch `torch.Tensor` objects.
@@ -345,22 +216,6 @@ class InternS1ProProcessor(ProcessorMixin):
345
 
346
  text[i] = text[i].replace("<|placeholder|>", self.video_token)
347
 
348
- time_series_inputs = {}
349
- if images is None and videos is None and time_series_paths is not None:
350
- assert time_series_sampling_rates is not None, "If time_series_signals is provided, time_series_sampling_rates must also be provided."
351
- assert len(time_series_paths) == len(time_series_sampling_rates), "The number of time series signals must match the number of sampling rates."
352
- time_series_inputs = self.time_series_processor(ts_paths=time_series_paths, sampling_rates=time_series_sampling_rates)
353
- num_ts_tokens = time_series_inputs.pop("num_ts_tokens")
354
- assert len(num_ts_tokens) == len(text), "The number of time series signals must match the number of text prompts."
355
- for i in range(len(text)):
356
- if f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}" in text[i]:
357
- ts_placeholder = self.ts_start_token + self.ts_token * num_ts_tokens[i] + self.ts_end_token
358
- text[i] = text[i].replace(
359
- f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}", ts_placeholder, 1
360
- )
361
- elif self.ts_token in text[i]:
362
- text[i] = text[i].replace(self.ts_token, self.ts_token * num_ts_tokens[i])
363
-
364
  return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
365
  return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
366
  text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
@@ -372,7 +227,7 @@ class InternS1ProProcessor(ProcessorMixin):
372
  mm_token_type_ids[array_ids == self.image_token_id] = 1
373
  text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
374
 
375
- return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs,**time_series_inputs}, tensor_type=return_tensors)
376
 
377
  def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
378
  """
 
18
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
  # See the License for the specific language governing permissions and
20
  # limitations under the License.
21
+ from typing import Union
22
 
23
  import numpy as np
24
 
 
28
  from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
29
  from transformers.utils import logging
30
  from transformers.video_utils import VideoInput
 
31
 
32
 
33
  logger = logging.get_logger(__name__)
 
41
  "return_mm_token_type_ids": False,
42
  },
43
  "videos_kwargs": {"return_metadata": True},
 
44
  }
45
 
46
 
 
68
  def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
69
  self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
70
  self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
 
71
  self.image_token_id = (
72
  tokenizer.image_token_id
73
  if getattr(tokenizer, "image_token_id", None)
 
78
  if getattr(tokenizer, "video_token_id", None)
79
  else tokenizer.convert_tokens_to_ids(self.video_token)
80
  )
 
 
 
 
 
81
  super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
82
  self.vision_start_token = (
83
  "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
 
95
  if getattr(tokenizer, "vision_end_token_id", None)
96
  else tokenizer.convert_tokens_to_ids(self.vision_end_token)
97
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  def __call__(
100
  self,
101
  images: ImageInput = None,
102
  text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
103
  videos: VideoInput = None,
 
 
104
  **kwargs: Unpack[InternS1ProProcessorKwargs],
105
  ) -> BatchFeature:
106
  """
 
120
  videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
121
  The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
122
  tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
 
123
  return_tensors (`str` or [`~utils.TensorType`], *optional*):
124
  If set, will return tensors of a particular framework. Acceptable values are:
125
  - `'pt'`: Return PyTorch `torch.Tensor` objects.
 
216
 
217
  text[i] = text[i].replace("<|placeholder|>", self.video_token)
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
220
  return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
221
  text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
 
227
  mm_token_type_ids[array_ids == self.image_token_id] = 1
228
  text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
229
 
230
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
231
 
232
  def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
233
  """
test_inference_ts.py DELETED
@@ -1,78 +0,0 @@
1
- from pathlib import Path
2
- import torch
3
- from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor
4
-
5
-
6
- model_path = Path(__file__).parent.resolve()
7
- print(f"Loading model from: {model_path}")
8
-
9
- # 加载模型配置
10
- config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
11
- print(f"Model config: {config.model_type}")
12
- print(f"Architecture: {config.architectures}")
13
-
14
- # 加载处理器(tokenizer + image processor + ts processor)
15
- print("\nLoading processor...")
16
- processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
17
-
18
- # 加载模型(使用 bfloat16 精度和自动设备映射)
19
- print("\nLoading model...")
20
- model = AutoModelForCausalLM.from_pretrained(
21
- model_path,
22
- dtype=torch.bfloat16,
23
- device_map="auto",
24
- # attn_implementation="flash_attention_2", #时序暂不支持flash_attn,load加这行会报错
25
- trust_remote_code=True
26
- )
27
-
28
- print(f"✓ Model loaded successfully!")
29
- print(f"Model type: {type(model).__name__}")
30
- print(f"Model device: {model.device}")
31
-
32
- # ============================================================================
33
- # 测试 3: 时序对话
34
- # ============================================================================
35
- print("\n" + "=" * 80)
36
- print("测试 3: 时序对话")
37
- print("=" * 80)
38
-
39
- messages = [
40
- {
41
- "role": "user",
42
- "content": [
43
- {"type": "time_series", "data": "./0092638_seism.npy", "sampling_rate": 100},
44
- {"type": "text", "text": "Please determine whether an Earthquake event has occurred in the provided time-series data. If so, please specify the starting time point indices of the P-wave and S-wave in the event."},
45
- ],
46
- }
47
- ]
48
-
49
- time_series_inputs = processor.time_series_preprocessor(messages)
50
- multimodal_inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", enable_thinking=False, **time_series_inputs).to(model.device, dtype=torch.bfloat16)
51
-
52
- print("\n生成时序回复...")
53
- with torch.inference_mode():
54
- multimodal_generated_ids = model.generate(
55
- **multimodal_inputs,
56
- max_new_tokens=200,
57
- do_sample=False,
58
- temperature=1.0,
59
- )
60
-
61
- # 提取生成的 token(去除输入部分)
62
- multimodal_generated_ids_trimmed = [
63
- out_ids[len(in_ids):] for in_ids, out_ids in zip(multimodal_inputs.input_ids, multimodal_generated_ids)
64
- ]
65
-
66
- # 解码为文本
67
- multimodal_output = processor.batch_decode(
68
- multimodal_generated_ids_trimmed,
69
- skip_special_tokens=True,
70
- clean_up_tokenization_spaces=False
71
- )
72
-
73
- print("\n" + "-" * 80)
74
- print("时序输出:")
75
- print("-" * 80)
76
- print(multimodal_output[0])
77
- print("-" * 80)
78
- print("\n✅ 时序功能测试完成!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenization_interns1.py CHANGED
@@ -24,14 +24,11 @@ from functools import lru_cache
24
  import regex as re
25
  import sentencepiece as spm
26
 
27
- import transformers
28
  from transformers.tokenization_utils_base import AddedToken, TextInput
 
29
  from transformers.utils import logging
30
- from packaging import version
31
- if version.parse(transformers.__version__) >= version.parse("5.0.0"):
32
- from transformers.tokenization_python import PreTrainedTokenizer
33
- else:
34
- from transformers.tokenization_utils import PreTrainedTokenizer
35
 
36
  logger = logging.get_logger(__name__)
37
 
@@ -509,7 +506,6 @@ class InternS1Tokenizer(PreTrainedTokenizer):
509
  pad_token="<|endoftext|>",
510
  clean_up_tokenization_spaces=False,
511
  split_special_tokens=False,
512
- special_tokens_pattern="none",
513
  **kwargs,
514
  ):
515
  bos_token = (
@@ -570,7 +566,6 @@ class InternS1Tokenizer(PreTrainedTokenizer):
570
  pad_token=pad_token,
571
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
572
  split_special_tokens=split_special_tokens,
573
- special_tokens_pattern="none",
574
  **kwargs,
575
  )
576
 
@@ -720,6 +715,9 @@ class InternS1Tokenizer(PreTrainedTokenizer):
720
 
721
  text, kwargs = self.prepare_for_tokenization(text, **kwargs)
722
 
 
 
 
723
  if hasattr(self, "do_lower_case") and self.do_lower_case:
724
  # convert non-special tokens to lowercase. Might be super slow as well?
725
  escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
 
24
  import regex as re
25
  import sentencepiece as spm
26
 
 
27
  from transformers.tokenization_utils_base import AddedToken, TextInput
28
+ from transformers.tokenization_utils import PreTrainedTokenizer
29
  from transformers.utils import logging
30
+ # from transformers.utils.import_utils import requires
31
+
 
 
 
32
 
33
  logger = logging.get_logger(__name__)
34
 
 
506
  pad_token="<|endoftext|>",
507
  clean_up_tokenization_spaces=False,
508
  split_special_tokens=False,
 
509
  **kwargs,
510
  ):
511
  bos_token = (
 
566
  pad_token=pad_token,
567
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
568
  split_special_tokens=split_special_tokens,
 
569
  **kwargs,
570
  )
571
 
 
715
 
716
  text, kwargs = self.prepare_for_tokenization(text, **kwargs)
717
 
718
+ if kwargs:
719
+ logger.warning(f"Keyword arguments {kwargs} not recognized.")
720
+
721
  if hasattr(self, "do_lower_case") and self.do_lower_case:
722
  # convert non-special tokens to lowercase. Might be super slow as well?
723
  escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]