fix check_model_inputs
#5
by huang11 - opened
- 0092638_seism.npy +0 -3
- README.md +4 -133
- chat_template.jinja +0 -2
- config.json +0 -32
- configuration_interns1_pro.py +2 -52
- deployment_guide.md +21 -5
- model-time_series-00001-of-00002.safetensors +0 -3
- model-time_series-00002-of-00002.safetensors +0 -3
- model.safetensors.index.json +2 -2
- modeling_interns1_pro.py +7 -506
- processing_interns1_pro.py +2 -147
- test_inference_ts.py +0 -78
- tokenization_interns1.py +6 -8
0092638_seism.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c2b94653c6964b630038897a27cb6d276ff866d9ecd1f6419358b9407f0df62e
|
| 3 |
-
size 72128
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -12,7 +12,7 @@ library_name: transformers
|
|
| 12 |
|
| 13 |
<div> </div>
|
| 14 |
|
| 15 |
-
[💻Github Repo](https://github.com/InternLM/Intern-S1) • [🤗Model Collections](https://huggingface.co/collections/internlm/intern-s1-6882e325e8ac1c58ba108aa5) • [📜Technical Report](https://
|
| 16 |
|
| 17 |
</div>
|
| 18 |
|
|
@@ -60,9 +60,6 @@ temperature = 0.8
|
|
| 60 |
|
| 61 |
### Serving
|
| 62 |
|
| 63 |
-
> [!IMPORTANT]
|
| 64 |
-
> Running a trillion-parameter model using the native Hugging Face forward method is challenging. We strongly recommend using an LLM inference engine (such as LMDeploy, vLLM, or SGLang) to host Intern-S1-Pro and accessing the model via API.
|
| 65 |
-
|
| 66 |
Intern-S1-Pro can be deployed using any of the following LLM inference frameworks:
|
| 67 |
|
| 68 |
- LMDeploy
|
|
@@ -71,6 +68,8 @@ Intern-S1-Pro can be deployed using any of the following LLM inference framework
|
|
| 71 |
|
| 72 |
Detailed deployment examples for these frameworks are available in the [Model Deployment Guide](./deployment_guide.md).
|
| 73 |
|
|
|
|
|
|
|
| 74 |
|
| 75 |
## Advanced Usage
|
| 76 |
|
|
@@ -247,7 +246,7 @@ text = tokenizer.apply_chat_template(
|
|
| 247 |
)
|
| 248 |
```
|
| 249 |
|
| 250 |
-
|
| 251 |
|
| 252 |
```python
|
| 253 |
from openai import OpenAI
|
|
@@ -286,122 +285,6 @@ response = client.chat.completions.create(
|
|
| 286 |
print(json.dumps(response.model_dump(), indent=2, ensure_ascii=False))
|
| 287 |
```
|
| 288 |
|
| 289 |
-
### Time Series Demo
|
| 290 |
-
|
| 291 |
-
Time series inference is currently only supported in LMDeploy. To get started, download and deploy Intern-S1-Pro with LMDeploy (>=v0.12.1) by following the [Model Deployment Guide](./deployment_guide.md).
|
| 292 |
-
Below is an example of detecting earthquake events from a time series signal file. Additional data types and functionalities are also supported.
|
| 293 |
-
|
| 294 |
-
```
|
| 295 |
-
from openai import OpenAI
|
| 296 |
-
from lmdeploy.vl.time_series_utils import encode_time_series_base64
|
| 297 |
-
|
| 298 |
-
openai_api_key = "EMPTY"
|
| 299 |
-
openai_api_base = "http://0.0.0.0:8000/v1"
|
| 300 |
-
client = OpenAI(
|
| 301 |
-
api_key=openai_api_key,
|
| 302 |
-
base_url=openai_api_base,
|
| 303 |
-
)
|
| 304 |
-
model_name = client.models.list().data[0].id
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
def send_base64(file_path: str, sampling_rate: int = 100):
|
| 308 |
-
"""base64-encoded time-series data."""
|
| 309 |
-
|
| 310 |
-
# encode_time_series_base64 accepts local file paths and http urls,
|
| 311 |
-
# encoding time-series data (.npy, .csv, .wav, .mp3, .flac, etc.) into base64 strings.
|
| 312 |
-
base64_ts = encode_time_series_base64(file_path)
|
| 313 |
-
|
| 314 |
-
messages = [
|
| 315 |
-
{
|
| 316 |
-
"role": "user",
|
| 317 |
-
"content": [
|
| 318 |
-
{
|
| 319 |
-
"type": "text",
|
| 320 |
-
"text": "Please determine whether an Earthquake event has occurred in the provided time-series data. If so, please specify the starting time point indices of the P-wave and S-wave in the event."
|
| 321 |
-
},
|
| 322 |
-
{
|
| 323 |
-
"type": "time_series_url",
|
| 324 |
-
"time_series_url": {
|
| 325 |
-
"url": f"data:time_series/npy;base64,{base64_ts}",
|
| 326 |
-
"sampling_rate": sampling_rate
|
| 327 |
-
},
|
| 328 |
-
},
|
| 329 |
-
],
|
| 330 |
-
}
|
| 331 |
-
]
|
| 332 |
-
|
| 333 |
-
return client.chat.completions.create(
|
| 334 |
-
model=model_name,
|
| 335 |
-
messages=messages,
|
| 336 |
-
temperature=0,
|
| 337 |
-
max_tokens=200,
|
| 338 |
-
)
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
def send_http_url(url: str, sampling_rate: int = 100):
|
| 342 |
-
"""http(s) url pointing to the time-series data."""
|
| 343 |
-
messages = [
|
| 344 |
-
{
|
| 345 |
-
"role": "user",
|
| 346 |
-
"content": [
|
| 347 |
-
{
|
| 348 |
-
"type": "text",
|
| 349 |
-
"text": "Please determine whether an Earthquake event has occurred in the provided time-series data. If so, please specify the starting time point indices of the P-wave and S-wave in the event."
|
| 350 |
-
},
|
| 351 |
-
{
|
| 352 |
-
"type": "time_series_url",
|
| 353 |
-
"time_series_url": {
|
| 354 |
-
"url": url,
|
| 355 |
-
"sampling_rate": sampling_rate
|
| 356 |
-
},
|
| 357 |
-
},
|
| 358 |
-
],
|
| 359 |
-
}
|
| 360 |
-
]
|
| 361 |
-
|
| 362 |
-
return client.chat.completions.create(
|
| 363 |
-
model=model_name,
|
| 364 |
-
messages=messages,
|
| 365 |
-
temperature=0,
|
| 366 |
-
max_tokens=200,
|
| 367 |
-
)
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
def send_file_url(file_path: str, sampling_rate: int = 100):
|
| 371 |
-
"""file url pointing to the time-series data."""
|
| 372 |
-
messages = [
|
| 373 |
-
{
|
| 374 |
-
"role": "user",
|
| 375 |
-
"content": [
|
| 376 |
-
{
|
| 377 |
-
"type": "text",
|
| 378 |
-
"text": "Please determine whether an Earthquake event has occurred in the provided time-series data. If so, please specify the starting time point indices of the P-wave and S-wave in the event."
|
| 379 |
-
},
|
| 380 |
-
{
|
| 381 |
-
"type": "time_series_url",
|
| 382 |
-
"time_series_url": {
|
| 383 |
-
"url": f"file://{file_path}",
|
| 384 |
-
"sampling_rate": sampling_rate
|
| 385 |
-
},
|
| 386 |
-
},
|
| 387 |
-
],
|
| 388 |
-
}
|
| 389 |
-
]
|
| 390 |
-
|
| 391 |
-
return client.chat.completions.create(
|
| 392 |
-
model=model_name,
|
| 393 |
-
messages=messages,
|
| 394 |
-
temperature=0,
|
| 395 |
-
max_tokens=200,
|
| 396 |
-
)
|
| 397 |
-
|
| 398 |
-
response = send_base64("./0092638_seism.npy")
|
| 399 |
-
# response = send_http_url("https://huggingface.co/internlm/Intern-S1-Pro/raw/main/0092638_seism.npy")
|
| 400 |
-
# response = send_file_url("./0092638_seism.npy")
|
| 401 |
-
|
| 402 |
-
print(response.choices[0].message)
|
| 403 |
-
```
|
| 404 |
-
|
| 405 |
## Citation
|
| 406 |
|
| 407 |
If you find this work useful, feel free to give us a cite.
|
|
@@ -417,15 +300,3 @@ If you find this work useful, feel free to give us a cite.
|
|
| 417 |
url={https://arxiv.org/abs/2508.15763},
|
| 418 |
}
|
| 419 |
```
|
| 420 |
-
|
| 421 |
-
```
|
| 422 |
-
@misc{zou2026interns1proscientificmultimodalfoundation,
|
| 423 |
-
title={Intern-S1-Pro: Scientific Multimodal Foundation Model at Trillion Scale},
|
| 424 |
-
author={Yicheng Zou and Dongsheng Zhu and Lin Zhu and Tong Zhu and Yunhua Zhou and Peiheng Zhou and Xinyu Zhou and Dongzhan Zhou and Zhiwang Zhou and Yuhao Zhou and Bowen Zhou and Zhanping Zhong and Zhijie Zhong and Haiteng Zhao and Penghao Zhao and Xiaomeng Zhao and Zhiyuan Zhao and Yechen Zhang and Jin Zhang and Wenwei Zhang and Hongjie Zhang and Zhuo Zhang and Wenlong Zhang and Bo Zhang and Chao Zhang and Chen Zhang and Yuhang Zang and Fei Yuan and Jiakang Yuan and Jiashuo Yu and Jinhui Yin and Haochen Ye and Qian Yao and Bowen Yang and Danni Yang and Kaichen Yang and Ziang Yan and Jun Xu and Yicheng Xu and Wanghan Xu and Xuenan Xu and Chao Xu and Ruiliang Xu and Shuhao Xing and Long Xing and Xinchen Xie and Ling-I Wu and Zijian Wu and Zhenyu Wu and Lijun Wu and Yue Wu and Jianyu Wu and Wen Wu and Fan Wu and Xilin Wei and Qi Wei and Bingli Wang and Rui Wang and Ziyi Wang and Zun Wang and Yi Wang and Haomin Wang and Yizhou Wang and Lintao Wang and Yiheng Wang and Longjiang Wang and Bin Wang and Jian Tong and Zhongbo Tian and Huanze Tang and Chen Tang and Shixiang Tang and Yu Sun and Qiushi Sun and Xuerui Su and Qisheng Su and Chenlin Su and Demin Song and Jin Shi and Fukai Shang and Yuchen Ren and Pengli Ren and Xiaoye Qu and Yuan Qu and Jiantao Qiu and Yu Qiao and Runyu Peng and Tianshuo Peng and Jiahui Peng and Qizhi Pei and Zhuoshi Pan and Linke Ouyang and Wenchang Ning and Yichuan Ma and Zerun Ma and Ningsheng Ma and Runyuan Ma and Chengqi Lyu and Haijun Lv and Han Lv and Lindong Lu and Kuikun Liu and Jiangning Liu and Yuhong Liu and Kai Liu and Hongwei Liu and Zhoumianze Liu and Mengjie Liu and Ziyu Liu and Wenran Liu and Yang Liu and Liwei Liu and Kaiwen Liu and Junyao Lin and Junming Lin and Tianyang Lin and Dahua Lin and Jianze Liang and Linyang Li and Peiji Li and Zonglin Li and Zehao Li and Pengze Li and Guoyan Li and Lingkai Kong and Linglin Jing and Zhenjiang Jin and Feifei Jiang and Qian Jiang and Junhao Huang and Zixian Huang and Haian Huang and 
Zhouqi Hua and Han Hu and Linfeng Hou and Yinan He and Conghui He and Tianyao He and Xu Guo and Qipeng Guo and Aijia Guo and Yuzhe Gu and Lixin Gu and Jingyang Gong and Qiming Ge and Jiaye Ge and Songyang Gao and Jianfei Gao and Xinyu Fang and Caihua fan and Yue Fan and Yanhui Duan and Zichen Ding and Shengyuan Ding and Xuanlang Dai and Erfei Cui and Ganqu Cui and Pei Chu and Tao Chu and Guangran Cheng and Yu Cheng and Kai Chen and Yongkang Chen and Chiyu Chen and Guanzhou Chen and Qiaosheng Chen and Sitao Chen and Xin Chen and Haojiong Chen and Yicheng Chen and Weihan Cao and Yuhang Cao and Qinglong Cao and Lei Bai},
|
| 425 |
-
year={2026},
|
| 426 |
-
eprint={2603.25040},
|
| 427 |
-
archivePrefix={arXiv},
|
| 428 |
-
primaryClass={cs.LG},
|
| 429 |
-
url={https://arxiv.org/abs/2603.25040},
|
| 430 |
-
}
|
| 431 |
-
```
|
|
|
|
| 12 |
|
| 13 |
<div> </div>
|
| 14 |
|
| 15 |
+
[💻Github Repo](https://github.com/InternLM/Intern-S1) • [🤗Model Collections](https://huggingface.co/collections/internlm/intern-s1-6882e325e8ac1c58ba108aa5) • [📜Technical Report](https://arxiv.org/abs/2508.15763) • [💬Online Chat](https://chat.intern-ai.org.cn/)
|
| 16 |
|
| 17 |
</div>
|
| 18 |
|
|
|
|
| 60 |
|
| 61 |
### Serving
|
| 62 |
|
|
|
|
|
|
|
|
|
|
| 63 |
Intern-S1-Pro can be deployed using any of the following LLM inference frameworks:
|
| 64 |
|
| 65 |
- LMDeploy
|
|
|
|
| 68 |
|
| 69 |
Detailed deployment examples for these frameworks are available in the [Model Deployment Guide](./deployment_guide.md).
|
| 70 |
|
| 71 |
+
> Deployment support for the time-series module is under optimization and will be released soon.
|
| 72 |
+
|
| 73 |
|
| 74 |
## Advanced Usage
|
| 75 |
|
|
|
|
| 246 |
)
|
| 247 |
```
|
| 248 |
|
| 249 |
+
When serving Intern-S1-Pro models, you can dynamically control the thinking mode by adjusting the `enable_thinking` parameter in your requests.
|
| 250 |
|
| 251 |
```python
|
| 252 |
from openai import OpenAI
|
|
|
|
| 285 |
print(json.dumps(response.model_dump(), indent=2, ensure_ascii=False))
|
| 286 |
```
|
| 287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
## Citation
|
| 289 |
|
| 290 |
If you find this work useful, feel free to give us a cite.
|
|
|
|
| 300 |
url={https://arxiv.org/abs/2508.15763},
|
| 301 |
}
|
| 302 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
chat_template.jinja
CHANGED
|
@@ -17,8 +17,6 @@
|
|
| 17 |
{{- 'Video ' + video_count.value|string + ': <|vision_start|><|video_pad|><|vision_end|>'-}}
|
| 18 |
{%- elif 'text' in item %}
|
| 19 |
{{- item.text }}
|
| 20 |
-
{%- elif 'time_series' in item or item.type == 'time_series' %}
|
| 21 |
-
{{- '<|ts|><TS_CONTEXT><|/ts|>'-}}
|
| 22 |
{%- endif %}
|
| 23 |
{%- endfor %}
|
| 24 |
{%- endif %}
|
|
|
|
| 17 |
{{- 'Video ' + video_count.value|string + ': <|vision_start|><|video_pad|><|vision_end|>'-}}
|
| 18 |
{%- elif 'text' in item %}
|
| 19 |
{{- item.text }}
|
|
|
|
|
|
|
| 20 |
{%- endif %}
|
| 21 |
{%- endfor %}
|
| 22 |
{%- endif %}
|
config.json
CHANGED
|
@@ -58,37 +58,6 @@
|
|
| 58 |
},
|
| 59 |
"vision_end_token_id": 151653,
|
| 60 |
"vision_start_token_id": 151652,
|
| 61 |
-
"ts_config": {
|
| 62 |
-
"auto_map": {
|
| 63 |
-
"AutoConfig": "configuration_interns1_pro.InternS1ProTimeSeriesConfig",
|
| 64 |
-
"AutoModel": "modeling_interns1_pro.InternS1ProTimeSeriesModel"
|
| 65 |
-
},
|
| 66 |
-
"activation_dropout": 0.0,
|
| 67 |
-
"activation_function": "gelu",
|
| 68 |
-
"architectures": [
|
| 69 |
-
"InternS1TimeSeriesModel"
|
| 70 |
-
],
|
| 71 |
-
"attention_dropout": 0.0,
|
| 72 |
-
"d_model": 768,
|
| 73 |
-
"dropout": 0.0,
|
| 74 |
-
"dtype": "bfloat16",
|
| 75 |
-
"encoder_attention_heads": 8,
|
| 76 |
-
"encoder_ffn_dim": 3072,
|
| 77 |
-
"encoder_layerdrop": 0.0,
|
| 78 |
-
"encoder_layers": 17,
|
| 79 |
-
"model_type": "interns1_pro_time_series",
|
| 80 |
-
"max_source_positions": 1500,
|
| 81 |
-
"num_mel_bins": 80,
|
| 82 |
-
"out_hidden_size": 4096,
|
| 83 |
-
"scale_embedding": false,
|
| 84 |
-
"ts_adapt_in_dim": 256,
|
| 85 |
-
"ts_adapt_out_dim": 1024,
|
| 86 |
-
"use_cache": true,
|
| 87 |
-
"attn_implementation": "eager"
|
| 88 |
-
},
|
| 89 |
-
"ts_end_id": 151684,
|
| 90 |
-
"ts_start_id": 151683,
|
| 91 |
-
"ts_token_id": 151685,
|
| 92 |
"auto_map": {
|
| 93 |
"AutoConfig": "configuration_interns1_pro.InternS1ProConfig",
|
| 94 |
"AutoModel": "modeling_interns1_pro.InternS1ProModel",
|
|
@@ -172,7 +141,6 @@
|
|
| 172 |
"model.visual.blocks.17.mlp.linear_fc1",
|
| 173 |
"model.visual.blocks.4.norm2",
|
| 174 |
"model.visual.blocks.17.attn.qkv",
|
| 175 |
-
"model.time_series",
|
| 176 |
"model.language_model.layers.83.self_attn.k_norm",
|
| 177 |
"model.language_model.layers.47.post_attention_layernorm",
|
| 178 |
"model.language_model.layers.59.input_layernorm",
|
|
|
|
| 58 |
},
|
| 59 |
"vision_end_token_id": 151653,
|
| 60 |
"vision_start_token_id": 151652,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
"auto_map": {
|
| 62 |
"AutoConfig": "configuration_interns1_pro.InternS1ProConfig",
|
| 63 |
"AutoModel": "modeling_interns1_pro.InternS1ProModel",
|
|
|
|
| 141 |
"model.visual.blocks.17.mlp.linear_fc1",
|
| 142 |
"model.visual.blocks.4.norm2",
|
| 143 |
"model.visual.blocks.17.attn.qkv",
|
|
|
|
| 144 |
"model.language_model.layers.83.self_attn.k_norm",
|
| 145 |
"model.language_model.layers.47.post_attention_layernorm",
|
| 146 |
"model.language_model.layers.59.input_layernorm",
|
configuration_interns1_pro.py
CHANGED
|
@@ -15,7 +15,6 @@
|
|
| 15 |
|
| 16 |
from transformers.configuration_utils import PretrainedConfig
|
| 17 |
from transformers.modeling_rope_utils import rope_config_validation
|
| 18 |
-
from transformers import WhisperConfig
|
| 19 |
|
| 20 |
|
| 21 |
class InternS1ProTextConfig(PretrainedConfig):
|
|
@@ -139,61 +138,20 @@ class InternS1ProVisionConfig(PretrainedConfig):
|
|
| 139 |
self.num_position_embeddings = num_position_embeddings
|
| 140 |
self.initializer_range = initializer_range
|
| 141 |
|
| 142 |
-
class InternS1ProTimeSeriesConfig(WhisperConfig):
|
| 143 |
-
|
| 144 |
-
model_type = "interns1_pro_time_series"
|
| 145 |
-
base_config_key = "ts_config"
|
| 146 |
-
|
| 147 |
-
def __init__(
|
| 148 |
-
self,
|
| 149 |
-
ts_adapt_in_dim: int=256,
|
| 150 |
-
ts_adapt_out_dim: int=1024,
|
| 151 |
-
ts_hidden_dim: int=1024,
|
| 152 |
-
ts_cnn_channels: list[int]=[1, 32, 64, 128, 128],
|
| 153 |
-
ts_cnn_kernel_sizes: list[int]=[3, 5, 5, 5],
|
| 154 |
-
ts_cnn_strides: list[int]=[2, 4, 4, 5],
|
| 155 |
-
ts_cnn_paddings: list[int]=[1, 2, 2, 2],
|
| 156 |
-
ts_concat_subsampling_in_channels: int=128,
|
| 157 |
-
ts_concat_subsampling_concat_size: int=2,
|
| 158 |
-
use_flash_attn: bool=False,
|
| 159 |
-
**kwargs
|
| 160 |
-
):
|
| 161 |
-
super().__init__(**kwargs)
|
| 162 |
-
|
| 163 |
-
self.ts_cnn_channels = ts_cnn_channels
|
| 164 |
-
self.ts_cnn_kernel_sizes = ts_cnn_kernel_sizes
|
| 165 |
-
self.ts_cnn_strides = ts_cnn_strides
|
| 166 |
-
self.ts_cnn_paddings = ts_cnn_paddings
|
| 167 |
-
self.ts_concat_subsampling_in_channels = ts_concat_subsampling_in_channels
|
| 168 |
-
self.ts_concat_subsampling_concat_size = ts_concat_subsampling_concat_size
|
| 169 |
-
|
| 170 |
-
self.ts_adapt_in_dim = ts_adapt_in_dim
|
| 171 |
-
self.ts_adapt_out_dim = ts_adapt_out_dim
|
| 172 |
-
|
| 173 |
-
self.ts_hidden_dim = ts_hidden_dim
|
| 174 |
-
self.use_flash_attn = use_flash_attn
|
| 175 |
-
|
| 176 |
-
assert self.ts_adapt_out_dim == self.ts_hidden_dim, "ts_adapt_out_dim should be equal to ts_hidden_dim"
|
| 177 |
-
assert self.ts_concat_subsampling_in_channels == self.ts_cnn_channels[-1], "ts_concat_subsampling_in_channels should be equal to the out_channel of the last cnn layer"
|
| 178 |
-
|
| 179 |
|
| 180 |
class InternS1ProConfig(PretrainedConfig):
|
| 181 |
model_type = "interns1_pro"
|
| 182 |
-
sub_configs = {"vision_config": InternS1ProVisionConfig, "text_config": InternS1ProTextConfig
|
| 183 |
keys_to_ignore_at_inference = ["past_key_values"]
|
| 184 |
|
| 185 |
def __init__(
|
| 186 |
self,
|
| 187 |
text_config=None,
|
| 188 |
vision_config=None,
|
| 189 |
-
ts_config=None,
|
| 190 |
image_token_id=151655,
|
| 191 |
video_token_id=151656,
|
| 192 |
vision_start_token_id=151652,
|
| 193 |
vision_end_token_id=151653,
|
| 194 |
-
ts_token_id=151685,
|
| 195 |
-
ts_start_id=151683,
|
| 196 |
-
ts_end_id=151684,
|
| 197 |
tie_word_embeddings=False,
|
| 198 |
**kwargs,
|
| 199 |
):
|
|
@@ -207,19 +165,11 @@ class InternS1ProConfig(PretrainedConfig):
|
|
| 207 |
elif text_config is None:
|
| 208 |
self.text_config = self.sub_configs["text_config"]()
|
| 209 |
|
| 210 |
-
if isinstance(ts_config, dict):
|
| 211 |
-
self.ts_config = self.sub_configs["ts_config"](**ts_config)
|
| 212 |
-
elif ts_config is None:
|
| 213 |
-
self.ts_config = self.sub_configs["ts_config"]()
|
| 214 |
-
|
| 215 |
self.image_token_id = image_token_id
|
| 216 |
self.video_token_id = video_token_id
|
| 217 |
self.vision_start_token_id = vision_start_token_id
|
| 218 |
self.vision_end_token_id = vision_end_token_id
|
| 219 |
-
self.ts_token_id = ts_token_id
|
| 220 |
-
self.ts_start_id = ts_start_id
|
| 221 |
-
self.ts_end_id = ts_end_id
|
| 222 |
super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
|
| 223 |
|
| 224 |
|
| 225 |
-
__all__ = ["InternS1ProConfig", "InternS1ProTextConfig", "InternS1ProVisionConfig"
|
|
|
|
| 15 |
|
| 16 |
from transformers.configuration_utils import PretrainedConfig
|
| 17 |
from transformers.modeling_rope_utils import rope_config_validation
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
class InternS1ProTextConfig(PretrainedConfig):
|
|
|
|
| 138 |
self.num_position_embeddings = num_position_embeddings
|
| 139 |
self.initializer_range = initializer_range
|
| 140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
class InternS1ProConfig(PretrainedConfig):
|
| 143 |
model_type = "interns1_pro"
|
| 144 |
+
sub_configs = {"vision_config": InternS1ProVisionConfig, "text_config": InternS1ProTextConfig}
|
| 145 |
keys_to_ignore_at_inference = ["past_key_values"]
|
| 146 |
|
| 147 |
def __init__(
|
| 148 |
self,
|
| 149 |
text_config=None,
|
| 150 |
vision_config=None,
|
|
|
|
| 151 |
image_token_id=151655,
|
| 152 |
video_token_id=151656,
|
| 153 |
vision_start_token_id=151652,
|
| 154 |
vision_end_token_id=151653,
|
|
|
|
|
|
|
|
|
|
| 155 |
tie_word_embeddings=False,
|
| 156 |
**kwargs,
|
| 157 |
):
|
|
|
|
| 165 |
elif text_config is None:
|
| 166 |
self.text_config = self.sub_configs["text_config"]()
|
| 167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
self.image_token_id = image_token_id
|
| 169 |
self.video_token_id = video_token_id
|
| 170 |
self.vision_start_token_id = vision_start_token_id
|
| 171 |
self.vision_end_token_id = vision_end_token_id
|
|
|
|
|
|
|
|
|
|
| 172 |
super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
|
| 173 |
|
| 174 |
|
| 175 |
+
__all__ = ["InternS1ProConfig", "InternS1ProTextConfig", "InternS1ProVisionConfig"]
|
deployment_guide.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# Intern-S1-Pro Deployment Guide
|
| 2 |
|
| 3 |
-
The Intern-S1-Pro release is a 1T parameter model stored in FP8 format. Deployment requires at least
|
| 4 |
|
| 5 |
- Tensor Parallelism (TP)
|
| 6 |
- Data Parallelism (DP) + Expert Parallelism (EP)
|
|
@@ -9,7 +9,7 @@ The Intern-S1-Pro release is a 1T parameter model stored in FP8 format. Deployme
|
|
| 9 |
|
| 10 |
## LMDeploy
|
| 11 |
|
| 12 |
-
Required version `lmdeploy>=0.12.
|
| 13 |
|
| 14 |
- Tensor Parallelism
|
| 15 |
|
|
@@ -59,7 +59,25 @@ lmdeploy serve api_server \
|
|
| 59 |
|
| 60 |
## vLLM
|
| 61 |
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
```bash
|
| 65 |
# node 0
|
|
@@ -108,8 +126,6 @@ vllm serve internlm/Intern-S1-Pro \
|
|
| 108 |
|
| 109 |
## SGLang
|
| 110 |
|
| 111 |
-
You can use the docker image `lmsysorg/sglang:dev` to deploy. Refer to [using-docker](https://docs.sglang.io/get_started/install.html#method-3-using-docker) for more.
|
| 112 |
-
|
| 113 |
- Tensor Parallelism + Expert Parallelism
|
| 114 |
|
| 115 |
```bash
|
|
|
|
| 1 |
# Intern-S1-Pro Deployment Guide
|
| 2 |
|
| 3 |
+
The Intern-S1-Pro release is a 1T parameter model stored in FP8 format. Deployment requires at least two 8-GPU H200 nodes, with either of the following configurations:
|
| 4 |
|
| 5 |
- Tensor Parallelism (TP)
|
| 6 |
- Data Parallelism (DP) + Expert Parallelism (EP)
|
|
|
|
| 9 |
|
| 10 |
## LMDeploy
|
| 11 |
|
| 12 |
+
Required version `lmdeploy>=0.12.0`
|
| 13 |
|
| 14 |
- Tensor Parallelism
|
| 15 |
|
|
|
|
| 59 |
|
| 60 |
## vLLM
|
| 61 |
|
| 62 |
+
- Tensor Parallelism + Expert Parallelism
|
| 63 |
+
|
| 64 |
+
```bash
|
| 65 |
+
# start ray on node 0 and node 1
|
| 66 |
+
|
| 67 |
+
# node 0
|
| 68 |
+
export VLLM_ENGINE_READY_TIMEOUT_S=10000
|
| 69 |
+
vllm serve internlm/Intern-S1-Pro \
|
| 70 |
+
--tensor-parallel-size 16 \
|
| 71 |
+
--enable-expert-parallel \
|
| 72 |
+
--distributed-executor-backend ray \
|
| 73 |
+
--max-model-len 65536 \
|
| 74 |
+
--trust-remote-code \
|
| 75 |
+
--reasoning-parser deepseek_r1 \
|
| 76 |
+
--enable-auto-tool-choice \
|
| 77 |
+
--tool-call-parser hermes
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
- Data Parallelism + Expert Parallelism
|
| 81 |
|
| 82 |
```bash
|
| 83 |
# node 0
|
|
|
|
| 126 |
|
| 127 |
## SGLang
|
| 128 |
|
|
|
|
|
|
|
| 129 |
- Tensor Parallelism + Expert Parallelism
|
| 130 |
|
| 131 |
```bash
|
model-time_series-00001-of-00002.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:6fab87c45c01a8695f97b5801bee2771ac6e874561ac773983397d958f1e7a00
|
| 3 |
-
size 291982664
|
|
|
|
|
|
|
|
|
|
|
|
model-time_series-00002-of-00002.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:4150fadfb90bd9561c422b37ecc83fd5a30966f1e555bc9305b9fd5d2c914b0d
|
| 3 |
-
size 10240128
|
|
|
|
|
|
|
|
|
|
|
|
model.safetensors.index.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7de640c8e6f374c36de64b925b2c107896731ef642283e490e69125ec5c4eac1
|
| 3 |
+
size 32204741
|
modeling_interns1_pro.py
CHANGED
|
@@ -34,10 +34,8 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
|
| 34 |
from transformers.processing_utils import Unpack
|
| 35 |
from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling
|
| 36 |
from transformers.utils.generic import OutputRecorder, check_model_inputs
|
| 37 |
-
from .configuration_interns1_pro import InternS1ProConfig, InternS1ProTextConfig, InternS1ProVisionConfig
|
| 38 |
-
|
| 39 |
-
from transformers import WhisperPreTrainedModel
|
| 40 |
-
import math
|
| 41 |
|
| 42 |
@use_kernel_forward_from_hub("RMSNorm")
|
| 43 |
class Qwen3VLMoeTextRMSNorm(nn.Module):
|
|
@@ -441,7 +439,7 @@ class InternS1ProPreTrainedModel(PreTrainedModel):
|
|
| 441 |
config: InternS1ProConfig
|
| 442 |
base_model_prefix = "model"
|
| 443 |
supports_gradient_checkpointing = True
|
| 444 |
-
_no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"
|
| 445 |
_skip_keys_device_placement = ["past_key_values"]
|
| 446 |
_supports_flash_attn = True
|
| 447 |
_supports_sdpa = True
|
|
@@ -988,7 +986,7 @@ class InternS1ProTextModel(InternS1ProPreTrainedModel):
|
|
| 988 |
# Initialize weights and apply final processing
|
| 989 |
self.post_init()
|
| 990 |
|
| 991 |
-
@check_model_inputs
|
| 992 |
@auto_docstring
|
| 993 |
def forward(
|
| 994 |
self,
|
|
@@ -1059,442 +1057,6 @@ class InternS1ProTextModel(InternS1ProPreTrainedModel):
|
|
| 1059 |
)
|
| 1060 |
|
| 1061 |
|
| 1062 |
-
class InternS1ProTimeSeriesEncoder(WhisperPreTrainedModel):
|
| 1063 |
-
def __init__(self, config: InternS1ProTimeSeriesConfig):
|
| 1064 |
-
super().__init__(config)
|
| 1065 |
-
self.config = config
|
| 1066 |
-
self.dropout = config.dropout
|
| 1067 |
-
self.layerdrop = config.encoder_layerdrop
|
| 1068 |
-
|
| 1069 |
-
self.embed_dim = config.d_model
|
| 1070 |
-
self.num_mel_bins = config.num_mel_bins
|
| 1071 |
-
self.padding_idx = config.pad_token_id
|
| 1072 |
-
self.max_source_positions = config.max_source_positions
|
| 1073 |
-
self.embed_scale = math.sqrt(self.embed_dim) if config.scale_embedding else 1.0
|
| 1074 |
-
|
| 1075 |
-
self.conv1 = nn.Conv1d(self.num_mel_bins, self.embed_dim, kernel_size=3, padding=1)
|
| 1076 |
-
self.conv2 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=3, stride=2, padding=1)
|
| 1077 |
-
self.embed_positions = nn.Embedding(self.max_source_positions, self.embed_dim)
|
| 1078 |
-
|
| 1079 |
-
self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
|
| 1080 |
-
self.layer_norm = nn.LayerNorm(config.d_model)
|
| 1081 |
-
|
| 1082 |
-
self.gradient_checkpointing = False
|
| 1083 |
-
self.post_init()
|
| 1084 |
-
|
| 1085 |
-
self.mask_type = None
|
| 1086 |
-
self.chunk_length = None
|
| 1087 |
-
|
| 1088 |
-
self.adapt_in = nn.Linear(config.ts_adapt_in_dim, 80)
|
| 1089 |
-
self.adapt_out = nn.Linear(self.embed_dim, config.ts_adapt_out_dim)
|
| 1090 |
-
|
| 1091 |
-
def _freeze_parameters(self):
|
| 1092 |
-
for param in self.parameters():
|
| 1093 |
-
param.requires_grad = False
|
| 1094 |
-
self._requires_grad = False
|
| 1095 |
-
|
| 1096 |
-
def get_input_embeddings(self) -> nn.Module:
|
| 1097 |
-
return self.conv1
|
| 1098 |
-
|
| 1099 |
-
def set_input_embeddings(self, value: nn.Module):
|
| 1100 |
-
self.conv1 = value
|
| 1101 |
-
|
| 1102 |
-
def define_masktype(self, masktype, chunk_length=None):
|
| 1103 |
-
self.mask_type = masktype
|
| 1104 |
-
self.chunk_length = chunk_length
|
| 1105 |
-
|
| 1106 |
-
def _make_causal_mask(self,
|
| 1107 |
-
input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
|
| 1108 |
-
):
|
| 1109 |
-
"""
|
| 1110 |
-
Make causal mask used for bi-directional self-attention.
|
| 1111 |
-
"""
|
| 1112 |
-
bsz, tgt_len = input_ids_shape
|
| 1113 |
-
mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
|
| 1114 |
-
mask_cond = torch.arange(mask.size(-1), device=device)
|
| 1115 |
-
mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
|
| 1116 |
-
mask = mask.to(dtype)
|
| 1117 |
-
|
| 1118 |
-
if past_key_values_length > 0:
|
| 1119 |
-
mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
|
| 1120 |
-
return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
|
| 1121 |
-
|
| 1122 |
-
# Copied from transformers.models.bart.modeling_bart._expand_mask
|
| 1123 |
-
def _expand_mask(self, mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
|
| 1124 |
-
"""
|
| 1125 |
-
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
|
| 1126 |
-
"""
|
| 1127 |
-
# print(mask.size())
|
| 1128 |
-
bsz, src_len = mask.size()
|
| 1129 |
-
tgt_len = tgt_len if tgt_len is not None else src_len
|
| 1130 |
-
|
| 1131 |
-
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
|
| 1132 |
-
|
| 1133 |
-
inverted_mask = 1.0 - expanded_mask
|
| 1134 |
-
|
| 1135 |
-
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
|
| 1136 |
-
|
| 1137 |
-
|
| 1138 |
-
def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
|
| 1139 |
-
# create causal mask
|
| 1140 |
-
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
| 1141 |
-
combined_attention_mask = None
|
| 1142 |
-
|
| 1143 |
-
if input_shape[-1] > 1:
|
| 1144 |
-
combined_attention_mask = self._make_causal_mask(
|
| 1145 |
-
input_shape,
|
| 1146 |
-
inputs_embeds.dtype,
|
| 1147 |
-
device=inputs_embeds.device,
|
| 1148 |
-
past_key_values_length=past_key_values_length,
|
| 1149 |
-
)
|
| 1150 |
-
|
| 1151 |
-
if attention_mask is not None:
|
| 1152 |
-
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
| 1153 |
-
expanded_attn_mask = self._expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
|
| 1154 |
-
combined_attention_mask = (
|
| 1155 |
-
expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
|
| 1156 |
-
)
|
| 1157 |
-
return combined_attention_mask
|
| 1158 |
-
|
| 1159 |
-
def prepare_chunk_attention_mask(self, attention_mask, input_shape, inputs_embeds):
|
| 1160 |
-
|
| 1161 |
-
block_size = round(self.chunk_length / 4 * 2)
|
| 1162 |
-
matrix_size = input_shape[1]
|
| 1163 |
-
|
| 1164 |
-
matrix = torch.ones(matrix_size, matrix_size)
|
| 1165 |
-
|
| 1166 |
-
num_full_blocks = round(matrix_size // block_size)
|
| 1167 |
-
remainder = matrix_size % block_size
|
| 1168 |
-
for i in range(num_full_blocks):
|
| 1169 |
-
row_start = i * block_size
|
| 1170 |
-
col_start = i * block_size
|
| 1171 |
-
matrix[row_start:row_start + block_size, col_start:col_start + block_size] = torch.zeros(block_size, block_size)
|
| 1172 |
-
|
| 1173 |
-
if remainder > 0:
|
| 1174 |
-
last_row_start = num_full_blocks * block_size
|
| 1175 |
-
last_col_start = num_full_blocks * block_size
|
| 1176 |
-
matrix[last_row_start:last_row_start + remainder, last_col_start:last_col_start + remainder] = torch.zeros(remainder, remainder)
|
| 1177 |
-
|
| 1178 |
-
matrix = matrix * -65504
|
| 1179 |
-
matrix = matrix.unsqueeze(0).unsqueeze(0).repeat(input_shape[0], 1, 1, 1)
|
| 1180 |
-
attention_mask = matrix.to(inputs_embeds.device)
|
| 1181 |
-
return attention_mask
|
| 1182 |
-
|
| 1183 |
-
def forward(
    self,
    input_features,
    attention_mask=None,
    head_mask=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    """Run the time-series encoder: input adapter, conv stem, layer stack, output adapter.

    Args:
        input_features: Tensor laid out as ``(N, T, C)`` (per the permutes below).
        attention_mask: Ignored; it is reset to ``None`` and a mask is rebuilt
            internally according to ``self.mask_type``.
        head_mask: Optional per-layer head mask; its first dimension must equal
            ``len(self.layers)``.
        output_attentions: Whether to collect per-layer attentions
            (defaults to ``self.config.output_attentions``).
        output_hidden_states: Whether to collect per-layer hidden states
            (defaults to ``self.config.output_hidden_states``).
        return_dict: Whether to return a ``ModelOutput`` instead of a tuple
            (defaults to ``self.config.use_return_dict``).

    Returns:
        A ``ModelOutput`` with ``last_hidden_state``, ``hidden_states`` and
        ``attentions``, or a tuple of the non-``None`` values among them.
    """
    # (N, T, C) -> (T, N, C) -> adapter -> (N, C, T)
    input_features = self.adapt_in(input_features.permute(1, 0, 2)).permute(1, 2, 0)

    cfg = self.config
    if output_attentions is None:
        output_attentions = cfg.output_attentions
    if output_hidden_states is None:
        output_hidden_states = cfg.output_hidden_states
    if return_dict is None:
        return_dict = cfg.use_return_dict

    # Conv stem, then (N, C, T) -> (N, T, C)
    inputs_embeds = nn.functional.gelu(self.conv1(input_features))
    inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
    inputs_embeds = inputs_embeds.permute(0, 2, 1)

    # Learned positions; zero-pad the table when the sequence is longer than it.
    seq_len = inputs_embeds.shape[1]
    embed_pos = self.embed_positions.weight
    shortfall = seq_len - embed_pos.shape[0]
    if shortfall > 0:
        embed_pos = nn.functional.pad(embed_pos, pad=[0, 0, 0, shortfall], mode='constant', value=0)
        hidden_states = inputs_embeds[:, :embed_pos.shape[0], :] + embed_pos
    else:
        hidden_states = inputs_embeds + embed_pos[:seq_len, :]
    hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

    encoder_states = () if output_hidden_states else None
    all_attentions = () if output_attentions else None

    # The caller-supplied mask is discarded on purpose; only the internal mask is used.
    input_shape = inputs_embeds.size()[:-1]
    attention_mask = None
    if self.mask_type == 'chunk':
        attention_mask = self.prepare_chunk_attention_mask(attention_mask, input_shape, inputs_embeds)
    else:
        past_key_values_length = 0
        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

    if head_mask is not None:
        assert head_mask.size()[0] == (
            len(self.layers)
        ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."

    def make_checkpoint_fn(layer):
        # Factory so each closure captures its own layer, not the loop variable.
        def run_layer(*layer_inputs):
            return layer(*layer_inputs, output_attentions)

        return run_layer

    for idx, encoder_layer in enumerate(self.layers):
        if output_hidden_states:
            encoder_states = encoder_states + (self.layer_norm(hidden_states),)

        # LayerDrop (see https://arxiv.org/abs/1909.11556): randomly skip layers in training.
        skip_layer = False
        if self.training and torch.rand([]) < self.layerdrop:
            skip_layer = True

        if skip_layer:
            layer_outputs = (None, None)
        else:
            layer_head_mask = head_mask[idx] if head_mask is not None else None
            if self.gradient_checkpointing and self.training:
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    make_checkpoint_fn(encoder_layer),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    layer_head_mask=layer_head_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

        if output_attentions:
            all_attentions = all_attentions + (layer_outputs[1],)

    # (N, T, C) -> (T, N, C) for the final norm and output adapter, then back to (N, T, C)
    hidden_states = hidden_states.permute(1, 0, 2)
    hidden_states = self.adapt_out(self.layer_norm(hidden_states))
    hidden_states = hidden_states.permute(1, 0, 2)
    if output_hidden_states:
        encoder_states = encoder_states + (hidden_states,)

    if return_dict:
        return ModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
    return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
|
| 1294 |
-
|
| 1295 |
-
|
| 1296 |
-
class InternS1ProTimeSeriesConcatSubsampling(nn.Module):
    """Shorten a sequence by concatenating groups of consecutive frames along the channel axis.

    Every ``concat_size`` neighbouring frames are merged into one frame whose
    channel dimension is ``concat_size`` times larger. Trailing frames that do
    not fill a complete group are dropped.
    """

    def __init__(self, in_channels: int, concat_size: int):
        """
        Args:
            in_channels: Channel count of the incoming frames.
            concat_size: Number of consecutive frames merged into one output frame.

        Raises:
            ValueError: If ``concat_size`` is smaller than 1.
        """
        super().__init__()
        if concat_size < 1:
            raise ValueError(f"concat_size must be >= 1, got {concat_size}")
        self.in_channels = in_channels
        self.concat_size = concat_size
        self.out_channels = in_channels * concat_size

    def forward(self, ts_signals: torch.Tensor, ts_lens: torch.Tensor):
        """
        Args:
            ts_signals: Tensor indexed as ``(batch, frames, channels)``.
            ts_lens: Per-sample frame counts.

        Returns:
            Tuple ``(signals, lens)`` where ``signals`` has
            ``frames // concat_size`` frames of ``channels * concat_size``
            features and ``lens`` is ``ts_lens // concat_size``.
        """
        group = self.concat_size
        # Drop the trailing frames that cannot form a full group.
        usable = ts_signals.shape[1] - ts_signals.shape[1] % group
        # Phase k picks frames k, k + group, ...; for group == 2 these are the
        # even and odd frames, concatenated in that order.
        phases = [ts_signals[:, offset:usable:group, :] for offset in range(group)]
        ts_signals = torch.cat(phases, dim=2)
        ts_lens = ts_lens // group
        return ts_signals, ts_lens
|
| 1310 |
-
|
| 1311 |
-
|
| 1312 |
-
class InternS1ProTimeSeriesFixPositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to sequence-first inputs.

    The table is stored in the persistent buffer ``pe`` with shape
    ``(max_len, 1, d_model)``: sine values in the even feature columns and
    cosine values in the odd ones.
    """

    def __init__(self, d_model, max_len=20000):
        """
        Args:
            d_model: Feature dimension of the encoded inputs (even or odd).
            max_len: Number of positions precomputed in the table.
        """
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model, dtype=torch.float)
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd column than even columns, so trim
        # the angles; for an even d_model the slice is the full tensor.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(1), persistent=True)  # (max_len, 1, d_model)

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(0)`` positions.

        Args:
            x: Tensor laid out as ``(seq_len, batch_size, d_model)``.

        Returns:
            A new tensor; ``x`` itself is not modified.
        """
        # The out-of-place add already allocates a fresh tensor, so no clone is needed.
        return x + self.pe[:x.size(0), :]
|
| 1327 |
-
|
| 1328 |
-
|
| 1329 |
-
class InternS1ProTimeSeriesMultiChannelAdaptiveSubsampling(nn.Module):
    """Patch-based front end that turns raw multi-channel time series into frame features.

    Each series is cut into overlapping patches whose stride and size are
    derived from its sampling rate. Every patch is encoded by a 1-D
    convolution plus a transformer encoder, averaged over time and channels,
    and the resulting patch sequence is shortened by concat-subsampling.
    """

    def __init__(self, hidden_dim=128, nhead=8, num_encoder_layers=1):
        """
        Args:
            hidden_dim: Feature width of the conv / transformer patch encoder.
            nhead: Number of attention heads in the transformer encoder layer.
            num_encoder_layers: Number of stacked transformer encoder layers.
        """
        super().__init__()
        self.conv = nn.Conv1d(in_channels=1, out_channels=hidden_dim, kernel_size=5, stride=1, padding=2)
        encoder_layers = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.pos_encoder = InternS1ProTimeSeriesFixPositionalEncoding(d_model=hidden_dim)
        # Use hidden_dim instead of a literal 128 so a non-default width stays consistent.
        self.subsampling = InternS1ProTimeSeriesConcatSubsampling(hidden_dim, 2)

    def forward(self, inputs, input_lens, sr):
        """Return ``(features, feature_lens)`` as produced by :meth:`forward_patch`."""
        return self.forward_patch(inputs, input_lens, sr)

    def forward_patch(self, inputs, input_lens, sr):
        """Patch, encode and subsample a batch of time series.

        Args:
            inputs: Indexable batch of series; each item is ``[seq_len, num_channel]``
                or 1-D (a channel axis is appended in that case).
            input_lens: Per-series valid lengths. The arithmetic below calls
                ``torch.ceil`` on them, so entries are presumably tensors; confirm
                against the caller.
            sr: Tensor of per-series sampling rates.

        Returns:
            Tuple ``(outputs, output_lens)``: zero-padded patch features after
            concat-subsampling, and the matching per-series lengths.
        """
        sr = sr.float()
        # Stride grows with the sampling rate: floor(160 / (1 + exp(-sr / 100)) ** 6).
        strides = torch.floor(160/((1+torch.exp(-sr/100))**6))
        patch_sizes = strides * 2
        patched_outputs = []
        output_lens = []

        for i, seq in enumerate(inputs):  # seq: [seq_len, num_channel]
            ps = patch_sizes[i].item()
            st = strides[i].item()
            le = input_lens[i]

            # Number of patches needed to cover the series, and the tail padding that requires.
            output_len = torch.ceil((le - ps) / st) + 1
            pad_len = ((output_len - 1) * st + ps - le).long().item()
            if seq.ndim == 1:
                seq = seq.unsqueeze(-1)
            seq = nn.functional.pad(seq, (0, 0, 0, pad_len), "constant", 0)
            assert output_len > 0,(seq.shape, ps,st,le,output_len)
            output_lens.append(output_len)

            # Row r of `indices` holds the time steps of patch r: r * st + [0, ps).
            indices = (torch.arange(0, output_len * st, st).unsqueeze(1) + torch.arange(ps)).long()
            patched = seq[indices]

            output = self.forward_encoder(patched)  # [num_patch, D]
            patched_outputs.append(output)

        outputs = nn.utils.rnn.pad_sequence(patched_outputs, batch_first=True)
        output_lens = torch.tensor(output_lens).squeeze().to(outputs.device).long()
        if output_lens.ndim == 0:
            # squeeze() collapses a single-item batch to a scalar; restore the batch axis.
            output_lens = output_lens.unsqueeze(0)

        outputs, output_lens = self.subsampling(outputs.clone(), output_lens.clone())
        return outputs, output_lens

    def forward_encoder(self, x):
        """Encode patches of one series into one feature vector per patch.

        Args:
            x: Tensor of shape ``(num_patch, patch_len, C)``.

        Returns:
            Tensor with ``num_patch`` rows: the transformer output averaged over
            patch time steps and then over channels.
        """
        num_patch, patch_len, C = x.shape
        # Treat every channel as an independent single-channel sample for the conv.
        # NOTE(review): this is a plain reshape of a (num_patch, patch_len, C)
        # tensor with no permute, so for C > 1 the rows are not pure per-channel
        # signals. Left untouched because released weights presumably depend on
        # this layout -- confirm before changing.
        x = x.reshape(num_patch*C, 1, patch_len)
        x = nn.functional.relu((self.conv(x)))  # [B*C, D1, L]
        x = x.permute(2,0,1)  # [L, B*C, D1]

        x = self.pos_encoder(x)  # [L, B*C, D1]
        # Cast to the dtype the transformer weights are actually loaded in,
        # instead of assuming bfloat16.
        encoder_dtype = next(self.transformer_encoder.parameters()).dtype
        x = self.transformer_encoder(x.to(encoder_dtype))
        x = x.mean(0)

        x = x.reshape(num_patch,C,-1)

        return x.mean(1)
|
| 1393 |
-
|
| 1394 |
-
class InternS1ProTimeSeriesProjector(nn.Module):
    """LayerNorm followed by a two-layer MLP that maps time-series features to ``config.out_hidden_size``."""

    def __init__(self, config: InternS1ProTimeSeriesConfig):
        """Build the projector from ``config.ts_hidden_dim``, ``config.out_hidden_size`` and ``config.activation_function``."""
        super().__init__()
        in_dim = config.ts_hidden_dim
        out_dim = config.out_hidden_size
        self.layer_norm = nn.LayerNorm(in_dim)
        self.linear_1 = nn.Linear(in_dim, out_dim)
        self.act = ACT2FN[config.activation_function]
        self.linear_2 = nn.Linear(out_dim, out_dim)

    def forward(self, ts_features):
        """Apply norm -> linear -> activation -> linear to ``ts_features`` and return the result."""
        projected = ts_features
        for stage in (self.layer_norm, self.linear_1, self.act, self.linear_2):
            projected = stage(projected)
        return projected
|
| 1408 |
-
|
| 1409 |
-
class InternS1ProTimeSeriesModel(InternS1ProPreTrainedModel):
    """Time-series tower: adaptive patch embedding -> encoder -> projector.

    ``forward`` returns the projected features together with a boolean padding
    mask derived from the per-sample lengths.
    """

    main_input_name = 'time_series_signals'
    _supports_flash_attn_2 = False
    config_class = InternS1ProTimeSeriesConfig
    # NOTE(review): the encoder layer class is not visible here; confirm this
    # name matches the layer class actually used by InternS1ProTimeSeriesEncoder.
    _no_split_modules = ['WhisperEncoderLayer']

    def __init__(self, config: InternS1ProTimeSeriesConfig):
        super().__init__(config)
        self.config = config
        self.encoder_embed = InternS1ProTimeSeriesMultiChannelAdaptiveSubsampling()
        self.encoder = InternS1ProTimeSeriesEncoder(config)
        self.projector = InternS1ProTimeSeriesProjector(config)

    def get_input_embeddings(self):
        """Return the patch-embedding front end."""
        return self.encoder_embed

    def make_pad_mask(self, lengths: torch.Tensor) -> torch.Tensor:
        """
        Args:
          lengths:
            A 1-D tensor containing sequence lengths. The mask width is
            ``lengths.max()``.
        Returns:
          Return a 2-D bool tensor, where masked positions
          are filled with `True` and non-masked positions are
          filled with `False`.

        >>> lengths = torch.tensor([1, 3, 2, 5])
        >>> make_pad_mask(lengths)
        tensor([[False,  True,  True,  True,  True],
                [False, False, False,  True,  True],
                [False, False,  True,  True,  True],
                [False, False, False, False, False]])
        """
        assert lengths.ndim == 1, lengths.ndim
        max_len = lengths.max()
        n = lengths.size(0)
        seq_range = torch.arange(0, max_len, device=lengths.device)
        expanded_range = seq_range.unsqueeze(0).expand(n, max_len)
        # A position is padding when its index is at or beyond the sample length.
        return expanded_range >= lengths.unsqueeze(-1)

    def forward(
        self,
        time_series_signals: Optional[torch.FloatTensor] = None,
        ts_lens: Optional[torch.Tensor] = None,
        sr: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        time_series_embeds: Optional[torch.FloatTensor] = None,
    ):
        """Embed, encode and project a batch of time series.

        Args:
            time_series_signals: Raw signals: a list of 2-D tensors or a 3-D tensor.
            ts_lens: Per-sample lengths. Replaced by the front end's output lengths
                when raw signals are embedded; required as-is when
                ``time_series_embeds`` is used directly.
            sr: Per-sample sampling rates, forwarded to the front end.
            output_hidden_states: Forwarded to the encoder
                (defaults to ``self.config.output_hidden_states``).
            return_dict: Forwarded to the encoder
                (defaults to ``self.config.use_return_dict``).
            time_series_embeds: Precomputed 3-D embeddings whose last dimension is
                ``self.config.ts_adapt_in_dim``; when valid they bypass the front end.

        Returns:
            Tuple ``(ts_embeds, ts_pad_mask)``: projected encoder output and the
            boolean padding mask from :meth:`make_pad_mask`.

        Raises:
            ValueError: If neither input is given, if the raw signals have an
                unsupported layout, or if no lengths are available.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if time_series_signals is None and time_series_embeds is None:
            raise ValueError('You have to specify time_series_signals or time_series_embeds')

        embeds_ready = (
            time_series_embeds is not None
            and len(time_series_embeds.shape) == 3
            and time_series_embeds.shape[-1] == self.config.ts_adapt_in_dim
        )
        if not embeds_ready:
            is_signal_list = (
                isinstance(time_series_signals, list)
                and len(time_series_signals) > 0
                and len(time_series_signals[0].shape) == 2
            )
            is_signal_batch = isinstance(time_series_signals, torch.Tensor) and len(time_series_signals.shape) == 3
            if not (is_signal_list or is_signal_batch):
                # Describe the offending input without indexing into None or an
                # empty list, which would hide this error behind another one.
                try:
                    described = time_series_signals[0].shape
                except (TypeError, IndexError, AttributeError):
                    described = type(time_series_signals).__name__
                raise ValueError(f'wrong time_series_signals size: {described}')
            time_series_embeds, ts_lens = self.encoder_embed(time_series_signals, ts_lens, sr)

        if ts_lens is None:
            raise ValueError('ts_lens is required when time_series_embeds is passed directly')

        # [B, 64000, 1] -> [B, 200, 256] -> [B, 100, 1024]
        encoder_outputs = self.encoder(
            input_features=time_series_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # ts_lens after encoder
        ts_lens = (ts_lens+1) // 2
        assert torch.all(ts_lens > 0), f"The length of time_series_embeds is so small. ts_lens: {ts_lens}"

        ts_pad_mask = self.make_pad_mask(ts_lens)
        # Index 0 is the last hidden state for both the ModelOutput and the
        # tuple (return_dict=False) forms, unlike attribute access.
        last_hidden_state = encoder_outputs[0]
        ts_embeds = self.projector(last_hidden_state)

        return ts_embeds, ts_pad_mask
|
| 1496 |
-
|
| 1497 |
-
|
| 1498 |
@dataclass
|
| 1499 |
@auto_docstring(
|
| 1500 |
custom_intro="""
|
|
@@ -1556,13 +1118,12 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
|
|
| 1556 |
# Reference: fix gemma3 grad acc #37208
|
| 1557 |
accepts_loss_kwargs = False
|
| 1558 |
config: InternS1ProConfig
|
| 1559 |
-
_no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"
|
| 1560 |
|
| 1561 |
def __init__(self, config):
|
| 1562 |
super().__init__(config)
|
| 1563 |
self.visual = InternS1ProVisionModel._from_config(config.vision_config)
|
| 1564 |
self.language_model = InternS1ProTextModel._from_config(config.text_config)
|
| 1565 |
-
self.time_series = InternS1ProTimeSeriesModel._from_config(config.ts_config)
|
| 1566 |
|
| 1567 |
# Initialize weights and apply final processing
|
| 1568 |
self.post_init()
|
|
@@ -1609,15 +1170,6 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
|
|
| 1609 |
split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
|
| 1610 |
image_embeds = torch.split(image_embeds, split_sizes)
|
| 1611 |
return image_embeds
|
| 1612 |
-
|
| 1613 |
-
def get_ts_feature(self, ts_values, ts_lens, sr):
|
| 1614 |
-
ts_embeds, ts_pad_mask = self.time_series(
|
| 1615 |
-
time_series_signals=ts_values,
|
| 1616 |
-
ts_lens=ts_lens,
|
| 1617 |
-
sr=sr,
|
| 1618 |
-
output_hidden_states=False,
|
| 1619 |
-
return_dict=True)
|
| 1620 |
-
return ts_embeds, ts_pad_mask
|
| 1621 |
|
| 1622 |
def get_placeholder_mask(
|
| 1623 |
self,
|
|
@@ -1660,7 +1212,7 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
|
|
| 1660 |
return special_image_mask, special_video_mask
|
| 1661 |
|
| 1662 |
@auto_docstring
|
| 1663 |
-
@check_model_inputs
|
| 1664 |
def forward(
|
| 1665 |
self,
|
| 1666 |
input_ids: torch.LongTensor = None,
|
|
@@ -1673,9 +1225,6 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
|
|
| 1673 |
image_grid_thw: Optional[torch.LongTensor] = None,
|
| 1674 |
video_grid_thw: Optional[torch.LongTensor] = None,
|
| 1675 |
cache_position: Optional[torch.LongTensor] = None,
|
| 1676 |
-
ts_values: Union[torch.FloatTensor, list[torch.FloatTensor]] = None,
|
| 1677 |
-
ts_lens: Union[torch.Tensor, list[torch.Tensor]] = None,
|
| 1678 |
-
ts_sr: Union[torch.FloatTensor, list[torch.FloatTensor]] = None,
|
| 1679 |
**kwargs: Unpack[TransformersKwargs],
|
| 1680 |
) -> Union[tuple, Qwen3VLMoeModelOutputWithPast]:
|
| 1681 |
r"""
|
|
@@ -1683,12 +1232,6 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
|
|
| 1683 |
The temporal, height and width of feature shape of each image in LLM.
|
| 1684 |
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
|
| 1685 |
The temporal, height and width of feature shape of each video in LLM.
|
| 1686 |
-
ts_values (`torch.FloatTensor` of shape `(batch_size, seq_len, num_channels)`, *optional*):
|
| 1687 |
-
The tensors corresponding to the input time series signals.
|
| 1688 |
-
ts_lens (`torch.Tensor` of shape `(batch_size,)`, *optional*):
|
| 1689 |
-
The valid lengths of each time series signal in the batch.
|
| 1690 |
-
ts_sr (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
|
| 1691 |
-
The sampling rates of each time series signal in the batch.
|
| 1692 |
"""
|
| 1693 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
| 1694 |
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
|
@@ -1715,27 +1258,6 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
|
|
| 1715 |
)
|
| 1716 |
inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
|
| 1717 |
|
| 1718 |
-
if pixel_values is None and pixel_values_videos is None and ts_values is not None:
|
| 1719 |
-
ts_features, ts_pad_mask = self.get_ts_feature(ts_values, ts_lens, ts_sr) # [B, T, C], [B, T]
|
| 1720 |
-
ts_features = ts_features[~ts_pad_mask].to(inputs_embeds.device, inputs_embeds.dtype) # [num_valid_ts_tokens, C]
|
| 1721 |
-
B, N, C = inputs_embeds.shape
|
| 1722 |
-
input_ids = input_ids.reshape(B * N)
|
| 1723 |
-
inputs_embeds = inputs_embeds.reshape(B * N, C)
|
| 1724 |
-
# replace ts_token in inputs_embeds and attention_mask
|
| 1725 |
-
ts_placeholder = (input_ids == self.config.ts_token_id)
|
| 1726 |
-
n_ts_placeholders = ts_placeholder.sum().item()
|
| 1727 |
-
n_ts_tokens = ts_features.size(0)
|
| 1728 |
-
assert n_ts_placeholders == n_ts_tokens, f"[ERROR]: Mismatch: <TS_CONTEXT> tokens={n_ts_placeholders}, ts_embeds_valid={n_ts_tokens}"
|
| 1729 |
-
|
| 1730 |
-
try:
|
| 1731 |
-
inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + ts_features
|
| 1732 |
-
except Exception as e:
|
| 1733 |
-
print(f'warning: {e}, inputs_embeds[selected].shape={inputs_embeds[ts_placeholder].shape}, ts_embeds_valid.shape={ts_features.shape}')
|
| 1734 |
-
inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + n_ts_tokens[:n_ts_placeholders]
|
| 1735 |
-
|
| 1736 |
-
inputs_embeds = inputs_embeds.reshape(B, N, C)
|
| 1737 |
-
# input_ids = input_ids.reshape(B, N)
|
| 1738 |
-
|
| 1739 |
if position_ids is None:
|
| 1740 |
batch_size, seq_length = inputs_embeds.shape[:2]
|
| 1741 |
if cache_position is not None:
|
|
@@ -1874,8 +1396,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
|
|
| 1874 |
def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
|
| 1875 |
return self.model.get_image_features(pixel_values, image_grid_thw)
|
| 1876 |
|
| 1877 |
-
def get_ts_feature(self, ts_values, ts_lens, sr):
|
| 1878 |
-
return self.model.get_ts_feature(ts_values, ts_lens, sr)
|
| 1879 |
# Make modules available through conditional class for BC
|
| 1880 |
@property
|
| 1881 |
def language_model(self):
|
|
@@ -1884,11 +1404,8 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
|
|
| 1884 |
@property
|
| 1885 |
def visual(self):
|
| 1886 |
return self.model.visual
|
| 1887 |
-
|
| 1888 |
-
def time_series(self):
|
| 1889 |
-
return self.model.time_series
|
| 1890 |
|
| 1891 |
-
@check_model_inputs
|
| 1892 |
def forward(
|
| 1893 |
self,
|
| 1894 |
input_ids: torch.LongTensor = None,
|
|
@@ -1901,9 +1418,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
|
|
| 1901 |
pixel_values_videos: Optional[torch.FloatTensor] = None,
|
| 1902 |
image_grid_thw: Optional[torch.LongTensor] = None,
|
| 1903 |
video_grid_thw: Optional[torch.LongTensor] = None,
|
| 1904 |
-
ts_values: Optional[Union[torch.FloatTensor, list[torch.FloatTensor]]] = None,
|
| 1905 |
-
ts_lens: Optional[Union[torch.Tensor, list[torch.Tensor]]] = None,
|
| 1906 |
-
ts_sr: Optional[Union[torch.FloatTensor, list[torch.FloatTensor]]] = None,
|
| 1907 |
cache_position: Optional[torch.LongTensor] = None,
|
| 1908 |
logits_to_keep: Union[int, torch.Tensor] = 0,
|
| 1909 |
**kwargs: Unpack[TransformersKwargs],
|
|
@@ -1970,9 +1484,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
|
|
| 1970 |
past_key_values=past_key_values,
|
| 1971 |
inputs_embeds=inputs_embeds,
|
| 1972 |
cache_position=cache_position,
|
| 1973 |
-
ts_values=ts_values,
|
| 1974 |
-
ts_lens=ts_lens,
|
| 1975 |
-
ts_sr=ts_sr,
|
| 1976 |
**kwargs,
|
| 1977 |
)
|
| 1978 |
|
|
@@ -2019,9 +1530,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
|
|
| 2019 |
pixel_values_videos=None,
|
| 2020 |
image_grid_thw=None,
|
| 2021 |
video_grid_thw=None,
|
| 2022 |
-
ts_values=None,
|
| 2023 |
-
ts_lens=None,
|
| 2024 |
-
ts_sr=None,
|
| 2025 |
**kwargs,
|
| 2026 |
):
|
| 2027 |
# Overwritten -- in specific circumstances we don't want to forward image inputs to the model
|
|
@@ -2038,9 +1546,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
|
|
| 2038 |
image_grid_thw=image_grid_thw,
|
| 2039 |
video_grid_thw=video_grid_thw,
|
| 2040 |
use_cache=use_cache,
|
| 2041 |
-
ts_values=ts_values,
|
| 2042 |
-
ts_lens=ts_lens,
|
| 2043 |
-
ts_sr=ts_sr,
|
| 2044 |
**kwargs,
|
| 2045 |
)
|
| 2046 |
|
|
@@ -2049,9 +1554,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
|
|
| 2049 |
if cache_position[0] != 0:
|
| 2050 |
model_inputs["pixel_values"] = None
|
| 2051 |
model_inputs["pixel_values_videos"] = None
|
| 2052 |
-
model_inputs["ts_values"] = None
|
| 2053 |
-
model_inputs["ts_lens"] = None
|
| 2054 |
-
model_inputs["ts_sr"] = None
|
| 2055 |
|
| 2056 |
return model_inputs
|
| 2057 |
|
|
@@ -2195,7 +1697,6 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, Generation
|
|
| 2195 |
|
| 2196 |
__all__ = [
|
| 2197 |
"InternS1ProVisionModel",
|
| 2198 |
-
"InternS1ProTimeSeriesModel",
|
| 2199 |
"InternS1ProForConditionalGeneration",
|
| 2200 |
"InternS1ProModel",
|
| 2201 |
"InternS1ProPreTrainedModel",
|
|
|
|
| 34 |
from transformers.processing_utils import Unpack
|
| 35 |
from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling
|
| 36 |
from transformers.utils.generic import OutputRecorder, check_model_inputs
|
| 37 |
+
from .configuration_interns1_pro import InternS1ProConfig, InternS1ProTextConfig, InternS1ProVisionConfig
|
| 38 |
+
|
|
|
|
|
|
|
| 39 |
|
| 40 |
@use_kernel_forward_from_hub("RMSNorm")
|
| 41 |
class Qwen3VLMoeTextRMSNorm(nn.Module):
|
|
|
|
| 439 |
config: InternS1ProConfig
|
| 440 |
base_model_prefix = "model"
|
| 441 |
supports_gradient_checkpointing = True
|
| 442 |
+
_no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
|
| 443 |
_skip_keys_device_placement = ["past_key_values"]
|
| 444 |
_supports_flash_attn = True
|
| 445 |
_supports_sdpa = True
|
|
|
|
| 986 |
# Initialize weights and apply final processing
|
| 987 |
self.post_init()
|
| 988 |
|
| 989 |
+
@check_model_inputs()
|
| 990 |
@auto_docstring
|
| 991 |
def forward(
|
| 992 |
self,
|
|
|
|
| 1057 |
)
|
| 1058 |
|
| 1059 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1060 |
@dataclass
|
| 1061 |
@auto_docstring(
|
| 1062 |
custom_intro="""
|
|
|
|
| 1118 |
# Reference: fix gemma3 grad acc #37208
|
| 1119 |
accepts_loss_kwargs = False
|
| 1120 |
config: InternS1ProConfig
|
| 1121 |
+
_no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
|
| 1122 |
|
| 1123 |
def __init__(self, config):
|
| 1124 |
super().__init__(config)
|
| 1125 |
self.visual = InternS1ProVisionModel._from_config(config.vision_config)
|
| 1126 |
self.language_model = InternS1ProTextModel._from_config(config.text_config)
|
|
|
|
| 1127 |
|
| 1128 |
# Initialize weights and apply final processing
|
| 1129 |
self.post_init()
|
|
|
|
| 1170 |
split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
|
| 1171 |
image_embeds = torch.split(image_embeds, split_sizes)
|
| 1172 |
return image_embeds
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1173 |
|
| 1174 |
def get_placeholder_mask(
|
| 1175 |
self,
|
|
|
|
| 1212 |
return special_image_mask, special_video_mask
|
| 1213 |
|
| 1214 |
@auto_docstring
|
| 1215 |
+
@check_model_inputs()
|
| 1216 |
def forward(
|
| 1217 |
self,
|
| 1218 |
input_ids: torch.LongTensor = None,
|
|
|
|
| 1225 |
image_grid_thw: Optional[torch.LongTensor] = None,
|
| 1226 |
video_grid_thw: Optional[torch.LongTensor] = None,
|
| 1227 |
cache_position: Optional[torch.LongTensor] = None,
|
|
|
|
|
|
|
|
|
|
| 1228 |
**kwargs: Unpack[TransformersKwargs],
|
| 1229 |
) -> Union[tuple, Qwen3VLMoeModelOutputWithPast]:
|
| 1230 |
r"""
|
|
|
|
| 1232 |
The temporal, height and width of feature shape of each image in LLM.
|
| 1233 |
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
|
| 1234 |
The temporal, height and width of feature shape of each video in LLM.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1235 |
"""
|
| 1236 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
| 1237 |
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
|
|
|
| 1258 |
)
|
| 1259 |
inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
|
| 1260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1261 |
if position_ids is None:
|
| 1262 |
batch_size, seq_length = inputs_embeds.shape[:2]
|
| 1263 |
if cache_position is not None:
|
|
|
|
| 1396 |
def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
|
| 1397 |
return self.model.get_image_features(pixel_values, image_grid_thw)
|
| 1398 |
|
|
|
|
|
|
|
| 1399 |
# Make modules available through conditional class for BC
|
| 1400 |
@property
|
| 1401 |
def language_model(self):
|
|
|
|
| 1404 |
@property
|
| 1405 |
def visual(self):
|
| 1406 |
return self.model.visual
|
|
|
|
|
|
|
|
|
|
| 1407 |
|
| 1408 |
+
@check_model_inputs()
|
| 1409 |
def forward(
|
| 1410 |
self,
|
| 1411 |
input_ids: torch.LongTensor = None,
|
|
|
|
| 1418 |
pixel_values_videos: Optional[torch.FloatTensor] = None,
|
| 1419 |
image_grid_thw: Optional[torch.LongTensor] = None,
|
| 1420 |
video_grid_thw: Optional[torch.LongTensor] = None,
|
|
|
|
|
|
|
|
|
|
| 1421 |
cache_position: Optional[torch.LongTensor] = None,
|
| 1422 |
logits_to_keep: Union[int, torch.Tensor] = 0,
|
| 1423 |
**kwargs: Unpack[TransformersKwargs],
|
|
|
|
| 1484 |
past_key_values=past_key_values,
|
| 1485 |
inputs_embeds=inputs_embeds,
|
| 1486 |
cache_position=cache_position,
|
|
|
|
|
|
|
|
|
|
| 1487 |
**kwargs,
|
| 1488 |
)
|
| 1489 |
|
|
|
|
| 1530 |
pixel_values_videos=None,
|
| 1531 |
image_grid_thw=None,
|
| 1532 |
video_grid_thw=None,
|
|
|
|
|
|
|
|
|
|
| 1533 |
**kwargs,
|
| 1534 |
):
|
| 1535 |
# Overwritten -- in specific circumstances we don't want to forward image inputs to the model
|
|
|
|
| 1546 |
image_grid_thw=image_grid_thw,
|
| 1547 |
video_grid_thw=video_grid_thw,
|
| 1548 |
use_cache=use_cache,
|
|
|
|
|
|
|
|
|
|
| 1549 |
**kwargs,
|
| 1550 |
)
|
| 1551 |
|
|
|
|
| 1554 |
if cache_position[0] != 0:
|
| 1555 |
model_inputs["pixel_values"] = None
|
| 1556 |
model_inputs["pixel_values_videos"] = None
|
|
|
|
|
|
|
|
|
|
| 1557 |
|
| 1558 |
return model_inputs
|
| 1559 |
|
|
|
|
| 1697 |
|
| 1698 |
__all__ = [
|
| 1699 |
"InternS1ProVisionModel",
|
|
|
|
| 1700 |
"InternS1ProForConditionalGeneration",
|
| 1701 |
"InternS1ProModel",
|
| 1702 |
"InternS1ProPreTrainedModel",
|
processing_interns1_pro.py
CHANGED
|
@@ -18,7 +18,7 @@
|
|
| 18 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 19 |
# See the License for the specific language governing permissions and
|
| 20 |
# limitations under the License.
|
| 21 |
-
from typing import Union
|
| 22 |
|
| 23 |
import numpy as np
|
| 24 |
|
|
@@ -28,7 +28,6 @@ from transformers.processing_utils import MultiModalData, ProcessingKwargs, Proc
|
|
| 28 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 29 |
from transformers.utils import logging
|
| 30 |
from transformers.video_utils import VideoInput
|
| 31 |
-
import os
|
| 32 |
|
| 33 |
|
| 34 |
logger = logging.get_logger(__name__)
|
|
@@ -42,7 +41,6 @@ class InternS1ProProcessorKwargs(ProcessingKwargs, total=False):
|
|
| 42 |
"return_mm_token_type_ids": False,
|
| 43 |
},
|
| 44 |
"videos_kwargs": {"return_metadata": True},
|
| 45 |
-
"time_series_kwargs": {},
|
| 46 |
}
|
| 47 |
|
| 48 |
|
|
@@ -70,7 +68,6 @@ class InternS1ProProcessor(ProcessorMixin):
|
|
| 70 |
def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
|
| 71 |
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
| 72 |
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
| 73 |
-
self.ts_token = "<TS_CONTEXT>" if not hasattr(tokenizer, "ts_token") else tokenizer.ts_token
|
| 74 |
self.image_token_id = (
|
| 75 |
tokenizer.image_token_id
|
| 76 |
if getattr(tokenizer, "image_token_id", None)
|
|
@@ -81,11 +78,6 @@ class InternS1ProProcessor(ProcessorMixin):
|
|
| 81 |
if getattr(tokenizer, "video_token_id", None)
|
| 82 |
else tokenizer.convert_tokens_to_ids(self.video_token)
|
| 83 |
)
|
| 84 |
-
self.ts_token_id = (
|
| 85 |
-
tokenizer.ts_token_id
|
| 86 |
-
if getattr(tokenizer, "ts_token_id", None)
|
| 87 |
-
else tokenizer.convert_tokens_to_ids(self.ts_token)
|
| 88 |
-
)
|
| 89 |
super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
|
| 90 |
self.vision_start_token = (
|
| 91 |
"<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
|
|
@@ -103,132 +95,12 @@ class InternS1ProProcessor(ProcessorMixin):
|
|
| 103 |
if getattr(tokenizer, "vision_end_token_id", None)
|
| 104 |
else tokenizer.convert_tokens_to_ids(self.vision_end_token)
|
| 105 |
)
|
| 106 |
-
self.ts_start_token = (
|
| 107 |
-
"<|ts|>" if not hasattr(tokenizer, "ts_start_token") else tokenizer.ts_start_token
|
| 108 |
-
)
|
| 109 |
-
self.ts_end_token = (
|
| 110 |
-
"<|/ts|>" if not hasattr(tokenizer, "ts_end_token") else tokenizer.ts_end_token
|
| 111 |
-
)
|
| 112 |
-
self.ts_start_token_id = (
|
| 113 |
-
tokenizer.ts_start_token_id
|
| 114 |
-
if getattr(tokenizer, "ts_start_token_id", None)
|
| 115 |
-
else tokenizer.convert_tokens_to_ids(self.ts_start_token)
|
| 116 |
-
)
|
| 117 |
-
self.ts_end_token_id = (
|
| 118 |
-
tokenizer.ts_end_token_id
|
| 119 |
-
if getattr(tokenizer, "ts_end_token_id", None)
|
| 120 |
-
else tokenizer.convert_tokens_to_ids(self.ts_end_token)
|
| 121 |
-
)
|
| 122 |
-
|
| 123 |
-
def time_series_preprocessor(self,conversation):
|
| 124 |
-
if isinstance(conversation, (list, tuple)) and (
|
| 125 |
-
isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
|
| 126 |
-
):
|
| 127 |
-
conversations = conversation
|
| 128 |
-
else:
|
| 129 |
-
conversations = [conversation]
|
| 130 |
-
|
| 131 |
-
batch_time_series = []
|
| 132 |
-
batch_time_series_metadata = []
|
| 133 |
-
for conversation in conversations:
|
| 134 |
-
for message in conversation:
|
| 135 |
-
if message['role'] != "user": continue
|
| 136 |
-
time_series_fnames = [
|
| 137 |
-
content["data"]
|
| 138 |
-
for content in message["content"]
|
| 139 |
-
if content.get("type") == "time_series" and "data" in content
|
| 140 |
-
]
|
| 141 |
-
time_series_rates = [
|
| 142 |
-
content.get("sampling_rate", None)
|
| 143 |
-
for content in message["content"]
|
| 144 |
-
if content.get("type") == "time_series"
|
| 145 |
-
]
|
| 146 |
-
for path, rate in zip(time_series_fnames, time_series_rates):
|
| 147 |
-
batch_time_series.append(path)
|
| 148 |
-
batch_time_series_metadata.append(rate)
|
| 149 |
-
|
| 150 |
-
return {"time_series_paths": batch_time_series if batch_time_series else None,
|
| 151 |
-
"time_series_sampling_rates": batch_time_series_metadata if batch_time_series_metadata else None}
|
| 152 |
-
|
| 153 |
-
def time_series_processor(self,
|
| 154 |
-
ts_paths: list[str],
|
| 155 |
-
sampling_rates: list[float],
|
| 156 |
-
do_normalize=True,
|
| 157 |
-
do_truncate=True,
|
| 158 |
-
|
| 159 |
-
)-> BatchFeature:
|
| 160 |
-
assert len(ts_paths)==len(sampling_rates), "ts_paths and sampling_rates must have the same length"
|
| 161 |
-
|
| 162 |
-
ts_values=[]
|
| 163 |
-
ts_sr=[]
|
| 164 |
-
ts_lens=[]
|
| 165 |
-
|
| 166 |
-
for idx,ts_path in enumerate(ts_paths):
|
| 167 |
-
sr=sampling_rates[idx]
|
| 168 |
-
ext = os.path.splitext(ts_path)[-1].lower()
|
| 169 |
-
if ext in [".wav",'.mp3','.flac']:
|
| 170 |
-
try:
|
| 171 |
-
import soundfile as sf
|
| 172 |
-
except ImportError:
|
| 173 |
-
raise ImportError("Please install soundfile to process audio files.")
|
| 174 |
-
ts_input, sr = sf.read(ts_path) # ts_input: np.ndarray, shape [T] or [T, C]
|
| 175 |
-
elif ext == ".csv":
|
| 176 |
-
pd = __import__("pandas")
|
| 177 |
-
df = pd.read_csv(ts_path, header=None)
|
| 178 |
-
ts_input = df.values # [T, C]
|
| 179 |
-
elif ext == ".npy":
|
| 180 |
-
ts_input = np.load(ts_path) # [T, C]
|
| 181 |
-
else:
|
| 182 |
-
raise ValueError(f"Unsupported file format: {ext}")
|
| 183 |
-
|
| 184 |
-
# ts_tensor = torch.from_numpy(ts_input).float()
|
| 185 |
-
if not isinstance(ts_input, np.ndarray):
|
| 186 |
-
ts_input = np.array(ts_input, dtype=np.float32)
|
| 187 |
-
|
| 188 |
-
if do_normalize:
|
| 189 |
-
mean = ts_input.mean(axis=0, keepdims=True)
|
| 190 |
-
std = ts_input.std(axis=0, keepdims=True)
|
| 191 |
-
ts_input = (ts_input - mean) / (std + 1e-8)
|
| 192 |
-
|
| 193 |
-
if do_truncate and len(ts_input)>240000:
|
| 194 |
-
ts_input=ts_input[:240000] # truncate to 240k to avoid oom
|
| 195 |
-
|
| 196 |
-
if ts_input.ndim==1:
|
| 197 |
-
ts_input=ts_input[:, None] #[T,C]
|
| 198 |
-
|
| 199 |
-
ts_len=ts_input.shape[0]
|
| 200 |
-
|
| 201 |
-
if sr is None or sr == 0: # if no sr provided
|
| 202 |
-
sr = ts_len/4
|
| 203 |
-
|
| 204 |
-
ts_values.append(ts_input)
|
| 205 |
-
ts_sr.append(sr)
|
| 206 |
-
ts_lens.append(ts_len)
|
| 207 |
-
|
| 208 |
-
ts_lens = np.array(ts_lens)
|
| 209 |
-
ts_sr = np.array(ts_sr)
|
| 210 |
-
num_ts_tokens = self._get_num_ts_tokens(sampling_rates=ts_sr,
|
| 211 |
-
ts_lens=ts_lens)
|
| 212 |
-
return BatchFeature(data={"ts_values": ts_values,
|
| 213 |
-
"ts_sr":ts_sr,
|
| 214 |
-
"ts_lens":ts_lens,
|
| 215 |
-
"num_ts_tokens":num_ts_tokens}
|
| 216 |
-
)
|
| 217 |
-
|
| 218 |
-
def _get_num_ts_tokens(self,sampling_rates,ts_lens):
|
| 219 |
-
strides = np.floor(160/((1+np.exp(-sampling_rates/100))**6))
|
| 220 |
-
patch_sizes = strides * 2
|
| 221 |
-
embed_lengths = (np.ceil((ts_lens - patch_sizes) / strides) + 1).astype(np.int64)
|
| 222 |
-
num_ts_tokens=[(embed_length // 2 + 1) // 2 for embed_length in embed_lengths]
|
| 223 |
-
return num_ts_tokens
|
| 224 |
|
| 225 |
def __call__(
|
| 226 |
self,
|
| 227 |
images: ImageInput = None,
|
| 228 |
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
| 229 |
videos: VideoInput = None,
|
| 230 |
-
time_series_paths: Optional[list[str]]=None,
|
| 231 |
-
time_series_sampling_rates: Optional[list[float]]=None,
|
| 232 |
**kwargs: Unpack[InternS1ProProcessorKwargs],
|
| 233 |
) -> BatchFeature:
|
| 234 |
"""
|
|
@@ -248,7 +120,6 @@ class InternS1ProProcessor(ProcessorMixin):
|
|
| 248 |
videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
|
| 249 |
The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
|
| 250 |
tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
|
| 251 |
-
time_series_signals (`list[np.ndarray]`, `list[torch.Tensor]`):
|
| 252 |
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
| 253 |
If set, will return tensors of a particular framework. Acceptable values are:
|
| 254 |
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
|
@@ -345,22 +216,6 @@ class InternS1ProProcessor(ProcessorMixin):
|
|
| 345 |
|
| 346 |
text[i] = text[i].replace("<|placeholder|>", self.video_token)
|
| 347 |
|
| 348 |
-
time_series_inputs = {}
|
| 349 |
-
if images is None and videos is None and time_series_paths is not None:
|
| 350 |
-
assert time_series_sampling_rates is not None, "If time_series_signals is provided, time_series_sampling_rates must also be provided."
|
| 351 |
-
assert len(time_series_paths) == len(time_series_sampling_rates), "The number of time series signals must match the number of sampling rates."
|
| 352 |
-
time_series_inputs = self.time_series_processor(ts_paths=time_series_paths, sampling_rates=time_series_sampling_rates)
|
| 353 |
-
num_ts_tokens = time_series_inputs.pop("num_ts_tokens")
|
| 354 |
-
assert len(num_ts_tokens) == len(text), "The number of time series signals must match the number of text prompts."
|
| 355 |
-
for i in range(len(text)):
|
| 356 |
-
if f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}" in text[i]:
|
| 357 |
-
ts_placeholder = self.ts_start_token + self.ts_token * num_ts_tokens[i] + self.ts_end_token
|
| 358 |
-
text[i] = text[i].replace(
|
| 359 |
-
f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}", ts_placeholder, 1
|
| 360 |
-
)
|
| 361 |
-
elif self.ts_token in text[i]:
|
| 362 |
-
text[i] = text[i].replace(self.ts_token, self.ts_token * num_ts_tokens[i])
|
| 363 |
-
|
| 364 |
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
| 365 |
return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
|
| 366 |
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
|
@@ -372,7 +227,7 @@ class InternS1ProProcessor(ProcessorMixin):
|
|
| 372 |
mm_token_type_ids[array_ids == self.image_token_id] = 1
|
| 373 |
text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
|
| 374 |
|
| 375 |
-
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs
|
| 376 |
|
| 377 |
def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
|
| 378 |
"""
|
|
|
|
| 18 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 19 |
# See the License for the specific language governing permissions and
|
| 20 |
# limitations under the License.
|
| 21 |
+
from typing import Union
|
| 22 |
|
| 23 |
import numpy as np
|
| 24 |
|
|
|
|
| 28 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 29 |
from transformers.utils import logging
|
| 30 |
from transformers.video_utils import VideoInput
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
logger = logging.get_logger(__name__)
|
|
|
|
| 41 |
"return_mm_token_type_ids": False,
|
| 42 |
},
|
| 43 |
"videos_kwargs": {"return_metadata": True},
|
|
|
|
| 44 |
}
|
| 45 |
|
| 46 |
|
|
|
|
| 68 |
def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
|
| 69 |
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
| 70 |
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
|
|
|
| 71 |
self.image_token_id = (
|
| 72 |
tokenizer.image_token_id
|
| 73 |
if getattr(tokenizer, "image_token_id", None)
|
|
|
|
| 78 |
if getattr(tokenizer, "video_token_id", None)
|
| 79 |
else tokenizer.convert_tokens_to_ids(self.video_token)
|
| 80 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
|
| 82 |
self.vision_start_token = (
|
| 83 |
"<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
|
|
|
|
| 95 |
if getattr(tokenizer, "vision_end_token_id", None)
|
| 96 |
else tokenizer.convert_tokens_to_ids(self.vision_end_token)
|
| 97 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
def __call__(
|
| 100 |
self,
|
| 101 |
images: ImageInput = None,
|
| 102 |
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
|
| 103 |
videos: VideoInput = None,
|
|
|
|
|
|
|
| 104 |
**kwargs: Unpack[InternS1ProProcessorKwargs],
|
| 105 |
) -> BatchFeature:
|
| 106 |
"""
|
|
|
|
| 120 |
videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
|
| 121 |
The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
|
| 122 |
tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
|
|
|
|
| 123 |
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
| 124 |
If set, will return tensors of a particular framework. Acceptable values are:
|
| 125 |
- `'pt'`: Return PyTorch `torch.Tensor` objects.
|
|
|
|
| 216 |
|
| 217 |
text[i] = text[i].replace("<|placeholder|>", self.video_token)
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
|
| 220 |
return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
|
| 221 |
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
|
|
|
|
| 227 |
mm_token_type_ids[array_ids == self.image_token_id] = 1
|
| 228 |
text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
|
| 229 |
|
| 230 |
+
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
|
| 231 |
|
| 232 |
def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
|
| 233 |
"""
|
test_inference_ts.py
DELETED
|
@@ -1,78 +0,0 @@
|
|
| 1 |
-
from pathlib import Path
|
| 2 |
-
import torch
|
| 3 |
-
from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
model_path = Path(__file__).parent.resolve()
|
| 7 |
-
print(f"Loading model from: {model_path}")
|
| 8 |
-
|
| 9 |
-
# 加载模型配置
|
| 10 |
-
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
| 11 |
-
print(f"Model config: {config.model_type}")
|
| 12 |
-
print(f"Architecture: {config.architectures}")
|
| 13 |
-
|
| 14 |
-
# 加载处理器(tokenizer + image processor + ts processor)
|
| 15 |
-
print("\nLoading processor...")
|
| 16 |
-
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
|
| 17 |
-
|
| 18 |
-
# 加载模型(使用 bfloat16 精度和自动设备映射)
|
| 19 |
-
print("\nLoading model...")
|
| 20 |
-
model = AutoModelForCausalLM.from_pretrained(
|
| 21 |
-
model_path,
|
| 22 |
-
dtype=torch.bfloat16,
|
| 23 |
-
device_map="auto",
|
| 24 |
-
# attn_implementation="flash_attention_2", #时序暂不支持flash_attn,load加这行会报错
|
| 25 |
-
trust_remote_code=True
|
| 26 |
-
)
|
| 27 |
-
|
| 28 |
-
print(f"✓ Model loaded successfully!")
|
| 29 |
-
print(f"Model type: {type(model).__name__}")
|
| 30 |
-
print(f"Model device: {model.device}")
|
| 31 |
-
|
| 32 |
-
# ============================================================================
|
| 33 |
-
# 测试 3: 时序对话
|
| 34 |
-
# ============================================================================
|
| 35 |
-
print("\n" + "=" * 80)
|
| 36 |
-
print("测试 3: 时序对话")
|
| 37 |
-
print("=" * 80)
|
| 38 |
-
|
| 39 |
-
messages = [
|
| 40 |
-
{
|
| 41 |
-
"role": "user",
|
| 42 |
-
"content": [
|
| 43 |
-
{"type": "time_series", "data": "./0092638_seism.npy", "sampling_rate": 100},
|
| 44 |
-
{"type": "text", "text": "Please determine whether an Earthquake event has occurred in the provided time-series data. If so, please specify the starting time point indices of the P-wave and S-wave in the event."},
|
| 45 |
-
],
|
| 46 |
-
}
|
| 47 |
-
]
|
| 48 |
-
|
| 49 |
-
time_series_inputs = processor.time_series_preprocessor(messages)
|
| 50 |
-
multimodal_inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", enable_thinking=False, **time_series_inputs).to(model.device, dtype=torch.bfloat16)
|
| 51 |
-
|
| 52 |
-
print("\n生成时序回复...")
|
| 53 |
-
with torch.inference_mode():
|
| 54 |
-
multimodal_generated_ids = model.generate(
|
| 55 |
-
**multimodal_inputs,
|
| 56 |
-
max_new_tokens=200,
|
| 57 |
-
do_sample=False,
|
| 58 |
-
temperature=1.0,
|
| 59 |
-
)
|
| 60 |
-
|
| 61 |
-
# 提取生成的 token(去除输入部分)
|
| 62 |
-
multimodal_generated_ids_trimmed = [
|
| 63 |
-
out_ids[len(in_ids):] for in_ids, out_ids in zip(multimodal_inputs.input_ids, multimodal_generated_ids)
|
| 64 |
-
]
|
| 65 |
-
|
| 66 |
-
# 解码为文本
|
| 67 |
-
multimodal_output = processor.batch_decode(
|
| 68 |
-
multimodal_generated_ids_trimmed,
|
| 69 |
-
skip_special_tokens=True,
|
| 70 |
-
clean_up_tokenization_spaces=False
|
| 71 |
-
)
|
| 72 |
-
|
| 73 |
-
print("\n" + "-" * 80)
|
| 74 |
-
print("时序输出:")
|
| 75 |
-
print("-" * 80)
|
| 76 |
-
print(multimodal_output[0])
|
| 77 |
-
print("-" * 80)
|
| 78 |
-
print("\n✅ 时序功能测试完成!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tokenization_interns1.py
CHANGED
|
@@ -24,14 +24,11 @@ from functools import lru_cache
|
|
| 24 |
import regex as re
|
| 25 |
import sentencepiece as spm
|
| 26 |
|
| 27 |
-
import transformers
|
| 28 |
from transformers.tokenization_utils_base import AddedToken, TextInput
|
|
|
|
| 29 |
from transformers.utils import logging
|
| 30 |
-
from
|
| 31 |
-
|
| 32 |
-
from transformers.tokenization_python import PreTrainedTokenizer
|
| 33 |
-
else:
|
| 34 |
-
from transformers.tokenization_utils import PreTrainedTokenizer
|
| 35 |
|
| 36 |
logger = logging.get_logger(__name__)
|
| 37 |
|
|
@@ -509,7 +506,6 @@ class InternS1Tokenizer(PreTrainedTokenizer):
|
|
| 509 |
pad_token="<|endoftext|>",
|
| 510 |
clean_up_tokenization_spaces=False,
|
| 511 |
split_special_tokens=False,
|
| 512 |
-
special_tokens_pattern="none",
|
| 513 |
**kwargs,
|
| 514 |
):
|
| 515 |
bos_token = (
|
|
@@ -570,7 +566,6 @@ class InternS1Tokenizer(PreTrainedTokenizer):
|
|
| 570 |
pad_token=pad_token,
|
| 571 |
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
| 572 |
split_special_tokens=split_special_tokens,
|
| 573 |
-
special_tokens_pattern="none",
|
| 574 |
**kwargs,
|
| 575 |
)
|
| 576 |
|
|
@@ -720,6 +715,9 @@ class InternS1Tokenizer(PreTrainedTokenizer):
|
|
| 720 |
|
| 721 |
text, kwargs = self.prepare_for_tokenization(text, **kwargs)
|
| 722 |
|
|
|
|
|
|
|
|
|
|
| 723 |
if hasattr(self, "do_lower_case") and self.do_lower_case:
|
| 724 |
# convert non-special tokens to lowercase. Might be super slow as well?
|
| 725 |
escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
|
|
|
|
| 24 |
import regex as re
|
| 25 |
import sentencepiece as spm
|
| 26 |
|
|
|
|
| 27 |
from transformers.tokenization_utils_base import AddedToken, TextInput
|
| 28 |
+
from transformers.tokenization_utils import PreTrainedTokenizer
|
| 29 |
from transformers.utils import logging
|
| 30 |
+
# from transformers.utils.import_utils import requires
|
| 31 |
+
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
logger = logging.get_logger(__name__)
|
| 34 |
|
|
|
|
| 506 |
pad_token="<|endoftext|>",
|
| 507 |
clean_up_tokenization_spaces=False,
|
| 508 |
split_special_tokens=False,
|
|
|
|
| 509 |
**kwargs,
|
| 510 |
):
|
| 511 |
bos_token = (
|
|
|
|
| 566 |
pad_token=pad_token,
|
| 567 |
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
| 568 |
split_special_tokens=split_special_tokens,
|
|
|
|
| 569 |
**kwargs,
|
| 570 |
)
|
| 571 |
|
|
|
|
| 715 |
|
| 716 |
text, kwargs = self.prepare_for_tokenization(text, **kwargs)
|
| 717 |
|
| 718 |
+
if kwargs:
|
| 719 |
+
logger.warning(f"Keyword arguments {kwargs} not recognized.")
|
| 720 |
+
|
| 721 |
if hasattr(self, "do_lower_case") and self.do_lower_case:
|
| 722 |
# convert non-special tokens to lowercase. Might be super slow as well?
|
| 723 |
escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
|