Instructions to use chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit with MLX:

# Make sure mlx-lm is installed
# pip install --upgrade mlx-lm

# Minimal text-generation example using mlx-lm.
from mlx_lm import load, generate

# Fetch (or reuse the cached copy of) the quantized weights and tokenizer.
model, tokenizer = load("chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit")

# Wrap the request as a single-turn chat and let the tokenizer's chat
# template add the assistant generation prompt.
user_request = "Write a story about Einstein"
chat = [{"role": "user", "content": user_request}]
prompt = tokenizer.apply_chat_template(chat, add_generation_prompt=True)

# verbose=True is passed through to generate() unchanged.
text = generate(model, tokenizer, prompt=prompt, verbose=True)

Notebooks
Google Colab
Kaggle
Local Apps
LM Studio

Pi new

How to use chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit with Pi:

Start the MLX server

# Install MLX LM:
uv tool install mlx-lm
# Start a local OpenAI-compatible server:
mlx_lm.server --model "chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit"

Configure the model in Pi

# Install Pi:
npm install -g @mariozechner/pi-coding-agent
# Add to ~/.pi/agent/models.json:
{
  "providers": {
    "mlx-lm": {
      "baseUrl": "http://localhost:8080/v1",
      "api": "openai-completions",
      "apiKey": "none",
      "models": [
        {
          "id": "chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit"
        }
      ]
    }
  }
}

Run Pi

# Start Pi in your project directory:
pi

Hermes Agent new

How to use chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit with Hermes Agent:

Start the MLX server

# Install MLX LM:
uv tool install mlx-lm
# Start a local OpenAI-compatible server:
mlx_lm.server --model "chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit"

Configure Hermes

# Install Hermes:
curl -fsSL https://hermes-agent.nousresearch.com/install.sh | bash
hermes setup
# Point Hermes at the local server:
hermes config set model.provider custom
hermes config set model.base_url http://127.0.0.1:8080/v1
hermes config set model.default chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit

Run Hermes

hermes

MLX LM

How to use chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit with MLX LM:

Generate or start a chat session

# Install MLX LM
uv tool install mlx-lm
# Interactive chat REPL
mlx_lm.chat --model "chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit"

Run an OpenAI-compatible server

# Install MLX LM
uv tool install mlx-lm
# Start the server
mlx_lm.server --model "chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit"
# Calling the OpenAI-compatible server with curl
# (mlx_lm.server listens on port 8080 unless started with --port)
curl -X POST "http://localhost:8080/v1/chat/completions" \
   -H "Content-Type: application/json" \
   --data '{
     "model": "chanderbalaji/Intern-S2-Preview-FP8-MLX-4bit",
     "messages": [
       {"role": "user", "content": "Hello"}
     ]
   }'

chanderbalaji committed 10 days ago

Commit

0c03955

verified ·

1 Parent(s): d1317fb

Add files using upload-large-folder tool

Browse files

Files changed (15) hide show

chat_template.jinja +159 -0
config.json +792 -0
configuration_interns2_preview.py +434 -0
generation_config.json +13 -0
intern_s2_fp8_qwen3_5_moe.py +71 -0
merges.txt +0 -0
model.safetensors.index.json +0 -0
modeling_interns2_preview.py +0 -0
processing_interns2_preview.py +423 -0
tokenization_interns1.py +1009 -0
tokenizer_PROT.model +3 -0
tokenizer_SMILES.model +3 -0
tokenizer_XNA.model +3 -0
tokenizer_config.json +508 -0
vocab.json +0 -0

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,159 @@

+{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{%- macro render_content(content, do_vision_count, is_system_content=false) %}
+    {%- if content is string %}
+        {{- content }}
+    {%- elif content is iterable and content is not mapping %}
+        {%- for item in content %}
+            {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain images.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Picture ' ~ image_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
+            {%- elif 'video' in item or item.type == 'video' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain videos.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Video ' ~ video_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
+            {%- elif 'time_series' in item or item.type == 'time_series' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain time series.') }}
+                {%- endif %}
+                {{- '<|ts|><TS_CONTEXT><|/ts|>' }}
+            {%- elif 'text' in item %}
+                {{- item.text }}
+            {%- else %}
+                {{- raise_exception('Unexpected item type in content.') }}
+            {%- endif %}
+        {%- endfor %}
+    {%- elif content is none or content is undefined %}
+        {{- '' }}
+    {%- else %}
+        {{- raise_exception('Unexpected content type.') }}
+    {%- endif %}
+{%- endmacro %}
+{%- if not messages %}
+    {{- raise_exception('No messages provided.') }}
+{%- endif %}
+{%- if tools and tools is iterable and tools is not mapping %}
+    {{- '<|im_start|>system\n' }}
+    {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>" }}
+    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {%- if content %}
+            {{- '\n\n' + content }}
+        {%- endif %}
+    {%- endif %}
+    {{- '<|im_end|>\n' }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" %}
+        {%- set content = render_content(message.content, false)|trim %}
+        {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
+            {%- set ns.multi_step_tool = false %}
+            {%- set ns.last_query_index = index %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if ns.multi_step_tool %}
+    {{- raise_exception('No user query found in messages.') }}
+{%- endif %}
+{%- for message in messages %}
+    {%- set content = render_content(message.content, true)|trim %}
+    {%- if message.role == "system" %}
+        {%- if not loop.first %}
+            {{- raise_exception('System message must be at the beginning.') }}
+        {%- endif %}
+    {%- elif message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- set reasoning_content = reasoning_content|trim %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if tool_call.function is defined %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {%- if loop.first %}
+                    {%- if content|trim %}
+                        {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- else %}
+                        {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- endif %}
+                {%- else %}
+                    {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                {%- endif %}
+                {%- if tool_call.arguments is defined %}
+                    {%- for args_name, args_value in tool_call.arguments|items %}
+                        {{- '<parameter=' + args_name + '>\n' }}
+                        {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+                        {{- args_value }}
+                        {{- '\n</parameter>\n' }}
+                    {%- endfor %}
+                {%- endif %}
+                {{- '</function>\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.previtem and loop.previtem.role != "tool" %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if not loop.last and loop.nextitem.role != "tool" %}
+            {{- '<|im_end|>\n' }}
+        {%- elif loop.last %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- else %}
+        {{- raise_exception('Unexpected message role.') }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- else %}
+        {{- '<think>\n' }}
+    {%- endif %}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,792 @@

+{
+    "architectures": [
+        "InternS2PreviewForConditionalGeneration"
+    ],
+    "auto_map": {
+        "AutoConfig": "configuration_interns2_preview.InternS2PreviewConfig",
+        "AutoModelForCausalLM": "modeling_interns2_preview.InternS2PreviewForCausalLM",
+        "AutoModel": "modeling_interns2_preview.InternS2PreviewModel",
+        "AutoModelForImageTextToText": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration",
+        "AutoModelForMultimodalLM": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration"
+    },
+    "eos_token_id": [
+        248046,
+        248044
+    ],
+    "image_token_id": 248056,
+    "model_file": "intern_s2_fp8_qwen3_5_moe.py",
+    "model_type": "intern_s2_preview",
+    "quantization": {
+        "group_size": 64,
+        "bits": 4,
+        "mode": "affine",
+        "language_model.model.layers.0.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.0.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.1.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.1.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.2.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.2.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.3.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.3.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.4.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.4.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.5.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.5.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.6.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.6.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.7.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.7.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.8.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.8.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.9.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.9.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.10.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.10.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.11.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.11.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.12.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.12.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.13.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.13.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.14.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.14.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.15.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.15.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.16.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.16.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.17.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.17.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.18.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.18.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.19.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.19.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.20.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.20.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.21.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.21.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.22.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.22.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.23.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.23.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.24.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.24.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.25.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.25.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.26.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.26.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.27.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.27.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.28.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.28.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.29.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.29.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.30.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.30.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.31.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.31.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.32.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.32.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.33.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.33.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.34.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.34.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.35.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.35.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.36.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.36.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.37.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.37.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.38.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.38.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.39.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.39.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        }
+    },
+    "quantization_config": {
+        "group_size": 64,
+        "bits": 4,
+        "mode": "affine",
+        "language_model.model.layers.0.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.0.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.1.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.1.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.2.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.2.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.3.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.3.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.4.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.4.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.5.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.5.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.6.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.6.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.7.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.7.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.8.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.8.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.9.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.9.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.10.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.10.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.11.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.11.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.12.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.12.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.13.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.13.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.14.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.14.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.15.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.15.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.16.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.16.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.17.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.17.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.18.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.18.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.19.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.19.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.20.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.20.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.21.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.21.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.22.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.22.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.23.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.23.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.24.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.24.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.25.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.25.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.26.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.26.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.27.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.27.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.28.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.28.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.29.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.29.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.30.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.30.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.31.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.31.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.32.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.32.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.33.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.33.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.34.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.34.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.35.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.35.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.36.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.36.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.37.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.37.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.38.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.38.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.39.mlp.gate": {
+            "group_size": 64,
+            "bits": 8
+        },
+        "language_model.model.layers.39.mlp.shared_expert_gate": {
+            "group_size": 64,
+            "bits": 8
+        }
+    },
+    "text_config": {
+        "model_type": "qwen3_5_moe_text",
+        "attention_bias": false,
+        "attention_dropout": 0.0,
+        "attn_output_gate": true,
+        "dtype": "bfloat16",
+        "eos_token_id": 248044,
+        "full_attention_interval": 4,
+        "head_dim": 256,
+        "hidden_act": "silu",
+        "hidden_size": 2048,
+        "initializer_range": 0.02,
+        "layer_types": [
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention"
+        ],
+        "linear_conv_kernel_dim": 4,
+        "linear_key_head_dim": 128,
+        "linear_num_key_heads": 16,
+        "linear_num_value_heads": 32,
+        "linear_value_head_dim": 128,
+        "max_position_embeddings": 262144,
+        "mlp_only_layers": [],
+        "moe_intermediate_size": 512,
+        "mtp_num_hidden_layers": 1,
+        "mtp_use_dedicated_embeddings": false,
+        "num_attention_heads": 16,
+        "num_experts": 256,
+        "num_experts_per_tok": 8,
+        "num_hidden_layers": 40,
+        "num_key_value_heads": 2,
+        "rms_norm_eps": 1e-06,
+        "router_aux_loss_coef": 0.001,
+        "shared_expert_intermediate_size": 512,
+        "use_cache": true,
+        "vocab_size": 251392,
+        "mamba_ssm_dtype": "float32",
+        "rope_parameters": {
+            "mrope_interleaved": true,
+            "mrope_section": [
+                11,
+                11,
+                10
+            ],
+            "rope_theta": 10000000,
+            "partial_rotary_factor": 0.25,
+            "type": "default"
+        },
+        "pad_token_id": null,
+        "bos_token_id": null,
+        "tie_word_embeddings": false,
+        "output_router_logits": false,
+        "partial_rotary_factor": 0.25
+    },
+    "tie_word_embeddings": false,
+    "transformers_version": "5.2.0",
+    "ts_config": {
+        "model_type": "interns2_preview_time_series",
+        "auto_map": {
+            "AutoConfig": "configuration_interns2_preview.InternS2PreviewTimeSeriesConfig",
+            "AutoModel": "modeling_interns2_preview.InternS2PreviewTimeSeriesModel"
+        },
+        "activation_dropout": 0.0,
+        "activation_function": "gelu",
+        "attention_dropout": 0.0,
+        "d_model": 768,
+        "dropout": 0.0,
+        "encoder_attention_heads": 8,
+        "encoder_ffn_dim": 3072,
+        "encoder_layerdrop": 0.0,
+        "encoder_layers": 17,
+        "max_source_positions": 1500,
+        "num_mel_bins": 80,
+        "out_hidden_size": 2048,
+        "scale_embedding": false,
+        "ts_adapt_in_dim": 256,
+        "ts_adapt_out_dim": 1024,
+        "ts_hidden_dim": 1024
+    },
+    "ts_end_id": 248092,
+    "ts_start_id": 248091,
+    "ts_token_id": 248093,
+    "video_token_id": 248057,
+    "vision_end_token_id": 248054,
+    "vision_start_token_id": 248053
+}

configuration_interns2_preview.py ADDED Viewed

	@@ -0,0 +1,434 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/interns2_preview/modular_interns2_preview.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_interns2_preview.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2026 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.configuration_utils import PreTrainedConfig, layer_type_validation
+from transformers.modeling_rope_utils import RopeParameters
+class InternS2PreviewVisionConfig(PreTrainedConfig):
+    r"""
+    Configuration of the vision sub-model of InternS2-Preview.
+    Every constructor argument is stored unchanged as an attribute of the same name; any remaining keyword
+    arguments are forwarded to [`PreTrainedConfig`]. The composite [`InternS2PreviewConfig`] instantiates this class
+    for its `vision_config` entry.
+    NOTE(review): the descriptions below are inferred from the argument names only; the modeling code that reads
+    them is not part of this file, so confirm them there.
+    Args:
+        depth (`int`, *optional*, defaults to 27):
+            Presumably the number of blocks in the vision encoder.
+        hidden_size (`int`, *optional*, defaults to 1152):
+            Presumably the width of the vision encoder hidden states.
+        hidden_act (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            Name of the activation function.
+        intermediate_size (`int`, *optional*, defaults to 4304):
+            Presumably the width of the feed-forward layer inside each block.
+        num_heads (`int`, *optional*, defaults to 16):
+            Presumably the number of attention heads per block.
+        in_channels (`int`, *optional*, defaults to 3):
+            Presumably the number of input image channels.
+        patch_size (`int`, *optional*, defaults to 16):
+            Presumably the spatial patch size.
+        spatial_merge_size (`int`, *optional*, defaults to 2):
+            Presumably the spatial merge factor applied to patches.
+        temporal_patch_size (`int`, *optional*, defaults to 2):
+            Presumably the temporal patch size used for video frames.
+        out_hidden_size (`int`, *optional*, defaults to 3584):
+            Presumably the width of the features handed to the language model.
+        num_position_embeddings (`int`, *optional*, defaults to 2304):
+            Presumably the size of the learned position-embedding table.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            Presumably the standard deviation used for weight initialization.
+    """
+    model_type = "intern_s2_preview"
+    # Key under which this config is nested inside the composite config.
+    base_config_key = "vision_config"
+    def __init__(
+        self,
+        depth=27,
+        hidden_size=1152,
+        hidden_act="gelu_pytorch_tanh",
+        intermediate_size=4304,
+        num_heads=16,
+        in_channels=3,
+        patch_size=16,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=3584,
+        num_position_embeddings=2304,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        # The base class consumes the generic kwargs first; the vision-specific fields are set afterwards.
+        super().__init__(**kwargs)
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.num_position_embeddings = num_position_embeddings
+        self.initializer_range = initializer_range
+class InternS2PreviewTextConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternS2PreviewTextModel`]. It is used to instantiate a
+    Qwen3.5-MoE model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of
+    Qwen3.5-35B-A3B-Instruct [Qwen/Qwen3.5-35B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-Instruct).
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 248320):
+            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
+            `inputs_ids`.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf).
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_parameters (`RopeParameters`, *optional*):
+            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
+            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
+            with longer `max_position_embeddings`.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        head_dim (`int`, *optional*, defaults to 256):
+            Projection weights dimension in multi-head attention.
+        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
+            Kernel size of the convolution used in linear attention layers.
+        linear_key_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each key head in linear attention.
+        linear_value_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each value head in linear attention.
+        linear_num_key_heads (`int`, *optional*, defaults to 16):
+            Number of key heads used in linear attention layers.
+        linear_num_value_heads (`int`, *optional*, defaults to 32):
+            Number of value heads used in linear attention layers.
+        moe_intermediate_size (`int`, *optional*, defaults to 512):
+            Intermediate size of the routed expert.
+        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
+            Intermediate size of the shared expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 256):
+            Number of routed experts.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+        layer_types (`list[str]`, *optional*):
+            Types of each layer (attention or linear). When `None`, the list is generated: every
+            `full_attention_interval`-th layer is `"full_attention"` and all other layers are `"linear_attention"`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*):
+            End of stream token id.
+        **kwargs:
+            Forwarded to [`PreTrainedConfig`]. Two entries are also handled here: `full_attention_interval`
+            (read with a fallback of 4) drives the generated `layer_types`, and `partial_rotary_factor` is set to
+            0.25 when the caller does not provide it. `ignore_keys_at_rope_validation` is always overwritten.
+    ```python
+    >>> from transformers import InternS2PreviewTextModel, InternS2PreviewTextConfig
+    >>> # Initializing a Qwen3.5-MoE style configuration
+    >>> configuration =  InternS2PreviewTextConfig()
+    >>> # Initializing a model from the Qwen3.5-35B-A3B style configuration
+    >>> model = InternS2PreviewTextModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+    # NOTE: `model_type` is kept as `qwen3_5_moe_text` because transformers hardcodes weight-renaming logic keyed
+    # on model_type (e.g. `model_dtype`); reusing the parent's value ensures correct weight loading via
+    # `AutoModelForCausalLM.from_pretrained`.
+    # NOTE(review): the `model_dtype` example above looks like a typo for `model_type` — confirm what was meant.
+    model_type = "qwen3_5_moe_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Tensor-parallel sharding style per parameter-name pattern.
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.experts.gate_up_proj": "packed_colwise",
+        "layers.*.mlp.experts.down_proj": "rowwise",
+        "layers.*.mlp.shared_expert.gate_proj": "colwise",
+        "layers.*.mlp.shared_expert.up_proj": "colwise",
+        "layers.*.mlp.shared_expert.down_proj": "rowwise",
+    }
+    # Pipeline-parallel plan: for each sub-module, the names of its inputs and of its outputs.
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    # Key under which this config is nested inside the composite config.
+    base_config_key = "text_config"
+    def __init__(
+        self,
+        vocab_size=248320,
+        hidden_size=2048,
+        num_hidden_layers=40,
+        num_attention_heads=16,
+        num_key_value_heads=2,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        head_dim=256,
+        linear_conv_kernel_dim=4,
+        linear_key_head_dim=128,
+        linear_value_head_dim=128,
+        linear_num_key_heads=16,
+        linear_num_value_heads=32,
+        moe_intermediate_size=512,
+        shared_expert_intermediate_size=512,
+        num_experts_per_tok=8,
+        num_experts=256,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        layer_types=None,
+        pad_token_id: int | None = None,
+        bos_token_id: int | None = None,
+        eos_token_id: int | None = None,
+        **kwargs,
+    ):
+        # Always overwritten, even if the caller supplied a value. Presumably tells the base-class RoPE validation
+        # to skip the mRoPE-specific keys — confirm against `PreTrainedConfig`.
+        kwargs["ignore_keys_at_rope_validation"] = {"mrope_section", "mrope_interleaved"}
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.head_dim = head_dim
+        self.rope_parameters = rope_parameters
+        kwargs.setdefault("partial_rotary_factor", 0.25)  # assign default for BC
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            # `full_attention_interval` is only read here, not removed, so it still reaches the base class below.
+            interval_pattern = kwargs.get("full_attention_interval", 4)
+            # Layers whose 1-based index is a multiple of the interval use full attention; the rest use linear.
+            self.layer_types = [
+                "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        # Presumably rejects unknown layer types or a length that differs from `num_hidden_layers` — confirm.
+        layer_type_validation(self.layer_types, self.num_hidden_layers)
+        # linear attention part
+        self.linear_conv_kernel_dim = linear_conv_kernel_dim
+        self.linear_key_head_dim = linear_key_head_dim
+        self.linear_value_head_dim = linear_value_head_dim
+        self.linear_num_key_heads = linear_num_key_heads
+        self.linear_num_value_heads = linear_num_value_heads
+        self.moe_intermediate_size = moe_intermediate_size
+        self.shared_expert_intermediate_size = shared_expert_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        # The base class runs last, after every field above and the kwargs adjustments are in place.
+        super().__init__(**kwargs)
+class InternS2PreviewTimeSeriesConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternS2PreviewTimeSeriesModel`]. It is used to instantiate a
+    InternS2PreviewTimeSeries model according to the specified arguments, defining the model architecture.
+    Args:
+        ts_adapt_in_dim (`int`, *optional*, defaults to 256):
+            The input dimension of the time series adapter.
+        ts_adapt_out_dim (`int`, *optional*, defaults to 1024):
+            The output dimension of the time series adapter.
+        ts_hidden_dim (`int`, *optional*, defaults to 1024):
+            The hidden dimension of the time series model.
+        ts_cnn_channels (`list[int]`, *optional*, defaults to [1, 32, 64, 128, 128]):
+            The channels of the time series CNN.
+        ts_cnn_kernel_sizes (`list[int]`, *optional*, defaults to [3, 5, 5, 5]):
+            The kernel sizes of the time series CNN.
+        ts_cnn_strides (`list[int]`, *optional*, defaults to [2, 4, 4, 5]):
+            The strides of the time series CNN.
+        ts_cnn_paddings (`list[int]`, *optional*, defaults to [1, 2, 2, 2]):
+            The paddings of the time series CNN.
+        ts_concat_subsampling_in_channels (`int`, *optional*, defaults to 128):
+            The input channels of the time series concat subsampling.
+        ts_concat_subsampling_concat_size (`int`, *optional*, defaults to 2):
+            The concat size of the time series concat subsampling.
+        **super_kwargs:
+            Additional keyword arguments passed along to the base class `WhisperConfig`.
+    """
+    model_type = "interns2_preview_time_series"
+    base_config_key = "ts_config"
+    def __init__(
+        self,
+        activation_dropout: float = 0.0,
+        activation_function: str = "gelu",
+        attention_dropout: float = 0.0,
+        d_model: int = 768,
+        dropout: float = 0.0,
+        encoder_attention_heads: int = 8,
+        encoder_ffn_dim: int = 3072,
+        encoder_layerdrop: float = 0.0,
+        encoder_layers: int = 17,
+        max_source_positions: int = 1500,
+        num_mel_bins: int = 80,
+        out_hidden_size: int = 2048,
+        scale_embedding: bool = False,
+        ts_adapt_in_dim: int = 256,
+        ts_adapt_out_dim: int = 1024,
+        ts_hidden_dim: int = 1024,
+        **super_kwargs,
+    ):
+        super().__init__(**super_kwargs)
+        self.auto_map = {
+            "AutoConfig": "configuration_interns2_preview.InternS2PreviewTimeSeriesConfig",
+            "AutoModel": "modeling_interns2_preview.InternS2PreviewTimeSeriesModel",
+        }
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.attention_dropout = attention_dropout
+        self.d_model = d_model
+        self.dropout = dropout
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layerdrop = encoder_layerdrop
+        self.encoder_layers = encoder_layers
+        self.max_source_positions = max_source_positions
+        self.num_mel_bins = num_mel_bins
+        self.out_hidden_size = out_hidden_size
+        self.scale_embedding = scale_embedding
+        self.ts_adapt_in_dim = ts_adapt_in_dim
+        self.ts_adapt_out_dim = ts_adapt_out_dim
+        self.ts_hidden_dim = ts_hidden_dim
+        assert self.ts_adapt_out_dim == self.ts_hidden_dim, "ts_adapt_out_dim should be equal to ts_hidden_dim"
+class InternS2PreviewConfig(PreTrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternS2PreviewModel`]. It is used to instantiate a
+    Qwen3.5-MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3.5-35B-A3B-Instruct [Qwen/Qwen3.5-35B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-Instruct).
+    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PreTrainedConfig`] for more information.
+    Args:
+        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3_5TextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `Qwen3_5VisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 248056):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 248057):
+            The video token index to encode the image prompt.
+        vision_start_token_id (`int`, *optional*, defaults to 248053):
+            The start token index to encode the image prompt.
+        vision_end_token_id (`int`, *optional*, defaults to 248054):
+            The end token index to encode the image prompt.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the word embeddings.
+    ```python
+    >>> from transformers import InternS2PreviewForConditionalGeneration, InternS2PreviewConfig
+    >>> # Initializing a Qwen3.5-MoE style configuration
+    >>> configuration = InternS2PreviewConfig()
+    >>> # Initializing a model from the Qwen3.5-35B-A3B style configuration
+    >>> model = InternS2PreviewForConditionalGeneration(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "intern_s2_preview"
+    sub_configs = {
+        "vision_config": InternS2PreviewVisionConfig,
+        "text_config": InternS2PreviewTextConfig,
+        "ts_config": InternS2PreviewTimeSeriesConfig,
+    }
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=248056,
+        video_token_id=248057,
+        vision_start_token_id=248053,
+        vision_end_token_id=248054,
+        tie_word_embeddings=False,
+        ts_config=None,
+        ts_token_id=248093,
+        ts_start_id=248091,
+        ts_end_id=248092,
+        **kwargs,
+    ):
+        if isinstance(ts_config, dict):
+            self.ts_config = self.sub_configs["ts_config"](**ts_config)
+        elif ts_config is None:
+            self.ts_config = self.sub_configs["ts_config"]()
+        self.ts_token_id = ts_token_id
+        self.ts_start_id = ts_start_id
+        self.ts_end_id = ts_end_id
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            self.text_config = self.sub_configs["text_config"]()
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+        self.tie_word_embeddings = tie_word_embeddings
+        super().__init__(**kwargs)
+        self.auto_map = {
+            "AutoConfig": "configuration_interns2_preview.InternS2PreviewConfig",
+            "AutoModelForCausalLM": "modeling_interns2_preview.InternS2PreviewForCausalLM",
+            "AutoModel": "modeling_interns2_preview.InternS2PreviewModel",
+            "AutoModelForImageTextToText": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration",
+            "AutoModelForMultimodalLM": "modeling_interns2_preview.InternS2PreviewForConditionalGeneration",
+        }
+        self.architectures = ["InternS2PreviewForConditionalGeneration"]
+# Names exported by a star-import of this module. The vision and time-series config classes defined above are not
+# listed, so they have to be imported by name.
+__all__ = ["InternS2PreviewConfig", "InternS2PreviewTextConfig"]

generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "bos_token_id": 248044,
+    "do_sample": true,
+    "eos_token_id": [
+        248046,
+        248044
+    ],
+    "pad_token_id": 248044,
+    "temperature": 1.0,
+    "top_k": 20,
+    "top_p": 0.95,
+    "transformers_version": "4.57.0.dev0"
+}

intern_s2_fp8_qwen3_5_moe.py ADDED Viewed

	@@ -0,0 +1,71 @@

+# Auto-generated adapter for Intern-S2-Preview-FP8 -> MLX.
+import mlx.core as mx
+from mlx_lm.models.qwen3_5_moe import Model as Qwen35MoeModel
+from mlx_lm.models.qwen3_5_moe import ModelArgs
+def _dequant_fp8_blockwise(weight, scale_inv, block_size=128):
+    dtype = mx.bfloat16
+    weight = mx.from_fp8(weight, dtype=dtype)
+    if scale_inv.ndim == 0:
+        return (weight * scale_inv).astype(dtype)
+    if weight.ndim == 2 and scale_inv.ndim == 2:
+        m, n = weight.shape
+        pad_m = (-m) % block_size
+        pad_n = (-n) % block_size
+        padded = mx.pad(weight, ((0, pad_m), (0, pad_n)))
+        padded = padded.reshape(
+            (m + pad_m) // block_size,
+            block_size,
+            (n + pad_n) // block_size,
+            block_size,
+        )
+        out = (padded * scale_inv[:, None, :, None]).reshape(m + pad_m, n + pad_n)
+        return out[:m, :n].astype(dtype)
+    if weight.ndim == 3 and scale_inv.ndim == 3:
+        e, m, n = weight.shape
+        pad_m = (-m) % block_size
+        pad_n = (-n) % block_size
+        padded = mx.pad(weight, ((0, 0), (0, pad_m), (0, pad_n)))
+        padded = padded.reshape(
+            e,
+            (m + pad_m) // block_size,
+            block_size,
+            (n + pad_n) // block_size,
+            block_size,
+        )
+        out = (padded * scale_inv[:, :, None, :, None]).reshape(
+            e, m + pad_m, n + pad_n
+        )
+        return out[:, :m, :n].astype(dtype)
+    return (weight * scale_inv).astype(dtype)
+class Model(Qwen35MoeModel):
+    def sanitize(self, weights):
+        filtered = {}
+        for key, value in weights.items():
+            if key.startswith("mtp."):
+                continue
+            if key.startswith("model.visual") or key.startswith("vision_tower"):
+                continue
+            filtered[key] = value
+        dequantized = {}
+        for key, value in filtered.items():
+            if key.endswith("_scale_inv"):
+                base_key = key[: -len("_scale_inv")]
+                if base_key in filtered:
+                    dequantized[base_key] = _dequant_fp8_blockwise(
+                        filtered[base_key], value
+                    )
+                continue
+            if key not in dequantized:
+                dequantized[key] = value
+        return super().sanitize(dequantized)

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_interns2_preview.py ADDED Viewed

The diff for this file is too large to render. See raw diff

processing_interns2_preview.py ADDED Viewed

	@@ -0,0 +1,423 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/interns2_preview/modular_interns2_preview.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_interns2_preview.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+import os
+import numpy as np
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers.utils import auto_docstring, logging
+from transformers.video_utils import VideoInput
+logger = logging.get_logger(__name__)
+class InternS2PreviewProcessorKwargs(ProcessingKwargs, total=False):
+    # Default keyword arguments, grouped per modality. Presumably merged with call-time kwargs by the
+    # `ProcessorMixin` machinery — confirm against the transformers processing utilities.
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            "return_token_type_ids": False,
+            "return_mm_token_type_ids": False,
+        },
+        "videos_kwargs": {"return_metadata": True},
+        # No defaults are set for the time-series modality.
+        "time_series_kwargs": {},
+    }
+@auto_docstring
+class InternS2PreviewProcessor(ProcessorMixin):
+    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
+        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
+        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
+        self.image_token_id = (
+            tokenizer.image_token_id
+            if getattr(tokenizer, "image_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.image_token)
+        )
+        self.video_token_id = (
+            tokenizer.video_token_id
+            if getattr(tokenizer, "video_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.video_token)
+        )
+        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
+        self.vision_start_token = (
+            "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
+        )
+        self.vision_end_token = (
+            "<|vision_end|>" if not hasattr(tokenizer, "vision_end_token") else tokenizer.vision_end_token
+        )
+        self.vision_start_token_id = (
+            tokenizer.vision_start_token_id
+            if getattr(tokenizer, "vision_start_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.vision_start_token)
+        )
+        self.vision_end_token_id = (
+            tokenizer.vision_end_token_id
+            if getattr(tokenizer, "vision_end_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.vision_end_token)
+        )
+        self.ts_token = "<TS_CONTEXT>" if not hasattr(tokenizer, "ts_token") else tokenizer.ts_token
+        self.ts_start_token = "<|ts|>" if not hasattr(tokenizer, "ts_start_token") else tokenizer.ts_start_token
+        self.ts_end_token = "<|/ts|>" if not hasattr(tokenizer, "ts_end_token") else tokenizer.ts_end_token
+        self.ts_start_token_id = (
+            tokenizer.ts_start_token_id
+            if getattr(tokenizer, "ts_start_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.ts_start_token)
+        )
+        self.ts_end_token_id = (
+            tokenizer.ts_end_token_id
+            if getattr(tokenizer, "ts_end_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.ts_end_token)
+        )
+        self.ts_token_id = (
+            tokenizer.ts_token_id
+            if getattr(tokenizer, "ts_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.ts_token)
+        )
+    @auto_docstring
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
+        videos: VideoInput = None,
+        time_series_paths: list[str] = None,
+        time_series_sampling_rates: list[int] = None,
+        **kwargs: Unpack[InternS2PreviewProcessorKwargs],
+    ) -> BatchFeature:
+        r"""
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **ts_values** -- List of time series values to be fed to a model. Returned when `time_series_paths` is not `None`.
+            - **ts_sr** -- List of time series sampling rates to be fed to a model. Returned when `time_series_sampling_rates` is not `None`.
+            - **ts_lens** -- List of time series lengths to be fed to a model. Returned when `time_series_paths` is not `None`.
+            - **num_ts_tokens** -- List of number of time series tokens to be fed to a model. Returned when `time_series_paths` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
+            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
+            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            InternS2PreviewProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+            image_grid_thw = image_inputs["image_grid_thw"]
+        else:
+            image_inputs = {}
+            image_grid_thw = None
+        if videos is not None:
+            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
+            video_grid_thw = videos_inputs["video_grid_thw"]
+            # If user has not requested video metadata, pop it
+            if not kwargs.get("return_metadata"):
+                video_metadata = videos_inputs.pop("video_metadata")
+            else:
+                video_metadata = videos_inputs["video_metadata"]
+        else:
+            videos_inputs = {}
+            video_grid_thw = None
+        if not isinstance(text, list):
+            text = [text]
+        text = text.copy()  # below lines change text in-place
+        if time_series_paths is not None:
+            assert time_series_sampling_rates is not None, (
+                "If time_series_signals is provided, time_series_sampling_rates must also be provided."
+            )
+            assert len(time_series_paths) == len(time_series_sampling_rates), (
+                "The number of time series signals must match the number of sampling rates."
+            )
+            time_series_inputs = self.time_series_processor(
+                ts_paths=time_series_paths, sampling_rates=time_series_sampling_rates
+            )
+            num_ts_tokens = time_series_inputs.pop("num_ts_tokens")
+            assert len(num_ts_tokens) == len(text), (
+                "The number of time series signals must match the number of text prompts."
+            )
+            for i in range(len(text)):
+                if f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}" in text[i]:
+                    ts_placeholder = self.ts_start_token + self.ts_token * num_ts_tokens[i] + self.ts_end_token
+                    text[i] = text[i].replace(
+                        f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}", ts_placeholder, 1
+                    )
+                elif self.ts_token in text[i]:
+                    text[i] = text[i].replace(self.ts_token, self.ts_token * num_ts_tokens[i])
+        else:
+            time_series_inputs = {}
+        if image_grid_thw is not None:
+            merge_length = self.image_processor.merge_size**2
+            index = 0
+            for i in range(len(text)):
+                while self.image_token in text[i]:
+                    num_image_tokens = image_grid_thw[index].prod() // merge_length
+                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
+                    index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.image_token)
+        if video_grid_thw is not None:
+            merge_length = self.video_processor.merge_size**2
+            index = 0
+            for i in range(len(text)):
+                while self.video_token in text[i]:
+                    metadata = video_metadata[index]
+                    if metadata.fps is None:
+                        logger.warning_once(
+                            "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
+                            "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
+                            "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
+                        )
+                        metadata.fps = 24 if metadata.fps is None else metadata.fps
+                    # if timestamps are not provided, calculate them
+                    curr_timestamp = self._calculate_timestamps(
+                        metadata.frames_indices,
+                        metadata.fps,
+                        self.video_processor.temporal_patch_size,
+                    )
+                    video_placeholder = ""
+                    frame_seqlen = video_grid_thw[index][1:].prod() // merge_length
+                    for frame_idx in range(video_grid_thw[index][0]):
+                        curr_time = curr_timestamp[frame_idx]
+                        video_placeholder += f"<{curr_time:.1f} seconds>"
+                        video_placeholder += (
+                            self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
+                        )
+                    if f"{self.vision_start_token}{self.video_token}{self.vision_end_token}" in text[i]:
+                        text[i] = text[i].replace(
+                            f"{self.vision_start_token}{self.video_token}{self.vision_end_token}", video_placeholder, 1
+                        )
+                    else:
+                        # vllm may input video token directly
+                        text[i] = text[i].replace(self.video_token, video_placeholder, 1)
+                    index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.video_token)
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video", "ts"])
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+        return BatchFeature(
+            data={**text_inputs, **image_inputs, **videos_inputs, **time_series_inputs}, tensor_type=return_tensors
+        )
+    def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+            video_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (num_frames, height, width) per each video.
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+        vision_data = {}
+        if image_sizes is not None:
+            images_kwargs = InternS2PreviewProcessorKwargs._defaults.get("images_kwargs", {})
+            images_kwargs.update(kwargs)
+            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
+            num_image_patches = [
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
+                for image_size in image_sizes
+            ]
+            num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+        if video_sizes is not None:
+            videos_kwargs = InternS2PreviewProcessorKwargs._defaults.get("videos_kwargs", {})
+            videos_kwargs.update(kwargs)
+            num_video_patches = [
+                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
+                for video_size in video_sizes
+            ]
+            num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
+            vision_data["num_video_tokens"] = num_video_tokens
+        return MultiModalData(**vision_data)
+    def post_process_image_text_to_text(
+        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
+    ):
+        """
+        Post-process the output of the model to decode the text.
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+            skip_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
+            **kwargs:
+                Additional arguments to be passed to the tokenizer's `batch_decode method`.
+        Returns:
+            `list[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(
+            generated_outputs,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+    def _calculate_timestamps(self, indices: list[int] | np.ndarray, video_fps: float, merge_size: int = 2):
+        if not isinstance(indices, list):
+            indices = indices.tolist()
+        if len(indices) % merge_size != 0:
+            indices.extend(indices[-1] for _ in range(merge_size - len(indices) % merge_size))
+        timestamps = [idx / video_fps for idx in indices]
+        # @JJJYmmm frames are merged by self.merge_size, \
+        # so we need to average the timestamps between the first/last frame within the temporal patch
+        timestamps = [
+            (timestamps[i] + timestamps[i + merge_size - 1]) / 2 for i in range(0, len(timestamps), merge_size)
+        ]
+        return timestamps
+    def time_series_preprocessor(self, conversation):
+        if isinstance(conversation, (list, tuple)) and (
+            isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
+        ):
+            conversations = conversation
+        else:
+            conversations = [conversation]
+        batch_time_series = []
+        batch_time_series_metadata = []
+        for conversation in conversations:
+            for message in conversation:
+                if message["role"] != "user":
+                    continue
+                time_series_fnames = [
+                    content["data"]
+                    for content in message["content"]
+                    if content.get("type") == "time_series" and "data" in content
+                ]
+                time_series_rates = [
+                    content.get("sampling_rate", None)
+                    for content in message["content"]
+                    if content.get("type") == "time_series"
+                ]
+                for path, rate in zip(time_series_fnames, time_series_rates):
+                    batch_time_series.append(path)
+                    batch_time_series_metadata.append(rate)
+        return {
+            "time_series_paths": batch_time_series or None,
+            "time_series_sampling_rates": batch_time_series_metadata or None,
+        }
+    def time_series_processor(
+        self,
+        ts_paths: list[str],
+        sampling_rates: list[float],
+        do_normalize=True,
+        do_truncate=True,
+    ) -> BatchFeature:
+        pd = importlib.import_module("pandas")
+        sf = importlib.import_module("soundfile")
+        assert len(ts_paths) == len(sampling_rates), "ts_paths and sampling_rates must have the same length"
+        ts_values = []
+        ts_sr = []
+        ts_lens = []
+        for idx, ts_path in enumerate(ts_paths):
+            sr = sampling_rates[idx]
+            ext = os.path.splitext(ts_path)[-1].lower()
+            if ext in [".wav", ".mp3", ".flac"]:
+                ts_input, sr = sf.read(ts_path)  # ts_input: np.ndarray, shape [T] or [T, C]
+            elif ext == ".csv":
+                df = pd.read_csv(ts_path, header=None)
+                ts_input = df.values  # [T, C]
+            elif ext == ".npy":
+                ts_input = np.load(ts_path)  # [T, C]
+            else:
+                raise ValueError(f"Unsupported file format: {ext}")
+            if not isinstance(ts_input, np.ndarray):
+                ts_input = np.array(ts_input, dtype=np.float32)
+            if do_normalize:
+                mean = ts_input.mean(axis=0, keepdims=True)
+                std = ts_input.std(axis=0, keepdims=True)
+                ts_input = (ts_input - mean) / (std + 1e-8)
+            if do_truncate and len(ts_input) > 240000:
+                ts_input = ts_input[:240000]  # truncate to 240k to avoid oom
+            if ts_input.ndim == 1:
+                ts_input = ts_input[:, None]  # [T,C]
+            ts_len = ts_input.shape[0]
+            if sr is None or sr == 0:  # if no sr provided
+                sr = ts_len / 4
+            ts_values.append(ts_input)
+            ts_sr.append(sr)
+            ts_lens.append(ts_len)
+        ts_lens = np.array(ts_lens)
+        ts_sr = np.array(ts_sr)
+        num_ts_tokens = self._get_num_ts_tokens(sampling_rates=ts_sr, ts_lens=ts_lens)
+        return BatchFeature(
+            data={"ts_values": ts_values, "ts_sr": ts_sr, "ts_lens": ts_lens, "num_ts_tokens": num_ts_tokens}
+        )
+    def _get_num_ts_tokens(self, sampling_rates, ts_lens):
+        strides = np.floor(160 / ((1 + np.exp(-sampling_rates / 100)) ** 6))
+        patch_sizes = strides * 2
+        embed_lengths = (np.ceil((ts_lens - patch_sizes) / strides) + 1).astype(np.int64)
+        num_ts_tokens = [(embed_length // 2 + 1) // 2 for embed_length in embed_lengths]
+        return num_ts_tokens
+__all__ = ["InternS2PreviewProcessor"]

tokenization_interns1.py ADDED Viewed

	@@ -0,0 +1,1009 @@

+# coding=utf-8
+# Copyright 2025 The Intern team and Shanghai AI Lab team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for InternS1."""
+import json
+import os
+import unicodedata
+from abc import ABC, abstractmethod
+from typing import Optional, Union
+from functools import lru_cache
+import regex as re
+import sentencepiece as spm
+from transformers.tokenization_utils_base import AddedToken, TextInput
+from transformers.utils import logging
+from packaging import version
+import transformers
+# `PreTrainedTokenizer` is imported from a different module depending on the installed transformers version.
+if version.parse(transformers.__version__) >= version.parse("5.0.0"):
+    from transformers.tokenization_python import PreTrainedTokenizer
+else:
+    from transformers.tokenization_utils import PreTrainedTokenizer
+logger = logging.get_logger(__name__)
+# RDKit is optional. `RDKIT_AVAILABLE` records whether the import succeeded; `SmilesCheckModule.check_legitimacy`
+# uses it to choose between RDKit parsing and the rule-based check.
+try:
+    from rdkit import Chem, RDLogger
+    # Turn off RDKit logging for the "rdApp.error" and "rdApp.*" log names.
+    RDLogger.DisableLog("rdApp.error")
+    RDLogger.DisableLog("rdApp.*")
+    RDKIT_AVAILABLE = True
+except ImportError:
+    logger.warning_once(
+        "If tokenization with SMILES formula is of necessity, please 'pip install RDKit' for better tokenization quality."
+    )
+    RDKIT_AVAILABLE = False
+# File names of the tokenizer resources. The `sp_model_*` entries are presumably SentencePiece models
+# (`sentencepiece` is imported above) -- their loading code is outside this view.
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+    "sp_model_SMILES": "tokenizer_SMILES.model",
+    "sp_model_PROT": "tokenizer_PROT.model",
+    "sp_model_XNA": "tokenizer_XNA.model",
+}
+# Pre-tokenization pattern. It uses `\p{L}` / `\p{N}` Unicode property classes; `re` in this module is the
+# third-party `regex` package (see the imports).
+PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+class InternS1CheckModuleMixin(ABC):
+    """
+    Basic auto-detection module.
+    Note that short strings are ignored by this module.
+    """
+    def __init__(self, *, min_length: int):
+        self.min_length = min_length
+        self.REGEX = self._build_regex()
+        self.all_auto_detect_token_start = ["<SMILES_AUTO_DETECT>", "<PROT_AUTO_DETECT>", "<XNA_AUTO_DETECT>"]
+        self.all_auto_detect_token_end = ["</SMILES_AUTO_DETECT>", "</PROT_AUTO_DETECT>", "</XNA_AUTO_DETECT>"]
+        self.auto_detect_token = []
+        self.truncation = False
+    @abstractmethod
+    def _build_regex(self):
+        pass
+    @abstractmethod
+    def check_legitimacy(self, candidate: str) -> bool:
+        pass
+    def re_split(self, texts: Union[str, list[str]]) -> list[str]:
+        if isinstance(texts, str):
+            texts = [texts]
+        total_results = []
+        no_split_flag = 0
+        for text in texts:
+            if text in self.all_auto_detect_token_start:
+                total_results.append(text)
+                no_split_flag += 1
+                continue
+            elif text in self.all_auto_detect_token_end:
+                total_results.append(text)
+                no_split_flag = max(0, no_split_flag - 1)
+                continue
+            if no_split_flag > 0:
+                total_results.append(text)
+                continue
+            results = []
+            current_pos = 0
+            for match in self.REGEX.finditer(text):
+                candidate = match.group(1)
+                if len(candidate) >= self.min_length:
+                    match_start, match_end = match.span(1)
+                    if not self.check_legitimacy(candidate):
+                        continue
+                    if not self.truncation:
+                        if match_start > 0 and text[match_start - 1].encode("UTF-8").isalpha():
+                            continue
+                        if match_end < len(text) and text[match_end].encode("UTF-8").isalpha():
+                            continue
+                    if match_start > current_pos:
+                        non_candidate_part = text[current_pos:match_start]
+                        results.append(non_candidate_part)
+                else:
+                    continue
+                results.extend([self.auto_detect_token[0], candidate, self.auto_detect_token[1]])
+                current_pos = match_end
+            if current_pos < len(text):
+                remaining_part = text[current_pos:]
+                results.append(remaining_part)
+            total_results.extend(results)
+        return total_results
+class XnaCheckModule(InternS1CheckModuleMixin):
+    """
+    XNA sequence auto-detection module.
+    Automatically detects XNA sequence using regex patterns.
+    """
+    def __init__(self, *, min_length: int = 27):
+        super().__init__(min_length=min_length)
+        self.auto_detect_token = ["<XNA_AUTO_DETECT>", "</XNA_AUTO_DETECT>"]
+        self.truncation = True
+    def _build_regex(self):
+        return re.compile(r"([ATCGU]{" + str(self.min_length) + r",})")
+    def check_legitimacy(self, candidate: str):
+        return True
+class ProtCheckModule(InternS1CheckModuleMixin):
+    """
+    Protein sequence auto-detection module.
+    Automatically detects protein sequence using regex patterns.
+    """
+    def __init__(self, *, min_length: int = 27):
+        super().__init__(min_length=min_length)
+        self.auto_detect_token = ["<PROT_AUTO_DETECT>", "</PROT_AUTO_DETECT>"]
+        self.truncation = True
+        self._xna_pattern = re.compile(r"^[ATCGU]+$")
+    def _build_regex(self):
+        return re.compile(r"([A-Z]{" + str(self.min_length) + r",})")
+    def check_legitimacy(self, candidate: str):
+        if self._xna_pattern.match(candidate):
+            return False
+        return True
+# Symbol tables used by the rule-based checks in `SmilesCheckModule`.
+# fmt: off
+# Characters that may not start or end a candidate, nor appear twice in a row (see `check_rings_and_brackets`).
+bonds = ["-", "=", "#", ":", "/", "\\", ".", "$"]
+# Atom symbols accepted outside square brackets.
+organic_symbols = ["B", "C", "N", "O", "P", "S", "F", "Cl", "Br", "I"]
+# Non-atom characters accepted outside square brackets.
+other_allows = bonds + ["[", "]", "(", ")", ";"]
+# Lowercase atom symbols accepted outside square brackets.
+aromatic_symbols = ["b", "c", "n", "o", "s", "p"]
+# Element symbols consulted by `check_brackets` for the start of a bracket's content.
+elements = [
+    "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
+    "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca",
+    "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
+    "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr",
+    "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
+    "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
+    "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
+    "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
+    "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th",
+    "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
+    "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
+    "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
+]
+# fmt: on
+class SmilesCheckModule(InternS1CheckModuleMixin):
+    """
+    SMILES molecular sequence auto-detection module.
+    Automatically detects and validates SMILES strings in text using regex patterns
+    or chemical syntax rules. Uses RDKit for precise validation when available,
+    otherwise falls back to rule-based validation.
+    """
+    def __init__(self, *, min_length: int = 10):
+        super().__init__(min_length=min_length)
+        self.auto_detect_token = ["<SMILES_AUTO_DETECT>", "</SMILES_AUTO_DETECT>"]
+        self._SQ_BRACKET_BAN_1 = re.compile(r"(?:[A-GI-Z]|[a-z]){3,}")
+        self._SQ_BRACKET_BAN_2 = re.compile(r"\d{4,}")
+    def _build_regex(self):
+        # fmt: off
+        _two_letter_elements = [
+            'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'Ba', 'Be', 'Bh', 'Bi', 'Bk', 'Br', 'Ca', 'Cd',
+            'Ce', 'Cf', 'Cl', 'Cm', 'Cn', 'Co', 'Cr', 'Cs', 'Cu', 'Db', 'Ds', 'Dy', 'Er', 'Es', 'Eu', 'Fe',
+            'Fl', 'Fm', 'Fr', 'Ga', 'Gd', 'Ge', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'In', 'Ir', 'Kr', 'La', 'Li',
+            'Lr', 'Lu', 'Lv', 'Mc', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'Na', 'Nb', 'Nd', 'Ne', 'Nh', 'Ni', 'No',
+            'Np', 'Og', 'Os', 'Pa', 'Pb', 'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg',
+            'Rh', 'Rn', 'Ru', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Tc', 'Te', 'Th',
+            'Ti', 'Tl', 'Tm', 'Ts', 'Xe', 'Yb', 'Zn', 'Zr'
+        ]
+        _single_letter_elements = [
+            "B", "C", "F", "H", "I", "K", "N", "O", "P", "S", "U", "V", "W", "Y", 'b', 'c', 'n', 'o', 'p', 's'
+        ]
+        # fmt: on
+        all_elements_sorted = sorted(_two_letter_elements + _single_letter_elements, key=lambda x: (-len(x), x))
+        elements_pattern_str = "|".join(all_elements_sorted)
+        bracket_atom_pattern_str = r"\[[^\]]+\]"
+        other_single_chars_pattern_str = r"[\(\)\.=\-#@\d\$\%\*:\+\-\/\\]"
+        smiles_unit_pattern = (
+            r"(?:"
+            + bracket_atom_pattern_str
+            + r"|"
+            + elements_pattern_str
+            + r"|"
+            + other_single_chars_pattern_str
+            + r")"
+        )
+        core_sequence_pattern = rf"(?>{smiles_unit_pattern}){{10,}}"
+        constrained_core_sequence_pattern = rf"(?![:.=]){core_sequence_pattern}(?<![:.=])"
+        final_regex_str = rf"({constrained_core_sequence_pattern})"
+        COMPILED_REGEX = re.compile(final_regex_str)
+        return COMPILED_REGEX
+    def check_legitimacy_slow(self, candidate: str) -> bool:
+        """Check legitimacy with RDKit"""
+        if sum(1 for char in candidate if char.encode("UTF-8").isalpha()) < 5:
+            return False
+        mol = Chem.MolFromSmiles(candidate)
+        if mol is None:
+            return False
+        else:
+            return True
+    def check_legitimacy_fast(self, candidate: str) -> bool:
+        """Check legitimacy with hard rules"""
+        if sum(1 for char in candidate if char.encode("UTF-8").isalpha()) < 5:
+            return False
+        if not self.check_rings_and_brackets(candidate):
+            return False
+        else:
+            return True
+    def check_legitimacy(self, candidate: str) -> bool:
+        if RDKIT_AVAILABLE:
+            return self.check_legitimacy_slow(candidate)
+        else:
+            return self.check_legitimacy_fast(candidate)
+    def check_brackets(self, text):
+        matches = re.findall(r"\[([^\[\]]*)\]", text)
+        for part in matches:
+            if "(" in part or ")" in part:
+                return False
+            if len(part) == 0:
+                return False
+            if part[0] in elements or part[0] in aromatic_symbols or part[:2] in elements:
+                return True
+        return True
+    def check_rings_and_brackets(self, text):
+        rings = {}
+        left_sq_bracket, right_sq_bracket = 0, 0
+        left_pt_bracket, right_pt_bracket = 0, 0
+        all_lower = True
+        digits_cnt = 0
+        pos = 0
+        while pos < len(text):
+            step = 0
+            c = text[pos]
+            if ord(c) >= 65 and ord(c) <= 90:
+                all_lower = False
+            if (pos == len(text) - 1 or pos == 0) and c in bonds:
+                return False
+            if pos > 0 and text[pos - 1] in bonds and text[pos] in bonds:
+                return False
+            if c == "[":
+                step = 1
+                left_sq_bracket += 1
+                if left_sq_bracket > right_sq_bracket + 1:
+                    return False
+                if pos == len(text) - 1:
+                    return False
+                if "]" not in text[pos + 1 :]:
+                    return False
+                bracket_span = text[pos + 1 : text.find("]")]
+                if self._SQ_BRACKET_BAN_1.search(bracket_span) or self._SQ_BRACKET_BAN_2.search(bracket_span):
+                    return False
+                matches = re.findall(r"\d+", bracket_span)
+                if len(matches) > 2:
+                    return False
+            if c == "]":
+                step = 1
+                right_sq_bracket += 1
+                if right_sq_bracket > left_sq_bracket:
+                    return False
+            if c == "(":
+                step = 1
+                left_pt_bracket += 1
+            if c == ")":
+                step = 1
+                right_pt_bracket += 1
+                if right_pt_bracket > left_pt_bracket:
+                    return False
+            if left_sq_bracket == right_sq_bracket:
+                if c.isdigit():
+                    digits_cnt += 1
+                    step = 1
+                    if (
+                        pos == 0
+                        or (pos == 1 and text[pos - 1] != "%")
+                        or (pos > 1 and text[pos - 1] != "%" and text[pos - 2] != "%")
+                    ):
+                        if c in rings:
+                            if rings[c] == "unclosed":
+                                rings[c] = "closed"
+                            else:
+                                rings[c] = "unclosed"
+                        else:
+                            rings[c] = "unclosed"
+                if c == "%":
+                    if pos >= len(text) - 2 or not text[pos + 1].isdigit() or not text[pos + 2].isdigit():
+                        return False
+                    step = 3
+                    digits_cnt += 1
+                    num = text[pos + 1 : pos + 3]
+                    if num in rings:
+                        if rings[num] == "unclosed":
+                            rings[num] = "closed"
+                        else:
+                            rings[num] = "unclosed"
+                    else:
+                        rings[num] = "unclosed"
+                if step == 0:
+                    if (
+                        pos < len(text) - 1
+                        and text[pos : pos + 2] in organic_symbols + aromatic_symbols + other_allows
+                    ):
+                        step = 2
+                    elif c in organic_symbols + aromatic_symbols + other_allows:
+                        step = 1
+                    else:
+                        return False
+            if step == 0:
+                step = 1
+            pos += step
+        if left_sq_bracket != right_sq_bracket or any(v == "unclosed" for v in rings.values()):
+            return False
+        if all_lower and digits_cnt < 2:
+            return False
+        return self.check_brackets(text)
+@lru_cache
+# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
+    characters the bpe code barfs on.
+    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
+    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
+    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
+    tables between utf-8 bytes and unicode strings.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word.
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+# @requires(backends=("sentencepiece",))
+class InternS1Tokenizer(PreTrainedTokenizer):
+    """
+    Construct an InternS1 tokenizer. Based on byte-level Byte-Pair-Encoding.
+    Like GPT2Tokenizer, this tokenizer has been trained to treat spaces as parts of the tokens, so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+    ```python
+    >>> from transformers import AutoTokenizer
+    >>> tokenizer = AutoTokenizer.from_pretrained("InternS1Tokenizer", trust_remote_code=True)
+    >>> tokenizer("Hello world")["input_ids"]
+    [9707, 1879]
+    >>> tokenizer(" Hello world")["input_ids"]
+    [21927, 1879]
+    ```
+    This is expected.
+    Include custom extension to support better domain-specific text tokenization, leveraging a separately trained tokenizer model.
+    ```python
+    >>> from transformers import AutoTokenizer
+    >>> tokenizer = AutoTokenizer.from_pretrained("InternS1Tokenizer", trust_remote_code=True)
+    >>> tokenizer.tokenize("Describe <SMILES>C1=CC=C(C=C1)C=O</SMILES> and CC1=CC=CC=C1C=O")
+    ["Describe ", "<SMILES>", "C1=CC=C(C=C1)C=O", "</SMILES>", " and ", "<SMILES_AUTO_DETECT>",
+        "CC1=CC=CC=C1C=O", "</SMILES_AUTO_DETECT>"]
+    >>> token_ids = tokenizer("Describe <SMILES>C1=CC=C(C=C1)C=O</SMILES> and CC1=CC=CC=C1C=O")["input_ids"]
+    >>> token_ids
+    [74785, 220, 151925, 151854, 151860, 151698, 151707, 151860, 151690, 151726, 151926, 323, 220, 151672, 151860, 151701, 151860, 151854, 151726]
+    >>> tokenizer.convert_ids_to_tokens(token_ids)
+    ['Describe', 'Ġ', '<SMILES>', 'C', '1', '=CC=C(', 'C=C', '1', ')C', '=O', '</SMILES>', 'Ġand', 'Ġ', 'CC', '1', '=CC=CC=C', '1', 'C', '=O']
+    ```
+    Users should refer to this superclass [`PreTrainedTokenizer`] for more information regarding those overloaded methods
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*):
+            The beginning of sequence token. Not applicable for this tokenizer.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
+            tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
+        split_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not the special tokens should be split during the tokenization process. The default behavior is
+            to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then
+            `tokenizer.tokenize("<|endoftext|>") = ['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`, then
+            `tokenizer.tokenize("<|endoftext|>")` will give `['<', '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is
+            only supported for `slow` tokenizers for the moment.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        unk_token="<|endoftext|>",
+        bos_token=None,
+        eos_token="<|endoftext|>",
+        pad_token="<|endoftext|>",
+        clean_up_tokenization_spaces=False,
+        split_special_tokens=False,
+        special_tokens_pattern="none",
+        **kwargs,
+    ):
+        bos_token = (
+            AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(bos_token, str)
+            else bos_token
+        )
+        eos_token = (
+            AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(eos_token, str)
+            else eos_token
+        )
+        unk_token = (
+            AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(unk_token, str)
+            else unk_token
+        )
+        pad_token = (
+            AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(pad_token, str)
+            else pad_token
+        )
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        bpe_merges = []
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            for i, line in enumerate(merges_handle):
+                line = line.strip()
+                if (i == 0 and line.startswith("#version:")) or not line:
+                    continue
+                bpe_merges.append(tuple(line.split()))
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        # NOTE: the cache can grow without bound and will get really large for long running processes
+        # (esp. for texts of language that do not use space between word, e.g. Chinese); technically
+        # not a memory leak but appears as one.
+        # GPT2Tokenizer has the same problem, so let's be consistent.
+        self.cache = {}
+        self.pat = re.compile(PRETOKENIZE_REGEX)
+        if kwargs.get("add_prefix_space", False):
+            logger.warning_once(
+                f"{self.__class__.__name} does not support `add_prefix_space`, setting it to True has no effect."
+            )
+        super().__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            split_special_tokens=split_special_tokens,
+            special_tokens_pattern=special_tokens_pattern,
+            **kwargs,
+        )
+        self.prepare_extra_tokenizers(vocab_file)
+    @property
+    def vocab_size(self) -> int:
+        return len(self.encoder)
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
+    def bpe(self, token):
+        """
+        Apply the learned BPE merges to a single token.
+        Args:
+            token (`str`): The token to merge; each character is an initial symbol.
+        Returns:
+            `str`: The merged symbols joined by single spaces. A token that produces no symbol pairs is returned
+            unchanged (and is not stored in the cache).
+        """
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+        if not pairs:
+            return token
+        while True:
+            # Choose the adjacent pair with the lowest merge rank; pairs without a rank compare as infinity.
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                # Even the best candidate has no learned merge, so nothing more can be merged.
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            # Rebuild the word, fusing every occurrence of `first` immediately followed by `second`.
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    # No further `first` symbol: copy the rest of the word as is.
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    # Copy everything up to the next `first` symbol untouched.
+                    new_word.extend(word[i:j])
+                    i = j
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                # The whole token collapsed into one symbol.
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+    def prepare_extra_tokenizers(self, vocab_file: str) -> None:
+        """
+        Prepare domain-specific tokenizers.
+        Define variables/maps here which guide domain-specific tokenization later.
+        """
+        # Load extra tokenizers with SentencePiece model
+        dir_name = os.path.dirname(vocab_file)
+        self.sp_model_SMILES = spm.SentencePieceProcessor()
+        self.sp_model_SMILES.Load(os.path.join(dir_name, "tokenizer_SMILES.model"))
+        self.sp_model_SMILES.offset = self.init_kwargs["offset_SMILES"]
+        self.sp_model_PROT = spm.SentencePieceProcessor()
+        self.sp_model_PROT.Load(os.path.join(dir_name, "tokenizer_PROT.model"))
+        self.sp_model_PROT.offset = self.init_kwargs["offset_PROT"]
+        self.sp_model_XNA = spm.SentencePieceProcessor()
+        self.sp_model_XNA.Load(os.path.join(dir_name, "tokenizer_XNA.model"))
+        self.sp_model_XNA.offset = self.init_kwargs["offset_XNA"]
+        base_mapping = {
+            "SMILES": self.sp_model_SMILES,
+            "protein": self.sp_model_PROT,
+            "dna": self.sp_model_XNA,
+            "rna": self.sp_model_XNA,
+        }
+        auto_detect_mapping = {
+            "SMILES": self.sp_model_SMILES,
+            "PROT": self.sp_model_PROT,
+            "XNA": self.sp_model_XNA,
+        }
+        # Guiding tokens of domain-specific tokenization
+        self.ex_begin_mapping = {f"<{key}>": value for key, value in base_mapping.items()}
+        self.ex_end_mapping = {f"</{key}>": value for key, value in base_mapping.items()}
+        # Transient markers for auto-detection, these tokens will not be assigned token ids
+        self.ex_auto_begin_mapping = {f"<{key}_AUTO_DETECT>": value for key, value in auto_detect_mapping.items()}
+        self.ex_auto_end_mapping = {f"</{key}_AUTO_DETECT>": value for key, value in auto_detect_mapping.items()}
+        # Token markers to prevent unwanted auto-detection
+        self.ex_protect_begin_tokens = ["<MOLFORMULA>"]
+        self.ex_protect_end_tokens = ["</MOLFORMULA>"]
+        # For simplicity
+        self.ex_protect_tokens = self.ex_protect_begin_tokens + self.ex_protect_end_tokens
+        self.ex_all_begin_mapping = self.ex_begin_mapping | self.ex_auto_begin_mapping
+        self.ex_all_end_mapping = self.ex_end_mapping | self.ex_auto_end_mapping
+        # Update encoder & decoder with extra tokenizers
+        for tokenizer_name, sp_model in [
+            ("SMILES", self.sp_model_SMILES),
+            ("PROT", self.sp_model_PROT),
+            ("XNA", self.sp_model_XNA),
+        ]:
+            self.decoder.update(
+                {i + sp_model.offset: sp_model.id_to_piece(i) for i in range(sp_model.get_piece_size())}
+            )
+            # Not really used, only to fill holes in encoder, to keep methods like `add_tokens` working
+            self.encoder.update(
+                {
+                    f"<|{tokenizer_name}_{sp_model.id_to_piece(i)}|>": i + sp_model.offset
+                    for i in range(sp_model.get_piece_size())
+                }
+            )
+        # protect-tokens should keep complete temporarily to guide later tokenization
+        # it will be segmented later
+        for token in self.ex_protect_tokens:
+            self.tokens_trie.add(token)
+        self._unk_token = "<unk>"  # Fall-back
+        self.check_module_list = [SmilesCheckModule(), ProtCheckModule(), XnaCheckModule()]
+    def _pop_logical_sp_token(self, extra_tokenizer_stack: list, mapping_name: str) -> None:
+        """Switch tokenizer when it comes to an end sp token"""
+        extra_tokenizer = extra_tokenizer_stack.pop()
+        if extra_tokenizer != self.ex_all_end_mapping[mapping_name]:
+            logger.warning_once(
+                f"Encounter incorrect nesting of extra tokenizer: {self.ex_all_end_mapping[mapping_name]} and {extra_tokenizer}"
+            )
+            logger.warning_once("This may lead to unexpected behaviour of the tokenizer, please check your input.")
+    def tokenize(self, text: TextInput, **kwargs) -> list[str]:
+        """
+        Converts a string into a sequence of tokens, using the tokenizer.
+        It will switch to domain-specific tokenizer once encountering extra/logical sp tokens.
+        The work is done in three passes: split the text on added/protect tokens and apply their strip rules,
+        run auto-detection on the parts not already enclosed by domain or protect markers, then tokenize each
+        part with whichever tokenizer is on top of the stack (plain BPE when the stack is empty).
+        Args:
+            text: TextInput
+            kwargs: `split_special_tokens` is consumed here (defaults to `self.split_special_tokens`); the rest
+                is handed to `prepare_for_tokenization`.
+        Returns:
+            `list[str]`: The tokens, including the begin/end marker tokens.
+        Raises:
+            ValueError: If a token listed in the added-tokens encoder has no `AddedToken` entry in the decoder.
+        """
+        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
+        text, kwargs = self.prepare_for_tokenization(text, **kwargs)
+        if hasattr(self, "do_lower_case") and self.do_lower_case:
+            # convert non-special tokens to lowercase. Might be super slow as well?
+            escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
+            escaped_special_toks += [
+                re.escape(s_tok.content)
+                for s_tok in (self._added_tokens_decoder.values())
+                if not s_tok.special and s_tok.normalized
+            ]
+            # Group 1 matches a protected token (kept verbatim); group 2 matches any other text (lowercased).
+            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
+            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
+        if split_special_tokens:
+            # Nothing is protected: the whole text is treated as ordinary text.
+            no_split_token = []
+            tokens = [text]
+        else:
+            no_split_token = self._added_tokens_encoder.keys()  # don't split on any of the added tokens
+            # "This is something<special_token_1>  else"
+            tokens = self.tokens_trie.split(text)
+        # ["This is something", "<special_token_1>", "  else"]
+        for i, token in enumerate(tokens):
+            if token in no_split_token:
+                tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None)
+                left = tokens[i - 1] if i > 0 else None
+                right = tokens[i + 1] if i < len(tokens) - 1 else None
+                if isinstance(tok_extended, AddedToken):
+                    if tok_extended.rstrip and right:
+                        # A bit counter-intuitive but we strip the left of the string
+                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
+                        tokens[i + 1] = right.lstrip()
+                    # Strip white spaces on the left
+                    if tok_extended.lstrip and left:
+                        tokens[i - 1] = left.rstrip()  # Opposite here
+                    # A `single_word` token glued to neighbouring text is not a standalone match: merge it back
+                    # into that neighbour and blank its own slot.
+                    if tok_extended.single_word and left and left[-1] != " ":
+                        tokens[i - 1] += token
+                        tokens[i] = ""
+                    elif tok_extended.single_word and right and right[0] != " ":
+                        tokens[i + 1] = token + tokens[i + 1]
+                        tokens[i] = ""
+                else:
+                    raise ValueError(
+                        f"{tok_extended} cannot be tokenized because it was not properly added"
+                        f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
+                    )
+        # ["This is something", "<special_token_1>", "else"]
+        tokenized_text = []
+        # Codes for automatically detecting domain-specific content
+        # All parts that have been marked by domain-specific or protection tokens will not be subject to auto detection
+        # See transformers/tests/models/intern_s1/test_tokenization_intern_s1.py::test_auto_detection() for more details
+        new_tokens = []
+        # Depth of currently open explicit domain / protect markers; auto-detection only runs at depth 0.
+        not_split_flag = 0
+        for token in tokens:
+            if not token:
+                continue
+            if token in no_split_token or token in self.ex_protect_tokens:
+                new_tokens.append(token)
+                if token in self.ex_begin_mapping or token in self.ex_protect_begin_tokens:
+                    not_split_flag += 1  # In case nested sp tokens
+                elif token in self.ex_end_mapping or token in self.ex_protect_end_tokens:
+                    # Clamp at zero so a stray end marker cannot drive the depth negative.
+                    not_split_flag = max(0, not_split_flag - 1)
+            else:
+                if not_split_flag:
+                    new_tokens.append(token)
+                else:
+                    # Each check module receives the previous module's output.
+                    # NOTE(review): `re_split` is defined outside this view; presumably it returns a list of
+                    # pieces with auto-detect markers inserted around detected spans - confirm.
+                    for check_module in self.check_module_list:
+                        token = check_module.re_split(token)
+                    new_tokens.extend(token)
+        tokens = new_tokens
+        # Use stack to maintain which tokenizer should be used, considering the possibility of nested extra tokenizer
+        extra_tokenizer_stack = []
+        for token in tokens:
+            # Need to skip eventual empty (fully stripped) tokens
+            if not token:
+                continue
+            # protect-tokens are not assigned token ids, should be segmented here
+            if token in self.ex_protect_tokens:
+                tokenized_text.extend(self._tokenize(token))
+            # push tokenizer to stack when encountering begin token
+            elif token in self.ex_all_begin_mapping:
+                tokenized_text.append(token)
+                extra_tokenizer_stack.append(self.ex_all_begin_mapping[token])
+            # pop tokenizer from stack when encountering end token
+            elif token in self.ex_all_end_mapping:
+                tokenized_text.append(token)
+                # An end marker with no open begin marker is kept in the output but otherwise ignored.
+                if extra_tokenizer_stack:
+                    self._pop_logical_sp_token(extra_tokenizer_stack, token)
+            # other special tokens
+            elif token in no_split_token:
+                tokenized_text.append(token)
+            else:
+                tokenized_text.extend(self._tokenize(token, extra_tokenizer_stack=extra_tokenizer_stack))
+        # ["This", " is", " something", "<special_token_1>", "else"]
+        return tokenized_text
+    def _tokenize(self, text, **kwargs):
+        """
+        Modified from `transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize`.
+        This adaptation supports domain-specific tokenizers.
+        """
+        extra_tokenizer_stack = kwargs.pop("extra_tokenizer_stack", False)
+        if extra_tokenizer_stack:
+            tokenized_text = extra_tokenizer_stack[-1].encode(text, out_type=str)
+            tokenized_id = extra_tokenizer_stack[-1].encode(text, out_type=int)
+            final_tokenized_text = []
+            for text_piece, id_piece in zip(tokenized_text, tokenized_id):
+                if id_piece == 0:
+                    final_tokenized_text.extend(self._bpe_tokenize(text_piece))
+                else:
+                    final_tokenized_text.append(text_piece)
+            return final_tokenized_text
+        else:
+            return self._bpe_tokenize(text)
+    def _bpe_tokenize(self, text, **kwargs):
+        text = text.replace(
+            "▁", " "
+        )  # This discrepancy stems from differing whitespace treatment in SentencePiece versus BPE tokenization.
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+    def convert_tokens_to_ids(self, tokens: Union[str, list[str]]) -> Union[int, list[int]]:
+        """
+        Modified from `transformers.tokenization_utils.PreTrainedTokenzier.convert_tokens_to_ids`.
+        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
+        vocabulary.
+        This adaptation supports domain-specific tokenizers.
+        Args:
+            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
+        Returns:
+            `int` or `List[int]`: The token id or list of token ids.
+        """
+        if tokens is None:
+            return None
+        if isinstance(tokens, str):
+            return self._convert_token_to_id_with_added_voc(tokens)
+        ids = []
+        extra_tokenizer_stack = []
+        for token in tokens:
+            if token not in self.ex_auto_begin_mapping and token not in self.ex_auto_end_mapping:
+                ids.append(
+                    self._convert_token_to_id_with_added_voc(token, extra_tokenizer_stack=extra_tokenizer_stack)
+                )
+            if token in self.ex_all_begin_mapping:
+                extra_tokenizer_stack.append(self.ex_all_begin_mapping[token])
+            elif token in self.ex_all_end_mapping:
+                if extra_tokenizer_stack:
+                    self._pop_logical_sp_token(extra_tokenizer_stack, token)
+        return ids
+    def _convert_token_to_id_with_added_voc(self, token, **kwargs):
+        """
+        Modified from `transformers.tokenization_utils.PreTrainedTokenzier._convert_token_to_id_with_added_voc`.
+        This adaptation supports domain-specific tokenizers.
+        """
+        if token is None:
+            return None
+        if token in self._added_tokens_encoder:
+            return self._added_tokens_encoder[token]
+        return self._convert_token_to_id(token, **kwargs)
+    def _convert_token_to_id(self, token, **kwargs):
+        """
+        Modified from `transformers.tokenization_utils.PreTrainedTokenzier._convert_token_to_id`.
+        Converts a token (str) in an id using the vocab.
+        Fall back to original tokenizer once OOV.
+        """
+        extra_tokenizer_stack = kwargs.pop("extra_tokenizer_stack", False)
+        if extra_tokenizer_stack:
+            token_id = extra_tokenizer_stack[-1].piece_to_id(token)
+            if token_id == extra_tokenizer_stack[-1].unk_id():
+                return self.encoder.get(token, self.encoder.get(self._unk_token))
+            else:
+                return token_id + extra_tokenizer_stack[-1].offset
+        else:
+            return self.encoder.get(token, self.encoder.get(self._unk_token))
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        text = "".join(tokens)
+        text = text.replace(
+            "▁", "Ġ"
+        )  # This discrepancy stems from differing whitespace treatment in SentencePiece versus BPE tokenization.
+        text = text.replace("\n", "Ċ")
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
+    def decode(
+        self,
+        token_ids,
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = False,
+        spaces_between_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
+        # and cannot be configured elsewhere, but it should default to False for InternS1Tokenizer
+        return super().decode(
+            token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Modified from `transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary` to support saving custom extension.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        merge_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        )
+        sp_model_smiles = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["sp_model_SMILES"]
+        )
+        sp_model_prot = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["sp_model_PROT"]
+        )
+        sp_model_xna = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["sp_model_XNA"]
+        )
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write("#version: 0.2\n")
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+        with open(sp_model_smiles, "wb") as f:
+            f.write(self.sp_model_SMILES.serialized_model_proto())
+        with open(sp_model_prot, "wb") as f:
+            f.write(self.sp_model_PROT.serialized_model_proto())
+        with open(sp_model_xna, "wb") as f:
+            f.write(self.sp_model_XNA.serialized_model_proto())
+        return vocab_file, merge_file
+    def prepare_for_tokenization(self, text, **kwargs):
+        text = unicodedata.normalize("NFC", text)
+        return (text, kwargs)
+# Names exported when this module is star-imported.
+__all__ = ["InternS1Tokenizer"]

tokenizer_PROT.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1144f52f86f3ca5a29940d69b037e508c05a89e6eedbe42bea641e226b20dbe0
+size 12118

tokenizer_SMILES.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fba1c97da0353ccbffd368ae78e311ccbc762aa5ba74f9aff8bf2ab363c4d37d
+size 14775

tokenizer_XNA.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58fc8bfb2af3dfe936a13dad8a9cb28dab7850b70b358db19605d867c133fb35
+size 15451

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,508 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "248044": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248045": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248046": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248047": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248048": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248049": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248050": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248051": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248052": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248053": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248054": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248055": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248056": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248057": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248058": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248059": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248060": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248061": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248062": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248063": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248064": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248065": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248066": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248067": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248068": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248069": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248070": {
+      "content": "<|audio_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248071": {
+      "content": "<|audio_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248072": {
+      "content": "<tts_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248073": {
+      "content": "<tts_text_bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248074": {
+      "content": "<tts_text_eod>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248075": {
+      "content": "<tts_text_bos_single>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248076": {
+      "content": "<|audio_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248077": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248078": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248079": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248080": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248081": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248082": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248083": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248084": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248085": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248086": {
+      "content": "<|action_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248087": {
+      "content": "<|action_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248088": {
+      "content": "<|interpreter|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248089": {
+      "content": "<|plugin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248090": {
+      "content": "<video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248091": {
+      "content": "<|ts|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248092": {
+      "content": "<|/ts|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248093": {
+      "content": "<TS_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "248094": {
+      "content": "<SMILES>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248095": {
+      "content": "</SMILES>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248096": {
+      "content": "<protein>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248097": {
+      "content": "</protein>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248098": {
+      "content": "<dna>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248099": {
+      "content": "</dna>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248100": {
+      "content": "<rna>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "248101": {
+      "content": "</rna>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "audio_bos_token": "<|audio_start|>",
+  "audio_eos_token": "<|audio_end|>",
+  "audio_token": "<|audio_pad|>",
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_interns1.InternS1Tokenizer",
+      null
+    ]
+  },
+  "backend": "custom",
+  "bos_token": "<|im_start|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "image_token": "<|image_pad|>",
+  "is_local": true,
+  "local_files_only": false,
+  "model_max_length": 262144,
+  "model_specific_special_tokens": {
+    "audio_bos_token": "<|audio_start|>",
+    "audio_eos_token": "<|audio_end|>",
+    "audio_token": "<|audio_pad|>",
+    "image_token": "<|image_pad|>",
+    "video_token": "<|video_pad|>",
+    "vision_bos_token": "<|vision_start|>",
+    "vision_eos_token": "<|vision_end|>"
+  },
+  "offset_PROT": 249126,
+  "offset_SMILES": 248102,
+  "offset_XNA": 250150,
+  "pad_token": "<|endoftext|>",
+  "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+  "split_special_tokens": false,
+  "tokenizer_class": "InternS1Tokenizer",
+  "tool_parser_type": "qwen3_coder",
+  "unk_token": null,
+  "video_token": "<|video_pad|>",
+  "vision_bos_token": "<|vision_start|>",
+  "vision_eos_token": "<|vision_end|>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff