Reza2kn committed about 14 hours ago

Commit

cead59c

verified ·

1 Parent(s): 34d7e53

Initial upload: INT4 ONNX (encoder + prefill + step) + tokenizer + examples + inference.py

Browse files

Files changed (23) hide show

.gitattributes +11 -0
chat_template.json +1 -0
config.json +221 -0
examples/distortion.wav +3 -0
examples/dropout.wav +3 -0
examples/echo.wav +3 -0
examples/far_field.wav +3 -0
examples/mixed.wav +3 -0
examples/noise.wav +3 -0
examples/obstructed.wav +3 -0
examples/recording.wav +3 -0
generation_config.json +9 -0
inference.py +150 -0
merges.txt +0 -0
onnx/audio_encoder_int4.onnx +3 -0
onnx/audio_encoder_int4.onnx.data +3 -0
onnx/decoder_prefill_int4.onnx +3 -0
onnx/decoder_prefill_int4.onnx.data +3 -0
onnx/decoder_step_int4.onnx +3 -0
onnx/decoder_step_int4.onnx.data +3 -0
preprocessor_config.json +14 -0
tokenizer_config.json +549 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/distortion.wav filter=lfs diff=lfs merge=lfs -text
+examples/dropout.wav filter=lfs diff=lfs merge=lfs -text
+examples/echo.wav filter=lfs diff=lfs merge=lfs -text
+examples/far_field.wav filter=lfs diff=lfs merge=lfs -text
+examples/mixed.wav filter=lfs diff=lfs merge=lfs -text
+examples/noise.wav filter=lfs diff=lfs merge=lfs -text
+examples/obstructed.wav filter=lfs diff=lfs merge=lfs -text
+examples/recording.wav filter=lfs diff=lfs merge=lfs -text
+onnx/audio_encoder_int4.onnx.data filter=lfs diff=lfs merge=lfs -text
+onnx/decoder_prefill_int4.onnx.data filter=lfs diff=lfs merge=lfs -text
+onnx/decoder_step_int4.onnx.data filter=lfs diff=lfs merge=lfs -text

chat_template.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"chat_template": "{%- set ns = namespace(system_text=\"\") -%}\n{%- for m in messages -%}\n {%- if m.role == 'system' -%}\n {%- if m.content is string -%}\n {%- set ns.system_text = ns.system_text + m.content -%}\n {%- else -%}\n {%- for c in m.content -%}\n {%- if c.type == 'text' and (c.text is defined) -%}\n {%- set ns.system_text = ns.system_text + c.text -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n\n{%- set ns2 = namespace(audio_tokens=\"\") -%}\n{%- for m in messages -%}\n {%- if m.content is not string -%}\n {%- for c in m.content -%}\n {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) -%}\n {%- set ns2.audio_tokens = ns2.audio_tokens + \"<|audio_start|><|audio_pad|><|audio_end|>\" -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n{%- endfor -%}\n\n{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n{%- if add_generation_prompt -%}\n{{- '<|im_start|>assistant\\n' -}}\n{%- endif -%}"}

config.json ADDED Viewed

	@@ -0,0 +1,221 @@

+{
+    "architectures": [
+      "Qwen3ASRForConditionalGeneration"
+    ],
+    "model_type": "qwen3_asr",
+    "support_languages": [
+      "Chinese",
+      "English",
+      "Cantonese",
+      "Arabic",
+      "German",
+      "French",
+      "Spanish",
+      "Portuguese",
+      "Indonesian",
+      "Italian",
+      "Korean",
+      "Russian",
+      "Thai",
+      "Vietnamese",
+      "Japanese",
+      "Turkish",
+      "Hindi",
+      "Malay",
+      "Dutch",
+      "Swedish",
+      "Danish",
+      "Finnish",
+      "Polish",
+      "Czech",
+      "Filipino",
+      "Persian",
+      "Greek",
+      "Romanian",
+      "Hungarian",
+      "Macedonian"
+    ],
+    "thinker_config": {
+      "model_type": "qwen3_asr",
+      "architectures": [
+        "Qwen3ASRForConditionalGeneration"
+      ],
+      "audio_config": {
+        "_name_or_path": "",
+        "activation_dropout": 0,
+        "activation_function": "gelu",
+        "add_cross_attention": false,
+        "architectures": null,
+        "attention_dropout": 0,
+        "bad_words_ids": null,
+        "begin_suppress_tokens": null,
+        "bos_token_id": null,
+        "chunk_size_feed_forward": 0,
+        "conv_chunksize": 500,
+        "cross_attention_hidden_size": null,
+        "d_model": 1024,
+        "decoder_start_token_id": null,
+        "diversity_penalty": 0.0,
+        "do_sample": false,
+        "downsample_hidden_size": 480,
+        "dropout": 0,
+        "dtype": null,
+        "early_stopping": false,
+        "encoder_attention_heads": 16,
+        "encoder_ffn_dim": 4096,
+        "encoder_layers": 24,
+        "encoder_no_repeat_ngram_size": 0,
+        "eos_token_id": null,
+        "exponential_decay_length_penalty": null,
+        "finetuning_task": null,
+        "forced_bos_token_id": null,
+        "forced_eos_token_id": null,
+        "id2label": {
+          "0": "LABEL_0",
+          "1": "LABEL_1"
+        },
+        "initializer_range": 0.02,
+        "is_decoder": false,
+        "is_encoder_decoder": false,
+        "label2id": {
+          "LABEL_0": 0,
+          "LABEL_1": 1
+        },
+        "length_penalty": 1.0,
+        "max_length": 20,
+        "max_source_positions": 1500,
+        "min_length": 0,
+        "model_type": "qwen3_asr_audio_encoder",
+        "n_window": 50,
+        "n_window_infer": 800,
+        "no_repeat_ngram_size": 0,
+        "num_beam_groups": 1,
+        "num_beams": 1,
+        "num_hidden_layers": 24,
+        "num_mel_bins": 128,
+        "num_return_sequences": 1,
+        "output_attentions": false,
+        "output_dim": 2048,
+        "output_hidden_states": false,
+        "output_scores": false,
+        "pad_token_id": null,
+        "prefix": null,
+        "problem_type": null,
+        "pruned_heads": {},
+        "remove_invalid_values": false,
+        "repetition_penalty": 1.0,
+        "return_dict": true,
+        "return_dict_in_generate": false,
+        "scale_embedding": false,
+        "sep_token_id": null,
+        "suppress_tokens": null,
+        "task_specific_params": null,
+        "temperature": 1.0,
+        "tf_legacy_loss": false,
+        "tie_encoder_decoder": false,
+        "tie_word_embeddings": true,
+        "tokenizer_class": null,
+        "top_k": 50,
+        "top_p": 1.0,
+        "torchscript": false,
+        "typical_p": 1.0,
+        "use_bfloat16": false
+      },
+      "audio_end_token_id": 151670,
+      "audio_start_token_id": 151669,
+      "audio_token_id": 151676,
+      "dtype": "bfloat16",
+      "initializer_range": 0.02,
+      "text_config": {
+        "_name_or_path": "",
+        "add_cross_attention": false,
+        "architectures": null,
+        "attention_bias": false,
+        "attention_dropout": 0.0,
+        "bad_words_ids": null,
+        "begin_suppress_tokens": null,
+        "bos_token_id": null,
+        "chunk_size_feed_forward": 0,
+        "cross_attention_hidden_size": null,
+        "decoder_start_token_id": null,
+        "diversity_penalty": 0.0,
+        "do_sample": false,
+        "dtype": null,
+        "early_stopping": false,
+        "encoder_no_repeat_ngram_size": 0,
+        "eos_token_id": null,
+        "exponential_decay_length_penalty": null,
+        "finetuning_task": null,
+        "forced_bos_token_id": null,
+        "forced_eos_token_id": null,
+        "head_dim": 128,
+        "hidden_act": "silu",
+        "hidden_size": 2048,
+        "id2label": {
+          "0": "LABEL_0",
+          "1": "LABEL_1"
+        },
+        "initializer_range": 0.02,
+        "intermediate_size": 6144,
+        "is_decoder": false,
+        "is_encoder_decoder": false,
+        "label2id": {
+          "LABEL_0": 0,
+          "LABEL_1": 1
+        },
+        "length_penalty": 1.0,
+        "max_length": 20,
+        "max_position_embeddings": 65536,
+        "min_length": 0,
+        "model_type": "qwen3",
+        "no_repeat_ngram_size": 0,
+        "num_attention_heads": 16,
+        "num_beam_groups": 1,
+        "num_beams": 1,
+        "num_hidden_layers": 28,
+        "num_key_value_heads": 8,
+        "num_return_sequences": 1,
+        "output_attentions": false,
+        "output_hidden_states": false,
+        "output_scores": false,
+        "pad_token_id": null,
+        "prefix": null,
+        "problem_type": null,
+        "pruned_heads": {},
+        "remove_invalid_values": false,
+        "repetition_penalty": 1.0,
+        "return_dict": true,
+        "return_dict_in_generate": false,
+        "rms_norm_eps": 1e-06,
+        "rope_scaling": {
+          "interleaved": true,
+          "mrope_interleaved": true,
+          "mrope_section": [
+            24,
+            20,
+            20
+          ],
+          "rope_type": "default",
+          "type": "default"
+        },
+        "rope_theta": 1000000,
+        "sep_token_id": null,
+        "suppress_tokens": null,
+        "task_specific_params": null,
+        "temperature": 1.0,
+        "tf_legacy_loss": false,
+        "tie_encoder_decoder": false,
+        "tie_word_embeddings": true,
+        "tokenizer_class": null,
+        "top_k": 50,
+        "top_p": 1.0,
+        "torchscript": false,
+        "typical_p": 1.0,
+        "use_bfloat16": false,
+        "use_cache": true,
+        "vocab_size": 151936
+      }
+    },
+    "transformers_version": "4.57.6"
+  }

examples/distortion.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:194231a8aa2a31049d167df3f52bc62d4e9377aa935678c983d1165f3c9ca86d
+size 353324

examples/dropout.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff94831ee3497ce90d9b873719b823e5c4c4a9890dec86832e0b6357cd2b2e6f
+size 320684

examples/echo.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2eda219a3b7091c7a2772408db3f0356d1d7d30184d0523a6c98f6fdec35bd2b
+size 359084

examples/far_field.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4546478ee704b5b7b81bb4937e6c74b48be82aef541e0fb3388fcb49789082d
+size 284204

examples/mixed.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:339b28e7f59abb2dcfac81c22298060a5da85d09ea3363a8e4004e17a15b31e2
+size 243884

examples/noise.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96793b49e286c7b05a3081fccf0d6a6f7df85cc5aef0a2d28f3b4aaba60d95d1
+size 416684

examples/obstructed.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:411e605c25a6a8a6b09e49f0db2ad7543854bbcd0cab4cd7157fc429f9c5b0d3
+size 422444

examples/recording.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a548239681d613a007375825fd2423494be634c49bd450e46743f29101ffdcfc
+size 240044

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "eos_token_id": [
+    151643,
+    151645
+  ],
+  "pad_token_id": 151643,
+  "do_sample": false
+}

inference.py ADDED Viewed

	@@ -0,0 +1,150 @@

+"""Standalone Mega-ASR inference using the INT4 ONNX models.
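+This script demonstrates how to run Mega-ASR end-to-end from raw audio
+using the ONNX models in this repo, the token-embedding table
+(onnx/embed_tokens.npy) and the qwen-asr tokenizer/processor.
+Requirements:
+    pip install onnxruntime numpy soundfile transformers qwen-asr
+    pip install librosa   (only needed when the input is not sampled at 16 kHz)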
+Usage:
+    python inference.py --audio examples/noise.wav
+"""
+from __future__ import annotations
+import argparse
+import re
+import sys
+from pathlib import Path
+import numpy as np
+import onnxruntime as ort
+import soundfile as sf
+HERE = Path(__file__).parent
+def _ort(path, providers=("CPUExecutionProvider",)):
+    so = ort.SessionOptions()
+    so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+    return ort.InferenceSession(str(path), so, providers=list(providers))
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--audio", required=True, type=Path)
+    ap.add_argument("--max-new-tokens", type=int, default=80)
+    ap.add_argument("--providers", default="cuda,cpu",
+                    help="Comma-separated ORT providers, e.g. cuda,cpu")
+    args = ap.parse_args()
+    providers = []
+    for p in args.providers.split(","):
+        p = p.strip().lower()
+        if p == "cuda": providers.append("CUDAExecutionProvider")
+        elif p == "cpu": providers.append("CPUExecutionProvider")
+        elif p == "webgpu": providers.append("WebGpuExecutionProvider")
+    # 1) Tokenizer + processor (needs HF transformers + qwen-asr installed)
+    from transformers import AutoTokenizer
+    from qwen_asr.core.transformers_backend.processing_qwen3_asr import Qwen3ASRProcessor
+    tokenizer = AutoTokenizer.from_pretrained(str(HERE))
+    processor = Qwen3ASRProcessor.from_pretrained(str(HERE))
+    # 2) Read audio + build mel features
+    audio, sr = sf.read(str(args.audio))
+    if sr != 16000:
+        import librosa
+        audio = librosa.resample(audio.astype(np.float32), orig_sr=sr, target_sr=16000)
+    # The processor expects a prompt with the audio placeholder. Build it as
+    # the Qwen3-ASR chat template would.
+    prompt = (
+        "<|im_start|>system\\nYou are a speech recognition model.<|im_end|>\\n"
+        "<|im_start|>user\\nDetect the language and recognize the speech: "
+        "<|audio_start|><|audio_pad|><|audio_end|><|im_end|>\\n"
+        "<|im_start|>assistant\\n"
+    )
+    inputs = processor(text=[prompt], audio=[audio], return_tensors="np", padding=True)
+    text_ids = inputs["input_ids"]                 # (1, L)
+    attention_mask = inputs["attention_mask"]      # (1, L)
+    input_features = inputs["input_features"]      # (1, 128, T_mel)
+    # 3) Audio encoder
+    T_mel = input_features.shape[-1]
+    if T_mel > 3000:
+        input_features = input_features[..., :3000]
+        T_mel = 3000
+    mel_padded = np.pad(input_features.astype(np.float32),
+                         ((0, 0), (0, 0), (0, 3000 - T_mel)))
+    enc = _ort(HERE / "onnx" / "audio_encoder_int4.onnx", providers)
+    audio_embeds = enc.run(["audio_embeds"], {"mel": mel_padded})[0]   # (1, 390, 2048)
+    real_chunks = (T_mel + 99) // 100
+    last_chunk_mel = T_mel - (real_chunks - 1) * 100
+    real_audio_frames = (real_chunks - 1) * 13 + (last_chunk_mel + 7) // 8
+    audio_embeds = audio_embeds[:, :real_audio_frames]                 # (1, N, 2048)
+    # 4) Embed text tokens + scatter audio into placeholder positions
+    # The embedding table is published as embed_tokens.npy in the repo
+    # (extracted from the original Qwen3-ASR weights for portability).
+    embed_path = HERE / "onnx" / "embed_tokens.npy"
+    if embed_path.exists():
+        embed_w = np.load(embed_path).astype(np.float16)
+    else:
+        raise FileNotFoundError(
+            "embed_tokens.npy missing — re-extract from the source PT model."
+        )
+    inputs_embeds = embed_w[text_ids[0]][None]                          # (1, L, hidden)
+    audio_token_id = tokenizer.convert_tokens_to_ids("<|audio_pad|>")
+    placeholder_mask = (text_ids == audio_token_id)                     # (1, L)
+    # Replace placeholder embeddings with audio_embeds[0] in order
+    placeholder_idx = np.where(placeholder_mask[0])[0]
+    n_replace = min(len(placeholder_idx), audio_embeds.shape[1])
+    inputs_embeds[0, placeholder_idx[:n_replace]] = audio_embeds[0, :n_replace]
+    # 5) Position ids (1D)
+    pos_ids = np.arange(text_ids.shape[1])[None].astype(np.int64)
+    # 6) Prefill ONNX
+    prefill = _ort(HERE / "onnx" / "decoder_prefill_int4.onnx", providers)
+    feeds = {
+        "inputs_embeds": inputs_embeds.astype(np.float16),
+        "attention_mask": attention_mask.astype(np.int64),
+        "position_ids": pos_ids,
+    }
+    out = prefill.run(None, feeds)
+    logits, *past_kvs = out
+    eos = tokenizer.eos_token_id
+    # 7) Greedy decode with step ONNX
+    step = _ort(HERE / "onnx" / "decoder_step_int4.onnx", providers)
+    num_layers = 28
+    cur_len = inputs_embeds.shape[1]
+    gen = []
+    nid = int(np.argmax(logits[0, -1, :]))
+    gen.append(nid)
+    for _ in range(args.max_new_tokens - 1):
+        if nid == eos: break
+        new_embed = embed_w[nid][None, None].astype(np.float16)
+        new_attn = np.ones((1, cur_len + 1), dtype=np.int64)
+        new_pos = np.array([[cur_len]], dtype=np.int64)
+        f = {"inputs_embeds": new_embed,
+             "attention_mask": new_attn,
+             "position_ids": new_pos}
+        for i in range(num_layers):
+            f[f"past.{i}.key"] = past_kvs[2 * i]
+            f[f"past.{i}.value"] = past_kvs[2 * i + 1]
+        out = step.run(None, f)
+        logits, *past_kvs = out
+        nid = int(np.argmax(logits[0, -1, :]))
+        gen.append(nid)
+        cur_len += 1
+    text = tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+    print(f"TRANSCRIPTION: {text}")
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

onnx/audio_encoder_int4.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5794e7ba5a96c836a0377d4fdd767ee1f1224b3958bd3b8077b1f618efab0b7
+size 202471

onnx/audio_encoder_int4.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e77f76d9ecc1bf83ff8625e7752b8b48f1b2ec2850aecf441be20e53e2f99515
+size 213590040

onnx/decoder_prefill_int4.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f9a614c02a60675cc484dc821b931ae2caa25468c0b4fec9b254d8fb0956972
+size 4693422

onnx/decoder_prefill_int4.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44b50e23cd84e89c89e14fc78867beb3305247231b1716e515e1d2655d41ce03
+size 967987200

onnx/decoder_step_int4.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a50be8689ce3d03c1087432012411804e77829ded5c24b03f662dc11ee8569bb
+size 4672657

onnx/decoder_step_int4.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44b50e23cd84e89c89e14fc78867beb3305247231b1716e515e1d2655d41ce03
+size 967987200

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "chunk_length": 30,
+  "dither": 0.0,
+  "feature_extractor_type": "WhisperFeatureExtractor",
+  "feature_size": 128,
+  "hop_length": 160,
+  "n_fft": 400,
+  "n_samples": 480000,
+  "nb_max_frames": 3000,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "Qwen3ASRProcessor",
+  "return_attention_mask": true
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,549 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<|audio_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|audio_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<tts_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<tts_text_bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "<tts_text_eod>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<tts_text_bos_single>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "<non_speech>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151676": {
+      "content": "<|audio_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151677": {
+      "content": "<blank1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151678": {
+      "content": "<blank2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151679": {
+      "content": "<blank3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151680": {
+      "content": "<blank4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151681": {
+      "content": "<blank5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151682": {
+      "content": "<blank6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151683": {
+      "content": "<blank7>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151684": {
+      "content": "<blank8>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151685": {
+      "content": "<blank9>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151686": {
+      "content": "<blank10>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151687": {
+      "content": "<blank11>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151688": {
+      "content": "<blank12>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151689": {
+      "content": "<blank13>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151690": {
+      "content": "<blank14>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151691": {
+      "content": "<blank15>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151692": {
+      "content": "<blank16>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151693": {
+      "content": "<blank17>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151694": {
+      "content": "<blank18>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151695": {
+      "content": "<blank19>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151696": {
+      "content": "<blank20>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151697": {
+      "content": "<blank21>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151698": {
+      "content": "<blank22>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151699": {
+      "content": "<blank23>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151700": {
+      "content": "<blank24>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151701": {
+      "content": "<blank25>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151702": {
+      "content": "<blank26>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151703": {
+      "content": "<blank27>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151704": {
+      "content": "<asr_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<|audio_start|>",
+    "<|audio_end|>",
+    "<tts_pad>",
+    "<tts_text_bos>",
+    "<tts_text_bos_single>",
+    "<|audio_pad|>"
+  ],
+  "audio_bos_token": "<|audio_start|>",
+  "audio_eos_token": "<|audio_end|>",
+  "audio_token": "<|audio_pad|>",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {
+    "audio_bos_token": "<|audio_start|>",
+    "audio_eos_token": "<|audio_end|>",
+    "audio_token": "<|audio_pad|>",
+    "image_token": "<|image_pad|>",
+    "video_token": "<|video_pad|>",
+    "vision_bos_token": "<|vision_start|>",
+    "vision_eos_token": "<|vision_end|>"
+  },
+  "image_token": "<|image_pad|>",
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "Qwen3ASRProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "video_token": "<|video_pad|>",
+  "vision_bos_token": "<|vision_start|>",
+  "vision_eos_token": "<|vision_end|>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff