Prince-1 committed on
Commit
7b1105d
·
verified ·
1 Parent(s): 29b4ee9

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ figures/carbon-8b-banner.png filter=lfs diff=lfs merge=lfs -text
38
+ model.onnx.data filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: onnxruntime-genai
3
+ license: apache-2.0
4
+ language:
5
+ - dna
6
+ tags:
7
+ - dna
8
+ - genomic
9
+ - onnx
10
+ - onnxruntime
11
+ - onnxruntime-genai
12
+ - transformers
13
+ base_model:
14
+ - HuggingFaceBio/Carbon-8B
15
+ ---
16
+
17
+ ![](figures/carbon-8b-banner.png)
18
+
19
+ <p align="center">
20
+ <a href="https://huggingface.co/HuggingFaceBio/Carbon-3B/blob/main/tech-report.pdf"><b>Technical Report</b> 🧬</a>
21
+ </p>
22
+
23
+ # Carbon-8B
24
+
25
+ A larger, higher-capacity member of the **Carbon** family of generative DNA foundation models.
26
+
27
+ Carbon-8B is the 8B-parameter sibling of [Carbon-3B](https://huggingface.co/HuggingFaceBio/Carbon-3B). It is intended for users who can afford additional inference cost in exchange for stronger downstream performance. For the full design rationale, tokenizer specification, evaluation protocol, and usage details, please refer to the **[Carbon-3B model card](https://huggingface.co/HuggingFaceBio/Carbon-3B)** and the Carbon technical report — this card focuses only on what is specific to Carbon-8B.
28
+
29
+ - Technical report: https://github.com/huggingface/carbon/blob/main/tech-report.pdf
30
+ - Demo: https://huggingface.co/spaces/HuggingFaceBio/carbon-demo
31
+
32
+ ## Model Summary
33
+
34
+ - **8B-parameter decoder-only autoregressive model** trained on DNA and RNA sequences with a primary focus on eukaryotes.
35
+ - **Same hybrid tokenizer** as Carbon-3B (non-overlapping 6-mer for DNA + Qwen3 BPE for English text). Each DNA token encodes 6 bp. Wrap DNA inputs with `<dna>...</dna>` — see the Carbon-3B card for tokenizer details and usage caveats.
36
+ - **Native context: 32,768 tokens (≈ 196 kbp).** Carbon-8B was extended with a long-context decay stage from an 8 k-context base, so it natively handles 32 k tokens. You can apply YaRN at 4× to extrapolate up to 128 k tokens (≈ 786 kbp).
37
+ - Released as a standard Hugging Face causal LM (`LlamaForCausalLM`).
38
+
39
+ ## How to use
40
+
41
+ ```python
42
+ from transformers import AutoModelForCausalLM, AutoTokenizer
43
+ import torch
44
+
45
+ repo = "HuggingFaceBio/Carbon-8B"
46
+ tok = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
47
+ model = AutoModelForCausalLM.from_pretrained(
48
+ repo, dtype=torch.bfloat16,
49
+ ).cuda().eval()
50
+
51
+ prompt = "<dna>ATGCGCTAGCTACGATCGATCGTAGCTAGCTAGCTAGCTACG" # multiple of 6 bp
52
+ inputs = tok(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
53
+ out = model.generate(**inputs, max_new_tokens=64, do_sample=False)
54
+ print(tok.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
55
+ ```
56
+
57
+ ### Base-pair-level generation and scoring
58
+
59
+ The `fns` branch loads custom modeling code for Factorized Nucleotide Supervision (FNS). Carbon still uses its efficient 6-mer tokenizer, but during generation each selected 6-mer is assembled from six per-position nucleotide distributions, giving base-pair-level control over decoded DNA. Use this branch when you need exact base-pair counts, per-position masks, or temperature/top-p behavior applied at the nucleotide level rather than over the 4,096-way 6-mer distribution:
60
+
61
+ ```py
62
+ import math
63
+ import torch
64
+ from transformers import AutoModelForCausalLM, AutoTokenizer
65
+
66
+ model_id = "HuggingFaceBio/Carbon-8B"
67
+ revision = "fns"
68
+ device = "cuda"
69
+
70
+ tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True)
71
+ model = AutoModelForCausalLM.from_pretrained(
72
+ model_id,
73
+ revision=revision,
74
+ trust_remote_code=True,
75
+ dtype=torch.bfloat16,
76
+ ).to(device).eval()
77
+
78
+ context = "ATGCGCTAGCTACGATCGATCGTAGCTAGCTAGCTAGCTACG"
79
+ n_bp = 60
80
+
81
+ inputs = tokenizer(f"<dna>{context}", return_tensors="pt", add_special_tokens=False).to(device)
82
+
83
+ with torch.no_grad():
84
+ output_ids = model.generate(
85
+ **inputs,
86
+ max_new_tokens=math.ceil(n_bp / tokenizer.k),
87
+ do_sample=False,
88
+ pad_token_id=tokenizer.eos_token_id,
89
+ )
90
+
91
+ generated_ids = output_ids[0, inputs.input_ids.shape[1]:]
92
+ generated_dna = tokenizer.decode(generated_ids, skip_special_tokens=True)[:n_bp]
93
+
94
+ print(generated_dna)
95
+ ```
96
+
97
+ The same per-base marginals are exposed through `score_sequence()`, which returns the probability assigned to the observed base at each position. Taking the mean log probability gives a base-pair-level sequence score, where higher values indicate higher model likelihood:
98
+
99
+ ```py
100
+ import torch
101
+ from transformers import AutoModelForCausalLM, AutoTokenizer
102
+
103
+ model_id = "HuggingFaceBio/Carbon-8B"
104
+ revision = "fns"
105
+ device = "cuda"
106
+
107
+ tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True)
108
+ model = AutoModelForCausalLM.from_pretrained(
109
+ model_id,
110
+ revision=revision,
111
+ trust_remote_code=True,
112
+ dtype=torch.bfloat16,
113
+ ).to(device).eval()
114
+
115
+ reference = "GGGCTATAAAGGCCATCGATCGATCGATCGATCGATCGATCG"
116
+ perturbed = "GGGCGCGCGCGGCCATCGATCGATCGATCGATCGATCGATCG"
117
+
118
+ with torch.no_grad():
119
+ bp_probs, actual_probs = model.score_sequence([reference, perturbed])
120
+
121
+ scores = [torch.log(p.clamp_min(1e-12)).mean().item() for p in actual_probs]
122
+
123
+ print(f"reference mean bp logp: {scores[0]:.4f}")
124
+ print(f"perturbed mean bp logp: {scores[1]:.4f}")
125
+ print(f"reference preferred: {scores[0] > scores[1]}")
126
+ ```
127
+
128
+
129
+ ## Training
130
+
131
+ Carbon-8B is pre-trained on the **[`HuggingFaceBio/carbon-pretraining-corpus`](https://huggingface.co/datasets/HuggingFaceBio/carbon-pretraining-corpus)** with the same data mixture as Carbon-3B, on 1T DNA 6-mer tokens. The main recipe ingredients, including where they differ from Carbon-3B:
132
+
133
+ - **Learning-rate schedule: cosine** (instead of the WSD schedule used for Carbon-3B).
134
+ - **Loss schedule:** after 100B tokens the loss switches from cross-entropy to FNS loss until the end of training.
135
+ - **Pre-training:** 1T 6-mer tokens (≈ 6T DNA base pairs) with GBS=512 and seq=8192 → 4.19 M tok/step, on 32 nodes (TP=4, DP=64) in bfloat16 with AdamW. The training mixture is kept unchanged throughout, including the decay phase: 70% Generator eukaryote data (with metadata, subject to dropout), 16% mRNA, 4% splice mRNA, and 10% prokaryote data.
136
+ - **Long-context extension stage.** After pre-training, Carbon-8B undergoes a long-context decay phase that extends the native context from 8,192 to 32,768 tokens (≈ 196 kbp). You can apply YaRN at 4× to further extrapolate to 128 k tokens (≈ 786 kbp).
137
+
138
+ Training infrastructure, framework ([Megatron-LM-Carbon](https://github.com/huggingface/Megatron-LM-Carbon)), and conversion path ([Megatron-Bridge](https://github.com/NVIDIA/Megatron-Bridge)) are identical to Carbon-3B.
139
+
140
+ ## Evaluation
141
+
142
+ All evaluations are zero-shot and use the [public Carbon evaluation pipeline](https://github.com/huggingface/carbon/tree/main/evaluation). See the [Carbon-3B card](https://huggingface.co/HuggingFaceBio/Carbon-3B#evaluation) for the full task suite, metrics, and methodology.
143
+
144
+ ### Downstream tasks
145
+
146
+ | Category | Metric (%) | Carbon 3B | Carbon 8B | Δ |
147
+ |---|---|---|---|---|
148
+ | Generative | Sequence Recovery eukaryote | 61.54 | **64.05** | +2.51 |
149
+ | Variant effect prediction | BRCA2 | 84.63 | **85.72** | +1.09 |
150
+ | | TraitGym Mendelian | 33.65 | **36.43** | +2.78 |
151
+ | | ClinVar coding (24 kb) | 92.89 | **93.11** | +0.22 |
152
+ | | ClinVar non-coding (24 kb) | 91.14 | **91.63** | +0.49 |
153
+ | Perturbation | Nucleotide triplet-expansion | 85.20 | **89.05** | +3.85 |
154
+ | | Synonymous codon replacement | 88.89 | **91.46** | +2.57 |
155
+ | Long-context retrieval | Genomic-NIAH @ 393 kbp | 79.00 | **86.00** | +7.00 |
156
+
157
+ ### Genomic-NIAH (long-context retrieval)
158
+
159
+ Genomic-NIAH measures how well a DNA model actually *uses* its long context. See the [`HuggingFaceBio/genomic-niah` dataset card](https://huggingface.co/datasets/HuggingFaceBio/genomic-niah) for the benchmark design.
160
+
161
+ | Context length | Carbon 3B (native / YaRN 4×) | Carbon 8B (native / YaRN 4×) | Evo2 7B |
162
+ |------------------------|------------------------------|------------------------------|---------|
163
+ | 16 k tokens (98 kbp) | 0.73 / 0.91 | 0.78 / 0.89 | **0.97** |
164
+ | 32 k tokens (196 kbp) | 0.55 / 0.90 | 0.69 / 0.87 | **0.95** |
165
+ | 64 k tokens (393 kbp) | — / 0.79 | — / **0.86** | 0.80 |
166
+ | 128 k tokens (786 kbp) | — / 0.27 | — / **0.65** | 0.53 |
167
+
168
+ Carbon-8B retrieves reliably up to its 32 k native boundary; **YaRN 4×** recovers most of the loss at the 32 k → 64 k boundary and extends usable retrieval to ≈ 786 kbp.
169
+
170
+ ## Intended use
171
+
172
+ Generative modelling, variant-effect prediction, motif-perturbation analysis, and long-context retrieval on DNA sequences. For faster inference at shorter contexts, use **Carbon-3B**.
173
+
174
+ ⚠️ **Genetic data is highly sensitive.** Depending on how this model is used (local download, inference API/endpoints, third-party inference providers, Spaces demos or others), input and output data may be processed or handled differently by different providers or space owners. Please make sure you understand and agree with how your data is handled before using the model.
175
+
176
+ ## License
177
+
178
+ Apache 2.0.
179
+
180
+ ## Acknowledgements
181
+
182
+ Carbon is a joint collaboration between the research teams at Hugging Face, Zhongguancun Academy, and TIGEM/University of Naples “Federico II”.
chat_template.jinja ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
27
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
28
+ {%- elif message.role == "assistant" %}
29
+ {%- set content = message.content %}
30
+ {%- set reasoning_content = '' %}
31
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
32
+ {%- set reasoning_content = message.reasoning_content %}
33
+ {%- else %}
34
+ {%- if '</think>' in message.content %}
35
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
36
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37
+ {%- endif %}
38
+ {%- endif %}
39
+ {%- if loop.index0 > ns.last_query_index %}
40
+ {%- if loop.last or (not loop.last and reasoning_content) %}
41
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
42
+ {%- else %}
43
+ {{- '<|im_start|>' + message.role + '\n' + content }}
44
+ {%- endif %}
45
+ {%- else %}
46
+ {{- '<|im_start|>' + message.role + '\n' + content }}
47
+ {%- endif %}
48
+ {%- if message.tool_calls %}
49
+ {%- for tool_call in message.tool_calls %}
50
+ {%- if (loop.first and content) or (not loop.first) %}
51
+ {{- '\n' }}
52
+ {%- endif %}
53
+ {%- if tool_call.function %}
54
+ {%- set tool_call = tool_call.function %}
55
+ {%- endif %}
56
+ {{- '<tool_call>\n{"name": "' }}
57
+ {{- tool_call.name }}
58
+ {{- '", "arguments": ' }}
59
+ {%- if tool_call.arguments is string %}
60
+ {{- tool_call.arguments }}
61
+ {%- else %}
62
+ {{- tool_call.arguments | tojson }}
63
+ {%- endif %}
64
+ {{- '}\n</tool_call>' }}
65
+ {%- endfor %}
66
+ {%- endif %}
67
+ {{- '<|im_end|>\n' }}
68
+ {%- elif message.role == "tool" %}
69
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
70
+ {{- '<|im_start|>user' }}
71
+ {%- endif %}
72
+ {{- '\n<tool_response>\n' }}
73
+ {{- message.content }}
74
+ {{- '\n</tool_response>' }}
75
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
76
+ {{- '<|im_end|>\n' }}
77
+ {%- endif %}
78
+ {%- endif %}
79
+ {%- endfor %}
80
+ {%- if add_generation_prompt %}
81
+ {{- '<|im_start|>assistant\n' }}
82
+ {%- if enable_thinking is defined and enable_thinking is false %}
83
+ {{- '<think>\n\n</think>\n\n' }}
84
+ {%- endif %}
85
+ {%- endif %}
dna_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "k": 6,
3
+ "dna_start_id": 151669,
4
+ "dna_vocab_size": 4107,
5
+ "dna_special_tokens": [
6
+ "<dna>",
7
+ "</dna>",
8
+ "<oov>"
9
+ ],
10
+ "auto_dna_tags": false
11
+ }
figures/carbon-8b-banner.png ADDED

Git LFS Details

  • SHA256: 4673b72dfc0241f4c856d90ef3c4071e9d88c3ec6d803486d0f115ff1cb414d5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.28 MB
genai_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "bos_token_id": 1,
4
+ "context_length": 32768,
5
+ "decoder": {
6
+ "session_options": {
7
+ "log_id": "onnxruntime-genai",
8
+ "provider_options": []
9
+ },
10
+ "filename": "model.onnx",
11
+ "head_size": 128,
12
+ "hidden_size": 4096,
13
+ "inputs": {
14
+ "input_ids": "input_ids",
15
+ "attention_mask": "attention_mask",
16
+ "position_ids": "position_ids",
17
+ "past_key_names": "past_key_values.%d.key",
18
+ "past_value_names": "past_key_values.%d.value"
19
+ },
20
+ "outputs": {
21
+ "logits": "logits",
22
+ "present_key_names": "present.%d.key",
23
+ "present_value_names": "present.%d.value"
24
+ },
25
+ "num_attention_heads": 32,
26
+ "num_hidden_layers": 32,
27
+ "num_key_value_heads": 8
28
+ },
29
+ "eos_token_id": 151643,
30
+ "pad_token_id": 151643,
31
+ "type": "llama",
32
+ "vocab_size": 155776
33
+ },
34
+ "search": {
35
+ "diversity_penalty": 0.0,
36
+ "do_sample": false,
37
+ "early_stopping": true,
38
+ "length_penalty": 1.0,
39
+ "max_length": 32768,
40
+ "min_length": 0,
41
+ "no_repeat_ngram_size": 0,
42
+ "num_beams": 1,
43
+ "num_return_sequences": 1,
44
+ "past_present_share_buffer": false,
45
+ "repetition_penalty": 1.0,
46
+ "temperature": 1.0,
47
+ "top_k": 50,
48
+ "top_p": 1.0
49
+ }
50
+ }
model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4edd74d06b828140d6149be21ca1f3630beec5972bf30e1063bbc115f1dc8fdf
3
+ size 748183
model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bbe623ec31ef77b8be3f94eb6151b36aa4a1e8030f5a574747aa21278b8fb84
3
+ size 16519856128
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
tokenizer.py ADDED
@@ -0,0 +1,601 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HybridDNATokenizer: Combines Qwen3 BPE tokenization with DNA 6-mer tokenization.
3
+
4
+ DNA sequences wrapped in <dna>...</dna> tags are tokenized as 6-mers.
5
+ All other text uses Qwen3's BPE tokenization.
6
+
7
+ Supports token_mask for Fine-grained Nucleotide Supervision (FNS):
8
+ -2: padding token
9
+ -1: text token (BPE)
10
+ 0: DNA special token (<dna>, </dna>, <oov>)
11
+ 1-5: partial 6-mer token — valid_length real bases at positions [0, valid_length),
12
+ right-padded with 'A' at positions [valid_length, k) so loss can supervise
13
+ positions 0..valid_len-1 via pos_mask = (valid_len > pos)
14
+ 6: full 6-mer
15
+ """
16
+
17
+ import os
18
+ import json
19
+ import warnings
20
+ import itertools
21
+ from typing import List, Optional, Tuple, Dict, Union, Any
22
+
23
+ from transformers import PreTrainedTokenizer, AutoTokenizer, BatchEncoding
24
+
25
+
26
+ class HybridDNATokenizer(PreTrainedTokenizer):
27
+ """
28
+ Hybrid tokenizer combining Qwen3 BPE with DNA 6-mer tokenization.
29
+
30
+ DNA regions must be wrapped in <dna>...</dna> tags to be tokenized as 6-mers.
31
+ Without tags, DNA sequences are tokenized as regular BPE text.
32
+
33
+ For pure-DNA input (no metadata tokens), pass auto_dna_tags=True to have
34
+ <dna>...</dna> tags added automatically when they are absent. Do NOT set
35
+ this if the input may contain BPE metadata such as species tags
36
+ (<fungi_species> etc.) — those must appear outside <dna>...</dna> and would
37
+ be incorrectly k-mer encoded if auto-wrapping fired.
38
+ """
39
+
40
+ model_input_names = ["input_ids", "attention_mask"]
41
+
42
def __init__(
    self,
    base_tokenizer_path: Optional[str] = None,
    k: int = 6,
    auto_dna_tags: bool = False,
    **kwargs
):
    """
    Build the hybrid tokenizer on top of the Qwen3 BPE tokenizer.

    Args:
        base_tokenizer_path: Accepted as a parameter.
            NOTE(review): this value is never read in this method — the base
            tokenizer is always loaded from the hard-coded
            "Qwen/Qwen3-4B-Base" identifier. Confirm whether that is
            intentional before relying on this argument.
        k: Length of the DNA k-mers used for tokens inside DNA regions.
        auto_dna_tags: Instance-wide default stored on ``self.auto_dna_tags``
            and consulted by ``encode`` when its own ``auto_dna_tags``
            argument is None.
        **kwargs: ``eos_token`` and ``pad_token`` are popped here; everything
            else is forwarded to ``PreTrainedTokenizer.__init__``.
    """
    self.k = k

    # Load base tokenizer (Qwen3-4B-Base)
    self._base_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Base")

    # Get base vocabulary
    self._base_vocab = self._base_tokenizer.get_vocab()
    self._base_vocab_size = len(self._base_vocab)

    # Initialize DNA vocabulary
    # (reads self.k and self._base_vocab_size, so both must already be set)
    self._init_dna_vocab()

    # Build combined vocabulary
    self._build_combined_vocab()

    # Set special tokens: an explicit kwarg wins, then (for padding only) the
    # base tokenizer's pad token, and finally the "<|endoftext|>" literal.
    self._eos_token = kwargs.pop('eos_token', None) or "<|endoftext|>"
    self._pad_token = kwargs.pop('pad_token', None) or self._base_tokenizer.pad_token or "<|endoftext|>"

    # Initialize parent class
    super().__init__(
        eos_token=self._eos_token,
        pad_token=self._pad_token,
        **kwargs
    )

    # DNA special tokens followed by the EOS and PAD tokens chosen above.
    self.special_tokens = self.dna_special_tokens + [self._eos_token, self._pad_token]
    self.auto_dna_tags = auto_dna_tags
77
+
78
+ def _init_dna_vocab(self):
79
+ """Initialize DNA vocabulary (special tokens + k-mers + padding for 128 alignment)."""
80
+ bases = ['A', 'T', 'C', 'G']
81
+
82
+ # DNA special tokens
83
+ self.dna_special_tokens = ["<dna>", "</dna>", "<oov>"]
84
+
85
+ # Generate all k-mer combinations (4^k = 4096 for k=6)
86
+ self.kmers = [''.join(kmer) for kmer in itertools.product(bases, repeat=self.k)]
87
+
88
+ # DNA tokens start after base vocabulary
89
+ self.dna_start_id = self._base_vocab_size
90
+
91
+ # All DNA tokens get new IDs (no reuse of base vocab IDs, even for
92
+ # overlapping tokens like CCCCCC — they have different semantics in
93
+ # DNA context vs BPE context, per Qiuyi's recommendation)
94
+ base_dna_tokens = self.dna_special_tokens + self.kmers
95
+
96
+ # Calculate padding for 128 alignment
97
+ total_vocab_unpadded = self._base_vocab_size + len(base_dna_tokens)
98
+ target_vocab_size = ((total_vocab_unpadded + 127) // 128) * 128
99
+ num_padding_tokens = target_vocab_size - total_vocab_unpadded
100
+
101
+ # Add unused padding tokens
102
+ self.padding_tokens = [f"<unused_{i}>" for i in range(num_padding_tokens)]
103
+
104
+ # Create DNA token mappings — all get sequential new IDs
105
+ self.dna_token_to_id = {}
106
+ self.dna_id_to_token = {}
107
+
108
+ current_id = self.dna_start_id
109
+ for token in base_dna_tokens:
110
+ self.dna_token_to_id[token] = current_id
111
+ self.dna_id_to_token[current_id] = token
112
+ current_id += 1
113
+
114
+ # Add padding tokens
115
+ for token in self.padding_tokens:
116
+ self.dna_token_to_id[token] = current_id
117
+ self.dna_id_to_token[current_id] = token
118
+ current_id += 1
119
+
120
+ self.dna_vocab_size = len(base_dna_tokens) + len(self.padding_tokens)
121
+
122
+ # Set DNA special token IDs
123
+ self.dna_begin_token_id = self.dna_token_to_id["<dna>"]
124
+ self.dna_end_token_id = self.dna_token_to_id["</dna>"]
125
+ self.oov_token_id = self.dna_token_to_id["<oov>"]
126
+
127
+ def _build_combined_vocab(self):
128
+ """Build combined vocabulary (base + DNA)."""
129
+ self._vocab = self._base_vocab.copy()
130
+
131
+ for token, token_id in self.dna_token_to_id.items():
132
+ if token not in self._vocab:
133
+ self._vocab[token] = token_id
134
+
135
+ self._id_to_token = {v: k for k, v in self._vocab.items()}
136
+ for token_id, token in self.dna_id_to_token.items():
137
+ if token_id not in self._id_to_token:
138
+ self._id_to_token[token_id] = token
139
+
140
+ @property
141
+ def vocab_size(self) -> int:
142
+ return max(self._vocab.values()) + 1
143
+
144
+ def get_vocab(self) -> Dict[str, int]:
145
+ return self._vocab.copy()
146
+
147
+ @property
148
+ def vocab(self) -> Dict[str, int]:
149
+ # Compatibility shim: fast tokenizers (PreTrainedTokenizerFast) expose
150
+ # `tokenizer.vocab` as a property; slow PreTrainedTokenizer subclasses
151
+ # like this one only expose `get_vocab()`. Some downstream tools
152
+ # (e.g. llama.cpp's convert_hf_to_gguf.py) read `.vocab` directly.
153
+ return self._vocab
154
+
155
+ def __len__(self):
156
+ # Override default (len(get_vocab())) because get_vocab() deduplicates
157
+ # CCCCCC which exists as both BPE (ID 91443) and DNA 6-mer (ID 154402).
158
+ return self.vocab_size
159
+
160
+ def _split_by_dna_tags(self, text: str) -> List[Tuple[str, bool]]:
161
+ segments = []
162
+ i = 0
163
+ n = len(text)
164
+
165
+ while i < n:
166
+ start_pos = text.find('<dna>', i)
167
+ end_pos = text.find('</dna>', i)
168
+
169
+ if start_pos == -1 and end_pos == -1:
170
+ remaining = text[i:]
171
+ if remaining:
172
+ segments.append((remaining, False))
173
+ break
174
+
175
+ if start_pos == -1 and end_pos != -1:
176
+ dna_region = text[i:end_pos + 6]
177
+ if dna_region:
178
+ segments.append((dna_region, True))
179
+ i = end_pos + 6
180
+ continue
181
+
182
+ if start_pos != -1 and end_pos == -1:
183
+ if i < start_pos:
184
+ normal_text = text[i:start_pos]
185
+ if normal_text:
186
+ segments.append((normal_text, False))
187
+ dna_region = text[start_pos:]
188
+ if dna_region:
189
+ segments.append((dna_region, True))
190
+ break
191
+
192
+ if start_pos < end_pos:
193
+ if i < start_pos:
194
+ normal_text = text[i:start_pos]
195
+ if normal_text:
196
+ segments.append((normal_text, False))
197
+ dna_region = text[start_pos:end_pos + 6]
198
+ if dna_region:
199
+ segments.append((dna_region, True))
200
+ i = end_pos + 6
201
+ else:
202
+ dna_region = text[i:end_pos + 6]
203
+ if dna_region:
204
+ segments.append((dna_region, True))
205
+ i = end_pos + 6
206
+
207
+ return segments
208
+
209
+ def _parse_dna_region(self, dna_region: str) -> Tuple[str, bool, bool]:
210
+ if dna_region == '<dna>':
211
+ return '', True, False
212
+ elif dna_region == '</dna>':
213
+ return '', False, True
214
+
215
+ has_start = dna_region.startswith('<dna>')
216
+ has_end = dna_region.endswith('</dna>')
217
+
218
+ content = dna_region
219
+ if has_start:
220
+ content = content[5:]
221
+ if has_end and content.endswith('</dna>'):
222
+ content = content[:-6]
223
+
224
+ return content.strip(), has_start, has_end
225
+
226
+ def _process_dna_sequence(self, dna_seq: str) -> Dict:
227
+ k = self.k
228
+ dna_seq = dna_seq.upper()
229
+
230
+ kmer_tokens = []
231
+ valid_bases = set('ATCG')
232
+
233
+ def is_valid_kmer(kmer):
234
+ return len(kmer) == k and all(base in valid_bases for base in kmer)
235
+
236
+ for i in range(0, len(dna_seq) - k + 1, k):
237
+ kmer = dna_seq[i:i+k]
238
+ if is_valid_kmer(kmer):
239
+ kmer_tokens.append(kmer)
240
+ else:
241
+ kmer_tokens.append("<oov>")
242
+
243
+ processed_length = len(kmer_tokens) * k
244
+ remaining = dna_seq[processed_length:]
245
+ padding_length = 0
246
+ valid_length = k
247
+
248
+ if remaining:
249
+ padding_needed = k - len(remaining)
250
+ # Right-pad with A: real bases occupy positions [0, valid_length).
251
+ # The hybrid BP loss supervises positions 0..valid_len-1 via
252
+ # pos_mask = (valid_len > pos)
253
+ # so padding must be at the END, not the start.
254
+ padded = remaining + 'A' * padding_needed
255
+
256
+ if is_valid_kmer(padded):
257
+ kmer_tokens.append(padded)
258
+ else:
259
+ kmer_tokens.append("<oov>")
260
+
261
+ padding_length = padding_needed
262
+ valid_length = len(remaining)
263
+
264
+ return {
265
+ "kmer_tokens": kmer_tokens,
266
+ "padding_length": padding_length,
267
+ "valid_length": valid_length,
268
+ }
269
+
270
+ def _tokenize(self, text: str, **kwargs) -> List[str]:
271
+ return list(text)
272
+
273
+ def _convert_token_to_id(self, token: str) -> int:
274
+ if token in self.dna_token_to_id:
275
+ return self.dna_token_to_id[token]
276
+ return self._base_vocab.get(token, self._base_tokenizer.unk_token_id or 0)
277
+
278
+ def _convert_id_to_token(self, index: int) -> str:
279
+ if index in self.dna_id_to_token:
280
+ return self.dna_id_to_token[index]
281
+ return self._id_to_token.get(index, "<oov>")
282
+
283
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
284
+ return "".join(tokens)
285
+
286
+ def encode(
287
+ self,
288
+ text: str,
289
+ add_special_tokens: bool = False,
290
+ return_token_mask: bool = False,
291
+ auto_dna_tags: Optional[bool] = None,
292
+ **kwargs
293
+ ) -> Union[List[int], Tuple[List[int], List[int]]]:
294
+ use_auto = self.auto_dna_tags if auto_dna_tags is None else auto_dna_tags
295
+ if use_auto and '<dna>' not in text:
296
+ text = f'<dna>{text}</dna>'
297
+
298
+ segments = self._split_by_dna_tags(text)
299
+
300
+ token_ids = []
301
+ token_mask = [] if return_token_mask else None
302
+
303
+ for segment_content, is_dna in segments:
304
+ if is_dna:
305
+ dna_content, has_start, has_end = self._parse_dna_region(segment_content)
306
+
307
+ if has_start:
308
+ token_ids.append(self.dna_begin_token_id)
309
+ if return_token_mask:
310
+ token_mask.append(0)
311
+
312
+ if dna_content:
313
+ result = self._process_dna_sequence(dna_content)
314
+
315
+ for idx, kmer in enumerate(result["kmer_tokens"]):
316
+ token_id = self.dna_token_to_id.get(kmer, self.oov_token_id)
317
+ token_ids.append(token_id)
318
+
319
+ if return_token_mask:
320
+ if kmer == "<oov>":
321
+ token_mask.append(0)
322
+ elif idx == len(result["kmer_tokens"]) - 1 and result["padding_length"] > 0:
323
+ token_mask.append(result["valid_length"])
324
+ else:
325
+ token_mask.append(self.k)
326
+
327
+ if has_end:
328
+ token_ids.append(self.dna_end_token_id)
329
+ if return_token_mask:
330
+ token_mask.append(0)
331
+ else:
332
+ base_ids = self._base_tokenizer.encode(
333
+ segment_content,
334
+ add_special_tokens=add_special_tokens
335
+ )
336
+ token_ids.extend(base_ids)
337
+ if return_token_mask:
338
+ token_mask.extend([-1] * len(base_ids))
339
+
340
+ # Do NOT append EOS when add_special_tokens=True. Qwen3 doesn't add
341
+ # BOS/EOS either, and appending EOS here breaks lighteval's
342
+ # tok_encode_pair: it relies on
343
+ # len(encode(ctx)) + len(encode(answer)) == len(encode(ctx + answer))
344
+ # which the extra EOS violates by shifting the split by 1.
345
+
346
+ if return_token_mask:
347
+ return token_ids, token_mask
348
+ return token_ids
349
+
350
def decode(
    self,
    token_ids: Union[int, List[int]],
    skip_special_tokens: bool = False,
    **kwargs
) -> str:
    """Convert token ids back into a string, handling DNA and text runs.

    The id sequence is walked left to right and split into three kinds of
    runs:

    * a ``<dna>`` begin token followed by ids up to (and including) the
      matching ``</dna>`` end token; ids inside the span that are not in
      ``self.dna_id_to_token`` are skipped,
    * a DNA-vocabulary id that appears without a ``<dna>`` wrapper (e.g.
      when only the generated continuation is decoded),
    * everything else, which is handed to ``self._base_tokenizer.decode``
      as one contiguous run.

    Args:
        token_ids: A single id, a list of ids, or any object exposing
            ``tolist()`` (converted to a Python list/int first).
        skip_special_tokens: When true, EOS/PAD ids are removed up front,
            the ``<dna>``/``</dna>`` tags are not emitted, and the DNA
            special ids (begin, end, OOV) are dropped both inside and
            outside a wrapped span. The flag is also forwarded to the base
            tokenizer for text runs.
        **kwargs: Accepted for interface compatibility; not used.

    Returns:
        The concatenation of all decoded runs.
    """
    if hasattr(token_ids, 'tolist'):
        token_ids = token_ids.tolist()
    if isinstance(token_ids, int):
        token_ids = [token_ids]

    if skip_special_tokens:
        special_ids = {self.eos_token_id, self.pad_token_id}
        token_ids = [tid for tid in token_ids if tid not in special_ids]

    dna_map = self.dna_id_to_token
    begin_id = self.dna_begin_token_id
    end_id = self.dna_end_token_id
    # Structural DNA ids that must never leak into the output when the
    # caller asked for special tokens to be skipped.
    dna_special_ids = {begin_id, end_id, self.oov_token_id}

    parts = []
    total = len(token_ids)
    i = 0

    while i < total:
        tid = token_ids[i]

        if tid == begin_id:
            # Wrapped span: gather k-mers until the closing tag or the end
            # of the sequence (an unterminated span is tolerated).
            i += 1
            dna_tokens = []
            while i < total and token_ids[i] != end_id:
                cur = token_ids[i]
                if cur in dna_map and not (skip_special_tokens and cur in dna_special_ids):
                    dna_tokens.append(dna_map[cur])
                i += 1

            # The inner loop stops either on </dna> or at the end of input.
            closed = i < total
            if closed:
                i += 1  # consume </dna> here so no later branch sees it

            dna_seq = ''.join(dna_tokens)
            if skip_special_tokens:
                parts.append(dna_seq)
            else:
                parts.append(f"<dna>{dna_seq}")
                if closed:
                    parts.append("</dna>")

        elif tid in dna_map:
            # Bare DNA id without a <dna> wrapper. K-mers are content and
            # are always decoded; only structural ids honour the skip flag.
            if not (skip_special_tokens and tid in dna_special_ids):
                parts.append(dna_map[tid])
            i += 1

        else:
            # Plain-text run: extend until the next DNA id or <dna> tag so
            # the base tokenizer decodes the run as a whole.
            start = i
            while i < total and token_ids[i] != begin_id and token_ids[i] not in dna_map:
                i += 1
            text_ids = list(token_ids[start:i])
            parts.append(
                self._base_tokenizer.decode(text_ids, skip_special_tokens=skip_special_tokens)
            )

    return ''.join(parts)
416
+
417
def batch_decode(
    self,
    sequences: Union[List[int], List[List[int]], "torch.Tensor"],
    skip_special_tokens: bool = False,
    **kwargs
) -> List[str]:
    """Decode every element of ``sequences`` with :meth:`decode`.

    Args:
        sequences: An iterable whose items are id sequences, objects with a
            ``tolist()`` method, or plain ints. A flat list of ints is
            decoded element by element, one string per id.
        skip_special_tokens: Forwarded unchanged to ``decode``.
        **kwargs: Forwarded unchanged to ``decode``.

    Returns:
        One decoded string per item of ``sequences``, in order.
    """
    decoded = []
    for seq in sequences:
        if hasattr(seq, 'tolist'):
            seq = seq.tolist()
        elif not isinstance(seq, int):
            # Materialise tuples/iterators; a bare int is passed through
            # as-is because ``decode`` accepts a single id and ``list(int)``
            # would raise TypeError.
            seq = list(seq)
        decoded.append(
            self.decode(seq, skip_special_tokens=skip_special_tokens, **kwargs)
        )
    return decoded
431
+
432
def __call__(
    self,
    text: Union[str, List[str]],
    add_special_tokens: bool = False,
    padding: bool = False,
    truncation: bool = False,
    max_length: Optional[int] = None,
    return_tensors: Optional[str] = None,
    return_token_mask: bool = False,
    auto_dna_tags: Optional[bool] = None,
    **kwargs
) -> Dict[str, Any]:
    """Tokenize one string or a list of strings into model inputs.

    Args:
        text: A single string, or a list of strings (treated as a batch).
        add_special_tokens: Not supported; a truthy value triggers a
            ``UserWarning`` and is then treated as ``False``.
        padding: When truthy, right-pad every sequence with
            ``self.pad_token_id`` to the longest sequence in the batch,
            capped at ``max_length`` when that is set (longer sequences are
            cut to the cap).
        truncation: When truthy and ``max_length`` is a positive int, cut
            every sequence (and its token mask) to ``max_length`` ids,
            whether or not padding is requested.
        max_length: Upper bound used by ``truncation`` and ``padding``.
            ``None`` or a non-positive value means "no bound".
        return_tensors: ``"pt"`` converts the outputs to ``torch`` tensors
            here (a single, non-batch input gets a leading batch axis). The
            value is also passed to ``BatchEncoding`` as ``tensor_type``.
        return_token_mask: Also return the per-token mask produced by
            ``encode``; padded positions are filled with ``-2``.
        auto_dna_tags: Forwarded to ``encode``.
        **kwargs: Accepted for interface compatibility; not used.

    Returns:
        A ``BatchEncoding`` with ``input_ids``, ``attention_mask`` and,
        if requested, ``token_mask``.
    """
    if add_special_tokens:
        warnings.warn(
            "HybridTokenizer does not support add_special_tokens=True, ignoring.",
            UserWarning
        )
        add_special_tokens = False

    is_batch = isinstance(text, list)
    texts = text if is_batch else [text]

    all_ids = []
    all_masks = [] if return_token_mask else None

    for t in texts:
        if return_token_mask:
            ids, mask = self.encode(t, add_special_tokens=add_special_tokens, return_token_mask=True, auto_dna_tags=auto_dna_tags)
            all_ids.append(ids)
            all_masks.append(mask)
        else:
            ids = self.encode(t, add_special_tokens=add_special_tokens, return_token_mask=False, auto_dna_tags=auto_dna_tags)
            all_ids.append(ids)

    has_limit = max_length is not None and max_length > 0

    # Honour ``truncation`` independently of padding; previously the flag
    # was accepted but never read, so unpadded calls were never truncated.
    if truncation and has_limit:
        all_ids = [ids[:max_length] for ids in all_ids]
        if return_token_mask:
            all_masks = [mask[:max_length] for mask in all_masks]

    if padding:
        # ``default=0`` keeps an empty batch from raising in ``max``.
        max_len = max((len(ids) for ids in all_ids), default=0)
        if has_limit:
            max_len = min(max_len, max_length)

        padded_ids = []
        attention_masks = []
        padded_token_masks = [] if return_token_mask else None

        for idx, ids in enumerate(all_ids):
            pad_len = max_len - len(ids)

            if pad_len > 0:
                attn = [1] * len(ids) + [0] * pad_len
                ids = ids + [self.pad_token_id] * pad_len
                if return_token_mask:
                    mask = all_masks[idx] + [-2] * pad_len
            else:
                # Sequence is at or above the target length: cut to fit.
                ids = ids[:max_len]
                attn = [1] * max_len
                if return_token_mask:
                    mask = all_masks[idx][:max_len]

            padded_ids.append(ids)
            attention_masks.append(attn)
            if return_token_mask:
                padded_token_masks.append(mask)

        all_ids = padded_ids
        all_masks = padded_token_masks
    else:
        attention_masks = [[1] * len(ids) for ids in all_ids]

    result = {
        "input_ids": all_ids if is_batch else all_ids[0],
        "attention_mask": attention_masks if is_batch else attention_masks[0],
    }

    if return_token_mask:
        result["token_mask"] = all_masks if is_batch else all_masks[0]

    if return_tensors == "pt":
        import torch
        if is_batch:
            result["input_ids"] = torch.tensor(result["input_ids"])
            result["attention_mask"] = torch.tensor(result["attention_mask"])
            if return_token_mask:
                result["token_mask"] = torch.tensor(result["token_mask"])
        else:
            # Single input: wrap in a list so the tensors carry a batch axis.
            result["input_ids"] = torch.tensor([result["input_ids"]])
            result["attention_mask"] = torch.tensor([result["attention_mask"]])
            if return_token_mask:
                result["token_mask"] = torch.tensor([result["token_mask"]])

    return BatchEncoding(result, tensor_type=return_tensors)
521
+
522
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """Dump ``self._vocab`` as pretty-printed JSON into ``save_directory``.

    Args:
        save_directory: Existing directory that receives the file.
        filename_prefix: Optional prefix; when given, the file is named
            ``<prefix>-vocab.json`` instead of ``vocab.json``.

    Returns:
        A one-element tuple holding the path of the written file.
    """
    if filename_prefix:
        file_name = filename_prefix + "-" + "vocab.json"
    else:
        file_name = "vocab.json"
    target_path = os.path.join(save_directory, file_name)

    # Non-ASCII vocabulary entries are written verbatim as UTF-8.
    with open(target_path, "w", encoding="utf-8") as handle:
        json.dump(self._vocab, handle, ensure_ascii=False, indent=2)

    return (target_path,)
532
+
533
def save_pretrained(self, save_directory: str, **kwargs):
    """Write everything needed to reload this tokenizer into a directory.

    Steps, in order:

    1. create ``save_directory`` if needed and let the base tokenizer save
       its own files there,
    2. write ``dna_config.json`` with the DNA-specific settings,
    3. merge the class name, ``auto_map`` entry, ``k`` and
       ``auto_dna_tags`` into ``tokenizer_config.json`` (created if the
       base tokenizer did not write one),
    4. copy this module next to the config as ``tokenizer.py``.

    Args:
        save_directory: Target directory.
        **kwargs: Accepted for interface compatibility; not used.

    Returns:
        A one-element tuple containing ``save_directory``.
    """
    os.makedirs(save_directory, exist_ok=True)

    # Save base tokenizer files
    self._base_tokenizer.save_pretrained(save_directory)

    # Save DNA config
    dna_config = {
        "k": self.k,
        "dna_start_id": self.dna_start_id,
        "dna_vocab_size": self.dna_vocab_size,
        "dna_special_tokens": self.dna_special_tokens,
        "auto_dna_tags": self.auto_dna_tags,
    }

    dna_config_path = os.path.join(save_directory, "dna_config.json")
    with open(dna_config_path, "w", encoding="utf-8") as f:
        json.dump(dna_config, f, indent=2)

    # Update tokenizer_config.json with auto_map
    config_path = os.path.join(save_directory, "tokenizer_config.json")

    if os.path.exists(config_path):
        # Read as UTF-8 explicitly: the file is written below as UTF-8 with
        # ensure_ascii=False, so the platform default encoding is not safe.
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
    else:
        config = {}

    config.update({
        "tokenizer_class": "HybridDNATokenizer",
        "auto_map": {
            "AutoTokenizer": ["tokenizer.HybridDNATokenizer", None]
        },
        "k": self.k,
        "auto_dna_tags": self.auto_dna_tags,
    })

    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2, ensure_ascii=False)

    # Copy this tokenizer.py to save directory
    import shutil
    src_py = os.path.abspath(__file__)
    dst_py = os.path.abspath(os.path.join(save_directory, "tokenizer.py"))
    # Compare the real files, not the path strings: a relative or symlinked
    # save_directory can point at the module's own location, and copying a
    # file onto itself raises shutil.SameFileError.
    already_in_place = os.path.exists(dst_py) and os.path.samefile(src_py, dst_py)
    if os.path.exists(src_py) and not already_in_place:
        shutil.copy2(src_py, dst_py)

    return (save_directory,)
581
+
582
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
    """Build a tokenizer from a directory written by ``save_pretrained``.

    ``k`` and ``auto_dna_tags`` are read from ``dna_config.json`` when that
    file exists; otherwise from ``tokenizer_config.json``; otherwise the
    defaults ``k=6`` and ``auto_dna_tags=False`` apply. Values passed
    explicitly as keyword arguments take precedence over the stored ones.

    Args:
        pretrained_model_name_or_path: Location handed to the constructor
            as ``base_tokenizer_path``; config files are looked up under it
            as a local directory.
        **kwargs: Extra constructor arguments. ``k`` and ``auto_dna_tags``
            given here override the values found on disk.

    Returns:
        A new instance of ``cls``.
    """
    k = 6
    auto_dna_tags = False

    # First existing file wins; dna_config.json has priority.
    for file_name in ("dna_config.json", "tokenizer_config.json"):
        candidate = os.path.join(pretrained_model_name_or_path, file_name)
        if os.path.exists(candidate):
            with open(candidate, "r", encoding="utf-8") as f:
                stored = json.load(f)
            k = stored.get("k", 6)
            auto_dna_tags = stored.get("auto_dna_tags", False)
            break

    # Pop caller overrides so they are not passed twice to the constructor
    # (which would raise "got multiple values for keyword argument").
    k = kwargs.pop("k", k)
    auto_dna_tags = kwargs.pop("auto_dna_tags", auto_dna_tags)

    return cls(base_tokenizer_path=pretrained_model_name_or_path, k=k, auto_dna_tags=auto_dna_tags, **kwargs)
tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "local_files_only": false,
25
+ "model_max_length": 131072,
26
+ "pad_token": "<|endoftext|>",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "HybridDNATokenizer",
29
+ "unk_token": null,
30
+ "auto_map": {
31
+ "AutoTokenizer": [
32
+ "tokenizer.HybridDNATokenizer",
33
+ null
34
+ ]
35
+ },
36
+ "k": 6,
37
+ "auto_dna_tags": false
38
+ }