Add files using upload-large-folder tool

Browse files

Files changed (11) hide show

.gitattributes +3 -0
README.md +182 -0
chat_template.jinja +85 -0
dna_config.json +11 -0
figures/carbon-8b-banner.png +3 -0
genai_config.json +50 -0
model.onnx +3 -0
model.onnx.data +3 -0
tokenizer.json +3 -0
tokenizer.py +601 -0
tokenizer_config.json +38 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+figures/carbon-8b-banner.png filter=lfs diff=lfs merge=lfs -text
+model.onnx.data filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,182 @@

+---
+library_name: onnxruntime-genai
+license: apache-2.0
+language:
+  - dna
+tags:
+  - dna
+  - genomic
+  - onnx
+  - onnxruntime
+  - onnxruntime-genai
+  - transformers
+base_model:
+  - HuggingFaceBio/Carbon-8B
+---
+![](figures/carbon-8b-banner.png)
+<p align="center">
+  <a href="https://huggingface.co/HuggingFaceBio/Carbon-3B/blob/main/tech-report.pdf"><b>Technical Report</b> 🧬</a>
+</p>
+# Carbon-8B
+A larger, higher-capacity member of the **Carbon** family of generative DNA foundation models.
+Carbon-8B is the 8B-parameter sibling of [Carbon-3B](https://huggingface.co/HuggingFaceBio/Carbon-3B). It is intended for users who can afford additional inference cost in exchange for stronger downstream performance. For the full design rationale, tokenizer specification, evaluation protocol, and usage details, please refer to the **[Carbon-3B model card](https://huggingface.co/HuggingFaceBio/Carbon-3B)** and the Carbon technical report — this card focuses only on what is specific to Carbon-8B.
+- Technical report: https://github.com/huggingface/carbon/blob/main/tech-report.pdf
+- Demo: https://huggingface.co/spaces/HuggingFaceBio/carbon-demo
+## Model Summary
+- **8B-parameter decoder-only autoregressive model** trained on DNA and RNA sequences with a primary focus on eukaryotes.
+- **Same hybrid tokenizer** as Carbon-3B (non-overlapping 6-mer for DNA + Qwen3 BPE for English text). Each DNA token encodes 6 bp. Wrap DNA inputs with `<dna>...</dna>` — see the Carbon-3B card for tokenizer details and usage caveats.
+- **Native context: 32,768 tokens (≈ 196 kbp).** Carbon-8B was extended with a long-context decay stage from an 8 k-context base, so it natively handles 32 k tokens. You can apply YaRN at 4× to extrapolate up to 128 k tokens (≈ 786 kbp).
+- Released as a standard Hugging Face causal LM (`LlamaForCausalLM`).
+## How to use
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+repo = "HuggingFaceBio/Carbon-8B"
+tok = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    repo, dtype=torch.bfloat16,
+).cuda().eval()
+prompt = "<dna>ATGCGCTAGCTACGATCGATCGTAGCTAGCTAGCTAGCTACG"   # multiple of 6 bp
+inputs = tok(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
+out = model.generate(**inputs, max_new_tokens=64, do_sample=False)
+print(tok.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
+```
+### Base-pair-level generation and scoring
+The `fns` branch loads custom modeling code for Factorized Nucleotide Supervision (FNS). Carbon still uses its efficient 6-mer tokenizer, but during generation each selected 6-mer is assembled from six per-position nucleotide distributions, giving base-pair-level control over decoded DNA. Use this branch when you need exact base-pair counts, per-position masks, or temperature/top-p behavior applied at the nucleotide level rather than over the 4,096-way 6-mer distribution:
+```py
+import math
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_id = "HuggingFaceBio/Carbon-8B"
+revision = "fns"
+device = "cuda"
+tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    revision=revision,
+    trust_remote_code=True,
+    dtype=torch.bfloat16,
+).to(device).eval()
+context = "ATGCGCTAGCTACGATCGATCGTAGCTAGCTAGCTAGCTACG"
+n_bp = 60
+inputs = tokenizer(f"<dna>{context}", return_tensors="pt", add_special_tokens=False).to(device)
+with torch.no_grad():
+    output_ids = model.generate(
+        **inputs,
+        max_new_tokens=math.ceil(n_bp / tokenizer.k),
+        do_sample=False,
+        pad_token_id=tokenizer.eos_token_id,
+    )
+generated_ids = output_ids[0, inputs.input_ids.shape[1]:]
+generated_dna = tokenizer.decode(generated_ids, skip_special_tokens=True)[:n_bp]
+print(generated_dna)
+```
+The same per-base marginals are exposed through `score_sequence()`, which returns the probability assigned to the observed base at each position. Taking the mean log probability gives a base-pair-level sequence score, where higher values indicate higher model likelihood:
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_id = "HuggingFaceBio/Carbon-8B"
+revision = "fns"
+device = "cuda"
+tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    revision=revision,
+    trust_remote_code=True,
+    dtype=torch.bfloat16,
+).to(device).eval()
+reference = "GGGCTATAAAGGCCATCGATCGATCGATCGATCGATCGATCG"
+perturbed = "GGGCGCGCGCGGCCATCGATCGATCGATCGATCGATCGATCG"
+with torch.no_grad():
+    bp_probs, actual_probs = model.score_sequence([reference, perturbed])
+scores = [torch.log(p.clamp_min(1e-12)).mean().item() for p in actual_probs]
+print(f"reference mean bp logp: {scores[0]:.4f}")
+print(f"perturbed mean bp logp: {scores[1]:.4f}")
+print(f"reference preferred: {scores[0] > scores[1]}")
+```
+## Training
+Carbon-8B follows the same pre-training recipe as Carbon-3B on the **[`HuggingFaceBio/carbon-pretraining-corpus`](https://huggingface.co/datasets/HuggingFaceBio/carbon-pretraining-corpus)** with the identical data mixture on 1T DNA 6-mer tokens. The main recipe ingredients:
+- **Learning-rate schedule: cosine** (instead of the WSD schedule used for Carbon-3B).
+- **Loss schedule:** after 100B tokens the loss switches from cross-entropy to FNS loss until the end of training.
+- **Pre-training:** 1T 6-mer tokens (≈ 6T DNA base pairs) with a global batch size of 512 and a sequence length of 8,192 (≈ 4.19 M tokens per step), on 32 nodes (TP=4, DP=64) in bfloat16 with AdamW. The training mixture is kept unchanged through the decay phase: 70% Generator eukaryote data (with metadata, with dropout), 16% mRNA, 4% splice mRNA, and 10% prokaryote data.
+- **Long-context extension stage.** After pre-training, Carbon-8B undergoes a long-context decay phase that extends the native context from 8,192 to 32,768 tokens (≈ 196 kbp). You can apply YaRN at 4× to further extrapolate to 128 k tokens (≈ 786 kbp).
+Training infrastructure, framework ([Megatron-LM-Carbon](https://github.com/huggingface/Megatron-LM-Carbon)), and conversion path ([Megatron-Bridge](https://github.com/NVIDIA/Megatron-Bridge)) are identical to Carbon-3B.
+## Evaluation
+All evaluations are zero-shot and use the [public Carbon evaluation pipeline](https://github.com/huggingface/carbon/tree/main/evaluation). See the [Carbon-3B card](https://huggingface.co/HuggingFaceBio/Carbon-3B#evaluation) for the full task suite, metrics, and methodology.
+### Downstream tasks
+| Category | Metric (%) | Carbon 3B | Carbon 8B | Δ |
+|---|---|---|---|---|
+| Generative | Sequence Recovery eukaryote | 61.54 | **64.05** | +2.51 |
+| Variant effect prediction | BRCA2 | 84.63 | **85.72** | +1.09 |
+| | TraitGym Mendelian | 33.65 | **36.43** | +2.78 |
+| | ClinVar coding (24 kb) | 92.89 | **93.11** | +0.22 |
+| | ClinVar non-coding (24 kb) | 91.14 | **91.63** | +0.49 |
+| Perturbation | Nucleotide triplet-expansion | 85.20 | **89.05** | +3.85 |
+| | Synonymous codon replacement | 88.89 | **91.46** | +2.57 |
+| Long-context retrieval | Genomic-NIAH @ 393 kbp | 79.00 | **86.00** | +7.00 |
+### Genomic-NIAH (long-context retrieval)
+Genomic-NIAH measures how well a DNA model actually *uses* its long context. See the [`HuggingFaceBio/genomic-niah` dataset card](https://huggingface.co/datasets/HuggingFaceBio/genomic-niah) for the benchmark design.
+| Context length         | Carbon 3B (native / YaRN 4×) | Carbon 8B (native / YaRN 4×) | Evo2 7B |
+|------------------------|------------------------------|------------------------------|---------|
+| 16 k tokens (98 kbp)   | 0.73 / 0.91                  | 0.78 / 0.89                  | **0.97**    |
+| 32 k tokens (196 kbp)  | 0.55 / 0.90                  | 0.69 / 0.87                  | **0.95**    |
+| 64 k tokens (393 kbp)  | — / 0.79                     | — / **0.86**                     | 0.80    |
+| 128 k tokens (786 kbp) | — / 0.27                     | — / **0.65**                     | 0.53 |
+Carbon-8B retrieves reliably up to its 32 k native boundary; **YaRN 4×** recovers most of the loss at the 32 k → 64 k boundary and extends usable retrieval to ≈ 786 kbp.
+## Intended use
+Generative modelling, variant-effect prediction, motif-perturbation analysis, and long-context retrieval on DNA sequences. For faster inference at shorter contexts, use **Carbon-3B**.
+⚠️ **Genetic data is highly sensitive.** Depending on how this model is used (local download, inference API/endpoints, third-party inference providers, Spaces demos or others), input and output data may be processed or handled differently by different providers or space owners. Please make sure you understand and agree with how your data is handled before using the model.
+## License
+Apache 2.0.
+## Acknowledgements
+Carbon is a joint collaboration between the research teams at Hugging Face, Zhongguancun Academy, and TIGEM/University of Naples “Federico II”.

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,85 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

dna_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "k": 6,
+  "dna_start_id": 151669,
+  "dna_vocab_size": 4107,
+  "dna_special_tokens": [
+    "<dna>",
+    "</dna>",
+    "<oov>"
+  ],
+  "auto_dna_tags": false
+}

figures/carbon-8b-banner.png ADDED Viewed

Git LFS Details

SHA256: 4673b72dfc0241f4c856d90ef3c4071e9d88c3ec6d803486d0f115ff1cb414d5
Pointer size: 132 Bytes
Size of remote file: 1.28 MB

genai_config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+    "model": {
+        "bos_token_id": 1,
+        "context_length": 32768,
+        "decoder": {
+            "session_options": {
+                "log_id": "onnxruntime-genai",
+                "provider_options": []
+            },
+            "filename": "model.onnx",
+            "head_size": 128,
+            "hidden_size": 4096,
+            "inputs": {
+                "input_ids": "input_ids",
+                "attention_mask": "attention_mask",
+                "position_ids": "position_ids",
+                "past_key_names": "past_key_values.%d.key",
+                "past_value_names": "past_key_values.%d.value"
+            },
+            "outputs": {
+                "logits": "logits",
+                "present_key_names": "present.%d.key",
+                "present_value_names": "present.%d.value"
+            },
+            "num_attention_heads": 32,
+            "num_hidden_layers": 32,
+            "num_key_value_heads": 8
+        },
+        "eos_token_id": 151643,
+        "pad_token_id": 151643,
+        "type": "llama",
+        "vocab_size": 155776
+    },
+    "search": {
+        "diversity_penalty": 0.0,
+        "do_sample": false,
+        "early_stopping": true,
+        "length_penalty": 1.0,
+        "max_length": 32768,
+        "min_length": 0,
+        "no_repeat_ngram_size": 0,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "past_present_share_buffer": false,
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_k": 50,
+        "top_p": 1.0
+    }
+}

model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4edd74d06b828140d6149be21ca1f3630beec5972bf30e1063bbc115f1dc8fdf
+size 748183

model.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7bbe623ec31ef77b8be3f94eb6151b36aa4a1e8030f5a574747aa21278b8fb84
+size 16519856128

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650

tokenizer.py ADDED Viewed

	@@ -0,0 +1,601 @@

+"""
+HybridDNATokenizer: Combines Qwen3 BPE tokenization with DNA 6-mer tokenization.
+DNA sequences wrapped in <dna>...</dna> tags are tokenized as 6-mers.
+All other text uses Qwen3's BPE tokenization.
+Supports token_mask for Factorized Nucleotide Supervision (FNS):
+  -2: padding token
+  -1: text token (BPE)
+   0: DNA special token (<dna>, </dna>, <oov>)
+  1-5: partial 6-mer token — valid_length real bases at positions [0, valid_length),
+       right-padded with 'A' at positions [valid_length, k) so loss can supervise
+       positions 0..valid_len-1 via pos_mask = (valid_len > pos)
+   6: full 6-mer
+"""
+import os
+import json
+import warnings
+import itertools
+from typing import List, Optional, Tuple, Dict, Union, Any
+from transformers import PreTrainedTokenizer, AutoTokenizer, BatchEncoding
+class HybridDNATokenizer(PreTrainedTokenizer):
+    """
+    Hybrid tokenizer combining Qwen3 BPE with DNA 6-mer tokenization.
+    DNA regions must be wrapped in <dna>...</dna> tags to be tokenized as 6-mers.
+    Without tags, DNA sequences are tokenized as regular BPE text.
+    For pure-DNA input (no metadata tokens), pass auto_dna_tags=True to have
+    <dna>...</dna> tags added automatically when they are absent.  Do NOT set
+    this if the input may contain BPE metadata such as species tags
+    (<fungi_species> etc.) — those must appear outside <dna>...</dna> and would
+    be incorrectly k-mer encoded if auto-wrapping fired.
+    """
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        base_tokenizer_path: Optional[str] = None,
+        k: int = 6,
+        auto_dna_tags: bool = False,
+        **kwargs
+    ):
+        """Build the hybrid tokenizer on top of the Qwen3 base tokenizer.
+
+        Args:
+            base_tokenizer_path: Accepted for interface compatibility.
+                NOTE(review): this constructor never reads it; the base
+                tokenizer is always loaded from "Qwen/Qwen3-4B-Base" — confirm
+                whether that is intentional.
+            k: Length of the DNA k-mers (6 by default).
+            auto_dna_tags: Default for the ``auto_dna_tags`` option of
+                ``encode``; stored on the instance.
+            **kwargs: ``eos_token`` and ``pad_token`` are consumed here; every
+                remaining keyword is forwarded to ``PreTrainedTokenizer``.
+        """
+        self.k = k
+        # Load base tokenizer (Qwen3-4B-Base)
+        self._base_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Base")
+        # Get base vocabulary
+        self._base_vocab = self._base_tokenizer.get_vocab()
+        self._base_vocab_size = len(self._base_vocab)
+        # Initialize DNA vocabulary
+        self._init_dna_vocab()
+        # Build combined vocabulary
+        self._build_combined_vocab()
+        # Set special tokens; both fall back to "<|endoftext|>" when neither
+        # the caller nor (for pad) the base tokenizer provides one.
+        self._eos_token = kwargs.pop('eos_token', None) or "<|endoftext|>"
+        self._pad_token = kwargs.pop('pad_token', None) or self._base_tokenizer.pad_token or "<|endoftext|>"
+        # Initialize parent class. The vocabularies above are built first —
+        # presumably because the parent constructor may query them; TODO confirm.
+        super().__init__(
+            eos_token=self._eos_token,
+            pad_token=self._pad_token,
+            **kwargs
+        )
+        # DNA special tokens followed by the EOS and pad tokens chosen above.
+        self.special_tokens = self.dna_special_tokens + [self._eos_token, self._pad_token]
+        self.auto_dna_tags = auto_dna_tags
+    def _init_dna_vocab(self):
+        """Initialize DNA vocabulary (special tokens + k-mers + padding for 128 alignment)."""
+        bases = ['A', 'T', 'C', 'G']
+        # DNA special tokens
+        self.dna_special_tokens = ["<dna>", "</dna>", "<oov>"]
+        # Generate all k-mer combinations (4^k = 4096 for k=6)
+        self.kmers = [''.join(kmer) for kmer in itertools.product(bases, repeat=self.k)]
+        # DNA tokens start after base vocabulary
+        self.dna_start_id = self._base_vocab_size
+        # All DNA tokens get new IDs (no reuse of base vocab IDs, even for
+        # overlapping tokens like CCCCCC — they have different semantics in
+        # DNA context vs BPE context, per Qiuyi's recommendation)
+        base_dna_tokens = self.dna_special_tokens + self.kmers
+        # Calculate padding for 128 alignment
+        total_vocab_unpadded = self._base_vocab_size + len(base_dna_tokens)
+        target_vocab_size = ((total_vocab_unpadded + 127) // 128) * 128
+        num_padding_tokens = target_vocab_size - total_vocab_unpadded
+        # Add unused padding tokens
+        self.padding_tokens = [f"<unused_{i}>" for i in range(num_padding_tokens)]
+        # Create DNA token mappings — all get sequential new IDs
+        self.dna_token_to_id = {}
+        self.dna_id_to_token = {}
+        current_id = self.dna_start_id
+        for token in base_dna_tokens:
+            self.dna_token_to_id[token] = current_id
+            self.dna_id_to_token[current_id] = token
+            current_id += 1
+        # Add padding tokens
+        for token in self.padding_tokens:
+            self.dna_token_to_id[token] = current_id
+            self.dna_id_to_token[current_id] = token
+            current_id += 1
+        self.dna_vocab_size = len(base_dna_tokens) + len(self.padding_tokens)
+        # Set DNA special token IDs
+        self.dna_begin_token_id = self.dna_token_to_id["<dna>"]
+        self.dna_end_token_id = self.dna_token_to_id["</dna>"]
+        self.oov_token_id = self.dna_token_to_id["<oov>"]
+    def _build_combined_vocab(self):
+        """Build combined vocabulary (base + DNA)."""
+        self._vocab = self._base_vocab.copy()
+        for token, token_id in self.dna_token_to_id.items():
+            if token not in self._vocab:
+                self._vocab[token] = token_id
+        self._id_to_token = {v: k for k, v in self._vocab.items()}
+        for token_id, token in self.dna_id_to_token.items():
+            if token_id not in self._id_to_token:
+                self._id_to_token[token_id] = token
+    @property
+    def vocab_size(self) -> int:
+        return max(self._vocab.values()) + 1
+    def get_vocab(self) -> Dict[str, int]:
+        return self._vocab.copy()
+    @property
+    def vocab(self) -> Dict[str, int]:
+        """Return the combined token-to-ID mapping itself (not a copy)."""
+        # Compatibility shim: fast tokenizers (PreTrainedTokenizerFast) expose
+        # `tokenizer.vocab` as a property; slow PreTrainedTokenizer subclasses
+        # like this one only expose `get_vocab()`. Some downstream tools
+        # (e.g. llama.cpp's convert_hf_to_gguf.py) read `.vocab` directly.
+        return self._vocab
+    def __len__(self):
+        """Return ``vocab_size`` (largest token ID + 1), not the number of distinct token strings."""
+        # Override default (len(get_vocab())) because get_vocab() deduplicates
+        # CCCCCC which exists as both BPE (ID 91443) and DNA 6-mer (ID 154402).
+        return self.vocab_size
+    def _split_by_dna_tags(self, text: str) -> List[Tuple[str, bool]]:
+        segments = []
+        i = 0
+        n = len(text)
+        while i < n:
+            start_pos = text.find('<dna>', i)
+            end_pos = text.find('</dna>', i)
+            if start_pos == -1 and end_pos == -1:
+                remaining = text[i:]
+                if remaining:
+                    segments.append((remaining, False))
+                break
+            if start_pos == -1 and end_pos != -1:
+                dna_region = text[i:end_pos + 6]
+                if dna_region:
+                    segments.append((dna_region, True))
+                i = end_pos + 6
+                continue
+            if start_pos != -1 and end_pos == -1:
+                if i < start_pos:
+                    normal_text = text[i:start_pos]
+                    if normal_text:
+                        segments.append((normal_text, False))
+                dna_region = text[start_pos:]
+                if dna_region:
+                    segments.append((dna_region, True))
+                break
+            if start_pos < end_pos:
+                if i < start_pos:
+                    normal_text = text[i:start_pos]
+                    if normal_text:
+                        segments.append((normal_text, False))
+                dna_region = text[start_pos:end_pos + 6]
+                if dna_region:
+                    segments.append((dna_region, True))
+                i = end_pos + 6
+            else:
+                dna_region = text[i:end_pos + 6]
+                if dna_region:
+                    segments.append((dna_region, True))
+                i = end_pos + 6
+        return segments
+    def _parse_dna_region(self, dna_region: str) -> Tuple[str, bool, bool]:
+        if dna_region == '<dna>':
+            return '', True, False
+        elif dna_region == '</dna>':
+            return '', False, True
+        has_start = dna_region.startswith('<dna>')
+        has_end = dna_region.endswith('</dna>')
+        content = dna_region
+        if has_start:
+            content = content[5:]
+        if has_end and content.endswith('</dna>'):
+            content = content[:-6]
+        return content.strip(), has_start, has_end
+    def _process_dna_sequence(self, dna_seq: str) -> Dict:
+        k = self.k
+        dna_seq = dna_seq.upper()
+        kmer_tokens = []
+        valid_bases = set('ATCG')
+        def is_valid_kmer(kmer):
+            return len(kmer) == k and all(base in valid_bases for base in kmer)
+        for i in range(0, len(dna_seq) - k + 1, k):
+            kmer = dna_seq[i:i+k]
+            if is_valid_kmer(kmer):
+                kmer_tokens.append(kmer)
+            else:
+                kmer_tokens.append("<oov>")
+        processed_length = len(kmer_tokens) * k
+        remaining = dna_seq[processed_length:]
+        padding_length = 0
+        valid_length = k
+        if remaining:
+            padding_needed = k - len(remaining)
+            # Right-pad with A: real bases occupy positions [0, valid_length).
+            # The hybrid BP loss supervises positions 0..valid_len-1 via
+            #   pos_mask = (valid_len > pos)
+            # so padding must be at the END, not the start.
+            padded = remaining + 'A' * padding_needed
+            if is_valid_kmer(padded):
+                kmer_tokens.append(padded)
+            else:
+                kmer_tokens.append("<oov>")
+            padding_length = padding_needed
+            valid_length = len(remaining)
+        return {
+            "kmer_tokens": kmer_tokens,
+            "padding_length": padding_length,
+            "valid_length": valid_length,
+        }
+    def _tokenize(self, text: str, **kwargs) -> List[str]:
+        return list(text)
+    def _convert_token_to_id(self, token: str) -> int:
+        if token in self.dna_token_to_id:
+            return self.dna_token_to_id[token]
+        return self._base_vocab.get(token, self._base_tokenizer.unk_token_id or 0)
+    def _convert_id_to_token(self, index: int) -> str:
+        if index in self.dna_id_to_token:
+            return self.dna_id_to_token[index]
+        return self._id_to_token.get(index, "<oov>")
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return "".join(tokens)
+    def encode(
+        self,
+        text: str,
+        add_special_tokens: bool = False,
+        return_token_mask: bool = False,
+        auto_dna_tags: Optional[bool] = None,
+        **kwargs
+    ) -> Union[List[int], Tuple[List[int], List[int]]]:
+        use_auto = self.auto_dna_tags if auto_dna_tags is None else auto_dna_tags
+        if use_auto and '<dna>' not in text:
+            text = f'<dna>{text}</dna>'
+        segments = self._split_by_dna_tags(text)
+        token_ids = []
+        token_mask = [] if return_token_mask else None
+        for segment_content, is_dna in segments:
+            if is_dna:
+                dna_content, has_start, has_end = self._parse_dna_region(segment_content)
+                if has_start:
+                    token_ids.append(self.dna_begin_token_id)
+                    if return_token_mask:
+                        token_mask.append(0)
+                if dna_content:
+                    result = self._process_dna_sequence(dna_content)
+                    for idx, kmer in enumerate(result["kmer_tokens"]):
+                        token_id = self.dna_token_to_id.get(kmer, self.oov_token_id)
+                        token_ids.append(token_id)
+                        if return_token_mask:
+                            if kmer == "<oov>":
+                                token_mask.append(0)
+                            elif idx == len(result["kmer_tokens"]) - 1 and result["padding_length"] > 0:
+                                token_mask.append(result["valid_length"])
+                            else:
+                                token_mask.append(self.k)
+                if has_end:
+                    token_ids.append(self.dna_end_token_id)
+                    if return_token_mask:
+                        token_mask.append(0)
+            else:
+                base_ids = self._base_tokenizer.encode(
+                    segment_content,
+                    add_special_tokens=add_special_tokens
+                )
+                token_ids.extend(base_ids)
+                if return_token_mask:
+                    token_mask.extend([-1] * len(base_ids))
+        # Do NOT append EOS when add_special_tokens=True. Qwen3 doesn't add
+        # BOS/EOS either, and appending EOS here breaks lighteval's
+        # tok_encode_pair: it relies on
+        #   len(encode(ctx)) + len(encode(answer)) == len(encode(ctx + answer))
+        # which the extra EOS violates by shifting the split by 1.
+        if return_token_mask:
+            return token_ids, token_mask
+        return token_ids
+    def decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        **kwargs
+    ) -> str:
+        if hasattr(token_ids, 'tolist'):
+            token_ids = token_ids.tolist()
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if skip_special_tokens:
+            special_ids = {self.eos_token_id, self.pad_token_id}
+            token_ids = [tid for tid in token_ids if tid not in special_ids]
+        parts = []
+        i = 0
+        while i < len(token_ids):
+            tid = token_ids[i]
+            if tid == self.dna_begin_token_id:
+                dna_tokens = []
+                i += 1
+                while i < len(token_ids) and token_ids[i] != self.dna_end_token_id:
+                    if token_ids[i] in self.dna_id_to_token:
+                        dna_tokens.append(self.dna_id_to_token[token_ids[i]])
+                    i += 1
+                dna_seq = ''.join(dna_tokens)
+                if skip_special_tokens:
+                    parts.append(dna_seq)
+                else:
+                    parts.append(f"<dna>{dna_seq}")
+                    if i < len(token_ids) and token_ids[i] == self.dna_end_token_id:
+                        parts.append("</dna>")
+                        i += 1
+            elif tid in self.dna_id_to_token:
+                # This branch handles k-mer tokens that appear without a <dna>
+                # wrapper — the common generation case where <dna> was in the
+                # prompt but only the generated portion is being decoded.
+                # K-mer tokens are content, not special tokens, so always decode
+                # them.  Only drop true DNA special tokens (<dna>, </dna>, <oov>)
+                # when skip_special_tokens=True.
+                is_dna_special = tid in (self.dna_begin_token_id, self.dna_end_token_id, self.oov_token_id)
+                if not (skip_special_tokens and is_dna_special):
+                    parts.append(self.dna_id_to_token[tid])
+                i += 1
+            else:
+                text_ids = []
+                while i < len(token_ids):
+                    curr_id = token_ids[i]
+                    if curr_id in self.dna_id_to_token or curr_id == self.dna_begin_token_id:
+                        break
+                    text_ids.append(curr_id)
+                    i += 1
+                if text_ids:
+                    decoded = self._base_tokenizer.decode(text_ids, skip_special_tokens=skip_special_tokens)
+                    parts.append(decoded)
+        return ''.join(parts)
+    def batch_decode(
+        self,
+        sequences: Union[List[int], List[List[int]], "torch.Tensor"],
+        skip_special_tokens: bool = False,
+        **kwargs
+    ) -> List[str]:
+        return [
+            self.decode(
+                seq.tolist() if hasattr(seq, 'tolist') else list(seq),
+                skip_special_tokens=skip_special_tokens,
+                **kwargs
+            )
+            for seq in sequences
+        ]
+    def __call__(
+        self,
+        text: Union[str, List[str]],
+        add_special_tokens: bool = False,
+        padding: bool = False,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[str] = None,
+        return_token_mask: bool = False,
+        auto_dna_tags: Optional[bool] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        if add_special_tokens:
+            warnings.warn(
+                "HybridTokenizer does not support add_special_tokens=True, ignoring.",
+                UserWarning
+            )
+            add_special_tokens = False
+        is_batch = isinstance(text, list)
+        texts = text if is_batch else [text]
+        all_ids = []
+        all_masks = [] if return_token_mask else None
+        for t in texts:
+            if return_token_mask:
+                ids, mask = self.encode(t, add_special_tokens=add_special_tokens, return_token_mask=True, auto_dna_tags=auto_dna_tags)
+                all_ids.append(ids)
+                all_masks.append(mask)
+            else:
+                ids = self.encode(t, add_special_tokens=add_special_tokens, return_token_mask=False, auto_dna_tags=auto_dna_tags)
+                all_ids.append(ids)
+        if padding:
+            max_len = max(len(ids) for ids in all_ids)
+            if max_length:
+                max_len = min(max_len, max_length)
+            padded_ids = []
+            attention_masks = []
+            padded_token_masks = [] if return_token_mask else None
+            for idx, ids in enumerate(all_ids):
+                pad_len = max_len - len(ids)
+                if pad_len > 0:
+                    ids = ids + [self.pad_token_id] * pad_len
+                    attn = [1] * (max_len - pad_len) + [0] * pad_len
+                    if return_token_mask:
+                        mask = all_masks[idx] + [-2] * pad_len
+                else:
+                    ids = ids[:max_len]
+                    attn = [1] * max_len
+                    if return_token_mask:
+                        mask = all_masks[idx][:max_len]
+                padded_ids.append(ids)
+                attention_masks.append(attn)
+                if return_token_mask:
+                    padded_token_masks.append(mask)
+            all_ids = padded_ids
+            all_masks = padded_token_masks
+        else:
+            attention_masks = [[1] * len(ids) for ids in all_ids]
+        result = {
+            "input_ids": all_ids if is_batch else all_ids[0],
+            "attention_mask": attention_masks if is_batch else attention_masks[0],
+        }
+        if return_token_mask:
+            result["token_mask"] = all_masks if is_batch else all_masks[0]
+        if return_tensors == "pt":
+            import torch
+            if is_batch:
+                result["input_ids"] = torch.tensor(result["input_ids"])
+                result["attention_mask"] = torch.tensor(result["attention_mask"])
+                if return_token_mask:
+                    result["token_mask"] = torch.tensor(result["token_mask"])
+            else:
+                result["input_ids"] = torch.tensor([result["input_ids"]])
+                result["attention_mask"] = torch.tensor([result["attention_mask"]])
+                if return_token_mask:
+                    result["token_mask"] = torch.tensor([result["token_mask"]])
+        return BatchEncoding(result, tensor_type=return_tensors)
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
+        )
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            json.dump(self._vocab, f, ensure_ascii=False, indent=2)
+        return (vocab_file,)
+    def save_pretrained(self, save_directory: str, **kwargs):
+        os.makedirs(save_directory, exist_ok=True)
+        # Save base tokenizer files
+        self._base_tokenizer.save_pretrained(save_directory)
+        # Save DNA config
+        dna_config = {
+            "k": self.k,
+            "dna_start_id": self.dna_start_id,
+            "dna_vocab_size": self.dna_vocab_size,
+            "dna_special_tokens": self.dna_special_tokens,
+            "auto_dna_tags": self.auto_dna_tags,
+        }
+        dna_config_path = os.path.join(save_directory, "dna_config.json")
+        with open(dna_config_path, "w", encoding="utf-8") as f:
+            json.dump(dna_config, f, indent=2)
+        # Update tokenizer_config.json with auto_map
+        config_path = os.path.join(save_directory, "tokenizer_config.json")
+        if os.path.exists(config_path):
+            with open(config_path, "r") as f:
+                config = json.load(f)
+        else:
+            config = {}
+        config.update({
+            "tokenizer_class": "HybridDNATokenizer",
+            "auto_map": {
+                "AutoTokenizer": ["tokenizer.HybridDNATokenizer", None]
+            },
+            "k": self.k,
+            "auto_dna_tags": self.auto_dna_tags,
+        })
+        with open(config_path, "w", encoding="utf-8") as f:
+            json.dump(config, f, indent=2, ensure_ascii=False)
+        # Copy this tokenizer.py to save directory
+        import shutil
+        src_py = os.path.abspath(__file__)
+        dst_py = os.path.join(save_directory, "tokenizer.py")
+        if os.path.exists(src_py) and src_py != dst_py:
+            shutil.copy2(src_py, dst_py)
+        return (save_directory,)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
+        k = 6
+        auto_dna_tags = False
+        dna_config_path = os.path.join(pretrained_model_name_or_path, "dna_config.json")
+        tok_config_path = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
+        if os.path.exists(dna_config_path):
+            with open(dna_config_path, "r") as f:
+                dna_config = json.load(f)
+            k = dna_config.get("k", 6)
+            auto_dna_tags = dna_config.get("auto_dna_tags", False)
+        elif os.path.exists(tok_config_path):
+            with open(tok_config_path, "r") as f:
+                tok_config = json.load(f)
+            k = tok_config.get("k", 6)
+            auto_dna_tags = tok_config.get("auto_dna_tags", False)
+        return cls(base_tokenizer_path=pretrained_model_name_or_path, k=k, auto_dna_tags=auto_dna_tags, **kwargs)

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "HybridDNATokenizer",
+  "unk_token": null,
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenizer.HybridDNATokenizer",
+      null
+    ]
+  },
+  "k": 6,
+  "auto_dna_tags": false
+}