Upload folder using huggingface_hub

Files changed:
- README.md (+7, -3)
- config.json (+41, -0)
- generation_config.json (+8, -0)
- model.safetensors (+3, -0)
- special_tokens_map.json (+37, -0)
- tokenizer_config.json (+58, -0)
- tokenizers.py (+126, -0)
README.md CHANGED

```diff
@@ -1,3 +1,7 @@
----
-
----
+---
+{}
+---
+
+# EnhancAR Sorted
+EnhancAR Sorted is an autoregressive generative model of enhancer homology families, trained on 233,158,475 enhancers extracted from 241 vertebrate genomes. By "unrolling" homology families (enhancers are grouped into sets of homologous sequences, which are concatenated into a single input with a separator token delimiting the individual sequences), EnhancAR Sorted learns to generate new sequences that conserve the function of the prompt sequences. We demonstrate that this can be used to design new enhancers "by example", which is particularly useful when the function of an enhancer is not known a priori.
+
```
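A minimal usage sketch for loading the checkpoint and prompting it with an example enhancer. This is not from the model card: the repo id and the prompt sequence are placeholders, and the prompt format assumes the `@` (BOS), `/` (separator), and `*` (EOS) special tokens defined in the tokenizer files below.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "example-org/enhancar-sorted"  # placeholder repo id

# trust_remote_code=True lets AutoTokenizer resolve the custom DNATokenizer
# declared in tokenizer_config.json's auto_map (defined in tokenizers.py).
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo)

# Prompt with one (placeholder) example enhancer followed by the "/" separator,
# so the model continues the homology family with a functionally related sequence.
prompt = "@" + "GATTACAGGAT" + "/"
inputs = tokenizer(prompt, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=256, do_sample=True)
print(tokenizer.decode(out[0]))
```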
config.json ADDED

```json
{
  "architectures": [
    "JambaForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attn_layer_offset": 4,
  "attn_layer_period": 8,
  "bos_token_id": 9,
  "eos_token_id": 7,
  "expert_layer_offset": 1,
  "expert_layer_period": 2,
  "hidden_act": "silu",
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "mamba_conv_bias": true,
  "mamba_d_conv": 4,
  "mamba_d_state": 16,
  "mamba_dt_rank": 16,
  "mamba_expand": 2,
  "mamba_proj_bias": false,
  "max_position_embeddings": 262144,
  "model_type": "jamba",
  "num_attention_heads": 16,
  "num_experts": 16,
  "num_experts_per_tok": 2,
  "num_hidden_layers": 24,
  "num_key_value_heads": 8,
  "num_logits_to_keep": 1,
  "output_router_logits": true,
  "pad_token_id": 6,
  "rms_norm_eps": 1e-06,
  "router_aux_loss_coef": 0.001,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.48.2",
  "use_cache": false,
  "use_mamba_kernels": true,
  "vocab_size": 16
}
```
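As a reading aid for the Jamba fields above, here is a hedged sketch of how the period/offset values lay out the 24-layer stack, following the rule used by the transformers Jamba implementation (layer `i` uses attention iff `i % attn_layer_period == attn_layer_offset`, and a MoE MLP iff `i % expert_layer_period == expert_layer_offset`):

```python
# Hedged sketch (not part of the repo): derive the layer schedule from config.json.
num_hidden_layers = 24
attn_layer_period, attn_layer_offset = 8, 4
expert_layer_period, expert_layer_offset = 2, 1

# Layer i uses attention iff i % attn_layer_period == attn_layer_offset;
# all other layers are Mamba layers.
layer_types = [
    "attention" if i % attn_layer_period == attn_layer_offset else "mamba"
    for i in range(num_hidden_layers)
]

# Layer i uses the 16-expert MoE MLP iff i % expert_layer_period == expert_layer_offset.
moe_layers = [i for i in range(num_hidden_layers) if i % expert_layer_period == expert_layer_offset]

print([i for i, t in enumerate(layer_types) if t == "attention"])  # [4, 12, 20]
print(moe_layers)  # every odd-indexed layer: [1, 3, 5, ..., 23]
```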
generation_config.json ADDED

```json
{
  "_from_model_config": true,
  "bos_token_id": 9,
  "eos_token_id": 7,
  "pad_token_id": 6,
  "transformers_version": "4.48.2",
  "use_cache": false
}
```
model.safetensors ADDED

```
version https://git-lfs.github.com/spec/v1
oid sha256:5c2fa527a693668611bfeede25de64f014e12bc8d49f96a04b0bc982d94bf5ca
size 681256576
```
special_tokens_map.json ADDED

```json
{
  "bos_token": {
    "content": "@",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "*",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "#",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "!",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "/",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
```
tokenizer_config.json ADDED

```json
{
  "added_tokens_decoder": {
    "27": {
      "content": "*",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "28": {
      "content": "#",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "29": {
      "content": "@",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "30": {
      "content": "!",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "31": {
      "content": "/",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "auto_map": {
    "AutoTokenizer": [
      "tokenizers.DNATokenizer",
      null
    ]
  },
  "bos_token": "@",
  "clean_up_tokenization_spaces": true,
  "eos_token": "*",
  "mask_token": "#",
  "model_max_length": 2048,
  "pad_token": "!",
  "sep_token": "/",
  "tokenizer_class": "DNATokenizer"
}
```
tokenizers.py ADDED

```python
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from typing import List, Optional, Union
import os

# Character vocabulary: the four bases plus N, the alignment gap, and the
# control characters used to delimit sequences and alignment blocks.
MASK = "#"
MSA_PAD = "!"
UL_ALPHABET_PLUS = "GATCN-*#@!/[]{}"
MSA_AAS = "GATCN-"
GAP = "-"
START = "@"
STOP = "*"
SEP = "/"
END_AL = "]"
END_UL = "}"
START_AL = "["
START_UL = "{"


class DNATokenizer(PreTrainedTokenizer):
    """Character-level DNA tokenizer for Hugging Face transformers."""

    def __init__(
        self,
        dna_alphabet: str = UL_ALPHABET_PLUS,
        model_max_length: int = 2048,
        pad_token=MSA_PAD,
        mask_token=MASK,
        all_aas=MSA_AAS,
        gap_token=GAP,
        bos_token=START,
        eos_token=STOP,
        sep_token=SEP,
        **kwargs,
    ):
        """
        Args:
            dna_alphabet (str): Full character vocabulary, one token per character.
            model_max_length (int): Model maximum sequence length.
        """
        self.alphabet = list(dna_alphabet)
        self.all_aas = list(all_aas)
        self.a_to_i = {u: i for i, u in enumerate(self.alphabet)}
        self.i_to_a = {i: u for i, u in enumerate(self.alphabet)}
        self.gap_token = gap_token

        # Wrap special tokens so the base class does not split or strip them.
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
        mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        gap_token = AddedToken(gap_token, lstrip=False, rstrip=False) if isinstance(gap_token, str) else gap_token

        super().__init__(
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            bos_token=bos_token,
            sep_token=sep_token,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.alphabet)

    @property
    def gap_token_id(self):
        return self.convert_tokens_to_ids(self.gap_token)

    def get_vocab(self):
        return self.a_to_i

    def _tokenize(self, text: str) -> List[str]:
        # One token per character.
        return list(text)

    def _convert_token_to_id(self, token) -> int:
        return self.a_to_i[token]

    def _convert_id_to_token(self, index) -> str:
        return self.i_to_a[index]

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        # No special tokens are added automatically; callers include "@", "/",
        # and "*" in the text themselves.
        if token_ids_1 is not None:
            raise NotImplementedError("This tokenizer does not support two sequences")
        return token_ids_0

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is not None:
            raise NotImplementedError("This tokenizer does not support two sequences")
        return [0] * len(token_ids_0)

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Returns token type ids: 0 for the first sequence; a second sequence is not supported."""
        if token_ids_1 is not None:
            raise NotImplementedError("This tokenizer does not support two sequences")
        return [0] * len(token_ids_0)

    def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
        super().save_pretrained(save_directory, **kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
        # The vocabulary is fixed by `dna_alphabet`, so there is no file to write.
        return ()
```
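A quick round-trip sketch with `DNATokenizer` (not from the repo). Note that the module name `tokenizers.py` can collide with the installed Hugging Face `tokenizers` package depending on `sys.path`; in practice the class is usually loaded through `AutoTokenizer.from_pretrained(..., trust_remote_code=True)` instead of a direct import.

```python
# Hedged sketch: instantiate the character tokenizer directly and round-trip
# a two-member homology family. Each character of "GATCN-*#@!/[]{}" is its
# own token id, and no special tokens are added automatically.
tok = DNATokenizer()

text = "@GATTACA/GATTACA*"
ids = tok(text)["input_ids"]
print(ids)              # one id per character
print(tok.decode(ids))  # round-trips to "@GATTACA/GATTACA*"
```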