aduncan94 commited on
Commit
4bf9a40
·
verified ·
1 Parent(s): 0adc8d0

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,3 +1,7 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
1
+ ---
2
+ {}
3
+ ---
4
+
5
+ # EnhancAR Sorted
6
+ EnhancAR Sorted is an autoregressive generative model of enhancer homology families, trained on 233,158,475 enhancers extracted from 241 vertebrate genomes. By "unrolling" homology families (enhancer sequences are sorted into sets of homologous sequences, and the input data consists of these sequences concatenated together, with a separator token delimiting different sequences), EnhancAR Sorted learns to generate new sequences that conserve the function of prompt sequences. We demonstrate that this can be used to design new enhancers "by example", which is particularly useful when the function of enhancers is not known a priori.
7
+
config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "JambaForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "attn_layer_offset": 4,
7
+ "attn_layer_period": 8,
8
+ "bos_token_id": 9,
9
+ "eos_token_id": 7,
10
+ "expert_layer_offset": 1,
11
+ "expert_layer_period": 2,
12
+ "hidden_act": "silu",
13
+ "hidden_size": 256,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 1024,
16
+ "mamba_conv_bias": true,
17
+ "mamba_d_conv": 4,
18
+ "mamba_d_state": 16,
19
+ "mamba_dt_rank": 16,
20
+ "mamba_expand": 2,
21
+ "mamba_proj_bias": false,
22
+ "max_position_embeddings": 262144,
23
+ "model_type": "jamba",
24
+ "num_attention_heads": 16,
25
+ "num_experts": 16,
26
+ "num_experts_per_tok": 2,
27
+ "num_hidden_layers": 24,
28
+ "num_key_value_heads": 8,
29
+ "num_logits_to_keep": 1,
30
+ "output_router_logits": true,
31
+ "pad_token_id": 6,
32
+ "rms_norm_eps": 1e-06,
33
+ "router_aux_loss_coef": 0.001,
34
+ "sliding_window": null,
35
+ "tie_word_embeddings": false,
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.48.2",
38
+ "use_cache": false,
39
+ "use_mamba_kernels": true,
40
+ "vocab_size": 16
41
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 9,
4
+ "eos_token_id": 7,
5
+ "pad_token_id": 6,
6
+ "transformers_version": "4.48.2",
7
+ "use_cache": false
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c2fa527a693668611bfeede25de64f014e12bc8d49f96a04b0bc982d94bf5ca
3
+ size 681256576
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "@",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "*",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "mask_token": {
17
+ "content": "#",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "pad_token": {
24
+ "content": "!",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "sep_token": {
31
+ "content": "/",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "27": {
4
+ "content": "*",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "28": {
12
+ "content": "#",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "29": {
20
+ "content": "@",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "30": {
28
+ "content": "!",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "31": {
36
+ "content": "/",
37
+ "lstrip": false,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "auto_map": {
45
+ "AutoTokenizer": [
46
+ "tokenizers.DNATokenizer",
47
+ null
48
+ ]
49
+ },
50
+ "bos_token": "@",
51
+ "clean_up_tokenization_spaces": true,
52
+ "eos_token": "*",
53
+ "mask_token": "#",
54
+ "model_max_length": 2048,
55
+ "pad_token": "!",
56
+ "sep_token": "/",
57
+ "tokenizer_class": "DNATokenizer"
58
+ }
tokenizers.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
2
+ from typing import List, Optional, Union
3
+ import os
4
+
5
+ MASK = "#"
6
+ MSA_PAD = "!"
7
+ UL_ALPHABET_PLUS = "GATCN-*#@!/[]{}"
8
+ MSA_AAS = "GATCN-"
9
+ GAP = "-"
10
+ START = "@"
11
+ STOP = "*"
12
+ SEP = "/"
13
+ END_AL = "]"
14
+ END_UL = "}"
15
+ START_AL = "["
16
+ START_UL = "{"
17
+
18
+ class DNATokenizer(PreTrainedTokenizer):
19
+
20
+ def __init__(
21
+ self,
22
+ dna_alphabet: str = UL_ALPHABET_PLUS,
23
+ model_max_length: int = 2048,
24
+ pad_token=MSA_PAD,
25
+ mask_token=MASK,
26
+ all_aas=MSA_AAS,
27
+ gap_token=GAP,
28
+ bos_token=START,
29
+ eos_token=STOP,
30
+ sep_token=SEP,
31
+ **kwargs
32
+ ):
33
+ """Character tokenizer for Hugging Face transformers.
34
+
35
+ model_max_length (int): Model maximum sequence length.
36
+ """
37
+ self.alphabet = list("".join(dna_alphabet))
38
+ self.all_aas = list("".join(all_aas))
39
+ self.a_to_i = {u: i for i, u in enumerate(self.alphabet)}
40
+ self.i_to_a = {i: u for i, u in enumerate(self.alphabet)}
41
+ self.gap_token = gap_token
42
+
43
+
44
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
45
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
46
+ sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
47
+ mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token
48
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
49
+ gap_token = AddedToken(gap_token, lstrip=False, rstrip=False) if isinstance(gap_token, str) else gap_token
50
+
51
+ super().__init__(
52
+ pad_token=pad_token,
53
+ mask_token=mask_token,
54
+ eos_token=eos_token,
55
+ bos_token=bos_token,
56
+ sep_token=sep_token,
57
+ model_max_length=model_max_length,
58
+ **kwargs
59
+ )
60
+
61
+ @property
62
+ def vocab_size(self):
63
+ return len(self.alphabet)
64
+
65
+ @property
66
+ def gap_token_id(self):
67
+ return self.convert_tokens_to_ids(self.gap_token)
68
+
69
+ def get_vocab(self):
70
+ return self.a_to_i
71
+
72
+ def _tokenize(self, text: str) -> List[str]:
73
+ return list(text)
74
+
75
+ def _convert_token_to_id(self, token) -> int:
76
+ return self.a_to_i[token]
77
+
78
+ def _convert_id_to_token(self, index) -> str:
79
+ return self.i_to_a[index]
80
+
81
+ def convert_tokens_to_string(self, tokens):
82
+ return "".join(tokens)
83
+
84
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
85
+ result = token_ids_0
86
+ if token_ids_1 is not None:
87
+ raise NotImplementedError("This tokenizer does not support two sequences")
88
+ return result
89
+
90
+ def get_special_tokens_mask(
91
+ self,
92
+ token_ids_0: List[int],
93
+ token_ids_1: Optional[List[int]] = None,
94
+ already_has_special_tokens: bool = False,
95
+ ) -> List[int]:
96
+ if already_has_special_tokens:
97
+ return super().get_special_tokens_mask(
98
+ token_ids_0=token_ids_0,
99
+ token_ids_1=token_ids_1,
100
+ already_has_special_tokens=True,
101
+ )
102
+
103
+ result = [0] * len(token_ids_0)
104
+ if token_ids_1 is not None:
105
+ raise NotImplementedError("This tokenizer does not support two sequences")
106
+
107
+ return result
108
+
109
+ def create_token_type_ids_from_sequences(
110
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
111
+ ) -> List[int]:
112
+ """
113
+ Identifies the type of token. 0 for the first sentence, 1 for the second sentence if it exists
114
+ """
115
+
116
+ result = len(token_ids_0) * [0]
117
+
118
+ if token_ids_1 is not None:
119
+ raise NotImplementedError("This tokenizer does not support two sequences")
120
+ return result
121
+
122
+ def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
123
+ super().save_pretrained(save_directory, **kwargs)
124
+
125
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
126
+ return ()