aduncan94 committed
Commit 9ad8a66 · verified · 1 Parent(s): 0b0fa03

Upload folder using huggingface_hub

Files changed (4)
  1. config.json +1 -2
  2. special_tokens_map.json +35 -5
  3. tokenizer_config.json +54 -20
  4. tokenizers.py +126 -0
config.json CHANGED
@@ -37,6 +37,5 @@
  "transformers_version": "4.48.2",
  "use_cache": false,
  "use_mamba_kernels": true,
- "vocab_size": 16,
- "tokenizer_class": "PreTrainedTokenizer"
+ "vocab_size": 16
  }
special_tokens_map.json CHANGED
@@ -1,7 +1,37 @@
  {
- "pad_token": "!",
- "bos_token": "@",
- "eos_token": "*",
- "unk_token": "-",
- "sep_token": "/"
+ "bos_token": {
+   "content": "@",
+   "lstrip": false,
+   "normalized": true,
+   "rstrip": false,
+   "single_word": false
+ },
+ "eos_token": {
+   "content": "*",
+   "lstrip": false,
+   "normalized": true,
+   "rstrip": false,
+   "single_word": false
+ },
+ "mask_token": {
+   "content": "#",
+   "lstrip": false,
+   "normalized": true,
+   "rstrip": false,
+   "single_word": false
+ },
+ "pad_token": {
+   "content": "!",
+   "lstrip": false,
+   "normalized": true,
+   "rstrip": false,
+   "single_word": false
+ },
+ "sep_token": {
+   "content": "/",
+   "lstrip": false,
+   "normalized": true,
+   "rstrip": false,
+   "single_word": false
+ }
  }
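
For reference, each expanded entry above serializes the fields of a transformers AddedToken; a minimal sketch of the correspondence, using the bos_token entry as an example:

from transformers import AddedToken

# The "bos_token" object in special_tokens_map.json carries the same fields as:
bos = AddedToken("@", lstrip=False, rstrip=False, normalized=True, single_word=False)
print(bos.content)  # "@"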
tokenizer_config.json CHANGED
@@ -1,24 +1,58 @@
  {
- "model_type": "enhancar",
- "alphabet": [
-   "G",
-   "A",
-   "T",
-   "C",
-   "N",
-   "-",
-   "!",
-   "*",
-   "/",
-   "@",
-   "[",
-   "]",
-   "{",
-   "}"
- ],
- "pad_token": "!",
+ "added_tokens_decoder": {
+   "27": {
+     "content": "*",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "28": {
+     "content": "#",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "29": {
+     "content": "@",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "30": {
+     "content": "!",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   },
+   "31": {
+     "content": "/",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false,
+     "special": true
+   }
+ },
+ "auto_map": {
+   "AutoTokenizer": [
+     "tokenizers.DNATokenizer",
+     null
+   ]
+ },
  "bos_token": "@",
+ "clean_up_tokenization_spaces": true,
  "eos_token": "*",
- "unk_token": "-",
- "sep_token": "/"
+ "mask_token": "#",
+ "model_max_length": 2048,
+ "pad_token": "!",
+ "sep_token": "/",
+ "tokenizer_class": "DNATokenizer"
  }
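
The new auto_map entry points AutoTokenizer at the DNATokenizer class defined in tokenizers.py (added below), so loading the repository requires trust_remote_code. A minimal loading sketch; the repository id below is a placeholder, not the actual repo name:

from transformers import AutoTokenizer

# "user/repo-id" is a placeholder for this repository's Hub id (or a local path).
# trust_remote_code=True is needed because auto_map resolves the tokenizer class
# from the repo's own tokenizers.py rather than from the transformers library.
tok = AutoTokenizer.from_pretrained("user/repo-id", trust_remote_code=True)
print(tok.bos_token, tok.eos_token, tok.pad_token)  # expected: @ * !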
tokenizers.py ADDED
@@ -0,0 +1,126 @@
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+ from typing import List, Optional, Union
+ import os
+
+ MASK = "#"
+ MSA_PAD = "!"
+ UL_ALPHABET_PLUS = "GATCN-*#@!/[]{}"
+ MSA_AAS = "GATCN-"
+ GAP = "-"
+ START = "@"
+ STOP = "*"
+ SEP = "/"
+ END_AL = "]"
+ END_UL = "}"
+ START_AL = "["
+ START_UL = "{"
+
+ class DNATokenizer(PreTrainedTokenizer):
+
+     def __init__(
+         self,
+         dna_alphabet: str = UL_ALPHABET_PLUS,
+         model_max_length: int = 2048,
+         pad_token=MSA_PAD,
+         mask_token=MASK,
+         all_aas=MSA_AAS,
+         gap_token=GAP,
+         bos_token=START,
+         eos_token=STOP,
+         sep_token=SEP,
+         **kwargs
+     ):
+         """Character tokenizer for Hugging Face transformers.
+
+         model_max_length (int): Model maximum sequence length.
+         """
+         self.alphabet = list("".join(dna_alphabet))
+         self.all_aas = list("".join(all_aas))
+         self.a_to_i = {u: i for i, u in enumerate(self.alphabet)}
+         self.i_to_a = {i: u for i, u in enumerate(self.alphabet)}
+         self.gap_token = gap_token
+
+
+         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+         sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+         mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token
+         pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+         gap_token = AddedToken(gap_token, lstrip=False, rstrip=False) if isinstance(gap_token, str) else gap_token
+
+         super().__init__(
+             pad_token=pad_token,
+             mask_token=mask_token,
+             eos_token=eos_token,
+             bos_token=bos_token,
+             sep_token=sep_token,
+             model_max_length=model_max_length,
+             **kwargs
+         )
+
+     @property
+     def vocab_size(self):
+         return len(self.alphabet)
+
+     @property
+     def gap_token_id(self):
+         return self.convert_tokens_to_ids(self.gap_token)
+
+     def get_vocab(self):
+         return self.a_to_i
+
+     def _tokenize(self, text: str) -> List[str]:
+         return list(text)
+
+     def _convert_token_to_id(self, token) -> int:
+         return self.a_to_i[token]
+
+     def _convert_id_to_token(self, index) -> str:
+         return self.i_to_a[index]
+
+     def convert_tokens_to_string(self, tokens):
+         return "".join(tokens)
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         result = token_ids_0
+         if token_ids_1 is not None:
+             raise NotImplementedError("This tokenizer does not support two sequences")
+         return result
+
+     def get_special_tokens_mask(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+         already_has_special_tokens: bool = False,
+     ) -> List[int]:
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0,
+                 token_ids_1=token_ids_1,
+                 already_has_special_tokens=True,
+             )
+
+         result = [0] * len(token_ids_0)
+         if token_ids_1 is not None:
+             raise NotImplementedError("This tokenizer does not support two sequences")
+
+         return result
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Identifies the type of token. 0 for the first sentence, 1 for the second sentence if it exists
+         """
+
+         result = len(token_ids_0) * [0]
+
+         if token_ids_1 is not None:
+             raise NotImplementedError("This tokenizer does not support two sequences")
+         return result
+
+     def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
+         super().save_pretrained(save_directory, **kwargs)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
+         return ()
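
A short usage sketch of the tokenizer above, assuming DNATokenizer is in scope (for example via the AutoTokenizer call shown earlier, or by importing the file under a name that does not clash with the Hugging Face tokenizers package):

# DNATokenizer splits a sequence into single characters and maps each one to its
# index in UL_ALPHABET_PLUS ("GATCN-*#@!/[]{}"); no BOS/EOS tokens are appended.
tok = DNATokenizer()

enc = tok("GATTACA")
print(enc["input_ids"])              # [0, 1, 2, 2, 1, 3, 1] with the default alphabet order
print(tok.decode(enc["input_ids"]))  # "GATTACA"
print(tok.vocab_size)                # 15: one id per character in the default alphabet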