aduncan94 committed on
Commit
2e081f7
·
verified ·
1 Parent(s): 78dce87

Upload folder using huggingface_hub

Browse files
special_tokens_map.json CHANGED
@@ -13,13 +13,6 @@
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "mask_token": {
17
- "content": "#",
18
- "lstrip": false,
19
- "normalized": true,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
  "pad_token": {
24
  "content": "!",
25
  "lstrip": false,
 
13
  "rstrip": false,
14
  "single_word": false
15
  },
 
 
 
 
 
 
 
16
  "pad_token": {
17
  "content": "!",
18
  "lstrip": false,
tokenizer_config.json CHANGED
@@ -1,39 +1,31 @@
1
  {
2
  "added_tokens_decoder": {
3
- "27": {
4
- "content": "*",
5
- "lstrip": false,
6
- "normalized": true,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "28": {
12
- "content": "#",
13
  "lstrip": false,
14
  "normalized": true,
15
  "rstrip": false,
16
  "single_word": false,
17
  "special": true
18
  },
19
- "29": {
20
- "content": "@",
21
  "lstrip": false,
22
  "normalized": true,
23
  "rstrip": false,
24
  "single_word": false,
25
  "special": true
26
  },
27
- "30": {
28
- "content": "!",
29
  "lstrip": false,
30
  "normalized": true,
31
  "rstrip": false,
32
  "single_word": false,
33
  "special": true
34
  },
35
- "31": {
36
- "content": "/",
37
  "lstrip": false,
38
  "normalized": true,
39
  "rstrip": false,
@@ -50,7 +42,6 @@
50
  "bos_token": "@",
51
  "clean_up_tokenization_spaces": true,
52
  "eos_token": "*",
53
- "mask_token": "#",
54
  "model_max_length": 2048,
55
  "pad_token": "!",
56
  "sep_token": "/",
 
1
  {
2
  "added_tokens_decoder": {
3
+ "6": {
4
+ "content": "!",
 
 
 
 
 
 
 
 
5
  "lstrip": false,
6
  "normalized": true,
7
  "rstrip": false,
8
  "single_word": false,
9
  "special": true
10
  },
11
+ "7": {
12
+ "content": "*",
13
  "lstrip": false,
14
  "normalized": true,
15
  "rstrip": false,
16
  "single_word": false,
17
  "special": true
18
  },
19
+ "8": {
20
+ "content": "/",
21
  "lstrip": false,
22
  "normalized": true,
23
  "rstrip": false,
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "9": {
28
+ "content": "@",
29
  "lstrip": false,
30
  "normalized": true,
31
  "rstrip": false,
 
42
  "bos_token": "@",
43
  "clean_up_tokenization_spaces": true,
44
  "eos_token": "*",
 
45
  "model_max_length": 2048,
46
  "pad_token": "!",
47
  "sep_token": "/",
tokenizers.py CHANGED
@@ -2,9 +2,9 @@ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
2
  from typing import List, Optional, Union
3
  import os
4
 
5
- MASK = "#"
6
  MSA_PAD = "!"
7
- UL_ALPHABET_PLUS = "GATCN-*#@!/[]{}"
 
8
  MSA_AAS = "GATCN-"
9
  GAP = "-"
10
  START = "@"
@@ -22,7 +22,6 @@ class DNATokenizer(PreTrainedTokenizer):
22
  dna_alphabet: str = UL_ALPHABET_PLUS,
23
  model_max_length: int = 2048,
24
  pad_token=MSA_PAD,
25
- mask_token=MASK,
26
  all_aas=MSA_AAS,
27
  gap_token=GAP,
28
  bos_token=START,
@@ -44,13 +43,11 @@ class DNATokenizer(PreTrainedTokenizer):
44
  bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
45
  eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
46
  sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
47
- mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token
48
  pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
49
  gap_token = AddedToken(gap_token, lstrip=False, rstrip=False) if isinstance(gap_token, str) else gap_token
50
 
51
  super().__init__(
52
  pad_token=pad_token,
53
- mask_token=mask_token,
54
  eos_token=eos_token,
55
  bos_token=bos_token,
56
  sep_token=sep_token,
 
2
  from typing import List, Optional, Union
3
  import os
4
 
 
5
  MSA_PAD = "!"
6
+ #UL_ALPHABET_PLUS = "GATCN-*#@!/[]{}"
7
+ UL_ALPHABET_PLUS = 'GATCN-!*/@[]{}'
8
  MSA_AAS = "GATCN-"
9
  GAP = "-"
10
  START = "@"
 
22
  dna_alphabet: str = UL_ALPHABET_PLUS,
23
  model_max_length: int = 2048,
24
  pad_token=MSA_PAD,
 
25
  all_aas=MSA_AAS,
26
  gap_token=GAP,
27
  bos_token=START,
 
43
  bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
44
  eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
45
  sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
 
46
  pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
47
  gap_token = AddedToken(gap_token, lstrip=False, rstrip=False) if isinstance(gap_token, str) else gap_token
48
 
49
  super().__init__(
50
  pad_token=pad_token,
 
51
  eos_token=eos_token,
52
  bos_token=bos_token,
53
  sep_token=sep_token,