lhallee committed on
Commit
37c2e59
·
verified ·
1 Parent(s): 0055f02

Upload tokenizer

Browse files
Files changed (4) hide show
  1. README.md +0 -2
  2. special_tokens_map.json +28 -4
  3. tokenizer.json +2 -2
  4. tokenizer_config.json +4 -2
README.md CHANGED
@@ -1,6 +1,4 @@
1
  ---
2
- # For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
3
- # Doc / guide: https://huggingface.co/docs/hub/model-cards
4
  {}
5
  ---
6
  Encoder only version of the [ANKH base model](https://huggingface.co/ElnaggarLab/ankh-base) ([paper](https://arxiv.org/abs/2301.06568)). The encoder only version is ideal for protein representation tasks.
 
1
  ---
 
 
2
  {}
3
  ---
4
  Encoder only version of the [ANKH base model](https://huggingface.co/ElnaggarLab/ankh-base) ([paper](https://arxiv.org/abs/2301.06568)). The encoder only version is ideal for protein representation tasks.
special_tokens_map.json CHANGED
@@ -117,8 +117,32 @@
117
  "<extra_id_114>",
118
  "<extra_id_115>"
119
  ],
120
- "eos_token": "</s>",
121
- "mask_token": "<extra_id_0>",
122
- "pad_token": "<pad>",
123
- "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  }
 
117
  "<extra_id_114>",
118
  "<extra_id_115>"
119
  ],
120
+ "eos_token": {
121
+ "content": "</s>",
122
+ "lstrip": false,
123
+ "normalized": false,
124
+ "rstrip": false,
125
+ "single_word": false
126
+ },
127
+ "mask_token": {
128
+ "content": "<extra_id_0>",
129
+ "lstrip": false,
130
+ "normalized": false,
131
+ "rstrip": false,
132
+ "single_word": false
133
+ },
134
+ "pad_token": {
135
+ "content": "<pad>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false
140
+ },
141
+ "unk_token": {
142
+ "content": "<unk>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false
147
+ }
148
  }
tokenizer.json CHANGED
@@ -1134,8 +1134,8 @@
1134
  "decoder": {
1135
  "type": "Metaspace",
1136
  "replacement": "▁",
1137
- "add_prefix_space": true,
1138
- "prepend_scheme": "always"
1139
  },
1140
  "model": {
1141
  "type": "Unigram",
 
1134
  "decoder": {
1135
  "type": "Metaspace",
1136
  "replacement": "▁",
1137
+ "prepend_scheme": "always",
1138
+ "split": true
1139
  },
1140
  "model": {
1141
  "type": "Unigram",
tokenizer_config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "added_tokens_decoder": {
3
  "0": {
4
  "content": "<pad>",
@@ -1071,12 +1072,13 @@
1071
  "<extra_id_114>",
1072
  "<extra_id_115>"
1073
  ],
1074
- "clean_up_tokenization_spaces": true,
1075
  "eos_token": "</s>",
1076
  "extra_ids": 116,
 
1077
  "mask_token": "<extra_id_0>",
1078
  "model_max_length": 1000000000000000019884624838656,
1079
  "pad_token": "<pad>",
1080
- "tokenizer_class": "T5Tokenizer",
1081
  "unk_token": "<unk>"
1082
  }
 
1
  {
2
+ "add_prefix_space": null,
3
  "added_tokens_decoder": {
4
  "0": {
5
  "content": "<pad>",
 
1072
  "<extra_id_114>",
1073
  "<extra_id_115>"
1074
  ],
1075
+ "clean_up_tokenization_spaces": false,
1076
  "eos_token": "</s>",
1077
  "extra_ids": 116,
1078
+ "extra_special_tokens": {},
1079
  "mask_token": "<extra_id_0>",
1080
  "model_max_length": 1000000000000000019884624838656,
1081
  "pad_token": "<pad>",
1082
+ "tokenizer_class": "T5TokenizerFast",
1083
  "unk_token": "<unk>"
1084
  }