lhallee committed on
Commit
37c2e59
·
verified ·
1 Parent(s): 0055f02

Upload tokenizer

Browse files
Files changed (4) hide show
  1. README.md +0 -2
  2. special_tokens_map.json +28 -4
  3. tokenizer.json +2 -2
  4. tokenizer_config.json +4 -2
README.md CHANGED
@@ -1,6 +1,4 @@
1
  ---
2
- # For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
3
- # Doc / guide: https://huggingface.co/docs/hub/model-cards
4
  {}
5
  ---
6
  Encoder only version of the [ANKH base model](https://huggingface.co/ElnaggarLab/ankh-base) ([paper](https://arxiv.org/abs/2301.06568)). The encoder only version is ideal for protein representation tasks.
 
1
  ---
 
 
2
  {}
3
  ---
4
  Encoder only version of the [ANKH base model](https://huggingface.co/ElnaggarLab/ankh-base) ([paper](https://arxiv.org/abs/2301.06568)). The encoder only version is ideal for protein representation tasks.
special_tokens_map.json CHANGED
@@ -117,8 +117,32 @@
117
  "<extra_id_114>",
118
  "<extra_id_115>"
119
  ],
120
- "eos_token": "</s>",
121
- "mask_token": "<extra_id_0>",
122
- "pad_token": "<pad>",
123
- "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  }
 
117
  "<extra_id_114>",
118
  "<extra_id_115>"
119
  ],
120
+ "eos_token": {
121
+ "content": "</s>",
122
+ "lstrip": false,
123
+ "normalized": false,
124
+ "rstrip": false,
125
+ "single_word": false
126
+ },
127
+ "mask_token": {
128
+ "content": "<extra_id_0>",
129
+ "lstrip": false,
130
+ "normalized": false,
131
+ "rstrip": false,
132
+ "single_word": false
133
+ },
134
+ "pad_token": {
135
+ "content": "<pad>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false
140
+ },
141
+ "unk_token": {
142
+ "content": "<unk>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false
147
+ }
148
  }
tokenizer.json CHANGED
@@ -1134,8 +1134,8 @@
1134
  "decoder": {
1135
  "type": "Metaspace",
1136
  "replacement": "▁",
1137
- "add_prefix_space": true,
1138
- "prepend_scheme": "always"
1139
  },
1140
  "model": {
1141
  "type": "Unigram",
 
1134
  "decoder": {
1135
  "type": "Metaspace",
1136
  "replacement": "▁",
1137
+ "prepend_scheme": "always",
1138
+ "split": true
1139
  },
1140
  "model": {
1141
  "type": "Unigram",
tokenizer_config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "added_tokens_decoder": {
3
  "0": {
4
  "content": "<pad>",
@@ -1071,12 +1072,13 @@
1071
  "<extra_id_114>",
1072
  "<extra_id_115>"
1073
  ],
1074
- "clean_up_tokenization_spaces": true,
1075
  "eos_token": "</s>",
1076
  "extra_ids": 116,
 
1077
  "mask_token": "<extra_id_0>",
1078
  "model_max_length": 1000000000000000019884624838656,
1079
  "pad_token": "<pad>",
1080
- "tokenizer_class": "T5Tokenizer",
1081
  "unk_token": "<unk>"
1082
  }
 
1
  {
2
+ "add_prefix_space": null,
3
  "added_tokens_decoder": {
4
  "0": {
5
  "content": "<pad>",
 
1072
  "<extra_id_114>",
1073
  "<extra_id_115>"
1074
  ],
1075
+ "clean_up_tokenization_spaces": false,
1076
  "eos_token": "</s>",
1077
  "extra_ids": 116,
1078
+ "extra_special_tokens": {},
1079
  "mask_token": "<extra_id_0>",
1080
  "model_max_length": 1000000000000000019884624838656,
1081
  "pad_token": "<pad>",
1082
+ "tokenizer_class": "T5TokenizerFast",
1083
  "unk_token": "<unk>"
1084
  }