Create tokenizer.py
Browse files- tokenizer.py +18 -0
tokenizer.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
class Tokenizer:
    """Character-level tokenizer backed by a token -> id vocabulary.

    Encoding maps each character of the input to its integer id; decoding
    maps ids back to characters. The mapping is not guaranteed to be a
    perfect round-trip: unknown characters encode to ``unk_id`` and
    unknown ids decode to the empty string.
    """

    def __init__(self, vocab, unk_id=0):
        """Build a tokenizer from a token -> id mapping.

        Args:
            vocab: dict mapping single-character tokens to integer ids.
            unk_id: id emitted for characters absent from ``vocab``.
                Defaults to 0 for backward compatibility with the previous
                hard-coded behavior — note this collides with any real
                token whose id is 0; pass a reserved id to avoid that.
        """
        self.token_to_id = vocab
        self.unk_id = unk_id
        # Inverse mapping for decoding. If two tokens share an id, the
        # later entry in `vocab` wins — inherent to dict inversion.
        self.id_to_token = {v: k for k, v in vocab.items()}

    @classmethod
    def load(cls, path, unk_id=0):
        """Load a tokenizer from a UTF-8 JSON file.

        The file must contain a top-level ``"token_to_id"`` object.

        Args:
            path: filesystem path to the JSON vocabulary file.
            unk_id: forwarded to ``__init__`` (see there).

        Raises:
            KeyError: if the JSON lacks a ``"token_to_id"`` key.
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return cls(data["token_to_id"], unk_id=unk_id)

    def encode(self, text):
        """Return the list of ids for each character of ``text``.

        Characters not in the vocabulary map to ``self.unk_id``.
        """
        return [self.token_to_id.get(c, self.unk_id) for c in text]

    def decode(self, tokens):
        """Return the string for a sequence of ids.

        Ids not in the vocabulary are silently dropped (empty string).
        """
        return "".join(self.id_to_token.get(t, "") for t in tokens)
|