AxionLab-official committed on
Commit
f724f93
·
verified ·
1 Parent(s): 6e14a07

Create tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +18 -0
tokenizer.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
class Tokenizer:
    """Character-level tokenizer backed by a token-to-id vocabulary.

    Each character of the input text is looked up individually in the
    vocabulary; no multi-character tokens are produced by ``encode``.

    Attributes:
        token_to_id: Mapping from token string to integer id (the object
            passed in as ``vocab``; it is stored, not copied).
        id_to_token: Inverse mapping built once at construction time.
        unk_id: Id emitted by ``encode`` for characters missing from the
            vocabulary.
    """

    def __init__(self, vocab, unk_id=0):
        """Build the tokenizer from a vocabulary mapping.

        Args:
            vocab: Mapping of token string -> integer id.
            unk_id: Id to emit for out-of-vocabulary characters. Defaults
                to ``0``, which was previously the hard-coded fallback.
        """
        self.token_to_id = vocab
        self.unk_id = unk_id
        # If two tokens share an id, the one iterated last wins here.
        self.id_to_token = {
            token_id: token for token, token_id in vocab.items()
        }

    @classmethod
    def load(cls, path, unk_id=0):
        """Create a tokenizer from a JSON file.

        The file must contain a JSON object with a ``"token_to_id"`` key
        holding the vocabulary mapping.

        Args:
            path: Path of the UTF-8 encoded JSON file to read.
            unk_id: Forwarded to the constructor; see ``__init__``.

        Returns:
            A new tokenizer instance.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If the ``"token_to_id"`` key is missing.
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return cls(data["token_to_id"], unk_id=unk_id)

    def encode(self, text):
        """Return the list of ids for each character of *text*.

        Characters missing from the vocabulary map to ``self.unk_id``.
        """
        lookup = self.token_to_id.get
        unk_id = self.unk_id
        return [lookup(char, unk_id) for char in text]

    def decode(self, tokens):
        """Return the string formed by joining the tokens for *tokens*.

        Ids that are not present in the vocabulary are silently dropped
        (they contribute an empty string).
        """
        lookup = self.id_to_token.get
        return "".join([lookup(token_id, "") for token_id in tokens])