teszenofficial committed on
Commit
5d4912f
·
verified ·
1 Parent(s): a31420b

Upload tokenizer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer.py +93 -0
tokenizer.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import sentencepiece as spm
3
+ import os
4
+ import json
5
+
6
class MTPTokenizer:
    """SentencePiece BPE tokenizer wrapper for instruction-tuning corpora.

    Trains a byte-pair-encoding model from a JSONL corpus whose lines carry
    'instruction' / 'input' / 'output' fields, and exposes encode/decode plus
    the special-token ids (pad=0, unk=1, bos=2, eos=3).
    """

    def __init__(self, model_path=None):
        """Create the wrapper; load an existing .model file if *model_path* exists."""
        self.sp = None
        self.model_path = model_path
        if model_path and os.path.exists(model_path):
            self.load(model_path)

    def train(self, corpus_path, vocab_size=4000, model_prefix='mtp_tokenizer'):
        """Train a BPE model from a JSONL instruction corpus.

        Args:
            corpus_path: path to a JSONL file; each line may contain
                'instruction', 'input' and 'output' string fields, all of
                which are collected as training text ('input' only when
                non-blank). Malformed or non-dict lines are skipped.
            vocab_size: minimum vocabulary size; the effective size scales
                with corpus length (~15% of total characters) but never
                drops below this value.
            model_prefix: basename for the emitted .model/.vocab files.

        Raises:
            ValueError: if the corpus yields no usable text.
            RuntimeError: if SentencePiece training fails for a reason other
                than a too-high vocabulary size (which is retried once at
                the maximum size the trainer reports).
        """
        import re  # only needed to parse the trainer's error message below

        texts = []
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    # Best-effort corpus reading: skip malformed lines
                    # (the original used a bare `except`, hiding real bugs).
                    continue
                if not isinstance(data, dict):
                    continue
                # Only accept string values: non-strings would crash later
                # at '\n'.join in the original code.
                if isinstance(data.get('instruction'), str):
                    texts.append(data['instruction'])
                if isinstance(data.get('input'), str) and data['input'].strip():
                    texts.append(data['input'])
                if isinstance(data.get('output'), str):
                    texts.append(data['output'])

        if not texts:
            raise ValueError("Corpus vacío")

        temp_file = 'temp_corpus.txt'
        with open(temp_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(texts))

        # Scale the vocabulary with corpus size, but never below the
        # caller-requested minimum. (The original hard-coded 4000 and
        # silently ignored the vocab_size argument.)
        total_chars = sum(len(text) for text in texts)
        target_vocab = max(vocab_size, int(total_chars * 0.15))

        # Options shared by the first attempt and the retry.
        common = dict(
            input=temp_file,
            model_prefix=model_prefix,
            model_type='bpe',
            pad_id=0, unk_id=1, bos_id=2, eos_id=3,
            character_coverage=1.0,
            normalization_rule_name='identity',
            num_threads=2,
        )
        try:
            try:
                spm.SentencePieceTrainer.train(
                    vocab_size=target_vocab,
                    split_digits=True,
                    max_sentencepiece_length=16,
                    **common,
                )
            except RuntimeError as e:
                # SentencePiece reports the maximum feasible vocab size in
                # its error message; retry once at exactly that size.
                match = re.search(r'value <= (\d+)', str(e))
                if "Vocabulary size too high" not in str(e) or not match:
                    # The original swallowed this and then crashed in
                    # self.load() on a model file that was never written.
                    raise
                spm.SentencePieceTrainer.train(
                    vocab_size=int(match.group(1)),
                    **common,
                )
        finally:
            # Clean up the temp corpus even when training fails
            # (the original leaked it on any exception).
            if os.path.exists(temp_file):
                os.remove(temp_file)

        self.model_path = f"{model_prefix}.model"
        self.load(self.model_path)
        print(f"✓ Tokenizer trained: {self.vocab_size()} tokens")

    def load(self, model_path):
        """Load a trained SentencePiece model from *model_path*."""
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(model_path)
        self.model_path = model_path

    def encode(self, text):
        """Return the list of token ids for *text*.

        Raises ValueError if no model is loaded.
        """
        if self.sp is None:
            raise ValueError("Tokenizer not loaded")
        return self.sp.encode_as_ids(text)

    def decode(self, ids):
        """Return the text decoded from the token id list *ids*.

        Raises ValueError if no model is loaded.
        """
        if self.sp is None:
            raise ValueError("Tokenizer not loaded")
        return self.sp.decode_ids(ids)

    def vocab_size(self):
        """Return the vocabulary size, or 0 when no model is loaded."""
        if self.sp is None:
            return 0
        return self.sp.get_piece_size()

    # Special-token accessors; these assume a model is loaded
    # (pad=0, unk=1, bos=2, eos=3 as configured in train()).
    def bos_id(self): return self.sp.bos_id()

    def eos_id(self): return self.sp.eos_id()

    def pad_id(self): return self.sp.pad_id()

    def unk_id(self): return self.sp.unk_id()