# general-deep-learning/data/tokenizers.py
# Author: yetrun — commit 8aa4d8f ("适配 hf-space" / adapt for hf-space)
"""
GPT模型的共享组件模块:
- 分词器
"""
import keras_hub
from keras import layers
from env.resolve import resolve_path, resolve_saved
def sentence_piece():
    """Load the shared SentencePiece tokenizer from the saved vocabulary proto.

    Returns:
        (tokenizer, end_of_text, decode): the tokenizer instance, the id of
        the "<|endoftext|>" token, and a function mapping token ids back
        to text.
    """
    proto_path = resolve_saved("vocab/sentencepiece/vocabulary.proto")
    # NOTE: keras_hub's SentencePieceTokenizer still requires the
    # tensorflow_text package at runtime.
    tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(str(proto_path))
    eot_id = tokenizer.token_to_id("<|endoftext|>")

    def decode(tokens: list[int]) -> str:
        """Convert a sequence of token ids back into a string."""
        return tokenizer.detokenize(tokens)

    return tokenizer, eot_id, decode
def character_vectorization():
    """Simple character-level tokenizer, intended for tests.

    The vocabulary covers lowercase ASCII letters, digits, common punctuation,
    space, newline, and the special "<|endoftext|>" token (kept for parity
    with the sentence_piece tokenizer).

    Returns:
        (vectorizer, end_of_text, decode): the TextVectorization layer, the
        id of the "<|endoftext|>" token, and a function mapping token ids
        back to text.

    Raises:
        ValueError: if the vocabulary does not contain "<|endoftext|>".
    """
    vectorizer = layers.TextVectorization(output_mode="int", split="character")
    vectorizer.set_vocabulary(
        list("abcdefghijklmnopqrstuvwxyz0123456789 .,!?;:()[]{}<>-_\n")
        + ["<|endoftext|>"]  # special token compatible with sentence_piece
    )
    # get_vocabulary() includes the reserved tokens the layer prepends, so
    # indices here match what the vectorizer actually emits.
    vocab = vectorizer.get_vocabulary()
    try:
        # list.index replaces the manual linear scan; same linear cost,
        # clearer intent.
        end_of_text = vocab.index("<|endoftext|>")
    except ValueError:
        raise ValueError("Vocabulary does not contain <|endoftext|> token.")

    def decode(tokens: list[int]) -> str:
        """Convert a sequence of token ids back into a string."""
        return "".join(vocab[token] for token in tokens)

    return vectorizer, end_of_text, decode
def poetry_character_vectorization(
    vocab_path: str = "local/saved/vocab/poetry/vocab.txt",
):
    """Load a character-level poetry tokenizer from a text file.

    Vocabulary file format: one character per line; the file must contain
    the "<|endoftext|>" token (per the original note, on the first line).

    Args:
        vocab_path: path to the vocabulary file; defaults to
            "local/saved/vocab/poetry/vocab.txt".

    Returns:
        (vectorizer, end_of_text, decode): the TextVectorization layer, the
        id of the "<|endoftext|>" token, and a function mapping token ids
        back to text.

    Raises:
        ValueError: if the vocabulary does not contain "<|endoftext|>".
    """
    # Read the vocabulary: strip only the trailing newline so that a literal
    # space entry survives.
    vocab_file = resolve_path(vocab_path)
    with open(vocab_file, "r", encoding="utf-8") as f:
        vocab = [line.rstrip("\n") for line in f]

    # standardize=None keeps characters exactly as listed in the file
    # (no lowercasing / punctuation stripping).
    vectorizer = layers.TextVectorization(
        output_mode="int", split="character", standardize=None
    )
    vectorizer.set_vocabulary(vocab)

    try:
        # list.index replaces the manual linear scan; same linear cost,
        # clearer intent.
        end_of_text = vocab.index("<|endoftext|>")
    except ValueError:
        raise ValueError("Vocabulary does not contain <|endoftext|> token.")

    def decode(tokens: list[int]) -> str:
        """Convert a sequence of token ids back into a string."""
        return "".join(vocab[token] for token in tokens)

    return vectorizer, end_of_text, decode