# general-deep-learning/data/tokenizers.py
# Author: yetrun — commit 8aa4d8f ("适配 hf-space" / adapt for hf-space)
"""
GPT模型的共享组件模块:
- 分词器
"""
import keras_hub
from keras import layers
from env.resolve import resolve_path, resolve_saved
def sentence_piece():
    """Load the shared SentencePiece tokenizer from the saved vocabulary proto.

    Returns:
        (tokenizer, end_of_text, decode): the tokenizer instance, the id of
        the "<|endoftext|>" token, and a function mapping token ids back
        to text.
    """
    proto_path = resolve_saved("vocab/sentencepiece/vocabulary.proto")
    # NOTE: keras_hub's SentencePieceTokenizer still requires the
    # tensorflow_text package at runtime.
    tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(str(proto_path))
    eot_id = tokenizer.token_to_id("<|endoftext|>")

    def decode(tokens: list[int]) -> str:
        """Convert a sequence of token ids back into a string."""
        return tokenizer.detokenize(tokens)

    return tokenizer, eot_id, decode
def character_vectorization():
    """Simple character-level tokenizer, intended for tests.

    The vocabulary covers lowercase ASCII letters, digits, common punctuation,
    space, newline, and the special "<|endoftext|>" token (kept for parity
    with the sentence_piece tokenizer).

    Returns:
        (vectorizer, end_of_text, decode): the TextVectorization layer, the
        id of the "<|endoftext|>" token, and a function mapping token ids
        back to text.

    Raises:
        ValueError: if the vocabulary does not contain "<|endoftext|>".
    """
    vectorizer = layers.TextVectorization(output_mode="int", split="character")
    vectorizer.set_vocabulary(
        list("abcdefghijklmnopqrstuvwxyz0123456789 .,!?;:()[]{}<>-_\n")
        + ["<|endoftext|>"]  # special token compatible with sentence_piece
    )
    # get_vocabulary() includes the reserved tokens the layer prepends, so
    # indices here match what the vectorizer actually emits.
    vocab = vectorizer.get_vocabulary()
    try:
        # list.index replaces the manual linear scan; same linear cost,
        # clearer intent.
        end_of_text = vocab.index("<|endoftext|>")
    except ValueError:
        raise ValueError("Vocabulary does not contain <|endoftext|> token.")

    def decode(tokens: list[int]) -> str:
        """Convert a sequence of token ids back into a string."""
        return "".join(vocab[token] for token in tokens)

    return vectorizer, end_of_text, decode
def poetry_character_vectorization(
    vocab_path: str = "local/saved/vocab/poetry/vocab.txt",
):
    """Load a character-level poetry tokenizer from a text file.

    Vocabulary file format: one character per line; the file must contain
    the "<|endoftext|>" token (per the original note, on the first line).

    Args:
        vocab_path: path to the vocabulary file; defaults to
            "local/saved/vocab/poetry/vocab.txt".

    Returns:
        (vectorizer, end_of_text, decode): the TextVectorization layer, the
        id of the "<|endoftext|>" token, and a function mapping token ids
        back to text.

    Raises:
        ValueError: if the vocabulary does not contain "<|endoftext|>".
    """
    # Read the vocabulary: strip only the trailing newline so that a literal
    # space entry survives.
    vocab_file = resolve_path(vocab_path)
    with open(vocab_file, "r", encoding="utf-8") as f:
        vocab = [line.rstrip("\n") for line in f]

    # standardize=None keeps characters exactly as listed in the file
    # (no lowercasing / punctuation stripping).
    vectorizer = layers.TextVectorization(
        output_mode="int", split="character", standardize=None
    )
    vectorizer.set_vocabulary(vocab)

    try:
        # list.index replaces the manual linear scan; same linear cost,
        # clearer intent.
        end_of_text = vocab.index("<|endoftext|>")
    except ValueError:
        raise ValueError("Vocabulary does not contain <|endoftext|> token.")

    def decode(tokens: list[int]) -> str:
        """Convert a sequence of token ids back into a string."""
        return "".join(vocab[token] for token in tokens)

    return vectorizer, end_of_text, decode