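"""
Builds a tokenizer for ASR text, either a SentencePiece (spe) or a WordPiece (wpe) model,
from one or more dataset manifests or from a plain text file.

Example usage (illustrative; adjust the script path, data paths, and options to your setup):

    python <this_script>.py \
        --manifest="/path/to/train_manifest.json" \
        --data_root="/path/to/tokenizer_output" \
        --vocab_size=1024 \
        --tokenizer="spe" \
        --spe_type="bpe" \
        --log
"""
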
import argparse
import json
import logging
import os
from typing import List, Optional

import tokenizers

from nemo.collections.common.tokenizers.sentencepiece_tokenizer import create_spt_model
from nemo.utils.data_utils import DataStoreObject

parser = argparse.ArgumentParser(description='Create tokenizer')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--manifest", default=None, type=str, help='Comma separated list of manifest files')
group.add_argument("--data_file", default=None, help='Data file from which to create the tokenizer model')
parser.add_argument("--data_root", required=True, default=None, type=str, help='Output directory')
parser.add_argument("--vocab_size", default=1024, type=int, help='Vocabulary size')
parser.add_argument("--tokenizer", default="wpe", choices=["spe", "wpe"], help='Type of tokenization to perform')
parser.add_argument(
    "--spe_type",
    default="bpe",
    choices=['bpe', 'unigram', 'char', 'word'],
    help='Type of the SentencePiece model. Can be `bpe`, `unigram`, `char` or `word`. '
    'Used only if --tokenizer == `spe`',
)
parser.add_argument(
    '--spe_character_coverage',
    type=float,
    default=1.0,
    help="Character coverage percentage for SentencePiece tokenization. For languages "
    "with a large character set, should be close to 0.9995; otherwise keep it at 1.0",
)
parser.add_argument('--spe_bos', action='store_true', help='Add <s> token to the SentencePiece tokenizer.')
parser.add_argument('--spe_eos', action='store_true', help='Add </s> token to the SentencePiece tokenizer.')
parser.add_argument('--spe_pad', action='store_true', help='Add <pad> token to the SentencePiece tokenizer.')
parser.add_argument(
    '--spe_user_defined_symbols', default=None, type=str, nargs='+', help='User defined symbols for SentencePiece'
)
parser.add_argument(
    '--spe_control_symbols', default=None, type=str, nargs='+', help='Control symbols for SentencePiece'
)
parser.add_argument('--spe_split_digits', action='store_true', help='Split digits into separate tokens.')
parser.add_argument(
    '--spe_remove_extra_whitespaces',
    action='store_true',
    help='Remove leading, trailing, and duplicate internal whitespace.',
)
parser.add_argument(
    '--spe_sample_size',
    type=int,
    default=-1,
    help="Sample the dataset by `sample_size` if a positive integer is given, otherwise use the whole dataset",
)
parser.add_argument(
    '--spe_train_extremely_large_corpus',
    action='store_true',
    help='Use the SentencePiece large-corpus training mode (requires sufficient RAM).',
)
parser.add_argument(
    '--spe_max_sentencepiece_length',
    type=int,
    default=-1,
    help='Limit the maximum length (in characters) of each SentencePiece subword. '
    'Must be a positive integer > 0. By default, no limit is placed on subword length.',
)
parser.add_argument(
    '--spe_no_split_by_unicode_script',
    dest='spe_split_by_unicode_script',
    action='store_false',
    help="Don't use Unicode script boundaries to split sentence pieces.",
)
parser.add_argument(
    '--spe_byte_fallback',
    dest='spe_byte_fallback',
    action='store_true',
    help="Decompose pieces that would map to <unk> into byte sequences of their characters.",
)
parser.add_argument(
    '--no_lower_case', dest='lower_case', action='store_false', help='Do not lower case the text before tokenization.'
)
parser.add_argument("--log", action='store_true', help='Enable INFO-level logging.')
parser.set_defaults(log=False, lower_case=True, spe_train_extremely_large_corpus=False)
args = parser.parse_args()


def __build_document_from_manifests(
    data_root: str,
    manifests: str,
):
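    """
    Concatenates the 'text' field of every entry in the given manifest file(s) into a single
    text corpus at `<data_root>/text_corpus/document.txt`, creating it only if it does not
    already exist.

    Each manifest line is expected to be a JSON object containing at least a 'text' field,
    for example (illustrative): {"audio_filepath": "sample.wav", "duration": 2.5, "text": "hello world"}

    Args:
        data_root: output directory under which the text corpus is written.
        manifests: path to a manifest file, or a comma separated list of manifest paths.

    Returns:
        Path to the assembled text corpus file.
    """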
    if ',' in manifests:
        manifests = manifests.split(',')
    else:
        manifests = [manifests]

    document_dir = os.path.join(data_root, 'text_corpus')
    if not os.path.exists(document_dir):
        os.makedirs(document_dir)

    document_path = os.path.join(document_dir, 'document.txt')

    if os.path.exists(document_path):
        logging.info('Corpus already exists at path : %s', document_path)
        return document_path

    num_lines = 0
    with open(document_path, 'w') as out_writer:
        for manifest in manifests:
            with open(DataStoreObject(manifest).get(), 'r') as in_reader:
                for line in in_reader:
                    item = json.loads(line)
                    text = item['text']

                    out_writer.write(text + '\n')
                    out_writer.flush()

                    num_lines += 1

            logging.info("Finished extracting manifest : %s", manifest)

    logging.info("Finished extracting all manifests ! Number of sentences : %d", num_lines)
    return document_path


def __process_data(
    text_path: str,
    dst_folder: str,
    vocab_size: int,
    tokenizer_type: str,
    spe_type: str,
    spe_character_coverage: float,
    spe_train_extremely_large_corpus: bool,
    spe_sample_size: int,
    spe_max_sentencepiece_length: int,
    spe_split_by_unicode_script: bool,
    spe_bos: bool,
    spe_eos: bool,
    spe_pad: bool,
    spe_control_symbols: Optional[List[str]],
    spe_user_defined_symbols: Optional[List[str]],
    spe_byte_fallback: bool,
    spe_split_digits: bool,
    spe_remove_extra_whitespaces: bool,
    lower_case: bool,
):
| """ |
| Converts flac to wav and build manifests's json |
| Args: |
| text_path: source with text lines |
| dst_folder: where wav files will be stored |
| vocab_size: vocabular size used in encoding the text |
| tokenizer_type: type of tokenization to perform - wpe or spe |
| spe_type: type of tokenization model used for spe. |
| spe_character_coverage: float value between 0 and 1 (as a percentage). For languages with a vast charset, |
| can be < 1.0, but for all other languages, it should be set as 1.0 |
| spe_sample_size: int, default of -1. If positive integer is used, samples the dataset |
| by given sample size. |
| spe_train_extremely_large_corpus: bool. If dataset is too large, and user has sufficient RAM, |
| this flag can be set to try to trained the tokenizer. Will silently fail if it runs out of RAM. |
| spe_max_sentencepiece_length: Limits the maximum length of the SentencePiece subword that can be constructed. |
| By default, no limit is placed. |
| spe_bos: Bool flag, whether to add <s> to SentencePiece tokenizer vocabulary. |
| spe_eos: Bool flag, whether to add </s> to SentencePiece tokenizer vocabulary. |
| spe_pad: Bool flag, whether to add <pad> to SentencePiece tokenizer vocabulary. |
| spe_control_symbols: control symbols to add to tokenizer, as defined by sentencepiece. |
| These tokens get removed at decode time and are not encoded from the text - can only be added to the input programatically. |
| spe_user_defined_symbols: user symbols to add to tokenizer, as defined by sentencepiece. |
| These tokens remain in the decoded text and are encoded automatically when present in the input text. |
| spe_byte_fallback: If <unk>, fallback to a byte sequence of the character. |
| spe_split_digits: If true, digits are split into individual tokens. |
| spe_remove_extra_whitespaces: If true, removes leading, trailing, and duplicate internal whitespace. |
| lower_case: whether to tokenize with lower case character set only (for english) |
| |
| Returns: |
| """ |
    if tokenizer_type == 'spe':

        # Build the tokenizer output directory name from the tokenizer configuration
        if spe_max_sentencepiece_length > 0:
            tokenizer_dir = os.path.join(
                dst_folder, f'tokenizer_{tokenizer_type}_{spe_type}_v{vocab_size}_max_{spe_max_sentencepiece_length}'
            )
        else:
            tokenizer_dir = os.path.join(dst_folder, f'tokenizer_{tokenizer_type}_{spe_type}_v{vocab_size}')

        if spe_pad:
            tokenizer_dir = f'{tokenizer_dir}_pad'
        if spe_bos:
            tokenizer_dir = f'{tokenizer_dir}_bos'
        if spe_eos:
            tokenizer_dir = f'{tokenizer_dir}_eos'

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        if os.path.exists(os.path.join(tokenizer_dir, 'tokenizer.model')):
            logging.warning("Model file already exists, overwriting old model file !")
            os.remove(os.path.join(tokenizer_dir, 'tokenizer.model'))

        # Train the SentencePiece tokenizer model
        tokenizer_path, vocab_path = create_spt_model(
            data_file=text_path,
            vocab_size=vocab_size,
            sample_size=spe_sample_size,
            do_lower_case=lower_case,
            output_dir=tokenizer_dir,
            tokenizer_type=spe_type,
            character_coverage=spe_character_coverage,
            train_extremely_large_corpus=spe_train_extremely_large_corpus,
            max_sentencepiece_length=spe_max_sentencepiece_length,
            split_by_unicode_script=spe_split_by_unicode_script,
            bos=spe_bos,
            eos=spe_eos,
            pad=spe_pad,
            control_symbols=spe_control_symbols,
            user_defined_symbols=spe_user_defined_symbols,
            byte_fallback=spe_byte_fallback,
            split_digits=spe_split_digits,
            remove_extra_whitespaces=spe_remove_extra_whitespaces,
        )

    else:
        # WordPiece tokenizer (wpe)
        tokenizer_dir = os.path.join(dst_folder, f'tokenizer_{tokenizer_type}_v{vocab_size}')

        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir)

        tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=lower_case)

        tokenizer.train(text_path, vocab_size=vocab_size)
        tokenizer.save_model(tokenizer_dir)

    return tokenizer_dir


def main():
    data_root = args.data_root
    manifests = args.manifest
    data_file = args.data_file
    vocab_size = args.vocab_size
    tokenizer = args.tokenizer
    spe_type = args.spe_type
    spe_character_coverage = args.spe_character_coverage
    spe_sample_size = args.spe_sample_size
    spe_train_extremely_large_corpus = args.spe_train_extremely_large_corpus
    spe_max_sentencepiece_length = args.spe_max_sentencepiece_length
    spe_split_by_unicode_script = args.spe_split_by_unicode_script
    spe_bos, spe_eos, spe_pad = args.spe_bos, args.spe_eos, args.spe_pad
    spe_control_symbols = args.spe_control_symbols
    spe_user_defined_symbols = args.spe_user_defined_symbols
    spe_byte_fallback = args.spe_byte_fallback
    spe_split_digits = args.spe_split_digits
    spe_remove_extra_whitespaces = args.spe_remove_extra_whitespaces
    lower_case = args.lower_case

    if not os.path.exists(data_root):
        os.makedirs(data_root)

    if args.log:
        logging.basicConfig(level=logging.INFO)

    if manifests:
        text_corpus_path = __build_document_from_manifests(data_root, manifests)
    else:
        text_corpus_path = data_file

    tokenizer_path = __process_data(
        text_corpus_path,
        data_root,
        vocab_size,
        tokenizer,
        spe_type,
        lower_case=lower_case,
        spe_character_coverage=spe_character_coverage,
        spe_sample_size=spe_sample_size,
        spe_train_extremely_large_corpus=spe_train_extremely_large_corpus,
        spe_max_sentencepiece_length=spe_max_sentencepiece_length,
        spe_split_by_unicode_script=spe_split_by_unicode_script,
        spe_bos=spe_bos,
        spe_eos=spe_eos,
        spe_pad=spe_pad,
        spe_control_symbols=spe_control_symbols,
        spe_user_defined_symbols=spe_user_defined_symbols,
        spe_byte_fallback=spe_byte_fallback,
        spe_split_digits=spe_split_digits,
        spe_remove_extra_whitespaces=spe_remove_extra_whitespaces,
    )

    print("Serialized tokenizer at location :", tokenizer_path)
    logging.info('Done!')


if __name__ == "__main__":
    main()
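
# Note (illustrative): the serialized tokenizer directory printed above can typically be passed
# to a NeMo ASR model configuration, e.g. `model.tokenizer.dir=<tokenizer_dir>` together with
# `model.tokenizer.type=bpe` (or `wpe`). Check the NeMo documentation for the exact option names
# used by your model and version.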