| from typing import Dict, List, Any |
| import sys, os, re |
| from tqdm import tqdm |
|
|
| import torch |
| from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig |
| from IndicTransTokenizer.utils import preprocess_batch, postprocess_batch |
| from IndicTransTokenizer.tokenizer import IndicTransTokenizer |
|
|
|
|
class EndpointHandler():
    """Inference-endpoint handler that translates SRT transcripts to Indic
    languages with ai4bharat's IndicTrans2 en->indic 1B model."""

    def __init__(self, direction="en-indic", quantization=""):
        """Load the IndicTrans2 model and tokenizer.

        Args:
            direction: translation direction passed to IndicTransTokenizer
                (default "en-indic").
            quantization: "4-bit" or "8-bit" for bitsandbytes quantization,
                or "" to load the full model and cast it to fp16.
        """
        self.model_name = "ai4bharat/indictrans2-en-indic-1B"

        # SRT cue-index lines are numeric-only; timestamp lines look like
        # "00:00:01,000 --> 00:00:02,000".
        self.utterance_pattern = re.compile(r"^\d+$")
        self.timestamp_pattern = re.compile(r"(\d+:\d+:\d+,\d+)\s*-->\s*(\d+:\d+:\d+,\d+)")

        self.BATCH_SIZE = 16
        self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

        self.model = None
        self.tokenizer = None

        if quantization == "4-bit":
            qconfig = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        elif quantization == "8-bit":
            # NOTE(review): BitsAndBytesConfig defines no bnb_8bit_* parameters;
            # these two kwargs fall into **kwargs and are ignored, so only
            # load_in_8bit takes effect here — confirm intent.
            qconfig = BitsAndBytesConfig(
                load_in_8bit=True,
                bnb_8bit_use_double_quant=True,
                bnb_8bit_compute_dtype=torch.bfloat16,
            )
        else:
            qconfig = None

        self.tokenizer = IndicTransTokenizer(direction=direction)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            self.model_name,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            quantization_config=qconfig,
        )

        # bitsandbytes-quantized models are placed on device during load;
        # only move/half-cast the unquantized model ourselves.
        if qconfig is None:  # fix: identity check instead of `== None`
            self.model = self.model.to(self.DEVICE)
            self.model.half()

        self.model.eval()
|
|
|
|
| def batch_translate(self, input_sentences, src_lang, tgt_lang): |
| translations = [] |
| for i in range(0, len(input_sentences), self.BATCH_SIZE): |
| batch = input_sentences[i : i + self.BATCH_SIZE] |
|
|
| |
| batch, entity_map = preprocess_batch( |
| batch, src_lang=src_lang, tgt_lang=tgt_lang |
| ) |
|
|
| |
| inputs = self.tokenizer( |
| batch, |
| src=True, |
| truncation=True, |
| padding="longest", |
| return_tensors="pt", |
| return_attention_mask=True, |
| ).to(self.DEVICE) |
|
|
| |
| with torch.no_grad(): |
| generated_tokens = self.model.generate( |
| **inputs, |
| use_cache=True, |
| min_length=0, |
| max_length=256, |
| num_beams=5, |
| num_return_sequences=1, |
| ) |
|
|
| |
| generated_tokens = self.tokenizer.batch_decode( |
| generated_tokens.detach().cpu().tolist(), src=False |
| ) |
|
|
| |
| translations += postprocess_batch( |
| generated_tokens, lang=tgt_lang, placeholder_entity_map=entity_map |
| ) |
|
|
| del inputs |
| if torch.cuda.is_available(): |
| torch.cuda.empty_cache() |
|
|
| return translations |
|
|
|
|
| def read_srt(self, srt_path): |
| data = [] |
| with open(srt_path, 'r', encoding='utf-8') as fp: |
| utterance_ind = "" |
| start_end = "" |
| text = "" |
| for ind, line in enumerate(fp.readlines()): |
| line = line.strip() |
| if re.search(self.utterance_pattern, line) is not None: |
| utterance_ind = line |
| elif re.search(self.timestamp_pattern, line) is not None: |
| start_end = line |
| else: |
| text += line |
|
|
| if utterance_ind!='' and start_end!='' and text!='': |
| data.append({'utterance_ind': utterance_ind, 'start_end': start_end, 'text': text}) |
| utterance_ind = '' |
| start_end = '' |
| text = '' |
|
|
| return data |
|
|
| def test(self, inputs) -> List[Dict[str, Any]]: |
| """ |
| data args: |
| inputs (:obj: (transcript_path : 'str', src_lang : 'str', tgt_lang : 'str') |
| kwargs |
| Return: |
| A :obj:`list` | `dict`: will be serialized and returned |
| """ |
|
|
| src_lang = inputs["src_lang"] |
| tgt_lang = inputs["tgt_lang"] |
| transcript_path = inputs["transcript_path"] |
|
|
| output_translations = [] |
| if self.model is not None: |
| transcriptions = self.read_srt(transcript_path) |
| trans_sents = [entry['text'] for entry in transcriptions] |
| indic_translations = self.batch_translate(trans_sents, src_lang, tgt_lang) |
|
|
| for i in tqdm(range(len(transcriptions))): |
| entry = transcriptions[i] |
| entry['text'] = indic_translations[i] |
| output_translations.append(entry) |
|
|
| return output_translations |
| else: |
| return [] |
|
|
| def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: |
| """ |
| data args: |
| inputs (:obj: (transcript_path : 'str', src_lang : 'str', tgt_lang : 'str') |
| kwargs |
| Return: |
| A :obj:`list` | `dict`: will be serialized and returned |
| """ |
|
|
| inputs = data.pop("inputs",data) |
|
|
| src_lang = inputs["src_lang"] |
| tgt_lang = inputs["tgt_lang"] |
| transcript_path = inputs["transcript_path"] |
|
|
| output_translations = [] |
| if self.model is not None: |
| transcriptions = self.read_srt(transcript_path) |
| trans_sents = [entry['text'] for entry in transcriptions] |
| indic_translations = self.batch_translate(trans_sents, src_lang, tgt_lang) |
|
|
| for i in tqdm(range(len(transcriptions))): |
| entry = transcriptions[i] |
| entry['text'] = indic_translations[i] |
| output_translations.append(entry) |
|
|
| return output_translations |
| else: |
| return [] |