import os
import argparse
import ast
from time import time

import torch
from tqdm import tqdm
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from transformers.utils import is_flash_attn_2_available

from lang_list import LANGUAGE_NAME_TO_CODE, WHISPER_LANGUAGES


TRANSCRIPTOR_WHISPER = "openai/whisper-large-v3-turbo"
TRANSCRIPTOR_DISTIL_WHISPER = "distil-whisper/distil-large-v3"
TRANSCRIPTOR = TRANSCRIPTOR_DISTIL_WHISPER
def get_language_dict():
    """Map each language name to its Whisper transcriber code and translator code."""
    language_dict = {}
    for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
        # Whisper expects the base code: the part before any underscore, lowercased
        lang_code = language_code.split('_')[0].lower()
        # Keep only the languages Whisper supports
        if lang_code in WHISPER_LANGUAGES:
            language_dict[language_name] = {
                "transcriber": lang_code,
                "translator": language_code,
            }
    return language_dict
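
# Illustrative shape of the returned mapping (hypothetical entries; the real
# names and codes come from lang_list):
#   {"Spanish": {"transcriber": "es", "translator": "es_XX"}}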


def transcription_to_dict(transcription):
    """
    Convert a transcription in string format into a structured dictionary.

    Args:
        transcription (str or dict): Transcription with timestamps, either as a
            string representation or as the dict returned by the pipeline

    Returns:
        dict: Dictionary with the full text and the timestamped chunks,
            or None if the transcription could not be parsed
    """
    try:
        if isinstance(transcription, str):
            # Parse the string safely; ast.literal_eval only accepts Python
            # literals, unlike eval, which would execute arbitrary code
            transcription_dict = ast.literal_eval(transcription)
        else:
            transcription_dict = transcription

        if not isinstance(transcription_dict, dict):
            raise ValueError("The transcription does not have the expected format")

        if 'text' not in transcription_dict or 'chunks' not in transcription_dict:
            raise ValueError("The transcription is missing required fields (text and chunks)")

        # Keep only chunks with text and a complete (start, end) timestamp pair
        cleaned_chunks = []
        for chunk in transcription_dict['chunks']:
            if (chunk.get('text') and
                    isinstance(chunk.get('timestamp'), (list, tuple)) and
                    len(chunk['timestamp']) == 2 and
                    chunk['timestamp'][0] is not None and
                    chunk['timestamp'][1] is not None):
                cleaned_chunks.append({
                    'start': float(chunk['timestamp'][0]),
                    'end': float(chunk['timestamp'][1]),
                    'text': chunk['text'].strip()
                })

        return {
            'text': transcription_dict['text'],
            'chunks': cleaned_chunks
        }

    except Exception as e:
        print(f"Error processing the transcription: {e}")
        return None
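
# Example round trip (illustrative values), assuming the pipeline's usual
# output shape {'text': ..., 'chunks': [{'timestamp': (start, end), 'text': ...}]}:
#   transcription_to_dict({'text': ' Hello world.',
#                          'chunks': [{'timestamp': (0.0, 1.4), 'text': ' Hello world.'}]})
#   -> {'text': ' Hello world.',
#       'chunks': [{'start': 0.0, 'end': 1.4, 'text': 'Hello world.'}]}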


def transcribe(audio_file, language, device, chunk_length_s=30, stride_length_s=5):
    """
    Transcribe an audio file using a Whisper model.

    Args:
        audio_file (str): Path to the audio file
        language (str): Language code for transcription
        device (str): Device to use for inference ('cuda' or 'cpu')
        chunk_length_s (int): Length of audio chunks in seconds
        stride_length_s (int): Stride length between chunks in seconds

    Returns:
        tuple: The raw pipeline output and the cleaned transcription dict
    """
    output_folder = "transcriptions"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Destination path for the .srt subtitle file (computed here, not written
    # by this function)
    audio_filename = os.path.basename(audio_file)
    filename_without_ext = os.path.splitext(audio_filename)[0]
    output_file = os.path.join(output_folder, f"{filename_without_ext}.srt")

    device = torch.device(device)
    # Pick the dtype from the requested device, not from CUDA availability:
    # float16 on CPU would be very slow or unsupported
    torch_dtype = torch.float16 if device.type == "cuda" else torch.float32

    model_id = TRANSCRIPTOR
    t0 = time()

    print(f"Using Flash Attention 2: {is_flash_attn_2_available()}")
    # Use Flash Attention 2 for the whisper-large model when available,
    # otherwise fall back to PyTorch's SDPA implementation
    if TRANSCRIPTOR == TRANSCRIPTOR_WHISPER:
        attn = "flash_attention_2" if is_flash_attn_2_available() else "sdpa"
        model_kwargs = {"attn_implementation": attn}
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            **model_kwargs
        )
    else:
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    # Request word-level timestamps for distil-whisper and segment-level
    # timestamps for whisper
    timestamp = "word" if TRANSCRIPTOR == TRANSCRIPTOR_DISTIL_WHISPER else True

    # Build the ASR pipeline; the whisper branch additionally sets a batch size
    # and forwards the attention implementation chosen above
    if TRANSCRIPTOR == TRANSCRIPTOR_WHISPER:
        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
            chunk_length_s=chunk_length_s,
            stride_length_s=stride_length_s,
            return_timestamps=timestamp,
            max_new_tokens=128,
            batch_size=24,
            model_kwargs=model_kwargs
        )
    else:
        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
            chunk_length_s=chunk_length_s,
            stride_length_s=stride_length_s,
            return_timestamps=timestamp,
            max_new_tokens=128,
        )

    # Run inference; both branches use greedy decoding (num_beams=1) with the
    # KV cache enabled
    if TRANSCRIPTOR == TRANSCRIPTOR_WHISPER:
        result = pipe(
            audio_file,
            return_timestamps=timestamp,
            batch_size=24,
            generate_kwargs={
                "language": language,
                "task": "transcribe",
                "use_cache": True,
                "num_beams": 1
            }
        )
    else:
        result = pipe(
            audio_file,
            return_timestamps=timestamp,
            generate_kwargs={
                "language": language,
                "task": "transcribe",
                "use_cache": True,
                "num_beams": 1
            }
        )

    t = time()
    print(f"Time to transcribe: {t - t0:.2f} seconds")

    # The pipeline returns a dict with 'text' and 'chunks'; normalize it
    transcription_dict = transcription_to_dict(result)

    return result, transcription_dict
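

# transcribe() computes an .srt output path but never writes it. The helpers
# below are a minimal sketch (hypothetical, not part of the original pipeline)
# of how the cleaned chunks could be serialized to that path.

def format_srt_time(seconds):
    """Format seconds as an SRT timestamp, e.g. 3.5 -> '00:00:03,500'."""
    ms = int(round(seconds * 1000))
    hours, ms = divmod(ms, 3_600_000)
    minutes, ms = divmod(ms, 60_000)
    secs, ms = divmod(ms, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"


def write_srt(chunks, output_file):
    """Write chunks (as produced by transcription_to_dict) as numbered SRT cues."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for i, chunk in enumerate(chunks, start=1):
            start = format_srt_time(chunk['start'])
            end = format_srt_time(chunk['end'])
            f.write(f"{i}\n{start} --> {end}\n{chunk['text']}\n\n")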


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Transcribe audio files')
    parser.add_argument('input_files', help='File listing the input audio files, one per line')
    parser.add_argument('language', help='Language of the audio files')
    parser.add_argument('num_speakers', help='Number of speakers in the audio file')
    parser.add_argument('device', help='Device to use for PyTorch inference')
    args = parser.parse_args()

    chunks_folder = "chunks"

    with open(args.input_files, 'r') as f:
        inputs = f.read().splitlines()

    # Resolve the transcriber code once; all chunks share the same language
    language_dict = get_language_dict()
    language = language_dict[args.language]["transcriber"]

    progress_bar = tqdm(total=len(inputs), desc="Transcribe audio files progress")
    for input_path in inputs:
        # Locate the chunk by its base name; chunks are stored as .mp3
        input_name = os.path.splitext(os.path.basename(input_path))[0]
        file = f'{chunks_folder}/{input_name}.mp3'
        # Pass the device as the third argument; num_speakers is parsed for
        # interface compatibility but transcribe() does not use it
        transcribe(file, language, args.device)
        progress_bar.update(1)
    progress_bar.close()
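
# Example invocation (script and file names are illustrative):
#   python transcribe.py inputs.txt English 2 cuda
# where inputs.txt lists one audio path per line, e.g. chunks/episode_01.mp3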