Task transcribe is buggy
#2
by inoryQwQ - opened
I changed the task back to the original `transcribe` task using the following code, but the result is weird.
import torch
from transformers.models.whisper import tokenization_whisper
# Monkey-patch: register the extra custom task id so the Whisper tokenizer
# accepts task='transcribeprecise' in addition to the two built-in tasks.
# NOTE(review): this mutates module-level state, so it presumably must run
# before any tokenizer/processor is constructed — confirm against transformers.
tokenization_whisper.TASK_IDS = ["translate", "transcribe", 'transcribeprecise']
from transformers import (
WhisperFeatureExtractor,
WhisperForConditionalGeneration,
WhisperProcessor,
WhisperTokenizerFast
)
import soundfile as sf
import numpy as np
from typing import Tuple
import whisper
import base64
def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    """Read an audio file and return (mono float32 samples, sample rate).

    Multi-channel files are reduced by keeping only the first channel.
    """
    frames, rate = sf.read(
        filename,
        always_2d=True,
        dtype="float32",
    )
    # Keep channel 0 only; make it contiguous for downstream consumers.
    mono = np.ascontiguousarray(frames[:, 0])
    return mono, rate
def compute_feat(filename: str, n_mels: int):
    """Return a log-mel spectrogram of shape (1, n_mels, 3000) for one file.

    The audio is resampled to 16 kHz when necessary and padded/trimmed to
    exactly 30 seconds before the spectrogram is computed.
    """
    samples, rate = load_audio(filename)
    if rate != 16000:
        # Lazy import: librosa is only needed on the resampling path.
        import librosa
        samples = librosa.resample(samples, orig_sr=rate, target_sr=16000)
        rate = 16000
    clip = whisper.pad_or_trim(samples)
    assert clip.shape == (16000 * 30,), clip.shape
    spectrogram = whisper.log_mel_spectrogram(clip, n_mels=n_mels).unsqueeze(0)
    assert spectrogram.shape == (1, n_mels, 3000), spectrogram.shape
    return spectrogram
# Target sampling rate expected by Whisper feature extraction.
sr = 16000

# Every component is loaded from the same fine-tuned checkpoint.
MODEL_ID = 'mesolitica/Malaysian-whisper-large-v3-turbo-v3'

feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_ID)
processor = WhisperProcessor.from_pretrained(MODEL_ID)
tokenizer = WhisperTokenizerFast.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype=torch.float32,
).cpu()

# Sanity prints: special-token ids and model geometry.
print(f"new token <|transcribeprecise|> is {tokenizer.convert_tokens_to_ids('<|transcribeprecise|>')}")
print(f"new token <|notimestamps|> is {tokenizer.convert_tokens_to_ids('<|notimestamps|>')}")
print(f"n_mels: {model.config.num_mel_bins}")
print(f"encoder_layers: {model.config.encoder_layers}")
print(f"decoder_layers: {model.config.decoder_layers}")

with torch.no_grad():
    # p = processor([assembly], return_tensors='pt')
    # p['input_features'] = p['input_features'].to(torch.float32)
    mel = compute_feat('assembly.mp3', model.config.num_mel_bins)
    generation = model.generate(
        mel,
        output_scores=True,
        return_dict_in_generate=True,
        return_timestamps=True,
        task='transcribe',
    )

token_ids = generation['sequences'][0]
print(f"tokens: {token_ids}")
# Decode via tokens->string so special/timestamp tokens stay visible.
print(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(token_ids)))
This gives the following result:
<|0.00|> Example<|1.20|><|1.56|> on Aging di Vienna,<|2.48|><|3.46|> Austria<|3.78|><|4.28|> yang telah diadakan pada tahun 1982 dan berasaskan unjuran tersebut, Jabatan Perangkaan Malaysia<|10.36|><|10.84|> menganggarkan<|11.56|><|11.98|> menjelang tahun 2035,<|14.04|><|14.50|> sejumlah 15%<|15.92|><|16.26|> penduduk kita adalah daripada kalangan warga emas.<|18.66|><|19.24|> Untuk makluman<|19.86|><|20.64|> Tuan dan Pertua dan juga Aliam Bohmat, pembangunan sistem pendaftaran warga emas ataupun kita sebutkan EWEN<|25.34|><|25.86|> adalah usaha kerajaan ke arah merealisasikan<|28.36|>