Safetensors
whisper

Task transcribe is buggy

#2
by inoryQwQ - opened

I changed the task back to the original transcribe task with the following code, but the result is weird.

import torch
from transformers.models.whisper import tokenization_whisper

# Monkey-patch the list of allowed Whisper tasks BEFORE importing the
# tokenizer/model classes below, so the custom task token
# '<|transcribeprecise|>' is accepted alongside the stock
# 'translate'/'transcribe' tasks.
tokenization_whisper.TASK_IDS = ["translate", "transcribe", 'transcribeprecise']

from transformers import (
    WhisperFeatureExtractor, 
    WhisperForConditionalGeneration, 
    WhisperProcessor, 
    WhisperTokenizerFast
)
import soundfile as sf
import numpy as np
from typing import Tuple
import whisper
import base64


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    """Read an audio file and return (mono float32 samples, sample rate).

    Only the first channel is kept; samples come back as a C-contiguous
    numpy array.
    """
    frames, rate = sf.read(filename, always_2d=True, dtype="float32")
    # Drop all channels but the first, then make the slice contiguous.
    mono = frames[:, 0]
    return np.ascontiguousarray(mono), rate


def compute_feat(filename: str, n_mels: int):
    """Return a (1, n_mels, 3000) log-mel spectrogram for a 30 s window.

    The audio is resampled to 16 kHz when needed, then padded/trimmed to
    exactly 30 seconds before the mel features are computed.
    """
    samples, rate = load_audio(filename)
    if rate != 16000:
        import librosa

        samples = librosa.resample(samples, orig_sr=rate, target_sr=16000)
        rate = 16000

    # Whisper expects a fixed 30-second clip at 16 kHz.
    clip = whisper.pad_or_trim(samples)
    assert clip.shape == (16000 * 30,), clip.shape

    mel = whisper.log_mel_spectrogram(clip, n_mels=n_mels).unsqueeze(0)
    assert mel.shape == (1, n_mels, 3000), mel.shape

    return mel


# Sample rate expected by Whisper models.
sr = 16000

# Load the feature extractor / processor / tokenizer / model from the Hub.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    'mesolitica/Malaysian-whisper-large-v3-turbo-v3'
)
processor = WhisperProcessor.from_pretrained(
    'mesolitica/Malaysian-whisper-large-v3-turbo-v3'
)
tokenizer = WhisperTokenizerFast.from_pretrained(
    'mesolitica/Malaysian-whisper-large-v3-turbo-v3'
)
model = WhisperForConditionalGeneration.from_pretrained(
    'mesolitica/Malaysian-whisper-large-v3-turbo-v3', 
    dtype = torch.float32,  # NOTE(review): older transformers releases spell this kwarg `torch_dtype` — confirm against the installed version
).cpu()

# Sanity-check that the custom and standard special tokens resolve to ids.
print(f"new token <|transcribeprecise|> is {tokenizer.convert_tokens_to_ids('<|transcribeprecise|>')}")
print(f"new token <|notimestamps|> is {tokenizer.convert_tokens_to_ids('<|notimestamps|>')}")
 
print(f"n_mels: {model.config.num_mel_bins}")
print(f"encoder_layers: {model.config.encoder_layers}")
print(f"decoder_layers: {model.config.decoder_layers}")

with torch.no_grad():
    # p = processor([assembly], return_tensors='pt')
    # p['input_features'] = p['input_features'].to(torch.float32)

    # Compute log-mel features for the clip and generate with the stock
    # 'transcribe' task and timestamp prediction enabled.
    feature = compute_feat('assembly.mp3', model.config.num_mel_bins)
    r = model.generate(
        feature,
        output_scores=True,
        return_dict_in_generate=True,
        return_timestamps=True, 
        task = 'transcribe',
    )

# Decode the first (and only) generated sequence back to text.
tokens = r['sequences'][0]
print(f"tokens: {tokens}")
print(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(tokens)))

This gives the following result:

<|0.00|> Example<|1.20|><|1.56|> on Aging di Vienna,<|2.48|><|3.46|> Austria<|3.78|><|4.28|> yang telah diadakan pada tahun 1982 dan berasaskan unjuran tersebut, Jabatan Perangkaan Malaysia<|10.36|><|10.84|> menganggarkan<|11.56|><|11.98|> menjelang tahun 2035,<|14.04|><|14.50|> sejumlah 15%<|15.92|><|16.26|> penduduk kita adalah daripada kalangan warga emas.<|18.66|><|19.24|> Untuk makluman<|19.86|><|20.64|> Tuan dan Pertua dan juga Aliam Bohmat, pembangunan sistem pendaftaran warga emas ataupun kita sebutkan EWEN<|25.34|><|25.86|> adalah usaha kerajaan ke arah merealisasikan<|28.36|>

Sign up or log in to comment