File size: 2,711 Bytes
745f62a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""Benchmark different Whisper models on Hindi medical audio.

Usage:
    python scripts/benchmark_whisper.py

Tests each model on test_audio/*.mp3 and test_audio/*.wav, reports:
  - Transcription output (first 200 chars)
  - Whether key medical values appear as digits
  - Time taken
  - VRAM usage
"""

import time
import os
import sys
import torch

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.hindi_normalize import normalize_transcript

# Directory scanned for benchmark audio. The path is relative, so it is
# resolved against the current working directory when the module is loaded.
AUDIO_DIR = "test_audio"
# os.listdir() returns entries in arbitrary order; sort them so every run
# (and every model) processes the files in the same, reproducible order.
AUDIO_FILES = sorted(
    f for f in os.listdir(AUDIO_DIR) if f.endswith((".mp3", ".wav"))
)

# Digit substrings looked for in the transcript of any audio file whose
# name contains "normal" (the ANC normal recording).
ANC_NORMAL_EXPECT = ["110", "70", "58", "11.5", "24"]
# Digit substrings looked for in the transcript of every other audio file.
ANC_DANGER_EXPECT = ["155", "100"]

# (model identifier, keyword arguments forwarded to each pipeline call).
MODELS = [
    ("vasista22/whisper-hindi-small", {"chunk_length_s": 30}),
    ("collabora/whisper-large-v2-hindi", {"chunk_length_s": 30}),
]


def get_vram_mb():
    """Return the memory PyTorch currently has allocated on CUDA, in MiB.

    The byte count reported by ``torch.cuda.memory_allocated()`` is divided
    by 1024 twice. When CUDA is not available the function returns 0.
    """
    if not torch.cuda.is_available():
        return 0
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / 1024 / 1024


def test_model(model_name, pipe_kwargs):
    """Benchmark one speech-recognition model on every file in ``AUDIO_FILES``.

    Loads ``model_name`` as a Hugging Face ``automatic-speech-recognition``
    pipeline, transcribes each audio file and prints: the load time, the
    VRAM delta, the per-file transcription time, the first 200 characters of
    the raw and normalized transcripts, and for each expected value whether
    it was found in the raw text (``RAW``), only after normalization
    (``NORM``), or not at all (``MISS``).

    Args:
        model_name: Model identifier passed to ``transformers.pipeline``.
        pipe_kwargs: Keyword arguments forwarded to every pipeline call
            (for example ``{"chunk_length_s": 30}``).

    Returns:
        None. All results are printed to stdout.

    Raises:
        Exception: Anything raised while loading the model or transcribing
            a file propagates to the caller; the model is still released.
    """
    from transformers import pipeline as hf_pipeline

    print(f"\n{'='*60}")
    print(f"MODEL: {model_name}")
    print(f"{'='*60}")

    # Loading a large checkpoint is wasted work when there is nothing to
    # transcribe.
    if not AUDIO_FILES:
        print(f"No audio files found in {AUDIO_DIR!r}; skipping.")
        return

    # Fall back to CPU instead of failing outright on machines without CUDA,
    # consistent with get_vram_mb() which already tolerates that case.
    use_cuda = torch.cuda.is_available()
    if not use_cuda:
        print("CUDA not available; running on CPU (VRAM figures will read 0).")

    vram_before = get_vram_mb()
    print(f"Loading... (VRAM before: {vram_before:.0f} MB)")

    # perf_counter() is monotonic, so intervals are immune to clock changes.
    t0 = time.perf_counter()
    pipe = hf_pipeline(
        "automatic-speech-recognition",
        model=model_name,
        device="cuda" if use_cuda else "cpu",
    )
    load_time = time.perf_counter() - t0
    vram_after = get_vram_mb()
    print(f"Loaded in {load_time:.1f}s (VRAM: {vram_after:.0f} MB, delta: {vram_after - vram_before:.0f} MB)")

    try:
        for audio_file in AUDIO_FILES:
            audio_path = os.path.join(AUDIO_DIR, audio_file)
            print(f"\n--- {audio_file} ---")

            t0 = time.perf_counter()
            result = pipe(audio_path, **pipe_kwargs)
            elapsed = time.perf_counter() - t0

            raw = result["text"].strip()
            normalized = normalize_transcript(raw)

            print(f"Time: {elapsed:.1f}s")
            print(f"Raw ({len(raw)} chars): {raw[:200]}")
            print(f"Normalized ({len(normalized)} chars): {normalized[:200]}")

            # Files whose name contains "normal" are checked against the
            # normal-case values; every other file against the danger-case
            # values.
            expect = ANC_NORMAL_EXPECT if "normal" in audio_file else ANC_DANGER_EXPECT
            for val in expect:
                if val in raw:
                    status = "RAW"
                elif val in normalized:
                    status = "NORM"
                else:
                    status = "MISS"
                print(f"  {val}: {status}")
    finally:
        # Release the model even when a transcription fails, so the next
        # model starts from a clean VRAM baseline. Collect garbage BEFORE
        # emptying the CUDA cache: blocks freed by the collector would
        # otherwise stay cached.
        import gc

        del pipe
        gc.collect()
        if use_cuda:
            torch.cuda.empty_cache()


if __name__ == "__main__":
    # Benchmark each configured model in turn. A failure in one model is
    # reported and does not stop the remaining models from running.
    for entry in MODELS:
        name, pipe_options = entry
        try:
            test_model(name, pipe_options)
        except Exception as err:
            print(f"\nERROR with {name}: {err}")