File size: 11,946 Bytes
745f62a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
"""
MedScribe v2 — Hindi Audio Input Test (Gate 1)
Tests Gemma 4 E4B's native audio input with Hindi speech.

CRITICAL CONSTRAINT: E4B has a 30-second audio limit (750 tokens at 25 tok/sec).
ASHA conversations are 10-15 minutes. This script tests:
  1. Single 30-sec chunk processing
  2. Audio chunking strategy for long conversations
  3. Hindi ASR quality baseline
  4. Whisper fallback if E4B Hindi ASR is insufficient

Usage:
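  python scripts/01_test_audio_hindi.py --audio <path.wav>
  python scripts/01_test_audio_hindi.py --generate-test   # write a silent placeholder WAV and print recording tips
  python scripts/01_test_audio_hindi.py --audio <path.wav> --whisper-fallback  # test Whisper as ASR backup
  python scripts/01_test_audio_hindi.py --audio <path.wav> --chunk-only        # only split the audio into chunks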
"""
import argparse
import json
import os
import sys
import time
from pathlib import Path

# ── Audio Chunking ──────────────────────────────────────────────────────────

# Sample rate in Hz that audio is resampled to on load and written back at.
SAMPLE_RATE = 16_000
# Seconds shared by consecutive chunks, so that a word cut at the edge of one
# chunk is still heard whole in the next.
OVERLAP_SEC = 2
# Length of one chunk in seconds; kept 2 s below the 30-second input limit.
CHUNK_DURATION_SEC = 28


def chunk_audio(audio_path: str, chunk_dir: str = None) -> list[dict]:
    """
    Split an audio file into overlapping chunks of at most CHUNK_DURATION_SEC.

    The file is loaded as mono at SAMPLE_RATE. Consecutive chunks start
    (CHUNK_DURATION_SEC - OVERLAP_SEC) seconds apart, so neighbours share
    OVERLAP_SEC seconds of audio. Each chunk is written to ``chunk_dir`` as
    ``chunk_NNN.wav``. Chunking stops as soon as a chunk reaches the end of
    the audio, so no trailing chunk is produced that lies entirely inside the
    previous one.

    Args:
        audio_path: Path to the source audio file.
        chunk_dir: Directory for the chunk files. Defaults to a ``chunks``
            folder next to ``audio_path``. Created if it does not exist.

    Returns:
        List of dicts with keys ``path``, ``start_sec``, ``end_sec`` and
        ``duration_sec``, in chronological order. Empty when the audio
        contains no samples.

    Raises:
        ValueError: If OVERLAP_SEC is not smaller than CHUNK_DURATION_SEC
            (the window would never advance).
    """
    import librosa
    import soundfile as sf

    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    total_duration = len(y) / sr
    print(f"  Audio loaded: {total_duration:.1f}s, {sr}Hz, mono")

    if chunk_dir is None:
        chunk_dir = os.path.join(os.path.dirname(audio_path), "chunks")
    os.makedirs(chunk_dir, exist_ok=True)

    step = CHUNK_DURATION_SEC - OVERLAP_SEC
    if step <= 0:
        raise ValueError("OVERLAP_SEC must be smaller than CHUNK_DURATION_SEC")

    chunks = []
    start = 0
    while start < total_duration:
        end = min(start + CHUNK_DURATION_SEC, total_duration)
        samples = y[int(start * sr):int(end * sr)]

        chunk_path = os.path.join(chunk_dir, f"chunk_{len(chunks):03d}.wav")
        sf.write(chunk_path, samples, sr)

        chunks.append({
            "path": chunk_path,
            "start_sec": start,
            "end_sec": end,
            "duration_sec": end - start,
        })

        if end >= total_duration:
            # This chunk already covers the tail of the recording. Advancing
            # once more would only re-emit part of its last OVERLAP_SEC
            # seconds and duplicate text in the assembled transcription.
            break
        start += step

    print(f"  Split into {len(chunks)} chunks ({CHUNK_DURATION_SEC}s each, {OVERLAP_SEC}s overlap)")
    return chunks


# ── Gemma 4 E4B Audio Processing ───────────────────────────────────────────

# Loaded (processor, model) pairs keyed by model id, so that repeated calls
# (for example one call per chunk of a long recording) reuse the same weights
# instead of reloading the model every time.
_E4B_CACHE = {}


def _load_e4b(model_id: str):
    """Return the (processor, model) pair for ``model_id``, loading it on first use."""
    import torch
    from transformers import AutoProcessor, AutoModelForCausalLM

    if model_id not in _E4B_CACHE:
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        _E4B_CACHE[model_id] = (processor, model)
    return _E4B_CACHE[model_id]


def test_e4b_audio(audio_path: str, device: str = "cuda"):
    """
    Transcribe one short Hindi/Hinglish audio file with Gemma 4 E4B.

    The audio is loaded as mono at SAMPLE_RATE and rejected if it is longer
    than 30 seconds. Otherwise it is sent to the model together with a
    transcription instruction and the generated text is returned.

    Args:
        audio_path: Path to the audio file (at most 30 seconds long).
        device: Kept for interface compatibility. The model is placed with
            ``device_map="auto"`` and the inputs are moved to wherever the
            model actually ended up, so this value is not used for placement.

    Returns:
        The transcription text, or None when the audio exceeds 30 seconds.
    """
    import librosa
    import torch

    print(f"\n=== Testing Gemma 4 E4B Audio (Transformers) ===")
    print(f"  Audio: {audio_path}")

    # Check the duration before touching the model: rejecting an over-long
    # file should not cost a full model load.
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    duration = len(y) / sr
    print(f"  Audio duration: {duration:.1f}s ({int(duration * 25)} tokens)")

    if duration > 30:
        print(f"  WARNING: Audio is {duration:.1f}s — exceeds 30s limit. Use chunk_audio() first.")
        return None

    print("  Loading Gemma 4 E4B...")
    t0 = time.time()
    processor, model = _load_e4b("google/gemma-4-E4B-it")
    print(f"  Model loaded in {time.time() - t0:.1f}s")

    # Build message with audio. The waveform is passed as the NumPy array
    # returned by librosa rather than a Python list of floats.
    # NOTE(review): confirm the installed transformers version accepts an
    # ndarray here; a file path is the other commonly documented form.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": y},
                {"type": "text", "text": (
                    "Transcribe the following Hindi/Hinglish speech exactly as spoken. "
                    "Preserve Hindi words in Devanagari script. "
                    "Include all medical terms and numbers precisely."
                )},
            ],
        }
    ]

    print("  Running inference...")
    t0 = time.time()
    # return_dict=True is required so that the result can be unpacked into
    # generate() and indexed by "input_ids"; add_generation_prompt=True ends
    # the prompt with the assistant turn so the model answers the request.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
        )

    # Decode only the newly generated tokens, skipping the echoed prompt.
    prompt_len = inputs["input_ids"].shape[-1]
    response = processor.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
    elapsed = time.time() - t0
    print(f"  Inference time: {elapsed:.1f}s")
    print(f"  Transcription:\n    {response[:500]}")

    return response


def test_e4b_audio_chunked(audio_path: str, device: str = "cuda"):
    """
    Transcribe a recording of any length by processing it chunk by chunk.

    Audio of 30 seconds or less is handed straight to test_e4b_audio().
    Longer audio is split with chunk_audio() into data/temp/chunks, each
    chunk is transcribed, and the non-empty results are joined with spaces.
    The per-chunk results and the joined text are also written to
    data/temp/transcription_result.json.

    Returns the joined transcription (or whatever test_e4b_audio() returns
    for short audio).
    """
    print("\n=== Chunked Audio Processing ===")

    import librosa
    waveform, rate = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    duration = len(waveform) / rate

    if duration <= 30:
        print(f"  Audio is {duration:.1f}s — no chunking needed")
        return test_e4b_audio(audio_path, device)

    temp_dir = os.path.join("data", "temp")
    pieces = chunk_audio(audio_path, os.path.join(temp_dir, "chunks"))
    total = len(pieces)

    transcriptions = []
    for number, piece in enumerate(pieces, start=1):
        begin, finish = piece["start_sec"], piece["end_sec"]
        print(f"\n  --- Chunk {number}/{total} ({begin:.0f}s-{finish:.0f}s) ---")
        text = test_e4b_audio(piece["path"], device)
        if not text:
            # Chunks that produced no text are left out of the result.
            continue
        transcriptions.append({
            "chunk_index": number - 1,
            "start_sec": begin,
            "end_sec": finish,
            "text": text,
        })

    # Plain concatenation; text repeated in the overlap is not de-duplicated.
    full_text = " ".join(entry["text"] for entry in transcriptions)
    print(f"\n  === Full Transcription ({len(transcriptions)} chunks) ===")
    print(f"  {full_text[:1000]}")

    # Persist the result next to the chunk directory.
    report = {
        "audio_path": audio_path,
        "total_duration_sec": duration,
        "num_chunks": total,
        "transcriptions": transcriptions,
        "full_text": full_text,
    }
    output_path = os.path.join(temp_dir, "transcription_result.json")
    os.makedirs(temp_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(report, handle, ensure_ascii=False, indent=2)
    print(f"  Saved to {output_path}")

    return full_text


# ── Whisper Fallback Test ──────────────────────────────────────────────────

def test_whisper_fallback(audio_path: str, device: str = "cuda"):
    """
    Test Whisper small/medium as Hindi ASR fallback.

    If E4B's native Hindi ASR is insufficient, we use:
      Whisper (Hindi ASR) → text → Gemma 4 E4B (extraction)
    This is two models but still better than v1's three-model chain.

    The models are tried in order (small, then medium); the first one that
    runs without raising wins. A failure of one model is printed and the
    next one is tried.

    Args:
        audio_path: Path to the audio file to transcribe.
        device: Device passed to the Transformers pipeline (e.g. "cuda", "cpu").

    Returns:
        The transcription text from the first model that succeeds, or None
        if every model failed.
    """
    import torch
    from transformers import pipeline

    print(f"\n=== Whisper Fallback Test (Hindi) ===")
    print(f"  Audio: {audio_path}")

    # Use half precision only on CUDA. On CPU, float16 is slow or unsupported
    # depending on the PyTorch build, which would make every model fail here.
    dtype = torch.float16 if str(device).startswith("cuda") else torch.float32

    # Try whisper-small first (lighter), fall through to medium if it fails
    for model_id in ["openai/whisper-small", "openai/whisper-medium"]:
        print(f"\n  Testing {model_id}...")
        t0 = time.time()
        try:
            pipe = pipeline(
                "automatic-speech-recognition",
                model=model_id,
                device=device,
                torch_dtype=dtype,
            )
            result = pipe(
                audio_path,
                generate_kwargs={"language": "hindi", "task": "transcribe"},
                chunk_length_s=30,
                batch_size=8,
                return_timestamps=True,
            )
        except Exception as e:
            # Deliberate best-effort: report the failure and try the next model.
            print(f"  Failed: {e}")
            continue
        else:
            elapsed = time.time() - t0
            print(f"  Time: {elapsed:.1f}s")
            print(f"  Transcription:\n    {result['text'][:500]}")

            if result.get("chunks"):
                print(f"  Timestamps: {len(result['chunks'])} segments")

            return result["text"]

    print("  All Whisper models failed.")
    return None


# ── Test Audio Generation ──────────────────────────────────────────────────

def generate_test_audio():
    """
    Write a silent placeholder WAV and explain how to get real test audio.

    Creates data/raw/test_hindi_30s.wav containing 30 seconds of silence at
    SAMPLE_RATE (a structural test only, no speech), then prints suggestions
    for obtaining real Hindi recordings and a few sample sentences.

    Returns the path of the placeholder file.
    """
    import numpy as np
    import soundfile as sf

    os.makedirs("data/raw", exist_ok=True)
    test_path = "data/raw/test_hindi_30s.wav"

    # 30 seconds of zeros: enough to exercise the pipeline end to end.
    n_samples = 30 * SAMPLE_RATE
    sf.write(test_path, np.zeros(n_samples, dtype=np.float32), SAMPLE_RATE)
    print(f"  Created placeholder: {test_path} (30s silent)")

    # Empty entries become the blank separator lines of the message.
    guidance = (
        "",
        "  To test with real Hindi audio, you need one of:",
        "  1. Record a Hindi conversation sample (phone/mic)",
        "  2. Use Google TTS: gtts-cli 'नमस्ते, मेरा नाम सुनीता है' --lang hi -o test.mp3",
        "  3. Download from Common Voice Hindi dataset",
        "  4. Use a sample from Mozilla Common Voice (hindi split)",
        "",
        "  Recommended test sentences (ASHA visit context):",
        '  - "दीदी, मुझे सिर में बहुत दर्द हो रहा है और आँखों के सामने धुंधला दिख रहा है"',
        '  - "बच्चे का वज़न 2.1 किलो है, दूध ठीक से नहीं पी रहा"',
        '  - "पिछली बार बी.पी. 140/90 आया था, अभी भी पैर सूजे हुए हैं"',
    )
    for line in guidance:
        print(line)

    return test_path


# ── Main ───────────────────────────────────────────────────────────────────

def main():
    """
    Command-line entry point.

    Modes, checked in this order:
      --generate-test     write the placeholder audio and exit
      --chunk-only        split --audio into chunks and list them
      --whisper-fallback  transcribe --audio with Whisper
      (default)           transcribe --audio with E4B, chunked if over 30 s

    Every mode except --generate-test needs --audio to name an existing
    file; otherwise an error is printed and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(description="MedScribe v2 — Hindi Audio Test")
    cli.add_argument("--audio", type=str, help="Path to Hindi audio file")
    cli.add_argument("--generate-test", action="store_true", help="Generate test audio placeholder")
    cli.add_argument("--whisper-fallback", action="store_true", help="Test Whisper as backup ASR")
    cli.add_argument("--chunk-only", action="store_true", help="Only test audio chunking")
    cli.add_argument("--device", type=str, default="cuda", help="Device (cuda/cpu)")
    opts = cli.parse_args()

    if opts.generate_test:
        generate_test_audio()
        return

    audio = opts.audio
    if not audio:
        print("Error: provide --audio <path> or --generate-test")
        sys.exit(1)

    if not os.path.exists(audio):
        print(f"Error: file not found: {audio}")
        sys.exit(1)

    if opts.chunk_only:
        for piece in chunk_audio(audio):
            print(f"  Chunk: {piece['start_sec']:.0f}s-{piece['end_sec']:.0f}s → {piece['path']}")
        return

    if opts.whisper_fallback:
        test_whisper_fallback(audio, opts.device)
        return

    # Default mode: E4B native audio, picking the chunked path for long files.
    import librosa
    samples, rate = librosa.load(audio, sr=SAMPLE_RATE, mono=True)
    seconds = len(samples) / rate

    transcribe = test_e4b_audio_chunked if seconds > 30 else test_e4b_audio
    transcribe(audio, opts.device)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()