"""
MedScribe v2 — Hindi Audio Input Test (Gate 1)

Tests Gemma 4 E4B's native audio input with Hindi speech.

CRITICAL CONSTRAINT: E4B has a 30-second audio limit (750 tokens at 25 tok/sec).
ASHA conversations are 10-15 minutes. This script tests:
  1. Single 30-sec chunk processing
  2. Audio chunking strategy for long conversations
  3. Hindi ASR quality baseline
  4. Whisper fallback if E4B Hindi ASR is insufficient

Usage:
    python scripts/01_test_audio_hindi.py --audio <path/to/audio.wav>
    python scripts/01_test_audio_hindi.py --audio <path/to/audio.wav> --chunk-only
    python scripts/01_test_audio_hindi.py --audio <path/to/audio.wav> --whisper-fallback
    python scripts/01_test_audio_hindi.py --generate-test   # write a silent placeholder WAV
"""

import argparse
import functools
import json
import os
import sys
import time
from pathlib import Path
from typing import Optional

# The test_* functions below are manual smoke tests that take required
# arguments. They are deliberately left out of the star-import surface so a
# test runner that star-imports this module does not try to collect them.
__all__ = [
    "AUDIO_TOKENS_PER_SEC",
    "CHUNK_DURATION_SEC",
    "E4B_MODEL_ID",
    "MAX_AUDIO_SEC",
    "OVERLAP_SEC",
    "SAMPLE_RATE",
    "chunk_audio",
    "compute_chunk_spans",
    "generate_test_audio",
    "main",
]

# ── Audio Chunking ──────────────────────────────────────────────────────────

CHUNK_DURATION_SEC = 28    # 2-sec margin under 30-sec limit
OVERLAP_SEC = 2            # overlap to avoid cutting mid-word
SAMPLE_RATE = 16000
MAX_AUDIO_SEC = 30         # longest clip sent to E4B in a single request
AUDIO_TOKENS_PER_SEC = 25  # used only for the token estimate that is printed
E4B_MODEL_ID = "google/gemma-4-E4B-it"


def compute_chunk_spans(total_duration, chunk_sec=CHUNK_DURATION_SEC,
                        overlap_sec=OVERLAP_SEC):
    """
    Compute (start_sec, end_sec) windows covering ``total_duration`` seconds.

    Consecutive windows start ``chunk_sec - overlap_sec`` apart, so each one
    shares ``overlap_sec`` seconds with its predecessor. Generation stops as
    soon as a window reaches the end of the audio; continuing would emit a
    final window that lies entirely inside the previous one.

    Returns an empty list when ``total_duration`` is 0 or negative.
    Raises ValueError if ``chunk_sec`` is not positive or ``overlap_sec`` is
    not in ``[0, chunk_sec)`` (the step would be <= 0 and never terminate).
    """
    if chunk_sec <= 0:
        raise ValueError(f"chunk_sec must be positive, got {chunk_sec}")
    if not 0 <= overlap_sec < chunk_sec:
        raise ValueError(
            f"overlap_sec must be in [0, chunk_sec), got {overlap_sec}"
        )

    step = chunk_sec - overlap_sec
    spans = []
    start = 0
    while start < total_duration:
        end = min(start + chunk_sec, total_duration)
        spans.append((start, end))
        if end >= total_duration:
            # This window already covers the tail of the audio.
            break
        start += step
    return spans


def chunk_audio(audio_path: str, chunk_dir: Optional[str] = None) -> list[dict]:
    """
    Split audio file into <=28-second chunks with 2-sec overlap.

    The file is loaded as mono at SAMPLE_RATE and each chunk is written as
    ``chunk_NNN.wav`` into ``chunk_dir`` (default: a ``chunks`` directory next
    to the input file; created if missing).

    Returns list of {path, start_sec, end_sec, duration_sec}.
    """
    import librosa
    import soundfile as sf

    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    total_duration = len(y) / sr
    print(f" Audio loaded: {total_duration:.1f}s, {sr}Hz, mono")

    if chunk_dir is None:
        chunk_dir = os.path.join(os.path.dirname(audio_path), "chunks")
    os.makedirs(chunk_dir, exist_ok=True)

    chunks = []
    for index, (start, end) in enumerate(compute_chunk_spans(total_duration)):
        chunk_path = os.path.join(chunk_dir, f"chunk_{index:03d}.wav")
        sf.write(chunk_path, y[int(start * sr):int(end * sr)], sr)
        chunks.append({
            "path": chunk_path,
            "start_sec": start,
            "end_sec": end,
            "duration_sec": end - start,
        })

    print(f" Split into {len(chunks)} chunks ({CHUNK_DURATION_SEC}s each, {OVERLAP_SEC}s overlap)")
    return chunks


# ── Gemma 4 E4B Audio Processing ───────────────────────────────────────────

@functools.lru_cache(maxsize=1)
def _load_e4b(model_id: str, device_map: str):
    """
    Load and cache the (processor, model) pair.

    Cached so that chunked processing loads the weights once instead of once
    per chunk.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoProcessor

    # NOTE(review): trust_remote_code=True allows code from the model repo to
    # run locally; confirm it is actually required for this checkpoint.
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map=device_map,
        trust_remote_code=True,
    )
    return processor, model


def test_e4b_audio(audio_path: str, device: str = "cuda"):
    """
    Test Gemma 4 E4B native audio input via Transformers.

    Args:
        audio_path: Audio file of at most MAX_AUDIO_SEC seconds.
        device: "cuda..." keeps automatic placement (device_map="auto");
            any other value (e.g. "cpu") is used as the device map directly.

    Returns transcription text, or None if the audio exceeds the limit
    (use chunk_audio() / test_e4b_audio_chunked() for longer files).
    """
    import librosa
    import torch

    print("\n=== Testing Gemma 4 E4B Audio (Transformers) ===")
    print(f" Audio: {audio_path}")

    # Check the duration before loading the model: rejecting an over-long
    # clip should not cost a multi-GB model load.
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    duration = len(y) / sr
    print(f" Audio duration: {duration:.1f}s ({int(duration * AUDIO_TOKENS_PER_SEC)} tokens)")

    if duration > MAX_AUDIO_SEC:
        print(f" WARNING: Audio is {duration:.1f}s — exceeds {MAX_AUDIO_SEC}s limit. Use chunk_audio() first.")
        return None

    # Load model (cached after the first call)
    print(" Loading Gemma 4 E4B...")
    t0 = time.time()
    device_map = "auto" if str(device).startswith("cuda") else device
    processor, model = _load_e4b(E4B_MODEL_ID, device_map)
    print(f" Model loaded in {time.time() - t0:.1f}s")

    # Build message with audio. The file path is passed rather than a Python
    # list of samples; the processor loads the audio itself.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_path},
                {"type": "text", "text": (
                    "Transcribe the following Hindi/Hinglish speech exactly as spoken. "
                    "Preserve Hindi words in Devanagari script. "
                    "Include all medical terms and numbers precisely."
                )},
            ],
        }
    ]

    # Process
    print(" Running inference...")
    t0 = time.time()

    # return_dict=True is needed because the result is unpacked with **inputs
    # and indexed by "input_ids"; add_generation_prompt=True opens the model
    # turn so generation produces the reply.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's actual placement

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
        )

    # Decode only the newly generated tokens (skip the prompt prefix)
    prompt_len = inputs["input_ids"].shape[-1]
    response = processor.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
    elapsed = time.time() - t0

    print(f" Inference time: {elapsed:.1f}s")
    print(f" Transcription:\n {response[:500]}")

    return response


def test_e4b_audio_chunked(audio_path: str, device: str = "cuda"):
    """
    Process long audio by chunking into 28-sec segments.

    Audio of at most MAX_AUDIO_SEC seconds is passed straight to
    test_e4b_audio(). Otherwise chunks are written to data/temp/chunks, each
    is transcribed, and the texts are joined with spaces (text from the
    overlapping seconds is not de-duplicated). The per-chunk results and the
    full text are saved to data/temp/transcription_result.json.

    Returns the assembled transcription text.
    """
    print("\n=== Chunked Audio Processing ===")

    import librosa
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    duration = len(y) / sr

    if duration <= MAX_AUDIO_SEC:
        print(f" Audio is {duration:.1f}s — no chunking needed")
        return test_e4b_audio(audio_path, device)

    chunk_dir = os.path.join("data", "temp", "chunks")
    chunks = chunk_audio(audio_path, chunk_dir)

    transcriptions = []
    for i, chunk in enumerate(chunks):
        print(f"\n --- Chunk {i+1}/{len(chunks)} ({chunk['start_sec']:.0f}s-{chunk['end_sec']:.0f}s) ---")
        text = test_e4b_audio(chunk["path"], device)
        if text:
            transcriptions.append({
                "chunk_index": i,
                "start_sec": chunk["start_sec"],
                "end_sec": chunk["end_sec"],
                "text": text,
            })

    # Assemble (simple concatenation — overlap dedup can be added later)
    full_text = " ".join(t["text"] for t in transcriptions)
    print(f"\n === Full Transcription ({len(transcriptions)} chunks) ===")
    print(f" {full_text[:1000]}")

    # Save
    output_path = os.path.join("data", "temp", "transcription_result.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({
            "audio_path": audio_path,
            "total_duration_sec": duration,
            "num_chunks": len(chunks),
            "transcriptions": transcriptions,
            "full_text": full_text,
        }, f, ensure_ascii=False, indent=2)
    print(f" Saved to {output_path}")

    return full_text


# ── Whisper Fallback Test ──────────────────────────────────────────────────

def test_whisper_fallback(audio_path: str, device: str = "cuda"):
    """
    Test Whisper small/medium as Hindi ASR fallback.

    If E4B's native Hindi ASR is insufficient, we use:
        Whisper (Hindi ASR) → text → Gemma 4 E4B (extraction)
    This is two models but still better than v1's three-model chain.

    whisper-small is tried first; whisper-medium is tried only if the small
    model raises. Returns the first successful transcription text, or None
    if both models fail.
    """
    import torch
    from transformers import pipeline

    print("\n=== Whisper Fallback Test (Hindi) ===")
    print(f" Audio: {audio_path}")

    # Half precision is only used on GPU; fall back to float32 elsewhere.
    dtype = torch.float16 if str(device).startswith("cuda") else torch.float32

    # Try whisper-small first (lighter), upgrade to medium if needed
    for model_id in ["openai/whisper-small", "openai/whisper-medium"]:
        print(f"\n Testing {model_id}...")
        t0 = time.time()

        try:
            pipe = pipeline(
                "automatic-speech-recognition",
                model=model_id,
                device=device,
                torch_dtype=dtype,
            )

            result = pipe(
                audio_path,
                generate_kwargs={"language": "hindi", "task": "transcribe"},
                chunk_length_s=30,
                batch_size=8,
                return_timestamps=True,
            )

            elapsed = time.time() - t0
            print(f" Time: {elapsed:.1f}s")
            print(f" Transcription:\n {result['text'][:500]}")

            if result.get("chunks"):
                print(f" Timestamps: {len(result['chunks'])} segments")

            return result["text"]

        except Exception as e:
            # Deliberate best-effort: report the failure and try the next model.
            print(f" Failed: {e}")
            continue

    print(" All Whisper models failed.")
    return None


# ── Test Audio Generation ──────────────────────────────────────────────────

def generate_test_audio():
    """
    Create a silent placeholder WAV and print how to obtain real test audio.

    Writes 30 seconds of silence at SAMPLE_RATE to data/raw/test_hindi_30s.wav
    (creating data/raw if needed) as a structural test, then prints options
    for getting real Hindi audio and some suggested test sentences.

    Returns the path of the placeholder file.
    """
    import numpy as np
    import soundfile as sf

    os.makedirs("data/raw", exist_ok=True)
    test_path = "data/raw/test_hindi_30s.wav"

    # Create 30-sec silent audio as structural test
    silence = np.zeros(SAMPLE_RATE * 30, dtype=np.float32)
    sf.write(test_path, silence, SAMPLE_RATE)

    print(f" Created placeholder: {test_path} (30s silent)")
    print()
    print(" To test with real Hindi audio, you need one of:")
    print(" 1. Record a Hindi conversation sample (phone/mic)")
    print(" 2. Use Google TTS: gtts-cli 'नमस्ते, मेरा नाम सुनीता है' --lang hi -o test.mp3")
    print(" 3. Download from Common Voice Hindi dataset")
    print(" 4. Use a sample from Mozilla Common Voice (hindi split)")
    print()
    print(" Recommended test sentences (ASHA visit context):")
    print(' - "दीदी, मुझे सिर में बहुत दर्द हो रहा है और आँखों के सामने धुंधला दिख रहा है"')
    print(' - "बच्चे का वज़न 2.1 किलो है, दूध ठीक से नहीं पी रहा"')
    print(' - "पिछली बार बी.पी. 140/90 आया था, अभी भी पैर सूजे हुए हैं"')

    return test_path


# ── Main ───────────────────────────────────────────────────────────────────

def main():
    """
    Parse CLI arguments and run the selected test.

    Precedence: --generate-test, then --chunk-only, then --whisper-fallback,
    otherwise the E4B test (chunked when the audio exceeds MAX_AUDIO_SEC).
    Exits with status 1 if --audio is missing or the file does not exist.
    """
    parser = argparse.ArgumentParser(description="MedScribe v2 — Hindi Audio Test")
    parser.add_argument("--audio", type=str, help="Path to Hindi audio file")
    parser.add_argument("--generate-test", action="store_true", help="Generate test audio placeholder")
    parser.add_argument("--whisper-fallback", action="store_true", help="Test Whisper as backup ASR")
    parser.add_argument("--chunk-only", action="store_true", help="Only test audio chunking")
    parser.add_argument("--device", type=str, default="cuda", help="Device (cuda/cpu)")
    args = parser.parse_args()

    if args.generate_test:
        generate_test_audio()
        return

    if not args.audio:
        print("Error: provide --audio or --generate-test")
        sys.exit(1)

    if not os.path.exists(args.audio):
        print(f"Error: file not found: {args.audio}")
        sys.exit(1)

    if args.chunk_only:
        chunks = chunk_audio(args.audio)
        for c in chunks:
            print(f" Chunk: {c['start_sec']:.0f}s-{c['end_sec']:.0f}s → {c['path']}")
        return

    if args.whisper_fallback:
        test_whisper_fallback(args.audio, args.device)
    else:
        # Try E4B native audio first
        import librosa
        y, sr = librosa.load(args.audio, sr=SAMPLE_RATE, mono=True)
        duration = len(y) / sr

        if duration > MAX_AUDIO_SEC:
            test_e4b_audio_chunked(args.audio, args.device)
        else:
            test_e4b_audio(args.audio, args.device)


if __name__ == "__main__":
    main()