# Source: sakhi/scripts/test_audio_hindi.py
# Uploaded by Tushar9802 — commit 745f62a ("HF Space deploy — initial")
"""
MedScribe v2 — Hindi Audio Input Test (Gate 1)
Tests Gemma 4 E4B's native audio input with Hindi speech.
CRITICAL CONSTRAINT: E4B has a 30-second audio limit (750 tokens at 25 tok/sec).
ASHA conversations are 10-15 minutes. This script tests:
1. Single 30-sec chunk processing
2. Audio chunking strategy for long conversations
3. Hindi ASR quality baseline
4. Whisper fallback if E4B Hindi ASR is insufficient
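Usage:
    python scripts/test_audio_hindi.py --audio <path.wav>
    python scripts/test_audio_hindi.py --generate-test  # write a 30-sec silent placeholder WAV and print tips for getting real Hindi audio
    python scripts/test_audio_hindi.py --audio <path.wav> --whisper-fallback  # test Whisper as ASR backup
    python scripts/test_audio_hindi.py --audio <path.wav> --chunk-only  # only split the audio into chunks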
"""
import argparse
import json
import os
import sys
import time
from pathlib import Path
# ── Audio Chunking ──────────────────────────────────────────────────────────
# Length of each chunk written by chunk_audio(), in seconds.
CHUNK_DURATION_SEC = 28 # 2-sec margin under 30-sec limit
# Seconds of audio shared by two consecutive chunks.
OVERLAP_SEC = 2 # overlap to avoid cutting mid-word
# Sample rate in Hz; every librosa.load() call in this file resamples to it.
SAMPLE_RATE = 16000
def chunk_audio(audio_path: str, chunk_dir: str = None) -> list[dict]:
    """
    Split an audio file into chunks of at most CHUNK_DURATION_SEC seconds.

    The audio is loaded as mono at SAMPLE_RATE. Consecutive chunks overlap by
    OVERLAP_SEC seconds so that a word cut at one boundary is whole in the
    neighbouring chunk. Each chunk is written as ``chunk_NNN.wav``.

    Args:
        audio_path: Path of the audio file to split.
        chunk_dir: Directory for the chunk files. Defaults to a ``chunks``
            directory next to the audio file. Created if missing.

    Returns:
        One dict per chunk, in order, with keys ``path``, ``start_sec``,
        ``end_sec`` and ``duration_sec``. Empty when the audio has no samples.
    """
    import librosa
    import soundfile as sf

    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    total_duration = len(y) / sr
    print(f" Audio loaded: {total_duration:.1f}s, {sr}Hz, mono")

    if chunk_dir is None:
        chunk_dir = os.path.join(os.path.dirname(audio_path), "chunks")
    os.makedirs(chunk_dir, exist_ok=True)

    chunks = []
    step = CHUNK_DURATION_SEC - OVERLAP_SEC
    start = 0
    while start < total_duration:
        end = min(start + CHUNK_DURATION_SEC, total_duration)
        samples = y[int(start * sr):int(end * sr)]
        chunk_path = os.path.join(chunk_dir, f"chunk_{len(chunks):03d}.wav")
        sf.write(chunk_path, samples, sr)
        chunks.append({
            "path": chunk_path,
            "start_sec": start,
            "end_sec": end,
            "duration_sec": end - start,
        })
        if end >= total_duration:
            # This chunk already reaches the end of the audio. Stepping once
            # more would only emit a short tail that is fully contained in it
            # (e.g. a 27 s file would yield 0-27 s and then 26-27 s).
            break
        start += step

    print(f" Split into {len(chunks)} chunks ({CHUNK_DURATION_SEC}s each, {OVERLAP_SEC}s overlap)")
    return chunks
# ── Gemma 4 E4B Audio Processing ───────────────────────────────────────────
# Loaded (processor, model) pairs keyed by model id. Chunked transcription calls
# test_e4b_audio() once per chunk; caching keeps that from reloading the
# weights on every call.
_E4B_CACHE = {}


def _load_e4b(model_id: str):
    """Return the (processor, model) pair for model_id, loading it on first use."""
    if model_id not in _E4B_CACHE:
        import torch
        from transformers import AutoModelForCausalLM, AutoProcessor

        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        _E4B_CACHE[model_id] = (processor, model)
    return _E4B_CACHE[model_id]


def test_e4b_audio(audio_path: str, device: str = "cuda"):
    """
    Transcribe one audio clip (30 s or shorter) with Gemma 4 E4B via Transformers.

    Args:
        audio_path: Audio file to transcribe; loaded as mono at SAMPLE_RATE.
        device: Kept for backward compatibility. The model is placed by
            ``device_map="auto"`` and the inputs are moved to wherever the
            model ended up, so this value no longer has to match it.

    Returns:
        The decoded transcription text, or None when the clip is longer than
        30 seconds (split it with chunk_audio() first).
    """
    import librosa
    import torch

    print("\n=== Testing Gemma 4 E4B Audio (Transformers) ===")
    print(f" Audio: {audio_path}")

    # Check the clip length before touching the model: loading the weights is
    # by far the most expensive step and is pointless for a clip we reject.
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    duration = len(y) / sr
    print(f" Audio duration: {duration:.1f}s ({int(duration * 25)} tokens)")
    if duration > 30:
        print(f" WARNING: Audio is {duration:.1f}s — exceeds 30s limit. Use chunk_audio() first.")
        return None

    # Load model (served from the cache after the first call in this process)
    print(" Loading Gemma 4 E4B...")
    t0 = time.time()
    processor, model = _load_e4b("google/gemma-4-E4B-it")
    print(f" Model loaded in {time.time() - t0:.1f}s")

    # Build message with audio. The waveform is passed as the numpy array
    # returned by librosa rather than a Python list of floats.
    # NOTE(review): the accepted audio types depend on the installed
    # transformers version — confirm against its chat-template docs.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": y},
                {"type": "text", "text": (
                    "Transcribe the following Hindi/Hinglish speech exactly as spoken. "
                    "Preserve Hindi words in Devanagari script. "
                    "Include all medical terms and numbers precisely."
                )},
            ],
        }
    ]

    # Process
    print(" Running inference...")
    t0 = time.time()
    # return_dict=True is required because the result is unpacked with ** and
    # indexed by "input_ids" below; add_generation_prompt=True opens the
    # assistant turn so the model answers instead of continuing the user turn.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device, dtype=model.dtype)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
        )

    # Decode only the newly generated tokens (everything after the prompt)
    prompt_len = inputs["input_ids"].shape[-1]
    response = processor.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
    elapsed = time.time() - t0
    print(f" Inference time: {elapsed:.1f}s")
    print(f" Transcription:\n {response[:500]}")
    return response
def test_e4b_audio_chunked(audio_path: str, device: str = "cuda"):
    """
    Transcribe audio of any length by splitting it into chunks.

    Audio of 30 seconds or less is handed straight to test_e4b_audio().
    Longer audio is split with chunk_audio() into ``data/temp/chunks``, each
    chunk is transcribed on its own, and the non-empty results are joined
    with single spaces (overlapping speech is not de-duplicated). The
    per-chunk results and the joined text are also written to
    ``data/temp/transcription_result.json``.

    Returns:
        The joined transcription, or whatever test_e4b_audio() returns for
        audio that needs no chunking.
    """
    print("\n=== Chunked Audio Processing ===")
    import librosa

    samples, rate = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    duration = len(samples) / rate
    if duration <= 30:
        print(f" Audio is {duration:.1f}s — no chunking needed")
        return test_e4b_audio(audio_path, device)

    temp_dir = os.path.join("data", "temp")
    chunks = chunk_audio(audio_path, os.path.join(temp_dir, "chunks"))
    n_chunks = len(chunks)

    transcriptions = []
    for idx, piece in enumerate(chunks):
        print(f"\n --- Chunk {idx + 1}/{n_chunks} ({piece['start_sec']:.0f}s-{piece['end_sec']:.0f}s) ---")
        text = test_e4b_audio(piece["path"], device)
        if not text:
            # Chunks that produced no text are left out of the result.
            continue
        transcriptions.append({
            "chunk_index": idx,
            "start_sec": piece["start_sec"],
            "end_sec": piece["end_sec"],
            "text": text,
        })

    # Assemble by plain concatenation
    full_text = " ".join(entry["text"] for entry in transcriptions)
    print(f"\n === Full Transcription ({len(transcriptions)} chunks) ===")
    print(f" {full_text[:1000]}")

    # Save
    output_path = os.path.join(temp_dir, "transcription_result.json")
    os.makedirs(temp_dir, exist_ok=True)
    payload = {
        "audio_path": audio_path,
        "total_duration_sec": duration,
        "num_chunks": n_chunks,
        "transcriptions": transcriptions,
        "full_text": full_text,
    }
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print(f" Saved to {output_path}")
    return full_text
# ── Whisper Fallback Test ──────────────────────────────────────────────────
def test_whisper_fallback(audio_path: str, device: str = "cuda"):
    """
    Transcribe Hindi audio with Whisper as an ASR fallback.

    If E4B's native Hindi ASR is insufficient, the pipeline becomes:
        Whisper (Hindi ASR) → text → Gemma 4 E4B (extraction)
    This is two models but still better than v1's three-model chain.

    ``openai/whisper-small`` is tried first; ``openai/whisper-medium`` is only
    tried if the smaller model raises.

    Args:
        audio_path: Audio file to transcribe.
        device: Device passed to the Transformers pipeline ("cuda", "cpu", ...).

    Returns:
        The transcription from the first model that succeeds, or None if
        every model failed.
    """
    import torch
    from transformers import pipeline

    print("\n=== Whisper Fallback Test (Hindi) ===")
    print(f" Audio: {audio_path}")

    # Half precision is meant for GPU inference; on CPU it can be unsupported
    # or very slow, so use float32 when the caller asked for the CPU.
    dtype = torch.float32 if str(device).startswith("cpu") else torch.float16

    # Try whisper-small first (lighter), fall back to medium if it fails
    for model_id in ("openai/whisper-small", "openai/whisper-medium"):
        print(f"\n Testing {model_id}...")
        t0 = time.time()
        try:
            pipe = pipeline(
                "automatic-speech-recognition",
                model=model_id,
                device=device,
                torch_dtype=dtype,
            )
            result = pipe(
                audio_path,
                generate_kwargs={"language": "hindi", "task": "transcribe"},
                chunk_length_s=30,
                batch_size=8,
                return_timestamps=True,
            )
        except Exception as e:
            # Deliberate best-effort: report the failure and try the next model.
            print(f" Failed: {e}")
            continue

        elapsed = time.time() - t0
        print(f" Time: {elapsed:.1f}s")
        print(f" Transcription:\n {result['text'][:500]}")
        if result.get("chunks"):
            print(f" Timestamps: {len(result['chunks'])} segments")
        return result["text"]

    print(" All Whisper models failed.")
    return None
# ── Test Audio Generation ──────────────────────────────────────────────────
def generate_test_audio():
    """
    Write a placeholder test clip and explain how to get real Hindi audio.

    Creates ``data/raw/test_hindi_30s.wav`` containing 30 seconds of silence
    at SAMPLE_RATE (a structural test only, there is no speech in it), then
    prints suggestions for obtaining real recordings and a few sample
    sentences from an ASHA visit context.

    Returns:
        Path of the placeholder WAV file.
    """
    import numpy as np
    import soundfile as sf

    out_dir = "data/raw"
    test_path = "data/raw/test_hindi_30s.wav"
    os.makedirs(out_dir, exist_ok=True)

    # 30 seconds of float32 zeros, i.e. silence
    n_samples = SAMPLE_RATE * 30
    sf.write(test_path, np.zeros(n_samples, dtype=np.float32), SAMPLE_RATE)
    print(f" Created placeholder: {test_path} (30s silent)")

    # An empty entry prints a blank line, exactly like a bare print().
    guidance = (
        "",
        " To test with real Hindi audio, you need one of:",
        " 1. Record a Hindi conversation sample (phone/mic)",
        " 2. Use Google TTS: gtts-cli 'नमस्ते, मेरा नाम सुनीता है' --lang hi -o test.mp3",
        " 3. Download from Common Voice Hindi dataset",
        " 4. Use a sample from Mozilla Common Voice (hindi split)",
        "",
        " Recommended test sentences (ASHA visit context):",
        ' - "दीदी, मुझे सिर में बहुत दर्द हो रहा है और आँखों के सामने धुंधला दिख रहा है"',
        ' - "बच्चे का वज़न 2.1 किलो है, दूध ठीक से नहीं पी रहा"',
        ' - "पिछली बार बी.पी. 140/90 आया था, अभी भी पैर सूजे हुए हैं"',
    )
    for line in guidance:
        print(line)
    return test_path
# ── Main ───────────────────────────────────────────────────────────────────
def main():
    """
    Command-line entry point.

    Modes, checked in this order:
      --generate-test     write the placeholder clip and exit
      --chunk-only        split --audio into chunks and list them
      --whisper-fallback  transcribe --audio with Whisper
      (default)           transcribe --audio with Gemma 4 E4B, chunking it
                          first when it is longer than 30 seconds

    Every mode except --generate-test needs --audio to point at an existing
    file; otherwise an error is printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description="MedScribe v2 — Hindi Audio Test")
    parser.add_argument("--audio", type=str, help="Path to Hindi audio file")
    switches = (
        ("--generate-test", "Generate test audio placeholder"),
        ("--whisper-fallback", "Test Whisper as backup ASR"),
        ("--chunk-only", "Only test audio chunking"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)
    parser.add_argument("--device", type=str, default="cuda", help="Device (cuda/cpu)")
    args = parser.parse_args()

    if args.generate_test:
        generate_test_audio()
        return

    audio = args.audio
    if not audio:
        print("Error: provide --audio <path> or --generate-test")
        sys.exit(1)
    if not os.path.exists(audio):
        print(f"Error: file not found: {audio}")
        sys.exit(1)

    if args.chunk_only:
        for info in chunk_audio(audio):
            print(f" Chunk: {info['start_sec']:.0f}s-{info['end_sec']:.0f}s → {info['path']}")
        return

    if args.whisper_fallback:
        test_whisper_fallback(audio, args.device)
        return

    # Default mode: E4B native audio, chunked only when the clip is too long
    import librosa

    samples, rate = librosa.load(audio, sr=SAMPLE_RATE, mono=True)
    runner = test_e4b_audio_chunked if len(samples) / rate > 30 else test_e4b_audio
    runner(audio, args.device)


if __name__ == "__main__":
    main()