Spaces:

Tushar9802
/

sakhi

Sleeping

App Files Files Community

sakhi / scripts /test_audio_hindi.py

Tushar9802

HF Space deploy — initial

745f62a 7 days ago

raw

history blame contribute delete

11.9 kB

	"""
	MedScribe v2 — Hindi Audio Input Test (Gate 1)
	Tests Gemma 4 E4B's native audio input with Hindi speech.

	CRITICAL CONSTRAINT: E4B has a 30-second audio limit (750 tokens at 25 tok/sec).
	ASHA conversations are 10-15 minutes. This script tests:
	1. Single 30-sec chunk processing
	2. Audio chunking strategy for long conversations
	3. Hindi ASR quality baseline
	4. Whisper fallback if E4B Hindi ASR is insufficient

	Usage:
	python scripts/test_audio_hindi.py --audio <path.wav>
	python scripts/test_audio_hindi.py --generate-test # generate a silent placeholder test file
	python scripts/test_audio_hindi.py --audio <path.wav> --whisper-fallback # test Whisper as ASR backup
	"""
	import argparse
	import json
	import os
	import sys
	import time
	from pathlib import Path

	# ── Audio Chunking ──────────────────────────────────────────────────────────

	# Maximum length, in seconds, of each chunk written by chunk_audio().
	CHUNK_DURATION_SEC = 28 # 2-sec margin under 30-sec limit
	# Seconds shared between consecutive chunks (chunk step = duration - overlap).
	OVERLAP_SEC = 2 # overlap to avoid cutting mid-word
	# Sample rate in Hz passed to librosa.load() and used for the generated test WAV.
	SAMPLE_RATE = 16000


	def chunk_audio(audio_path: str, chunk_dir: str = None) -> list[dict]:
	    """
	    Split an audio file into chunks of at most CHUNK_DURATION_SEC seconds.

	    The file is loaded as mono at SAMPLE_RATE. Consecutive chunks share
	    OVERLAP_SEC seconds so a word cut at one boundary is intact in the next
	    chunk. Each chunk is written to ``chunk_dir`` as ``chunk_NNN.wav``.

	    Args:
	        audio_path: Path of the audio file to split.
	        chunk_dir: Directory for the chunk files. Defaults to a ``chunks``
	            folder next to ``audio_path``; created if missing.

	    Returns:
	        One dict per chunk, in order, with keys ``path``, ``start_sec``,
	        ``end_sec`` and ``duration_sec``. Empty when the audio has no samples.

	    Raises:
	        ValueError: If OVERLAP_SEC is not smaller than CHUNK_DURATION_SEC.
	    """
	    import librosa
	    import soundfile as sf

	    step = CHUNK_DURATION_SEC - OVERLAP_SEC
	    if step <= 0:
	        # A non-positive step would never advance and loop forever.
	        raise ValueError("OVERLAP_SEC must be smaller than CHUNK_DURATION_SEC")

	    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
	    total_duration = len(y) / sr
	    print(f" Audio loaded: {total_duration:.1f}s, {sr}Hz, mono")

	    if chunk_dir is None:
	        chunk_dir = os.path.join(os.path.dirname(audio_path), "chunks")
	    os.makedirs(chunk_dir, exist_ok=True)

	    chunks = []
	    start = 0
	    while start < total_duration:
	        end = min(start + CHUNK_DURATION_SEC, total_duration)
	        samples = y[int(start * sr):int(end * sr)]

	        chunk_path = os.path.join(chunk_dir, f"chunk_{len(chunks):03d}.wav")
	        sf.write(chunk_path, samples, sr)

	        chunks.append({
	            "path": chunk_path,
	            "start_sec": start,
	            "end_sec": end,
	            "duration_sec": end - start,
	        })

	        # Stop once a chunk reaches the end of the audio. Stepping again
	        # would emit a tail chunk that lies entirely inside this chunk's
	        # final OVERLAP_SEC seconds and only duplicates its content.
	        if end >= total_duration:
	            break
	        start += step

	    print(f" Split into {len(chunks)} chunks ({CHUNK_DURATION_SEC}s each, {OVERLAP_SEC}s overlap)")
	    return chunks


	# ── Gemma 4 E4B Audio Processing ───────────────────────────────────────────

	# Loaded (processor, model) pairs keyed by (model_id, device). Chunked runs call
	# test_e4b_audio() once per segment; without this cache every segment would
	# reload the whole checkpoint.
	_E4B_CACHE = {}


	def _load_e4b(model_id: str, device: str):
	    """Return the cached (processor, model) pair, loading it on first use."""
	    import torch
	    from transformers import AutoProcessor, AutoModelForCausalLM

	    key = (model_id, device)
	    if key not in _E4B_CACHE:
	        print(" Loading Gemma 4 E4B...")
	        t0 = time.time()
	        # NOTE(review): trust_remote_code=True executes code shipped in the Hub
	        # repo; kept from the original, confirm it is still required.
	        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
	        model = AutoModelForCausalLM.from_pretrained(
	            model_id,
	            torch_dtype=torch.bfloat16,
	            # "cuda" keeps the original automatic placement; any other value
	            # (e.g. "cpu", "cuda:1") pins the whole model to that device so
	            # the --device flag is actually honoured.
	            device_map="auto" if device == "cuda" else device,
	            trust_remote_code=True,
	        )
	        print(f" Model loaded in {time.time() - t0:.1f}s")
	        _E4B_CACHE[key] = (processor, model)
	    return _E4B_CACHE[key]


	def test_e4b_audio(audio_path: str, device: str = "cuda"):
	    """
	    Test Gemma 4 E4B native audio input via Transformers.

	    Args:
	        audio_path: Path of an audio file no longer than 30 seconds.
	        device: "cuda" for automatic placement, or an explicit device string
	            (e.g. "cpu") to load the model on.

	    Returns:
	        The transcription text, or None when the audio exceeds the 30-second
	        limit (split it with chunk_audio() first).
	    """
	    import librosa
	    import torch

	    print(f"\n=== Testing Gemma 4 E4B Audio (Transformers) ===")
	    print(f" Audio: {audio_path}")

	    # Check the duration before touching the model so an over-long file is
	    # rejected without paying for a model load.
	    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
	    duration = len(y) / sr
	    print(f" Audio duration: {duration:.1f}s ({int(duration * 25)} tokens)")

	    if duration > 30:
	        print(f" WARNING: Audio is {duration:.1f}s — exceeds 30s limit. Use chunk_audio() first.")
	        return None

	    processor, model = _load_e4b("google/gemma-4-E4B-it", device)

	    # Build message with audio. The audio entry is the file path, the form
	    # used in the Transformers multimodal chat-template examples, instead of
	    # a Python list holding every sample.
	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "audio", "audio": audio_path},
	                {"type": "text", "text": (
	                    "Transcribe the following Hindi/Hinglish speech exactly as spoken. "
	                    "Preserve Hindi words in Devanagari script. "
	                    "Include all medical terms and numbers precisely."
	                )},
	            ],
	        }
	    ]

	    print(" Running inference...")
	    t0 = time.time()
	    # return_dict=True yields the mapping that generate(**inputs) and
	    # inputs["input_ids"] below rely on. add_generation_prompt=True opens the
	    # assistant turn so the model answers rather than continuing the user text.
	    inputs = processor.apply_chat_template(
	        messages,
	        add_generation_prompt=True,
	        tokenize=True,
	        return_dict=True,
	        return_tensors="pt",
	    ).to(model.device, dtype=model.dtype)
	    prompt_len = inputs["input_ids"].shape[-1]

	    with torch.no_grad():
	        output_ids = model.generate(
	            **inputs,
	            max_new_tokens=512,
	            do_sample=False,
	        )

	    # Drop the prompt tokens; only the newly generated tail is the transcription.
	    response = processor.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
	    elapsed = time.time() - t0
	    print(f" Inference time: {elapsed:.1f}s")
	    print(f" Transcription:\n {response[:500]}")

	    return response


	def test_e4b_audio_chunked(audio_path: str, device: str = "cuda"):
	    """
	    Transcribe audio of any length by running test_e4b_audio() per chunk.

	    Audio of 30 seconds or less is passed straight to test_e4b_audio().
	    Longer audio is split with chunk_audio() into data/temp/chunks, each
	    chunk is transcribed, and the non-empty texts are joined with spaces.
	    The result is also written to data/temp/transcription_result.json.

	    Returns the joined transcription (or whatever test_e4b_audio() returns
	    for short audio).
	    """
	    print(f"\n=== Chunked Audio Processing ===")

	    import librosa
	    samples, rate = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
	    duration = len(samples) / rate

	    # Short recordings fit in a single model call.
	    if duration <= 30:
	        print(f" Audio is {duration:.1f}s — no chunking needed")
	        return test_e4b_audio(audio_path, device)

	    temp_dir = os.path.join("data", "temp")
	    segments = chunk_audio(audio_path, os.path.join(temp_dir, "chunks"))
	    total = len(segments)

	    transcriptions = []
	    for idx, seg in enumerate(segments):
	        begin, finish = seg["start_sec"], seg["end_sec"]
	        print(f"\n --- Chunk {idx+1}/{total} ({begin:.0f}s-{finish:.0f}s) ---")
	        text = test_e4b_audio(seg["path"], device)
	        if not text:
	            # Chunks with no transcription are left out of the result.
	            continue
	        transcriptions.append(
	            dict(chunk_index=idx, start_sec=begin, end_sec=finish, text=text)
	        )

	    # Plain concatenation: text from the overlapping seconds is not deduplicated.
	    full_text = " ".join(entry["text"] for entry in transcriptions)
	    print(f"\n === Full Transcription ({len(transcriptions)} chunks) ===")
	    print(f" {full_text[:1000]}")

	    # Persist the per-chunk and assembled transcription.
	    output_path = os.path.join(temp_dir, "transcription_result.json")
	    os.makedirs(temp_dir, exist_ok=True)
	    payload = {
	        "audio_path": audio_path,
	        "total_duration_sec": duration,
	        "num_chunks": total,
	        "transcriptions": transcriptions,
	        "full_text": full_text,
	    }
	    with open(output_path, "w", encoding="utf-8") as f:
	        json.dump(payload, f, ensure_ascii=False, indent=2)
	    print(f" Saved to {output_path}")

	    return full_text


	# ── Whisper Fallback Test ──────────────────────────────────────────────────

	def test_whisper_fallback(audio_path: str, device: str = "cuda"):
	    """
	    Test Whisper small/medium as Hindi ASR fallback.

	    If E4B's native Hindi ASR is insufficient, we use:
	    Whisper (Hindi ASR) → text → Gemma 4 E4B (extraction)
	    This is two models but still better than v1's three-model chain.

	    Models are tried in order (small, then medium). The first one that
	    transcribes without raising is used, so medium only runs when small
	    fails.

	    Args:
	        audio_path: Path of the audio file to transcribe.
	        device: Device passed to the ASR pipeline (e.g. "cuda" or "cpu").

	    Returns:
	        The transcription text from the first model that succeeds, or None
	        when every model fails.
	    """
	    import torch
	    from transformers import pipeline

	    print(f"\n=== Whisper Fallback Test (Hindi) ===")
	    print(f" Audio: {audio_path}")

	    # Half precision only on CUDA. float16 on CPU can be very slow or hit
	    # unimplemented kernels depending on the PyTorch build, so --device cpu
	    # uses float32.
	    dtype = torch.float16 if str(device).startswith("cuda") else torch.float32

	    # Try whisper-small first (lighter), upgrade to medium if needed
	    for model_id in ("openai/whisper-small", "openai/whisper-medium"):
	        print(f"\n Testing {model_id}...")
	        t0 = time.time()
	        try:
	            pipe = pipeline(
	                "automatic-speech-recognition",
	                model=model_id,
	                device=device,
	                torch_dtype=dtype,
	            )
	            result = pipe(
	                audio_path,
	                generate_kwargs={"language": "hindi", "task": "transcribe"},
	                chunk_length_s=30,
	                batch_size=8,
	                return_timestamps=True,
	            )
	        except Exception as e:
	            # Deliberately broad: this is a best-effort probe, so any load or
	            # inference failure is reported and the next model is tried.
	            print(f" Failed: {e}")
	            continue

	        elapsed = time.time() - t0
	        print(f" Time: {elapsed:.1f}s")
	        print(f" Transcription:\n {result['text'][:500]}")

	        if result.get("chunks"):
	            print(f" Timestamps: {len(result['chunks'])} segments")

	        return result["text"]

	    print(" All Whisper models failed.")
	    return None


	# ── Test Audio Generation ──────────────────────────────────────────────────

	def generate_test_audio():
	    """
	    Write a silent 30-second placeholder WAV and print how to get real audio.

	    The file is created at data/raw/test_hindi_30s.wav (SAMPLE_RATE, float32
	    zeros). It is only a structural test input, so the function also prints
	    suggestions for obtaining real Hindi speech and a few sample sentences.

	    Returns the path of the placeholder file.
	    """
	    import numpy as np
	    import soundfile as sf

	    os.makedirs("data/raw", exist_ok=True)
	    test_path = "data/raw/test_hindi_30s.wav"

	    # 30 seconds of zeros: exercises the pipeline without any speech content.
	    sf.write(test_path, np.zeros(SAMPLE_RATE * 30, dtype=np.float32), SAMPLE_RATE)
	    print(f" Created placeholder: {test_path} (30s silent)")

	    # An empty entry prints a blank line, like a bare print().
	    guidance = (
	        "",
	        " To test with real Hindi audio, you need one of:",
	        " 1. Record a Hindi conversation sample (phone/mic)",
	        " 2. Use Google TTS: gtts-cli 'नमस्ते, मेरा नाम सुनीता है' --lang hi -o test.mp3",
	        " 3. Download from Common Voice Hindi dataset",
	        " 4. Use a sample from Mozilla Common Voice (hindi split)",
	        "",
	        " Recommended test sentences (ASHA visit context):",
	        ' - "दीदी, मुझे सिर में बहुत दर्द हो रहा है और आँखों के सामने धुंधला दिख रहा है"',
	        ' - "बच्चे का वज़न 2.1 किलो है, दूध ठीक से नहीं पी रहा"',
	        ' - "पिछली बार बी.पी. 140/90 आया था, अभी भी पैर सूजे हुए हैं"',
	    )
	    for line in guidance:
	        print(line)

	    return test_path


	# ── Main ───────────────────────────────────────────────────────────────────

	def main(argv=None):
	    """
	    Command-line entry point for the Hindi audio tests.

	    Modes, checked in this order:
	      --generate-test     write the placeholder WAV and exit
	      --chunk-only        split --audio into chunks and list them
	      --whisper-fallback  transcribe --audio with Whisper
	      (default)           transcribe --audio with Gemma 4 E4B, chunking
	                          automatically when it is longer than 30 seconds

	    Args:
	        argv: Argument list to parse; None (the default) reads sys.argv.

	    Exits with status 1 and a message on stderr when --audio is missing or
	    does not point to an existing file.
	    """
	    parser = argparse.ArgumentParser(description="MedScribe v2 — Hindi Audio Test")
	    parser.add_argument("--audio", type=str, help="Path to Hindi audio file")
	    parser.add_argument("--generate-test", action="store_true", help="Generate test audio placeholder")
	    parser.add_argument("--whisper-fallback", action="store_true", help="Test Whisper as backup ASR")
	    parser.add_argument("--chunk-only", action="store_true", help="Only test audio chunking")
	    parser.add_argument("--device", type=str, default="cuda", help="Device (cuda/cpu)")
	    args = parser.parse_args(argv)

	    if args.generate_test:
	        generate_test_audio()
	        return

	    # sys.exit(<str>) writes the message to stderr and exits with status 1,
	    # keeping errors out of stdout where the transcription output goes.
	    if not args.audio:
	        sys.exit("Error: provide --audio <path> or --generate-test")

	    # isfile rather than exists: a directory is not a usable audio input.
	    if not os.path.isfile(args.audio):
	        sys.exit(f"Error: file not found: {args.audio}")

	    if args.chunk_only:
	        for c in chunk_audio(args.audio):
	            print(f" Chunk: {c['start_sec']:.0f}s-{c['end_sec']:.0f}s → {c['path']}")
	        return

	    if args.whisper_fallback:
	        test_whisper_fallback(args.audio, args.device)
	        return

	    # Try E4B native audio first
	    import librosa
	    y, sr = librosa.load(args.audio, sr=SAMPLE_RATE, mono=True)
	    duration = len(y) / sr

	    if duration > 30:
	        test_e4b_audio_chunked(args.audio, args.device)
	    else:
	        test_e4b_audio(args.audio, args.device)


	if __name__ == "__main__":
	main()