"""
MedScribe v2 — Hindi Audio Input Test (Gate 1)
Tests Gemma 4 E4B's native audio input with Hindi speech.
CRITICAL CONSTRAINT: E4B has a 30-second audio limit (750 tokens at 25 tok/sec).
ASHA conversations are 10-15 minutes. This script tests:
1. Single 30-sec chunk processing
2. Audio chunking strategy for long conversations
3. Hindi ASR quality baseline
4. Whisper fallback if E4B Hindi ASR is insufficient
Usage:
python scripts/01_test_audio_hindi.py --audio <path.wav>
python scripts/01_test_audio_hindi.py --generate-test # generate synthetic test audio
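python scripts/01_test_audio_hindi.py --audio <path.wav> --whisper-fallback # test Whisper as ASR backup
python scripts/01_test_audio_hindi.py --audio <path.wav> --chunk-only # only split the audio into chunks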
"""
import argparse
import json
import os
import sys
import time
from pathlib import Path
# ── Audio Chunking ──────────────────────────────────────────────────────────
# Maximum length, in seconds, of one chunk written by chunk_audio(). Kept
# 2 s below the 30 s E4B audio limit described in the module docstring.
CHUNK_DURATION_SEC = 28 # 2-sec margin under 30-sec limit
# Seconds shared by consecutive chunks: speech cut at the end of one chunk is
# repeated at the start of the next.
OVERLAP_SEC = 2 # overlap to avoid cutting mid-word
# Sample rate, in Hz, that every librosa.load() call in this file resamples to.
SAMPLE_RATE = 16000
def chunk_audio(audio_path: str, chunk_dir: str = None) -> list[dict]:
    """
    Split an audio file into overlapping WAV chunks short enough for E4B.

    The file is decoded to mono at SAMPLE_RATE and cut into windows of at most
    CHUNK_DURATION_SEC seconds. Consecutive windows start
    CHUNK_DURATION_SEC - OVERLAP_SEC seconds apart, so each window repeats the
    last OVERLAP_SEC seconds of the previous one. Chunking stops as soon as a
    window reaches the end of the audio, so no chunk is ever fully contained
    in the one before it.

    Args:
        audio_path: Path to the input audio file.
        chunk_dir: Directory the chunk files are written to. Defaults to a
            "chunks" folder next to the input file. Created if missing.

    Returns:
        One dict per chunk, in playback order, with the keys "path",
        "start_sec", "end_sec" and "duration_sec". Empty when the file
        contains no samples.

    Raises:
        ValueError: If OVERLAP_SEC is not smaller than CHUNK_DURATION_SEC.
    """
    import librosa
    import soundfile as sf

    step = CHUNK_DURATION_SEC - OVERLAP_SEC
    if step <= 0:
        # A non-positive step would never advance and loop forever.
        raise ValueError("OVERLAP_SEC must be smaller than CHUNK_DURATION_SEC")

    samples, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    total_duration = len(samples) / sr
    print(f" Audio loaded: {total_duration:.1f}s, {sr}Hz, mono")

    if chunk_dir is None:
        chunk_dir = os.path.join(os.path.dirname(audio_path), "chunks")
    os.makedirs(chunk_dir, exist_ok=True)

    chunks = []
    start = 0
    while start < total_duration:
        end = min(start + CHUNK_DURATION_SEC, total_duration)
        segment = samples[int(start * sr):int(end * sr)]
        chunk_path = os.path.join(chunk_dir, f"chunk_{len(chunks):03d}.wav")
        sf.write(chunk_path, segment, sr)
        chunks.append({
            "path": chunk_path,
            "start_sec": start,
            "end_sec": end,
            "duration_sec": end - start,
        })
        if end >= total_duration:
            # This window already covers the tail; stepping again would only
            # emit a short chunk that duplicates audio we just wrote.
            break
        start += step

    print(f" Split into {len(chunks)} chunks ({CHUNK_DURATION_SEC}s each, {OVERLAP_SEC}s overlap)")
    return chunks
# ── Gemma 4 E4B Audio Processing ───────────────────────────────────────────
_E4B_MODEL_ID = "google/gemma-4-E4B-it"
# (processor, model) pairs keyed by device map, so that chunked runs load the
# weights once instead of once per chunk.
_E4B_CACHE = {}


def _load_e4b(device_map: str):
    """Return the cached (processor, model) pair for device_map, loading it on first use."""
    import torch
    from transformers import AutoProcessor, AutoModelForCausalLM

    if device_map not in _E4B_CACHE:
        print(" Loading Gemma 4 E4B...")
        t0 = time.time()
        # NOTE: trust_remote_code lets the hub repository run its own Python
        # at load time — only point _E4B_MODEL_ID at a repository you trust.
        processor = AutoProcessor.from_pretrained(_E4B_MODEL_ID, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            _E4B_MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map=device_map,
            trust_remote_code=True,
        )
        print(f" Model loaded in {time.time() - t0:.1f}s")
        _E4B_CACHE[device_map] = (processor, model)
    return _E4B_CACHE[device_map]


def test_e4b_audio(audio_path: str, device: str = "cuda"):
    """
    Transcribe one short Hindi/Hinglish clip with Gemma 4 E4B via Transformers.

    The clip is decoded to mono at SAMPLE_RATE and rejected when longer than
    30 seconds. The duration check runs before the model is loaded, so an
    over-long clip costs no model load. The model and processor are loaded on
    first use and reused by later calls.

    Args:
        audio_path: Path to the audio clip.
        device: "cpu" loads the model on the CPU; any other value lets
            device_map="auto" place it. Inputs are always moved to wherever
            the model ended up, so the two cannot end up on different devices.

    Returns:
        The decoded model response (the newly generated tokens only), or None
        when the clip exceeds 30 seconds.
    """
    import torch
    import librosa

    print(f"\n=== Testing Gemma 4 E4B Audio (Transformers) ===")
    print(f" Audio: {audio_path}")

    # Load audio
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    duration = len(y) / sr
    # Token estimate uses the 25 tokens/second rate from the module docstring.
    print(f" Audio duration: {duration:.1f}s ({int(duration * 25)} tokens)")
    if duration > 30:
        print(f" WARNING: Audio is {duration:.1f}s — exceeds 30s limit. Use chunk_audio() first.")
        return None

    # Load model (cached after the first call)
    processor, model = _load_e4b("cpu" if device == "cpu" else "auto")

    # Build message with audio. The waveform is passed as the numpy array
    # librosa returned rather than as a Python list of floats.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": y},
                {"type": "text", "text": (
                    "Transcribe the following Hindi/Hinglish speech exactly as spoken. "
                    "Preserve Hindi words in Devanagari script. "
                    "Include all medical terms and numbers precisely."
                )},
            ],
        }
    ]

    # Process
    print(" Running inference...")
    t0 = time.time()
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,  # end the prompt at the start of the model's turn
        tokenize=True,
        return_dict=True,  # needed for **inputs and inputs["input_ids"] below
        return_tensors="pt",
    ).to(model.device, dtype=model.dtype)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
        )

    # Decode only what was generated after the prompt.
    prompt_len = inputs["input_ids"].shape[-1]
    response = processor.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
    elapsed = time.time() - t0
    print(f" Inference time: {elapsed:.1f}s")
    print(f" Transcription:\n {response[:500]}")
    return response
def test_e4b_audio_chunked(audio_path: str, device: str = "cuda"):
    """
    Transcribe audio of any length by running E4B over chunk_audio() chunks.

    Audio of 30 seconds or less is handed straight to test_e4b_audio().
    Longer audio is split under data/temp/chunks, each chunk is transcribed in
    turn, and the non-empty results are joined with single spaces (text from
    the overlapping seconds is not de-duplicated). The per-chunk results and
    the joined text are also written to data/temp/transcription_result.json.

    Returns the joined transcription, or test_e4b_audio()'s result for short
    audio.
    """
    print(f"\n=== Chunked Audio Processing ===")
    import librosa

    waveform, rate = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    duration = len(waveform) / rate

    # Short clips fit in a single model call.
    if duration <= 30:
        print(f" Audio is {duration:.1f}s — no chunking needed")
        return test_e4b_audio(audio_path, device)

    chunks = chunk_audio(audio_path, os.path.join("data", "temp", "chunks"))
    total = len(chunks)

    transcriptions = []
    for index, piece in enumerate(chunks):
        begin, finish = piece["start_sec"], piece["end_sec"]
        print(f"\n --- Chunk {index+1}/{total} ({begin:.0f}s-{finish:.0f}s) ---")
        text = test_e4b_audio(piece["path"], device)
        if not text:
            # Chunks that produced nothing are left out of the result.
            continue
        transcriptions.append({
            "chunk_index": index,
            "start_sec": begin,
            "end_sec": finish,
            "text": text,
        })

    # Plain concatenation of the chunk texts, in order.
    full_text = " ".join(entry["text"] for entry in transcriptions)
    print(f"\n === Full Transcription ({len(transcriptions)} chunks) ===")
    print(f" {full_text[:1000]}")

    # Persist the run so it can be inspected afterwards.
    output_path = os.path.join("data", "temp", "transcription_result.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    report = {
        "audio_path": audio_path,
        "total_duration_sec": duration,
        "num_chunks": total,
        "transcriptions": transcriptions,
        "full_text": full_text,
    }
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(report, handle, ensure_ascii=False, indent=2)
    print(f" Saved to {output_path}")
    return full_text
# ── Whisper Fallback Test ──────────────────────────────────────────────────
def test_whisper_fallback(audio_path: str, device: str = "cuda"):
    """
    Test Whisper small/medium as Hindi ASR fallback.

    If E4B's native Hindi ASR is insufficient, we use:
    Whisper (Hindi ASR) → text → Gemma 4 E4B (extraction)
    This is two models but still better than v1's three-model chain.

    whisper-small is tried first; whisper-medium is tried only when loading
    or running whisper-small raises.

    Args:
        audio_path: Path to the audio file to transcribe.
        device: Device handed to the Transformers pipeline. On "cpu" (or -1)
            the model runs in float32; on any other device it runs in float16.

    Returns:
        The transcription text of the first model that succeeds, or None when
        every model fails.
    """
    import torch
    from transformers import pipeline

    print(f"\n=== Whisper Fallback Test (Hindi) ===")
    print(f" Audio: {audio_path}")

    # float16 is a GPU optimisation. On CPU, half-precision kernels may be
    # missing or very slow, so use float32 there.
    on_cpu = str(device) in ("cpu", "-1")
    dtype = torch.float32 if on_cpu else torch.float16

    # Try whisper-small first (lighter), upgrade to medium if needed
    for model_id in ("openai/whisper-small", "openai/whisper-medium"):
        print(f"\n Testing {model_id}...")
        t0 = time.time()
        try:
            pipe = pipeline(
                "automatic-speech-recognition",
                model=model_id,
                device=device,
                torch_dtype=dtype,
            )
            result = pipe(
                audio_path,
                generate_kwargs={"language": "hindi", "task": "transcribe"},
                chunk_length_s=30,
                batch_size=8,
                return_timestamps=True,
            )
        except Exception as e:
            # Deliberately broad: any load or inference failure (download,
            # out-of-memory, unsupported op) should fall through to the next
            # model instead of aborting the test.
            print(f" Failed: {e}")
            continue

        elapsed = time.time() - t0
        print(f" Time: {elapsed:.1f}s")
        print(f" Transcription:\n {result['text'][:500]}")
        if result.get("chunks"):
            print(f" Timestamps: {len(result['chunks'])} segments")
        return result["text"]

    print(" All Whisper models failed.")
    return None
# ── Test Audio Generation ──────────────────────────────────────────────────
def generate_test_audio():
    """
    Write a silent placeholder WAV and print how to get real Hindi audio.

    Creates data/raw/test_hindi_30s.wav containing 30 seconds of silence at
    SAMPLE_RATE, then prints suggestions for obtaining real Hindi recordings
    together with a few ASHA-visit test sentences.

    Returns the path of the placeholder file.
    """
    import numpy as np
    import soundfile as sf

    os.makedirs("data/raw", exist_ok=True)
    test_path = "data/raw/test_hindi_30s.wav"

    # 30 seconds of zeros: exercises the pipeline structure, not ASR quality.
    placeholder = np.zeros(30 * SAMPLE_RATE, dtype=np.float32)
    sf.write(test_path, placeholder, SAMPLE_RATE)
    print(f" Created placeholder: {test_path} (30s silent)")

    # An empty entry prints as a blank line.
    guidance = (
        "",
        " To test with real Hindi audio, you need one of:",
        " 1. Record a Hindi conversation sample (phone/mic)",
        " 2. Use Google TTS: gtts-cli 'नमस्ते, मेरा नाम सुनीता है' --lang hi -o test.mp3",
        " 3. Download from Common Voice Hindi dataset",
        " 4. Use a sample from Mozilla Common Voice (hindi split)",
        "",
        " Recommended test sentences (ASHA visit context):",
        ' - "दीदी, मुझे सिर में बहुत दर्द हो रहा है और आँखों के सामने धुंधला दिख रहा है"',
        ' - "बच्चे का वज़न 2.1 किलो है, दूध ठीक से नहीं पी रहा"',
        ' - "पिछली बार बी.पी. 140/90 आया था, अभी भी पैर सूजे हुए हैं"',
    )
    for line in guidance:
        print(line)
    return test_path
# ── Main ───────────────────────────────────────────────────────────────────
def main():
    """
    Command-line entry point for the Hindi audio tests.

    --generate-test writes the placeholder audio and exits. Every other mode
    needs --audio pointing at an existing file, otherwise an error is printed
    and the process exits with status 1. --chunk-only just splits the file and
    lists the chunks; --whisper-fallback runs the Whisper test; with neither
    flag the E4B test runs, chunked when the audio is longer than 30 seconds.
    """
    cli = argparse.ArgumentParser(description="MedScribe v2 — Hindi Audio Test")
    cli.add_argument("--audio", type=str, help="Path to Hindi audio file")
    cli.add_argument("--generate-test", action="store_true", help="Generate test audio placeholder")
    cli.add_argument("--whisper-fallback", action="store_true", help="Test Whisper as backup ASR")
    cli.add_argument("--chunk-only", action="store_true", help="Only test audio chunking")
    cli.add_argument("--device", type=str, default="cuda", help="Device (cuda/cpu)")
    opts = cli.parse_args()

    if opts.generate_test:
        generate_test_audio()
        return

    # Validate the input path before doing any heavy work.
    if not opts.audio:
        failure = "Error: provide --audio <path> or --generate-test"
    elif not os.path.exists(opts.audio):
        failure = f"Error: file not found: {opts.audio}"
    else:
        failure = None
    if failure is not None:
        print(failure)
        sys.exit(1)

    if opts.chunk_only:
        for piece in chunk_audio(opts.audio):
            print(f" Chunk: {piece['start_sec']:.0f}s-{piece['end_sec']:.0f}s → {piece['path']}")
        return

    if opts.whisper_fallback:
        test_whisper_fallback(opts.audio, opts.device)
        return

    # Default mode: E4B native audio, chunked only when the clip is too long.
    import librosa

    waveform, rate = librosa.load(opts.audio, sr=SAMPLE_RATE, mono=True)
    duration = len(waveform) / rate
    runner = test_e4b_audio_chunked if duration > 30 else test_e4b_audio
    runner(opts.audio, opts.device)


if __name__ == "__main__":
    main()
|