# Jambonz_impl / aqib-nemo-asr.py
# Uploaded by alaatiger989 via huggingface_hub (commit 04c4cd1, verified)
import asyncio
import websockets
import json
import threading
import numpy as np
import logging
import time
import tempfile
import os
import re
from concurrent.futures import ThreadPoolExecutor
import nemo.collections.asr as nemo_asr
import soundfile as sf
# Set up logging
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module, per stdlib convention.
logger = logging.getLogger(__name__)
# ===== Arabic number mapping (expanded) =====
# Maps spoken Arabic number words (MSA and Egyptian colloquial variants)
# and Eastern Arabic digit strings to Western digit strings.
arabic_numbers = {
    # Basic digits
    "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0",
    "واحد": "1", "واحدة": "1", "١": "1",
    "اتنين": "2", "اثنين": "2", "إثنين": "2", "اثنان": "2", "إثنان": "2", "٢": "2",
    "تلاتة": "3", "ثلاثة": "3", "٣": "3",
    "اربعة": "4", "أربعة": "4", "٤": "4",
    "خمسة": "5", "٥": "5",
    "ستة": "6", "٦": "6",
    "سبعة": "7", "٧": "7",
    "تمانية": "8", "ثمانية": "8", "٨": "8",
    "تسعة": "9", "٩": "9",
    # Teens
    "عشرة": "10", "١٠": "10",
    "حداشر": "11", "احد عشر": "11", "احداشر": "11",
    "اتناشر": "12", "اثنا عشر": "12",
    "تلتاشر": "13", "ثلاثة عشر": "13",
    "اربعتاشر": "14", "أربعة عشر": "14",
    "خمستاشر": "15", "خمسة عشر": "15",
    "ستاشر": "16", "ستة عشر": "16",
    "سبعتاشر": "17", "سبعة عشر": "17",
    "طمنتاشر": "18", "ثمانية عشر": "18",
    "تسعتاشر": "19", "تسعة عشر": "19",
    # Tens
    "عشرين": "20", "٢٠": "20",
    "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
    "اربعين": "40", "أربعين": "40", "٤٠": "40",
    "خمسين": "50", "٥٠": "50",
    "ستين": "60", "٦٠": "60",
    "سبعين": "70", "٧٠": "70",
    "تمانين": "80", "ثمانين": "80", "٨٠": "80", "تمانون": "80", "ثمانون": "80",
    "تسعين": "90", "٩٠": "90",
    # Hundreds
    "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
    "ميتين": "200", "مائتين": "200",
    "تلاتمية": "300", "ثلاثمائة": "300",
    "اربعمية": "400", "أربعمائة": "400",
    "خمسمية": "500", "خمسمائة": "500",
    "ستمية": "600", "ستمائة": "600",
    "سبعمية": "700", "سبعمائة": "700",
    "تمانمية": "800", "ثمانمائة": "800",
    "تسعمية": "900", "تسعمائة": "900",
    # Thousands
    "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
    "ألفين": "2000", "الفين": "2000",
    "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
    "اربعة آلاف": "4000", "أربعة آلاف": "4000",
    "خمسة آلاف": "5000",
    "ستة آلاف": "6000",
    "سبعة آلاف": "7000",
    "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
    "تسعة آلاف": "9000",
    # Large numbers
    "عشرة آلاف": "10000",
    "مية ألف": "100000", "مائة ألف": "100000",
    "مليون": "1000000", "١٠٠٠٠٠٠": "1000000",
    "ملايين": "1000000",
    "مليار": "1000000000", "١٠٠٠٠٠٠٠٠٠": "1000000000"
}


def replace_arabic_numbers(text: str) -> str:
    """Replace Arabic number words/numerals in *text* with Western digits.

    Substitutions are applied longest-key-first so that multi-word entries
    such as "عشرة آلاف" (10000) or "خمسة عشر" (15) are matched before their
    shorter prefixes ("عشرة" -> 10, "خمسة" -> 5).  The previous
    insertion-order iteration produced partial replacements like "10 آلاف".
    Keys are escaped so they are treated literally by the regex engine.
    """
    for word in sorted(arabic_numbers, key=len, reverse=True):
        text = re.sub(rf"\b{re.escape(word)}\b", arabic_numbers[word], text)
    return text
# Global NeMo model
# Populated by initialize_nemo_model() at import time; None until then.
asr_model = None
def initialize_nemo_model():
    """Initialize NeMo FastConformer model.

    Loads the Arabic FastConformer checkpoint from the NEMO_MODEL_PATH
    environment variable (placeholder default path) into the module-level
    ``asr_model`` global and switches it to eval mode for inference.

    Raises:
        FileNotFoundError: if the .nemo checkpoint is not on disk.
        Exception: re-raised if NeMo fails to restore the model.
    """
    global asr_model
    logger.info("Loading NeMo FastConformer Arabic ASR model...")
    # Model path - adjust this to your model location
    model_path = os.getenv(
        "NEMO_MODEL_PATH",
        "/path/to/stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"  # Update this path
    )
    if not os.path.exists(model_path):
        logger.error(f"Model not found at: {model_path}")
        logger.info("Please download the model from: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_ar_fastconformer_hybrid_large_pcd")
        raise FileNotFoundError(f"NeMo model not found: {model_path}")
    try:
        # NOTE(review): the checkpoint name says "hybrid"; confirm that
        # EncDecCTCModel is the correct restore class for this .nemo file.
        asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
        logger.info("NeMo FastConformer model loaded successfully")
        # Set model to eval mode for inference
        asr_model.eval()
    except Exception as e:
        logger.error(f"Failed to load NeMo model: {e}")
        raise
# Initialize model on startup
# NOTE: runs at import time, so importing this module fails fast if the
# checkpoint is missing.
initialize_nemo_model()
# Thread pool for processing
# Runs blocking NeMo transcription calls off the asyncio event loop.
executor = ThreadPoolExecutor(max_workers=4)
class JambonzAudioBuffer:
    """Thread-safe audio accumulator for a jambonz STT session.

    Maintains two float32 arrays under one lock:

    * ``buffer`` — gating buffer; once it holds ``chunk_samples`` samples a
      processing pass is due.
    * ``total_audio`` — all audio of the current segment, re-transcribed as
      a whole for best context.
    """

    def __init__(self, sample_rate=8000, chunk_duration=1.0):
        self.sample_rate = sample_rate
        self.chunk_duration = chunk_duration
        # Samples constituting one processing chunk.
        self.chunk_samples = int(chunk_duration * sample_rate)
        self.buffer = np.array([], dtype=np.float32)
        self.lock = threading.Lock()
        self.total_audio = np.array([], dtype=np.float32)
        # Voice Activity Detection: mean-|amplitude| energy threshold and the
        # minimum length (0.5 s) a chunk must have to count as speech at all.
        self.silence_threshold = 0.05
        self.min_speech_samples = int(0.5 * sample_rate)

    def add_audio(self, audio_data):
        """Append new float32 samples to both the gating and segment buffers."""
        with self.lock:
            self.buffer = np.concatenate([self.buffer, audio_data])
            self.total_audio = np.concatenate([self.total_audio, audio_data])

    def has_chunk_ready(self):
        """Return True when a full unprocessed chunk is buffered."""
        with self.lock:
            return len(self.buffer) >= self.chunk_samples

    def is_speech(self, audio_chunk):
        """Simple VAD based on mean absolute energy."""
        if len(audio_chunk) < self.min_speech_samples:
            return False
        energy = np.mean(np.abs(audio_chunk))
        return energy > self.silence_threshold

    def get_chunk_for_processing(self):
        """Consume one chunk from the gating buffer.

        Returns a non-None signal when a full chunk was available, else None.
        Fix: the consumed samples are now drained from ``buffer`` so each
        chunk of new audio triggers exactly one transcription pass.
        Previously the buffer was never drained, so ``has_chunk_ready()``
        stayed True forever and the handler re-ran ASR on every 100 ms poll.
        """
        with self.lock:
            if len(self.buffer) < self.chunk_samples:
                return None
            # Drain one chunk's worth of samples; total_audio is untouched.
            self.buffer = self.buffer[self.chunk_samples:]
            return np.array([1])  # Signal that chunk is ready

    def get_all_audio(self):
        """Return a copy of all audio accumulated for the current segment."""
        with self.lock:
            return self.total_audio.copy()

    def clear(self):
        """Drop all buffered audio."""
        with self.lock:
            self.buffer = np.array([], dtype=np.float32)
            self.total_audio = np.array([], dtype=np.float32)

    def reset_for_new_segment(self):
        """Reset buffers for a new transcription segment."""
        with self.lock:
            self.buffer = np.array([], dtype=np.float32)
            self.total_audio = np.array([], dtype=np.float32)
def linear16_to_audio(audio_bytes, sample_rate=8000):
    """Decode LINEAR16 PCM bytes into a float32 numpy array in [-1.0, 1.0)."""
    try:
        samples = np.frombuffer(audio_bytes, dtype=np.int16)
        # Scale 16-bit signed integers down to the float range.
        return samples.astype(np.float32) / 32768.0
    except Exception as e:
        logger.error(f"Error converting LINEAR16 to audio: {e}")
        return np.array([], dtype=np.float32)
def resample_audio(audio_data, source_rate, target_rate):
    """Resample *audio_data* from source_rate to target_rate.

    Identity when the rates already match; cheap sample-doubling for the
    common 8 kHz -> 16 kHz path; linear interpolation for everything else.
    """
    if source_rate == target_rate:
        return audio_data
    if (source_rate, target_rate) == (8000, 16000):
        # Fast 2x upsampling: duplicate every sample.
        return np.repeat(audio_data, 2).astype(np.float32)
    # General case: evaluate the signal at the new sample positions via
    # linear interpolation.  Step between positions is source/target.
    step = source_rate / target_rate
    positions = np.arange(0, len(audio_data), step)
    positions = positions[positions < len(audio_data)]
    interpolated = np.interp(positions, np.arange(len(audio_data)), audio_data)
    return interpolated.astype(np.float32)
def transcribe_with_nemo(audio_data, source_sample_rate=8000, target_sample_rate=16000):
    """Transcribe audio using NeMo FastConformer.

    Args:
        audio_data: mono float32 samples at ``source_sample_rate``.
        source_sample_rate: rate of the incoming audio (jambonz default 8 kHz).
        target_sample_rate: rate the model expects (default 16 kHz).

    Returns:
        The cleaned transcription string (Arabic number words converted to
        digits), or "" for empty/too-short audio, empty results, or errors.
    """
    try:
        if len(audio_data) == 0:
            return ""
        # Resample to the model's expected rate.
        # Fix: previously hard-coded 8000 -> 16000 here, silently ignoring
        # both rate parameters; now honours the caller-supplied rates.
        resampled_audio = resample_audio(audio_data, source_sample_rate, target_sample_rate)
        # Skip very short audio (< 0.3 s at the target rate).
        min_samples = int(0.3 * target_sample_rate)
        if len(resampled_audio) < min_samples:
            return ""
        start_time = time.time()
        # Save audio to temporary file (NeMo expects file path)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            # Write audio as WAV file
            sf.write(tmp_file.name, resampled_audio, target_sample_rate)
            tmp_path = tmp_file.name
        try:
            # Transcribe with NeMo
            result = asr_model.transcribe([tmp_path])
            # Debug logging to understand result format
            logger.info(f"NeMo result type: {type(result)}")
            if result and len(result) > 0:
                logger.info(f"First result type: {type(result[0])}")
                logger.info(f"First result content: {result[0]}")
            if result and len(result) > 0:
                # Handle different NeMo result formats across versions.
                if hasattr(result[0], 'text'):
                    # If result has .text attribute (newer NeMo versions)
                    raw_text = result[0].text
                    logger.info(f"Using .text attribute: {raw_text}")
                elif isinstance(result[0], str):
                    # If result is directly a string
                    raw_text = result[0]
                    logger.info(f"Using direct string: {raw_text}")
                else:
                    # If result is some other format, convert to string
                    raw_text = str(result[0])
                    logger.info(f"Using str() conversion: {raw_text}")
                # Ensure raw_text is a string before processing
                if not isinstance(raw_text, str):
                    raw_text = str(raw_text)
                # Only process if we have actual text content
                if raw_text and raw_text.strip():
                    # Convert Arabic numbers to digits
                    logger.info(f"before sending to FXN--- {raw_text}")
                    cleaned_text = replace_arabic_numbers(raw_text)
                    logger.info(f"after FXN--- {cleaned_text}")
                    end_time = time.time()
                    if cleaned_text.strip():
                        logger.info(f"NeMo transcription: '{cleaned_text}' (processed in {end_time - start_time:.2f}s)")
                        return cleaned_text.strip()
                # Fix: empty/whitespace-only text previously fell through and
                # returned None implicitly; callers expect a string.
                logger.info("No transcription text found")
                return ""
            else:
                logger.info("No results from NeMo transcription")
                return ""
        finally:
            # Clean up temporary file
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    except Exception as e:
        logger.error(f"Error during NeMo transcription: {e}")
        return ""
class JambonzSTTHandler:
    """Per-connection jambonz STT session.

    Buffers incoming LINEAR16 audio, periodically re-transcribes the whole
    accumulated segment with NeMo (run in the module thread pool), tracks
    interim results internally, and sends a single final transcription per
    segment back to jambonz — either auto-finalized after a silence timeout
    or flushed when the session stops.
    """

    def __init__(self, websocket):
        self.websocket = websocket
        self.audio_buffer = None  # JambonzAudioBuffer, created on "start"
        self.config = {}
        self.running = False
        self.transcription_task = None
        # Auto-final detection variables
        self.interim_count = 0         # distinct interim updates this segment
        self.last_interim_time = None  # wall-clock time of latest interim
        self.silence_timeout = 2.0     # seconds without a new interim => final
        self.min_interim_count = 2     # interims required before auto-final
        self.auto_final_task = None
        self.accumulated_transcript = ""
        self.final_sent = False
        self.segment_number = 0
        self.last_partial = ""
        # Processing tracking
        self.processing_count = 0

    async def start_processing(self, start_message):
        """Initialize with start message from jambonz.

        Builds the session config, creates the audio buffer, resets all
        segment state, and launches the two background tasks (chunk
        processing and auto-final monitoring).
        """
        self.config = {
            "language": start_message.get("language", "ar-EG"),
            "format": start_message.get("format", "raw"),
            "encoding": start_message.get("encoding", "LINEAR16"),
            "sample_rate": start_message.get("sampleRateHz", 8000),
            "interim_results": True,  # Always enable for internal processing
            "options": start_message.get("options", {})
        }
        logger.info(f"NeMo STT session started with config: {self.config}")
        # Initialize audio buffer
        self.audio_buffer = JambonzAudioBuffer(
            sample_rate=self.config["sample_rate"],
            chunk_duration=1.0  # 1 second chunks for NeMo
        )
        # Reset session variables
        self.running = True
        self.interim_count = 0
        self.last_interim_time = None
        self.accumulated_transcript = ""
        self.final_sent = False
        self.segment_number = 0
        self.processing_count = 0
        self.last_partial = ""
        # Start background transcription task
        self.transcription_task = asyncio.create_task(self._process_audio_chunks())
        # Start auto-final detection task
        self.auto_final_task = asyncio.create_task(self._monitor_for_auto_final())

    async def stop_processing(self):
        """Stop current processing session.

        Cancels background tasks, sends any pending final transcription,
        optionally runs one last full-audio transcription, and clears the
        buffer.
        """
        logger.info("Stopping NeMo STT session...")
        self.running = False
        # Cancel background tasks and wait for them to acknowledge.
        for task in [self.transcription_task, self.auto_final_task]:
            if task:
                task.cancel()
                try:
                    await task
                except asyncio.CancelledError:
                    pass
        # Send final transcription if not already sent
        if not self.final_sent and self.accumulated_transcript.strip():
            await self.send_transcription(self.accumulated_transcript, is_final=True)
        # Process any remaining audio for comprehensive final transcription
        # (only reached if no accumulated transcript was flushed above,
        # since send_transcription sets final_sent).
        if self.audio_buffer:
            all_audio = self.audio_buffer.get_all_audio()
            if len(all_audio) > 0 and not self.final_sent:
                loop = asyncio.get_event_loop()
                final_transcription = await loop.run_in_executor(
                    executor,
                    transcribe_with_nemo,
                    all_audio,
                    self.config["sample_rate"]
                )
                if final_transcription.strip():
                    await self.send_transcription(final_transcription, is_final=True)
        # Clear audio buffer
        if self.audio_buffer:
            self.audio_buffer.clear()
        logger.info("NeMo STT session stopped")

    async def start_new_segment(self):
        """Start a new transcription segment: reset counters and buffers."""
        self.segment_number += 1
        self.interim_count = 0
        self.last_interim_time = None
        self.accumulated_transcript = ""
        self.final_sent = False
        self.last_partial = ""
        self.processing_count = 0
        if self.audio_buffer:
            self.audio_buffer.reset_for_new_segment()
        logger.info(f"Started new transcription segment #{self.segment_number}")

    async def add_audio_data(self, audio_bytes):
        """Decode LINEAR16 bytes and add them to the buffer (if running)."""
        if self.audio_buffer and self.running:
            audio_data = linear16_to_audio(audio_bytes, self.config["sample_rate"])
            self.audio_buffer.add_audio(audio_data)

    async def _process_audio_chunks(self):
        """Background loop: transcribe accumulated audio for interim results.

        Polls every 100 ms; when a chunk is ready and the newest chunk looks
        like speech, re-transcribes the ENTIRE segment audio (full context)
        and updates the interim tracking state used by the auto-final monitor.
        """
        while self.running:
            try:
                if self.audio_buffer and self.audio_buffer.has_chunk_ready():
                    chunk_signal = self.audio_buffer.get_chunk_for_processing()
                    if chunk_signal is not None:
                        all_audio = self.audio_buffer.get_all_audio()
                        # VAD only on the most recent chunk's worth of audio.
                        if len(all_audio) > 0 and self.audio_buffer.is_speech(all_audio[-self.audio_buffer.chunk_samples:]):
                            loop = asyncio.get_event_loop()
                            transcription = await loop.run_in_executor(
                                executor,
                                transcribe_with_nemo,
                                all_audio,
                                self.config["sample_rate"]
                            )
                            if transcription.strip():
                                self.processing_count += 1
                                self.accumulated_transcript = transcription
                                # Count a new interim only when the text changed
                                # (or it is the first one); always refresh the
                                # timestamp so silence detection stays accurate.
                                if transcription != self.last_partial or self.interim_count == 0:
                                    self.last_partial = transcription
                                    self.interim_count += 1
                                    self.last_interim_time = time.time()
                                    logger.info(f"Updated interim_count to {self.interim_count} for transcript: '{transcription}'")
                                else:
                                    self.last_interim_time = time.time()
                await asyncio.sleep(0.1)  # Check every 100ms
            except Exception as e:
                logger.error(f"Error in chunk processing: {e}")
                await asyncio.sleep(0.1)

    async def _monitor_for_auto_final(self):
        """Background loop: emit a final result after a silence timeout.

        When enough interims were seen and no new interim arrived within
        ``silence_timeout`` seconds, sends the accumulated transcript as
        final and rolls over to a new segment.
        """
        while self.running:
            try:
                current_time = time.time()
                if (self.interim_count >= self.min_interim_count and
                    self.last_interim_time is not None and
                    (current_time - self.last_interim_time) >= self.silence_timeout and
                    not self.final_sent and
                    self.accumulated_transcript.strip()):
                    logger.info(f"Auto-final triggered for segment #{self.segment_number}")
                    await self.send_transcription(self.accumulated_transcript, is_final=True)
                    await self.start_new_segment()
                await asyncio.sleep(0.5)  # Check every 500ms
            except Exception as e:
                logger.error(f"Error in auto-final monitoring: {e}")
                await asyncio.sleep(0.5)

    async def send_transcription(self, text, is_final=True, confidence=0.9):
        """Send transcription in jambonz format.

        NOTE: the ``is_final`` parameter is deliberately ignored — every
        message is marked final, per the inline comment below.
        """
        try:
            message = {
                "type": "transcription",
                "is_final": True,  # Always send as final
                "alternatives": [
                    {
                        "transcript": text,
                        "confidence": confidence
                    }
                ],
                "language": self.config.get("language", "ar-EG"),
                "channel": 1
            }
            await self.websocket.send(json.dumps(message))
            self.final_sent = True
            logger.info(f"Sent FINAL transcription to Jambonz: '{text}'")
        except Exception as e:
            logger.error(f"Error sending transcription: {e}")

    async def send_error(self, error_message):
        """Send error message in jambonz format."""
        try:
            message = {
                "type": "error",
                "error": error_message
            }
            await self.websocket.send(json.dumps(message))
            logger.error(f"Sent error: {error_message}")
        except Exception as e:
            logger.error(f"Error sending error message: {e}")
async def handle_jambonz_websocket(websocket):
    """Handle jambonz WebSocket connections.

    Text frames carry JSON control messages ("start"/"stop"); binary frames
    carry raw LINEAR16 audio.  One JambonzSTTHandler is created per
    connection and always stopped on disconnect.
    """
    client_id = f"jambonz_{id(websocket)}"
    logger.info(f"New NeMo jambonz connection: {client_id}")
    handler = JambonzSTTHandler(websocket)
    try:
        async for message in websocket:
            try:
                if isinstance(message, str):
                    # JSON control message.
                    data = json.loads(message)
                    message_type = data.get("type")
                    if message_type == "start":
                        logger.info(f"Received start message: {data}")
                        await handler.start_processing(data)
                    elif message_type == "stop":
                        logger.info("Received stop message - closing WebSocket")
                        await handler.stop_processing()
                        await websocket.close(code=1000, reason="Session stopped by client")
                        break
                    else:
                        logger.warning(f"Unknown message type: {message_type}")
                        await handler.send_error(f"Unknown message type: {message_type}")
                else:
                    # Handle binary audio data
                    if not handler.running or handler.audio_buffer is None:
                        logger.warning("Received audio data outside of active session")
                        await handler.send_error("Received audio before start message or after stop")
                        continue
                    await handler.add_audio_data(message)
            except json.JSONDecodeError as e:
                logger.error(f"JSON decode error: {e}")
                await handler.send_error(f"Invalid JSON: {str(e)}")
            except Exception as e:
                logger.error(f"Error processing message: {e}")
                await handler.send_error(f"Processing error: {str(e)}")
    except websockets.exceptions.ConnectionClosed:
        logger.info(f"NeMo jambonz connection closed: {client_id}")
    except Exception as e:
        logger.error(f"NeMo jambonz WebSocket error: {e}")
        try:
            await handler.send_error(str(e))
        except:
            pass
    finally:
        # Best-effort cleanup: make sure background tasks are cancelled even
        # if the peer disconnected without sending "stop".
        if handler.running:
            await handler.stop_processing()
        logger.info(f"NeMo jambonz connection ended: {client_id}")
async def main():
    """Start the NeMo jambonz STT WebSocket server.

    Binds on all interfaces, port 3007, with WebSocket keepalive pings, and
    blocks until the server is closed.
    """
    logger.info("Starting NeMo Jambonz STT WebSocket server on port 3007...")
    # Start WebSocket server
    server = await websockets.serve(
        handle_jambonz_websocket,
        "0.0.0.0",
        3007,
        ping_interval=20,
        ping_timeout=10,
        close_timeout=10
    )
    logger.info("NeMo Jambonz STT WebSocket server started on ws://0.0.0.0:3007")
    logger.info("Ready to handle jambonz STT requests with NeMo FastConformer")
    logger.info("FEATURES:")
    logger.info("- Arabic ASR using NeMo FastConformer model")
    logger.info("- Arabic number word to digit conversion")
    logger.info("- Continuous transcription with segmentation")
    logger.info("- Voice Activity Detection")
    # Wait for the server to close
    await server.wait_closed()
if __name__ == "__main__":
    # Startup banner describing the server configuration.
    print("=" * 80)
    print("NeMo FastConformer Jambonz STT Server")
    print("=" * 80)
    print("Model: NeMo FastConformer Arabic ASR")
    print("WebSocket Port: 3007")
    print("Protocol: jambonz STT API")
    print("Audio Format: LINEAR16 PCM @ 8kHz → 16kHz")
    print("Language: Arabic with number conversion")
    print("=" * 80)
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Clean Ctrl-C shutdown.
        print("\nShutting down NeMo server...")
    except Exception as e:
        print(f"Server error: {e}")