/**
* ASR (Automatic Speech Recognition) Provider Implementation
*
* Factory pattern for routing ASR requests to appropriate provider implementations.
* Follows the same architecture as lib/ai/providers.ts for consistency.
*
* Currently Supported Providers:
* - OpenAI Whisper: https://platform.openai.com/docs/guides/speech-to-text
* - Browser Native: Web Speech API (https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API)
* - Qwen ASR: https://bailian.console.aliyun.com/
*
* HOW TO ADD A NEW PROVIDER:
*
* 1. Add provider ID to ASRProviderId in lib/audio/types.ts
* Example: | 'assemblyai-asr'
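 *
 *    A minimal sketch of what the union in lib/audio/types.ts could look like after the
 *    change (existing members are taken from the switch in transcribeAudio() below; the
 *    real type may contain more, e.g. custom provider IDs):
 *
 *    export type ASRProviderId =
 *      | 'openai-whisper'
 *      | 'browser-native'
 *      | 'qwen-asr'
 *      | 'assemblyai-asr'; // new provider ID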
*
* 2. Add provider configuration to lib/audio/constants.ts
* Example:
* 'assemblyai-asr': {
* id: 'assemblyai-asr',
* name: 'AssemblyAI',
* requiresApiKey: true,
* defaultBaseUrl: 'https://api.assemblyai.com/v2',
* icon: '/assemblyai.svg',
* supportedLanguages: ['en', 'es', 'fr', 'de', 'auto'],
* supportedFormats: ['mp3', 'wav', 'flac', 'm4a']
* }
*
* 3. Implement provider function in this file
* Pattern: async function transcribeXxxASR(config, audioBuffer): Promise<ASRTranscriptionResult>
* - Handle Buffer/Blob conversion (see helper patterns below)
* - Build API request with audio data (FormData or base64)
* - Handle API authentication (apiKey, headers)
* - Convert language codes if needed
* - Return { text: string }
*
* Example:
* async function transcribeAssemblyAIASR(
* config: ASRModelConfig,
* audioBuffer: Buffer | Blob
* ): Promise<ASRTranscriptionResult> {
* const baseUrl = config.baseUrl || ASR_PROVIDERS['assemblyai-asr'].defaultBaseUrl;
*
* // Step 1: Upload audio file
* let blob: Blob;
* if (audioBuffer instanceof Buffer) {
* blob = new Blob([audioBuffer.buffer.slice(
* audioBuffer.byteOffset,
* audioBuffer.byteOffset + audioBuffer.byteLength
* ) as ArrayBuffer], { type: 'audio/webm' });
* } else {
* blob = audioBuffer;
* }
*
* const uploadResponse = await fetch(`${baseUrl}/upload`, {
* method: 'POST',
* headers: {
* 'authorization': config.apiKey!,
* },
* body: blob,
* });
*
* if (!uploadResponse.ok) {
* throw new Error(`AssemblyAI upload error: ${uploadResponse.statusText}`);
* }
*
* const { upload_url } = await uploadResponse.json();
*
* // Step 2: Request transcription
* const transcriptResponse = await fetch(`${baseUrl}/transcript`, {
* method: 'POST',
* headers: {
* 'authorization': config.apiKey!,
* 'Content-Type': 'application/json',
* },
* body: JSON.stringify({
* audio_url: upload_url,
* language_code: config.language === 'auto' ? undefined : config.language,
* }),
* });
*
* const { id } = await transcriptResponse.json();
*
 *   // Step 3: Poll for completion (in production, bound this loop; see Error Handling Patterns below)
* while (true) {
* const statusResponse = await fetch(`${baseUrl}/transcript/${id}`, {
* headers: { 'authorization': config.apiKey! },
* });
* const result = await statusResponse.json();
*
* if (result.status === 'completed') {
* return { text: result.text || '' };
* } else if (result.status === 'error') {
* throw new Error(`AssemblyAI error: ${result.error}`);
* }
*
* await new Promise(resolve => setTimeout(resolve, 1000));
* }
* }
*
* 4. Add case to transcribeAudio() switch statement
* case 'assemblyai-asr':
* return await transcribeAssemblyAIASR(config, audioBuffer);
*
* 5. Add i18n translations in lib/i18n.ts
* providerAssemblyAIASR: { zh: 'AssemblyAI 语音识别', en: 'AssemblyAI ASR' }
*
* Buffer/Blob Conversion Patterns:
*
* Pattern 1: Buffer to Blob (for FormData)
* const blob = new Blob([
* audioBuffer.buffer.slice(audioBuffer.byteOffset, audioBuffer.byteOffset + audioBuffer.byteLength) as ArrayBuffer
* ], { type: 'audio/webm' });
*
* Pattern 2: Buffer to base64 (for JSON API)
* let base64Audio: string;
* if (audioBuffer instanceof Buffer) {
* base64Audio = audioBuffer.toString('base64');
* } else {
* const arrayBuffer = await audioBuffer.arrayBuffer();
* base64Audio = Buffer.from(arrayBuffer).toString('base64');
* }
*
* Pattern 3: Buffer/Blob to File (for Vercel AI SDK)
* let audioFile: File;
* if (audioBuffer instanceof Buffer) {
 *   const arrayBuffer = audioBuffer.buffer.slice(audioBuffer.byteOffset, audioBuffer.byteOffset + audioBuffer.byteLength) as ArrayBuffer;
* const blob = new Blob([arrayBuffer], { type: 'audio/webm' });
* audioFile = new File([blob], 'audio.webm', { type: 'audio/webm' });
* } else {
* audioFile = new File([audioBuffer], 'audio.webm', { type: 'audio/webm' });
* }
*
* Error Handling Patterns:
* - Always validate API key if requiresApiKey is true
* - Throw descriptive errors for API failures
* - Include response.statusText or error messages from API
* - For client-only providers (browser-native), throw error directing to client-side usage
 * - Handle polling/async APIs with proper timeout and error checking (a bounded-poll sketch follows below)
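 *
 * Bounded Polling Sketch:
 *
 * One way to satisfy the timeout guidance above. The helper below is illustrative only;
 * its name, parameters, and limits are not part of this module:
 *
 *    async function pollTranscript<T>(
 *      fetchStatus: () => Promise<T>,
 *      isDone: (result: T) => boolean,
 *      getError: (result: T) => string | undefined,
 *      intervalMs = 1000,
 *      maxAttempts = 120,
 *    ): Promise<T> {
 *      for (let attempt = 0; attempt < maxAttempts; attempt++) {
 *        const result = await fetchStatus();
 *        const error = getError(result);
 *        if (error) throw new Error(`ASR polling error: ${error}`);
 *        if (isDone(result)) return result;
 *        await new Promise(resolve => setTimeout(resolve, intervalMs));
 *      }
 *      throw new Error('ASR polling timed out');
 *    }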
*
* API Call Patterns:
 * - Vercel AI SDK: Use createOpenAI + transcribe (OpenAI and OpenAI-compatible providers)
 * - FormData: For providers expecting multipart/form-data (most providers; see the sketch at the end of this comment)
* - Base64: For providers expecting JSON with base64 audio (Qwen, DashScope)
* - Upload + Poll: For async providers (AssemblyAI, Deepgram batch)
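 *
 * FormData Sketch:
 *
 * A minimal illustration of the multipart/form-data pattern. The endpoint path and field
 * names below are placeholders; check the target provider's API reference:
 *
 *    const formData = new FormData();
 *    formData.append('file', blob, 'audio.webm'); // blob built with Pattern 1 above
 *    formData.append('model', config.modelId);
 *    if (config.language && config.language !== 'auto') {
 *      formData.append('language', config.language);
 *    }
 *
 *    const response = await fetch(`${baseUrl}/audio/transcriptions`, {
 *      method: 'POST',
 *      // Let fetch set the multipart Content-Type (with boundary) automatically
 *      headers: { Authorization: `Bearer ${config.apiKey}` },
 *      body: formData,
 *    });
 *    if (!response.ok) {
 *      throw new Error(`ASR API error: ${response.statusText}`);
 *    }
 *    const data = await response.json();
 *    return { text: data.text || '' };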
*/
import { createOpenAI } from '@ai-sdk/openai';
import { experimental_transcribe as transcribe } from 'ai';
import type { ASRModelConfig } from './types';
import { isCustomASRProvider } from './types';
import { ASR_PROVIDERS } from './constants';
/**
* Result of ASR transcription
*/
export interface ASRTranscriptionResult {
text: string;
}
/**
* Transcribe audio using specified ASR provider
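 *
 * A minimal usage sketch; the config values are illustrative, not defaults:
 *
 * @example
 * const result = await transcribeAudio(
 *   {
 *     providerId: 'openai-whisper',
 *     modelId: 'gpt-4o-mini-transcribe',
 *     apiKey: userApiKey, // e.g. from the settings store
 *     language: 'auto',
 *   },
 *   recordedBlob, // Blob from MediaRecorder, or a Node Buffer
 * );
 * console.log(result.text);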
*/
export async function transcribeAudio(
config: ASRModelConfig,
audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
const provider = ASR_PROVIDERS[config.providerId as keyof typeof ASR_PROVIDERS];
// Validate API key if required (only for built-in providers with known config)
if (provider?.requiresApiKey && !config.apiKey) {
throw new Error(`API key required for ASR provider: ${config.providerId}`);
}
switch (config.providerId) {
case 'openai-whisper':
return await transcribeOpenAIWhisper(config, audioBuffer);
case 'browser-native':
throw new Error('Browser Native ASR must be handled client-side using useBrowserASR hook');
case 'qwen-asr':
return await transcribeQwenASR(config, audioBuffer);
default:
if (isCustomASRProvider(config.providerId)) {
return await transcribeOpenAIWhisper(config, audioBuffer);
}
throw new Error(`Unsupported ASR provider: ${config.providerId}`);
}
}
/**
* OpenAI Whisper implementation (using Vercel AI SDK)
*/
async function transcribeOpenAIWhisper(
config: ASRModelConfig,
audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
const openai = createOpenAI({
apiKey: config.apiKey!,
baseURL: config.baseUrl || ASR_PROVIDERS['openai-whisper'].defaultBaseUrl,
});
  // Convert to a Buffer or Uint8Array, a format the AI SDK's transcribe() accepts
let audioData: Buffer | Uint8Array;
if (audioBuffer instanceof Buffer) {
audioData = audioBuffer;
} else if (audioBuffer instanceof Blob) {
const arrayBuffer = await audioBuffer.arrayBuffer();
audioData = new Uint8Array(arrayBuffer);
} else {
throw new Error('Invalid audio buffer type');
}
try {
const result = await transcribe({
model: openai.transcription(config.modelId || 'gpt-4o-mini-transcribe'),
audio: audioData,
providerOptions: {
openai: {
language: config.language === 'auto' ? undefined : config.language,
},
},
});
return { text: result.text || '' };
} catch (error: unknown) {
// Short/silent audio may cause the SDK to throw — treat as empty transcription
const errMsg = error instanceof Error ? error.message : '';
if (errMsg.includes('empty') || errMsg.includes('too short')) {
return { text: '' };
}
throw error;
}
}
/**
* Qwen ASR implementation (DashScope API - Qwen3 ASR Flash)
*/
async function transcribeQwenASR(
config: ASRModelConfig,
audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
const baseUrl = config.baseUrl || ASR_PROVIDERS['qwen-asr'].defaultBaseUrl;
// Convert audio to base64
let base64Audio: string;
if (audioBuffer instanceof Buffer) {
base64Audio = audioBuffer.toString('base64');
} else if (audioBuffer instanceof Blob) {
const arrayBuffer = await audioBuffer.arrayBuffer();
base64Audio = Buffer.from(arrayBuffer).toString('base64');
} else {
throw new Error('Invalid audio buffer type');
}
// Build request body
const requestBody: Record<string, unknown> = {
model: config.modelId || 'qwen3-asr-flash',
input: {
messages: [
{
role: 'user',
content: [
{
audio: `data:audio/wav;base64,${base64Audio}`,
},
],
},
],
},
};
// Add language parameter in asr_options if specified (optional - improves accuracy for known languages)
// If language is uncertain or mixed, don't specify (auto-detect)
if (config.language && config.language !== 'auto') {
requestBody.parameters = {
asr_options: {
language: config.language,
},
};
}
const response = await fetch(`${baseUrl}/services/aigc/multimodal-generation/generation`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json; charset=utf-8',
'X-DashScope-Audio-Format': 'wav',
},
body: JSON.stringify(requestBody),
});
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
// "The audio is empty" — treat as no speech detected
if (errorText.includes('audio is empty') || errorText.includes('InvalidParameter')) {
return { text: '' };
}
throw new Error(`Qwen ASR API error: ${errorText}`);
}
const data = await response.json();
// Check for transcription result in response
// Qwen3 ASR returns OpenAI-compatible format:
// { output: { choices: [{ message: { content: [{ text: "transcribed text" }] } }] } }
if (
!data.output?.choices ||
!Array.isArray(data.output.choices) ||
data.output.choices.length === 0
) {
throw new Error(`Qwen ASR error: No choices in response. Response: ${JSON.stringify(data)}`);
}
const firstChoice = data.output.choices[0];
const messageContent = firstChoice?.message?.content;
if (!Array.isArray(messageContent) || messageContent.length === 0) {
// Empty content typically means audio was too short or contained no speech
return { text: '' };
}
// Extract text from first content item
const transcribedText = messageContent[0]?.text || '';
return { text: transcribedText };
}
/**
* Get current ASR configuration from settings store
* Note: This function should only be called in browser context
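 *
 * A minimal sketch of the typical call pattern, assuming the recorded audio is already
 * available as a Blob:
 *
 * @example
 * const config = await getCurrentASRConfig();
 * const { text } = await transcribeAudio(config, recordedBlob);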
*/
export async function getCurrentASRConfig(): Promise<ASRModelConfig> {
if (typeof window === 'undefined') {
throw new Error('getCurrentASRConfig() can only be called in browser context');
}
// Lazy import to avoid circular dependency
const { useSettingsStore } = await import('@/lib/store/settings');
const { asrProviderId, asrLanguage, asrProvidersConfig } = useSettingsStore.getState();
const providerConfig = asrProvidersConfig?.[asrProviderId];
return {
providerId: asrProviderId,
modelId:
providerConfig?.modelId ||
ASR_PROVIDERS[asrProviderId as keyof typeof ASR_PROVIDERS]?.defaultModelId ||
'',
apiKey: providerConfig?.apiKey,
baseUrl: providerConfig?.baseUrl || providerConfig?.customDefaultBaseUrl,
language: asrLanguage,
};
}
// Re-export from constants for convenience
export { getAllASRProviders, getASRProvider, getASRSupportedLanguages } from './constants';