/**
* ASR (Automatic Speech Recognition) Provider Implementation
*
* Factory pattern for routing ASR requests to appropriate provider implementations.
* Follows the same architecture as lib/ai/providers.ts for consistency.
*
* Currently Supported Providers:
* - OpenAI Whisper: https://platform.openai.com/docs/guides/speech-to-text
* - Browser Native: Web Speech API (https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API)
* - Qwen ASR: https://bailian.console.aliyun.com/
*
* HOW TO ADD A NEW PROVIDER:
*
* 1. Add provider ID to ASRProviderId in lib/audio/types.ts
* Example: | 'assemblyai-asr'
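 *
 *    A minimal sketch of what the union in lib/audio/types.ts could look like after the
 *    change (existing members are taken from the switch in transcribeAudio() below; the
 *    real type may contain more, e.g. custom provider IDs):
 *
 *    export type ASRProviderId =
 *      | 'openai-whisper'
 *      | 'browser-native'
 *      | 'qwen-asr'
 *      | 'assemblyai-asr'; // new provider ID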
*
* 2. Add provider configuration to lib/audio/constants.ts
* Example:
* 'assemblyai-asr': {
* id: 'assemblyai-asr',
* name: 'AssemblyAI',
* requiresApiKey: true,
* defaultBaseUrl: 'https://api.assemblyai.com/v2',
* icon: '/assemblyai.svg',
* supportedLanguages: ['en', 'es', 'fr', 'de', 'auto'],
* supportedFormats: ['mp3', 'wav', 'flac', 'm4a']
* }
*
* 3. Implement provider function in this file
* Pattern: async function transcribeXxxASR(config, audioBuffer): Promise<ASRTranscriptionResult>
* - Handle Buffer/Blob conversion (see helper patterns below)
* - Build API request with audio data (FormData or base64)
* - Handle API authentication (apiKey, headers)
* - Convert language codes if needed
* - Return { text: string }
*
* Example:
* async function transcribeAssemblyAIASR(
* config: ASRModelConfig,
* audioBuffer: Buffer | Blob
* ): Promise<ASRTranscriptionResult> {
* const baseUrl = config.baseUrl || ASR_PROVIDERS['assemblyai-asr'].defaultBaseUrl;
*
* // Step 1: Upload audio file
* let blob: Blob;
* if (audioBuffer instanceof Buffer) {
* blob = new Blob([audioBuffer.buffer.slice(
* audioBuffer.byteOffset,
* audioBuffer.byteOffset + audioBuffer.byteLength
* ) as ArrayBuffer], { type: 'audio/webm' });
* } else {
* blob = audioBuffer;
* }
*
* const uploadResponse = await fetch(`${baseUrl}/upload`, {
* method: 'POST',
* headers: {
* 'authorization': config.apiKey!,
* },
* body: blob,
* });
*
* if (!uploadResponse.ok) {
* throw new Error(`AssemblyAI upload error: ${uploadResponse.statusText}`);
* }
*
* const { upload_url } = await uploadResponse.json();
*
* // Step 2: Request transcription
* const transcriptResponse = await fetch(`${baseUrl}/transcript`, {
* method: 'POST',
* headers: {
* 'authorization': config.apiKey!,
* 'Content-Type': 'application/json',
* },
* body: JSON.stringify({
* audio_url: upload_url,
* language_code: config.language === 'auto' ? undefined : config.language,
* }),
* });
*
* const { id } = await transcriptResponse.json();
*
 *   // Step 3: Poll for completion (in production, bound this loop; see Error Handling Patterns below)
* while (true) {
* const statusResponse = await fetch(`${baseUrl}/transcript/${id}`, {
* headers: { 'authorization': config.apiKey! },
* });
* const result = await statusResponse.json();
*
* if (result.status === 'completed') {
* return { text: result.text || '' };
* } else if (result.status === 'error') {
* throw new Error(`AssemblyAI error: ${result.error}`);
* }
*
* await new Promise(resolve => setTimeout(resolve, 1000));
* }
* }
*
* 4. Add case to transcribeAudio() switch statement
* case 'assemblyai-asr':
* return await transcribeAssemblyAIASR(config, audioBuffer);
*
* 5. Add i18n translations in lib/i18n.ts
* providerAssemblyAIASR: { zh: 'AssemblyAI 语音识别', en: 'AssemblyAI ASR' }
*
* Buffer/Blob Conversion Patterns:
*
* Pattern 1: Buffer to Blob (for FormData)
* const blob = new Blob([
* audioBuffer.buffer.slice(audioBuffer.byteOffset, audioBuffer.byteOffset + audioBuffer.byteLength) as ArrayBuffer
* ], { type: 'audio/webm' });
*
* Pattern 2: Buffer to base64 (for JSON API)
* let base64Audio: string;
* if (audioBuffer instanceof Buffer) {
* base64Audio = audioBuffer.toString('base64');
* } else {
* const arrayBuffer = await audioBuffer.arrayBuffer();
* base64Audio = Buffer.from(arrayBuffer).toString('base64');
* }
*
* Pattern 3: Buffer/Blob to File (for Vercel AI SDK)
* let audioFile: File;
* if (audioBuffer instanceof Buffer) {
 *   const arrayBuffer = audioBuffer.buffer.slice(audioBuffer.byteOffset, audioBuffer.byteOffset + audioBuffer.byteLength) as ArrayBuffer;
* const blob = new Blob([arrayBuffer], { type: 'audio/webm' });
* audioFile = new File([blob], 'audio.webm', { type: 'audio/webm' });
* } else {
* audioFile = new File([audioBuffer], 'audio.webm', { type: 'audio/webm' });
* }
*
* Error Handling Patterns:
* - Always validate API key if requiresApiKey is true
* - Throw descriptive errors for API failures
* - Include response.statusText or error messages from API
* - For client-only providers (browser-native), throw error directing to client-side usage
 * - Handle polling/async APIs with proper timeout and error checking (a bounded-poll sketch follows below)
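 *
 * Bounded Polling Sketch:
 *
 * One way to satisfy the timeout guidance above. The helper below is illustrative only;
 * its name, parameters, and limits are not part of this module:
 *
 *    async function pollTranscript<T>(
 *      fetchStatus: () => Promise<T>,
 *      isDone: (result: T) => boolean,
 *      getError: (result: T) => string | undefined,
 *      intervalMs = 1000,
 *      maxAttempts = 120,
 *    ): Promise<T> {
 *      for (let attempt = 0; attempt < maxAttempts; attempt++) {
 *        const result = await fetchStatus();
 *        const error = getError(result);
 *        if (error) throw new Error(`ASR polling error: ${error}`);
 *        if (isDone(result)) return result;
 *        await new Promise(resolve => setTimeout(resolve, intervalMs));
 *      }
 *      throw new Error('ASR polling timed out');
 *    }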
*
* API Call Patterns:
 * - Vercel AI SDK: Use createOpenAI + transcribe (OpenAI and OpenAI-compatible providers)
 * - FormData: For providers expecting multipart/form-data (most providers; see the sketch at the end of this comment)
* - Base64: For providers expecting JSON with base64 audio (Qwen, DashScope)
* - Upload + Poll: For async providers (AssemblyAI, Deepgram batch)
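 *
 * FormData Sketch:
 *
 * A minimal illustration of the multipart/form-data pattern. The endpoint path and field
 * names below are placeholders; check the target provider's API reference:
 *
 *    const formData = new FormData();
 *    formData.append('file', blob, 'audio.webm'); // blob built with Pattern 1 above
 *    formData.append('model', config.modelId);
 *    if (config.language && config.language !== 'auto') {
 *      formData.append('language', config.language);
 *    }
 *
 *    const response = await fetch(`${baseUrl}/audio/transcriptions`, {
 *      method: 'POST',
 *      // Let fetch set the multipart Content-Type (with boundary) automatically
 *      headers: { Authorization: `Bearer ${config.apiKey}` },
 *      body: formData,
 *    });
 *    if (!response.ok) {
 *      throw new Error(`ASR API error: ${response.statusText}`);
 *    }
 *    const data = await response.json();
 *    return { text: data.text || '' };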
*/
import { createOpenAI } from '@ai-sdk/openai';
import { experimental_transcribe as transcribe } from 'ai';
import type { ASRModelConfig } from './types';
import { isCustomASRProvider } from './types';
import { ASR_PROVIDERS } from './constants';
/**
* Result of ASR transcription
*/
export interface ASRTranscriptionResult {
text: string;
}
/**
* Transcribe audio using specified ASR provider
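 *
 * A minimal usage sketch; the config values are illustrative, not defaults:
 *
 * @example
 * const result = await transcribeAudio(
 *   {
 *     providerId: 'openai-whisper',
 *     modelId: 'gpt-4o-mini-transcribe',
 *     apiKey: userApiKey, // e.g. from the settings store
 *     language: 'auto',
 *   },
 *   recordedBlob, // Blob from MediaRecorder, or a Node Buffer
 * );
 * console.log(result.text);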
*/
export async function transcribeAudio(
config: ASRModelConfig,
audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
const provider = ASR_PROVIDERS[config.providerId as keyof typeof ASR_PROVIDERS];
// Validate API key if required (only for built-in providers with known config)
if (provider?.requiresApiKey && !config.apiKey) {
throw new Error(`API key required for ASR provider: ${config.providerId}`);
}
switch (config.providerId) {
case 'openai-whisper':
return await transcribeOpenAIWhisper(config, audioBuffer);
case 'browser-native':
throw new Error('Browser Native ASR must be handled client-side using useBrowserASR hook');
case 'qwen-asr':
return await transcribeQwenASR(config, audioBuffer);
default:
if (isCustomASRProvider(config.providerId)) {
return await transcribeOpenAIWhisper(config, audioBuffer);
}
throw new Error(`Unsupported ASR provider: ${config.providerId}`);
}
}
/**
* OpenAI Whisper implementation (using Vercel AI SDK)
*/
async function transcribeOpenAIWhisper(
config: ASRModelConfig,
audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
const openai = createOpenAI({
apiKey: config.apiKey!,
baseURL: config.baseUrl || ASR_PROVIDERS['openai-whisper'].defaultBaseUrl,
});
  // Convert to a Buffer or Uint8Array, a format the AI SDK's transcribe() accepts
let audioData: Buffer | Uint8Array;
if (audioBuffer instanceof Buffer) {
audioData = audioBuffer;
} else if (audioBuffer instanceof Blob) {
const arrayBuffer = await audioBuffer.arrayBuffer();
audioData = new Uint8Array(arrayBuffer);
} else {
throw new Error('Invalid audio buffer type');
}
try {
const result = await transcribe({
model: openai.transcription(config.modelId || 'gpt-4o-mini-transcribe'),
audio: audioData,
providerOptions: {
openai: {
language: config.language === 'auto' ? undefined : config.language,
},
},
});
return { text: result.text || '' };
} catch (error: unknown) {
// Short/silent audio may cause the SDK to throw — treat as empty transcription
const errMsg = error instanceof Error ? error.message : '';
if (errMsg.includes('empty') || errMsg.includes('too short')) {
return { text: '' };
}
throw error;
}
}
/**
* Qwen ASR implementation (DashScope API - Qwen3 ASR Flash)
*/
async function transcribeQwenASR(
config: ASRModelConfig,
audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
const baseUrl = config.baseUrl || ASR_PROVIDERS['qwen-asr'].defaultBaseUrl;
// Convert audio to base64
let base64Audio: string;
if (audioBuffer instanceof Buffer) {
base64Audio = audioBuffer.toString('base64');
} else if (audioBuffer instanceof Blob) {
const arrayBuffer = await audioBuffer.arrayBuffer();
base64Audio = Buffer.from(arrayBuffer).toString('base64');
} else {
throw new Error('Invalid audio buffer type');
}
// Build request body
const requestBody: Record<string, unknown> = {
model: config.modelId || 'qwen3-asr-flash',
input: {
messages: [
{
role: 'user',
content: [
{
audio: `data:audio/wav;base64,${base64Audio}`,
},
],
},
],
},
};
// Add language parameter in asr_options if specified (optional - improves accuracy for known languages)
// If language is uncertain or mixed, don't specify (auto-detect)
if (config.language && config.language !== 'auto') {
requestBody.parameters = {
asr_options: {
language: config.language,
},
};
}
const response = await fetch(`${baseUrl}/services/aigc/multimodal-generation/generation`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json; charset=utf-8',
'X-DashScope-Audio-Format': 'wav',
},
body: JSON.stringify(requestBody),
});
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
// "The audio is empty" — treat as no speech detected
if (errorText.includes('audio is empty') || errorText.includes('InvalidParameter')) {
return { text: '' };
}
throw new Error(`Qwen ASR API error: ${errorText}`);
}
const data = await response.json();
// Check for transcription result in response
// Qwen3 ASR returns OpenAI-compatible format:
// { output: { choices: [{ message: { content: [{ text: "transcribed text" }] } }] } }
if (
!data.output?.choices ||
!Array.isArray(data.output.choices) ||
data.output.choices.length === 0
) {
throw new Error(`Qwen ASR error: No choices in response. Response: ${JSON.stringify(data)}`);
}
const firstChoice = data.output.choices[0];
const messageContent = firstChoice?.message?.content;
if (!Array.isArray(messageContent) || messageContent.length === 0) {
// Empty content typically means audio was too short or contained no speech
return { text: '' };
}
// Extract text from first content item
const transcribedText = messageContent[0]?.text || '';
return { text: transcribedText };
}
/**
* Get current ASR configuration from settings store
* Note: This function should only be called in browser context
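 *
 * A minimal sketch of the typical call pattern, assuming the recorded audio is already
 * available as a Blob:
 *
 * @example
 * const config = await getCurrentASRConfig();
 * const { text } = await transcribeAudio(config, recordedBlob);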
*/
export async function getCurrentASRConfig(): Promise<ASRModelConfig> {
if (typeof window === 'undefined') {
throw new Error('getCurrentASRConfig() can only be called in browser context');
}
// Lazy import to avoid circular dependency
const { useSettingsStore } = await import('@/lib/store/settings');
const { asrProviderId, asrLanguage, asrProvidersConfig } = useSettingsStore.getState();
const providerConfig = asrProvidersConfig?.[asrProviderId];
return {
providerId: asrProviderId,
modelId:
providerConfig?.modelId ||
ASR_PROVIDERS[asrProviderId as keyof typeof ASR_PROVIDERS]?.defaultModelId ||
'',
apiKey: providerConfig?.apiKey,
baseUrl: providerConfig?.baseUrl || providerConfig?.customDefaultBaseUrl,
language: asrLanguage,
};
}
// Re-export from constants for convenience
export { getAllASRProviders, getASRProvider, getASRSupportedLanguages } from './constants';