/**
 * ASR (Automatic Speech Recognition) Provider Implementation
 *
 * Factory pattern for routing ASR requests to appropriate provider implementations.
 * Follows the same architecture as lib/ai/providers.ts for consistency.
 *
 * Currently Supported Providers:
 * - OpenAI Whisper: https://platform.openai.com/docs/guides/speech-to-text
 * - Browser Native: Web Speech API (https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API)
 * - Qwen ASR: https://bailian.console.aliyun.com/
 *
 * HOW TO ADD A NEW PROVIDER:
 *
 * 1. Add provider ID to ASRProviderId in lib/audio/types.ts
 *    Example: | 'assemblyai-asr'
 *
 * 2. Add provider configuration to lib/audio/constants.ts
 *    Example:
 *    'assemblyai-asr': {
 *      id: 'assemblyai-asr',
 *      name: 'AssemblyAI',
 *      requiresApiKey: true,
 *      defaultBaseUrl: 'https://api.assemblyai.com/v2',
 *      icon: '/assemblyai.svg',
 *      supportedLanguages: ['en', 'es', 'fr', 'de', 'auto'],
 *      supportedFormats: ['mp3', 'wav', 'flac', 'm4a']
 *    }
 *
 * 3. Implement provider function in this file
 *    Pattern: async function transcribeXxxASR(config, audioBuffer): Promise<ASRTranscriptionResult>
 *    - Handle Buffer/Blob conversion (see helper patterns below)
 *    - Build API request with audio data (FormData or base64)
 *    - Handle API authentication (apiKey, headers)
 *    - Convert language codes if needed
 *    - Return { text: string }
 *
 *    Example:
 *    async function transcribeAssemblyAIASR(
 *      config: ASRModelConfig,
 *      audioBuffer: Buffer | Blob
 *    ): Promise<ASRTranscriptionResult> {
 *      const baseUrl = config.baseUrl || ASR_PROVIDERS['assemblyai-asr'].defaultBaseUrl;
 *
 *      // Step 1: Upload audio file
 *      let blob: Blob;
 *      if (audioBuffer instanceof Buffer) {
 *        blob = new Blob([audioBuffer.buffer.slice(
 *          audioBuffer.byteOffset,
 *          audioBuffer.byteOffset + audioBuffer.byteLength
 *        ) as ArrayBuffer], { type: 'audio/webm' });
 *      } else {
 *        blob = audioBuffer;
 *      }
 *
 *      const uploadResponse = await fetch(`${baseUrl}/upload`, {
 *        method: 'POST',
 *        headers: {
 *          'authorization': config.apiKey!,
 *        },
 *        body: blob,
 *      });
 *
 *      if (!uploadResponse.ok) {
 *        throw new Error(`AssemblyAI upload error: ${uploadResponse.statusText}`);
 *      }
 *
 *      const { upload_url } = await uploadResponse.json();
 *
 *      // Step 2: Request transcription
 *      const transcriptResponse = await fetch(`${baseUrl}/transcript`, {
 *        method: 'POST',
 *        headers: {
 *          'authorization': config.apiKey!,
 *          'Content-Type': 'application/json',
 *        },
 *        body: JSON.stringify({
 *          audio_url: upload_url,
 *          language_code: config.language === 'auto' ? undefined : config.language,
 *        }),
 *      });
 *
 *      const { id } = await transcriptResponse.json();
 *
 *      // Step 3: Poll for completion
 *      while (true) {
 *        const statusResponse = await fetch(`${baseUrl}/transcript/${id}`, {
 *          headers: { 'authorization': config.apiKey! },
 *        });
 *        const result = await statusResponse.json();
 *
 *        if (result.status === 'completed') {
 *          return { text: result.text || '' };
 *        } else if (result.status === 'error') {
 *          throw new Error(`AssemblyAI error: ${result.error}`);
 *        }
 *
 *        await new Promise(resolve => setTimeout(resolve, 1000));
 *      }
 *    }
 *
 * 4. Add case to transcribeAudio() switch statement
 *    case 'assemblyai-asr':
 *      return await transcribeAssemblyAIASR(config, audioBuffer);
 *
 * 5. Add i18n translations in lib/i18n.ts
 *    providerAssemblyAIASR: { zh: 'AssemblyAI 语音识别', en: 'AssemblyAI ASR' }
 *
 * Buffer/Blob Conversion Patterns:
 *
 * Pattern 1: Buffer to Blob (for FormData)
 *   const blob = new Blob([
 *     audioBuffer.buffer.slice(audioBuffer.byteOffset, audioBuffer.byteOffset + audioBuffer.byteLength) as ArrayBuffer
 *   ], { type: 'audio/webm' });
 *
 * Pattern 2: Buffer to base64 (for JSON API)
 *   let base64Audio: string;
 *   if (audioBuffer instanceof Buffer) {
 *     base64Audio = audioBuffer.toString('base64');
 *   } else {
 *     const arrayBuffer = await audioBuffer.arrayBuffer();
 *     base64Audio = Buffer.from(arrayBuffer).toString('base64');
 *   }
 *
 * Pattern 3: Buffer/Blob to File (for Vercel AI SDK)
 *   let audioFile: File;
 *   if (audioBuffer instanceof Buffer) {
 *     const arrayBuffer = audioBuffer.buffer.slice(...) as ArrayBuffer;
 *     const blob = new Blob([arrayBuffer], { type: 'audio/webm' });
 *     audioFile = new File([blob], 'audio.webm', { type: 'audio/webm' });
 *   } else {
 *     audioFile = new File([audioBuffer], 'audio.webm', { type: 'audio/webm' });
 *   }
 *
 * Error Handling Patterns:
 * - Always validate the API key if requiresApiKey is true
 * - Throw descriptive errors for API failures
 * - Include response.statusText or error messages from the API
 * - For client-only providers (browser-native), throw an error directing callers to client-side usage
 * - Handle polling/async APIs with proper timeout and error checking
 *
 * API Call Patterns:
 * - Vercel AI SDK: Use createOpenAI + transcribe (OpenAI, compatible providers)
 * - FormData: For providers expecting multipart/form-data (most providers; see the sketch below)
 * - Base64: For providers expecting JSON with base64 audio (Qwen, DashScope)
 * - Upload + Poll: For async providers (AssemblyAI, Deepgram batch)
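 *
 * FormData sketch (illustrative only; none of the implementations below builds
 * FormData by hand, and the endpoint path and the 'file'/'model'/'language' field
 * names are assumptions that vary by provider):
 *   const formData = new FormData();
 *   formData.append('file', blob, 'audio.webm'); // blob built with Pattern 1 above
 *   formData.append('model', config.modelId);
 *   if (config.language && config.language !== 'auto') {
 *     formData.append('language', config.language);
 *   }
 *   const response = await fetch(`${baseUrl}/audio/transcriptions`, {
 *     method: 'POST',
 *     headers: { Authorization: `Bearer ${config.apiKey}` }, // let fetch set the multipart Content-Type
 *     body: formData,
 *   });
 *   if (!response.ok) {
 *     throw new Error(`ASR API error: ${response.statusText}`);
 *   }
 *   const { text } = await response.json();
 *   return { text: text || '' };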
 */

import { createOpenAI } from '@ai-sdk/openai';
import { experimental_transcribe as transcribe } from 'ai';

import type { ASRModelConfig } from './types';
import { isCustomASRProvider } from './types';
import { ASR_PROVIDERS } from './constants';

/**
 * Result of ASR transcription
 */
export interface ASRTranscriptionResult {
  text: string;
}

/**
 * Transcribe audio using the specified ASR provider
 */
export async function transcribeAudio(
  config: ASRModelConfig,
  audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
  const provider = ASR_PROVIDERS[config.providerId as keyof typeof ASR_PROVIDERS];

  // Validate API key if required (only for built-in providers with known config)
  if (provider?.requiresApiKey && !config.apiKey) {
    throw new Error(`API key required for ASR provider: ${config.providerId}`);
  }

  switch (config.providerId) {
    case 'openai-whisper':
      return await transcribeOpenAIWhisper(config, audioBuffer);

    case 'browser-native':
      throw new Error('Browser Native ASR must be handled client-side using useBrowserASR hook');

    case 'qwen-asr':
      return await transcribeQwenASR(config, audioBuffer);

    default:
      // Custom (OpenAI-compatible) providers reuse the Whisper implementation
      if (isCustomASRProvider(config.providerId)) {
        return await transcribeOpenAIWhisper(config, audioBuffer);
      }
      throw new Error(`Unsupported ASR provider: ${config.providerId}`);
  }
}
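/*
 * The 'browser-native' branch above is intentionally not implemented here: the Web
 * Speech API only exists in the browser. The useBrowserASR hook it refers to lives
 * elsewhere in the codebase; the sketch below only shows the standard Web Speech API
 * calls such a hook would typically wrap (all names come from the Web Speech API,
 * not from this module):
 *
 *   const SpeechRecognitionImpl =
 *     window.SpeechRecognition || (window as any).webkitSpeechRecognition;
 *   const recognition = new SpeechRecognitionImpl();
 *   recognition.lang = 'en-US';       // or map config.language to a BCP 47 tag
 *   recognition.interimResults = false;
 *   recognition.onresult = (event) => {
 *     const text = event.results[0][0].transcript; // hand the text back to the caller
 *   };
 *   recognition.onerror = (event) => {
 *     // surface event.error to the UI
 *   };
 *   recognition.start();
 */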
/**
 * OpenAI Whisper implementation (using the Vercel AI SDK)
 */
async function transcribeOpenAIWhisper(
  config: ASRModelConfig,
  audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
  const openai = createOpenAI({
    apiKey: config.apiKey!,
    baseURL: config.baseUrl || ASR_PROVIDERS['openai-whisper'].defaultBaseUrl,
  });

  // Convert to Buffer or Uint8Array (required by the AI SDK)
  let audioData: Buffer | Uint8Array;
  if (audioBuffer instanceof Buffer) {
    audioData = audioBuffer;
  } else if (audioBuffer instanceof Blob) {
    const arrayBuffer = await audioBuffer.arrayBuffer();
    audioData = new Uint8Array(arrayBuffer);
  } else {
    throw new Error('Invalid audio buffer type');
  }

  try {
    const result = await transcribe({
      model: openai.transcription(config.modelId || 'gpt-4o-mini-transcribe'),
      audio: audioData,
      providerOptions: {
        openai: {
          language: config.language === 'auto' ? undefined : config.language,
        },
      },
    });

    return { text: result.text || '' };
  } catch (error: unknown) {
    // Short or silent audio may cause the SDK to throw; treat it as an empty transcription
    const errMsg = error instanceof Error ? error.message : '';
    if (errMsg.includes('empty') || errMsg.includes('too short')) {
      return { text: '' };
    }
    throw error;
  }
}
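/*
 * Direct server-side usage sketch for transcribeAudio(). The config values are
 * illustrative; a real caller would receive them from the client (see
 * getCurrentASRConfig at the end of this file) or from environment configuration,
 * and the DASHSCOPE_API_KEY environment variable name is an assumption:
 *
 *   const { text } = await transcribeAudio(
 *     {
 *       providerId: 'qwen-asr',
 *       modelId: 'qwen3-asr-flash',
 *       apiKey: process.env.DASHSCOPE_API_KEY,
 *       language: 'auto',
 *     },
 *     audioBuffer, // Buffer or Blob containing the recorded audio
 *   );
 */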
/**
 * Qwen ASR implementation (DashScope API - Qwen3 ASR Flash)
 */
async function transcribeQwenASR(
  config: ASRModelConfig,
  audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
  const baseUrl = config.baseUrl || ASR_PROVIDERS['qwen-asr'].defaultBaseUrl;

  // Convert audio to base64
  let base64Audio: string;
  if (audioBuffer instanceof Buffer) {
    base64Audio = audioBuffer.toString('base64');
  } else if (audioBuffer instanceof Blob) {
    const arrayBuffer = await audioBuffer.arrayBuffer();
    base64Audio = Buffer.from(arrayBuffer).toString('base64');
  } else {
    throw new Error('Invalid audio buffer type');
  }

  // Build request body
  const requestBody: Record<string, unknown> = {
    model: config.modelId || 'qwen3-asr-flash',
    input: {
      messages: [
        {
          role: 'user',
          content: [
            {
              audio: `data:audio/wav;base64,${base64Audio}`,
            },
          ],
        },
      ],
    },
  };

  // Add language parameter in asr_options if specified (optional - improves accuracy for known languages)
  // If the language is uncertain or mixed, don't specify it (auto-detect)
  if (config.language && config.language !== 'auto') {
    requestBody.parameters = {
      asr_options: {
        language: config.language,
      },
    };
  }

  const response = await fetch(`${baseUrl}/services/aigc/multimodal-generation/generation`, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${config.apiKey}`,
      'Content-Type': 'application/json; charset=utf-8',
      'X-DashScope-Audio-Format': 'wav',
    },
    body: JSON.stringify(requestBody),
  });

  if (!response.ok) {
    const errorText = await response.text().catch(() => response.statusText);
    // "The audio is empty": treat as no speech detected
    if (errorText.includes('audio is empty') || errorText.includes('InvalidParameter')) {
      return { text: '' };
    }
    throw new Error(`Qwen ASR API error: ${errorText}`);
  }

  const data = await response.json();

  // Check for the transcription result in the response.
  // Qwen3 ASR returns an OpenAI-compatible format:
  // { output: { choices: [{ message: { content: [{ text: "transcribed text" }] } }] } }
  if (
    !data.output?.choices ||
    !Array.isArray(data.output.choices) ||
    data.output.choices.length === 0
  ) {
    throw new Error(`Qwen ASR error: No choices in response. Response: ${JSON.stringify(data)}`);
  }

  const firstChoice = data.output.choices[0];
  const messageContent = firstChoice?.message?.content;

  if (!Array.isArray(messageContent) || messageContent.length === 0) {
    // Empty content typically means the audio was too short or contained no speech
    return { text: '' };
  }

  // Extract text from the first content item
  const transcribedText = messageContent[0]?.text || '';
  return { text: transcribedText };
}

/**
 * Get the current ASR configuration from the settings store
 * Note: This function should only be called in a browser context
 */
export async function getCurrentASRConfig(): Promise<ASRModelConfig> {
  if (typeof window === 'undefined') {
    throw new Error('getCurrentASRConfig() can only be called in browser context');
  }

  // Lazy import to avoid circular dependency
  const { useSettingsStore } = await import('@/lib/store/settings');
  const { asrProviderId, asrLanguage, asrProvidersConfig } = useSettingsStore.getState();
  const providerConfig = asrProvidersConfig?.[asrProviderId];

  return {
    providerId: asrProviderId,
    modelId:
      providerConfig?.modelId ||
      ASR_PROVIDERS[asrProviderId as keyof typeof ASR_PROVIDERS]?.defaultModelId ||
      '',
    apiKey: providerConfig?.apiKey,
    baseUrl: providerConfig?.baseUrl || providerConfig?.customDefaultBaseUrl,
    language: asrLanguage,
  };
}

// Re-export from constants for convenience
export { getAllASRProviders, getASRProvider, getASRSupportedLanguages } from './constants';
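/*
 * Client-side flow sketch. getCurrentASRConfig() runs in the browser, while
 * transcribeAudio() runs on the server, so the two are typically connected by an
 * API route. The '/api/transcribe' route name and the multipart field names below
 * are hypothetical, not part of this module:
 *
 *   const config = await getCurrentASRConfig();
 *   const formData = new FormData();
 *   formData.append('audio', recordedBlob, 'audio.webm');
 *   formData.append('config', JSON.stringify(config));
 *   const res = await fetch('/api/transcribe', { method: 'POST', body: formData });
 *   const { text } = await res.json();
 *
 * On the server, the route would reconstruct the config and the audio Blob/Buffer
 * and call transcribeAudio(config, audio) as shown earlier.
 */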