/**
* ASR (Automatic Speech Recognition) Provider Implementation
*
* Factory pattern for routing ASR requests to appropriate provider implementations.
* Follows the same architecture as lib/ai/providers.ts for consistency.
*
* Currently Supported Providers:
* - OpenAI Whisper: https://platform.openai.com/docs/guides/speech-to-text
* - Browser Native: Web Speech API (https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API)
* - Qwen ASR: https://bailian.console.aliyun.com/
*
* HOW TO ADD A NEW PROVIDER:
*
* 1. Add provider ID to ASRProviderId in lib/audio/types.ts
* Example: | 'assemblyai-asr'
*
* 2. Add provider configuration to lib/audio/constants.ts
* Example:
* 'assemblyai-asr': {
* id: 'assemblyai-asr',
* name: 'AssemblyAI',
* requiresApiKey: true,
* defaultBaseUrl: 'https://api.assemblyai.com/v2',
* icon: '/assemblyai.svg',
* supportedLanguages: ['en', 'es', 'fr', 'de', 'auto'],
* supportedFormats: ['mp3', 'wav', 'flac', 'm4a']
* }
*
* 3. Implement provider function in this file
* Pattern: async function transcribeXxxASR(config, audioBuffer): Promise<ASRTranscriptionResult>
* - Handle Buffer/Blob conversion (see helper patterns below)
* - Build API request with audio data (FormData or base64)
* - Handle API authentication (apiKey, headers)
* - Convert language codes if needed
* - Return { text: string }
*
* Example:
* async function transcribeAssemblyAIASR(
* config: ASRModelConfig,
* audioBuffer: Buffer | Blob
* ): Promise<ASRTranscriptionResult> {
* const baseUrl = config.baseUrl || ASR_PROVIDERS['assemblyai-asr'].defaultBaseUrl;
*
* // Step 1: Upload audio file
* let blob: Blob;
* if (audioBuffer instanceof Buffer) {
* blob = new Blob([audioBuffer.buffer.slice(
* audioBuffer.byteOffset,
* audioBuffer.byteOffset + audioBuffer.byteLength
* ) as ArrayBuffer], { type: 'audio/webm' });
* } else {
* blob = audioBuffer;
* }
*
* const uploadResponse = await fetch(`${baseUrl}/upload`, {
* method: 'POST',
* headers: {
* 'authorization': config.apiKey!,
* },
* body: blob,
* });
*
* if (!uploadResponse.ok) {
* throw new Error(`AssemblyAI upload error: ${uploadResponse.statusText}`);
* }
*
* const { upload_url } = await uploadResponse.json();
*
* // Step 2: Request transcription
* const transcriptResponse = await fetch(`${baseUrl}/transcript`, {
* method: 'POST',
* headers: {
* 'authorization': config.apiKey!,
* 'Content-Type': 'application/json',
* },
* body: JSON.stringify({
* audio_url: upload_url,
* language_code: config.language === 'auto' ? undefined : config.language,
* }),
* });
 *
 *      if (!transcriptResponse.ok) {
 *        throw new Error(`AssemblyAI transcript error: ${transcriptResponse.statusText}`);
 *      }
 *
 *      const { id } = await transcriptResponse.json();
*
 *      // Step 3: Poll for completion (bounded so a stuck job cannot hang forever)
 *      for (let attempt = 0; attempt < 300; attempt++) {
 *        const statusResponse = await fetch(`${baseUrl}/transcript/${id}`, {
 *          headers: { 'authorization': config.apiKey! },
 *        });
 *        const result = await statusResponse.json();
 *
 *        if (result.status === 'completed') {
 *          return { text: result.text || '' };
 *        } else if (result.status === 'error') {
 *          throw new Error(`AssemblyAI error: ${result.error}`);
 *        }
 *
 *        await new Promise(resolve => setTimeout(resolve, 1000));
 *      }
 *      throw new Error('AssemblyAI transcription timed out');
* }
*
* 4. Add case to transcribeAudio() switch statement
* case 'assemblyai-asr':
* return await transcribeAssemblyAIASR(config, audioBuffer);
*
* 5. Add i18n translations in lib/i18n.ts
* providerAssemblyAIASR: { zh: 'AssemblyAI 语音识别', en: 'AssemblyAI ASR' }
*
* Buffer/Blob Conversion Patterns:
*
* Pattern 1: Buffer to Blob (for FormData)
* const blob = new Blob([
* audioBuffer.buffer.slice(audioBuffer.byteOffset, audioBuffer.byteOffset + audioBuffer.byteLength) as ArrayBuffer
* ], { type: 'audio/webm' });
*
* Pattern 2: Buffer to base64 (for JSON API)
* let base64Audio: string;
* if (audioBuffer instanceof Buffer) {
* base64Audio = audioBuffer.toString('base64');
* } else {
* const arrayBuffer = await audioBuffer.arrayBuffer();
* base64Audio = Buffer.from(arrayBuffer).toString('base64');
* }
*
* Pattern 3: Buffer/Blob to File (for Vercel AI SDK)
* let audioFile: File;
* if (audioBuffer instanceof Buffer) {
 *     const arrayBuffer = audioBuffer.buffer.slice(audioBuffer.byteOffset, audioBuffer.byteOffset + audioBuffer.byteLength) as ArrayBuffer;
* const blob = new Blob([arrayBuffer], { type: 'audio/webm' });
* audioFile = new File([blob], 'audio.webm', { type: 'audio/webm' });
* } else {
* audioFile = new File([audioBuffer], 'audio.webm', { type: 'audio/webm' });
* }
*
 * Error Handling Patterns (see the sketch after this list):
* - Always validate API key if requiresApiKey is true
* - Throw descriptive errors for API failures
* - Include response.statusText or error messages from API
* - For client-only providers (browser-native), throw error directing to client-side usage
* - Handle polling/async APIs with proper timeout and error checking
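 *
 * Putting these together, a minimal sketch (the `foo-asr` provider, its endpoint,
 * and its response shape are hypothetical):
 *
 *   async function transcribeFooASR(
 *     config: ASRModelConfig,
 *     audioBuffer: Buffer | Blob
 *   ): Promise<ASRTranscriptionResult> {
 *     if (!config.apiKey) {
 *       throw new Error('API key required for ASR provider: foo-asr');
 *     }
 *     const body = audioBuffer instanceof Buffer ? new Uint8Array(audioBuffer) : audioBuffer;
 *     const response = await fetch('https://api.foo.example/v1/transcribe', {
 *       method: 'POST',
 *       headers: { Authorization: `Bearer ${config.apiKey}` },
 *       body,
 *     });
 *     if (!response.ok) {
 *       // Include the status text so the failure is actionable in logs
 *       throw new Error(`Foo ASR API error: ${response.statusText}`);
 *     }
 *     const data = await response.json();
 *     return { text: data.text || '' };
 *   }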
*
* API Call Patterns:
* - Vercel AI SDK: Use createOpenAI + transcribe (OpenAI, compatible providers)
 * - FormData: For providers expecting multipart/form-data (most providers; sketched below)
* - Base64: For providers expecting JSON with base64 audio (Qwen, DashScope)
* - Upload + Poll: For async providers (AssemblyAI, Deepgram batch)
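 *
 * Sketch of the FormData pattern against an OpenAI-style `/audio/transcriptions`
 * endpoint (field names vary by provider, so treat them as assumptions):
 *
 *   // `blob` comes from Pattern 1 above (Buffer to Blob)
 *   const formData = new FormData();
 *   formData.append('file', blob, 'audio.webm');
 *   formData.append('model', config.modelId || 'whisper-1');
 *   const response = await fetch(`${baseUrl}/audio/transcriptions`, {
 *     method: 'POST',
 *     // No Content-Type here: fetch sets the multipart boundary automatically
 *     headers: { Authorization: `Bearer ${config.apiKey}` },
 *     body: formData,
 *   });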
*/
import { createOpenAI } from '@ai-sdk/openai';
import { experimental_transcribe as transcribe } from 'ai';
import type { ASRModelConfig } from './types';
import { isCustomASRProvider } from './types';
import { ASR_PROVIDERS } from './constants';
/**
* Result of ASR transcription
*/
export interface ASRTranscriptionResult {
text: string;
}
/**
 * Transcribe audio using the specified ASR provider
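 *
 * @example
 * // Minimal server-side sketch; the config values are placeholders
 * const result = await transcribeAudio(
 *   {
 *     providerId: 'openai-whisper',
 *     modelId: 'gpt-4o-mini-transcribe',
 *     apiKey: process.env.OPENAI_API_KEY,
 *     language: 'auto',
 *   },
 *   audioBuffer, // Buffer built from the uploaded request body
 * );
 * console.log(result.text);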
*/
export async function transcribeAudio(
config: ASRModelConfig,
audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
const provider = ASR_PROVIDERS[config.providerId as keyof typeof ASR_PROVIDERS];
// Validate API key if required (only for built-in providers with known config)
if (provider?.requiresApiKey && !config.apiKey) {
throw new Error(`API key required for ASR provider: ${config.providerId}`);
}
switch (config.providerId) {
case 'openai-whisper':
return await transcribeOpenAIWhisper(config, audioBuffer);
case 'browser-native':
throw new Error('Browser Native ASR must be handled client-side using useBrowserASR hook');
case 'qwen-asr':
return await transcribeQwenASR(config, audioBuffer);
default:
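      // Custom (user-added) providers are assumed to expose an OpenAI-compatible
      // transcription API, so they reuse the Whisper code path below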
if (isCustomASRProvider(config.providerId)) {
return await transcribeOpenAIWhisper(config, audioBuffer);
}
throw new Error(`Unsupported ASR provider: ${config.providerId}`);
}
}
/**
* OpenAI Whisper implementation (using Vercel AI SDK)
*/
async function transcribeOpenAIWhisper(
config: ASRModelConfig,
audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
const openai = createOpenAI({
apiKey: config.apiKey!,
baseURL: config.baseUrl || ASR_PROVIDERS['openai-whisper'].defaultBaseUrl,
});
  // Convert to Buffer or Uint8Array (the input types the AI SDK accepts)
let audioData: Buffer | Uint8Array;
if (audioBuffer instanceof Buffer) {
audioData = audioBuffer;
} else if (audioBuffer instanceof Blob) {
const arrayBuffer = await audioBuffer.arrayBuffer();
audioData = new Uint8Array(arrayBuffer);
} else {
throw new Error('Invalid audio buffer type');
}
try {
const result = await transcribe({
model: openai.transcription(config.modelId || 'gpt-4o-mini-transcribe'),
audio: audioData,
providerOptions: {
openai: {
language: config.language === 'auto' ? undefined : config.language,
},
},
});
return { text: result.text || '' };
} catch (error: unknown) {
// Short/silent audio may cause the SDK to throw — treat as empty transcription
const errMsg = error instanceof Error ? error.message : '';
if (errMsg.includes('empty') || errMsg.includes('too short')) {
return { text: '' };
}
throw error;
}
}
/**
* Qwen ASR implementation (DashScope API - Qwen3 ASR Flash)
*/
async function transcribeQwenASR(
config: ASRModelConfig,
audioBuffer: Buffer | Blob,
): Promise<ASRTranscriptionResult> {
const baseUrl = config.baseUrl || ASR_PROVIDERS['qwen-asr'].defaultBaseUrl;
// Convert audio to base64
let base64Audio: string;
if (audioBuffer instanceof Buffer) {
base64Audio = audioBuffer.toString('base64');
} else if (audioBuffer instanceof Blob) {
const arrayBuffer = await audioBuffer.arrayBuffer();
base64Audio = Buffer.from(arrayBuffer).toString('base64');
} else {
throw new Error('Invalid audio buffer type');
}
// Build request body
const requestBody: Record<string, unknown> = {
model: config.modelId || 'qwen3-asr-flash',
input: {
messages: [
{
role: 'user',
content: [
{
audio: `data:audio/wav;base64,${base64Audio}`,
},
],
},
],
},
};
  // Specify the language via asr_options when it is known (optional; improves accuracy)
  // Omit it for uncertain or mixed-language audio so the service auto-detects
if (config.language && config.language !== 'auto') {
requestBody.parameters = {
asr_options: {
language: config.language,
},
};
}
const response = await fetch(`${baseUrl}/services/aigc/multimodal-generation/generation`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json; charset=utf-8',
'X-DashScope-Audio-Format': 'wav',
},
body: JSON.stringify(requestBody),
});
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
// "The audio is empty" — treat as no speech detected
if (errorText.includes('audio is empty') || errorText.includes('InvalidParameter')) {
return { text: '' };
}
throw new Error(`Qwen ASR API error: ${errorText}`);
}
const data = await response.json();
  // Check for the transcription result in the response
  // Qwen3 ASR wraps an OpenAI-style choices array inside `output`:
  // { output: { choices: [{ message: { content: [{ text: "transcribed text" }] } }] } }
if (
!data.output?.choices ||
!Array.isArray(data.output.choices) ||
data.output.choices.length === 0
) {
throw new Error(`Qwen ASR error: No choices in response. Response: ${JSON.stringify(data)}`);
}
const firstChoice = data.output.choices[0];
const messageContent = firstChoice?.message?.content;
if (!Array.isArray(messageContent) || messageContent.length === 0) {
// Empty content typically means audio was too short or contained no speech
return { text: '' };
}
// Extract text from first content item
const transcribedText = messageContent[0]?.text || '';
return { text: transcribedText };
}
/**
* Get current ASR configuration from settings store
* Note: This function should only be called in browser context
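 *
 * @example
 * // Browser-side sketch: resolve the active config before sending audio to the server
 * const config = await getCurrentASRConfig();
 * if (config.providerId === 'browser-native') {
 *   // Transcribe locally via the useBrowserASR hook instead of a server round-trip
 * }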
*/
export async function getCurrentASRConfig(): Promise<ASRModelConfig> {
if (typeof window === 'undefined') {
throw new Error('getCurrentASRConfig() can only be called in browser context');
}
// Lazy import to avoid circular dependency
const { useSettingsStore } = await import('@/lib/store/settings');
const { asrProviderId, asrLanguage, asrProvidersConfig } = useSettingsStore.getState();
const providerConfig = asrProvidersConfig?.[asrProviderId];
return {
providerId: asrProviderId,
modelId:
providerConfig?.modelId ||
ASR_PROVIDERS[asrProviderId as keyof typeof ASR_PROVIDERS]?.defaultModelId ||
'',
apiKey: providerConfig?.apiKey,
baseUrl: providerConfig?.baseUrl || providerConfig?.customDefaultBaseUrl,
language: asrLanguage,
};
}
// Re-export from constants for convenience
export { getAllASRProviders, getASRProvider, getASRSupportedLanguages } from './constants';