| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import type { TTSModelConfig } from './types'; |
| import { isCustomTTSProvider } from './types'; |
| import { TTS_PROVIDERS } from './constants'; |
| import { |
| VOXCPM_VLLM_MODEL_ID, |
| VOXCPM_AUTO_VOICE_ID, |
| normalizeVoxCPMBackend, |
| type VoxCPMProviderOptions, |
| } from './voxcpm'; |
|
|
| |
| |
| |
/**
 * Result of a successful TTS generation: the synthesized audio bytes plus a
 * short format token ('mp3', 'wav', ...) so callers can pick content types
 * and file extensions.
 */
export interface TTSGenerationResult {
  audio: Uint8Array;
  format: string;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| export class TTSRateLimitError extends Error { |
| constructor( |
| public readonly provider: string, |
| message: string, |
| ) { |
| super(message); |
| this.name = 'TTSRateLimitError'; |
| } |
| } |
|
|
| |
| |
| |
| export async function generateTTS( |
| config: TTSModelConfig, |
| text: string, |
| ): Promise<TTSGenerationResult> { |
| const provider = TTS_PROVIDERS[config.providerId as keyof typeof TTS_PROVIDERS]; |
|
|
| |
| if (provider?.requiresApiKey && !config.apiKey) { |
| throw new Error(`API key required for TTS provider: ${config.providerId}`); |
| } |
|
|
| switch (config.providerId) { |
| case 'openai-tts': |
| return await generateOpenAITTS(config, text); |
|
|
| case 'azure-tts': |
| return await generateAzureTTS(config, text); |
|
|
| case 'glm-tts': |
| return await generateGLMTTS(config, text); |
|
|
| case 'qwen-tts': |
| return await generateQwenTTS(config, text); |
|
|
| case 'voxcpm-tts': |
| return await generateVoxCPMTTS(config, text); |
|
|
| case 'minimax-tts': |
| return await generateMiniMaxTTS(config, text); |
| case 'doubao-tts': |
| return await generateDoubaoTTS(config, text); |
| case 'elevenlabs-tts': |
| return await generateElevenLabsTTS(config, text); |
|
|
| case 'browser-native-tts': |
| throw new Error( |
| 'Browser Native TTS must be handled client-side using Web Speech API. This provider cannot be used on the server.', |
| ); |
|
|
| default: |
| if (isCustomTTSProvider(config.providerId)) { |
| return await generateOpenAITTS(config, text); |
| } |
| throw new Error(`Unsupported TTS provider: ${config.providerId}`); |
| } |
| } |
|
|
| |
| |
| |
| async function generateOpenAITTS( |
| config: TTSModelConfig, |
| text: string, |
| ): Promise<TTSGenerationResult> { |
| const baseUrl = config.baseUrl || TTS_PROVIDERS['openai-tts'].defaultBaseUrl; |
|
|
| |
| const response = await fetch(`${baseUrl}/audio/speech`, { |
| method: 'POST', |
| headers: { |
| Authorization: `Bearer ${config.apiKey}`, |
| 'Content-Type': 'application/json; charset=utf-8', |
| }, |
| body: JSON.stringify({ |
| model: config.modelId || 'gpt-4o-mini-tts', |
| input: text, |
| voice: config.voice, |
| speed: config.speed || 1.0, |
| }), |
| }); |
|
|
| if (!response.ok) { |
| const error = await response.json().catch(() => ({ error: response.statusText })); |
| throw new Error(`OpenAI TTS API error: ${error.error?.message || response.statusText}`); |
| } |
|
|
| const arrayBuffer = await response.arrayBuffer(); |
| const contentType = response.headers.get('content-type') || ''; |
| const format = getAudioResponseFormat(contentType); |
| return { |
| audio: new Uint8Array(arrayBuffer), |
| format, |
| }; |
| } |
|
|
| |
| |
| |
| |
| |
| |
/**
 * Generate speech via a self-hosted VoxCPM server.
 *
 * Supports three backends (selected via providerOptions.backend):
 * 'nano-vllm', 'python-api', and the default vLLM-Omni OpenAI-compatible
 * server. When both promptText and referenceAudioBase64 are present the
 * request uses prompt continuation (voice cloning) and the voice-prompt
 * prefix is skipped.
 */
async function generateVoxCPMTTS(
  config: TTSModelConfig,
  text: string,
): Promise<TTSGenerationResult> {
  // Normalize the base URL: strip a single trailing slash so path joins stay clean.
  const baseUrl = (config.baseUrl || TTS_PROVIDERS['voxcpm-tts'].defaultBaseUrl || '').replace(
    /\/$/,
    '',
  );
  if (!baseUrl) {
    throw new Error('VoxCPM base URL is required');
  }

  const options = (config.providerOptions || {}) as VoxCPMProviderOptions;
  const backend = normalizeVoxCPMBackend(options.backend);
  // Voice-prompt precedence: explicit option first, then config.voice unless
  // it is the 'default' or auto-voice sentinel.
  const voicePrompt =
    options.voicePrompt ||
    (config.voice && config.voice !== 'default' && config.voice !== VOXCPM_AUTO_VOICE_ID
      ? config.voice
      : undefined);
  // Auto voice only works when the caller supplied a prompt via options.
  if (config.voice === VOXCPM_AUTO_VOICE_ID && !voicePrompt) {
    throw new Error('VoxCPM Auto Voice requires agent context');
  }
  const cfgValue = options.cfgValue ?? 2.0;
  const inferenceTimesteps = options.inferenceTimesteps ?? 10;
  const normalize = options.normalize ?? false;
  const denoise = options.denoise ?? false;
  // Prompt continuation (cloning) needs both a transcript and reference audio.
  const usePromptContinuation = Boolean(options.promptText?.trim() && options.referenceAudioBase64);

  const request = {
    // With prompt continuation the style prefix would pollute the transcript.
    targetText: usePromptContinuation ? text : buildVoxCPMTargetText(text, voicePrompt),
    voicePrompt,
    promptText: options.promptText,
    cfgValue,
    inferenceTimesteps,
    normalize,
    denoise,
    referenceAudioBase64: options.referenceAudioBase64,
    referenceAudioMimeType: options.referenceAudioMimeType,
    referenceAudioName: options.referenceAudioName,
  };

  // Each backend speaks a different wire protocol; see the post* helpers below.
  const response =
    backend === 'nano-vllm'
      ? await postVoxCPMNanoVLLM(baseUrl, request, config.apiKey)
      : backend === 'python-api'
        ? await postVoxCPMPythonAPI(baseUrl, request, config.apiKey)
        : await postVoxCPMVLLMOmni(baseUrl, request, config);

  if (!response.ok) {
    throw new Error(`VoxCPM TTS API error: ${await readTTSApiError(response)}`);
  }

  const arrayBuffer = await response.arrayBuffer();
  const contentType = response.headers.get('content-type') || '';
  const format = getAudioResponseFormat(contentType);
  return {
    audio: new Uint8Array(arrayBuffer),
    format,
  };
}
|
|
| function buildVoxCPMTargetText(text: string, voicePrompt?: string): string { |
| const prompt = voicePrompt |
| ?.replace(/[\p{C}]+/gu, ' ') |
| .replace(/[()()]/gu, '') |
| .replace(/\s+/gu, ' ') |
| .trim(); |
| return prompt ? `(${prompt})${text}` : text; |
| } |
|
|
| function getAudioResponseFormat(contentType: string): string { |
| if (contentType.includes('audio/wav') || contentType.includes('audio/x-wav')) return 'wav'; |
| if (contentType.includes('audio/mpeg') || contentType.includes('audio/mp3')) return 'mp3'; |
| if (contentType.includes('audio/flac')) return 'flac'; |
| if (contentType.includes('audio/ogg')) return 'ogg'; |
| if (contentType.includes('audio/webm')) return 'webm'; |
| return 'mp3'; |
| } |
|
|
| function getVoxCPMAudioFormat(mimeType?: string, fileName?: string): string { |
| const lowerName = fileName?.toLowerCase() || ''; |
| if (mimeType?.includes('wav') || lowerName.endsWith('.wav')) return 'wav'; |
| if (mimeType?.includes('mpeg') || mimeType?.includes('mp3') || lowerName.endsWith('.mp3')) { |
| return 'mp3'; |
| } |
| if (mimeType?.includes('flac') || lowerName.endsWith('.flac')) return 'flac'; |
| if (mimeType?.includes('ogg') || lowerName.endsWith('.ogg')) return 'ogg'; |
| if (mimeType?.includes('webm') || lowerName.endsWith('.webm')) return 'webm'; |
| return 'wav'; |
| } |
|
|
| function getVLLMOmniSpeechUrl(baseUrl: string): string { |
| return baseUrl.endsWith('/v1') ? `${baseUrl}/audio/speech` : `${baseUrl}/v1/audio/speech`; |
| } |
|
|
| function getVLLMOmniModelId(config: TTSModelConfig): string { |
| const modelId = config.modelId?.trim(); |
| if (!modelId || modelId === 'VoxCPM2') return VOXCPM_VLLM_MODEL_ID; |
| return modelId; |
| } |
|
|
| function getBackendAuthHeaders(apiKey?: string): Record<string, string> { |
| return apiKey?.trim() ? { Authorization: `Bearer ${apiKey.trim()}` } : {}; |
| } |
|
|
| async function postVoxCPMVLLMOmni( |
| baseUrl: string, |
| params: { |
| targetText: string; |
| promptText?: string; |
| referenceAudioBase64?: string; |
| referenceAudioMimeType?: string; |
| referenceAudioName?: string; |
| }, |
| config: TTSModelConfig, |
| ): Promise<Response> { |
| const payload: Record<string, unknown> = { |
| model: getVLLMOmniModelId(config), |
| input: params.targetText, |
| |
| voice: 'default', |
| response_format: 'wav', |
| stream: false, |
| }; |
|
|
| if (params.referenceAudioBase64) { |
| const referenceAudio = getVoxCPMDataAudioUrl( |
| params.referenceAudioBase64, |
| params.referenceAudioMimeType, |
| params.referenceAudioName, |
| ); |
| payload.ref_audio = referenceAudio; |
| if (params.promptText?.trim()) { |
| payload.prompt_audio = referenceAudio; |
| payload.prompt_text = params.promptText.trim(); |
| } |
| } |
|
|
| return fetch(getVLLMOmniSpeechUrl(baseUrl), { |
| method: 'POST', |
| headers: { |
| 'Content-Type': 'application/json; charset=utf-8', |
| ...getBackendAuthHeaders(config.apiKey), |
| }, |
| body: JSON.stringify(payload), |
| }); |
| } |
|
|
| function getVoxCPMDataAudioUrl(base64: string, mimeType?: string, fileName?: string): string { |
| const format = getVoxCPMAudioFormat(mimeType, fileName); |
| const mediaType = |
| mimeType?.trim() || |
| (format === 'mp3' |
| ? 'audio/mpeg' |
| : format === 'flac' |
| ? 'audio/flac' |
| : format === 'ogg' |
| ? 'audio/ogg' |
| : format === 'webm' |
| ? 'audio/webm' |
| : 'audio/wav'); |
| return `data:${mediaType};base64,${base64}`; |
| } |
|
|
| function base64ToBlob(base64: string, mimeType?: string): Blob { |
| const binary = atob(base64); |
| const bytes = new Uint8Array(binary.length); |
| for (let index = 0; index < binary.length; index++) { |
| bytes[index] = binary.charCodeAt(index); |
| } |
| return new Blob([bytes], { type: mimeType || 'audio/wav' }); |
| } |
|
|
| async function postVoxCPMPythonAPI( |
| baseUrl: string, |
| params: { |
| targetText: string; |
| promptText?: string; |
| cfgValue: number; |
| inferenceTimesteps: number; |
| normalize: boolean; |
| denoise: boolean; |
| referenceAudioBase64?: string; |
| referenceAudioMimeType?: string; |
| referenceAudioName?: string; |
| }, |
| apiKey?: string, |
| ): Promise<Response> { |
| const formData = new FormData(); |
| formData.set('text', params.targetText); |
| formData.set('cfg_value', String(params.cfgValue)); |
| formData.set('inference_timesteps', String(params.inferenceTimesteps)); |
| formData.set('normalize', String(params.normalize)); |
| formData.set('denoise', String(params.denoise)); |
|
|
| if (params.referenceAudioBase64) { |
| const audioBlob = base64ToBlob(params.referenceAudioBase64, params.referenceAudioMimeType); |
| const audioName = params.referenceAudioName || 'reference.wav'; |
| formData.set('reference_audio', audioBlob, audioName); |
| if (params.promptText?.trim()) { |
| formData.set('prompt_audio', audioBlob, audioName); |
| formData.set('prompt_text', params.promptText.trim()); |
| } |
| } |
|
|
| return fetch(`${baseUrl}/tts/upload`, { |
| method: 'POST', |
| headers: getBackendAuthHeaders(apiKey), |
| body: formData, |
| }); |
| } |
|
|
| async function postVoxCPMNanoVLLM( |
| baseUrl: string, |
| params: { |
| targetText: string; |
| promptText?: string; |
| cfgValue: number; |
| referenceAudioBase64?: string; |
| referenceAudioMimeType?: string; |
| referenceAudioName?: string; |
| }, |
| apiKey?: string, |
| ): Promise<Response> { |
| const payload: Record<string, unknown> = { |
| target_text: params.targetText, |
| cfg_value: params.cfgValue, |
| }; |
|
|
| if (params.referenceAudioBase64) { |
| const format = getVoxCPMAudioFormat(params.referenceAudioMimeType, params.referenceAudioName); |
| payload.ref_audio_wav_base64 = params.referenceAudioBase64; |
| payload.ref_audio_wav_format = format; |
| if (params.promptText?.trim()) { |
| payload.prompt_wav_base64 = params.referenceAudioBase64; |
| payload.prompt_wav_format = format; |
| payload.prompt_text = params.promptText.trim(); |
| } |
| } |
|
|
| return fetch(`${baseUrl}/generate`, { |
| method: 'POST', |
| headers: { |
| 'Content-Type': 'application/json; charset=utf-8', |
| ...getBackendAuthHeaders(apiKey), |
| }, |
| body: JSON.stringify(payload), |
| }); |
| } |
|
|
| async function readTTSApiError(response: Response): Promise<string> { |
| const text = await response.text().catch(() => response.statusText); |
| if (!text) return response.statusText; |
| try { |
| const json = JSON.parse(text) as { detail?: unknown; error?: { message?: string } | string }; |
| if (typeof json.detail === 'string') return json.detail; |
| if (typeof json.error === 'string') return json.error; |
| if (json.error?.message) return json.error.message; |
| } catch { |
| |
| } |
| return text; |
| } |
|
|
| |
| |
| |
/**
 * Generate speech via Azure Cognitive Services TTS (SSML over HTTP).
 *
 * Builds an SSML document (language hardcoded to zh-CN) and requests
 * 16 kHz / 128 kbit mono mp3 output. `config.speed` becomes a prosody rate
 * percentage: 1.0 -> '0%', 1.2 -> '20%', 0.8 -> '-20%'.
 */
async function generateAzureTTS(
  config: TTSModelConfig,
  text: string,
): Promise<TTSGenerationResult> {
  const baseUrl = config.baseUrl || TTS_PROVIDERS['azure-tts'].defaultBaseUrl;

  // Convert the multiplier into the signed percentage Azure prosody expects.
  const rate = config.speed ? `${((config.speed - 1) * 100).toFixed(0)}%` : '0%';
  // Text is XML-escaped so user input cannot break the SSML markup.
  const ssml = `
<speak version='1.0' xml:lang='zh-CN'>
  <voice xml:lang='zh-CN' name='${config.voice}'>
    <prosody rate='${rate}'>${escapeXml(text)}</prosody>
  </voice>
</speak>
  `.trim();

  const response = await fetch(`${baseUrl}/cognitiveservices/v1`, {
    method: 'POST',
    headers: {
      'Ocp-Apim-Subscription-Key': config.apiKey!,
      'Content-Type': 'application/ssml+xml; charset=utf-8',
      'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
    },
    body: ssml,
  });

  if (!response.ok) {
    throw new Error(`Azure TTS API error: ${response.statusText}`);
  }

  const arrayBuffer = await response.arrayBuffer();
  return {
    audio: new Uint8Array(arrayBuffer),
    format: 'mp3',
  };
}
|
|
| |
| |
| |
| async function generateGLMTTS(config: TTSModelConfig, text: string): Promise<TTSGenerationResult> { |
| const baseUrl = config.baseUrl || TTS_PROVIDERS['glm-tts'].defaultBaseUrl; |
|
|
| const response = await fetch(`${baseUrl}/audio/speech`, { |
| method: 'POST', |
| headers: { |
| Authorization: `Bearer ${config.apiKey}`, |
| 'Content-Type': 'application/json; charset=utf-8', |
| }, |
| body: JSON.stringify({ |
| model: config.modelId || 'glm-tts', |
| input: text, |
| voice: config.voice, |
| speed: config.speed || 1.0, |
| volume: 1.0, |
| response_format: 'wav', |
| }), |
| }); |
|
|
| if (!response.ok) { |
| const errorText = await response.text().catch(() => response.statusText); |
| let errorMessage = `GLM TTS API error: ${errorText}`; |
| try { |
| const errorJson = JSON.parse(errorText); |
| if (errorJson.error?.message) { |
| errorMessage = `GLM TTS API error: ${errorJson.error.message} (code: ${errorJson.error.code})`; |
| } |
| } catch { |
| |
| } |
| throw new Error(errorMessage); |
| } |
|
|
| const arrayBuffer = await response.arrayBuffer(); |
| return { |
| audio: new Uint8Array(arrayBuffer), |
| format: 'wav', |
| }; |
| } |
|
|
| |
| |
| |
| async function generateQwenTTS(config: TTSModelConfig, text: string): Promise<TTSGenerationResult> { |
| const baseUrl = config.baseUrl || TTS_PROVIDERS['qwen-tts'].defaultBaseUrl; |
|
|
| |
| |
| const rate = Math.round(((config.speed || 1.0) - 1.0) * 500); |
|
|
| const response = await fetch(`${baseUrl}/services/aigc/multimodal-generation/generation`, { |
| method: 'POST', |
| headers: { |
| Authorization: `Bearer ${config.apiKey}`, |
| 'Content-Type': 'application/json; charset=utf-8', |
| }, |
| body: JSON.stringify({ |
| model: config.modelId || 'qwen3-tts-flash', |
| input: { |
| text, |
| voice: config.voice, |
| language_type: 'Chinese', |
| }, |
| parameters: { |
| rate, |
| }, |
| }), |
| }); |
|
|
| if (!response.ok) { |
| const errorText = await response.text().catch(() => response.statusText); |
| throw new Error(`Qwen TTS API error: ${errorText}`); |
| } |
|
|
| const data = await response.json(); |
|
|
| |
| if (!data.output?.audio?.url) { |
| throw new Error(`Qwen TTS error: No audio URL in response. Response: ${JSON.stringify(data)}`); |
| } |
|
|
| |
| const audioUrl = data.output.audio.url; |
| const audioResponse = await fetch(audioUrl); |
|
|
| if (!audioResponse.ok) { |
| throw new Error(`Failed to download audio from URL: ${audioResponse.statusText}`); |
| } |
|
|
| const arrayBuffer = await audioResponse.arrayBuffer(); |
|
|
| return { |
| audio: new Uint8Array(arrayBuffer), |
| format: 'wav', |
| }; |
| } |
|
|
| |
| |
| |
| async function generateMiniMaxTTS( |
| config: TTSModelConfig, |
| text: string, |
| ): Promise<TTSGenerationResult> { |
| const baseUrl = (config.baseUrl || TTS_PROVIDERS['minimax-tts'].defaultBaseUrl || '').replace( |
| /\/$/, |
| '', |
| ); |
| const response = await fetch(`${baseUrl}/v1/t2a_v2`, { |
| method: 'POST', |
| headers: { |
| Authorization: `Bearer ${config.apiKey}`, |
| 'Content-Type': 'application/json; charset=utf-8', |
| }, |
| body: JSON.stringify({ |
| model: config.modelId || 'speech-2.8-hd', |
| text, |
| stream: false, |
| output_format: 'hex', |
| voice_setting: { |
| voice_id: config.voice, |
| speed: config.speed || 1.0, |
| vol: 1, |
| pitch: 0, |
| }, |
| audio_setting: { |
| sample_rate: 32000, |
| bitrate: 128000, |
| format: config.format || 'mp3', |
| channel: 1, |
| }, |
| language_boost: 'auto', |
| }), |
| }); |
|
|
| if (!response.ok) { |
| const errorText = await response.text().catch(() => response.statusText); |
| throw new Error(`MiniMax TTS API error: ${errorText}`); |
| } |
|
|
| const data = await response.json(); |
| const hexAudio = data?.data?.audio; |
| if (!hexAudio || typeof hexAudio !== 'string') { |
| throw new Error(`MiniMax TTS error: No audio returned. Response: ${JSON.stringify(data)}`); |
| } |
|
|
| const cleanedHex = hexAudio.trim(); |
| if (cleanedHex.length % 2 !== 0) { |
| throw new Error('MiniMax TTS error: invalid hex audio payload length'); |
| } |
|
|
| const audio = new Uint8Array( |
| cleanedHex.match(/.{1,2}/g)?.map((byte: string) => parseInt(byte, 16)) || [], |
| ); |
| return { |
| audio, |
| format: data?.extra_info?.audio_format || config.format || 'mp3', |
| }; |
| } |
|
|
| |
| |
| |
| async function generateElevenLabsTTS( |
| config: TTSModelConfig, |
| text: string, |
| ): Promise<TTSGenerationResult> { |
| const baseUrl = config.baseUrl || TTS_PROVIDERS['elevenlabs-tts'].defaultBaseUrl; |
| const requestedFormat = config.format || 'mp3'; |
| const clampedSpeed = Math.min(1.2, Math.max(0.7, config.speed || 1.0)); |
| const outputFormatMap: Record<string, string> = { |
| mp3: 'mp3_44100_128', |
| opus: 'opus_48000_96', |
| pcm: 'pcm_44100', |
| wav: 'wav_44100', |
| ulaw: 'ulaw_8000', |
| alaw: 'alaw_8000', |
| }; |
| const outputFormat = outputFormatMap[requestedFormat] || outputFormatMap.mp3; |
|
|
| const response = await fetch( |
| `${baseUrl}/text-to-speech/${encodeURIComponent(config.voice)}?output_format=${outputFormat}`, |
| { |
| method: 'POST', |
| headers: { |
| 'xi-api-key': config.apiKey!, |
| 'Content-Type': 'application/json; charset=utf-8', |
| }, |
| body: JSON.stringify({ |
| text, |
| model_id: config.modelId || 'eleven_multilingual_v2', |
| voice_settings: { |
| stability: 0.5, |
| similarity_boost: 0.75, |
| speed: clampedSpeed, |
| }, |
| }), |
| }, |
| ); |
|
|
| if (!response.ok) { |
| const errorText = await response.text().catch(() => response.statusText); |
| throw new Error(`ElevenLabs TTS API error: ${errorText || response.statusText}`); |
| } |
|
|
| const arrayBuffer = await response.arrayBuffer(); |
| return { |
| audio: new Uint8Array(arrayBuffer), |
| format: requestedFormat, |
| }; |
| } |
|
|
| |
| |
| |
| |
/**
 * Read the user's current TTS selection from the client settings store.
 *
 * Browser-only: throws when called on the server (no `window`). The store is
 * imported dynamically so this module can still be loaded in server bundles.
 */
export async function getCurrentTTSConfig(): Promise<TTSModelConfig> {
  if (typeof window === 'undefined') {
    throw new Error('getCurrentTTSConfig() can only be called in browser context');
  }

  // Dynamic import keeps the client settings store out of server code paths.
  const { useSettingsStore } = await import('@/lib/store/settings');
  const { ttsProviderId, ttsVoice, ttsSpeed, ttsProvidersConfig } = useSettingsStore.getState();

  const providerConfig = ttsProvidersConfig?.[ttsProviderId];

  return {
    providerId: ttsProviderId,
    // Fall back to the provider's default model when none is configured.
    modelId:
      providerConfig?.modelId ||
      TTS_PROVIDERS[ttsProviderId as keyof typeof TTS_PROVIDERS]?.defaultModelId ||
      '',
    apiKey: providerConfig?.apiKey,
    baseUrl: providerConfig?.baseUrl || providerConfig?.customDefaultBaseUrl,
    voice: ttsVoice,
    speed: ttsSpeed,
  };
}
|
|
| |
| export { getAllTTSProviders, getTTSProvider, getTTSVoices } from './constants'; |
|
|
| |
| |
| |
/**
 * Generate speech via Volcengine Doubao (Seed TTS).
 *
 * The API key is a composite "appId:accessKey" pair. The unidirectional
 * endpoint answers with a stream of concatenated JSON objects, each carrying
 * a base64 audio chunk; this function splits them with a brace-depth scan,
 * decodes each chunk, and concatenates the audio into one mp3 buffer.
 *
 * @throws TTSRateLimitError when the service reports a concurrency-quota code.
 */
async function generateDoubaoTTS(
  config: TTSModelConfig,
  text: string,
): Promise<TTSGenerationResult> {
  // Split the composite credential at the first colon.
  const colonIdx = (config.apiKey || '').indexOf(':');
  if (colonIdx <= 0) {
    throw new Error(
      'Doubao TTS requires API key in format "appId:accessKey". Get both from the Volcengine console.',
    );
  }
  const appId = config.apiKey!.slice(0, colonIdx);
  const accessKey = config.apiKey!.slice(colonIdx + 1);

  const baseUrl = config.baseUrl || TTS_PROVIDERS['doubao-tts'].defaultBaseUrl;
  // speech_rate is a percentage offset: 1.0x -> 0, 1.5x -> 50, 0.8x -> -20.
  const speechRate = Math.round(((config.speed || 1.0) - 1.0) * 100);

  const response = await fetch(`${baseUrl}/unidirectional`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'X-Api-App-Id': appId,
      'X-Api-Access-Key': accessKey,
      'X-Api-Resource-Id': 'seed-tts-2.0',
    },
    body: JSON.stringify({
      user: { uid: 'multimind' },
      req_params: {
        text,
        speaker: config.voice,
        audio_params: { format: 'mp3', sample_rate: 24000, speech_rate: speechRate },
      },
    }),
  });

  if (!response.ok) {
    const errorText = await response.text().catch(() => response.statusText);
    throw new Error(`Doubao TTS API error (${response.status}): ${errorText}`);
  }

  // The body is a sequence of JSON objects with no delimiter between them.
  // Scan with a brace-depth counter to find each top-level {...} span.
  // NOTE(review): this assumes no unbalanced braces inside JSON strings —
  // base64 data and the known message fields satisfy this, but confirm
  // against the API contract.
  const responseText = await response.text();
  const audioChunks: Uint8Array[] = [];

  let depth = 0;
  let start = -1;
  for (let i = 0; i < responseText.length; i++) {
    if (responseText[i] === '{') {
      if (depth === 0) start = i;
      depth++;
    } else if (responseText[i] === '}') {
      depth--;
      if (depth === 0 && start >= 0) {
        let chunk: { code: number; message?: string; data?: string };
        try {
          chunk = JSON.parse(responseText.slice(start, i + 1));
        } catch {
          // Skip fragments that do not parse as standalone JSON.
          start = -1;
          continue;
        }
        start = -1;

        if (chunk.code === 0 && chunk.data) {
          // code 0 carries a base64-encoded audio fragment.
          audioChunks.push(new Uint8Array(Buffer.from(chunk.data, 'base64')));
        } else if (chunk.code === 20000000) {
          // Treated here as the end-of-stream marker: stop scanning.
          break;
        } else if (chunk.code && chunk.code !== 0) {
          // 45000000 / 45000292 are mapped to the retryable rate-limit error.
          if (chunk.code === 45000000 || chunk.code === 45000292) {
            throw new TTSRateLimitError(
              'doubao-tts',
              chunk.message || 'concurrency quota exceeded',
            );
          }
          throw new Error(`Doubao TTS error: ${chunk.message || 'unknown'} (code: ${chunk.code})`);
        }
      }
    }
  }

  if (audioChunks.length === 0) {
    throw new Error('Doubao TTS: no audio data received');
  }

  // Concatenate all audio fragments into one contiguous buffer.
  const totalLength = audioChunks.reduce((sum, c) => sum + c.length, 0);
  const combined = new Uint8Array(totalLength);
  let offset = 0;
  for (const chunk of audioChunks) {
    combined.set(chunk, offset);
    offset += chunk.length;
  }

  return { audio: combined, format: 'mp3' };
}
|
|
| |
| |
| |
| function escapeXml(text: string): string { |
| return text |
| .replace(/&/g, '&') |
| .replace(/</g, '<') |
| .replace(/>/g, '>') |
| .replace(/"/g, '"') |
| .replace(/'/g, '''); |
| } |
|
|