/**
* TTS (Text-to-Speech) Provider Implementation
*
 * Factory pattern for routing TTS requests to the appropriate provider implementation.
* Follows the same architecture as lib/ai/providers.ts for consistency.
*
* Currently Supported Providers:
* - OpenAI TTS: https://platform.openai.com/docs/guides/text-to-speech
* - Azure TTS: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/text-to-speech
* - GLM TTS: https://docs.bigmodel.cn/cn/guide/models/sound-and-video/glm-tts
 * - Qwen TTS: https://bailian.console.aliyun.com/
 * - VoxCPM TTS: self-hosted VoxCPM server (vLLM-Omni, Python API, or nano-vllm backend)
* - MiniMax TTS: https://platform.minimaxi.com/docs/api-reference/speech-t2a-http
* - Doubao TTS: https://www.volcengine.com/docs/6561/1257543
* - ElevenLabs TTS: https://elevenlabs.io/docs/api-reference/text-to-speech/convert
* - Browser Native: Web Speech API (client-side only)
*
 * HOW TO ADD A NEW PROVIDER (ElevenLabs serves as the worked example throughout;
 * it has since been implemented in this file, so the snippets below also double
 * as a reference for a real integration):
*
* 1. Add provider ID to TTSProviderId in lib/audio/types.ts
* Example: | 'elevenlabs-tts'
*
* 2. Add provider configuration to lib/audio/constants.ts
* Example:
* 'elevenlabs-tts': {
* id: 'elevenlabs-tts',
* name: 'ElevenLabs',
* requiresApiKey: true,
* defaultBaseUrl: 'https://api.elevenlabs.io/v1',
* icon: '/logos/elevenlabs.svg',
* voices: [...],
* supportedFormats: ['mp3', 'pcm'],
* speedRange: { min: 0.5, max: 2.0, default: 1.0 }
* }
*
* 3. Implement provider function in this file
* Pattern: async function generateXxxTTS(config, text): Promise<TTSGenerationResult>
* - Validate config and build API request
* - Handle API authentication (apiKey, headers)
* - Convert provider-specific parameters (voice, speed, format)
* - Return { audio: Uint8Array, format: string }
*
* Example:
* async function generateElevenLabsTTS(
* config: TTSModelConfig,
* text: string
* ): Promise<TTSGenerationResult> {
* const baseUrl = config.baseUrl || TTS_PROVIDERS['elevenlabs-tts'].defaultBaseUrl;
*
* const response = await fetch(`${baseUrl}/text-to-speech/${config.voice}`, {
* method: 'POST',
* headers: {
* 'xi-api-key': config.apiKey!,
* 'Content-Type': 'application/json',
* },
* body: JSON.stringify({
* text,
* model_id: 'eleven_multilingual_v2',
* voice_settings: {
* stability: 0.5,
* similarity_boost: 0.75,
* }
* }),
* });
*
* if (!response.ok) {
* throw new Error(`ElevenLabs TTS API error: ${response.statusText}`);
* }
*
* const arrayBuffer = await response.arrayBuffer();
* return {
* audio: new Uint8Array(arrayBuffer),
* format: 'mp3',
* };
* }
*
* 4. Add case to generateTTS() switch statement
* case 'elevenlabs-tts':
* return await generateElevenLabsTTS(config, text);
*
* 5. Add i18n translations in lib/i18n.ts
* providerElevenLabsTTS: { zh: 'ElevenLabs TTS', en: 'ElevenLabs TTS' }
*
* Error Handling Patterns:
* - Always validate API key if requiresApiKey is true
* - Throw descriptive errors for API failures
* - Include response.statusText or error messages from API
* - For client-only providers (browser-native), throw error directing to client-side usage
*
 * API Call Patterns:
 * - Direct API: use fetch with the provider's headers and body format (preferred; gives explicit control over UTF-8 encoding)
 * - SSML: for Azure-like providers that require SSML markup
 * - URL-based: for providers that return an audio URL (downloaded in a second step)
*/
import type { TTSModelConfig } from './types';
import { isCustomTTSProvider } from './types';
import { TTS_PROVIDERS } from './constants';
import {
VOXCPM_VLLM_MODEL_ID,
VOXCPM_AUTO_VOICE_ID,
normalizeVoxCPMBackend,
type VoxCPMProviderOptions,
} from './voxcpm';
/**
* Result of TTS generation
*/
export interface TTSGenerationResult {
audio: Uint8Array;
format: string;
}
/**
* Thrown when a TTS provider returns a rate-limit / concurrency-quota error.
* Allows downstream consumers to distinguish rate-limit errors from other TTS failures.
*
* TODO: The API route currently catches all errors uniformly as GENERATION_FAILED.
* This class enables future retry/backoff logic without changing the throw sites.
*/
export class TTSRateLimitError extends Error {
constructor(
public readonly provider: string,
message: string,
) {
super(message);
this.name = 'TTSRateLimitError';
}
}
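/*
 * Illustrative retry sketch (not wired into the API route yet — it currently maps
 * every failure to GENERATION_FAILED). `maxAttempts` and the backoff delays are
 * arbitrary placeholders, not project conventions:
 *
 * async function generateTTSWithRetry(
 *   config: TTSModelConfig,
 *   text: string,
 *   maxAttempts = 3,
 * ): Promise<TTSGenerationResult> {
 *   for (let attempt = 1; ; attempt++) {
 *     try {
 *       return await generateTTS(config, text);
 *     } catch (error) {
 *       // Only rate-limit errors are worth retrying; rethrow everything else.
 *       if (!(error instanceof TTSRateLimitError) || attempt >= maxAttempts) throw error;
 *       // Exponential backoff: 1s, 2s, 4s, ...
 *       await new Promise((resolve) => setTimeout(resolve, 1000 * 2 ** (attempt - 1)));
 *     }
 *   }
 * }
 */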
/**
* Generate speech using specified TTS provider
*/
export async function generateTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const provider = TTS_PROVIDERS[config.providerId as keyof typeof TTS_PROVIDERS];
// Validate API key if required (only for built-in providers with known config)
if (provider?.requiresApiKey && !config.apiKey) {
throw new Error(`API key required for TTS provider: ${config.providerId}`);
}
switch (config.providerId) {
case 'openai-tts':
return await generateOpenAITTS(config, text);
case 'azure-tts':
return await generateAzureTTS(config, text);
case 'glm-tts':
return await generateGLMTTS(config, text);
case 'qwen-tts':
return await generateQwenTTS(config, text);
case 'voxcpm-tts':
return await generateVoxCPMTTS(config, text);
case 'minimax-tts':
return await generateMiniMaxTTS(config, text);
case 'doubao-tts':
return await generateDoubaoTTS(config, text);
case 'elevenlabs-tts':
return await generateElevenLabsTTS(config, text);
case 'browser-native-tts':
throw new Error(
'Browser Native TTS must be handled client-side using Web Speech API. This provider cannot be used on the server.',
);
default:
if (isCustomTTSProvider(config.providerId)) {
return await generateOpenAITTS(config, text);
}
throw new Error(`Unsupported TTS provider: ${config.providerId}`);
}
}
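/*
 * Usage sketch (server-side; the values are illustrative — real call sites build
 * the config from the settings store or the request body):
 *
 * const { audio, format } = await generateTTS(
 *   {
 *     providerId: 'openai-tts',
 *     modelId: 'gpt-4o-mini-tts',
 *     apiKey: process.env.OPENAI_API_KEY,
 *     voice: 'alloy',
 *     speed: 1.0,
 *   },
 *   'Hello from MultiMind Classroom',
 * );
 * // `audio` is a Uint8Array of encoded bytes; `format` is e.g. 'mp3' or 'wav'.
 */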
/**
* OpenAI TTS implementation (direct API call with explicit UTF-8 encoding)
*/
async function generateOpenAITTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const baseUrl = config.baseUrl || TTS_PROVIDERS['openai-tts'].defaultBaseUrl;
  // Default to gpt-4o-mini-tts when no model is configured.
const response = await fetch(`${baseUrl}/audio/speech`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json; charset=utf-8',
},
body: JSON.stringify({
model: config.modelId || 'gpt-4o-mini-tts',
input: text,
voice: config.voice,
speed: config.speed || 1.0,
}),
});
if (!response.ok) {
const error = await response.json().catch(() => ({ error: response.statusText }));
throw new Error(`OpenAI TTS API error: ${error.error?.message || response.statusText}`);
}
const arrayBuffer = await response.arrayBuffer();
const contentType = response.headers.get('content-type') || '';
const format = getAudioResponseFormat(contentType);
return {
audio: new Uint8Array(arrayBuffer),
format,
};
}
/**
* VoxCPM2 TTS implementation.
*
* MultiMind Classroom keeps one internal VoxCPM request shape, then adapts it to the
* selected official backend protocol.
*/
async function generateVoxCPMTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const baseUrl = (config.baseUrl || TTS_PROVIDERS['voxcpm-tts'].defaultBaseUrl || '').replace(
/\/$/,
'',
);
if (!baseUrl) {
throw new Error('VoxCPM base URL is required');
}
const options = (config.providerOptions || {}) as VoxCPMProviderOptions;
const backend = normalizeVoxCPMBackend(options.backend);
const voicePrompt =
options.voicePrompt ||
(config.voice && config.voice !== 'default' && config.voice !== VOXCPM_AUTO_VOICE_ID
? config.voice
: undefined);
if (config.voice === VOXCPM_AUTO_VOICE_ID && !voicePrompt) {
throw new Error('VoxCPM Auto Voice requires agent context');
}
const cfgValue = options.cfgValue ?? 2.0;
const inferenceTimesteps = options.inferenceTimesteps ?? 10;
const normalize = options.normalize ?? false;
const denoise = options.denoise ?? false;
const usePromptContinuation = Boolean(options.promptText?.trim() && options.referenceAudioBase64);
const request = {
targetText: usePromptContinuation ? text : buildVoxCPMTargetText(text, voicePrompt),
voicePrompt,
promptText: options.promptText,
cfgValue,
inferenceTimesteps,
normalize,
denoise,
referenceAudioBase64: options.referenceAudioBase64,
referenceAudioMimeType: options.referenceAudioMimeType,
referenceAudioName: options.referenceAudioName,
};
const response =
backend === 'nano-vllm'
? await postVoxCPMNanoVLLM(baseUrl, request, config.apiKey)
: backend === 'python-api'
? await postVoxCPMPythonAPI(baseUrl, request, config.apiKey)
: await postVoxCPMVLLMOmni(baseUrl, request, config);
if (!response.ok) {
throw new Error(`VoxCPM TTS API error: ${await readTTSApiError(response)}`);
}
const arrayBuffer = await response.arrayBuffer();
const contentType = response.headers.get('content-type') || '';
const format = getAudioResponseFormat(contentType);
return {
audio: new Uint8Array(arrayBuffer),
format,
};
}
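/*
 * Example VoxCPM config (illustrative; the URL and option values are placeholders).
 * providerOptions carries the VoxCPMProviderOptions shape from ./voxcpm:
 *
 * const voxConfig: TTSModelConfig = {
 *   providerId: 'voxcpm-tts',
 *   modelId: 'VoxCPM2',
 *   baseUrl: 'http://localhost:8000',
 *   voice: 'default',
 *   providerOptions: {
 *     backend: 'python-api', // or 'nano-vllm'; otherwise the vLLM-Omni adapter is used
 *     cfgValue: 2.0, // forwarded as cfg_value
 *     inferenceTimesteps: 10,
 *     normalize: false,
 *     denoise: false,
 *     // Voice-cloning inputs (all optional):
 *     // promptText, referenceAudioBase64, referenceAudioMimeType, referenceAudioName
 *   },
 * };
 */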
/**
 * Prefix the target text with a sanitized voice-style prompt, e.g.
 * buildVoxCPMTargetText('你好', 'warm narrator') → '(warm narrator)你好'.
 * Control characters and parentheses (ASCII and fullwidth) are stripped from
 * the prompt so it cannot break the (prompt)text framing.
 */
function buildVoxCPMTargetText(text: string, voicePrompt?: string): string {
  const prompt = voicePrompt
    ?.replace(/[\p{C}]+/gu, ' ')
    .replace(/[()（）]/gu, '')
    .replace(/\s+/gu, ' ')
    .trim();
  return prompt ? `(${prompt})${text}` : text;
}
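/** Map a response Content-Type header to a short audio format id (falls back to 'mp3'). */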
function getAudioResponseFormat(contentType: string): string {
if (contentType.includes('audio/wav') || contentType.includes('audio/x-wav')) return 'wav';
if (contentType.includes('audio/mpeg') || contentType.includes('audio/mp3')) return 'mp3';
if (contentType.includes('audio/flac')) return 'flac';
if (contentType.includes('audio/ogg')) return 'ogg';
if (contentType.includes('audio/webm')) return 'webm';
return 'mp3';
}
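/** Infer the reference-audio container format from its MIME type and/or file name (defaults to 'wav'). */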
function getVoxCPMAudioFormat(mimeType?: string, fileName?: string): string {
const lowerName = fileName?.toLowerCase() || '';
if (mimeType?.includes('wav') || lowerName.endsWith('.wav')) return 'wav';
if (mimeType?.includes('mpeg') || mimeType?.includes('mp3') || lowerName.endsWith('.mp3')) {
return 'mp3';
}
if (mimeType?.includes('flac') || lowerName.endsWith('.flac')) return 'flac';
if (mimeType?.includes('ogg') || lowerName.endsWith('.ogg')) return 'ogg';
if (mimeType?.includes('webm') || lowerName.endsWith('.webm')) return 'webm';
return 'wav';
}
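/** Resolve the OpenAI-compatible speech endpoint, tolerating base URLs with or without a /v1 suffix. */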
function getVLLMOmniSpeechUrl(baseUrl: string): string {
return baseUrl.endsWith('/v1') ? `${baseUrl}/audio/speech` : `${baseUrl}/v1/audio/speech`;
}
function getVLLMOmniModelId(config: TTSModelConfig): string {
const modelId = config.modelId?.trim();
if (!modelId || modelId === 'VoxCPM2') return VOXCPM_VLLM_MODEL_ID;
return modelId;
}
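/** Build an optional Bearer Authorization header for self-hosted VoxCPM backends. */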
function getBackendAuthHeaders(apiKey?: string): Record<string, string> {
return apiKey?.trim() ? { Authorization: `Bearer ${apiKey.trim()}` } : {};
}
async function postVoxCPMVLLMOmni(
baseUrl: string,
params: {
targetText: string;
promptText?: string;
referenceAudioBase64?: string;
referenceAudioMimeType?: string;
referenceAudioName?: string;
},
config: TTSModelConfig,
): Promise<Response> {
const payload: Record<string, unknown> = {
model: getVLLMOmniModelId(config),
input: params.targetText,
// VoxCPM2's vLLM-Omni adapter currently ignores named voices; prompts/ref_audio carry voice identity.
voice: 'default',
response_format: 'wav',
stream: false,
};
if (params.referenceAudioBase64) {
const referenceAudio = getVoxCPMDataAudioUrl(
params.referenceAudioBase64,
params.referenceAudioMimeType,
params.referenceAudioName,
);
payload.ref_audio = referenceAudio;
if (params.promptText?.trim()) {
payload.prompt_audio = referenceAudio;
payload.prompt_text = params.promptText.trim();
}
}
return fetch(getVLLMOmniSpeechUrl(baseUrl), {
method: 'POST',
headers: {
'Content-Type': 'application/json; charset=utf-8',
...getBackendAuthHeaders(config.apiKey),
},
body: JSON.stringify(payload),
});
}
function getVoxCPMDataAudioUrl(base64: string, mimeType?: string, fileName?: string): string {
const format = getVoxCPMAudioFormat(mimeType, fileName);
const mediaType =
mimeType?.trim() ||
(format === 'mp3'
? 'audio/mpeg'
: format === 'flac'
? 'audio/flac'
: format === 'ogg'
? 'audio/ogg'
: format === 'webm'
? 'audio/webm'
: 'audio/wav');
return `data:${mediaType};base64,${base64}`;
}
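/** Decode a base64 payload into a Blob for multipart upload (atob and Blob are globals in modern Node and browsers). */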
function base64ToBlob(base64: string, mimeType?: string): Blob {
const binary = atob(base64);
const bytes = new Uint8Array(binary.length);
for (let index = 0; index < binary.length; index++) {
bytes[index] = binary.charCodeAt(index);
}
return new Blob([bytes], { type: mimeType || 'audio/wav' });
}
async function postVoxCPMPythonAPI(
baseUrl: string,
params: {
targetText: string;
promptText?: string;
cfgValue: number;
inferenceTimesteps: number;
normalize: boolean;
denoise: boolean;
referenceAudioBase64?: string;
referenceAudioMimeType?: string;
referenceAudioName?: string;
},
apiKey?: string,
): Promise<Response> {
const formData = new FormData();
formData.set('text', params.targetText);
formData.set('cfg_value', String(params.cfgValue));
formData.set('inference_timesteps', String(params.inferenceTimesteps));
formData.set('normalize', String(params.normalize));
formData.set('denoise', String(params.denoise));
if (params.referenceAudioBase64) {
const audioBlob = base64ToBlob(params.referenceAudioBase64, params.referenceAudioMimeType);
const audioName = params.referenceAudioName || 'reference.wav';
formData.set('reference_audio', audioBlob, audioName);
if (params.promptText?.trim()) {
formData.set('prompt_audio', audioBlob, audioName);
formData.set('prompt_text', params.promptText.trim());
}
}
return fetch(`${baseUrl}/tts/upload`, {
method: 'POST',
headers: getBackendAuthHeaders(apiKey),
body: formData,
});
}
async function postVoxCPMNanoVLLM(
baseUrl: string,
params: {
targetText: string;
promptText?: string;
cfgValue: number;
referenceAudioBase64?: string;
referenceAudioMimeType?: string;
referenceAudioName?: string;
},
apiKey?: string,
): Promise<Response> {
const payload: Record<string, unknown> = {
target_text: params.targetText,
cfg_value: params.cfgValue,
};
if (params.referenceAudioBase64) {
const format = getVoxCPMAudioFormat(params.referenceAudioMimeType, params.referenceAudioName);
payload.ref_audio_wav_base64 = params.referenceAudioBase64;
payload.ref_audio_wav_format = format;
if (params.promptText?.trim()) {
payload.prompt_wav_base64 = params.referenceAudioBase64;
payload.prompt_wav_format = format;
payload.prompt_text = params.promptText.trim();
}
}
return fetch(`${baseUrl}/generate`, {
method: 'POST',
headers: {
'Content-Type': 'application/json; charset=utf-8',
...getBackendAuthHeaders(apiKey),
},
body: JSON.stringify(payload),
});
}
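/**
 * Extract a human-readable message from a failed TTS response, handling
 * `{ detail }` and `{ error }` JSON body shapes and falling back to the raw
 * text or statusText.
 */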
async function readTTSApiError(response: Response): Promise<string> {
const text = await response.text().catch(() => response.statusText);
if (!text) return response.statusText;
try {
const json = JSON.parse(text) as { detail?: unknown; error?: { message?: string } | string };
if (typeof json.detail === 'string') return json.detail;
if (typeof json.error === 'string') return json.error;
if (json.error?.message) return json.error.message;
} catch {
// Fall through to raw text.
}
return text;
}
/**
* Azure TTS implementation (direct API call with SSML)
*/
async function generateAzureTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const baseUrl = config.baseUrl || TTS_PROVIDERS['azure-tts'].defaultBaseUrl;
// Build SSML
const rate = config.speed ? `${((config.speed - 1) * 100).toFixed(0)}%` : '0%';
const ssml = `
<speak version='1.0' xml:lang='zh-CN'>
<voice xml:lang='zh-CN' name='${config.voice}'>
<prosody rate='${rate}'>${escapeXml(text)}</prosody>
</voice>
</speak>
`.trim();
const response = await fetch(`${baseUrl}/cognitiveservices/v1`, {
method: 'POST',
headers: {
'Ocp-Apim-Subscription-Key': config.apiKey!,
'Content-Type': 'application/ssml+xml; charset=utf-8',
'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
},
body: ssml,
});
if (!response.ok) {
throw new Error(`Azure TTS API error: ${response.statusText}`);
}
const arrayBuffer = await response.arrayBuffer();
return {
audio: new Uint8Array(arrayBuffer),
format: 'mp3',
};
}
/**
* GLM TTS implementation (GLM API)
*/
async function generateGLMTTS(config: TTSModelConfig, text: string): Promise<TTSGenerationResult> {
const baseUrl = config.baseUrl || TTS_PROVIDERS['glm-tts'].defaultBaseUrl;
const response = await fetch(`${baseUrl}/audio/speech`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json; charset=utf-8',
},
body: JSON.stringify({
model: config.modelId || 'glm-tts',
input: text,
voice: config.voice,
speed: config.speed || 1.0,
volume: 1.0,
response_format: 'wav',
}),
});
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
let errorMessage = `GLM TTS API error: ${errorText}`;
try {
const errorJson = JSON.parse(errorText);
if (errorJson.error?.message) {
errorMessage = `GLM TTS API error: ${errorJson.error.message} (code: ${errorJson.error.code})`;
}
} catch {
// If not JSON, use the text as is
}
throw new Error(errorMessage);
}
const arrayBuffer = await response.arrayBuffer();
return {
audio: new Uint8Array(arrayBuffer),
format: 'wav',
};
}
/**
* Qwen TTS implementation (DashScope API - Qwen3 TTS Flash)
*/
async function generateQwenTTS(config: TTSModelConfig, text: string): Promise<TTSGenerationResult> {
const baseUrl = config.baseUrl || TTS_PROVIDERS['qwen-tts'].defaultBaseUrl;
// Calculate speed: Qwen3 uses rate parameter from -500 to 500
// speed 1.0 = rate 0, speed 2.0 = rate 500, speed 0.5 = rate -250
const rate = Math.round(((config.speed || 1.0) - 1.0) * 500);
const response = await fetch(`${baseUrl}/services/aigc/multimodal-generation/generation`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json; charset=utf-8',
},
body: JSON.stringify({
model: config.modelId || 'qwen3-tts-flash',
input: {
text,
voice: config.voice,
language_type: 'Chinese', // Default to Chinese, can be made configurable
},
parameters: {
rate, // Speech rate from -500 to 500
},
}),
});
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
throw new Error(`Qwen TTS API error: ${errorText}`);
}
const data = await response.json();
// Check for audio URL in response
if (!data.output?.audio?.url) {
throw new Error(`Qwen TTS error: No audio URL in response. Response: ${JSON.stringify(data)}`);
}
// Download audio from URL
const audioUrl = data.output.audio.url;
const audioResponse = await fetch(audioUrl);
if (!audioResponse.ok) {
throw new Error(`Failed to download audio from URL: ${audioResponse.statusText}`);
}
const arrayBuffer = await audioResponse.arrayBuffer();
return {
audio: new Uint8Array(arrayBuffer),
format: 'wav', // Qwen3 TTS returns WAV format
};
}
/**
* MiniMax TTS implementation (synchronous HTTP API)
*/
async function generateMiniMaxTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const baseUrl = (config.baseUrl || TTS_PROVIDERS['minimax-tts'].defaultBaseUrl || '').replace(
/\/$/,
'',
);
const response = await fetch(`${baseUrl}/v1/t2a_v2`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json; charset=utf-8',
},
body: JSON.stringify({
model: config.modelId || 'speech-2.8-hd',
text,
stream: false,
output_format: 'hex',
voice_setting: {
voice_id: config.voice,
speed: config.speed || 1.0,
vol: 1,
pitch: 0,
},
audio_setting: {
sample_rate: 32000,
bitrate: 128000,
format: config.format || 'mp3',
channel: 1,
},
language_boost: 'auto',
}),
});
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
throw new Error(`MiniMax TTS API error: ${errorText}`);
}
const data = await response.json();
const hexAudio = data?.data?.audio;
if (!hexAudio || typeof hexAudio !== 'string') {
throw new Error(`MiniMax TTS error: No audio returned. Response: ${JSON.stringify(data)}`);
}
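  // The audio payload is hex-encoded; decode each pair of hex digits into a byte.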
const cleanedHex = hexAudio.trim();
if (cleanedHex.length % 2 !== 0) {
throw new Error('MiniMax TTS error: invalid hex audio payload length');
}
const audio = new Uint8Array(
cleanedHex.match(/.{1,2}/g)?.map((byte: string) => parseInt(byte, 16)) || [],
);
return {
audio,
format: data?.extra_info?.audio_format || config.format || 'mp3',
};
}
/**
* ElevenLabs TTS implementation (direct API call with voice-specific endpoint)
*/
async function generateElevenLabsTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const baseUrl = config.baseUrl || TTS_PROVIDERS['elevenlabs-tts'].defaultBaseUrl;
const requestedFormat = config.format || 'mp3';
  // Clamp to the [0.7, 1.2] range that ElevenLabs accepts for voice_settings.speed.
  const clampedSpeed = Math.min(1.2, Math.max(0.7, config.speed || 1.0));
const outputFormatMap: Record<string, string> = {
mp3: 'mp3_44100_128',
opus: 'opus_48000_96',
pcm: 'pcm_44100',
wav: 'wav_44100',
ulaw: 'ulaw_8000',
alaw: 'alaw_8000',
};
const outputFormat = outputFormatMap[requestedFormat] || outputFormatMap.mp3;
const response = await fetch(
`${baseUrl}/text-to-speech/${encodeURIComponent(config.voice)}?output_format=${outputFormat}`,
{
method: 'POST',
headers: {
'xi-api-key': config.apiKey!,
'Content-Type': 'application/json; charset=utf-8',
},
body: JSON.stringify({
text,
model_id: config.modelId || 'eleven_multilingual_v2',
voice_settings: {
stability: 0.5,
similarity_boost: 0.75,
speed: clampedSpeed,
},
}),
},
);
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
throw new Error(`ElevenLabs TTS API error: ${errorText || response.statusText}`);
}
const arrayBuffer = await response.arrayBuffer();
return {
audio: new Uint8Array(arrayBuffer),
format: requestedFormat,
};
}
/**
* Get current TTS configuration from settings store
* Note: This function should only be called in browser context
*/
export async function getCurrentTTSConfig(): Promise<TTSModelConfig> {
if (typeof window === 'undefined') {
throw new Error('getCurrentTTSConfig() can only be called in browser context');
}
// Lazy import to avoid circular dependency
const { useSettingsStore } = await import('@/lib/store/settings');
const { ttsProviderId, ttsVoice, ttsSpeed, ttsProvidersConfig } = useSettingsStore.getState();
const providerConfig = ttsProvidersConfig?.[ttsProviderId];
return {
providerId: ttsProviderId,
modelId:
providerConfig?.modelId ||
TTS_PROVIDERS[ttsProviderId as keyof typeof TTS_PROVIDERS]?.defaultModelId ||
'',
apiKey: providerConfig?.apiKey,
baseUrl: providerConfig?.baseUrl || providerConfig?.customDefaultBaseUrl,
voice: ttsVoice,
speed: ttsSpeed,
};
}
// Re-export from constants for convenience
export { getAllTTSProviders, getTTSProvider, getTTSVoices } from './constants';
/**
* Doubao TTS 2.0 implementation (Volcengine Seed-TTS 2.0)
*/
async function generateDoubaoTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const colonIdx = (config.apiKey || '').indexOf(':');
if (colonIdx <= 0) {
throw new Error(
'Doubao TTS requires API key in format "appId:accessKey". Get both from the Volcengine console.',
);
}
const appId = config.apiKey!.slice(0, colonIdx);
const accessKey = config.apiKey!.slice(colonIdx + 1);
const baseUrl = config.baseUrl || TTS_PROVIDERS['doubao-tts'].defaultBaseUrl;
const speechRate = Math.round(((config.speed || 1.0) - 1.0) * 100);
const response = await fetch(`${baseUrl}/unidirectional`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-Api-App-Id': appId,
'X-Api-Access-Key': accessKey,
'X-Api-Resource-Id': 'seed-tts-2.0',
},
body: JSON.stringify({
user: { uid: 'multimind' },
req_params: {
text,
speaker: config.voice,
audio_params: { format: 'mp3', sample_rate: 24000, speech_rate: speechRate },
},
}),
});
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
throw new Error(`Doubao TTS API error (${response.status}): ${errorText}`);
}
const responseText = await response.text();
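  // The unidirectional endpoint returns a stream of concatenated JSON chunks
  // ({ code, data?, message? }) with no delimiter between objects. Track brace
  // depth to split the raw text into individual objects without a streaming parser.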
const audioChunks: Uint8Array[] = [];
let depth = 0;
let start = -1;
for (let i = 0; i < responseText.length; i++) {
if (responseText[i] === '{') {
if (depth === 0) start = i;
depth++;
} else if (responseText[i] === '}') {
depth--;
if (depth === 0 && start >= 0) {
let chunk: { code: number; message?: string; data?: string };
try {
chunk = JSON.parse(responseText.slice(start, i + 1));
} catch {
start = -1;
continue;
}
start = -1;
if (chunk.code === 0 && chunk.data) {
audioChunks.push(new Uint8Array(Buffer.from(chunk.data, 'base64')));
} else if (chunk.code === 20000000) {
break;
} else if (chunk.code && chunk.code !== 0) {
if (chunk.code === 45000000 || chunk.code === 45000292) {
throw new TTSRateLimitError(
'doubao-tts',
chunk.message || 'concurrency quota exceeded',
);
}
throw new Error(`Doubao TTS error: ${chunk.message || 'unknown'} (code: ${chunk.code})`);
}
}
}
}
if (audioChunks.length === 0) {
throw new Error('Doubao TTS: no audio data received');
}
const totalLength = audioChunks.reduce((sum, c) => sum + c.length, 0);
const combined = new Uint8Array(totalLength);
let offset = 0;
for (const chunk of audioChunks) {
combined.set(chunk, offset);
offset += chunk.length;
}
return { audio: combined, format: 'mp3' };
}
/**
* Escape XML special characters for SSML
*/
function escapeXml(text: string): string {
return text
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&apos;');
}