/**
* TTS (Text-to-Speech) Provider Implementation
*
 * Factory pattern for routing TTS requests to the appropriate provider implementation.
* Follows the same architecture as lib/ai/providers.ts for consistency.
*
* Currently Supported Providers:
* - OpenAI TTS: https://platform.openai.com/docs/guides/text-to-speech
* - Azure TTS: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/text-to-speech
* - GLM TTS: https://docs.bigmodel.cn/cn/guide/models/sound-and-video/glm-tts
 * - Qwen TTS: https://bailian.console.aliyun.com/
 * - VoxCPM TTS: self-hosted VoxCPM server (vLLM-Omni, Python API, or nano-vllm backend)
* - MiniMax TTS: https://platform.minimaxi.com/docs/api-reference/speech-t2a-http
* - Doubao TTS: https://www.volcengine.com/docs/6561/1257543
* - ElevenLabs TTS: https://elevenlabs.io/docs/api-reference/text-to-speech/convert
* - Browser Native: Web Speech API (client-side only)
*
 * HOW TO ADD A NEW PROVIDER (ElevenLabs serves as the worked example throughout;
 * it has since been implemented in this file, so the snippets below also double
 * as a reference for a real integration):
*
* 1. Add provider ID to TTSProviderId in lib/audio/types.ts
* Example: | 'elevenlabs-tts'
*
* 2. Add provider configuration to lib/audio/constants.ts
* Example:
* 'elevenlabs-tts': {
* id: 'elevenlabs-tts',
* name: 'ElevenLabs',
* requiresApiKey: true,
* defaultBaseUrl: 'https://api.elevenlabs.io/v1',
* icon: '/logos/elevenlabs.svg',
* voices: [...],
* supportedFormats: ['mp3', 'pcm'],
* speedRange: { min: 0.5, max: 2.0, default: 1.0 }
* }
*
* 3. Implement provider function in this file
* Pattern: async function generateXxxTTS(config, text): Promise<TTSGenerationResult>
* - Validate config and build API request
* - Handle API authentication (apiKey, headers)
* - Convert provider-specific parameters (voice, speed, format)
* - Return { audio: Uint8Array, format: string }
*
* Example:
* async function generateElevenLabsTTS(
* config: TTSModelConfig,
* text: string
* ): Promise<TTSGenerationResult> {
* const baseUrl = config.baseUrl || TTS_PROVIDERS['elevenlabs-tts'].defaultBaseUrl;
*
* const response = await fetch(`${baseUrl}/text-to-speech/${config.voice}`, {
* method: 'POST',
* headers: {
* 'xi-api-key': config.apiKey!,
* 'Content-Type': 'application/json',
* },
* body: JSON.stringify({
* text,
* model_id: 'eleven_multilingual_v2',
* voice_settings: {
* stability: 0.5,
* similarity_boost: 0.75,
* }
* }),
* });
*
* if (!response.ok) {
* throw new Error(`ElevenLabs TTS API error: ${response.statusText}`);
* }
*
* const arrayBuffer = await response.arrayBuffer();
* return {
* audio: new Uint8Array(arrayBuffer),
* format: 'mp3',
* };
* }
*
* 4. Add case to generateTTS() switch statement
* case 'elevenlabs-tts':
* return await generateElevenLabsTTS(config, text);
*
* 5. Add i18n translations in lib/i18n.ts
* providerElevenLabsTTS: { zh: 'ElevenLabs TTS', en: 'ElevenLabs TTS' }
*
* Error Handling Patterns:
* - Always validate API key if requiresApiKey is true
* - Throw descriptive errors for API failures
* - Include response.statusText or error messages from API
* - For client-only providers (browser-native), throw error directing to client-side usage
*
 * API Call Patterns:
 * - Direct API: use fetch with the provider's headers and body format (preferred; gives explicit control over UTF-8 encoding)
 * - SSML: for Azure-like providers that require SSML markup
 * - URL-based: for providers that return an audio URL (downloaded in a second step)
*/
import type { TTSModelConfig } from './types';
import { isCustomTTSProvider } from './types';
import { TTS_PROVIDERS } from './constants';
import {
VOXCPM_VLLM_MODEL_ID,
VOXCPM_AUTO_VOICE_ID,
normalizeVoxCPMBackend,
type VoxCPMProviderOptions,
} from './voxcpm';
/**
* Result of TTS generation
*/
export interface TTSGenerationResult {
audio: Uint8Array;
format: string;
}
/**
* Thrown when a TTS provider returns a rate-limit / concurrency-quota error.
* Allows downstream consumers to distinguish rate-limit errors from other TTS failures.
*
* TODO: The API route currently catches all errors uniformly as GENERATION_FAILED.
* This class enables future retry/backoff logic without changing the throw sites.
*/
export class TTSRateLimitError extends Error {
constructor(
public readonly provider: string,
message: string,
) {
super(message);
this.name = 'TTSRateLimitError';
}
}
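/*
 * Illustrative retry sketch (not wired into the API route yet — it currently maps
 * every failure to GENERATION_FAILED). `maxAttempts` and the backoff delays are
 * arbitrary placeholders, not project conventions:
 *
 * async function generateTTSWithRetry(
 *   config: TTSModelConfig,
 *   text: string,
 *   maxAttempts = 3,
 * ): Promise<TTSGenerationResult> {
 *   for (let attempt = 1; ; attempt++) {
 *     try {
 *       return await generateTTS(config, text);
 *     } catch (error) {
 *       // Only rate-limit errors are worth retrying; rethrow everything else.
 *       if (!(error instanceof TTSRateLimitError) || attempt >= maxAttempts) throw error;
 *       // Exponential backoff: 1s, 2s, 4s, ...
 *       await new Promise((resolve) => setTimeout(resolve, 1000 * 2 ** (attempt - 1)));
 *     }
 *   }
 * }
 */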
/**
* Generate speech using specified TTS provider
*/
export async function generateTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const provider = TTS_PROVIDERS[config.providerId as keyof typeof TTS_PROVIDERS];
// Validate API key if required (only for built-in providers with known config)
if (provider?.requiresApiKey && !config.apiKey) {
throw new Error(`API key required for TTS provider: ${config.providerId}`);
}
switch (config.providerId) {
case 'openai-tts':
return await generateOpenAITTS(config, text);
case 'azure-tts':
return await generateAzureTTS(config, text);
case 'glm-tts':
return await generateGLMTTS(config, text);
case 'qwen-tts':
return await generateQwenTTS(config, text);
case 'voxcpm-tts':
return await generateVoxCPMTTS(config, text);
case 'minimax-tts':
return await generateMiniMaxTTS(config, text);
case 'doubao-tts':
return await generateDoubaoTTS(config, text);
case 'elevenlabs-tts':
return await generateElevenLabsTTS(config, text);
case 'browser-native-tts':
throw new Error(
'Browser Native TTS must be handled client-side using Web Speech API. This provider cannot be used on the server.',
);
default:
if (isCustomTTSProvider(config.providerId)) {
return await generateOpenAITTS(config, text);
}
throw new Error(`Unsupported TTS provider: ${config.providerId}`);
}
}
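/*
 * Usage sketch (server-side; the values are illustrative — real call sites build
 * the config from the settings store or the request body):
 *
 * const { audio, format } = await generateTTS(
 *   {
 *     providerId: 'openai-tts',
 *     modelId: 'gpt-4o-mini-tts',
 *     apiKey: process.env.OPENAI_API_KEY,
 *     voice: 'alloy',
 *     speed: 1.0,
 *   },
 *   'Hello from MultiMind Classroom',
 * );
 * // `audio` is a Uint8Array of encoded bytes; `format` is e.g. 'mp3' or 'wav'.
 */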
/**
* OpenAI TTS implementation (direct API call with explicit UTF-8 encoding)
*/
async function generateOpenAITTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const baseUrl = config.baseUrl || TTS_PROVIDERS['openai-tts'].defaultBaseUrl;
  // Default to gpt-4o-mini-tts when no model is configured.
const response = await fetch(`${baseUrl}/audio/speech`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json; charset=utf-8',
},
body: JSON.stringify({
model: config.modelId || 'gpt-4o-mini-tts',
input: text,
voice: config.voice,
speed: config.speed || 1.0,
}),
});
if (!response.ok) {
const error = await response.json().catch(() => ({ error: response.statusText }));
throw new Error(`OpenAI TTS API error: ${error.error?.message || response.statusText}`);
}
const arrayBuffer = await response.arrayBuffer();
const contentType = response.headers.get('content-type') || '';
const format = getAudioResponseFormat(contentType);
return {
audio: new Uint8Array(arrayBuffer),
format,
};
}
/**
* VoxCPM2 TTS implementation.
*
* MultiMind Classroom keeps one internal VoxCPM request shape, then adapts it to the
* selected official backend protocol.
*/
async function generateVoxCPMTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const baseUrl = (config.baseUrl || TTS_PROVIDERS['voxcpm-tts'].defaultBaseUrl || '').replace(
/\/$/,
'',
);
if (!baseUrl) {
throw new Error('VoxCPM base URL is required');
}
const options = (config.providerOptions || {}) as VoxCPMProviderOptions;
const backend = normalizeVoxCPMBackend(options.backend);
const voicePrompt =
options.voicePrompt ||
(config.voice && config.voice !== 'default' && config.voice !== VOXCPM_AUTO_VOICE_ID
? config.voice
: undefined);
if (config.voice === VOXCPM_AUTO_VOICE_ID && !voicePrompt) {
throw new Error('VoxCPM Auto Voice requires agent context');
}
const cfgValue = options.cfgValue ?? 2.0;
const inferenceTimesteps = options.inferenceTimesteps ?? 10;
const normalize = options.normalize ?? false;
const denoise = options.denoise ?? false;
const usePromptContinuation = Boolean(options.promptText?.trim() && options.referenceAudioBase64);
const request = {
targetText: usePromptContinuation ? text : buildVoxCPMTargetText(text, voicePrompt),
voicePrompt,
promptText: options.promptText,
cfgValue,
inferenceTimesteps,
normalize,
denoise,
referenceAudioBase64: options.referenceAudioBase64,
referenceAudioMimeType: options.referenceAudioMimeType,
referenceAudioName: options.referenceAudioName,
};
const response =
backend === 'nano-vllm'
? await postVoxCPMNanoVLLM(baseUrl, request, config.apiKey)
: backend === 'python-api'
? await postVoxCPMPythonAPI(baseUrl, request, config.apiKey)
: await postVoxCPMVLLMOmni(baseUrl, request, config);
if (!response.ok) {
throw new Error(`VoxCPM TTS API error: ${await readTTSApiError(response)}`);
}
const arrayBuffer = await response.arrayBuffer();
const contentType = response.headers.get('content-type') || '';
const format = getAudioResponseFormat(contentType);
return {
audio: new Uint8Array(arrayBuffer),
format,
};
}
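/*
 * Example VoxCPM config (illustrative; the URL and option values are placeholders).
 * providerOptions carries the VoxCPMProviderOptions shape from ./voxcpm:
 *
 * const voxConfig: TTSModelConfig = {
 *   providerId: 'voxcpm-tts',
 *   modelId: 'VoxCPM2',
 *   baseUrl: 'http://localhost:8000',
 *   voice: 'default',
 *   providerOptions: {
 *     backend: 'python-api', // or 'nano-vllm'; otherwise the vLLM-Omni adapter is used
 *     cfgValue: 2.0, // forwarded as cfg_value
 *     inferenceTimesteps: 10,
 *     normalize: false,
 *     denoise: false,
 *     // Voice-cloning inputs (all optional):
 *     // promptText, referenceAudioBase64, referenceAudioMimeType, referenceAudioName
 *   },
 * };
 */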
/**
 * Prefix the target text with a sanitized voice-style prompt, e.g.
 * buildVoxCPMTargetText('你好', 'warm narrator') → '(warm narrator)你好'.
 * Control characters and parentheses (ASCII and fullwidth) are stripped from
 * the prompt so it cannot break the (prompt)text framing.
 */
function buildVoxCPMTargetText(text: string, voicePrompt?: string): string {
  const prompt = voicePrompt
    ?.replace(/[\p{C}]+/gu, ' ')
    .replace(/[()（）]/gu, '')
    .replace(/\s+/gu, ' ')
    .trim();
  return prompt ? `(${prompt})${text}` : text;
}
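/** Map a response Content-Type header to a short audio format id (falls back to 'mp3'). */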
function getAudioResponseFormat(contentType: string): string {
if (contentType.includes('audio/wav') || contentType.includes('audio/x-wav')) return 'wav';
if (contentType.includes('audio/mpeg') || contentType.includes('audio/mp3')) return 'mp3';
if (contentType.includes('audio/flac')) return 'flac';
if (contentType.includes('audio/ogg')) return 'ogg';
if (contentType.includes('audio/webm')) return 'webm';
return 'mp3';
}
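/** Infer the reference-audio container format from its MIME type and/or file name (defaults to 'wav'). */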
function getVoxCPMAudioFormat(mimeType?: string, fileName?: string): string {
const lowerName = fileName?.toLowerCase() || '';
if (mimeType?.includes('wav') || lowerName.endsWith('.wav')) return 'wav';
if (mimeType?.includes('mpeg') || mimeType?.includes('mp3') || lowerName.endsWith('.mp3')) {
return 'mp3';
}
if (mimeType?.includes('flac') || lowerName.endsWith('.flac')) return 'flac';
if (mimeType?.includes('ogg') || lowerName.endsWith('.ogg')) return 'ogg';
if (mimeType?.includes('webm') || lowerName.endsWith('.webm')) return 'webm';
return 'wav';
}
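/** Resolve the OpenAI-compatible speech endpoint, tolerating base URLs with or without a /v1 suffix. */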
function getVLLMOmniSpeechUrl(baseUrl: string): string {
return baseUrl.endsWith('/v1') ? `${baseUrl}/audio/speech` : `${baseUrl}/v1/audio/speech`;
}
function getVLLMOmniModelId(config: TTSModelConfig): string {
const modelId = config.modelId?.trim();
if (!modelId || modelId === 'VoxCPM2') return VOXCPM_VLLM_MODEL_ID;
return modelId;
}
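/** Build an optional Bearer Authorization header for self-hosted VoxCPM backends. */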
function getBackendAuthHeaders(apiKey?: string): Record<string, string> {
return apiKey?.trim() ? { Authorization: `Bearer ${apiKey.trim()}` } : {};
}
async function postVoxCPMVLLMOmni(
baseUrl: string,
params: {
targetText: string;
promptText?: string;
referenceAudioBase64?: string;
referenceAudioMimeType?: string;
referenceAudioName?: string;
},
config: TTSModelConfig,
): Promise<Response> {
const payload: Record<string, unknown> = {
model: getVLLMOmniModelId(config),
input: params.targetText,
// VoxCPM2's vLLM-Omni adapter currently ignores named voices; prompts/ref_audio carry voice identity.
voice: 'default',
response_format: 'wav',
stream: false,
};
if (params.referenceAudioBase64) {
const referenceAudio = getVoxCPMDataAudioUrl(
params.referenceAudioBase64,
params.referenceAudioMimeType,
params.referenceAudioName,
);
payload.ref_audio = referenceAudio;
if (params.promptText?.trim()) {
payload.prompt_audio = referenceAudio;
payload.prompt_text = params.promptText.trim();
}
}
return fetch(getVLLMOmniSpeechUrl(baseUrl), {
method: 'POST',
headers: {
'Content-Type': 'application/json; charset=utf-8',
...getBackendAuthHeaders(config.apiKey),
},
body: JSON.stringify(payload),
});
}
function getVoxCPMDataAudioUrl(base64: string, mimeType?: string, fileName?: string): string {
const format = getVoxCPMAudioFormat(mimeType, fileName);
const mediaType =
mimeType?.trim() ||
(format === 'mp3'
? 'audio/mpeg'
: format === 'flac'
? 'audio/flac'
: format === 'ogg'
? 'audio/ogg'
: format === 'webm'
? 'audio/webm'
: 'audio/wav');
return `data:${mediaType};base64,${base64}`;
}
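/** Decode a base64 payload into a Blob for multipart upload (atob and Blob are globals in modern Node and browsers). */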
function base64ToBlob(base64: string, mimeType?: string): Blob {
const binary = atob(base64);
const bytes = new Uint8Array(binary.length);
for (let index = 0; index < binary.length; index++) {
bytes[index] = binary.charCodeAt(index);
}
return new Blob([bytes], { type: mimeType || 'audio/wav' });
}
async function postVoxCPMPythonAPI(
baseUrl: string,
params: {
targetText: string;
promptText?: string;
cfgValue: number;
inferenceTimesteps: number;
normalize: boolean;
denoise: boolean;
referenceAudioBase64?: string;
referenceAudioMimeType?: string;
referenceAudioName?: string;
},
apiKey?: string,
): Promise<Response> {
const formData = new FormData();
formData.set('text', params.targetText);
formData.set('cfg_value', String(params.cfgValue));
formData.set('inference_timesteps', String(params.inferenceTimesteps));
formData.set('normalize', String(params.normalize));
formData.set('denoise', String(params.denoise));
if (params.referenceAudioBase64) {
const audioBlob = base64ToBlob(params.referenceAudioBase64, params.referenceAudioMimeType);
const audioName = params.referenceAudioName || 'reference.wav';
formData.set('reference_audio', audioBlob, audioName);
if (params.promptText?.trim()) {
formData.set('prompt_audio', audioBlob, audioName);
formData.set('prompt_text', params.promptText.trim());
}
}
return fetch(`${baseUrl}/tts/upload`, {
method: 'POST',
headers: getBackendAuthHeaders(apiKey),
body: formData,
});
}
async function postVoxCPMNanoVLLM(
baseUrl: string,
params: {
targetText: string;
promptText?: string;
cfgValue: number;
referenceAudioBase64?: string;
referenceAudioMimeType?: string;
referenceAudioName?: string;
},
apiKey?: string,
): Promise<Response> {
const payload: Record<string, unknown> = {
target_text: params.targetText,
cfg_value: params.cfgValue,
};
if (params.referenceAudioBase64) {
const format = getVoxCPMAudioFormat(params.referenceAudioMimeType, params.referenceAudioName);
payload.ref_audio_wav_base64 = params.referenceAudioBase64;
payload.ref_audio_wav_format = format;
if (params.promptText?.trim()) {
payload.prompt_wav_base64 = params.referenceAudioBase64;
payload.prompt_wav_format = format;
payload.prompt_text = params.promptText.trim();
}
}
return fetch(`${baseUrl}/generate`, {
method: 'POST',
headers: {
'Content-Type': 'application/json; charset=utf-8',
...getBackendAuthHeaders(apiKey),
},
body: JSON.stringify(payload),
});
}
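/**
 * Extract a human-readable message from a failed TTS response, handling
 * `{ detail }` and `{ error }` JSON body shapes and falling back to the raw
 * text or statusText.
 */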
async function readTTSApiError(response: Response): Promise<string> {
const text = await response.text().catch(() => response.statusText);
if (!text) return response.statusText;
try {
const json = JSON.parse(text) as { detail?: unknown; error?: { message?: string } | string };
if (typeof json.detail === 'string') return json.detail;
if (typeof json.error === 'string') return json.error;
if (json.error?.message) return json.error.message;
} catch {
// Fall through to raw text.
}
return text;
}
/**
* Azure TTS implementation (direct API call with SSML)
*/
async function generateAzureTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const baseUrl = config.baseUrl || TTS_PROVIDERS['azure-tts'].defaultBaseUrl;
// Build SSML
const rate = config.speed ? `${((config.speed - 1) * 100).toFixed(0)}%` : '0%';
const ssml = `
<speak version='1.0' xml:lang='zh-CN'>
<voice xml:lang='zh-CN' name='${config.voice}'>
<prosody rate='${rate}'>${escapeXml(text)}</prosody>
</voice>
</speak>
`.trim();
const response = await fetch(`${baseUrl}/cognitiveservices/v1`, {
method: 'POST',
headers: {
'Ocp-Apim-Subscription-Key': config.apiKey!,
'Content-Type': 'application/ssml+xml; charset=utf-8',
'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
},
body: ssml,
});
if (!response.ok) {
throw new Error(`Azure TTS API error: ${response.statusText}`);
}
const arrayBuffer = await response.arrayBuffer();
return {
audio: new Uint8Array(arrayBuffer),
format: 'mp3',
};
}
/**
* GLM TTS implementation (GLM API)
*/
async function generateGLMTTS(config: TTSModelConfig, text: string): Promise<TTSGenerationResult> {
const baseUrl = config.baseUrl || TTS_PROVIDERS['glm-tts'].defaultBaseUrl;
const response = await fetch(`${baseUrl}/audio/speech`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json; charset=utf-8',
},
body: JSON.stringify({
model: config.modelId || 'glm-tts',
input: text,
voice: config.voice,
speed: config.speed || 1.0,
volume: 1.0,
response_format: 'wav',
}),
});
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
let errorMessage = `GLM TTS API error: ${errorText}`;
try {
const errorJson = JSON.parse(errorText);
if (errorJson.error?.message) {
errorMessage = `GLM TTS API error: ${errorJson.error.message} (code: ${errorJson.error.code})`;
}
} catch {
// If not JSON, use the text as is
}
throw new Error(errorMessage);
}
const arrayBuffer = await response.arrayBuffer();
return {
audio: new Uint8Array(arrayBuffer),
format: 'wav',
};
}
/**
* Qwen TTS implementation (DashScope API - Qwen3 TTS Flash)
*/
async function generateQwenTTS(config: TTSModelConfig, text: string): Promise<TTSGenerationResult> {
const baseUrl = config.baseUrl || TTS_PROVIDERS['qwen-tts'].defaultBaseUrl;
// Calculate speed: Qwen3 uses rate parameter from -500 to 500
// speed 1.0 = rate 0, speed 2.0 = rate 500, speed 0.5 = rate -250
const rate = Math.round(((config.speed || 1.0) - 1.0) * 500);
const response = await fetch(`${baseUrl}/services/aigc/multimodal-generation/generation`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json; charset=utf-8',
},
body: JSON.stringify({
model: config.modelId || 'qwen3-tts-flash',
input: {
text,
voice: config.voice,
language_type: 'Chinese', // Default to Chinese, can be made configurable
},
parameters: {
rate, // Speech rate from -500 to 500
},
}),
});
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
throw new Error(`Qwen TTS API error: ${errorText}`);
}
const data = await response.json();
// Check for audio URL in response
if (!data.output?.audio?.url) {
throw new Error(`Qwen TTS error: No audio URL in response. Response: ${JSON.stringify(data)}`);
}
// Download audio from URL
const audioUrl = data.output.audio.url;
const audioResponse = await fetch(audioUrl);
if (!audioResponse.ok) {
throw new Error(`Failed to download audio from URL: ${audioResponse.statusText}`);
}
const arrayBuffer = await audioResponse.arrayBuffer();
return {
audio: new Uint8Array(arrayBuffer),
format: 'wav', // Qwen3 TTS returns WAV format
};
}
/**
* MiniMax TTS implementation (synchronous HTTP API)
*/
async function generateMiniMaxTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const baseUrl = (config.baseUrl || TTS_PROVIDERS['minimax-tts'].defaultBaseUrl || '').replace(
/\/$/,
'',
);
const response = await fetch(`${baseUrl}/v1/t2a_v2`, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json; charset=utf-8',
},
body: JSON.stringify({
model: config.modelId || 'speech-2.8-hd',
text,
stream: false,
output_format: 'hex',
voice_setting: {
voice_id: config.voice,
speed: config.speed || 1.0,
vol: 1,
pitch: 0,
},
audio_setting: {
sample_rate: 32000,
bitrate: 128000,
format: config.format || 'mp3',
channel: 1,
},
language_boost: 'auto',
}),
});
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
throw new Error(`MiniMax TTS API error: ${errorText}`);
}
const data = await response.json();
const hexAudio = data?.data?.audio;
if (!hexAudio || typeof hexAudio !== 'string') {
throw new Error(`MiniMax TTS error: No audio returned. Response: ${JSON.stringify(data)}`);
}
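  // The audio payload is hex-encoded; decode each pair of hex digits into a byte.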
const cleanedHex = hexAudio.trim();
if (cleanedHex.length % 2 !== 0) {
throw new Error('MiniMax TTS error: invalid hex audio payload length');
}
const audio = new Uint8Array(
cleanedHex.match(/.{1,2}/g)?.map((byte: string) => parseInt(byte, 16)) || [],
);
return {
audio,
format: data?.extra_info?.audio_format || config.format || 'mp3',
};
}
/**
* ElevenLabs TTS implementation (direct API call with voice-specific endpoint)
*/
async function generateElevenLabsTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const baseUrl = config.baseUrl || TTS_PROVIDERS['elevenlabs-tts'].defaultBaseUrl;
const requestedFormat = config.format || 'mp3';
  // Clamp to the [0.7, 1.2] range that ElevenLabs accepts for voice_settings.speed.
  const clampedSpeed = Math.min(1.2, Math.max(0.7, config.speed || 1.0));
const outputFormatMap: Record<string, string> = {
mp3: 'mp3_44100_128',
opus: 'opus_48000_96',
pcm: 'pcm_44100',
wav: 'wav_44100',
ulaw: 'ulaw_8000',
alaw: 'alaw_8000',
};
const outputFormat = outputFormatMap[requestedFormat] || outputFormatMap.mp3;
const response = await fetch(
`${baseUrl}/text-to-speech/${encodeURIComponent(config.voice)}?output_format=${outputFormat}`,
{
method: 'POST',
headers: {
'xi-api-key': config.apiKey!,
'Content-Type': 'application/json; charset=utf-8',
},
body: JSON.stringify({
text,
model_id: config.modelId || 'eleven_multilingual_v2',
voice_settings: {
stability: 0.5,
similarity_boost: 0.75,
speed: clampedSpeed,
},
}),
},
);
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
throw new Error(`ElevenLabs TTS API error: ${errorText || response.statusText}`);
}
const arrayBuffer = await response.arrayBuffer();
return {
audio: new Uint8Array(arrayBuffer),
format: requestedFormat,
};
}
/**
* Get current TTS configuration from settings store
* Note: This function should only be called in browser context
*/
export async function getCurrentTTSConfig(): Promise<TTSModelConfig> {
if (typeof window === 'undefined') {
throw new Error('getCurrentTTSConfig() can only be called in browser context');
}
// Lazy import to avoid circular dependency
const { useSettingsStore } = await import('@/lib/store/settings');
const { ttsProviderId, ttsVoice, ttsSpeed, ttsProvidersConfig } = useSettingsStore.getState();
const providerConfig = ttsProvidersConfig?.[ttsProviderId];
return {
providerId: ttsProviderId,
modelId:
providerConfig?.modelId ||
TTS_PROVIDERS[ttsProviderId as keyof typeof TTS_PROVIDERS]?.defaultModelId ||
'',
apiKey: providerConfig?.apiKey,
baseUrl: providerConfig?.baseUrl || providerConfig?.customDefaultBaseUrl,
voice: ttsVoice,
speed: ttsSpeed,
};
}
// Re-export from constants for convenience
export { getAllTTSProviders, getTTSProvider, getTTSVoices } from './constants';
/**
* Doubao TTS 2.0 implementation (Volcengine Seed-TTS 2.0)
*/
async function generateDoubaoTTS(
config: TTSModelConfig,
text: string,
): Promise<TTSGenerationResult> {
const colonIdx = (config.apiKey || '').indexOf(':');
if (colonIdx <= 0) {
throw new Error(
'Doubao TTS requires API key in format "appId:accessKey". Get both from the Volcengine console.',
);
}
const appId = config.apiKey!.slice(0, colonIdx);
const accessKey = config.apiKey!.slice(colonIdx + 1);
const baseUrl = config.baseUrl || TTS_PROVIDERS['doubao-tts'].defaultBaseUrl;
const speechRate = Math.round(((config.speed || 1.0) - 1.0) * 100);
const response = await fetch(`${baseUrl}/unidirectional`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-Api-App-Id': appId,
'X-Api-Access-Key': accessKey,
'X-Api-Resource-Id': 'seed-tts-2.0',
},
body: JSON.stringify({
user: { uid: 'multimind' },
req_params: {
text,
speaker: config.voice,
audio_params: { format: 'mp3', sample_rate: 24000, speech_rate: speechRate },
},
}),
});
if (!response.ok) {
const errorText = await response.text().catch(() => response.statusText);
throw new Error(`Doubao TTS API error (${response.status}): ${errorText}`);
}
const responseText = await response.text();
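  // The unidirectional endpoint returns a stream of concatenated JSON chunks
  // ({ code, data?, message? }) with no delimiter between objects. Track brace
  // depth to split the raw text into individual objects without a streaming parser.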
const audioChunks: Uint8Array[] = [];
let depth = 0;
let start = -1;
for (let i = 0; i < responseText.length; i++) {
if (responseText[i] === '{') {
if (depth === 0) start = i;
depth++;
} else if (responseText[i] === '}') {
depth--;
if (depth === 0 && start >= 0) {
let chunk: { code: number; message?: string; data?: string };
try {
chunk = JSON.parse(responseText.slice(start, i + 1));
} catch {
start = -1;
continue;
}
start = -1;
if (chunk.code === 0 && chunk.data) {
audioChunks.push(new Uint8Array(Buffer.from(chunk.data, 'base64')));
} else if (chunk.code === 20000000) {
break;
} else if (chunk.code && chunk.code !== 0) {
if (chunk.code === 45000000 || chunk.code === 45000292) {
throw new TTSRateLimitError(
'doubao-tts',
chunk.message || 'concurrency quota exceeded',
);
}
throw new Error(`Doubao TTS error: ${chunk.message || 'unknown'} (code: ${chunk.code})`);
}
}
}
}
if (audioChunks.length === 0) {
throw new Error('Doubao TTS: no audio data received');
}
const totalLength = audioChunks.reduce((sum, c) => sum + c.length, 0);
const combined = new Uint8Array(totalLength);
let offset = 0;
for (const chunk of audioChunks) {
combined.set(chunk, offset);
offset += chunk.length;
}
return { audio: combined, format: 'mp3' };
}
/**
* Escape XML special characters for SSML
*/
function escapeXml(text: string): string {
return text
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&apos;');
}