| const axios = require('axios'); |
| const { logger } = require('@librechat/data-schemas'); |
| const { HttpsProxyAgent } = require('https-proxy-agent'); |
| const { genAzureEndpoint, logAxiosError } = require('@librechat/api'); |
| const { extractEnvVariable, TTSProviders } = require('librechat-data-provider'); |
| const { getRandomVoiceId, createChunkProcessor, splitTextIntoChunks } = require('./streamAudio'); |
| const { getAppConfig } = require('~/server/services/Config'); |
|
|
| |
| |
| |
| |
/**
 * Text-to-speech service.
 *
 * Picks the single provider configured under `speech.tts`, builds that
 * provider's HTTP request (URL, JSON body, headers), sends it with axios and
 * relays the returned audio stream to an HTTP response.
 */
class TTSService {
  /**
   * Builds the lookup table that maps a provider name to the method that
   * assembles the request for that provider.
   */
  constructor() {
    this.providerStrategies = {
      [TTSProviders.OPENAI]: this.openAIProvider.bind(this),
      [TTSProviders.AZURE_OPENAI]: this.azureOpenAIProvider.bind(this),
      [TTSProviders.ELEVENLABS]: this.elevenLabsProvider.bind(this),
      [TTSProviders.LOCALAI]: this.localAIProvider.bind(this),
    };
  }

  /**
   * Async factory for the service.
   * @returns {Promise<TTSService>} A new service instance.
   */
  static async getInstance() {
    return new TTSService();
  }

  /**
   * Determines which TTS provider is configured.
   *
   * A provider counts as configured when its entry under `speech.tts` has at
   * least one key. Entries that are `null`/`undefined` (e.g. an empty YAML key)
   * are treated as "not configured" instead of crashing with a TypeError.
   *
   * @param {object} appConfig - Application config; `appConfig.speech.tts` is read.
   * @returns {string} The name of the single configured provider.
   * @throws {Error} If there is no TTS schema, no provider, or more than one provider.
   */
  getProvider(appConfig) {
    const ttsSchema = appConfig?.speech?.tts;
    if (!ttsSchema) {
      throw new Error(
        'No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)?',
      );
    }

    const providers = Object.entries(ttsSchema).filter(
      ([, value]) => value != null && Object.keys(value).length > 0,
    );

    if (providers.length !== 1) {
      throw new Error(
        providers.length > 1
          ? 'Multiple providers are set. Please set only one provider.'
          : 'No provider is set. Please set a provider.',
      );
    }
    return providers[0][0];
  }

  /**
   * Resolves the voice to use for a request.
   *
   * The requested voice is kept only when it is one of the configured voices
   * (the `ALL` wildcard entry is never a usable voice); otherwise a voice is
   * chosen via `getRandomVoiceId`.
   *
   * @param {object} providerSchema - Provider config; `providerSchema.voices` is read.
   * @param {string} [requestVoice] - Voice asked for by the client.
   * @returns {Promise<string>} The voice to use.
   */
  async getVoice(providerSchema, requestVoice) {
    const voices = providerSchema.voices.filter((voice) => voice && voice.toUpperCase() !== 'ALL');
    // `voices` never contains an `ALL` entry, so a requested voice of "ALL"
    // already fails the membership test; no separate wildcard check is needed.
    if (!requestVoice || !voices.includes(requestVoice)) {
      return getRandomVoiceId(voices);
    }
    return requestVoice;
  }

  /**
   * Recursively deletes `undefined` properties (and nested objects that end up
   * empty) from `obj`. Mutates `obj` in place.
   *
   * @param {object} obj - Object to clean.
   * @returns {void}
   */
  removeUndefined(obj) {
    Object.keys(obj).forEach((key) => {
      if (obj[key] && typeof obj[key] === 'object') {
        this.removeUndefined(obj[key]);
        if (Object.keys(obj[key]).length === 0) {
          delete obj[key];
        }
      } else if (obj[key] === undefined) {
        delete obj[key];
      }
    });
  }

  /**
   * Validates `voice` against a configured voice list.
   *
   * A voice passes when the list contains it or contains the `ALL` wildcard.
   * When the list is missing or empty, the check is skipped unless `required`
   * is set, in which case the voice is rejected.
   *
   * @param {string[]} [voices] - Configured voices.
   * @param {string} voice - Voice to validate.
   * @param {boolean} [required=false] - Reject when no voices are configured.
   * @returns {void}
   * @throws {Error} `Voice <voice> is not available.`
   */
  assertVoiceAvailable(voices, voice, required = false) {
    const hasVoices = voices != null && voices.length > 0;
    if (!hasVoices) {
      if (required) {
        throw new Error(`Voice ${voice} is not available.`);
      }
      return;
    }
    if (!voices.includes(voice) && !voices.includes('ALL')) {
      throw new Error(`Voice ${voice} is not available.`);
    }
  }

  /**
   * Builds the request for the OpenAI provider.
   *
   * @param {object} ttsSchema - Provider config (`url`, `model`, `voices`, `backend`, `apiKey`).
   * @param {string} input - Text to synthesize.
   * @param {string} voice - Voice to use.
   * @returns {[string, object, object]} `[url, data, headers]`.
   * @throws {Error} If voices are configured and `voice` is not allowed.
   */
  openAIProvider(ttsSchema, input, voice) {
    const url = ttsSchema?.url || 'https://api.openai.com/v1/audio/speech';

    this.assertVoiceAvailable(ttsSchema?.voices, voice);

    const data = {
      input,
      model: ttsSchema?.model,
      // Only send a voice when the config lists voices.
      voice: ttsSchema?.voices && ttsSchema.voices.length > 0 ? voice : undefined,
      backend: ttsSchema?.backend,
    };

    const headers = {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${extractEnvVariable(ttsSchema?.apiKey)}`,
    };

    return [url, data, headers];
  }

  /**
   * Builds the request for the Azure OpenAI provider.
   *
   * @param {object} ttsSchema - Provider config (`instanceName`, `deploymentName`,
   *   `apiVersion`, `model`, `voices`, `apiKey`).
   * @param {string} input - Text to synthesize.
   * @param {string} voice - Voice to use.
   * @returns {[string, object, object]} `[url, data, headers]`.
   * @throws {Error} If voices are configured and `voice` is not allowed.
   */
  azureOpenAIProvider(ttsSchema, input, voice) {
    const endpoint = genAzureEndpoint({
      azureOpenAIApiInstanceName: extractEnvVariable(ttsSchema?.instanceName),
      azureOpenAIApiDeploymentName: extractEnvVariable(ttsSchema?.deploymentName),
    });
    const url = `${endpoint}/audio/speech?api-version=${extractEnvVariable(ttsSchema?.apiVersion)}`;

    this.assertVoiceAvailable(ttsSchema?.voices, voice);

    const data = {
      model: extractEnvVariable(ttsSchema?.model),
      input,
      voice: ttsSchema?.voices && ttsSchema.voices.length > 0 ? voice : undefined,
    };

    const headers = {
      'Content-Type': 'application/json',
      'api-key': ttsSchema?.apiKey ? extractEnvVariable(ttsSchema.apiKey) : '',
    };

    return [url, data, headers];
  }

  /**
   * Builds the request for the ElevenLabs provider.
   *
   * Unlike the other providers, the voice must always be allowed by the
   * configured list (or the `ALL` wildcard); a missing/empty list rejects the
   * voice with the regular "not available" error.
   *
   * @param {object} ttsSchema - Provider config (`url`, `model`, `voices`,
   *   `voice_settings`, `pronunciation_dictionary_locators`, `apiKey`).
   * @param {string} input - Text to synthesize.
   * @param {string} voice - Voice id; part of the default URL.
   * @param {boolean} [stream] - Use the `/stream` endpoint of the default URL.
   * @returns {[string, object, object]} `[url, data, headers]`.
   * @throws {Error} If `voice` is not allowed.
   */
  elevenLabsProvider(ttsSchema, input, voice, stream) {
    const url =
      ttsSchema?.url ||
      `https://api.elevenlabs.io/v1/text-to-speech/${voice}${stream ? '/stream' : ''}`;

    this.assertVoiceAvailable(ttsSchema?.voices, voice, true);

    const data = {
      model_id: ttsSchema?.model,
      text: input,
      voice_settings: {
        similarity_boost: ttsSchema?.voice_settings?.similarity_boost,
        stability: ttsSchema?.voice_settings?.stability,
        style: ttsSchema?.voice_settings?.style,
        use_speaker_boost: ttsSchema?.voice_settings?.use_speaker_boost,
      },
      pronunciation_dictionary_locators: ttsSchema?.pronunciation_dictionary_locators,
    };

    const headers = {
      'Content-Type': 'application/json',
      'xi-api-key': extractEnvVariable(ttsSchema?.apiKey),
      Accept: 'audio/mpeg',
    };

    return [url, data, headers];
  }

  /**
   * Builds the request for the LocalAI provider.
   *
   * The selected voice is sent in the `model` field. The `Authorization`
   * header is only included when an API key actually resolves to a non-empty
   * value, so no `Bearer undefined` header is sent for key-less setups.
   *
   * @param {object} ttsSchema - Provider config (`url`, `voices`, `backend`, `apiKey`).
   * @param {string} input - Text to synthesize.
   * @param {string} voice - Voice to use.
   * @returns {[string, object, object]} `[url, data, headers]`.
   * @throws {Error} If voices are configured and `voice` is not allowed.
   */
  localAIProvider(ttsSchema, input, voice) {
    const url = ttsSchema?.url;

    this.assertVoiceAvailable(ttsSchema?.voices, voice);

    const data = {
      input,
      model: ttsSchema?.voices && ttsSchema.voices.length > 0 ? voice : undefined,
      backend: ttsSchema?.backend,
    };

    const headers = {
      'Content-Type': 'application/json',
    };

    const apiKey = extractEnvVariable(ttsSchema?.apiKey);
    if (apiKey) {
      headers.Authorization = `Bearer ${apiKey}`;
    }

    return [url, data, headers];
  }

  /**
   * Sends a TTS request to the given provider.
   *
   * @param {string} provider - Key of `providerStrategies`.
   * @param {object} ttsSchema - Config of that provider.
   * @param {object} options
   * @param {string} options.input - Text to synthesize.
   * @param {string} options.voice - Voice to use.
   * @param {boolean} [options.stream=true] - Request a stream (`true`) or an
   *   array buffer (`false`) as the axios response type.
   * @returns {Promise<object>} The axios response.
   * @throws {Error} `Invalid provider` for an unknown provider; request errors
   *   are logged and rethrown.
   */
  async ttsRequest(provider, ttsSchema, { input, voice, stream = true }) {
    const strategy = this.providerStrategies[provider];
    if (!strategy) {
      throw new Error('Invalid provider');
    }

    // Strategies are bound to this instance in the constructor.
    const [url, data, headers] = strategy(ttsSchema, input, voice, stream);

    this.removeUndefined(data);
    this.removeUndefined(headers);

    const options = { headers, responseType: stream ? 'stream' : 'arraybuffer' };

    if (process.env.PROXY) {
      options.httpsAgent = new HttpsProxyAgent(process.env.PROXY);
    }

    try {
      return await axios.post(url, data, options);
    } catch (error) {
      logAxiosError({ message: `TTS request failed for provider ${provider}:`, error });
      throw error;
    }
  }

  /**
   * Pipes one audio stream into the response and waits until it is done.
   *
   * Waiting on `'end'` alone never settles when the source errors or when the
   * destination closes first (the pipe is removed and the source stops
   * flowing), so both cases are handled explicitly.
   *
   * @param {import('stream').Readable} source - Audio stream to relay.
   * @param {import('stream').Writable} res - Destination (the HTTP response).
   * @param {boolean} [end] - Passed to `pipe` as its `end` option.
   * @returns {Promise<boolean>} `true` when the source was fully relayed,
   *   `false` when the destination closed before that. Rejects with the
   *   source's error.
   */
  pipeAudio(source, res, end) {
    return new Promise((resolve, reject) => {
      if (res.destroyed === true) {
        source.destroy();
        resolve(false);
        return;
      }

      let settled = false;
      const settle = (callback, value) => {
        if (settled) {
          return;
        }
        settled = true;
        res.off('close', onClose);
        callback(value);
      };
      const onClose = () => {
        // Nobody is listening any more; release the upstream stream.
        source.destroy();
        settle(resolve, false);
      };

      source.once('end', () => settle(resolve, true));
      // Stays attached so a late error cannot surface as an unhandled 'error' event.
      source.on('error', (error) => settle(reject, error));
      res.once('close', onClose);
      source.pipe(res, { end });
    });
  }

  /**
   * Terminates a response after a failure: answers 500 when nothing has been
   * sent yet, otherwise just ends the partially written response so the
   * client is not left waiting.
   *
   * @param {object} res - HTTP response.
   * @param {string} [message] - Optional body for the 500 answer.
   * @returns {void}
   */
  endWithError(res, message) {
    if (!res.headersSent) {
      if (message === undefined) {
        res.status(500).end();
      } else {
        res.status(500).send(message);
      }
      return;
    }
    if (!res.writableEnded) {
      res.end();
    }
  }

  /**
   * Handles a text-to-speech request: converts `req.body.input` to audio and
   * writes it to `res` as `audio/mpeg`.
   *
   * Inputs shorter than 4096 characters are sent in one request; longer ones
   * are split with `splitTextIntoChunks(input, 1000)` and relayed chunk by
   * chunk in order.
   *
   * @param {object} req - HTTP request (`body.input`, `body.voice`, optional `config`, `user`).
   * @param {object} res - HTTP response.
   * @returns {Promise<void>}
   */
  async processTextToSpeech(req, res) {
    const { input, voice: requestVoice } = req.body;

    if (!input) {
      return res.status(400).send('Missing text in request body');
    }

    try {
      const appConfig = req.config ?? (await getAppConfig({ role: req.user?.role }));
      res.setHeader('Content-Type', 'audio/mpeg');
      const provider = this.getProvider(appConfig);
      const ttsSchema = appConfig?.speech?.tts?.[provider];
      const voice = await this.getVoice(ttsSchema, requestVoice);

      if (input.length < 4096) {
        const response = await this.ttsRequest(provider, ttsSchema, { input, voice });
        await this.pipeAudio(response.data, res, true);
        return;
      }

      const textChunks = splitTextIntoChunks(input, 1000);

      for (const chunk of textChunks) {
        try {
          const response = await this.ttsRequest(provider, ttsSchema, {
            voice,
            input: chunk.text,
            stream: true,
          });

          logger.debug(`[textToSpeech] user: ${req?.user?.id} | writing audio stream`);
          const delivered = await this.pipeAudio(response.data, res, chunk.isFinished);

          if (!delivered) {
            // Client went away; nothing left to write to.
            return;
          }
          if (chunk.isFinished) {
            break;
          }
        } catch (innerError) {
          logAxiosError({
            message: `[TTS] Error processing manual update for chunk: ${chunk?.text?.substring(0, 50)}...`,
            error: innerError,
          });
          this.endWithError(res);
          return;
        }
      }

      if (!res.writableEnded) {
        res.end();
      }
    } catch (error) {
      logAxiosError({ message: '[TTS] Error creating the audio stream:', error });
      this.endWithError(res, 'An error occurred');
    }
  }

  /**
   * Streams audio for a message while its text is still being produced.
   *
   * Repeatedly asks the chunk processor (created for `req.user.id` and
   * `req.body.messageId`) for new text updates, converts each update to audio
   * and relays it to `res`. Stops when an update is marked `isFinished`, when
   * the client closes the request, or on the first error.
   *
   * @param {object} req - HTTP request (`body.voice`, `body.messageId`, `user`, optional `config`).
   * @param {object} res - HTTP response.
   * @returns {Promise<void>}
   */
  async streamAudio(req, res) {
    res.setHeader('Content-Type', 'audio/mpeg');

    let shouldContinue = true;

    // Registered before any awaiting so an early disconnect is not missed.
    req.on('close', () => {
      logger.warn('[streamAudio] Audio Stream Request closed by client');
      shouldContinue = false;
    });

    try {
      const appConfig = req.config ?? (await getAppConfig({ role: req.user?.role }));
      const provider = this.getProvider(appConfig);
      const ttsSchema = appConfig?.speech?.tts?.[provider];
      const voice = await this.getVoice(ttsSchema, req.body.voice);

      const processChunks = createChunkProcessor(req.user.id, req.body.messageId);

      while (shouldContinue) {
        const updates = await processChunks();
        // A string result is treated as an error message from the chunk processor.
        if (typeof updates === 'string') {
          logger.error(`Error processing audio stream updates: ${updates}`);
          return res.status(500).end();
        }

        if (updates.length === 0) {
          // Nothing new yet; wait before polling again.
          await new Promise((resolve) => setTimeout(resolve, 1250));
          continue;
        }

        for (const update of updates) {
          try {
            const response = await this.ttsRequest(provider, ttsSchema, {
              voice,
              input: update.text,
              stream: true,
            });

            if (!shouldContinue) {
              // Client left while the request was in flight; drop the audio.
              response.data?.destroy?.();
              break;
            }

            logger.debug(`[streamAudio] user: ${req?.user?.id} | writing audio stream`);
            const delivered = await this.pipeAudio(response.data, res, update.isFinished);

            if (!delivered) {
              return;
            }
            if (update.isFinished) {
              shouldContinue = false;
              break;
            }
          } catch (innerError) {
            logAxiosError({
              message: `[TTS] Error processing audio stream update: ${update?.text?.substring(0, 50)}...`,
              error: innerError,
            });
            this.endWithError(res);
            return;
          }
        }
      }

      if (!res.writableEnded) {
        res.end();
      }
    } catch (error) {
      logAxiosError({ message: '[TTS] Failed to fetch audio:', error });
      this.endWithError(res);
    }
  }
}
|
|
| |
| |
| |
| |
| |
/**
 * Factory for the TTS service.
 * @returns {Promise<TTSService>} Resolves with the instance produced by `TTSService.getInstance()`.
 */
async function createTTSService() {
  const instance = await TTSService.getInstance();
  return instance;
}
|
|
| |
| |
| |
| |
| |
| |
| |
/**
 * Route-level entry point for text-to-speech: creates a service and lets it
 * process the request.
 * @param {object} req - HTTP request, forwarded to `processTextToSpeech`.
 * @param {object} res - HTTP response, forwarded to `processTextToSpeech`.
 * @returns {Promise<void>}
 */
async function textToSpeech(req, res) {
  const service = await createTTSService();
  await service.processTextToSpeech(req, res);
}
|
|
| |
| |
| |
| |
| |
| |
| |
/**
 * Route-level entry point for audio streaming: creates a service and lets it
 * stream audio for the request.
 * @param {object} req - HTTP request, forwarded to the service's `streamAudio`.
 * @param {object} res - HTTP response, forwarded to the service's `streamAudio`.
 * @returns {Promise<void>}
 */
async function streamAudio(req, res) {
  const service = await createTTSService();
  await service.streamAudio(req, res);
}
|
|
| |
| |
| |
| |
| |
| |
/**
 * Resolves the configured TTS provider through a freshly created service.
 * @param {object} appConfig - Application config, forwarded to the service's `getProvider`.
 * @returns {Promise<string>} Whatever the service's `getProvider` returns.
 */
async function getProvider(appConfig) {
  const service = await createTTSService();
  const provider = service.getProvider(appConfig);
  return provider;
}
|
|
| module.exports = { |
| textToSpeech, |
| streamAudio, |
| getProvider, |
| }; |
|
|