| const axios = require('axios'); |
| const fs = require('fs').promises; |
| const FormData = require('form-data'); |
| const { Readable } = require('stream'); |
| const { logger } = require('@librechat/data-schemas'); |
| const { HttpsProxyAgent } = require('https-proxy-agent'); |
| const { genAzureEndpoint, logAxiosError } = require('@librechat/api'); |
| const { extractEnvVariable, STTProviders } = require('librechat-data-provider'); |
| const { getAppConfig } = require('~/server/services/Config'); |
|
|
| |
| |
| |
| |
/**
 * Maps audio MIME types to the file extensions expected by the STT APIs.
 * Frozen so the shared lookup table cannot be mutated at runtime.
 */
const MIME_TO_EXTENSION_MAP = Object.freeze({
  // MP4 container audio (OpenAI expects the `m4a` extension)
  'audio/mp4': 'm4a',
  'audio/x-m4a': 'm4a',
  // Ogg container variants
  'audio/ogg': 'ogg',
  'audio/vorbis': 'ogg',
  'application/ogg': 'ogg',
  // WAV variants
  'audio/wav': 'wav',
  'audio/x-wav': 'wav',
  'audio/wave': 'wav',
  // MP3 variants
  'audio/mp3': 'mp3',
  'audio/mpeg': 'mp3',
  'audio/mpeg3': 'mp3',
  // WebM (default browser MediaRecorder output)
  'audio/webm': 'webm',
  // FLAC variants
  'audio/flac': 'flac',
  'audio/x-flac': 'flac',
});
|
|
| |
| |
| |
| |
| |
/**
 * Validates and normalizes a language value for the STT request.
 * Accepts ISO-639-1 codes with an optional region suffix ("en", "en-US",
 * case-insensitive) and returns the bare two-letter language part.
 * Anything else is logged and dropped so the request proceeds without a
 * language parameter.
 * @param {string} language - Raw language value from the client.
 * @returns {string|null} Two-letter lowercase language code, or null when
 *   the input is absent or invalid.
 */
function getValidatedLanguageCode(language) {
  try {
    if (!language) {
      return null;
    }

    const lowered = language.toLowerCase();
    if (/^[a-z]{2}(-[a-z]{2})?$/.test(lowered)) {
      // Providers only take the language part, so strip any region suffix.
      const [languagePart] = lowered.split('-');
      return languagePart;
    }

    logger.warn(
      `[STT] Invalid language format "${language}". Expected ISO-639-1 locale code like "en-US" or "en". Skipping language parameter.`,
    );
    return null;
  } catch (error) {
    logger.error(`[STT] Error validating language code "${language}":`, error);
    return null;
  }
}
|
|
| |
| |
| |
| |
| |
/**
 * Derives a file extension from an audio MIME type.
 * The MIME type is normalized (lowercased, parameters such as
 * ";codecs=opus" stripped) before the table lookup, so MediaRecorder-style
 * types like "audio/webm;codecs=opus" or uppercase variants hit the map
 * directly instead of falling through to the substring heuristics — e.g.
 * "audio/flac;codecs=flac" previously missed every fallback and returned
 * the wrong "webm" default.
 * @param {string} mimeType - The audio MIME type (may include parameters).
 * @returns {string} A file extension; defaults to 'webm' when unknown.
 */
function getFileExtensionFromMime(mimeType) {
  if (!mimeType) {
    return 'webm';
  }

  // Normalize: lowercase and drop any MIME parameters after ';'.
  const normalized = mimeType.toLowerCase().split(';')[0].trim();

  // Fast path: exact match in the lookup table.
  const extension = MIME_TO_EXTENSION_MAP[normalized];
  if (extension) {
    return extension;
  }

  const subtype = normalized.split('/')[1];

  // Known subtype that maps 1:1 to an extension (mp4 audio becomes m4a).
  if (['mp3', 'mp4', 'ogg', 'wav', 'webm', 'm4a', 'flac'].includes(subtype)) {
    return subtype === 'mp4' ? 'm4a' : subtype;
  }

  // Substring heuristics for vendor-prefixed or otherwise unusual subtypes.
  if (subtype?.includes('mp4') || subtype?.includes('m4a')) {
    return 'm4a';
  }
  if (subtype?.includes('ogg')) {
    return 'ogg';
  }
  if (subtype?.includes('wav')) {
    return 'wav';
  }
  if (subtype?.includes('mp3') || subtype?.includes('mpeg')) {
    return 'mp3';
  }
  if (subtype?.includes('webm')) {
    return 'webm';
  }

  // Unknown type: fall back to the most common browser recording format.
  return 'webm';
}
|
|
| |
| |
| |
| |
/**
 * Service class for handling Speech-to-Text (STT) requests.
 * Exactly one provider (OpenAI or Azure OpenAI) must be configured under
 * `speech.stt` in the custom config (librechat.yaml).
 */
class STTService {
  constructor() {
    /** Maps each supported provider key to the method that builds its request. */
    this.providerStrategies = {
      [STTProviders.OPENAI]: this.openAIProvider,
      [STTProviders.AZURE_OPENAI]: this.azureOpenAIProvider,
    };
  }

  /**
   * Creates a new STTService instance.
   * @returns {Promise<STTService>} A promise resolving to an STTService instance.
   */
  static async getInstance() {
    return new STTService();
  }

  /**
   * Retrieves the configured STT provider and its schema from the app config.
   * @param {object} req - Express request object; `req.config` is preferred,
   *   otherwise the app config is loaded for the requesting user's role.
   * @returns {Promise<[string, object]>} A `[providerName, schema]` tuple.
   * @throws {Error} If no STT schema is configured, or if zero or more than
   *   one provider has a non-empty configuration.
   */
  async getProviderSchema(req) {
    const appConfig =
      req.config ??
      (await getAppConfig({
        role: req?.user?.role,
      }));
    const sttSchema = appConfig?.speech?.stt;
    if (!sttSchema) {
      throw new Error(
        'No STT schema is set. Did you configure STT in the custom config (librechat.yaml)?',
      );
    }

    // Only providers with at least one configured key count as "set".
    const providers = Object.entries(sttSchema).filter(
      ([, value]) => Object.keys(value).length > 0,
    );

    if (providers.length !== 1) {
      throw new Error(
        providers.length > 1
          ? 'Multiple providers are set. Please set only one provider.'
          : 'No provider is set. Please set a provider.',
      );
    }

    const [provider, schema] = providers[0];
    return [provider, schema];
  }

  /**
   * Recursively deletes `undefined` properties from an object in place, and
   * prunes any nested objects left empty by that removal.
   * @param {object} obj - The object to clean (mutated).
   */
  removeUndefined(obj) {
    Object.keys(obj).forEach((key) => {
      if (obj[key] && typeof obj[key] === 'object') {
        this.removeUndefined(obj[key]);
        if (Object.keys(obj[key]).length === 0) {
          delete obj[key];
        }
      } else if (obj[key] === undefined) {
        delete obj[key];
      }
    });
  }

  /**
   * Builds the request parts for the OpenAI transcription endpoint.
   * Relies on axios' automatic multipart serialization: posting a plain
   * object with a `multipart/form-data` Content-Type turns `data` into
   * form fields, naming the file from the stream's `path`.
   * @param {object} sttSchema - Provider schema (`url`, `apiKey`, `model`).
   * @param {import('stream').Readable} audioReadStream - Named audio stream.
   * @param {object} audioFile - Audio file metadata (unused by this provider).
   * @param {string} language - Requested transcription language (optional).
   * @returns {[string, object, object]} `[url, data, headers]` for axios.
   */
  openAIProvider(sttSchema, audioReadStream, audioFile, language) {
    const url = sttSchema?.url || 'https://api.openai.com/v1/audio/transcriptions';
    const apiKey = extractEnvVariable(sttSchema.apiKey) || '';

    const data = {
      file: audioReadStream,
      model: sttSchema.model,
    };

    const validLanguage = getValidatedLanguageCode(language);
    if (validLanguage) {
      data.language = validLanguage;
    }

    const headers = {
      'Content-Type': 'multipart/form-data',
      ...(apiKey && { Authorization: `Bearer ${apiKey}` }),
    };
    // Call directly instead of `[headers].forEach(this.removeUndefined)`:
    // passing the method unbound loses `this`, so the recursive
    // `this.removeUndefined(...)` call would crash on nested objects.
    this.removeUndefined(headers);

    return [url, data, headers];
  }

  /**
   * Builds the request parts for the Azure OpenAI transcription endpoint.
   * @param {object} sttSchema - Azure schema (`instanceName`, `deploymentName`,
   *   `apiVersion`, `apiKey`; values may be env-variable references).
   * @param {Buffer} audioBuffer - Raw audio bytes.
   * @param {object} audioFile - Metadata: `originalname`, `mimetype`, `size`.
   * @param {string} language - Requested transcription language (optional).
   * @returns {[string, FormData, object]} `[url, formData, headers]` for axios.
   * @throws {Error} If the audio exceeds 25MB or the format is not accepted.
   */
  azureOpenAIProvider(sttSchema, audioBuffer, audioFile, language) {
    const url = `${genAzureEndpoint({
      azureOpenAIApiInstanceName: extractEnvVariable(sttSchema?.instanceName),
      azureOpenAIApiDeploymentName: extractEnvVariable(sttSchema?.deploymentName),
    })}/audio/transcriptions?api-version=${extractEnvVariable(sttSchema?.apiVersion)}`;

    const apiKey = sttSchema.apiKey ? extractEnvVariable(sttSchema.apiKey) : '';

    if (audioBuffer.byteLength > 25 * 1024 * 1024) {
      throw new Error('The audio file size exceeds the limit of 25MB');
    }

    // NOTE(review): vendor-prefixed subtypes (e.g. "audio/x-wav") are rejected
    // here even though the underlying format may be accepted — confirm intent.
    const acceptedFormats = ['flac', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'ogg', 'wav', 'webm'];
    const fileFormat = audioFile.mimetype.split('/')[1];
    if (!acceptedFormats.includes(fileFormat)) {
      throw new Error(`The audio file format ${fileFormat} is not accepted`);
    }

    const formData = new FormData();
    formData.append('file', audioBuffer, {
      filename: audioFile.originalname,
      contentType: audioFile.mimetype,
    });

    const validLanguage = getValidatedLanguageCode(language);
    if (validLanguage) {
      formData.append('language', validLanguage);
    }

    const headers = {
      ...(apiKey && { 'api-key': apiKey }),
    };

    this.removeUndefined(headers);

    return [url, formData, { ...headers, ...formData.getHeaders() }];
  }

  /**
   * Sends an STT request to the configured provider and returns the transcript.
   * @param {string} provider - Provider key (see STTProviders).
   * @param {object} sttSchema - The provider's configuration schema.
   * @param {object} request
   * @param {Buffer} request.audioBuffer - Raw audio bytes.
   * @param {object} request.audioFile - Metadata: `originalname`, `mimetype`, `size`.
   * @param {string} [request.language] - Requested transcription language.
   * @returns {Promise<string>} The trimmed transcription text.
   * @throws {Error} On an unknown provider, a non-200 response, or a response
   *   missing the `text` field.
   */
  async sttRequest(provider, sttSchema, { audioBuffer, audioFile, language }) {
    const strategy = this.providerStrategies[provider];
    if (!strategy) {
      throw new Error('Invalid provider');
    }

    const fileExtension = getFileExtensionFromMime(audioFile.mimetype);

    // axios' multipart serializer reads `path` to name the uploaded file.
    const audioReadStream = Readable.from(audioBuffer);
    audioReadStream.path = `audio.${fileExtension}`;

    // The Azure strategy builds its own FormData from the raw Buffer and
    // checks `byteLength` — which a Readable stream does not have, so passing
    // the stream (as before) silently disabled the 25MB size check. OpenAI
    // takes the named stream.
    const audioInput = provider === STTProviders.AZURE_OPENAI ? audioBuffer : audioReadStream;

    const [url, data, headers] = strategy.call(this, sttSchema, audioInput, audioFile, language);

    const options = { headers };

    if (process.env.PROXY) {
      options.httpsAgent = new HttpsProxyAgent(process.env.PROXY);
    }

    try {
      const response = await axios.post(url, data, options);

      if (response.status !== 200) {
        throw new Error('Invalid response from the STT API');
      }

      if (!response.data || !response.data.text) {
        throw new Error('Missing data in response from the STT API');
      }

      return response.data.text.trim();
    } catch (error) {
      logAxiosError({ message: `STT request failed for provider ${provider}:`, error });
      throw error;
    }
  }

  /**
   * Express handler body: reads the uploaded temp file, transcribes it, and
   * responds with `{ text }`. Always deletes the temp file afterwards.
   * @param {object} req - Express request with a multer-populated `req.file`.
   * @param {object} res - Express response.
   * @returns {Promise<void>}
   */
  async processSpeechToText(req, res) {
    if (!req.file) {
      return res.status(400).json({ message: 'No audio file provided in the FormData' });
    }

    try {
      // Read inside the try so a failed read responds 500 instead of leaking
      // an unhandled rejection, and still reaches the cleanup in `finally`.
      const audioBuffer = await fs.readFile(req.file.path);
      const audioFile = {
        originalname: req.file.originalname,
        mimetype: req.file.mimetype,
        size: req.file.size,
      };
      const [provider, sttSchema] = await this.getProviderSchema(req);
      const language = req.body?.language || '';
      const text = await this.sttRequest(provider, sttSchema, { audioBuffer, audioFile, language });
      res.json({ text });
    } catch (error) {
      logAxiosError({ message: 'An error occurred while processing the audio:', error });
      res.sendStatus(500);
    } finally {
      try {
        await fs.unlink(req.file.path);
        logger.debug('[/speech/stt] Temp. audio upload file deleted');
      } catch {
        logger.debug('[/speech/stt] Temp. audio upload file already deleted');
      }
    }
  }
}
|
|
| |
| |
| |
| |
| |
/**
 * Factory wrapper around STTService.getInstance().
 * @returns {Promise<STTService>} A promise resolving to an STTService instance.
 */
async function createSTTService() {
  const service = await STTService.getInstance();
  return service;
}
|
|
| |
| |
| |
| |
| |
| |
| |
/**
 * Express route handler for speech-to-text: transcribes the uploaded audio
 * file and responds with `{ text }` on success.
 * @param {object} req - Express request (multer populates `req.file`).
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
async function speechToText(req, res) {
  const service = await createSTTService();
  await service.processSpeechToText(req, res);
}
|
|
// Public API: the service class (for direct use/testing) and the Express route handler.
module.exports = { STTService, speechToText };
|
|