/** * Single TTS Generation API * * Generates TTS audio for a single text string and returns base64-encoded audio. * Called by the client in parallel for each speech action after a scene is generated. * * POST /api/generate/tts */ import { NextRequest } from 'next/server'; import { generateTTS } from '@/lib/audio/tts-providers'; import { resolveTTSApiKey, resolveTTSBaseUrl } from '@/lib/server/provider-config'; import type { TTSProviderId } from '@/lib/audio/types'; import { createLogger } from '@/lib/logger'; import { apiError, apiSuccess } from '@/lib/server/api-response'; import { validateUrlForSSRF } from '@/lib/server/ssrf-guard'; import { VOXCPM_AUTO_VOICE_ID, VOXCPM_TTS_PROVIDER_ID } from '@/lib/audio/voxcpm'; const log = createLogger('TTS API'); export const maxDuration = 30; export async function POST(req: NextRequest) { let ttsProviderId: string | undefined; let ttsVoice: string | undefined; let audioId: string | undefined; try { const body = await req.json(); const { text, ttsModelId, ttsSpeed, ttsApiKey, ttsBaseUrl, ttsProviderOptions } = body as { text: string; audioId: string; ttsProviderId: TTSProviderId; ttsModelId?: string; ttsVoice: string; ttsSpeed?: number; ttsApiKey?: string; ttsBaseUrl?: string; ttsProviderOptions?: Record; }; ttsProviderId = body.ttsProviderId; ttsVoice = body.ttsVoice; audioId = body.audioId; // Validate required fields if (!text || !audioId || !ttsProviderId || !ttsVoice) { return apiError( 'MISSING_REQUIRED_FIELD', 400, 'Missing required fields: text, audioId, ttsProviderId, ttsVoice', ); } // Reject browser-native TTS — must be handled client-side if (ttsProviderId === 'browser-native-tts') { return apiError('INVALID_REQUEST', 400, 'browser-native-tts must be handled client-side'); } const voxcpmVoicePrompt = typeof ttsProviderOptions?.voicePrompt === 'string' ? ttsProviderOptions.voicePrompt : ''; if ( ttsProviderId === VOXCPM_TTS_PROVIDER_ID && ttsVoice === VOXCPM_AUTO_VOICE_ID && !voxcpmVoicePrompt.trim() ) { return apiError( 'VOXCPM_AUTO_VOICE_REQUIRES_CONTEXT', 400, 'VoxCPM Auto Voice requires agent context', ); } const clientBaseUrl = ttsBaseUrl || undefined; if (clientBaseUrl) { const ssrfError = await validateUrlForSSRF(clientBaseUrl); if (ssrfError) { return apiError('INVALID_URL', 403, ssrfError); } } const apiKey = clientBaseUrl ? ttsApiKey || '' : resolveTTSApiKey(ttsProviderId, ttsApiKey || undefined); const baseUrl = clientBaseUrl ? clientBaseUrl : resolveTTSBaseUrl(ttsProviderId, ttsBaseUrl || undefined); // Build TTS config const config = { providerId: ttsProviderId as TTSProviderId, modelId: ttsModelId, voice: ttsVoice, speed: ttsSpeed ?? 1.0, apiKey, baseUrl, providerOptions: ttsProviderOptions, }; log.info( `Generating TTS: provider=${ttsProviderId}, model=${ttsModelId || 'default'}, voice=${ttsVoice}, audioId=${audioId}, textLen=${text.length}`, ); // Generate audio const { audio, format } = await generateTTS(config, text); // Convert to base64 const base64 = Buffer.from(audio).toString('base64'); return apiSuccess({ audioId, base64, format }); } catch (error) { log.error( `TTS generation failed [provider=${ttsProviderId ?? 'unknown'}, voice=${ttsVoice ?? 'unknown'}, audioId=${audioId ?? 'unknown'}]:`, error, ); return apiError( 'GENERATION_FAILED', 500, error instanceof Error ? error.message : String(error), ); } }