/**
 * Shared TTS utilities used by both client-side and server-side generation.
 */

import type { TTSProviderId } from './types';
import type { Action, SpeechAction } from '@/lib/types/action';
import { createLogger } from '@/lib/logger';

const log = createLogger('TTS');

/**
 * Provider-specific max text length limits.
 *
 * Values are compared against `String.prototype.length` (UTF-16 code units).
 * Providers without an entry are never split.
 */
export const TTS_MAX_TEXT_LENGTH: Partial<Record<TTSProviderId, number>> = {
  'glm-tts': 1024,
};

/**
 * Zero-width split point after sentence-level punctuation: ideographic full
 * stop, fullwidth and ASCII `!`, `?`, `;`, `:`, and line feed.
 *
 * Fullwidth forms are written as escapes so the pattern survives tooling that
 * normalises fullwidth characters to their ASCII look-alikes.
 */
const SENTENCE_BOUNDARY = /(?<=[\u3002\uFF01\uFF1F!?\uFF1B;\uFF1A:\n])/u;

/**
 * Zero-width split point after clause-level punctuation: fullwidth comma,
 * ASCII comma, and ideographic comma.
 */
const CLAUSE_BOUNDARY = /(?<=[\uFF0C,\u3001])/u;

/**
 * Split long text into chunks that respect sentence boundaries.
 * Tries splitting at sentence-ending punctuation first, then clause-level
 * punctuation, and finally hard-splits at maxLength as a last resort.
 *
 * Sentence units are trimmed before being packed back together, so whitespace
 * that followed a sentence boundary is not preserved inside a chunk.
 *
 * @param text - Text to split; leading/trailing whitespace is trimmed first.
 * @param maxLength - Maximum chunk length in UTF-16 code units. Fractional
 *   values are floored. If the resulting limit is below 1 or not a number,
 *   no splitting is attempted.
 * @returns The chunks in order. When the trimmed text already fits (or is
 *   empty, or the limit is unusable) the result is a single-element array
 *   holding the trimmed text.
 */
export function splitLongSpeechText(text: string, maxLength: number): string[] {
  const normalized = text.trim();
  if (!normalized || normalized.length <= maxLength) return [normalized];

  // A limit below one code unit can never be satisfied and would keep the
  // hard-split loop from advancing; NaN would silently drop the text.
  const limit = Math.floor(maxLength);
  if (!Number.isFinite(limit) || limit < 1) return [normalized];

  const chunks: string[] = [];
  let current = '';

  const pushChunk = (value: string): void => {
    const trimmed = value.trim();
    if (trimmed) chunks.push(trimmed);
  };

  // Greedily pack units into `current`, flushing it once the next unit would
  // push it over the limit.
  const appendUnit = (unit: string): void => {
    if (!current) {
      current = unit;
      return;
    }
    if ((current + unit).length <= limit) {
      current += unit;
      return;
    }
    pushChunk(current);
    current = unit;
  };

  const hardSplitUnit = (unit: string): void => {
    const parts = unit.split(CLAUSE_BOUNDARY).filter(Boolean);
    if (parts.length > 1) {
      // Each part holds at most one clause mark (at its end), so recursing on
      // an oversized part falls through to the fixed-width split below.
      for (const part of parts) {
        if (part.length <= limit) appendUnit(part);
        else hardSplitUnit(part);
      }
      return;
    }

    let start = 0;
    while (start < unit.length) {
      let end = Math.min(start + limit, unit.length);
      if (end < unit.length && end - start > 1) {
        const before = unit.charCodeAt(end - 1);
        const after = unit.charCodeAt(end);
        const cutsSurrogatePair =
          before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
        // Back off one code unit rather than emit two lone surrogates.
        if (cutsSurrogatePair) end -= 1;
      }
      appendUnit(unit.slice(start, end));
      start = end;
    }
  };

  // `normalized` is non-empty after trimming, so at least one unit survives
  // the trim + filter below.
  const units = normalized
    .split(SENTENCE_BOUNDARY)
    .map((part) => part.trim())
    .filter(Boolean);

  for (const unit of units) {
    if (unit.length <= limit) appendUnit(unit);
    else hardSplitUnit(unit);
  }
  pushChunk(current);

  return chunks;
}

/**
 * Split long speech actions into multiple shorter actions so each stays
 * within the TTS provider's text length limit. Each sub-action gets its
 * own independent audio file — no byte concatenation needed.
 *
 * @param actions - Actions to inspect; non-speech actions and speech actions
 *   whose text already fits are passed through untouched.
 * @param providerId - Provider whose entry in `TTS_MAX_TEXT_LENGTH` is used.
 * @returns The original `actions` array (same reference) when the provider
 *   has no limit or nothing needed splitting; otherwise a new array in which
 *   each oversized speech action is replaced by copies with ids
 *   `<id>_tts_1`, `<id>_tts_2`, ... and one chunk of text each.
 */
export function splitLongSpeechActions(actions: Action[], providerId: TTSProviderId): Action[] {
  const maxLength = TTS_MAX_TEXT_LENGTH[providerId];
  if (!maxLength) return actions;

  let didSplit = false;
  const nextActions: Action[] = actions.flatMap((action) => {
    if (action.type !== 'speech' || !action.text || action.text.length <= maxLength) {
      return [action];
    }

    const chunks = splitLongSpeechText(action.text, maxLength);
    if (chunks.length <= 1) return [action];

    didSplit = true;
    // The copies are built without the original `audioId` — presumably so each
    // chunk is assigned its own audio rather than pointing at the old one.
    const { audioId: _audioId, ...baseAction } = action as SpeechAction;
    log.info(
      `Split speech for ${providerId}: action=${action.id}, len=${action.text.length}, chunks=${chunks.length}`,
    );
    return chunks.map((chunk, i) => ({
      ...baseAction,
      id: `${action.id}_tts_${i + 1}`,
      text: chunk,
    }));
  });

  return didSplit ? nextActions : actions;
}