File size: 3,120 Bytes
/**
* Shared TTS utilities used by both client-side and server-side generation.
*/
import type { TTSProviderId } from './types';
import type { Action, SpeechAction } from '@/lib/types/action';
import { createLogger } from '@/lib/logger';
// Module-level logger created via createLogger('TTS'); used by
// splitLongSpeechActions to report when a speech action is split.
const log = createLogger('TTS');
/**
 * Provider-specific max text length limits.
 *
 * Keys are TTS provider ids; values are the longest text a single request may
 * carry. Within this module the limit is compared against `string.length`, so
 * it is counted in UTF-16 code units.
 *
 * The map is partial: splitLongSpeechActions returns its input unchanged for
 * any provider that has no entry here.
 */
export const TTS_MAX_TEXT_LENGTH: Partial<Record<TTSProviderId, number>> = {
'glm-tts': 1024,
};
/**
 * Split long text into chunks no longer than `maxLength`, preferring natural
 * boundaries over arbitrary cuts.
 *
 * Strategy, in order of preference:
 *  1. Split after sentence-ending punctuation (CJK and ASCII) or a newline.
 *  2. If a single sentence is still too long, split after clause-level commas.
 *  3. As a last resort, hard-split at `maxLength` (never inside a surrogate
 *     pair unless `maxLength` is 1).
 *
 * Adjacent pieces are greedily merged back together while the merged chunk
 * still fits. Sentence-level pieces are trimmed before merging, so whitespace
 * and newlines between sentences are not preserved inside a merged chunk.
 *
 * @param text - The text to split. Leading/trailing whitespace is trimmed.
 * @param maxLength - Maximum chunk length in UTF-16 code units. Fractional
 *   values are floored. If the value is not at least 1 (zero, negative, NaN)
 *   no splitting is attempted and the trimmed text is returned as one chunk.
 * @returns The chunks in original order. Text that already fits (including
 *   empty text) is returned as a single-element array.
 */
export function splitLongSpeechText(text: string, maxLength: number): string[] {
  // Sentence enders: 。 ！ ？ ! ? ； ; ： : and newline. Full-width forms are
  // written as escapes so they survive width-normalising tools.
  const sentenceBoundary = /(?<=[\u3002\uFF01\uFF1F!?\uFF1B;\uFF1A:\n])/u;
  // Clause separators: ， , 、
  const clauseBoundary = /(?<=[\uFF0C,\u3001])/u;

  const normalized = text.trim();
  const limit = Math.floor(maxLength);
  // A limit below 1 (or NaN) can never be satisfied; the hard-split loop would
  // make no progress, so bail out with the text unsplit.
  if (!normalized || !(limit >= 1) || normalized.length <= limit) return [normalized];

  // `normalized` is non-empty and trimmed, so at least one unit survives.
  const units = normalized
    .split(sentenceBoundary)
    .map((part) => part.trim())
    .filter(Boolean);

  const chunks: string[] = [];
  let current = '';

  const pushChunk = (value: string): void => {
    const trimmed = value.trim();
    if (trimmed) chunks.push(trimmed);
  };

  // Greedily grow `current`; flush it once the next unit would overflow.
  const appendUnit = (unit: string): void => {
    if (!current) {
      current = unit;
      return;
    }
    if ((current + unit).length <= limit) {
      current += unit;
      return;
    }
    pushChunk(current);
    current = unit;
  };

  const isHighSurrogate = (code: number): boolean => code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code: number): boolean => code >= 0xdc00 && code <= 0xdfff;

  // Break an over-long unit at clause punctuation, or by raw length if it has none.
  const hardSplitUnit = (unit: string): void => {
    const parts = unit.split(clauseBoundary).filter(Boolean);
    if (parts.length > 1) {
      // Every part is strictly shorter than `unit`, so the recursion terminates.
      for (const part of parts) {
        if (part.length <= limit) appendUnit(part);
        else hardSplitUnit(part);
      }
      return;
    }

    let start = 0;
    while (start < unit.length) {
      let end = Math.min(start + limit, unit.length);
      // Avoid cutting between the two halves of a surrogate pair; only possible
      // when the slice has room to shrink and still make progress.
      if (
        end < unit.length &&
        end - start > 1 &&
        isHighSurrogate(unit.charCodeAt(end - 1)) &&
        isLowSurrogate(unit.charCodeAt(end))
      ) {
        end -= 1;
      }
      appendUnit(unit.slice(start, end));
      start = end;
    }
  };

  for (const unit of units) {
    if (unit.length <= limit) appendUnit(unit);
    else hardSplitUnit(unit);
  }
  pushChunk(current);
  return chunks;
}
/**
 * Expand over-long speech actions into several shorter ones so that every
 * speech text fits the given provider's limit from TTS_MAX_TEXT_LENGTH.
 *
 * Non-speech actions, speech actions without text, and speech actions whose
 * text already fits are passed through untouched. A speech action that is
 * split is replaced, in place, by one action per chunk; each replacement
 * copies the original's fields except `audioId`, carries the chunk as `text`,
 * and gets the id `<original id>_tts_<1-based index>`.
 *
 * @param actions - Actions to inspect; the array is not mutated.
 * @param providerId - Provider whose length limit applies.
 * @returns The very same `actions` array when the provider has no limit or
 *   nothing needed splitting; otherwise a new array with the expanded actions.
 */
export function splitLongSpeechActions(actions: Action[], providerId: TTSProviderId): Action[] {
  const maxLength = TTS_MAX_TEXT_LENGTH[providerId];
  if (!maxLength) return actions;

  const expanded: Action[] = [];
  let anySplit = false;

  for (const action of actions) {
    if (action.type === 'speech' && action.text && action.text.length > maxLength) {
      const pieces = splitLongSpeechText(action.text, maxLength);

      if (pieces.length > 1) {
        // Drop audioId so each piece is not tied to the original's audio.
        const { audioId: _audioId, ...shared } = action as SpeechAction;
        log.info(
          `Split speech for ${providerId}: action=${action.id}, len=${action.text.length}, chunks=${pieces.length}`,
        );
        pieces.forEach((piece, index) => {
          expanded.push({
            ...shared,
            id: `${action.id}_tts_${index + 1}`,
            text: piece,
          });
        });
        anySplit = true;
        continue;
      }
    }

    // Nothing to split for this action: keep the original object.
    expanded.push(action);
  }

  // Preserve array identity when no action was changed.
  return anySplit ? expanded : actions;
}
|