|
|
|
|
| import { useCallback, useEffect, useRef } from 'react'; |
| import { useSettingsStore } from '@/lib/store/settings'; |
| import { useBrowserTTS } from '@/lib/hooks/use-browser-tts'; |
| import { |
| resolveAgentVoice, |
| getAvailableProvidersWithVoices, |
| type ResolvedVoice, |
| } from '@/lib/audio/voice-resolver'; |
| import { getVoxCPMProviderOptions, useVoxCPMVoiceProfiles } from '@/lib/audio/voxcpm-voices'; |
| import type { AgentConfig } from '@/lib/orchestration/registry/types'; |
| import type { TTSProviderId } from '@/lib/audio/types'; |
| import type { AudioIndicatorState } from '@/components/roundtable/audio-indicator'; |
| import { useI18n } from '@/lib/hooks/use-i18n'; |
|
|
/** Inputs accepted by {@link useDiscussionTTS}. */
interface DiscussionTTSOptions {
  /** When false, sealed segments are ignored and any queued segments are dropped. */
  enabled: boolean;
  /** Agents taking part in the discussion; looked up by id to resolve each speaker's voice. */
  agents: AgentConfig[];
  /**
   * Optional listener invoked whenever the audio state changes.
   * `agentId` is the speaker the state applies to, or `null` when no specific agent is known.
   */
  onAudioStateChange?: (agentId: string | null, state: AudioIndicatorState) => void;
}
|
|
/** One sealed text segment waiting to be spoken, with its voice already resolved. */
interface QueueItem {
  /** Id of the message the segment belongs to. */
  messageId: string;
  /** Id of the message part; sent to the TTS endpoint as `audioId`. */
  partId: string;
  /** Text to synthesize. */
  text: string;
  /** Speaking agent, or `null` when the segment is not attributed to an agent. */
  agentId: string | null;
  /** TTS provider chosen for this segment. */
  providerId: TTSProviderId;
  /** Optional model override; the provider's configured model is used when absent. */
  modelId?: string;
  /** Voice id passed to the provider. */
  voiceId: string;
}
|
|
/** True when `err` carries the `AbortError` name (aborted fetch or interrupted `play()`). */
function isAbortError(err: unknown): boolean {
  return (
    typeof err === 'object' && err !== null && (err as { name?: unknown }).name === 'AbortError'
  );
}

/**
 * Sequential text-to-speech playback for discussion messages.
 *
 * Sealed text segments are queued and spoken one at a time, either through the
 * browser speech hook (`browser-native-tts`) or by requesting audio from
 * `/api/generate/tts` and playing it through an `HTMLAudioElement`.
 *
 * @param options.enabled - When false, new segments are ignored and the queue is dropped.
 * @param options.agents - Agents used to resolve a voice per speaker.
 * @param options.onAudioStateChange - Optional listener for `generating` / `playing` / `idle`.
 * @returns
 *  - `handleSegmentSealed(messageId, partId, fullText, agentId)` — enqueue a segment.
 *  - `cleanup()` — abort in-flight work, stop playback and reset all state.
 *  - `pause()` / `resume()` — suspend and continue playback and queue processing.
 *  - `shouldHold()` — `{ holding, segmentDone }`: whether audio is playing or queued, and a
 *    counter incremented each time a segment finishes (successfully or not).
 */
export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: DiscussionTTSOptions) {
  const { locale } = useI18n();
  const ttsProvidersConfig = useSettingsStore((s) => s.ttsProvidersConfig);
  const ttsSpeed = useSettingsStore((s) => s.ttsSpeed);
  const ttsMuted = useSettingsStore((s) => s.ttsMuted);
  const ttsVolume = useSettingsStore((s) => s.ttsVolume);
  const playbackSpeed = useSettingsStore((s) => s.playbackSpeed);

  const globalTtsProviderId = useSettingsStore((s) => s.ttsProviderId);
  const globalTtsVoice = useSettingsStore((s) => s.ttsVoice);
  const { profiles: voxcpmProfiles } = useVoxCPMVoiceProfiles();

  // Playback state lives in refs so that callbacks always observe the latest
  // values without being re-created on every change.
  const queueRef = useRef<QueueItem[]>([]);
  const isPlayingRef = useRef(false);
  const pausedRef = useRef(false);

  // Provider and speaker of the segment most recently taken from the queue.
  const currentProviderRef = useRef<TTSProviderId | null>(null);
  const currentAgentIdRef = useRef<string | null>(null);
  const segmentDoneCounterRef = useRef(0);
  const abortControllerRef = useRef<AbortController | null>(null);
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const onAudioStateChangeRef = useRef(onAudioStateChange);
  onAudioStateChangeRef.current = onAudioStateChange;
  // Indirection so completion handlers always call the newest `processQueue`.
  const processQueueRef = useRef<() => void>(() => {});

  const {
    speak: browserSpeak,
    pause: browserPause,
    resume: browserResume,
    cancel: browserCancel,
  } = useBrowserTTS({
    rate: ttsSpeed,
    onEnd: () => {
      isPlayingRef.current = false;
      segmentDoneCounterRef.current++;
      onAudioStateChangeRef.current?.(null, 'idle');

      if (!pausedRef.current) {
        processQueueRef.current();
      }
    },
  });
  const browserCancelRef = useRef(browserCancel);
  browserCancelRef.current = browserCancel;
  const browserSpeakRef = useRef(browserSpeak);
  browserSpeakRef.current = browserSpeak;
  const browserPauseRef = useRef(browserPause);
  browserPauseRef.current = browserPause;
  const browserResumeRef = useRef(browserResume);
  browserResumeRef.current = browserResume;

  // agent id -> position in `agents`, passed to `resolveAgentVoice` as the agent's index.
  const agentIndexMap = useRef<Map<string, number>>(new Map());
  useEffect(() => {
    const map = new Map<string, number>();
    agents.forEach((agent, i) => map.set(agent.id, i));
    agentIndexMap.current = map;
  }, [agents]);

  /** Marks the current segment as finished and, unless paused, advances the queue. */
  const finishSegment = useCallback((agentId: string | null) => {
    isPlayingRef.current = false;
    segmentDoneCounterRef.current++;
    onAudioStateChangeRef.current?.(agentId, 'idle');
    if (!pausedRef.current) {
      queueMicrotask(() => processQueueRef.current());
    }
  }, []);

  /**
   * Picks provider/voice/model for a speaker.
   * Unknown or missing agents get the first available provider's first voice
   * (or the browser voice when no provider is available); teachers use the
   * globally configured provider and voice; everyone else is delegated to
   * `resolveAgentVoice`.
   */
  const resolveVoiceForAgent = useCallback(
    (agentId: string | null): ResolvedVoice => {
      const providers = getAvailableProvidersWithVoices(ttsProvidersConfig, voxcpmProfiles);

      const agent = agentId ? agents.find((a) => a.id === agentId) : undefined;
      if (!agent || !agentId) {
        const first = providers[0];
        if (first) {
          return { providerId: first.providerId, voiceId: first.voices[0]?.id ?? 'default' };
        }
        return { providerId: 'browser-native-tts', voiceId: 'default' };
      }

      if (agent.role === 'teacher') {
        return {
          providerId: globalTtsProviderId,
          voiceId: globalTtsVoice,
          modelId: ttsProvidersConfig[globalTtsProviderId]?.modelId,
        };
      }
      const index = agentIndexMap.current.get(agentId) ?? 0;
      return resolveAgentVoice(agent, index, providers);
    },
    [agents, ttsProvidersConfig, voxcpmProfiles, globalTtsProviderId, globalTtsVoice],
  );

  /**
   * Starts the next queued segment if nothing is playing and playback is not paused.
   * Completion (or failure) of a segment re-enters this function via `processQueueRef`.
   */
  const processQueue = useCallback(async () => {
    if (pausedRef.current) return;
    if (isPlayingRef.current || queueRef.current.length === 0) return;
    if (!enabled || ttsMuted) {
      queueRef.current = [];
      return;
    }

    const item = queueRef.current.shift();
    if (!item) return;
    isPlayingRef.current = true;
    currentProviderRef.current = item.providerId;
    currentAgentIdRef.current = item.agentId;

    // Browser speech: completion is reported through the hook's `onEnd` callback.
    if (item.providerId === 'browser-native-tts') {
      onAudioStateChangeRef.current?.(item.agentId, 'playing');
      browserSpeakRef.current(item.text, item.voiceId);
      return;
    }

    // Server-side synthesis.
    onAudioStateChangeRef.current?.(item.agentId, 'generating');
    const controller = new AbortController();
    abortControllerRef.current = controller;
    // Set once the element exists, so the catch block can tell "failed before
    // audio was created" apart from "play() was interrupted or rejected".
    let audio: HTMLAudioElement | null = null;

    try {
      const providerConfig = ttsProvidersConfig[item.providerId];
      const agent = item.agentId ? agents.find((a) => a.id === item.agentId) : undefined;
      const providerOptions =
        item.providerId === 'voxcpm-tts'
          ? {
              ...(providerConfig?.providerOptions || {}),
              ...(await getVoxCPMProviderOptions(item.voiceId, {
                agentName: agent?.name,
                role: agent?.role,
                persona: agent?.persona,
                locale,
              })),
            }
          : undefined;
      const res = await fetch('/api/generate/tts', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          text: item.text,
          audioId: item.partId,
          ttsProviderId: item.providerId,
          ttsModelId: item.modelId || providerConfig?.modelId,
          ttsVoice: item.voiceId,
          ttsSpeed: ttsSpeed,
          ttsApiKey: providerConfig?.apiKey,
          ttsBaseUrl:
            providerConfig?.serverBaseUrl ||
            providerConfig?.baseUrl ||
            providerConfig?.customDefaultBaseUrl,
          ttsProviderOptions: providerOptions,
        }),
        signal: controller.signal,
      });

      if (!res.ok) throw new Error(`TTS API error: ${res.status}`);

      const data = await res.json();
      if (!data.base64) throw new Error('No audio in response');

      // cleanup() ran while we were waiting: it already reset all state.
      if (controller.signal.aborted) return;

      const audioEl = new Audio(`data:audio/${data.format || 'mp3'};base64,${data.base64}`);
      audio = audioEl;
      audioEl.playbackRate = playbackSpeed;
      audioEl.volume = ttsMuted ? 0 : ttsVolume;
      audioRef.current = audioEl;

      // 'ended' and 'error' finish the segment the same way. The identity check
      // ignores events from an element that is no longer current (e.g. the
      // 'error' fired after cleanup() clears `src`, or a segment that the catch
      // block below has already finalized).
      const onFinished = () => {
        if (audioRef.current !== audioEl) return;
        audioRef.current = null;
        finishSegment(item.agentId);
      };
      audioEl.addEventListener('ended', onFinished);
      audioEl.addEventListener('error', onFinished);

      onAudioStateChangeRef.current?.(item.agentId, 'playing');

      // Paused while generating: keep the element ready; resume() will play it.
      if (pausedRef.current) {
        audioEl.pause();
        return;
      }

      await audioEl.play();
    } catch (err) {
      // Aborted by cleanup(): state was already reset, and may by now belong to
      // a newer segment — do not touch it.
      if (controller.signal.aborted) return;

      const interrupted = isAbortError(err);

      // pause() interrupted a pending play(): the element is intact and still
      // current, so leave it for resume() instead of dropping the segment.
      if (interrupted && audio !== null && audioRef.current === audio && pausedRef.current) {
        return;
      }

      if (!interrupted) {
        console.error('[DiscussionTTS] TTS generation failed:', err);
      }

      // The element's 'error' listener already finalized this segment.
      if (audio !== null && audioRef.current !== audio) return;

      audioRef.current = null;
      finishSegment(item.agentId);
    }
  }, [
    agents,
    enabled,
    locale,
    ttsMuted,
    ttsVolume,
    ttsProvidersConfig,
    ttsSpeed,
    playbackSpeed,
    finishSegment,
  ]);

  processQueueRef.current = processQueue;

  /**
   * Enqueues a finished text segment for speech and starts playback when idle.
   * Blank text is ignored, as is everything while TTS is disabled or muted.
   */
  const handleSegmentSealed = useCallback(
    (messageId: string, partId: string, fullText: string, agentId: string | null) => {
      if (!enabled || ttsMuted || !fullText.trim()) return;

      const { providerId, modelId, voiceId } = resolveVoiceForAgent(agentId);
      queueRef.current.push({
        messageId,
        partId,
        text: fullText,
        agentId,
        providerId,
        modelId,
        voiceId,
      });

      if (!isPlayingRef.current) {
        processQueueRef.current();
      } else if (providerId !== 'browser-native-tts') {
        // Something else is still playing; report this speaker as pending.
        onAudioStateChangeRef.current?.(agentId, 'generating');
      }
    },
    [enabled, ttsMuted, resolveVoiceForAgent],
  );

  /** Aborts in-flight synthesis, stops all playback, empties the queue and resets counters. */
  const cleanup = useCallback(() => {
    pausedRef.current = false;
    currentProviderRef.current = null;
    currentAgentIdRef.current = null;
    abortControllerRef.current?.abort();
    abortControllerRef.current = null;
    if (audioRef.current) {
      audioRef.current.pause();
      audioRef.current.src = '';
      audioRef.current = null;
    }
    browserCancelRef.current();
    queueRef.current = [];
    isPlayingRef.current = false;
    segmentDoneCounterRef.current = 0;
    onAudioStateChangeRef.current?.(null, 'idle');
  }, []);

  /** Suspends the current playback and stops the queue from advancing. */
  const pause = useCallback(() => {
    if (pausedRef.current) return;
    pausedRef.current = true;
    if (currentProviderRef.current === 'browser-native-tts') {
      browserPauseRef.current();
    } else if (audioRef.current && !audioRef.current.paused) {
      audioRef.current.pause();
    }
  }, []);

  /**
   * Continues after `pause()`: resumes the suspended playback if there is one,
   * and otherwise starts whatever was queued while paused.
   */
  const resume = useCallback(() => {
    if (!pausedRef.current) return;
    pausedRef.current = false;

    const audio = audioRef.current;
    if (currentProviderRef.current === 'browser-native-tts') {
      browserResumeRef.current();
    } else if (audio && audio.paused) {
      audio.play().catch((err: unknown) => {
        // Interrupted by a later pause()/cleanup(); not a playback failure.
        if (isAbortError(err)) return;
        console.error('[DiscussionTTS] Failed to resume audio:', err);
        if (audioRef.current !== audio) return;
        audioRef.current = null;
        finishSegment(currentAgentIdRef.current);
      });
      return;
    }

    // Nothing is mid-playback (e.g. pause() was called while idle): start the
    // segments that were queued in the meantime, whichever provider ran last.
    if (!isPlayingRef.current) {
      processQueueRef.current();
    }
  }, [finishSegment]);

  // Apply playback-speed changes to audio that is already playing.
  useEffect(() => {
    if (audioRef.current) {
      audioRef.current.playbackRate = playbackSpeed;
    }
  }, [playbackSpeed]);

  // Apply volume / mute changes to audio that is already playing.
  useEffect(() => {
    if (audioRef.current) {
      audioRef.current.volume = ttsMuted ? 0 : ttsVolume;
    }
  }, [ttsVolume, ttsMuted]);

  // Stop everything on unmount.
  useEffect(() => cleanup, [cleanup]);

  /**
   * Snapshot for callers that need to wait for speech:
   * `holding` is true while audio is playing or queued; `segmentDone` is the
   * running count of finished segments since the last `cleanup()`.
   */
  const shouldHold = useCallback(() => {
    return {
      holding: isPlayingRef.current || queueRef.current.length > 0,
      segmentDone: segmentDoneCounterRef.current,
    };
  }, []);

  return {
    handleSegmentSealed,
    cleanup,
    pause,
    resume,
    shouldHold,
  };
}
|
|