UpVoice / src /audio_generation.py
breddaz's picture
fix output folder permissions
8db2fc1
import base64
import os
import shutil
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Tuple

from openai import OpenAI
from pydub import AudioSegment

from script_generation import PodScript
# Default root folder for generated audio. It lives under the system temp
# directory and is created as soon as this module is imported.
output_root = Path(tempfile.gettempdir()).joinpath("upvoice_outputs")
output_root.mkdir(exist_ok=True, parents=True)
@dataclass
class AudioConfig:
    """Settings used by the script-to-audio conversion."""

    # Folder under which every run creates its own uniquely named sub-folder.
    output_path: str = str(output_root)
    # OpenAI API key. The environment is read when the config is instantiated
    # (not when the module is imported), so a key exported after import is
    # still picked up. Excluded from repr() to keep the secret out of logs.
    api_key: str = field(
        default_factory=lambda: os.getenv("OPENAI_API_KEY", ""),
        repr=False,
    )
    # Name of the text-to-speech model passed to the OpenAI speech endpoint.
    model: str = "gpt-4o-mini-tts"
    # When True, the pipeline returns a fixed sample file instead of calling the API.
    mocked: bool = False
class ScriptToAudio:
    """Render a podcast script as a single WAV file.

    Every line of the script is synthesized with the OpenAI speech endpoint,
    alternating between two voices, and the per-line clips are then
    concatenated with pydub.
    """

    def __init__(self, config: "AudioConfig"):
        """Store the configuration and build the OpenAI client.

        Args:
            config (AudioConfig): Output folder, API key, model name and mock flag.
        """
        self._config = config
        self._llm_client = OpenAI(api_key=config.api_key)

    async def _text_to_speech(
        self, script: "PodScript", voice_1: str, voice_2: str
    ) -> Tuple[List[str], str]:
        """Synthesize one WAV file per line of the script.

        Args:
            script (PodScript): The script whose ``conversation`` lines are converted.
            voice_1 (str): The voice used for even-indexed lines (0, 2, ...).
            voice_2 (str): The voice used for odd-indexed lines (1, 3, ...).

        Returns:
            Tuple[List[str], str]: The generated file paths, in script order,
            and the directory that contains them.

        Raises:
            RuntimeError: If the output directory cannot be created or the
                text-to-speech conversion fails.
        """
        output_dir = Path(self._create_output_directory())
        filenames = []
        try:
            for i, line in enumerate(script.conversation):
                filename = output_dir / f"output_{i}.wav"
                # NOTE(review): the voice is picked by line index, which
                # presumes the two speakers strictly alternate - confirm
                # against how PodScript is produced.
                voice = voice_1 if i % 2 == 0 else voice_2
                tmp_path = await self._generate_audio(line.text, voice)
                # shutil.move falls back to copy + delete when the temp file
                # and the output folder are on different filesystems, where
                # a plain os.rename would raise.
                shutil.move(tmp_path, str(filename))
                filenames.append(str(filename))
            return filenames, str(output_dir)
        except Exception as e:
            raise RuntimeError(f"Failed to convert text to speech: {e}") from e

    async def _generate_audio(self, text: str, voice: str) -> str:
        """Generate audio for the given text and store it in a temporary WAV file.

        Args:
            text (str): The input text to be converted to audio.
            voice (str): The voice to use; it is lower-cased before the request.

        Returns:
            str: The path of the temporary WAV file holding the audio. The
            file is not deleted automatically; the caller owns it.

        Raises:
            RuntimeError: If the audio generation fails.
        """
        try:
            # Request the audio before creating the temp file so that a
            # failed API call does not leave an empty file behind.
            # NOTE: this client call is not awaited, so it blocks the event
            # loop for the duration of the request.
            response = self._llm_client.audio.speech.create(
                model=self._config.model,
                voice=voice.lower(),
                input=text,
                response_format="wav",
            )
            # Write through the single open handle; re-opening the path while
            # the NamedTemporaryFile is still open fails on Windows.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                tmp_file.write(response.content)
                return tmp_file.name
        except Exception as e:
            raise RuntimeError(f"Failed to generate audio: {e}") from e

    def _create_output_directory(self) -> str:
        """Create a uniquely named output directory for the generated audio files.

        The directory name is the URL-safe base64 encoding of 8 random bytes
        and is created under the configured ``output_path``.

        Returns:
            str: The path of the created output directory.

        Raises:
            RuntimeError: If the directory creation fails.
        """
        random_bytes = os.urandom(8)
        folder_name = base64.urlsafe_b64encode(random_bytes).decode("utf-8")
        final_folder = Path(self._config.output_path) / folder_name
        try:
            final_folder.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            raise RuntimeError(f"Failed to create output directory: {e}") from e
        return str(final_folder)

    def _combine_audio_files(self, filenames: List[str], output_file: str) -> None:
        """Concatenate several WAV files into one WAV file and delete the inputs.

        Args:
            filenames (List[str]): Paths of the WAV files to combine, in playback order.
            output_file (str): Path of the combined WAV file to write.

        Raises:
            ValueError: If the input filenames list is empty.
            RuntimeError: If the audio file combination fails.
        """
        if not filenames:
            raise ValueError("No input files provided")
        try:
            # The clips are requested as WAV, so they are loaded as WAV.
            combined = sum(
                (AudioSegment.from_wav(filename) for filename in filenames),
                AudioSegment.empty(),
            )
            # export() returns the open output file handle; close it explicitly.
            exported = combined.export(output_file, format="wav")
            exported.close()
            # The per-line clips are only removed once the export succeeded.
            for filename in filenames:
                os.remove(filename)
        except Exception as e:
            raise RuntimeError(f"Failed to combine audio files: {e}") from e

    async def run(self, script: "PodScript", voice_1: str, voice_2: str) -> str:
        """Convert the script to a single combined WAV file.

        Args:
            script (PodScript): The script to convert.
            voice_1 (str): The voice used for even-indexed lines.
            voice_2 (str): The voice used for odd-indexed lines.

        Returns:
            str: The path of the combined WAV file, or the path of a fixed
            sample file when the configuration is mocked.

        Raises:
            RuntimeError: If any step of the conversion fails.
        """
        if self._config.mocked:
            # Mock mode: skip the API and return a fixed sample file path.
            return './src/public/articles/mocked_java9_openai_tts.wav'
        try:
            audio_files, folder_name = await self._text_to_speech(
                script, voice_1, voice_2
            )
            final_output = os.path.join(folder_name, "combined_output.wav")
            self._combine_audio_files(audio_files, final_output)
            return final_output
        except Exception as e:
            raise RuntimeError(f"Failed to convert article to podcast: {e}") from e