| import torch |
| import numpy as np |
| import gradio as gr |
| from transformers import pipeline |
| import logging |
| from scipy.io.wavfile import write |
| import uuid |
| import os |
| import warnings |
|
|
| |
| |
| |
# Silence noisy FutureWarnings and transformers progress/info logging so the
# demo's console output stays readable during model loading.
warnings.filterwarnings("ignore", category=FutureWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)


# transformers pipeline device convention: 0 = first CUDA GPU, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1


# Directory holding the fine-tuned model weights/config. "./" assumes the
# script is launched from inside the model directory — TODO confirm the
# deployment layout.
model_dir = "./"


# Build the text-to-speech pipeline once at import time so every request
# reuses the already-loaded model.
tts_pipe = pipeline(
    task="text-to-speech",
    model=model_dir,
    device=device
)
|
|
| |
| |
| |
def tts_generate(text):
    """Synthesize speech for *text* and write it to a 16-bit PCM WAV file.

    Args:
        text: Input string; may contain expressive tags such as ``<laugh>``.

    Returns:
        Filesystem path of the generated WAV file, or ``None`` when the
        input is empty or whitespace-only.

    Raises:
        ValueError: If the TTS pipeline response contains no ``"audio"`` key.
    """
    if not text.strip():
        return None

    output = tts_pipe(text)

    if "audio" not in output:
        raise ValueError("TTS pipeline did not return audio")

    audio = np.array(output["audio"], dtype=np.float32)

    # TTS pipelines often return audio with a leading batch axis, e.g. shape
    # (1, n). scipy's WAV writer would read that as n channels of one sample,
    # so drop singleton dimensions first. (Mono (n,) arrays are unaffected.)
    audio = np.squeeze(audio)

    # Scrub NaN/inf the model may emit, then keep samples in the valid
    # [-1, 1] float range before PCM conversion.
    audio = np.nan_to_num(audio)
    audio = np.clip(audio, -1.0, 1.0)

    # Fall back to 22.05 kHz if the pipeline omits the rate. scipy requires a
    # plain int sample rate, and the pipeline may hand back a float/np scalar.
    sr = int(output.get("sampling_rate") or 22050)

    # float32 [-1, 1] -> 16-bit signed PCM.
    audio_int16 = (audio * 32767).astype(np.int16)

    os.makedirs("outputs", exist_ok=True)
    out_path = os.path.join("outputs", f"{uuid.uuid4().hex}.wav")

    write(out_path, sr, audio_int16)

    return out_path
|
|
| |
| |
| |
# Example prompts demonstrating the expressive tags the fine-tuned model
# accepts (<laugh>, <gasp>, <yawn>); surfaced as clickable examples in the UI
# and as the textbox placeholder.
SAMPLES = [
    "Just end up crashing somewhere. <laugh> No, because remember last time?",
    "Hmm… I don't know. <laugh> This feels like a bad idea. <gasp>",
    "I'm so tired today <yawn> but I still have so much work to do.",
]
|
|
| |
| |
| |
# Gradio UI: a multi-line textbox feeding tts_generate, which returns the
# synthesized audio as a file path for the Audio component to play.
text_input = gr.Textbox(
    label="Enter text (use expressive tags like <laugh>, <sigh>)",
    lines=5,
    placeholder=SAMPLES[0],
)
audio_output = gr.Audio(type="filepath", label="Generated Audio")

demo = gr.Interface(
    fn=tts_generate,
    inputs=text_input,
    outputs=audio_output,
    title="Fine-tuned Orpheus-3B Expressive TTS",
    examples=[[s] for s in SAMPLES],
)
|
|
| |
| |
| |
if __name__ == "__main__":
    # ssr_mode=False disables Gradio's server-side rendering path —
    # NOTE(review): presumably needed for the hosting environment; confirm.
    demo.launch(ssr_mode=False)
|
|