# boffire's picture
# Update app.py
# 64e2649 verified
import gradio as gr
import torch
import torchaudio
import soundfile as sf
import os
import tempfile
import spaces
from datetime import datetime
from omnivoice import OmniVoice
# ─── Language selection ───
# Dropdown label -> language code passed to the model as ``language``.
LANG_CODE_MAP = {
    "Kabyle (default)": "kab",
    "Standard Moroccan Tamazight": "zgh",
    "Tahaggart Tamahaq": "thv",
    "Algerian Arabic": "arq",
}
# Labels offered in the language dropdowns, in the map's insertion order.
LANGUAGE_CHOICES = list(LANG_CODE_MAP)
# Kabyle sample text; also the initial value of both "Text to speak" boxes.
DEFAULT_TEXT = (
    'Awal n "Uṛdinatur" neqqar-as "Aselkim" s teqbaylit. '
    'Ma yella d "Linux" d Anagraw n Wammud.'
)
# Sample sentence per language label, loaded into the text box when the
# language dropdown changes.
EXAMPLE_SENTENCES = {
    "Kabyle (default)": DEFAULT_TEXT,
    "Standard Moroccan Tamazight": "ⴰⵣⵓⵍ ⵎⴰⵙⵙⴰ ⵎⵎⵉ ⵏⵏⵓⵏ. ⵎⴰⵏⵉⴽ ⵜⵍⵍⵉⴷ? ⴰⴷ ⵏⵏⵓⵖ ⵏⵏⴰⵖ ⴰⵙⵙⴰ.",
    "Tahaggart Tamahaq": "ⵎⴰⵙⵙⴰ ⵏⵏⵓⵏ, ⵎⴰⵏⵉⴽ ⵜⵏⵏⴰⵍⴰⵎ? ⴰⴷⴰⵖ ⵏⴰⵔⴰ ⵙ ⵓⵖⵔⵎ ⵏⵏⵖ.",
    "Algerian Arabic": "شحال شْبَابْ ليوم. ليوما رانا حابين نروحو للبحر. تحب تجي معانا ولٌا لا؟",
}
# ─── Pre‑loaded cloned voices ───
# Voice-source label -> reference audio path. ``None`` means the user supplies
# the reference clip through the upload field instead.
PRELOADED_VOICES = {
    "Upload my own": None,
    "Muhya (pre‑loaded)": "assets/muhya.mp3",
}
# ─── Model ───
print("Loading model...")
# Half precision only when CUDA is available; otherwise run on CPU in float32.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32
model = OmniVoice.from_pretrained(
    "k2-fsa/OmniVoice",
    device_map=device,
    dtype=dtype,
)
print(f"Model loaded ({device})")
# Maximum number of words accepted per generation request.
MAX_WORDS = 50
def _count_words(text):
"""Count words in a string (splits on whitespace)."""
if not text:
return 0
return len(text.strip().split())
def _build_instruct(gender, age, pitch, style):
parts = []
if gender and gender != "Auto":
parts.append(gender.lower())
if age and age != "Auto":
parts.append(age.lower())
if pitch and pitch != "Auto":
parts.append(f"{pitch.lower()} pitch")
if style and style != "Auto":
parts.append(style.lower())
return ", ".join(parts) if parts else None
def _save_audio(audio_tensor, sample_rate=24000):
"""Save audio tensor to a temporary WAV file with robust shape handling."""
try:
if not isinstance(audio_tensor, torch.Tensor):
audio_tensor = torch.tensor(audio_tensor)
audio_tensor = audio_tensor.cpu()
# Normalize shape: ensure [channels, samples] or [samples]
while audio_tensor.dim() > 2:
audio_tensor = audio_tensor.squeeze(0)
if audio_tensor.dim() == 1:
# Mono: [samples] -> [samples, 1] for soundfile
audio_np = audio_tensor.unsqueeze(-1).numpy()
elif audio_tensor.dim() == 2:
# Could be [channels, samples] or [samples, channels]
# OmniVoice typically outputs [1, samples] or [channels, samples]
if audio_tensor.shape[0] <= 4 and audio_tensor.shape[1] > audio_tensor.shape[0]:
# Likely [channels, samples] -> transpose to [samples, channels]
audio_np = audio_tensor.T.numpy()
else:
# Likely [samples, channels] already
audio_np = audio_tensor.numpy()
else:
audio_np = audio_tensor.numpy()
# Ensure 2D for soundfile: [samples, channels]
if audio_np.ndim == 1:
audio_np = audio_np.reshape(-1, 1)
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
sf.write(f.name, audio_np, sample_rate)
return f.name
except Exception as e:
raise RuntimeError(f"Failed to save audio: {e}")
def update_example_text(lang_choice):
    """Return the sample sentence for *lang_choice*.

    Falls back to the default Kabyle text when the label has no entry.
    """
    if lang_choice in EXAMPLE_SENTENCES:
        return EXAMPLE_SENTENCES[lang_choice]
    return DEFAULT_TEXT
# ─── Helpers to force gender and switch to Voice Design mode ───
def _gender_preset(gender):
    """Build the two updates: gender dropdown value, then mode "Voice Design"."""
    gender_update = gr.update(value=gender)
    mode_update = gr.update(value="Voice Design")
    return [gender_update, mode_update]


def set_male():
    """Select the "Male" gender and switch the mode to Voice Design."""
    return _gender_preset("Male")


def set_female():
    """Select the "Female" gender and switch the mode to Voice Design."""
    return _gender_preset("Female")
# ─── Voice Design / Auto ───
@spaces.GPU
def generate_design(text, mode, lang_choice, gender, age, pitch, style,
                    speed, duration, num_step, guidance_scale, denoise, postprocess):
    """Synthesize *text* in Auto or Voice Design mode.

    Returns a ``(path, status)`` pair: the path of the generated WAV file
    (``None`` on failure or rejected input) and a message for the status box.
    The voice attributes are only used when *mode* is "Voice Design". A
    positive *duration* replaces *speed*.
    """
    if not (text and text.strip()):
        return None, "Please enter text."

    n_words = _count_words(text)
    if n_words > MAX_WORDS:
        return None, f"Text too long: {n_words} words (max {MAX_WORDS}). Please shorten your input."

    # Unknown labels fall back to Kabyle.
    language = LANG_CODE_MAP.get(lang_choice, "kab")
    gen_args = {
        "num_step": int(num_step),
        "guidance_scale": guidance_scale,
        "denoise": denoise,
        "language": language,
    }

    if mode == "Voice Design":
        voice_description = _build_instruct(gender, age, pitch, style)
        if voice_description:
            gen_args["instruct"] = voice_description

    # An explicit positive duration takes precedence over the speed slider.
    if duration and duration > 0:
        pacing_key, pacing_value = "duration", duration
    else:
        pacing_key, pacing_value = "speed", speed
    gen_args[pacing_key] = pacing_value

    if postprocess:
        gen_args["postprocess_output"] = True

    try:
        waveform = model.generate(text=text, **gen_args)[0]
        wav_path = _save_audio(waveform, 24000)
        if hasattr(waveform, 'shape'):
            seconds = waveform.shape[-1] / 24000
        else:
            seconds = 0
        return wav_path, f"Generation complete ({seconds:.1f}s)"
    except Exception as exc:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error: {exc}"
# ─── Voice Clone ───
def _ref_audio_to_path(ref_audio):
    """Return ``(path, is_temporary)`` for a reference clip.

    A non-tuple value is taken to be a file path already and is returned
    unchanged. A ``(sample_rate, data)`` tuple is written to a temporary WAV
    file so that the model still receives a path.
    """
    if not isinstance(ref_audio, tuple):
        return ref_audio, False
    sample_rate, samples = ref_audio
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name
    try:
        sf.write(tmp_path, samples, sample_rate)
    except Exception:
        os.unlink(tmp_path)
        raise
    return tmp_path, True


@spaces.GPU
def generate_clone(text, voice_choice, ref_audio, ref_text, lang_choice, speed, duration,
                   num_step, guidance_scale, denoise, postprocess):
    """Synthesize *text* in the voice of a reference clip.

    Args:
        text: Text to speak; rejected when empty or longer than MAX_WORDS words.
        voice_choice: Key of PRELOADED_VOICES. A pre-loaded voice overrides
            *ref_audio*.
        ref_audio: Uploaded reference clip, normally a file path. A
            ``(sample_rate, data)`` tuple is also accepted.
        ref_text: Optional transcription of the reference clip; blank is
            passed to the model as ``None``.
        lang_choice: Language label; unknown labels fall back to "kab".
        speed: Speaking speed, used only when *duration* is not positive.
        duration: Target duration in seconds; a positive value replaces *speed*.
        num_step, guidance_scale, denoise, postprocess: Forwarded to
            ``model.generate``.

    Returns:
        ``(path, status)``: the generated WAV path (``None`` on failure or
        rejected input) and a message for the status box.
    """
    if not text or not text.strip():
        return None, "Please enter text."
    word_count = _count_words(text)
    if word_count > MAX_WORDS:
        return None, f"Text too long: {word_count} words (max {MAX_WORDS}). Please shorten your input."

    # Determine the actual reference audio: a pre-loaded voice wins over an upload.
    preloaded_path = PRELOADED_VOICES.get(voice_choice)
    if preloaded_path:
        ref_audio = preloaded_path
    elif ref_audio is None:
        return None, "Please upload reference audio or select a pre‑loaded voice."

    lang_code = LANG_CODE_MAP.get(lang_choice, "kab")
    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
    kwargs["language"] = lang_code
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed
    if postprocess:
        kwargs["postprocess_output"] = True

    ref_is_temp = False
    try:
        # The first element of a (sample_rate, data) tuple is the sample rate,
        # not a path, so tuple input is materialised as a WAV file instead.
        ref_audio, ref_is_temp = _ref_audio_to_path(ref_audio)
        audio = model.generate(
            text=text,
            ref_audio=ref_audio,
            ref_text=ref_text if ref_text and ref_text.strip() else None,
            **kwargs,
        )
        path = _save_audio(audio[0], 24000)
        duration_sec = audio[0].shape[-1] / 24000 if hasattr(audio[0], 'shape') else 0
        return path, f"Generation complete ({duration_sec:.1f}s)"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error: {e}"
    finally:
        # Remove only the reference file this call created itself.
        if ref_is_temp:
            try:
                os.unlink(ref_audio)
            except OSError:
                pass
def toggle_ref_audio(voice_choice):
    """Make the manual upload field visible only for the "Upload my own" source."""
    wants_upload = voice_choice == "Upload my own"
    return gr.update(visible=wants_upload)
# ─── UI ───
# Stylesheet handed to app.launch(): centres the title and subtitle, hides the
# page footer, and styles the word counter (red and bold once over the limit).
CSS = """
.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
footer { display: none !important; }
.word-counter { text-align: right; font-size: 0.85em; color: #666; margin-top: -0.5em; }
.word-counter.over-limit { color: #d32f2f; font-weight: bold; }
"""
# Page layout: two tabs (Voice Design, Voice Clone). Each has an input column
# on the left and the generated audio plus a status box on the right.
# NOTE(review): the source had lost its indentation; the nesting below (which
# widgets sit inside which Row/Group/Accordion) is reconstructed — confirm
# against the deployed layout.
with gr.Blocks(title="OmniVoice") as app:
    gr.HTML("<h1 class='main-title'>OmniVoice</h1>")
    gr.HTML("<p class='subtitle'>AI Voice Generator — Kabyle + Regional Languages</p>")
    with gr.Tabs():
        # ── Voice Design / Auto ──
        with gr.Tab("Voice Design"):
            with gr.Row():
                with gr.Column(scale=1):
                    d_text = gr.Textbox(
                        label="Text to speak", lines=6,
                        placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
                        value=DEFAULT_TEXT
                    )
                    # Initial counter matches the default text; kept in sync below.
                    d_word_counter = gr.HTML(
                        value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
                    )
                    d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="Mode")
                    d_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
                                         label="Language", info="Select the language of the input text")
                    # Update example text when language changes
                    # (this replaces whatever is currently in the text box).
                    d_lang.change(fn=update_example_text, inputs=d_lang, outputs=d_text)
                    # ── Always visible gender buttons ──
                    with gr.Row():
                        male_btn = gr.Button("Masculine Voice", variant="secondary")
                        female_btn = gr.Button("Feminine Voice", variant="secondary")
                    gr.Markdown("*These buttons switch to Voice Design mode and set the gender. Fine‑tune other attributes below.*")
                    # Voice design attributes (visible only when mode == "Voice Design")
                    with gr.Group(visible=False) as d_voice_opts:
                        with gr.Row():
                            d_gender = gr.Dropdown(["Auto", "Female", "Male"],
                                                   value="Auto", label="Gender")
                            d_age = gr.Dropdown(["Auto", "Child", "Young", "Middle-aged", "Elderly"],
                                                value="Auto", label="Age")
                        with gr.Row():
                            d_pitch = gr.Dropdown(
                                ["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
                                value="Auto", label="Pitch")
                            d_style = gr.Dropdown(["Auto", "Whisper"],
                                                  value="Auto", label="Style")
                    d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")
                    with gr.Accordion("Advanced Settings", open=False):
                        d_duration = gr.Number(value=0, label="Duration (seconds)",
                                               info="0 for auto. If set, Speed is ignored")
                        d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        d_denoise = gr.Checkbox(value=True, label="Denoise")
                        d_postprocess = gr.Checkbox(value=True, label="Postprocess (silence removal)")
                    d_btn = gr.Button("Generate Audio", variant="primary", size="lg")
                with gr.Column(scale=1):
                    d_audio = gr.Audio(label="Generated Audio")
                    d_status = gr.Textbox(label="Status", interactive=False)
            # Live word counter update
            def update_word_counter(text):
                # Render "<count> / <max> words"; the extra "over-limit" class
                # is added once the count exceeds MAX_WORDS.
                count = _count_words(text)
                css_class = "word-counter over-limit" if count > MAX_WORDS else "word-counter"
                return f'<div class="{css_class}">{count} / {MAX_WORDS} words</div>'
            d_text.change(fn=update_word_counter, inputs=d_text, outputs=d_word_counter)
            # Button events: each sets the gender dropdown and the mode radio.
            male_btn.click(fn=set_male, inputs=[], outputs=[d_gender, d_mode])
            female_btn.click(fn=set_female, inputs=[], outputs=[d_gender, d_mode])
            # Show/hide detailed voice options based on mode
            d_mode.change(
                fn=lambda m: gr.update(visible=m == "Voice Design"),
                inputs=d_mode, outputs=d_voice_opts,
            )
            # Input order must match generate_design's parameter order.
            d_btn.click(
                fn=generate_design,
                inputs=[d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
                        d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess],
                outputs=[d_audio, d_status],
            )
        # ── Voice Clone ──
        with gr.Tab("Voice Clone"):
            with gr.Row():
                with gr.Column(scale=1):
                    c_text = gr.Textbox(
                        label="Text to speak", lines=6,
                        placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
                        value=DEFAULT_TEXT
                    )
                    c_word_counter = gr.HTML(
                        value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
                    )
                    # Pre-loaded voice selector
                    c_voice_choice = gr.Dropdown(
                        choices=list(PRELOADED_VOICES.keys()),
                        value="Upload my own",
                        label="Voice Source",
                        info="Choose a pre‑loaded voice or upload your own"
                    )
                    # Manual upload (hidden when a pre-loaded voice is selected)
                    c_ref = gr.Audio(
                        label="Reference Audio (3–15 seconds)",
                        type="filepath",
                        visible=True
                    )
                    c_ref_text = gr.Textbox(label="Transcription (optional)", lines=2,
                                            placeholder="Leave empty for auto-transcription")
                    c_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
                                         label="Language", info="Select the language of the input text")
                    # Update example text when language changes
                    c_lang.change(fn=update_example_text, inputs=c_lang, outputs=c_text)
                    c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")
                    with gr.Accordion("Advanced Settings", open=False):
                        c_duration = gr.Number(value=0, label="Duration (seconds)")
                        c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        c_denoise = gr.Checkbox(value=True, label="Denoise")
                        c_postprocess = gr.Checkbox(value=True, label="Postprocess (silence removal)")
                    c_btn = gr.Button("Generate Audio", variant="primary", size="lg")
                with gr.Column(scale=1):
                    c_audio = gr.Audio(label="Generated Audio")
                    c_status = gr.Textbox(label="Status", interactive=False)
            # Live word counter update (reuses the function defined in the first tab)
            c_text.change(fn=update_word_counter, inputs=c_text, outputs=c_word_counter)
            # Toggle upload field visibility
            c_voice_choice.change(
                fn=toggle_ref_audio,
                inputs=c_voice_choice,
                outputs=c_ref
            )
            # Input order must match generate_clone's parameter order.
            c_btn.click(
                fn=generate_clone,
                inputs=[c_text, c_voice_choice, c_ref, c_ref_text, c_lang, c_speed,
                        c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
                outputs=[c_audio, c_status],
            )
# Start the web app only when run as a script; the stylesheet is supplied here.
if __name__ == "__main__":
    app.launch(css=CSS)