"""Gradio front-end for OmniVoice text-to-speech.

Two tabs are exposed: "Voice Design" (automatic or attribute-driven voice)
and "Voice Clone" (speech in the voice of a reference recording).
"""

import logging
import os  # noqa: F401  (retained from the original import list)
import tempfile
from datetime import datetime  # noqa: F401  (retained from the original import list)

import gradio as gr
import soundfile as sf
import spaces
import torch
import torchaudio  # noqa: F401  (retained from the original import list)
from omnivoice import OmniVoice

logger = logging.getLogger(__name__)

# ─── Language selection ───
LANGUAGE_CHOICES = [
    "Kabyle (default)",
    "Standard Moroccan Tamazight",
    "Tahaggart Tamahaq",
    "Algerian Arabic",
]

LANG_CODE_MAP = {
    "Kabyle (default)": "kab",
    "Standard Moroccan Tamazight": "zgh",
    "Tahaggart Tamahaq": "thv",
    "Algerian Arabic": "arq",
}

# Default Kabyle text (kept as original)
DEFAULT_TEXT = """Awal n "Uṛdinatur" neqqar-as "Aselkim" s teqbaylit. Ma yella d "Linux" d Anagraw n Wammud."""

# Example sentences for each language (displayed when selected)
EXAMPLE_SENTENCES = {
    "Kabyle (default)": DEFAULT_TEXT,
    "Standard Moroccan Tamazight": "ⴰⵣⵓⵍ ⵎⴰⵙⵙⴰ ⵎⵎⵉ ⵏⵏⵓⵏ. ⵎⴰⵏⵉⴽ ⵜⵍⵍⵉⴷ? ⴰⴷ ⵏⵏⵓⵖ ⵏⵏⴰⵖ ⴰⵙⵙⴰ.",
    "Tahaggart Tamahaq": "ⵎⴰⵙⵙⴰ ⵏⵏⵓⵏ, ⵎⴰⵏⵉⴽ ⵜⵏⵏⴰⵍⴰⵎ? ⴰⴷⴰⵖ ⵏⴰⵔⴰ ⵙ ⵓⵖⵔⵎ ⵏⵏⵖ.",
    "Algerian Arabic": "شحال شْبَابْ ليوم. ليوما رانا حابين نروحو للبحر. تحب تجي معانا ولٌا لا؟",
}

# ─── Pre‑loaded cloned voices ───
PRELOADED_VOICES = {
    "Upload my own": None,
    "Muhya (pre‑loaded)": "assets/muhya.mp3",
}

# ─── Model ───
print("Loading model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
model = OmniVoice.from_pretrained("k2-fsa/OmniVoice", device_map=device, dtype=dtype)
print(f"Model loaded ({device})")

MAX_WORDS = 50

# Sample rate used when writing generated audio and reporting its length.
SAMPLE_RATE = 24000


def _count_words(text):
    """Count words in a string (splits on whitespace)."""
    if not text:
        return 0
    return len(text.strip().split())


def _word_counter_html(count):
    """Return the word-counter markup, styled as over-limit past MAX_WORDS."""
    css_class = "word-counter over-limit" if count > MAX_WORDS else "word-counter"
    return f'<div class="{css_class}">{count} / {MAX_WORDS} words</div>'


def _build_instruct(gender, age, pitch, style):
    """Join the non-"Auto" voice attributes into a comma-separated instruction.

    Returns None when every attribute is empty or "Auto".
    """
    parts = []
    if gender and gender != "Auto":
        parts.append(gender.lower())
    if age and age != "Auto":
        parts.append(age.lower())
    if pitch and pitch != "Auto":
        parts.append(f"{pitch.lower()} pitch")
    if style and style != "Auto":
        parts.append(style.lower())
    return ", ".join(parts) if parts else None


def _save_audio(audio_tensor, sample_rate=SAMPLE_RATE):
    """Save audio tensor to a temporary WAV file with robust shape handling.

    Args:
        audio_tensor: A torch tensor, or anything ``torch.tensor`` accepts.
            Leading singleton dimensions are dropped until the data is at
            most 2-D.
        sample_rate: Sample rate written to the WAV header.

    Returns:
        Path of the WAV file. It is created with ``delete=False`` and is not
        removed by this function.

    Raises:
        RuntimeError: If the audio cannot be converted or written; the
            original exception is attached as the cause.
    """
    try:
        if not isinstance(audio_tensor, torch.Tensor):
            audio_tensor = torch.tensor(audio_tensor)
        audio_tensor = audio_tensor.detach().cpu()

        # soundfile only writes float32/float64/int16/int32 samples; the model
        # runs in float16 on CUDA, so half-precision output is widened here.
        if audio_tensor.dtype in (torch.float16, torch.bfloat16):
            audio_tensor = audio_tensor.float()

        # Normalize shape: ensure [channels, samples] or [samples].
        while audio_tensor.dim() > 2:
            if audio_tensor.shape[0] != 1:
                # squeeze(0) would be a no-op here and the loop would never end.
                raise ValueError(
                    f"cannot reduce audio of shape {tuple(audio_tensor.shape)} to 2-D"
                )
            audio_tensor = audio_tensor.squeeze(0)

        if audio_tensor.dim() == 2:
            first, second = audio_tensor.shape
            # Heuristic: a small first axis that is shorter than the second is
            # taken as [channels, samples] and transposed for soundfile.
            if first <= 4 and second > first:
                audio_tensor = audio_tensor.T

        audio_np = audio_tensor.numpy()

        # Ensure 2D for soundfile: [samples, channels]
        if audio_np.ndim == 1:
            audio_np = audio_np.reshape(-1, 1)

        # Reserve the name, close the handle, then write: writing to a path
        # that is still open is not portable across platforms.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            out_path = handle.name
        sf.write(out_path, audio_np, sample_rate)
        return out_path
    except Exception as e:
        raise RuntimeError(f"Failed to save audio: {e}") from e


def _validate_text(text):
    """Return a user-facing error message for empty/too-long text, else None."""
    if not text or not text.strip():
        return "Please enter text."
    word_count = _count_words(text)
    if word_count > MAX_WORDS:
        return (
            f"Text too long: {word_count} words (max {MAX_WORDS}). "
            "Please shorten your input."
        )
    return None


def _base_generate_kwargs(lang_choice, speed, duration, num_step,
                          guidance_scale, denoise, postprocess):
    """Build the ``model.generate`` keyword arguments shared by both tabs."""
    kwargs = dict(
        num_step=int(num_step),
        guidance_scale=guidance_scale,
        denoise=denoise,
        language=LANG_CODE_MAP.get(lang_choice, "kab"),
    )
    # A positive duration takes precedence; otherwise speed is used.
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed
    if postprocess:
        kwargs["postprocess_output"] = True
    return kwargs


def _run_generation(text, **kwargs):
    """Run the model, save the first output and return ``(path, status)``.

    Failures are logged with their traceback and reported to the UI as
    ``(None, "Error: ...")`` rather than raised.
    """
    try:
        audio = model.generate(text=text, **kwargs)
        first = audio[0]
        path = _save_audio(first, SAMPLE_RATE)
        duration_sec = first.shape[-1] / SAMPLE_RATE if hasattr(first, "shape") else 0
        return path, f"Generation complete ({duration_sec:.1f}s)"
    except Exception as e:  # UI boundary: surface the message instead of crashing
        logger.exception("Audio generation failed")
        return None, f"Error: {e}"


def _ref_audio_to_path(ref_audio):
    """Return a file path for the reference audio.

    A ``(sample_rate, data)`` tuple is written to a temporary WAV file; any
    other value is returned unchanged.
    """
    if not isinstance(ref_audio, tuple):
        return ref_audio
    # Assumes the (sample_rate, data) layout described for Gradio audio tuples;
    # the first element is the rate, not a path.
    ref_rate, ref_data = ref_audio
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        ref_path = handle.name
    sf.write(ref_path, ref_data, ref_rate)
    return ref_path


def update_example_text(lang_choice):
    """Return the example sentence for the selected language."""
    return EXAMPLE_SENTENCES.get(lang_choice, DEFAULT_TEXT)


def update_word_counter(text):
    """Return refreshed word-counter markup for the given text."""
    return _word_counter_html(_count_words(text))


# ─── Helper to force gender and switch to Voice Design mode ───
def set_male():
    """Select the "Male" gender and switch the mode to "Voice Design"."""
    return [gr.update(value="Male"), gr.update(value="Voice Design")]


def set_female():
    """Select the "Female" gender and switch the mode to "Voice Design"."""
    return [gr.update(value="Female"), gr.update(value="Voice Design")]


# ─── Voice Design / Auto ───
@spaces.GPU
def generate_design(text, mode, lang_choice, gender, age, pitch, style,
                    speed, duration, num_step, guidance_scale, denoise, postprocess):
    """Generate speech in "Auto" or "Voice Design" mode.

    Returns:
        ``(audio_path, status_message)``; ``audio_path`` is None on
        validation failure or generation error.
    """
    problem = _validate_text(text)
    if problem:
        return None, problem

    kwargs = _base_generate_kwargs(
        lang_choice, speed, duration, num_step, guidance_scale, denoise, postprocess
    )
    # Voice attributes are only forwarded in Voice Design mode.
    if mode == "Voice Design":
        instruct = _build_instruct(gender, age, pitch, style)
        if instruct:
            kwargs["instruct"] = instruct

    return _run_generation(text, **kwargs)


# ─── Voice Clone ───
@spaces.GPU
def generate_clone(text, voice_choice, ref_audio, ref_text, lang_choice,
                   speed, duration, num_step, guidance_scale, denoise, postprocess):
    """Generate speech using a pre-loaded or uploaded reference voice.

    Returns:
        ``(audio_path, status_message)``; ``audio_path`` is None on
        validation failure, missing reference audio or generation error.
    """
    problem = _validate_text(text)
    if problem:
        return None, problem

    # Determine the actual reference audio path
    preloaded_path = PRELOADED_VOICES.get(voice_choice)
    if preloaded_path:
        ref_audio = preloaded_path
    elif ref_audio is None:
        return None, "Please upload reference audio or select a pre‑loaded voice."

    # Ensure ref_audio is a valid file path
    try:
        ref_audio = _ref_audio_to_path(ref_audio)
    except Exception as e:
        logger.exception("Could not store reference audio")
        return None, f"Error: {e}"

    kwargs = _base_generate_kwargs(
        lang_choice, speed, duration, num_step, guidance_scale, denoise, postprocess
    )
    return _run_generation(
        text,
        ref_audio=ref_audio,
        ref_text=ref_text if ref_text and ref_text.strip() else None,
        **kwargs,
    )


def toggle_ref_audio(voice_choice):
    """Show/hide the manual upload field based on voice selection."""
    return gr.update(visible=(voice_choice == "Upload my own"))


# ─── UI ───
CSS = """
.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
footer { display: none !important; }
.word-counter { text-align: right; font-size: 0.85em; color: #666; margin-top: -0.5em; }
.word-counter.over-limit { color: #d32f2f; font-weight: bold; }
"""

# NOTE(review): the header/counter markup and the layout nesting below were
# rebuilt to match the CSS classes and section comments — confirm against the
# intended design (in particular that d_speed sits outside d_voice_opts).
with gr.Blocks(title="OmniVoice") as app:
    gr.HTML("<div class='main-title'>OmniVoice</div>")
    gr.HTML("<div class='subtitle'>AI Voice Generator — Kabyle + Regional Languages</div>")

    with gr.Tabs():
        # ── Voice Design / Auto ──
        with gr.Tab("Voice Design"):
            with gr.Row():
                with gr.Column(scale=1):
                    d_text = gr.Textbox(
                        label="Text to speak",
                        lines=6,
                        placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
                        value=DEFAULT_TEXT,
                    )
                    d_word_counter = gr.HTML(
                        value=_word_counter_html(_count_words(DEFAULT_TEXT))
                    )
                    d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="Mode")
                    d_lang = gr.Dropdown(
                        choices=LANGUAGE_CHOICES,
                        value="Kabyle (default)",
                        label="Language",
                        info="Select the language of the input text",
                    )
                    # Update example text when language changes
                    d_lang.change(fn=update_example_text, inputs=d_lang, outputs=d_text)

                    # ── Always visible gender buttons ──
                    with gr.Row():
                        male_btn = gr.Button("Masculine Voice", variant="secondary")
                        female_btn = gr.Button("Feminine Voice", variant="secondary")
                    gr.Markdown(
                        "*These buttons switch to Voice Design mode and set the gender. "
                        "Fine‑tune other attributes below.*"
                    )

                    # Voice design attributes (visible only when mode == "Voice Design")
                    with gr.Group(visible=False) as d_voice_opts:
                        with gr.Row():
                            d_gender = gr.Dropdown(
                                ["Auto", "Female", "Male"], value="Auto", label="Gender"
                            )
                            d_age = gr.Dropdown(
                                ["Auto", "Child", "Young", "Middle-aged", "Elderly"],
                                value="Auto",
                                label="Age",
                            )
                        with gr.Row():
                            d_pitch = gr.Dropdown(
                                ["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
                                value="Auto",
                                label="Pitch",
                            )
                            d_style = gr.Dropdown(
                                ["Auto", "Whisper"], value="Auto", label="Style"
                            )

                    d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")

                    with gr.Accordion("Advanced Settings", open=False):
                        d_duration = gr.Number(
                            value=0,
                            label="Duration (seconds)",
                            info="0 for auto. If set, Speed is ignored",
                        )
                        d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        d_denoise = gr.Checkbox(value=True, label="Denoise")
                        d_postprocess = gr.Checkbox(
                            value=True, label="Postprocess (silence removal)"
                        )

                    d_btn = gr.Button("Generate Audio", variant="primary", size="lg")

                with gr.Column(scale=1):
                    d_audio = gr.Audio(label="Generated Audio")
                    d_status = gr.Textbox(label="Status", interactive=False)

            # Live word counter update
            d_text.change(fn=update_word_counter, inputs=d_text, outputs=d_word_counter)

            # Button events
            male_btn.click(fn=set_male, inputs=[], outputs=[d_gender, d_mode])
            female_btn.click(fn=set_female, inputs=[], outputs=[d_gender, d_mode])

            # Show/hide detailed voice options based on mode
            d_mode.change(
                fn=lambda m: gr.update(visible=m == "Voice Design"),
                inputs=d_mode,
                outputs=d_voice_opts,
            )

            d_btn.click(
                fn=generate_design,
                inputs=[d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
                        d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess],
                outputs=[d_audio, d_status],
            )

        # ── Voice Clone ──
        with gr.Tab("Voice Clone"):
            with gr.Row():
                with gr.Column(scale=1):
                    c_text = gr.Textbox(
                        label="Text to speak",
                        lines=6,
                        placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
                        value=DEFAULT_TEXT,
                    )
                    c_word_counter = gr.HTML(
                        value=_word_counter_html(_count_words(DEFAULT_TEXT))
                    )

                    # Pre-loaded voice selector
                    c_voice_choice = gr.Dropdown(
                        choices=list(PRELOADED_VOICES.keys()),
                        value="Upload my own",
                        label="Voice Source",
                        info="Choose a pre‑loaded voice or upload your own",
                    )

                    # Manual upload (hidden when a pre-loaded voice is selected)
                    c_ref = gr.Audio(
                        label="Reference Audio (3–15 seconds)",
                        type="filepath",
                        visible=True,
                    )
                    c_ref_text = gr.Textbox(
                        label="Transcription (optional)",
                        lines=2,
                        placeholder="Leave empty for auto-transcription",
                    )

                    c_lang = gr.Dropdown(
                        choices=LANGUAGE_CHOICES,
                        value="Kabyle (default)",
                        label="Language",
                        info="Select the language of the input text",
                    )
                    # Update example text when language changes
                    c_lang.change(fn=update_example_text, inputs=c_lang, outputs=c_text)

                    c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")

                    with gr.Accordion("Advanced Settings", open=False):
                        c_duration = gr.Number(value=0, label="Duration (seconds)")
                        c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        c_denoise = gr.Checkbox(value=True, label="Denoise")
                        c_postprocess = gr.Checkbox(
                            value=True, label="Postprocess (silence removal)"
                        )

                    c_btn = gr.Button("Generate Audio", variant="primary", size="lg")

                with gr.Column(scale=1):
                    c_audio = gr.Audio(label="Generated Audio")
                    c_status = gr.Textbox(label="Status", interactive=False)

            # Live word counter update
            c_text.change(fn=update_word_counter, inputs=c_text, outputs=c_word_counter)

            # Toggle upload field visibility
            c_voice_choice.change(
                fn=toggle_ref_audio,
                inputs=c_voice_choice,
                outputs=c_ref,
            )

            c_btn.click(
                fn=generate_clone,
                inputs=[c_text, c_voice_choice, c_ref, c_ref_text, c_lang,
                        c_speed, c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
                outputs=[c_audio, c_status],
            )

if __name__ == "__main__":
    app.launch(css=CSS)