Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import soundfile as sf | |
| import os | |
| import tempfile | |
| import spaces | |
| from datetime import datetime | |
| from omnivoice import OmniVoice | |
# ─── Language selection ───
# Dropdown label -> language code passed to the model as ``language``.
LANG_CODE_MAP = {
    "Kabyle (default)": "kab",
    "Standard Moroccan Tamazight": "zgh",
    "Tahaggart Tamahaq": "thv",
    "Algerian Arabic": "arq",
}

# Labels offered in the language dropdowns, in mapping order.
LANGUAGE_CHOICES = list(LANG_CODE_MAP)

# Default Kabyle text shown in the text boxes on start-up.
DEFAULT_TEXT = (
    'Awal n "Uṛdinatur" neqqar-as "Aselkim" s teqbaylit. '
    'Ma yella d "Linux" d Anagraw n Wammud.'
)

# Sample sentence per language; it replaces the text box content when the
# language dropdown changes.
EXAMPLE_SENTENCES = {
    "Kabyle (default)": DEFAULT_TEXT,
    "Standard Moroccan Tamazight": (
        "ⴰⵣⵓⵍ ⵎⴰⵙⵙⴰ ⵎⵎⵉ ⵏⵏⵓⵏ. ⵎⴰⵏⵉⴽ ⵜⵍⵍⵉⴷ? ⴰⴷ ⵏⵏⵓⵖ ⵏⵏⴰⵖ ⴰⵙⵙⴰ."
    ),
    "Tahaggart Tamahaq": (
        "ⵎⴰⵙⵙⴰ ⵏⵏⵓⵏ, ⵎⴰⵏⵉⴽ ⵜⵏⵏⴰⵍⴰⵎ? ⴰⴷⴰⵖ ⵏⴰⵔⴰ ⵙ ⵓⵖⵔⵎ ⵏⵏⵖ."
    ),
    "Algerian Arabic": (
        "شحال شْبَابْ ليوم. ليوما رانا حابين نروحو للبحر. تحب تجي معانا ولٌا لا؟"
    ),
}

# ─── Pre‑loaded cloned voices ───
# Label -> reference audio path; ``None`` means the user uploads a recording.
PRELOADED_VOICES = {
    "Upload my own": None,
    "Muhya (pre‑loaded)": "assets/muhya.mp3",
}
# ─── Model ───
print("Loading model...")
# Half precision is only selected when a CUDA device is available.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32
model = OmniVoice.from_pretrained(
    "k2-fsa/OmniVoice",
    device_map=device,
    dtype=dtype,
)
print(f"Model loaded ({device})")

# Maximum number of words the generate_* handlers accept per request.
MAX_WORDS = 50
| def _count_words(text): | |
| """Count words in a string (splits on whitespace).""" | |
| if not text: | |
| return 0 | |
| return len(text.strip().split()) | |
| def _build_instruct(gender, age, pitch, style): | |
| parts = [] | |
| if gender and gender != "Auto": | |
| parts.append(gender.lower()) | |
| if age and age != "Auto": | |
| parts.append(age.lower()) | |
| if pitch and pitch != "Auto": | |
| parts.append(f"{pitch.lower()} pitch") | |
| if style and style != "Auto": | |
| parts.append(style.lower()) | |
| return ", ".join(parts) if parts else None | |
| def _save_audio(audio_tensor, sample_rate=24000): | |
| """Save audio tensor to a temporary WAV file with robust shape handling.""" | |
| try: | |
| if not isinstance(audio_tensor, torch.Tensor): | |
| audio_tensor = torch.tensor(audio_tensor) | |
| audio_tensor = audio_tensor.cpu() | |
| # Normalize shape: ensure [channels, samples] or [samples] | |
| while audio_tensor.dim() > 2: | |
| audio_tensor = audio_tensor.squeeze(0) | |
| if audio_tensor.dim() == 1: | |
| # Mono: [samples] -> [samples, 1] for soundfile | |
| audio_np = audio_tensor.unsqueeze(-1).numpy() | |
| elif audio_tensor.dim() == 2: | |
| # Could be [channels, samples] or [samples, channels] | |
| # OmniVoice typically outputs [1, samples] or [channels, samples] | |
| if audio_tensor.shape[0] <= 4 and audio_tensor.shape[1] > audio_tensor.shape[0]: | |
| # Likely [channels, samples] -> transpose to [samples, channels] | |
| audio_np = audio_tensor.T.numpy() | |
| else: | |
| # Likely [samples, channels] already | |
| audio_np = audio_tensor.numpy() | |
| else: | |
| audio_np = audio_tensor.numpy() | |
| # Ensure 2D for soundfile: [samples, channels] | |
| if audio_np.ndim == 1: | |
| audio_np = audio_np.reshape(-1, 1) | |
| with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: | |
| sf.write(f.name, audio_np, sample_rate) | |
| return f.name | |
| except Exception as e: | |
| raise RuntimeError(f"Failed to save audio: {e}") | |
def update_example_text(lang_choice):
    """Return the sample sentence for ``lang_choice``.

    Falls back to ``DEFAULT_TEXT`` when the label has no entry in
    ``EXAMPLE_SENTENCES``.
    """
    try:
        return EXAMPLE_SENTENCES[lang_choice]
    except KeyError:
        return DEFAULT_TEXT
| # ─── Helper to force gender and switch to Voice Design mode ─── | |
def set_male():
    """Return updates that select "Male" and switch the mode to "Voice Design".

    The two updates are returned in the order [gender, mode].
    """
    gender_update = gr.update(value="Male")
    mode_update = gr.update(value="Voice Design")
    return [gender_update, mode_update]
def set_female():
    """Return updates that select "Female" and switch the mode to "Voice Design".

    The two updates are returned in the order [gender, mode].
    """
    gender_update = gr.update(value="Female")
    mode_update = gr.update(value="Voice Design")
    return [gender_update, mode_update]
| # ─── Voice Design / Auto ─── | |
def generate_design(text, mode, lang_choice, gender, age, pitch, style,
                    speed, duration, num_step, guidance_scale, denoise, postprocess):
    """Generate speech for ``text`` in "Auto" or "Voice Design" mode.

    Args:
        text: Text to speak; rejected when blank or longer than ``MAX_WORDS``.
        mode: ``"Voice Design"`` enables the gender/age/pitch/style
            description; any other value ignores those four arguments.
        lang_choice: Language label; unknown labels fall back to ``"kab"``.
        gender, age, pitch, style: Voice attributes fed to ``_build_instruct``.
        speed: Speaking speed, used only when no positive ``duration`` is set.
        duration: Target duration; a positive value replaces ``speed``.
        num_step: Inference steps (converted to ``int``).
        guidance_scale: Guidance scale forwarded to the model.
        denoise: Forwarded to the model as ``denoise``.
        postprocess: When truthy, ``postprocess_output=True`` is passed.

    Returns:
        ``(wav_path, status_message)``; ``wav_path`` is ``None`` on
        validation failure or when generation raises.
    """
    if not (text and text.strip()):
        return None, "Please enter text."

    n_words = _count_words(text)
    if n_words > MAX_WORDS:
        return None, f"Text too long: {n_words} words (max {MAX_WORDS}). Please shorten your input."

    language = LANG_CODE_MAP.get(lang_choice, "kab")
    options = {
        "num_step": int(num_step),
        "guidance_scale": guidance_scale,
        "denoise": denoise,
        "language": language,
    }

    if mode == "Voice Design":
        description = _build_instruct(gender, age, pitch, style)
        if description:
            options["instruct"] = description

    # A positive duration takes precedence over the speed slider.
    if duration and duration > 0:
        options["duration"] = duration
    else:
        options["speed"] = speed

    if postprocess:
        options["postprocess_output"] = True

    try:
        waveform = model.generate(text=text, **options)[0]
        wav_path = _save_audio(waveform, 24000)
        if hasattr(waveform, "shape"):
            seconds = waveform.shape[-1] / 24000
        else:
            seconds = 0
        return wav_path, f"Generation complete ({seconds:.1f}s)"
    except Exception as e:
        # UI boundary: report the failure in the status box.
        return None, f"Error: {e}"
| # ─── Voice Clone ─── | |
def _remove_quietly(path):
    """Delete ``path`` if it is set, ignoring filesystem errors."""
    if path is None:
        return
    try:
        os.remove(path)
    except OSError:
        pass


def generate_clone(text, voice_choice, ref_audio, ref_text, lang_choice, speed, duration,
                   num_step, guidance_scale, denoise, postprocess):
    """Generate speech for ``text`` in the voice of a reference recording.

    Args:
        text: Text to speak; rejected when blank or longer than ``MAX_WORDS``.
        voice_choice: Key of ``PRELOADED_VOICES``. A key with a path uses that
            file and ignores ``ref_audio``.
        ref_audio: Uploaded reference audio: a file path, or a
            ``(sample_rate, data)`` tuple.
        ref_text: Optional transcription of the reference; blank becomes ``None``.
        lang_choice: Language label; unknown labels fall back to ``"kab"``.
        speed: Speaking speed, used only when no positive ``duration`` is set.
        duration: Target duration; a positive value replaces ``speed``.
        num_step: Inference steps (converted to ``int``).
        guidance_scale: Guidance scale forwarded to the model.
        denoise: Forwarded to the model as ``denoise``.
        postprocess: When truthy, ``postprocess_output=True`` is passed.

    Returns:
        ``(wav_path, status_message)``; ``wav_path`` is ``None`` on
        validation failure or when generation raises.
    """
    if not text or not text.strip():
        return None, "Please enter text."
    word_count = _count_words(text)
    if word_count > MAX_WORDS:
        return None, f"Text too long: {word_count} words (max {MAX_WORDS}). Please shorten your input."

    # Determine the actual reference audio: a pre-loaded voice wins over an upload.
    preloaded_path = PRELOADED_VOICES.get(voice_choice)
    if preloaded_path:
        ref_audio = preloaded_path
    elif ref_audio is None:
        return None, "Please upload reference audio or select a pre‑loaded voice."

    # A (sample_rate, data) tuple is not a path, and its first element is the
    # sample rate, so write the samples to a temporary WAV and use that file.
    temp_ref_path = None
    if isinstance(ref_audio, tuple):
        try:
            ref_sample_rate, ref_samples = ref_audio
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
                temp_ref_path = handle.name
            sf.write(temp_ref_path, ref_samples, int(ref_sample_rate))
        except Exception as e:
            _remove_quietly(temp_ref_path)
            return None, f"Error: could not read reference audio: {e}"
        ref_audio = temp_ref_path

    try:
        lang_code = LANG_CODE_MAP.get(lang_choice, "kab")
        kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
        kwargs["language"] = lang_code
        # A positive duration takes precedence over the speed slider.
        if duration and duration > 0:
            kwargs["duration"] = duration
        else:
            kwargs["speed"] = speed
        if postprocess:
            kwargs["postprocess_output"] = True

        try:
            audio = model.generate(
                text=text,
                ref_audio=ref_audio,
                ref_text=ref_text if ref_text and ref_text.strip() else None,
                **kwargs,
            )
            path = _save_audio(audio[0], 24000)
            duration_sec = audio[0].shape[-1] / 24000 if hasattr(audio[0], 'shape') else 0
            return path, f"Generation complete ({duration_sec:.1f}s)"
        except Exception as e:
            # UI boundary: report the failure in the status box.
            return None, f"Error: {e}"
    finally:
        # Only the temporary copy created above is removed, never a user file.
        _remove_quietly(temp_ref_path)
def toggle_ref_audio(voice_choice):
    """Return an update that shows the upload field only for "Upload my own"."""
    wants_upload = voice_choice == "Upload my own"
    return gr.update(visible=wants_upload)
| # ─── UI ─── | |
# Stylesheet passed to ``app.launch``; one rule per line.
CSS = (
    "\n"
    ".main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }\n"
    ".subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }\n"
    "footer { display: none !important; }\n"
    ".word-counter { text-align: right; font-size: 0.85em; color: #666; margin-top: -0.5em; }\n"
    ".word-counter.over-limit { color: #d32f2f; font-weight: bold; }\n"
)
with gr.Blocks(title="OmniVoice") as app:
    # NOTE(review): this file reached review with its indentation stripped.
    # The container nesting below is reconstructed from the section comments;
    # confirm it against the deployed layout.

    def update_word_counter(text):
        """Render the "<count> / <max> words" badge, flagged when over the limit."""
        count = _count_words(text)
        css_class = "word-counter"
        if count > MAX_WORDS:
            css_class += " over-limit"
        return f'<div class="{css_class}">{count} / {MAX_WORDS} words</div>'

    # Strings shared by both tabs.
    text_placeholder = f"Enter text in the selected language... (max {MAX_WORDS} words)"
    initial_counter_html = (
        f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
    )

    gr.HTML("<h1 class='main-title'>OmniVoice</h1>")
    gr.HTML("<p class='subtitle'>AI Voice Generator — Kabyle + Regional Languages</p>")

    with gr.Tabs():
        # ── Voice Design / Auto ──
        with gr.Tab("Voice Design"):
            with gr.Row():
                with gr.Column(scale=1):
                    d_text = gr.Textbox(
                        label="Text to speak",
                        lines=6,
                        placeholder=text_placeholder,
                        value=DEFAULT_TEXT,
                    )
                    d_word_counter = gr.HTML(value=initial_counter_html)
                    d_mode = gr.Radio(
                        ["Auto", "Voice Design"],
                        value="Auto",
                        label="Mode",
                    )
                    d_lang = gr.Dropdown(
                        choices=LANGUAGE_CHOICES,
                        value="Kabyle (default)",
                        label="Language",
                        info="Select the language of the input text",
                    )
                    # Changing the language replaces the text with that
                    # language's example sentence.
                    d_lang.change(fn=update_example_text, inputs=d_lang, outputs=d_text)

                    # ── Gender shortcut buttons (always visible) ──
                    with gr.Row():
                        male_btn = gr.Button("Masculine Voice", variant="secondary")
                        female_btn = gr.Button("Feminine Voice", variant="secondary")
                    gr.Markdown("*These buttons switch to Voice Design mode and set the gender. Fine‑tune other attributes below.*")

                    # Voice attributes, shown only while mode == "Voice Design".
                    with gr.Group(visible=False) as d_voice_opts:
                        with gr.Row():
                            d_gender = gr.Dropdown(
                                ["Auto", "Female", "Male"],
                                value="Auto",
                                label="Gender",
                            )
                            d_age = gr.Dropdown(
                                ["Auto", "Child", "Young", "Middle-aged", "Elderly"],
                                value="Auto",
                                label="Age",
                            )
                        with gr.Row():
                            d_pitch = gr.Dropdown(
                                ["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
                                value="Auto",
                                label="Pitch",
                            )
                            d_style = gr.Dropdown(
                                ["Auto", "Whisper"],
                                value="Auto",
                                label="Style",
                            )

                    d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")
                    with gr.Accordion("Advanced Settings", open=False):
                        d_duration = gr.Number(
                            value=0,
                            label="Duration (seconds)",
                            info="0 for auto. If set, Speed is ignored",
                        )
                        d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        d_denoise = gr.Checkbox(value=True, label="Denoise")
                        d_postprocess = gr.Checkbox(value=True, label="Postprocess (silence removal)")
                    d_btn = gr.Button("Generate Audio", variant="primary", size="lg")

                with gr.Column(scale=1):
                    d_audio = gr.Audio(label="Generated Audio")
                    d_status = gr.Textbox(label="Status", interactive=False)

            # Live word counter.
            d_text.change(fn=update_word_counter, inputs=d_text, outputs=d_word_counter)

            # The gender buttons set the gender and switch the mode.
            male_btn.click(fn=set_male, inputs=[], outputs=[d_gender, d_mode])
            female_btn.click(fn=set_female, inputs=[], outputs=[d_gender, d_mode])

            # Show the detailed voice options only in "Voice Design" mode.
            d_mode.change(
                fn=lambda selected: gr.update(visible=(selected == "Voice Design")),
                inputs=d_mode,
                outputs=d_voice_opts,
            )

            d_btn.click(
                fn=generate_design,
                inputs=[
                    d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
                    d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess,
                ],
                outputs=[d_audio, d_status],
            )

        # ── Voice Clone ──
        with gr.Tab("Voice Clone"):
            with gr.Row():
                with gr.Column(scale=1):
                    c_text = gr.Textbox(
                        label="Text to speak",
                        lines=6,
                        placeholder=text_placeholder,
                        value=DEFAULT_TEXT,
                    )
                    c_word_counter = gr.HTML(value=initial_counter_html)

                    # Pre-loaded voice selector.
                    c_voice_choice = gr.Dropdown(
                        choices=list(PRELOADED_VOICES),
                        value="Upload my own",
                        label="Voice Source",
                        info="Choose a pre‑loaded voice or upload your own",
                    )
                    # Manual upload; hidden while a pre-loaded voice is selected.
                    c_ref = gr.Audio(
                        label="Reference Audio (3–15 seconds)",
                        type="filepath",
                        visible=True,
                    )
                    c_ref_text = gr.Textbox(
                        label="Transcription (optional)",
                        lines=2,
                        placeholder="Leave empty for auto-transcription",
                    )
                    c_lang = gr.Dropdown(
                        choices=LANGUAGE_CHOICES,
                        value="Kabyle (default)",
                        label="Language",
                        info="Select the language of the input text",
                    )
                    # Changing the language replaces the text with that
                    # language's example sentence.
                    c_lang.change(fn=update_example_text, inputs=c_lang, outputs=c_text)

                    c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")
                    with gr.Accordion("Advanced Settings", open=False):
                        c_duration = gr.Number(value=0, label="Duration (seconds)")
                        c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        c_denoise = gr.Checkbox(value=True, label="Denoise")
                        c_postprocess = gr.Checkbox(value=True, label="Postprocess (silence removal)")
                    c_btn = gr.Button("Generate Audio", variant="primary", size="lg")

                with gr.Column(scale=1):
                    c_audio = gr.Audio(label="Generated Audio")
                    c_status = gr.Textbox(label="Status", interactive=False)

            # Live word counter.
            c_text.change(fn=update_word_counter, inputs=c_text, outputs=c_word_counter)

            # Toggle the upload field with the voice source.
            c_voice_choice.change(
                fn=toggle_ref_audio,
                inputs=c_voice_choice,
                outputs=c_ref,
            )

            c_btn.click(
                fn=generate_clone,
                inputs=[
                    c_text, c_voice_choice, c_ref, c_ref_text, c_lang, c_speed,
                    c_duration, c_steps, c_cfg, c_denoise, c_postprocess,
                ],
                outputs=[c_audio, c_status],
            )
def main():
    """Launch the Gradio app with the custom stylesheet."""
    app.launch(css=CSS)


if __name__ == "__main__":
    main()