Spaces:

boffire
/

OmniVoice-kabyle

Running

App Files Files Community

OmniVoice-kabyle / app.py

boffire

Update app.py

64e2649 verified 20 days ago

raw

history blame contribute delete

15.9 kB

	import gradio as gr
	import torch
	import torchaudio
	import soundfile as sf
	import os
	import tempfile
	import spaces
	from datetime import datetime
	from omnivoice import OmniVoice

	# ─── Language selection ───
	# Display label -> language code passed to the model as ``language``.
	# Insertion order doubles as the order of the dropdown entries.
	LANG_CODE_MAP = {
	    "Kabyle (default)": "kab",
	    "Standard Moroccan Tamazight": "zgh",
	    "Tahaggart Tamahaq": "thv",
	    "Algerian Arabic": "arq",
	}

	# Dropdown labels, derived from the map so the two can never drift apart.
	LANGUAGE_CHOICES = list(LANG_CODE_MAP)

	# Default Kabyle text (kept as original)
	DEFAULT_TEXT = 'Awal n "Uṛdinatur" neqqar-as "Aselkim" s teqbaylit. Ma yella d "Linux" d Anagraw n Wammud.'

	# Example sentence shown in the text box when a language is selected;
	# listed in the same order as LANGUAGE_CHOICES.
	EXAMPLE_SENTENCES = dict(
	    zip(
	        LANGUAGE_CHOICES,
	        (
	            DEFAULT_TEXT,
	            "ⴰⵣⵓⵍ ⵎⴰⵙⵙⴰ ⵎⵎⵉ ⵏⵏⵓⵏ. ⵎⴰⵏⵉⴽ ⵜⵍⵍⵉⴷ? ⴰⴷ ⵏⵏⵓⵖ ⵏⵏⴰⵖ ⴰⵙⵙⴰ.",
	            "ⵎⴰⵙⵙⴰ ⵏⵏⵓⵏ, ⵎⴰⵏⵉⴽ ⵜⵏⵏⴰⵍⴰⵎ? ⴰⴷⴰⵖ ⵏⴰⵔⴰ ⵙ ⵓⵖⵔⵎ ⵏⵏⵖ.",
	            "شحال شْبَابْ ليوم. ليوما رانا حابين نروحو للبحر. تحب تجي معانا ولٌا لا؟",
	        ),
	    )
	)

	# ─── Pre‑loaded cloned voices ───
	# Label -> bundled reference recording; ``None`` means the user uploads one.
	PRELOADED_VOICES = {
	    "Upload my own": None,
	    "Muhya (pre‑loaded)": "assets/muhya.mp3",
	}

	# ─── Model ───
	# Loaded once at import time: half precision on CUDA, full precision on CPU.
	print("Loading model...")
	if torch.cuda.is_available():
	    device, dtype = "cuda", torch.float16
	else:
	    device, dtype = "cpu", torch.float32
	model = OmniVoice.from_pretrained(
	    "k2-fsa/OmniVoice",
	    device_map=device,
	    dtype=dtype,
	)
	print(f"Model loaded ({device})")

	# Upper bound on whitespace-separated words accepted per generation request.
	MAX_WORDS = 50

	def _count_words(text):
	    """Return how many whitespace-separated words *text* holds (0 if empty or None)."""
	    # str.split() with no separator already ignores leading/trailing whitespace.
	    return len(text.split()) if text else 0

	def _build_instruct(gender, age, pitch, style):
	    """Join the chosen voice attributes into a comma-separated instruction.

	    Attributes that are empty or set to "Auto" are skipped; every kept value is
	    lower-cased and pitch gets a " pitch" suffix. Returns ``None`` when nothing
	    was chosen.
	    """
	    templates = (
	        (gender, "{}"),
	        (age, "{}"),
	        (pitch, "{} pitch"),
	        (style, "{}"),
	    )
	    chosen = [
	        template.format(value.lower())
	        for value, template in templates
	        if value and value != "Auto"
	    ]
	    if not chosen:
	        return None
	    return ", ".join(chosen)

	def _save_audio(audio_tensor, sample_rate=24000):
	    """Write audio samples to a temporary WAV file and return its path.

	    Args:
	        audio_tensor: A ``torch.Tensor`` or anything ``torch.tensor`` accepts.
	            Supported layouts are ``[samples]``, ``[channels, samples]`` and
	            ``[samples, channels]``, optionally preceded by size-1 dimensions.
	            A scalar is treated as a single sample.
	        sample_rate: Sample rate in Hz passed to ``soundfile.write``.

	    Returns:
	        Path of the ``.wav`` file. It is created with ``delete=False``, so the
	        caller owns it.

	    Raises:
	        RuntimeError: If the samples cannot be normalised or written. The
	            underlying exception is chained as ``__cause__`` and any partially
	            created temp file is removed.
	    """
	    wav_path = None
	    try:
	        if not isinstance(audio_tensor, torch.Tensor):
	            audio_tensor = torch.tensor(audio_tensor)
	        # detach(): a tensor that still tracks gradients cannot be turned into NumPy.
	        audio_tensor = audio_tensor.detach().cpu()

	        # soundfile accepts only float64/float32/int32/int16, so widen
	        # half-precision samples (the model is loaded in float16 on CUDA).
	        if audio_tensor.dtype in (torch.float16, torch.bfloat16):
	            audio_tensor = audio_tensor.float()

	        # Peel off leading singleton dims. The size check is required:
	        # squeeze(0) is a no-op on a non-singleton dim, so looping on dim()
	        # alone would never terminate for e.g. a [2, 1, N] tensor.
	        while audio_tensor.dim() > 2 and audio_tensor.shape[0] == 1:
	            audio_tensor = audio_tensor.squeeze(0)
	        if audio_tensor.dim() > 2:
	            raise ValueError(f"unsupported audio shape {tuple(audio_tensor.shape)}")
	        if audio_tensor.dim() == 0:
	            audio_tensor = audio_tensor.reshape(1)

	        # soundfile expects [samples, channels].
	        if audio_tensor.dim() == 1:
	            audio_np = audio_tensor.unsqueeze(-1).numpy()
	        else:
	            rows, cols = audio_tensor.shape
	            if rows <= 4 and cols > rows:
	                # Few rows, many columns: treat as [channels, samples].
	                audio_np = audio_tensor.T.numpy()
	            else:
	                # Otherwise assume it is already [samples, channels].
	                audio_np = audio_tensor.numpy()

	        # Reserve the name, then close the handle before soundfile reopens the
	        # path (writing to a still-open temp file fails on some platforms).
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
	            wav_path = f.name
	        sf.write(wav_path, audio_np, sample_rate)
	        return wav_path
	    except Exception as e:
	        # Best-effort cleanup so failed writes do not leave stray temp files.
	        if wav_path is not None:
	            try:
	                os.remove(wav_path)
	            except OSError:
	                pass
	        raise RuntimeError(f"Failed to save audio: {e}") from e

	def update_example_text(lang_choice):
	    """Return the example sentence for *lang_choice*, or the default text for unknown labels."""
	    if lang_choice in EXAMPLE_SENTENCES:
	        return EXAMPLE_SENTENCES[lang_choice]
	    return DEFAULT_TEXT

	# ─── Helper to force gender and switch to Voice Design mode ───
	def set_male():
	    """Return updates that set the gender to "Male" and the mode to "Voice Design"."""
	    gender_update = gr.update(value="Male")
	    mode_update = gr.update(value="Voice Design")
	    return [gender_update, mode_update]

	def set_female():
	    """Return updates that set the gender to "Female" and the mode to "Voice Design"."""
	    gender_update = gr.update(value="Female")
	    mode_update = gr.update(value="Voice Design")
	    return [gender_update, mode_update]

	# ─── Voice Design / Auto ───
	@spaces.GPU
	def generate_design(text, mode, lang_choice, gender, age, pitch, style,
	                    speed, duration, num_step, guidance_scale, denoise, postprocess):
	    """Synthesize *text* without a reference voice.

	    In "Voice Design" mode the gender/age/pitch/style selections are turned
	    into an ``instruct`` string for the model; in any other mode they are
	    ignored. A positive *duration* takes precedence over *speed*.

	    Returns:
	        ``(wav_path, status)`` on success, ``(None, message)`` when the input
	        is rejected or generation fails.
	    """
	    if not (text and text.strip()):
	        return None, "Please enter text."

	    n_words = _count_words(text)
	    if n_words > MAX_WORDS:
	        return None, f"Text too long: {n_words} words (max {MAX_WORDS}). Please shorten your input."

	    language = LANG_CODE_MAP.get(lang_choice, "kab")
	    gen_args = {
	        "num_step": int(num_step),
	        "guidance_scale": guidance_scale,
	        "denoise": denoise,
	        "language": language,
	    }

	    if mode == "Voice Design":
	        voice_prompt = _build_instruct(gender, age, pitch, style)
	        if voice_prompt:
	            gen_args["instruct"] = voice_prompt

	    # Exactly one of duration / speed is forwarded to the model.
	    if duration and duration > 0:
	        gen_args["duration"] = duration
	    else:
	        gen_args["speed"] = speed

	    if postprocess:
	        gen_args["postprocess_output"] = True

	    try:
	        waveform = model.generate(text=text, **gen_args)[0]
	        wav_path = _save_audio(waveform, 24000)
	        if hasattr(waveform, 'shape'):
	            seconds = waveform.shape[-1] / 24000
	        else:
	            seconds = 0
	        return wav_path, f"Generation complete ({seconds:.1f}s)"
	    except Exception as exc:
	        # UI boundary: report the failure in the status box instead of raising.
	        return None, f"Error: {exc}"

	# ─── Voice Clone ───
	@spaces.GPU
	def generate_clone(text, voice_choice, ref_audio, ref_text, lang_choice, speed, duration,
	                   num_step, guidance_scale, denoise, postprocess):
	    """Synthesize *text* in the voice of a reference recording.

	    The reference is the bundled file registered for *voice_choice* in
	    ``PRELOADED_VOICES`` when there is one, otherwise the uploaded *ref_audio*.
	    A blank *ref_text* is passed to the model as ``None``. A positive
	    *duration* takes precedence over *speed*.

	    Returns:
	        ``(wav_path, status)`` on success, ``(None, message)`` when the input
	        is rejected or generation fails.
	    """
	    if not text or not text.strip():
	        return None, "Please enter text."

	    word_count = _count_words(text)
	    if word_count > MAX_WORDS:
	        return None, f"Text too long: {word_count} words (max {MAX_WORDS}). Please shorten your input."

	    # Determine the actual reference audio path
	    preloaded_path = PRELOADED_VOICES.get(voice_choice)
	    if preloaded_path:
	        ref_audio = preloaded_path
	    elif ref_audio is None:
	        return None, "Please upload reference audio or select a pre‑loaded voice."

	    # Gradio's numpy audio payload is a (sample_rate, samples) tuple. Element 0
	    # is the sample rate, not a path, so materialise the samples as a WAV file
	    # and hand that path to the model instead.
	    temp_ref_path = None
	    if isinstance(ref_audio, tuple):
	        try:
	            ref_sample_rate, ref_samples = ref_audio
	            temp_ref_path = _save_audio(ref_samples, int(ref_sample_rate))
	        except Exception as e:
	            return None, f"Error: {e}"
	        ref_audio = temp_ref_path

	    lang_code = LANG_CODE_MAP.get(lang_choice, "kab")
	    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
	    kwargs["language"] = lang_code
	    # Exactly one of duration / speed is forwarded to the model.
	    if duration and duration > 0:
	        kwargs["duration"] = duration
	    else:
	        kwargs["speed"] = speed
	    if postprocess:
	        kwargs["postprocess_output"] = True
	    try:
	        audio = model.generate(
	            text=text,
	            ref_audio=ref_audio,
	            ref_text=ref_text if ref_text and ref_text.strip() else None,
	            **kwargs,
	        )
	        path = _save_audio(audio[0], 24000)
	        duration_sec = audio[0].shape[-1] / 24000 if hasattr(audio[0], 'shape') else 0
	        return path, f"Generation complete ({duration_sec:.1f}s)"
	    except Exception as e:
	        # UI boundary: report the failure in the status box instead of raising.
	        return None, f"Error: {e}"
	    finally:
	        # Only remove the reference file this call created itself.
	        if temp_ref_path is not None:
	            try:
	                os.remove(temp_ref_path)
	            except OSError:
	                pass

	def toggle_ref_audio(voice_choice):
	    """Return a visibility update: shown only when the "Upload my own" source is selected."""
	    wants_upload = voice_choice == "Upload my own"
	    return gr.update(visible=wants_upload)

	# ─── UI ───
	# Stylesheet handed to the app at launch. It styles the title and subtitle
	# elements, hides the page footer, and defines the word-counter classes;
	# ``over-limit`` is the variant emitted once the text exceeds MAX_WORDS.
	CSS = """
	.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
	.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
	footer { display: none !important; }
	.word-counter { text-align: right; font-size: 0.85em; color: #666; margin-top: -0.5em; }
	.word-counter.over-limit { color: #d32f2f; font-weight: bold; }
	"""

	# Two-tab interface: "Voice Design" (no reference audio) and "Voice Clone"
	# (reference audio). Each tab has its inputs in a left column and the
	# generated audio plus a status box in a right column.
	# NOTE(review): the nesting below was reconstructed from a source whose
	# indentation had been flattened — confirm container membership (notably what
	# sits inside the Group and the Accordions) against the deployed layout.
	with gr.Blocks(title="OmniVoice") as app:
	    gr.HTML("<h1 class='main-title'>OmniVoice</h1>")
	    gr.HTML("<p class='subtitle'>AI Voice Generator — Kabyle + Regional Languages</p>")

	    with gr.Tabs():
	        # ── Voice Design / Auto ──
	        with gr.Tab("Voice Design"):
	            with gr.Row():
	                with gr.Column(scale=1):
	                    d_text = gr.Textbox(
	                        label="Text to speak", lines=6,
	                        placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
	                        value=DEFAULT_TEXT
	                    )
	                    # Initial counter markup; kept in sync by update_word_counter below.
	                    d_word_counter = gr.HTML(
	                        value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
	                    )
	                    d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="Mode")
	                    d_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
	                                         label="Language", info="Select the language of the input text")

	                    # Update example text when language changes
	                    # (this replaces whatever is currently in the text box)
	                    d_lang.change(fn=update_example_text, inputs=d_lang, outputs=d_text)

	                    # ── Always visible gender buttons ──
	                    with gr.Row():
	                        male_btn = gr.Button("Masculine Voice", variant="secondary")
	                        female_btn = gr.Button("Feminine Voice", variant="secondary")
	                    gr.Markdown("These buttons switch to Voice Design mode and set the gender. Fine‑tune other attributes below.")

	                    # Voice design attributes (visible only when mode == "Voice Design")
	                    with gr.Group(visible=False) as d_voice_opts:
	                        with gr.Row():
	                            d_gender = gr.Dropdown(["Auto", "Female", "Male"],
	                                                   value="Auto", label="Gender")
	                            d_age = gr.Dropdown(["Auto", "Child", "Young", "Middle-aged", "Elderly"],
	                                                value="Auto", label="Age")
	                        with gr.Row():
	                            d_pitch = gr.Dropdown(
	                                ["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
	                                value="Auto", label="Pitch")
	                            d_style = gr.Dropdown(["Auto", "Whisper"],
	                                                  value="Auto", label="Style")

	                    d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")

	                    with gr.Accordion("Advanced Settings", open=False):
	                        d_duration = gr.Number(value=0, label="Duration (seconds)",
	                                               info="0 for auto. If set, Speed is ignored")
	                        d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
	                        d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
	                        d_denoise = gr.Checkbox(value=True, label="Denoise")
	                        d_postprocess = gr.Checkbox(value=True, label="Postprocess (silence removal)")

	                    d_btn = gr.Button("Generate Audio", variant="primary", size="lg")

	                with gr.Column(scale=1):
	                    d_audio = gr.Audio(label="Generated Audio")
	                    d_status = gr.Textbox(label="Status", interactive=False)

	            # Live word counter update
	            # (defined here, but also wired to the Voice Clone text box further down)
	            def update_word_counter(text):
	                """Return the counter markup for *text*, using the over-limit class past MAX_WORDS."""
	                count = _count_words(text)
	                css_class = "word-counter over-limit" if count > MAX_WORDS else "word-counter"
	                return f'<div class="{css_class}">{count} / {MAX_WORDS} words</div>'

	            d_text.change(fn=update_word_counter, inputs=d_text, outputs=d_word_counter)

	            # Button events
	            male_btn.click(fn=set_male, inputs=[], outputs=[d_gender, d_mode])
	            female_btn.click(fn=set_female, inputs=[], outputs=[d_gender, d_mode])

	            # Show/hide detailed voice options based on mode
	            d_mode.change(
	                fn=lambda m: gr.update(visible=m == "Voice Design"),
	                inputs=d_mode, outputs=d_voice_opts,
	            )
	            # Input order must match the positional parameters of generate_design.
	            d_btn.click(
	                fn=generate_design,
	                inputs=[d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
	                        d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess],
	                outputs=[d_audio, d_status],
	            )

	        # ── Voice Clone ──
	        with gr.Tab("Voice Clone"):
	            with gr.Row():
	                with gr.Column(scale=1):
	                    c_text = gr.Textbox(
	                        label="Text to speak", lines=6,
	                        placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
	                        value=DEFAULT_TEXT
	                    )
	                    # Initial counter markup; kept in sync by update_word_counter.
	                    c_word_counter = gr.HTML(
	                        value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
	                    )

	                    # Pre-loaded voice selector
	                    c_voice_choice = gr.Dropdown(
	                        choices=list(PRELOADED_VOICES.keys()),
	                        value="Upload my own",
	                        label="Voice Source",
	                        info="Choose a pre‑loaded voice or upload your own"
	                    )

	                    # Manual upload (hidden when a pre-loaded voice is selected)
	                    c_ref = gr.Audio(
	                        label="Reference Audio (3–15 seconds)",
	                        type="filepath",
	                        visible=True
	                    )

	                    c_ref_text = gr.Textbox(label="Transcription (optional)", lines=2,
	                                            placeholder="Leave empty for auto-transcription")
	                    c_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
	                                         label="Language", info="Select the language of the input text")

	                    # Update example text when language changes
	                    # (this replaces whatever is currently in the text box)
	                    c_lang.change(fn=update_example_text, inputs=c_lang, outputs=c_text)

	                    c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speed")

	                    with gr.Accordion("Advanced Settings", open=False):
	                        c_duration = gr.Number(value=0, label="Duration (seconds)")
	                        c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
	                        c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
	                        c_denoise = gr.Checkbox(value=True, label="Denoise")
	                        c_postprocess = gr.Checkbox(value=True, label="Postprocess (silence removal)")

	                    c_btn = gr.Button("Generate Audio", variant="primary", size="lg")

	                with gr.Column(scale=1):
	                    c_audio = gr.Audio(label="Generated Audio")
	                    c_status = gr.Textbox(label="Status", interactive=False)

	            # Live word counter update
	            c_text.change(fn=update_word_counter, inputs=c_text, outputs=c_word_counter)

	            # Toggle upload field visibility
	            c_voice_choice.change(
	                fn=toggle_ref_audio,
	                inputs=c_voice_choice,
	                outputs=c_ref
	            )

	            # Input order must match the positional parameters of generate_clone.
	            c_btn.click(
	                fn=generate_clone,
	                inputs=[c_text, c_voice_choice, c_ref, c_ref_text, c_lang, c_speed,
	                        c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
	                outputs=[c_audio, c_status],
	            )

	# Start the Gradio app only when this file is run as a script.
	# NOTE(review): the stylesheet is passed to launch() rather than gr.Blocks();
	# which of the two accepts ``css`` depends on the Gradio major version —
	# confirm against the release this Space pins.
	if __name__ == "__main__":
	    app.launch(css=CSS)