Spaces:

Namanrai
/

Glowmation-TTS-API

Running

App Files Files Community

Namanrai committed 8 days ago

Commit

48b44b5

verified ·

1 Parent(s): ac3cd0b

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -114

app.py CHANGED Viewed

@@ -2,176 +2,136 @@ import os
 import gc
 import torch
 import gradio as gr
-import numpy as np
 import tempfile
 gc.collect()
 if torch.cuda.is_available():
     torch.cuda.empty_cache()
-print("⏳ VibeVoice TTS - Loading Engine...")
-# ===== MODEL LOAD =====
-tts_model = None
-try:
-    from TTS.api import TTS
-    # XTTS v2 - Real voice cloning model
-    # CPU pe bhi kaam karta hai (slow but works)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    print(f"🖥️ Device: {device}")
-    tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
-    print("✅ XTTS v2 Engine Loaded!")
-except Exception as e:
-    print(f"❌ Model Load Failed: {e}")
-    tts_model = None
-# ===== MAIN FUNCTION =====
-def generate_voice(text, reference_audio, language):
-    """
-    text           : Jo bolwana hai
-    reference_audio: User ki apni awaaz ka sample (3-10 sec WAV/MP3)
-    language       : en / hi / ur etc.
-    """
-    # --- Basic Validation ---
-    if tts_model is None:
-        return None, "❌ Model load nahi hua. Server RAM/GPU issue hai. Space restart karo."
     if not text or text.strip() == "":
-        return None, "⚠️ Text khali hai! Kuch likho pehle."
     if reference_audio is None:
-        return None, "⚠️ Voice sample upload karo (3-10 second ka clean audio)."
-    if len(text.strip()) > 500:
-        return None, "⚠️ Text zyada lamba hai. 500 characters tak raho abhi."
     try:
-        # Output file
         output_path = tempfile.mktemp(suffix=".wav")
-        print(f"🎙️ Generating: '{text[:50]}...' | Lang: {language}")
-        tts_model.tts_to_file(
             text=text,
-            speaker_wav=reference_audio,   # Real voice cloning yahan hota hai
             language=language,
-            file_path=output_path,
         )
-        print("✅ Audio Generated!")
-        return output_path, "✅ Awaaz ban gayi! Neeche play karo ya download karo."
     except Exception as e:
         err = str(e)
-        print(f"❌ Generation Error: {err}")
-        # Common errors ko samajhne wali language mein batao
-        if "cuda out of memory" in err.lower():
-            return None, "❌ GPU RAM full ho gayi. Chhota text try karo ya CPU Space use karo."
-        elif "ffmpeg" in err.lower():
-            return None, "❌ Audio format issue. WAV ya MP3 file upload karo."
-        elif "sample rate" in err.lower():
-            return None, "❌ Audio quality low hai. 22050Hz ya upar ka audio use karo."
-        else:
-            return None, f"❌ Error aaya: {err}"
-# ===== GRADIO UI =====
-LANGUAGES = {
-    "English": "en",
-    "Hindi": "hi",
-    "Urdu": "ur",
-    "French": "fr",
-    "Spanish": "es",
-    "German": "de",
-    "Italian": "it",
-    "Portuguese": "pt",
-    "Polish": "pl",
-    "Turkish": "tr",
-    "Russian": "ru",
-    "Dutch": "nl",
-    "Czech": "cs",
-    "Arabic": "ar",
-    "Chinese": "zh-cn",
-    "Japanese": "ja",
-    "Korean": "ko",
-    "Hungarian": "hu",
-}
-with gr.Blocks(
-    title="🎙️ VibeVoice TTS",
-    theme=gr.themes.Soft(primary_hue="violet", secondary_hue="purple"),
-    css="""
-    .gradio-container { max-width: 800px !important; margin: auto; }
-    h1 { text-align: center; color: #7c3aed; }
-    .status-box textarea { font-size: 14px !important; }
-    """
-) as iface:
     gr.HTML("""
-    <h1>🎙️ VibeVoice TTS</h1>
     <p style='text-align:center; color:#6b7280;'>
-        Apni awaaz upload karo → Text likho → AI teri awaaz mein bolega
     </p>
     """)
     with gr.Row():
-        with gr.Column(scale=1):
             gr.Markdown("### 📝 Step 1 – Text")
             text_input = gr.Textbox(
-                label="Jo bolwana hai woh yahan likho",
-                placeholder="Hello! Yeh meri awaaz hai, AI ne clone ki hai.",
                 lines=4,
-                max_lines=8,
             )
             lang_dropdown = gr.Dropdown(
-                choices=list(LANGUAGES.keys()),
-                value="Hindi",
                 label="🌐 Language",
             )
-        with gr.Column(scale=1):
-            gr.Markdown("### 🎤 Step 2 – Apni Awaaz Upload Karo")
             audio_input = gr.Audio(
-                sources=["upload", "microphone"],
                 type="filepath",
-                label="Voice Sample (3–10 seconds, saaf awaaz mein)",
             )
-            gr.Markdown(
-                "<small>💡 Tips: Quiet room mein record karo. WAV ya MP3 dono chalega.</small>"
             )
-    submit_btn = gr.Button("🚀 Generate Voice", variant="primary", size="lg")
-    gr.Markdown("---")
     gr.Markdown("### 🔊 Result")
-    audio_output = gr.Audio(label="Generated Voice", type="filepath")
-    status_output = gr.Textbox(
-        label="Status",
-        interactive=False,
-        elem_classes=["status-box"],
-    )
-    # Button click → function call
     submit_btn.click(
-        fn=lambda text, audio, lang: generate_voice(text, audio, LANGUAGES[lang]),
-        inputs=[text_input, audio_input, lang_dropdown],
         outputs=[audio_output, status_output],
     )
     gr.Markdown("""
     ---
-    <p style='text-align:center; font-size:12px; color:#9ca3af;'>
-    Powered by <b>Coqui XTTS v2</b> · Real Voice Cloning · 17 Languages
-    </p>
     """)
 if __name__ == "__main__":
     iface.launch(server_name="0.0.0.0", server_port=7860)

 import gc
 import torch
 import gradio as gr
+import soundfile as sf
 import tempfile
+# Run a garbage-collection pass and release cached CUDA memory at startup.
 gc.collect()
 if torch.cuda.is_available():
     torch.cuda.empty_cache()
+# Startup banner only: the model itself is loaded lazily by load_model().
+print("⏳ Qwen3-TTS loading...")
+# Module-level cache for the loaded model; stays None until load_model() succeeds.
+model = None
+def load_model():
+    """Load the Qwen3-TTS model on first use and cache it in ``model``.
+
+    Returns:
+        True when ``model`` is available (already cached or loaded now);
+        False when importing or loading raised, after printing the error.
+    """
+    global model
+    if model is None:
+        try:
+            # Imported lazily so the module can start without the package loaded.
+            from qwen_tts import Qwen3TTSModel
+            target_device = "cuda" if torch.cuda.is_available() else "cpu"
+            model = Qwen3TTSModel.from_pretrained(
+                "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
+                device_map=target_device,
+                dtype=torch.bfloat16,
+            )
+            print("✅ Qwen3-TTS Loaded!")
+        except Exception as e:
+            print(f"❌ Load Error: {e}")
+            return False
+    return True
+def generate_voice(text, reference_audio, ref_transcript, language):
+    """Synthesize ``text`` in the voice of ``reference_audio``.
+
+    Args:
+        text: Text to speak; must contain non-whitespace characters.
+        reference_audio: Voice sample handed to the model as ``ref_audio``
+            (the UI supplies a file path); ``None`` is rejected.
+        ref_transcript: Optional transcript of the sample; a missing or
+            blank value is passed to the model as ``None``.
+        language: Language value forwarded to the model unchanged.
+
+    Returns:
+        ``(wav_path, status_message)`` on success, or
+        ``(None, status_message)`` when validation, model loading or
+        generation fails.
+    """
     if not text or text.strip() == "":
+        return None, "⚠️ Text khali hai! Kuch likho."
     if reference_audio is None:
+        return None, "⚠️ Apni awaaz ka audio upload karo (3-10 sec)."
+    if not load_model():
+        return None, "❌ Model load nahi hua. GPU Space use kar raha hai? T4 GPU select karo."
     try:
+        # The transcript is optional: send None when it is missing or blank.
+        transcript = (ref_transcript or "").strip() or None
+        wavs, sr = model.generate_voice_clone(
             text=text,
             language=language,
+            ref_audio=reference_audio,
+            ref_text=transcript,  # original note: with None the model guesses the transcript itself
         )
+        # Reserve the output file only after synthesis succeeded. Unlike the
+        # deprecated tempfile.mktemp, this creates the file atomically, so the
+        # name cannot be claimed by another process before it is written.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            output_path = tmp.name
+        sf.write(output_path, wavs[0], sr)
+        return output_path, "✅ Awaaz ban gayi! Neeche play/download karo."
+    except torch.cuda.OutOfMemoryError:
+        return None, "❌ GPU RAM full! Chhota text try karo (100 words tak)."
     except Exception as e:
         err = str(e)
+        print(f"❌ Error: {err}")
+        if "ffmpeg" in err.lower() or "audio" in err.lower():
+            return None, "❌ Audio format issue. WAV file upload karo."
+        return None, f"❌ Error: {err}"
+# ── UI ──────────────────────────────────────────────────────────────────────
+# Language names offered in the dropdown; the selected name is passed to
+# generate_voice unchanged.
+LANGUAGES = [
+    "English",
+    "Chinese",
+    "Japanese",
+    "Korean",
+    "German",
+    "French",
+    "Russian",
+    "Portuguese",
+    "Spanish",
+    "Italian",
+]
+# Gradio UI: text + language on the left, voice sample + optional transcript
+# on the right, then a generate button wired to generate_voice.
+with gr.Blocks(title="🎙️ VibeVoice – Qwen3 TTS") as iface:
     gr.HTML("""
+    <h1 style='text-align:center; color:#7c3aed;'>🎙️ VibeVoice – Qwen3 TTS</h1>
     <p style='text-align:center; color:#6b7280;'>
+        Apni awaaz upload karo → Text likho → AI teri awaaz mein bolega<br>
+        <small>Powered by Qwen3-TTS-0.6B · Real Voice Cloning</small>
     </p>
     """)
     with gr.Row():
+        with gr.Column():
             gr.Markdown("### 📝 Step 1 – Text")
             text_input = gr.Textbox(
+                label="Jo bolwana hai",
+                placeholder="Namaste! Yeh meri awaaz hai jo AI ne clone ki hai.",
                 lines=4,
             )
             lang_dropdown = gr.Dropdown(
+                choices=LANGUAGES,
+                value="English",
                 label="🌐 Language",
             )
+        with gr.Column():
+            gr.Markdown("### 🎤 Step 2 – Voice Sample")
             audio_input = gr.Audio(
+                # Gradio 4+ takes a list via `sources`; the old singular
+                # `source=` keyword raises TypeError there.
+                sources=["upload"],
                 type="filepath",
+                label="Apni awaaz upload karo (3–10 sec, saaf audio)",
             )
+            ref_text_input = gr.Textbox(
+                label="Reference Audio ka text (optional, lekin doge toh quality better hogi)",
+                placeholder="Jo tumne us audio mein bola tha...",
+                lines=2,
             )
+    submit_btn = gr.Button("🚀 Generate Voice", variant="primary")
     gr.Markdown("### 🔊 Result")
+    audio_output = gr.Audio(label="Generated Voice")
+    status_output = gr.Textbox(label="Status", interactive=False)
+    # Input order must match generate_voice(text, reference_audio, ref_transcript, language).
     submit_btn.click(
+        fn=generate_voice,
+        inputs=[text_input, audio_input, ref_text_input, lang_dropdown],
         outputs=[audio_output, status_output],
     )
     gr.Markdown("""
     ---
+    💡 **Tips:**
+    - GPU Space use karo (T4 free tier chalega)
+    - Reference audio: 5-10 second, quiet room, WAV format best hai
+    - Ref text doge toh cloning zyada accurate hogi
+    - Pehli baar thoda slow hoga (model download), baad mein fast
     """)
 if __name__ == "__main__":
     iface.launch(server_name="0.0.0.0", server_port=7860)