Namanrai committed on
Commit
48b44b5
·
verified ·
1 Parent(s): ac3cd0b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -114
app.py CHANGED
@@ -2,176 +2,136 @@ import os
2
  import gc
3
  import torch
4
  import gradio as gr
5
- import numpy as np
6
  import tempfile
7
 
8
  gc.collect()
9
  if torch.cuda.is_available():
10
  torch.cuda.empty_cache()
11
 
12
- print("⏳ VibeVoice TTS - Loading Engine...")
13
 
14
- # ===== MODEL LOAD =====
15
- tts_model = None
16
 
17
- try:
18
- from TTS.api import TTS
19
-
20
- # XTTS v2 - Real voice cloning model
21
- # CPU pe bhi kaam karta hai (slow but works)
22
- device = "cuda" if torch.cuda.is_available() else "cpu"
23
- print(f"πŸ–₯️ Device: {device}")
24
-
25
- tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
26
- print("βœ… XTTS v2 Engine Loaded!")
27
-
28
- except Exception as e:
29
- print(f"❌ Model Load Failed: {e}")
30
- tts_model = None
31
-
32
-
33
- # ===== MAIN FUNCTION =====
34
- def generate_voice(text, reference_audio, language):
35
- """
36
- text : Jo bolwana hai
37
- reference_audio: User ki apni awaaz ka sample (3-10 sec WAV/MP3)
38
- language : en / hi / ur etc.
39
- """
40
 
41
- # --- Basic Validation ---
42
- if tts_model is None:
43
- return None, "❌ Model load nahi hua. Server RAM/GPU issue hai. Space restart karo."
44
 
 
45
  if not text or text.strip() == "":
46
- return None, "⚠️ Text khali hai! Kuch likho pehle."
47
 
48
  if reference_audio is None:
49
- return None, "⚠️ Voice sample upload karo (3-10 second ka clean audio)."
50
 
51
- if len(text.strip()) > 500:
52
- return None, "⚠️ Text zyada lamba hai. 500 characters tak raho abhi."
53
 
54
  try:
55
- # Output file
56
  output_path = tempfile.mktemp(suffix=".wav")
57
 
58
- print(f"πŸŽ™οΈ Generating: '{text[:50]}...' | Lang: {language}")
 
59
 
60
- tts_model.tts_to_file(
61
  text=text,
62
- speaker_wav=reference_audio, # Real voice cloning yahan hota hai
63
  language=language,
64
- file_path=output_path,
 
65
  )
66
 
67
- print("βœ… Audio Generated!")
68
- return output_path, "βœ… Awaaz ban gayi! Neeche play karo ya download karo."
69
 
 
 
70
  except Exception as e:
71
  err = str(e)
72
- print(f"❌ Generation Error: {err}")
73
-
74
- # Common errors ko samajhne wali language mein batao
75
- if "cuda out of memory" in err.lower():
76
- return None, "❌ GPU RAM full ho gayi. Chhota text try karo ya CPU Space use karo."
77
- elif "ffmpeg" in err.lower():
78
- return None, "❌ Audio format issue. WAV ya MP3 file upload karo."
79
- elif "sample rate" in err.lower():
80
- return None, "❌ Audio quality low hai. 22050Hz ya upar ka audio use karo."
81
- else:
82
- return None, f"❌ Error aaya: {err}"
83
-
84
-
85
- # ===== GRADIO UI =====
86
- LANGUAGES = {
87
- "English": "en",
88
- "Hindi": "hi",
89
- "Urdu": "ur",
90
- "French": "fr",
91
- "Spanish": "es",
92
- "German": "de",
93
- "Italian": "it",
94
- "Portuguese": "pt",
95
- "Polish": "pl",
96
- "Turkish": "tr",
97
- "Russian": "ru",
98
- "Dutch": "nl",
99
- "Czech": "cs",
100
- "Arabic": "ar",
101
- "Chinese": "zh-cn",
102
- "Japanese": "ja",
103
- "Korean": "ko",
104
- "Hungarian": "hu",
105
- }
106
-
107
- with gr.Blocks(
108
- title="πŸŽ™οΈ VibeVoice TTS",
109
- theme=gr.themes.Soft(primary_hue="violet", secondary_hue="purple"),
110
- css="""
111
- .gradio-container { max-width: 800px !important; margin: auto; }
112
- h1 { text-align: center; color: #7c3aed; }
113
- .status-box textarea { font-size: 14px !important; }
114
- """
115
- ) as iface:
116
 
117
  gr.HTML("""
118
- <h1>πŸŽ™οΈ VibeVoice TTS</h1>
119
  <p style='text-align:center; color:#6b7280;'>
120
- Apni awaaz upload karo β†’ Text likho β†’ AI teri awaaz mein bolega
 
121
  </p>
122
  """)
123
 
124
  with gr.Row():
125
- with gr.Column(scale=1):
126
  gr.Markdown("### πŸ“ Step 1 – Text")
127
  text_input = gr.Textbox(
128
- label="Jo bolwana hai woh yahan likho",
129
- placeholder="Hello! Yeh meri awaaz hai, AI ne clone ki hai.",
130
  lines=4,
131
- max_lines=8,
132
  )
133
  lang_dropdown = gr.Dropdown(
134
- choices=list(LANGUAGES.keys()),
135
- value="Hindi",
136
  label="🌐 Language",
137
  )
138
 
139
- with gr.Column(scale=1):
140
- gr.Markdown("### 🎀 Step 2 – Apni Awaaz Upload Karo")
141
  audio_input = gr.Audio(
142
- sources=["upload", "microphone"],
143
  type="filepath",
144
- label="Voice Sample (3–10 seconds, saaf awaaz mein)",
145
  )
146
- gr.Markdown(
147
- "<small>πŸ’‘ Tips: Quiet room mein record karo. WAV ya MP3 dono chalega.</small>"
 
 
148
  )
149
 
150
- submit_btn = gr.Button("πŸš€ Generate Voice", variant="primary", size="lg")
151
 
152
- gr.Markdown("---")
153
  gr.Markdown("### πŸ”Š Result")
154
- audio_output = gr.Audio(label="Generated Voice", type="filepath")
155
- status_output = gr.Textbox(
156
- label="Status",
157
- interactive=False,
158
- elem_classes=["status-box"],
159
- )
160
 
161
- # Button click → function call
162
  submit_btn.click(
163
- fn=lambda text, audio, lang: generate_voice(text, audio, LANGUAGES[lang]),
164
- inputs=[text_input, audio_input, lang_dropdown],
165
  outputs=[audio_output, status_output],
166
  )
167
 
168
  gr.Markdown("""
169
  ---
170
- <p style='text-align:center; font-size:12px; color:#9ca3af;'>
171
- Powered by <b>Coqui XTTS v2</b> Β· Real Voice Cloning Β· 17 Languages
172
- </p>
 
 
173
  """)
174
 
175
-
176
  if __name__ == "__main__":
177
  iface.launch(server_name="0.0.0.0", server_port=7860)
 
2
  import gc
3
  import torch
4
  import gradio as gr
5
+ import soundfile as sf
6
  import tempfile
7
 
8
  gc.collect()
9
  if torch.cuda.is_available():
10
  torch.cuda.empty_cache()
11
 
12
+ print("⏳ Qwen3-TTS loading...")
13
 
14
+ model = None
 
15
 
16
+ def load_model():
17
+ global model
18
+ if model is not None:
19
+ return True
20
+ try:
21
+ from qwen_tts import Qwen3TTSModel
22
+ model = Qwen3TTSModel.from_pretrained(
23
+ "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
24
+ device_map="cuda" if torch.cuda.is_available() else "cpu",
25
+ dtype=torch.bfloat16,
26
+ )
27
+ print("βœ… Qwen3-TTS Loaded!")
28
+ return True
29
+ except Exception as e:
30
+ print(f"❌ Load Error: {e}")
31
+ return False
 
 
 
 
 
 
 
32
 
 
 
 
33
 
34
+ def generate_voice(text, reference_audio, ref_transcript, language):
35
  if not text or text.strip() == "":
36
+ return None, "⚠️ Text khali hai! Kuch likho."
37
 
38
  if reference_audio is None:
39
+ return None, "⚠️ Apni awaaz ka audio upload karo (3-10 sec)."
40
 
41
+ if not load_model():
42
+ return None, "❌ Model load nahi hua. GPU Space use kar raha hai? T4 GPU select karo."
43
 
44
  try:
 
45
  output_path = tempfile.mktemp(suffix=".wav")
46
 
47
+ # ref_transcript optional hai — agar nahi diya toh None pass karo
48
+ transcript = ref_transcript.strip() if ref_transcript and ref_transcript.strip() else None
49
 
50
+ wavs, sr = model.generate_voice_clone(
51
  text=text,
 
52
  language=language,
53
+ ref_audio=reference_audio,
54
+ ref_text=transcript, # None hoga toh model khud guess karega
55
  )
56
 
57
+ sf.write(output_path, wavs[0], sr)
58
+ return output_path, "βœ… Awaaz ban gayi! Neeche play/download karo."
59
 
60
+ except torch.cuda.OutOfMemoryError:
61
+ return None, "❌ GPU RAM full! Chhota text try karo (100 words tak)."
62
  except Exception as e:
63
  err = str(e)
64
+ print(f"❌ Error: {err}")
65
+ if "ffmpeg" in err.lower() or "audio" in err.lower():
66
+ return None, "❌ Audio format issue. WAV file upload karo."
67
+ return None, f"❌ Error: {err}"
68
+
69
+
70
+ # ── UI ──────────────────────────────────────────────────────────────────────
71
+
72
+ LANGUAGES = [
73
+ "English", "Chinese", "Japanese", "Korean",
74
+ "German", "French", "Russian", "Portuguese",
75
+ "Spanish", "Italian"
76
+ ]
77
+
78
+ with gr.Blocks(title="πŸŽ™οΈ VibeVoice – Qwen3 TTS") as iface:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  gr.HTML("""
81
+ <h1 style='text-align:center; color:#7c3aed;'>πŸŽ™οΈ VibeVoice – Qwen3 TTS</h1>
82
  <p style='text-align:center; color:#6b7280;'>
83
+ Apni awaaz upload karo β†’ Text likho β†’ AI teri awaaz mein bolega<br>
84
+ <small>Powered by Qwen3-TTS-0.6B Β· Real Voice Cloning</small>
85
  </p>
86
  """)
87
 
88
  with gr.Row():
89
+ with gr.Column():
90
  gr.Markdown("### πŸ“ Step 1 – Text")
91
  text_input = gr.Textbox(
92
+ label="Jo bolwana hai",
93
+ placeholder="Namaste! Yeh meri awaaz hai jo AI ne clone ki hai.",
94
  lines=4,
 
95
  )
96
  lang_dropdown = gr.Dropdown(
97
+ choices=LANGUAGES,
98
+ value="English",
99
  label="🌐 Language",
100
  )
101
 
102
+ with gr.Column():
103
+ gr.Markdown("### 🎀 Step 2 – Voice Sample")
104
  audio_input = gr.Audio(
105
+ source="upload",
106
  type="filepath",
107
+ label="Apni awaaz upload karo (3–10 sec, saaf audio)",
108
  )
109
+ ref_text_input = gr.Textbox(
110
+ label="Reference Audio ka text (optional, lekin doge toh quality better hogi)",
111
+ placeholder="Jo tumne us audio mein bola tha...",
112
+ lines=2,
113
  )
114
 
115
+ submit_btn = gr.Button("πŸš€ Generate Voice", variant="primary")
116
 
 
117
  gr.Markdown("### πŸ”Š Result")
118
+ audio_output = gr.Audio(label="Generated Voice")
119
+ status_output = gr.Textbox(label="Status", interactive=False)
 
 
 
 
120
 
 
121
  submit_btn.click(
122
+ fn=generate_voice,
123
+ inputs=[text_input, audio_input, ref_text_input, lang_dropdown],
124
  outputs=[audio_output, status_output],
125
  )
126
 
127
  gr.Markdown("""
128
  ---
129
+ πŸ’‘ **Tips:**
130
+ - GPU Space use karo (T4 free tier chalega)
131
+ - Reference audio: 5-10 second, quiet room, WAV format best hai
132
+ - Ref text doge toh cloning zyada accurate hogi
133
+ - Pehli baar thoda slow hoga (model download), baad mein fast
134
  """)
135
 
 
136
  if __name__ == "__main__":
137
  iface.launch(server_name="0.0.0.0", server_port=7860)