Namanrai committed on
Commit
b4935c5
·
verified ·
1 Parent(s): 8f12b2f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +165 -35
app.py CHANGED
@@ -1,47 +1,177 @@
1
- import gradio as gr
2
- from transformers import pipeline
3
- import soundfile as sf
4
- import torch
5
- from datasets import load_dataset
6
  import gc
 
 
 
 
7
 
8
# Free any memory left over from a previous run before loading the model.
gc.collect()

print("⏳ WARNING: Loading the BIGGEST Engine... Server crash hone ke chances hain!")

# Bind both names up front so a failed load leaves them defined (as None)
# instead of raising NameError when the first request arrives.
synthesizer = None
speaker_embedding = None

try:
    # Load the SpeechT5 text-to-speech pipeline.
    synthesizer = pipeline("text-to-speech", "microsoft/speecht5_tts")

    # Pick one x-vector from the CMU Arctic embeddings dataset as the speaker
    # voice; unsqueeze(0) adds a leading dimension to the vector.
    embeddings_dataset = load_dataset("Matthijs/cmu_arctic_xvectors", split="validation")
    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    print("✅ Engine load ho gaya! (Ye ek miracle hai)")
except Exception as e:
    # Keep the module importable even when the model cannot be loaded.
    print(f"❌ Server Crashed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
def generate_api_voice(text):
    """Synthesize speech for ``text`` and return it as a WAV file.

    Args:
        text: The script to speak.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path of
        the generated WAV file, or ``None`` when the input is empty or the
        engine fails; ``status_message`` is a user-facing string.
    """
    # Local imports: the surrounding file does not import these at the top.
    import os
    import tempfile

    # Reject missing and whitespace-only scripts alike.
    if not text or not text.strip():
        return None, "Error: Script likhna zaruri hai!"

    try:
        # Run the text-to-speech pipeline with the preloaded speaker embedding.
        speech = synthesizer(text, forward_params={"speaker_embeddings": speaker_embedding})

        # One unique file per request, so concurrent requests cannot overwrite
        # each other's audio.
        fd, output_file = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            sf.write(output_file, speech["audio"], samplerate=speech["sampling_rate"])
        except Exception:
            # Do not leave an empty temp file behind when writing fails.
            os.remove(output_file)
            raise

        return output_file, "✅ API Status: Success"
    except Exception as e:
        # Report any engine failure through the status box instead of raising.
        return None, f"⚠️ Engine Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
# Minimal UI: one script textbox in, generated audio plus a status line out.
iface = gr.Interface(
    fn=generate_api_voice,
    inputs=[gr.Textbox(label="Apni Script Yahan Likho")],
    outputs=[gr.Audio(label="VoiceForge Output"), gr.Textbox(label="Status")],
    title="🎙️ VoiceForge AI Studio - Heavy Engine Test",
)

iface.launch()
 
 
1
+ import os
 
 
 
 
2
  import gc
3
+ import torch
4
+ import gradio as gr
5
+ import numpy as np
6
+ import tempfile
7
 
 
8
  gc.collect()
9
# Also release cached GPU memory when CUDA is present.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("⏳ VibeVoice TTS - Loading Engine...")

# ===== MODEL LOAD =====
# Stays None when loading fails; generate_voice checks for None before use.
tts_model = None

try:
    # Imported inside the try so a missing or broken TTS install is reported
    # below instead of crashing the app at import time.
    from TTS.api import TTS

    # XTTS v2 voice-cloning model; it also runs on CPU, only slower.
    # NOTE(review): the first model download may ask for licence confirmation
    # on stdin, which cannot be answered on a headless server - confirm, and
    # configure the acceptance explicitly if that is the case.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🖥️ Device: {device}")

    tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    print("✅ XTTS v2 Engine Loaded!")
except Exception as e:
    # tts_model is still None at this point, which is the failure signal.
    print(f"❌ Model Load Failed: {e}")
31
+
32
+
33
+ # ===== MAIN FUNCTION =====
34
def generate_voice(text, reference_audio, language):
    """Speak ``text`` in the voice of ``reference_audio`` using the loaded model.

    Args:
        text: The text to speak; at most 500 characters after stripping.
        reference_audio: The user's voice sample (about 3-10 seconds, WAV or
            MP3), passed to the model as ``speaker_wav``.
        language: Language code passed to the model, e.g. "en" or "hi".

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path of
        the generated WAV file, or ``None`` when validation or synthesis
        fails; ``status_message`` is always a user-facing string.
    """
    # --- Basic validation ---
    if tts_model is None:
        return None, "❌ Model load nahi hua. Server RAM/GPU issue hai. Space restart karo."

    stripped = text.strip() if text else ""
    if not stripped:
        return None, "⚠️ Text khali hai! Kuch likho pehle."

    if reference_audio is None:
        return None, "⚠️ Voice sample upload karo (3-10 second ka clean audio)."

    if len(stripped) > 500:
        return None, "⚠️ Text zyada lamba hai. 500 characters tak raho abhi."

    output_path = None
    try:
        # mkstemp creates the file atomically; the deprecated mktemp only
        # returns a name and is open to a race with other processes.
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        print(f"🎙️ Generating: '{text[:50]}...' | Lang: {language}")

        tts_model.tts_to_file(
            text=text,
            speaker_wav=reference_audio,  # the voice sample to clone
            language=language,
            file_path=output_path,
        )

        print("✅ Audio Generated!")
        return output_path, "✅ Awaaz ban gayi! Neeche play karo ya download karo."

    except Exception as e:
        err = str(e)
        print(f"❌ Generation Error: {err}")

        # Best-effort cleanup so failed requests do not pile up temp files.
        if output_path is not None and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass  # cleanup failure must not mask the real error below

        # Map common failures to messages the user can act on.
        lowered = err.lower()
        if "cuda out of memory" in lowered:
            return None, "❌ GPU RAM full ho gayi. Chhota text try karo ya CPU Space use karo."
        if "ffmpeg" in lowered:
            return None, "❌ Audio format issue. WAV ya MP3 file upload karo."
        if "sample rate" in lowered:
            return None, "❌ Audio quality low hai. 22050Hz ya upar ka audio use karo."
        return None, f"❌ Error aaya: {err}"
83
+
84
+
85
+ # ===== GRADIO UI =====
86
# Display name -> language code passed to the TTS model.
# Only languages XTTS v2 supports are listed. Urdu ("ur") is not among them,
# so offering it would only produce a runtime error on every request.
LANGUAGES = {
    "English": "en",
    "Hindi": "hi",
    "French": "fr",
    "Spanish": "es",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Polish": "pl",
    "Turkish": "tr",
    "Russian": "ru",
    "Dutch": "nl",
    "Czech": "cs",
    "Arabic": "ar",
    "Chinese": "zh-cn",
    "Japanese": "ja",
    "Korean": "ko",
    "Hungarian": "hu",
}
106
+
107
def _on_generate(text, audio, lang):
    """Map the dropdown's display name to a language code and run generate_voice."""
    code = LANGUAGES.get(lang)
    if code is None:
        # Answer through the status box rather than raising KeyError.
        return None, f"⚠️ Language '{lang}' supported nahi hai."
    return generate_voice(text, audio, code)


with gr.Blocks(
    title="🎙️ VibeVoice TTS",
    theme=gr.themes.Soft(primary_hue="violet", secondary_hue="purple"),
    css="""
    .gradio-container { max-width: 800px !important; margin: auto; }
    h1 { text-align: center; color: #7c3aed; }
    .status-box textarea { font-size: 14px !important; }
    """,
) as iface:

    # Page header.
    gr.HTML("""
        <h1>🎙️ VibeVoice TTS</h1>
        <p style='text-align:center; color:#6b7280;'>
            Apni awaaz upload karo → Text likho → AI teri awaaz mein bolega
        </p>
    """)

    with gr.Row():
        # Left column: the text to speak and its language.
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Step 1 – Text")
            text_input = gr.Textbox(
                label="Jo bolwana hai woh yahan likho",
                placeholder="Hello! Yeh meri awaaz hai, AI ne clone ki hai.",
                lines=4,
                max_lines=8,
            )
            lang_dropdown = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="Hindi",
                label="🌐 Language",
            )

        # Right column: the reference voice sample, handed over as a file path.
        with gr.Column(scale=1):
            gr.Markdown("### 🎀 Step 2 – Apni Awaaz Upload Karo")
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Voice Sample (3–10 seconds, saaf awaaz mein)",
            )
            gr.Markdown(
                "<small>💡 Tips: Quiet room mein record karo. WAV ya MP3 dono chalega.</small>"
            )

    submit_btn = gr.Button("🚀 Generate Voice", variant="primary", size="lg")

    gr.Markdown("---")
    gr.Markdown("### 🔊 Result")
    audio_output = gr.Audio(label="Generated Voice", type="filepath")
    status_output = gr.Textbox(
        label="Status",
        interactive=False,
        elem_classes=["status-box"],
    )

    # Button click -> resolve the language code, then synthesize.
    submit_btn.click(
        fn=_on_generate,
        inputs=[text_input, audio_input, lang_dropdown],
        outputs=[audio_output, status_output],
    )

    # Page footer.
    gr.Markdown("""
        ---
        <p style='text-align:center; font-size:12px; color:#9ca3af;'>
            Powered by <b>Coqui XTTS v2</b> · Real Voice Cloning · 17 Languages
        </p>
    """)
174
 
 
 
 
 
 
 
 
175
 
176
if __name__ == "__main__":
    # Start the server only when run as a script: all interfaces, port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    iface.launch(**launch_options)