Spaces:
Runtime error
Runtime error
Commit ·
5f8297f
1
Parent(s): a81bf6e
deliverable api
Browse files- src/__pycache__/predict.cpython-310.pyc +0 -0
- src/__pycache__/rp_schema.cpython-310.pyc +0 -0
- src/__pycache__/se_extractor.cpython-310.pyc +0 -0
- src/predict.py +100 -29
- src/processed/4d651a78-ccbd-4f66-96b1-0e0ede048d77/raw/631a27e2-8466-463e-a6ca-a2afd468c5a3.wav +3 -0
- src/processed/69b28271-7198-4307-8501-e3969bbebef4/raw/631a27e2-8466-463e-a6ca-a2afd468c5a3.wav +3 -0
- src/rp_handler.py +30 -7
- src/rp_schema.py +16 -1
- src/se_extractor.py +28 -2
src/__pycache__/predict.cpython-310.pyc
CHANGED
|
Binary files a/src/__pycache__/predict.cpython-310.pyc and b/src/__pycache__/predict.cpython-310.pyc differ
|
|
|
src/__pycache__/rp_schema.cpython-310.pyc
CHANGED
|
Binary files a/src/__pycache__/rp_schema.cpython-310.pyc and b/src/__pycache__/rp_schema.cpython-310.pyc differ
|
|
|
src/__pycache__/se_extractor.cpython-310.pyc
CHANGED
|
Binary files a/src/__pycache__/se_extractor.cpython-310.pyc and b/src/__pycache__/se_extractor.cpython-310.pyc differ
|
|
|
src/predict.py
CHANGED
|
@@ -6,6 +6,11 @@ repository, with some modifications to make it work with the RP platform.
|
|
| 6 |
|
| 7 |
from concurrent.futures import ThreadPoolExecutor
|
| 8 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
from runpod.serverless.utils import rp_cuda
|
| 11 |
import boto3
|
|
@@ -52,6 +57,9 @@ from text_utils import TextCleaner
|
|
| 52 |
from pydantic import BaseModel, HttpUrl
|
| 53 |
from api import BaseSpeakerTTS, ToneColorConverter
|
| 54 |
|
|
|
|
|
|
|
|
|
|
| 55 |
class Predictor:
|
| 56 |
def __init__(self):
|
| 57 |
self.model = None
|
|
@@ -135,7 +143,70 @@ class Predictor:
|
|
| 135 |
self.tone_color_converter.load_ckpt(f'{self.ckpt_converter}/checkpoint.pth')
|
| 136 |
|
| 137 |
|
| 138 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
output_dir = 'processed'
|
| 140 |
gen_id = str(uuid.uuid4())
|
| 141 |
os.makedirs(output_dir,exist_ok=True)
|
|
@@ -159,29 +230,32 @@ class Predictor:
|
|
| 159 |
#voice_clone with styletts2
|
| 160 |
model,sampler = self.model,self.sampler
|
| 161 |
processed_seg_dir = os.path.join(segments_dir,s3_key.split('.')[0],'wavs')
|
| 162 |
-
result = self.process_audio_file(
|
| 163 |
final_output = os.path.join(results_dir,f"{gen_id}-voice-clone-1.wav")
|
|
|
|
| 164 |
sf.write(final_output,result,24000)
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
base_speaker_tts,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
|
| 167 |
reference_speaker = local_file_path
|
| 168 |
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
|
| 169 |
src_path = os.path.join(results_dir,f"{gen_id}-tmp.wav")
|
| 170 |
-
openvoice_output = os.path.join(results_dir,f"{gen_id}-2.wav")
|
| 171 |
base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0)
|
| 172 |
|
| 173 |
source_se = torch.load(f'{self.ckpt_base}/en_default_se.pth').to(self.device)
|
| 174 |
tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
|
| 181 |
mp3_final_output_1 = str(final_output).replace('wav','mp3')
|
| 182 |
-
mp3_final_output_2 = str(
|
| 183 |
self.convert_wav_to_mp3(final_output,mp3_final_output_1)
|
| 184 |
-
self.convert_wav_to_mp3(
|
| 185 |
print(mp3_final_output_1)
|
| 186 |
print(mp3_final_output_2)
|
| 187 |
|
|
@@ -200,19 +274,19 @@ class Predictor:
|
|
| 200 |
base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0,use_emotions=True)
|
| 201 |
source_se = torch.load(f'{self.ckpt_base}/en_style_se.pth').to(self.device)
|
| 202 |
tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
|
| 207 |
-
mp3_final_output_1 = str(
|
| 208 |
-
self.convert_wav_to_mp3(
|
| 209 |
print(mp3_final_output_1)
|
| 210 |
self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-with-emotions.mp3")
|
| 211 |
shutil.rmtree(os.path.join(output_dir,gen_id))
|
| 212 |
return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-with-emotions.mp3"
|
| 213 |
}
|
| 214 |
|
| 215 |
-
if method_type == '
|
| 216 |
#voice clone with multi-lingugal
|
| 217 |
_,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
|
| 218 |
reference_speaker = local_file_path
|
|
@@ -236,9 +310,10 @@ class Predictor:
|
|
| 236 |
self.tone_color_converter.convert(audio_src_path=openai_multi_lang_path, src_se=source_se, tgt_se=target_se, output_path=multi_lang_with_voice_clone_path,message='')
|
| 237 |
|
| 238 |
mp3_final_output_1 = str(multi_lang_with_voice_clone_path).replace('wav','mp3')
|
| 239 |
-
convert_wav_to_mp3(multi_lang_with_voice_clone_path,mp3_final_output_1)
|
| 240 |
print(mp3_final_output_1)
|
| 241 |
-
upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-multi-lang.mp3")
|
|
|
|
| 242 |
return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-multi-lang.mp3"
|
| 243 |
}
|
| 244 |
|
|
@@ -249,7 +324,7 @@ class Predictor:
|
|
| 249 |
|
| 250 |
solver = solver.lower()
|
| 251 |
nfe = int(nfe)
|
| 252 |
-
lambd = 0.
|
| 253 |
|
| 254 |
dwav, sr = torchaudio.load(path)
|
| 255 |
dwav = dwav.mean(dim=0)
|
|
@@ -380,14 +455,9 @@ class Predictor:
|
|
| 380 |
|
| 381 |
return torch.cat([ref_s, ref_p], dim=1)
|
| 382 |
|
| 383 |
-
def process_audio_file(self,
|
| 384 |
-
print(
|
| 385 |
-
|
| 386 |
-
print(audio_segs)
|
| 387 |
-
if len(audio_segs) >= 1:
|
| 388 |
-
s_ref = self.compute_style(audio_segs[0], model)
|
| 389 |
-
else:
|
| 390 |
-
raise NotImplementedError('No audio segments found!')
|
| 391 |
sentences = split_and_recombine_text(passage)
|
| 392 |
wavs = []
|
| 393 |
s_prev = None
|
|
@@ -398,7 +468,7 @@ class Predictor:
|
|
| 398 |
s_prev,
|
| 399 |
s_ref,
|
| 400 |
alpha = 0,
|
| 401 |
-
beta = 0.
|
| 402 |
t = 0.7,
|
| 403 |
diffusion_steps=10, embedding_scale=1)
|
| 404 |
wavs.append(wav)
|
|
@@ -437,4 +507,5 @@ class Predictor:
|
|
| 437 |
return False
|
| 438 |
except Exception as e:
|
| 439 |
print(f"Error uploading file: {e}")
|
| 440 |
-
return False
|
|
|
|
|
|
| 6 |
|
| 7 |
from concurrent.futures import ThreadPoolExecutor
|
| 8 |
import numpy as np
|
| 9 |
+
import base64
|
| 10 |
+
from pydub.utils import mediainfo
|
| 11 |
+
import tempfile
|
| 12 |
+
|
| 13 |
+
|
| 14 |
|
| 15 |
from runpod.serverless.utils import rp_cuda
|
| 16 |
import boto3
|
|
|
|
| 57 |
from pydantic import BaseModel, HttpUrl
|
| 58 |
from api import BaseSpeakerTTS, ToneColorConverter
|
| 59 |
|
| 60 |
+
from pydub import AudioSegment
|
| 61 |
+
|
| 62 |
+
|
| 63 |
class Predictor:
|
| 64 |
def __init__(self):
|
| 65 |
self.model = None
|
|
|
|
| 143 |
self.tone_color_converter.load_ckpt(f'{self.ckpt_converter}/checkpoint.pth')
|
| 144 |
|
| 145 |
|
| 146 |
+
def createvoice(self, audio_base_64, cut_audio, process_audio):
    """Decode a base64 WAV/MP3 upload, normalise it to WAV and store it in S3.

    Steps, in order:
      1. base64-decode the payload and sniff the container from the first
         12 bytes (``WAVE`` marker, MPEG frame sync, or an ID3v2 tag).
      2. Write the bytes to ``<uuid>.<ext>`` in the working directory and,
         for MP3 input, convert to ``<uuid>.wav`` with ffmpeg.
      3. If ``cut_audio`` > 0, trim the WAV in place via
         ``se_extractor.extract_segments_to_cut_audio``.
      4. If ``process_audio`` is truthy, run ``self._fn`` on the WAV
         (the original logs this as "Denoised"); otherwise load it unchanged
         with librosa at its native sample rate.
      5. Upload the resulting WAV to the ``demovidelyuseruploads`` bucket.

    Args:
        audio_base_64: base64 text (or bytes) of a WAV or MP3 file.
        cut_audio: value forwarded as ``max_duration`` to the trimming
            helper; 0/None disables trimming.
        process_audio: whether to run ``self._fn`` before uploading.

    Returns:
        ``{"url": <public S3 url>}`` on success, ``{"error": <message>}``
        on any handled failure.
    """
    try:
        file_bytes = base64.b64decode(audio_base_64)
    except (ValueError, TypeError) as e:
        # binascii.Error (bad padding) is a ValueError subclass.
        return {'error': f'audio is not valid base64: {e}'}

    header = file_bytes[:12]
    print(header)
    bucket_name = 'demovidelyuseruploads'
    if b'WAVE' in header:
        file_format = 'wav'
    elif header.startswith((b'ID3', b'\xff\xfb', b'\xff\xf3', b'\xff\xe3', b'\xff\xfa')):
        # b'ID3' covers MP3 files that begin with an ID3v2 tag instead of
        # a raw MPEG frame sync.
        file_format = 'mp3'
    else:
        return {'error':'unrecognized file format, encode audio file as base64 str'}

    unique_filename = f"{uuid.uuid4()}"
    local_filename = f"{unique_filename}.{file_format}"
    wav_filename = f"{unique_filename}.wav"
    file_url = f"https://{bucket_name}.s3.amazonaws.com/{wav_filename}"

    try:
        with open(local_filename, 'wb') as file_out:
            file_out.write(file_bytes)

        if file_format == "mp3":
            try:
                # check=True so a failed conversion is reported here rather
                # than as a confusing load error further down.
                subprocess.run(["ffmpeg", "-i", local_filename, wav_filename], check=True)
            except (subprocess.CalledProcessError, OSError) as e:
                print(f"Error converting mp3 to wav: {e}")
                return {"error": f"could not convert mp3 to wav: {e}"}
        print(wav_filename)

        # cut_audio > 0 means the caller asked for trimming.
        if cut_audio and cut_audio > 0:
            se_extractor.extract_segments_to_cut_audio(cut_audio, wav_filename)

        if process_audio:
            (sample_rate, samples) = self._fn(wav_filename, "Midpoint", 32, 0.5)
            print('Denoised')
            print(sample_rate)
        else:
            samples, sample_rate = librosa.load(wav_filename, sr=None)

        buffer = io.BytesIO()
        sf.write(buffer, samples, sample_rate, format='WAV')
        buffer.seek(0)

        print("uploading")
        content_type = "audio/wav"
        try:
            self.s3_client.put_object(Bucket=bucket_name, Key=wav_filename, Body=buffer, ContentType=content_type)
            print("uploaded")
        except Exception as e:
            print(f"Error uploading to S3: {e}")
            return {"error": str(e)}

        return {"url": file_url}
    finally:
        # Always remove the scratch files, including on the error returns
        # above (the original leaked the wav when the upload failed).
        for path in {local_filename, wav_filename}:
            if os.path.exists(path):
                os.remove(path)
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def predict(self,s3_url,passage,process_audio,method_type='voice_clone'):
|
| 210 |
output_dir = 'processed'
|
| 211 |
gen_id = str(uuid.uuid4())
|
| 212 |
os.makedirs(output_dir,exist_ok=True)
|
|
|
|
| 230 |
#voice_clone with styletts2
|
| 231 |
model,sampler = self.model,self.sampler
|
| 232 |
processed_seg_dir = os.path.join(segments_dir,s3_key.split('.')[0],'wavs')
|
| 233 |
+
result = self.process_audio_file(local_file_path,passage,model,sampler)
|
| 234 |
final_output = os.path.join(results_dir,f"{gen_id}-voice-clone-1.wav")
|
| 235 |
+
|
| 236 |
sf.write(final_output,result,24000)
|
| 237 |
+
if process_audio:
|
| 238 |
+
(new_sr, wav1) = self._fn(final_output,"Midpoint",32,0.5)
|
| 239 |
+
sf.write(final_output,wav1,new_sr)
|
| 240 |
|
| 241 |
base_speaker_tts,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
|
| 242 |
reference_speaker = local_file_path
|
| 243 |
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
|
| 244 |
src_path = os.path.join(results_dir,f"{gen_id}-tmp.wav")
|
| 245 |
+
openvoice_output = os.path.join(results_dir,f"{gen_id}-voice-clone-2.wav")
|
| 246 |
base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0)
|
| 247 |
|
| 248 |
source_se = torch.load(f'{self.ckpt_base}/en_default_se.pth').to(self.device)
|
| 249 |
tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
|
| 250 |
+
if process_audio:
|
| 251 |
+
(new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
|
| 252 |
+
sf.write(openvoice_output,wav1,new_sr)
|
| 253 |
+
|
|
|
|
| 254 |
|
| 255 |
mp3_final_output_1 = str(final_output).replace('wav','mp3')
|
| 256 |
+
mp3_final_output_2 = str(openvoice_output).replace('wav','mp3')
|
| 257 |
self.convert_wav_to_mp3(final_output,mp3_final_output_1)
|
| 258 |
+
self.convert_wav_to_mp3(openvoice_output,mp3_final_output_2)
|
| 259 |
print(mp3_final_output_1)
|
| 260 |
print(mp3_final_output_2)
|
| 261 |
|
|
|
|
| 274 |
base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0,use_emotions=True)
|
| 275 |
source_se = torch.load(f'{self.ckpt_base}/en_style_se.pth').to(self.device)
|
| 276 |
tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
|
| 277 |
+
if process_audio:
|
| 278 |
+
(new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
|
| 279 |
+
sf.write(openvoice_output,wav1,new_sr)
|
| 280 |
|
| 281 |
+
mp3_final_output_1 = str(openvoice_output).replace('wav','mp3')
|
| 282 |
+
self.convert_wav_to_mp3(openvoice_output,mp3_final_output_1)
|
| 283 |
print(mp3_final_output_1)
|
| 284 |
self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-with-emotions.mp3")
|
| 285 |
shutil.rmtree(os.path.join(output_dir,gen_id))
|
| 286 |
return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-with-emotions.mp3"
|
| 287 |
}
|
| 288 |
|
| 289 |
+
if method_type == 'voice_clone_with_multi_lang':
|
| 290 |
#voice clone with multi-lingugal
|
| 291 |
_,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
|
| 292 |
reference_speaker = local_file_path
|
|
|
|
| 310 |
self.tone_color_converter.convert(audio_src_path=openai_multi_lang_path, src_se=source_se, tgt_se=target_se, output_path=multi_lang_with_voice_clone_path,message='')
|
| 311 |
|
| 312 |
mp3_final_output_1 = str(multi_lang_with_voice_clone_path).replace('wav','mp3')
|
| 313 |
+
self.convert_wav_to_mp3(multi_lang_with_voice_clone_path,mp3_final_output_1)
|
| 314 |
print(mp3_final_output_1)
|
| 315 |
+
self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-multi-lang.mp3")
|
| 316 |
+
shutil.rmtree(os.path.join(output_dir,gen_id))
|
| 317 |
return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-multi-lang.mp3"
|
| 318 |
}
|
| 319 |
|
|
|
|
| 324 |
|
| 325 |
solver = solver.lower()
|
| 326 |
nfe = int(nfe)
|
| 327 |
+
lambd = 0.1 # lets remove denoise
|
| 328 |
|
| 329 |
dwav, sr = torchaudio.load(path)
|
| 330 |
dwav = dwav.mean(dim=0)
|
|
|
|
| 455 |
|
| 456 |
return torch.cat([ref_s, ref_p], dim=1)
|
| 457 |
|
| 458 |
+
def process_audio_file(self,local_file_path,passage,model,sampler):
|
| 459 |
+
print(local_file_path)
|
| 460 |
+
s_ref = self.compute_style(local_file_path, model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
sentences = split_and_recombine_text(passage)
|
| 462 |
wavs = []
|
| 463 |
s_prev = None
|
|
|
|
| 468 |
s_prev,
|
| 469 |
s_ref,
|
| 470 |
alpha = 0,
|
| 471 |
+
beta = 0.3,
|
| 472 |
t = 0.7,
|
| 473 |
diffusion_steps=10, embedding_scale=1)
|
| 474 |
wavs.append(wav)
|
|
|
|
| 507 |
return False
|
| 508 |
except Exception as e:
|
| 509 |
print(f"Error uploading file: {e}")
|
| 510 |
+
return False
|
| 511 |
+
|
src/processed/4d651a78-ccbd-4f66-96b1-0e0ede048d77/raw/631a27e2-8466-463e-a6ca-a2afd468c5a3.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c25a5bee8b60933b09cc779d942fa5c219f437e455bf64b08c2623f1c833ccfe
|
| 3 |
+
size 322856
|
src/processed/69b28271-7198-4307-8501-e3969bbebef4/raw/631a27e2-8466-463e-a6ca-a2afd468c5a3.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c25a5bee8b60933b09cc779d942fa5c219f437e455bf64b08c2623f1c833ccfe
|
| 3 |
+
size 322856
|
src/rp_handler.py
CHANGED
|
@@ -21,13 +21,36 @@ MODEL.setup()
|
|
| 21 |
@rp_debugger.FunctionTimer
|
| 22 |
def run_voice_clone_job(job):
|
| 23 |
job_input = job['input']
|
| 24 |
-
method_type = job_input
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
runpod.serverless.start({"handler": run_voice_clone_job})
|
|
|
|
| 21 |
@rp_debugger.FunctionTimer
def run_voice_clone_job(job):
    """RunPod handler: dispatch a job to ``MODEL.createvoice`` or ``MODEL.predict``.

    Reads ``job['input']`` and routes on ``method_type``:

    * ``create_voice`` -> ``MODEL.createvoice(audio_base64, cut_audio, process_audio)``
    * ``voice_clone`` / ``voice_clone_with_emotions`` /
      ``voice_clone_with_multi_lang`` ->
      ``MODEL.predict(s3_url, passage, process_audio, method_type)``

    Optional inputs default to ``process_audio=False`` and ``cut_audio=0``
    when absent or null. Returns whatever the model call returns, or an
    ``{"error": ...}`` dict when ``method_type`` is unsupported or a
    required input is missing.
    """
    job_input = job['input']
    method_type = job_input.get('method_type')

    if method_type not in ["create_voice","voice_clone","voice_clone_with_emotions","voice_clone_with_multi_lang"]:
        return {"error":"Please set method_type: available options, create_voice, voice_clone, voice_clone_with_emotions,voice_clone_with_multi_lang"}

    # Shared by both branches; an explicit null is treated like "not set".
    process_audio = job_input.get('process_audio')
    print(process_audio)
    if process_audio is None:
        process_audio = False

    if method_type == "create_voice":
        audio_base64 = job_input.get('audio_base64')
        if audio_base64 is None:
            return {"error":"Needs audio file as base64"}
        cut_audio = job_input.get('cut_audio')
        if cut_audio is None:
            cut_audio = 0
        return MODEL.createvoice(audio_base64, cut_audio, process_audio)

    s3_url = job_input.get('s3_url')
    passage = job_input.get('passage')
    # Fail fast with an error dict instead of letting predict() crash on
    # None, mirroring the audio_base64 check in the create_voice branch.
    missing = [name for name, value in (('s3_url', s3_url), ('passage', passage)) if not value]
    if missing:
        return {"error": f"Missing required input(s) for {method_type}: {', '.join(missing)}"}

    return MODEL.predict(s3_url, passage, process_audio, method_type)
|
| 54 |
|
| 55 |
|
| 56 |
runpod.serverless.start({"handler": run_voice_clone_job})
|
src/rp_schema.py
CHANGED
|
@@ -14,5 +14,20 @@ INPUT_VALIDATIONS = {
|
|
| 14 |
'required': False,
|
| 15 |
'default': 'None'
|
| 16 |
},
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
}
|
|
|
|
| 14 |
'required': False,
|
| 15 |
'default': 'None'
|
| 16 |
},
|
| 17 |
+
'audio_base64': {
|
| 18 |
+
'type': str,
|
| 19 |
+
'required': False,
|
| 20 |
+
'default': 'None'
|
| 21 |
+
},
|
| 22 |
+
'cut_audio': {
|
| 23 |
+
'type': int,
|
| 24 |
+
'required': False,
|
| 25 |
+
'default': 0
|
| 26 |
+
},
|
| 27 |
+
'process_audio': {
|
| 28 |
+
'type': bool,
|
| 29 |
+
'required': False,
|
| 30 |
+
'default': False
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
}
|
src/se_extractor.py
CHANGED
|
@@ -10,7 +10,7 @@ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
|
|
| 10 |
model_size = "medium"
|
| 11 |
# Run on GPU with FP16
|
| 12 |
model = None
|
| 13 |
-
def split_audio_whisper(audio_path, target_dir='processed'):
|
| 14 |
global model
|
| 15 |
if model is None:
|
| 16 |
model = WhisperModel(model_size, device="cuda", compute_type="float16")
|
|
@@ -62,13 +62,18 @@ def split_audio_whisper(audio_path, target_dir='processed'):
|
|
| 62 |
output_file = os.path.join(wavs_folder, fname)
|
| 63 |
audio_seg.export(output_file, format='wav')
|
| 64 |
|
|
|
|
|
|
|
|
|
|
| 65 |
if k < len(segments) - 1:
|
| 66 |
-
start_time = max(0, segments[k+1].start -
|
| 67 |
|
| 68 |
s_ind = s_ind + 1
|
| 69 |
return wavs_folder
|
| 70 |
|
| 71 |
|
|
|
|
|
|
|
| 72 |
def split_audio_vad(audio_path, target_dir, split_seconds=10.0):
|
| 73 |
SAMPLE_RATE = 16000
|
| 74 |
audio_vad = get_audio_tensor(audio_path)
|
|
@@ -155,3 +160,24 @@ def generate_voice_segments(audio_path, target_dir='processed', vad=True):
|
|
| 155 |
def load_model():
    """Instantiate a CPU/int8 ``WhisperModel`` of size ``model_size`` and discard it.

    NOTE(review): there is no ``global model`` here, so the module-level
    ``model`` is left untouched and the function returns None. Presumably
    this exists only to trigger the weight download ahead of time -- confirm.
    """
    WhisperModel(model_size, device="cpu", compute_type="int8")
|
| 157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
model_size = "medium"
|
| 11 |
# Run on GPU with FP16
|
| 12 |
model = None
|
| 13 |
+
def split_audio_whisper(audio_path, target_dir='processed',needs_offset=True):
|
| 14 |
global model
|
| 15 |
if model is None:
|
| 16 |
model = WhisperModel(model_size, device="cuda", compute_type="float16")
|
|
|
|
| 62 |
output_file = os.path.join(wavs_folder, fname)
|
| 63 |
audio_seg.export(output_file, format='wav')
|
| 64 |
|
| 65 |
+
offset = 0.0
|
| 66 |
+
if needs_offset:
|
| 67 |
+
offset = 0.08
|
| 68 |
if k < len(segments) - 1:
|
| 69 |
+
start_time = max(0, segments[k+1].start - offset)
|
| 70 |
|
| 71 |
s_ind = s_ind + 1
|
| 72 |
return wavs_folder
|
| 73 |
|
| 74 |
|
| 75 |
+
|
| 76 |
+
|
| 77 |
def split_audio_vad(audio_path, target_dir, split_seconds=10.0):
|
| 78 |
SAMPLE_RATE = 16000
|
| 79 |
audio_vad = get_audio_tensor(audio_path)
|
|
|
|
| 160 |
def load_model():
    """Instantiate a CPU/int8 ``WhisperModel`` of size ``model_size`` and discard it.

    NOTE(review): there is no ``global model`` here, so the module-level
    ``model`` is left untouched and the function returns None. Presumably
    this exists only to trigger the weight download ahead of time -- confirm.
    """
    WhisperModel(model_size, device="cpu", compute_type="int8")
|
| 162 |
|
| 163 |
+
|
| 164 |
+
def extract_segments_to_cut_audio(max_duration, audio_path, target_dir='processed'):
    """Trim ``audio_path`` in place at the first segment boundary past ``max_duration``.

    The file is transcribed with the shared module-level Whisper ``model``
    (created on CUDA/float16 on first use). The audio is cut at the end of
    the first transcribed segment whose end time exceeds ``max_duration``,
    so the result can be somewhat longer than ``max_duration``. If no
    segment ends after ``max_duration`` the full audio is kept. In every
    case the file at ``audio_path`` is re-exported as WAV.

    Args:
        max_duration: threshold compared against ``segment.end`` (which the
            code converts to milliseconds with ``* 1000``, i.e. seconds).
        audio_path: file to read and overwrite.
        target_dir: unused; kept for interface compatibility.
    """
    global model
    if model is None:
        model = WhisperModel(model_size, device="cuda", compute_type="float16")
    audio = AudioSegment.from_file(audio_path)

    segments, _info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)

    # Iterate the segments as they are produced rather than materialising
    # them with list(): presumably this is faster-whisper, whose segments
    # are a lazy generator, so breaking early skips decoding the rest of
    # the file -- TODO confirm.
    end_time = len(audio)  # pydub lengths and slice indices are in ms
    for segment in segments:
        print(segment.end)
        if segment.end > max_duration:
            end_time = segment.end * 1000
            break

    audio[0:end_time].export(audio_path, format='wav')
|
| 182 |
+
|
| 183 |
+
|