| import torch, uuid
|
| import os, sys, shutil
|
| from src.utils.preprocess import CropAndExtract
|
| from src.test_audio2coeff import Audio2Coeff
|
| from src.facerender.animate import AnimateFromCoeff
|
| from src.generate_batch import get_data
|
| from src.generate_facerender_batch import get_facerender_data
|
|
|
| from src.utils.init_path import init_path
|
|
|
| from pydub import AudioSegment
|
|
|
|
|
def mp3_to_wav(mp3_filename, wav_filename, frame_rate):
    """Re-encode an audio file as WAV at the requested frame rate.

    Args:
        mp3_filename: Path of the input audio file, loaded with
            ``AudioSegment.from_file``.
        wav_filename: Path the WAV output is exported to.
        frame_rate: Frame rate applied via ``set_frame_rate`` before export.
    """
    resampled = AudioSegment.from_file(file=mp3_filename).set_frame_rate(frame_rate)
    resampled.export(wav_filename, format="wav")
|
|
|
|
|
class SadTalker():
    """Driver that animates a source image from an audio source.

    The instance only stores configuration (device, checkpoint and config
    paths). The models themselves are created inside :meth:`test` for every
    call and released again before it returns.
    """

    # Values of ``ref_info`` accepted when ``use_ref_video`` is set.
    _REF_INFO_MODES = ('pose', 'blink', 'pose+blink', 'all')
    # Attributes holding the per-call models; created and dropped by ``test``.
    _MODEL_ATTRS = ('preprocess_model', 'audio_to_coeff', 'animate_from_coeff')

    def __init__(self, checkpoint_path='checkpoints', config_path='src/config', lazy_load=False):
        """Record the configuration used by later :meth:`test` calls.

        Args:
            checkpoint_path: Directory handed to ``init_path``; it is also
                exported as the ``TORCH_HOME`` environment variable.
            config_path: Directory handed to ``init_path``.
            lazy_load: Accepted for interface compatibility; not used by
                this class.
        """
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"

        self.device = device

        os.environ['TORCH_HOME'] = checkpoint_path

        self.checkpoint_path = checkpoint_path
        self.config_path = config_path

    def test(self, source_image, driven_audio, preprocess='crop',
             still_mode=False, use_enhancer=False, batch_size=1, size=256,
             pose_style=0, exp_scale=1.0,
             use_ref_video=False,
             ref_video=None,
             ref_info=None,
             use_idle_mode=False,
             length_of_audio=0, use_blink=True,
             result_dir='./results/'):
        """Generate a video for ``source_image`` and return the renderer's result.

        Exactly one audio source is used, chosen in this order: an existing
        ``driven_audio`` file, idle mode (a generated silent clip), or the
        audio track of ``ref_video`` when ``ref_info == 'all'``. With
        ``ref_info == 'all'`` the reference video's audio replaces any other
        source and its coefficients drive the animation directly.

        All outputs are written below ``result_dir/<uuid4>``. Note that
        ``source_image`` (and a non-MP3 ``driven_audio``) is *moved* into
        that directory, not copied.

        Args:
            source_image: Path of the image to animate.
            driven_audio: Path of the driving audio, or ``None``. A file with
                an ``.mp3`` extension is converted to a 16000 Hz WAV and the
                original is left in place; any other file is moved.
            preprocess: Forwarded to ``init_path``, ``CropAndExtract.generate``,
                ``get_facerender_data`` and ``AnimateFromCoeff.generate``.
            still_mode: Forwarded to ``get_data`` and ``get_facerender_data``.
            use_enhancer: When true, ``'gfpgan'`` is passed as the enhancer.
            batch_size: Forwarded to ``get_facerender_data``.
            size: Forwarded as the image size to the helpers above.
            pose_style: Forwarded to ``Audio2Coeff.generate``.
            exp_scale: Forwarded as ``expression_scale``.
            use_ref_video: Whether ``ref_video`` is used as a reference.
            ref_video: Path of the reference video.
            ref_info: One of ``'pose'``, ``'blink'``, ``'pose+blink'`` or
                ``'all'``; only consulted when ``use_ref_video`` is set.
            use_idle_mode: Use a silent clip instead of ``driven_audio``.
            length_of_audio: Length of the silent clip; multiplied by 1000
                for pydub (presumably seconds -> milliseconds).
            use_blink: Forwarded to ``get_data``.
            result_dir: Parent directory for the per-call output directory.

        Returns:
            Whatever ``AnimateFromCoeff.generate`` returns (presumably the
            path of the rendered video).

        Raises:
            AssertionError: If no audio source is available.
            ValueError: If ``use_ref_video`` is set and ``ref_info`` is not a
                supported mode.
            AttributeError: If no face is detected in ``source_image``.
        """
        uses_ref_audio = bool(use_ref_video) and ref_info == 'all'
        has_audio_file = driven_audio is not None and os.path.isfile(driven_audio)

        # Validate the arguments before loading any model or moving caller
        # files, so a bad call fails fast and without side effects.
        if not (has_audio_file or use_idle_mode or uses_ref_audio):
            print(use_ref_video, ref_info)
            # Raised explicitly (not via ``assert``) so the check survives
            # ``python -O``; the exception type is unchanged for callers.
            raise AssertionError(
                "no audio source: pass an existing driven_audio file, set "
                "use_idle_mode, or use a reference video with ref_info='all'")
        if use_ref_video and ref_info not in self._REF_INFO_MODES:
            raise ValueError('error in refinfo')

        self.sadtalker_paths = init_path(self.checkpoint_path, self.config_path, size, False, preprocess)
        print(self.sadtalker_paths)

        try:
            self.audio_to_coeff = Audio2Coeff(self.sadtalker_paths, self.device)
            self.preprocess_model = CropAndExtract(self.sadtalker_paths, self.device)
            self.animate_from_coeff = AnimateFromCoeff(self.sadtalker_paths, self.device)

            time_tag = str(uuid.uuid4())
            save_dir = os.path.join(result_dir, time_tag)
            os.makedirs(save_dir, exist_ok=True)

            input_dir = os.path.join(save_dir, 'input')
            os.makedirs(input_dir, exist_ok=True)

            print(source_image)
            pic_path = os.path.join(input_dir, os.path.basename(source_image))
            shutil.move(source_image, input_dir)

            if has_audio_file:
                audio_path = os.path.join(input_dir, os.path.basename(driven_audio))
                # Decide on the real extension (case-insensitively) rather
                # than on '.mp3' appearing anywhere in the path.
                stem, ext = os.path.splitext(audio_path)
                if ext.lower() == '.mp3':
                    audio_path = stem + '.wav'
                    mp3_to_wav(driven_audio, audio_path, 16000)
                else:
                    shutil.move(driven_audio, input_dir)
            elif use_idle_mode:
                audio_path = os.path.join(input_dir, 'idlemode_'+str(length_of_audio)+'.wav')
                silence = AudioSegment.silent(duration=1000*length_of_audio)
                silence.export(audio_path, format="wav")
            else:
                # Only reachable with a reference video and ref_info == 'all'
                # (validated above); audio_path is assigned just below.
                print(use_ref_video, ref_info)

            if uses_ref_audio:
                ref_video_videoname = os.path.basename(ref_video)
                audio_path = os.path.join(save_dir, ref_video_videoname+'.wav')
                print('new audiopath:',audio_path)
                self._extract_audio(ref_video, audio_path)

            first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
            os.makedirs(first_frame_dir, exist_ok=True)
            first_coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(
                pic_path, first_frame_dir, preprocess, True, size)

            if first_coeff_path is None:
                raise AttributeError("No face is detected")

            ref_video_coeff_path = None
            if use_ref_video:
                print('using ref video for genreation')
                ref_video_videoname = os.path.splitext(os.path.split(ref_video)[-1])[0]
                ref_video_frame_dir = os.path.join(save_dir, ref_video_videoname)
                os.makedirs(ref_video_frame_dir, exist_ok=True)
                print('3DMM Extraction for the reference video providing pose')
                ref_video_coeff_path, _, _ = self.preprocess_model.generate(
                    ref_video, ref_video_frame_dir, preprocess, source_image_flag=False)

            # ref_info was validated up front, so membership tests are enough
            # to pick which coefficients come from the reference video.
            ref_pose_coeff_path = None
            ref_eyeblink_coeff_path = None
            if use_ref_video:
                if ref_info in ('pose', 'pose+blink'):
                    ref_pose_coeff_path = ref_video_coeff_path
                if ref_info in ('blink', 'pose+blink'):
                    ref_eyeblink_coeff_path = ref_video_coeff_path

            if uses_ref_audio:
                coeff_path = ref_video_coeff_path
            else:
                batch = get_data(first_coeff_path, audio_path, self.device,
                                 ref_eyeblink_coeff_path=ref_eyeblink_coeff_path,
                                 still=still_mode, idlemode=use_idle_mode,
                                 length_of_audio=length_of_audio, use_blink=use_blink)
                coeff_path = self.audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)

            data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
                                       batch_size, still_mode=still_mode, preprocess=preprocess,
                                       size=size, expression_scale=exp_scale)
            return_path = self.animate_from_coeff.generate(
                data, save_dir, pic_path, crop_info,
                enhancer='gfpgan' if use_enhancer else None,
                preprocess=preprocess, img_size=size)
            video_name = data['video_name']
            print(f'The generated video is named {video_name} in {save_dir}')

            return return_path
        finally:
            # Runs on success and on failure so a failed call does not keep
            # the models attached to the instance.
            self._release_models()

    @staticmethod
    def _extract_audio(video_path, audio_path):
        """Best-effort extraction of ``video_path``'s audio into ``audio_path`` via ffmpeg."""
        import subprocess

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to ffmpeg verbatim.
        cmd = ['ffmpeg', '-y', '-hide_banner', '-loglevel', 'error',
               '-i', video_path, audio_path]
        try:
            status = subprocess.run(cmd, check=False).returncode
        except OSError as err:
            # e.g. ffmpeg is not installed; stay best-effort but report it.
            print(f'could not run ffmpeg to extract audio from {video_path}: {err}')
            return
        if status != 0:
            print(f'ffmpeg exited with status {status} while extracting audio from {video_path}')

    def _release_models(self):
        """Drop the per-call models and release cached memory."""
        for attr in self._MODEL_ATTRS:
            # A failed call may not have created every model yet.
            if hasattr(self, attr):
                delattr(self, attr)

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

        import gc
        gc.collect()
|
|
|
| |