| import os
|
| import subprocess
|
| import gradio as gr
|
| from retinaface import RetinaFace
|
| from PIL import Image
|
| import filetype
|
| from datetime import datetime
|
| import re
|
| import sys
|
| import torch
|
| import argparse
|
|
|
| import platform, os
|
|
|
def open_folder():
    """Open the local ``outputs`` directory in the OS file browser.

    Supports Windows, macOS and Linux; silently does nothing on other
    platforms. Best-effort: launcher failures are not raised to the UI.
    """
    open_folder_path = os.path.abspath("outputs")
    system = platform.system()
    if system == "Windows":
        os.startfile(open_folder_path)
    elif system == "Darwin":
        # macOS was previously unsupported; `open` is the standard launcher.
        subprocess.call(["open", open_folder_path])
    elif system == "Linux":
        # Argument list instead of os.system(f'...') — avoids shell quoting /
        # injection problems when the path contains spaces or metacharacters.
        subprocess.call(["xdg-open", open_folder_path])
|
|
|
|
|
|
|
# Absolute path of the interpreter running this script; reused to spawn the
# helper scripts (inference.py, scripts/extract_kps_sequence_and_audio.py)
# inside the same Python environment.
python_executable = sys.executable
|
|
|
def display_media(file):
    """Route an uploaded file to the matching preview widget.

    Returns a pair of ``gr.update`` objects for (video_output, audio_output):
    the player matching the file's MIME type becomes visible with the file
    loaded; the other — or both, when the type is missing or unsupported —
    is hidden.
    """
    if file is None:
        return gr.update(visible=False), gr.update(visible=False)

    kind = filetype.guess(file.name)
    if kind is None:
        return gr.update(visible=False), gr.update(visible=False)

    mime = kind.mime
    if mime.startswith('video'):
        return gr.update(value=file.name, visible=True), gr.update(visible=False)
    if mime.startswith('audio'):
        return gr.update(visible=False), gr.update(value=file.name, visible=True)
    return gr.update(visible=False), gr.update(visible=False)
|
|
|
|
|
def _str_to_bool(value):
    """Parse a CLI string into a real boolean ("true"/"1"/"yes" → True)."""
    return str(value).strip().lower() in ("true", "1", "yes", "y")


parser = argparse.ArgumentParser()
# Bug fix: the original used type=str, so any non-empty value — including
# the literal string "False" — was truthy when passed to demo.launch(share=...).
parser.add_argument(
    "--share",
    type=_str_to_bool,
    default=False,
    help="Set to True to share the app publicly.",
)
args = parser.parse_args()
|
|
|
|
|
|
|
def extract_audio(video_path, audio_path):
    """Extract the audio track of *video_path* into an MP3 at *audio_path*.

    Bug fix: the command previously ran ``python -m ffmpeg``, but ffmpeg is a
    standalone executable, not a Python module, so extraction always failed.
    Invoke the ffmpeg binary directly, matching convert_audio_to_mp3 below.
    """
    command = [
        "ffmpeg", "-i", video_path,
        "-vn",                    # drop the video stream
        "-acodec", "libmp3lame",  # encode audio as MP3
        "-q:a", "2",              # high-quality VBR setting
        audio_path,
    ]
    subprocess.call(command)
|
|
|
|
|
def convert_audio_to_mp3(audio_path, mp3_path):
    """Re-encode *audio_path* as MP3 (libmp3lame, VBR quality 2) at *mp3_path*."""
    subprocess.call([
        "ffmpeg",
        "-i", audio_path,
        "-acodec", "libmp3lame",
        "-q:a", "2",
        mp3_path,
    ])
|
|
|
def crop_and_save_image(image_path, auto_crop, crop_width, crop_height, crop_expansion):
    """Face-crop *image_path* and save the result under outputs/cropped_images.

    Returns the path of the saved PNG, or None when no face was detected.
    NOTE(review): the *auto_crop* flag is accepted (the UI wires it in) but
    never consulted here — cropping always runs.
    """
    cropped = auto_crop_image(image_path, crop_expansion, crop_size=(crop_width, crop_height))
    if cropped is None:
        return None

    dest_dir = os.path.join("outputs", "cropped_images")
    os.makedirs(dest_dir, exist_ok=True)

    stem, ext = os.path.splitext(os.path.basename(image_path))

    # Pick the first free numbered name: <stem>_0001<ext>, <stem>_0002<ext>, ...
    counter = 0
    while True:
        counter += 1
        candidate = os.path.join(dest_dir, f"{stem}_{counter:04d}{ext}")
        if not os.path.exists(candidate):
            break

    cropped.save(candidate, format='PNG')
    return candidate
|
|
|
|
|
def generate_kps_sequence_and_audio(video_path, kps_sequence_save_path, audio_save_path):
    """Run the project's extraction script on *video_path*.

    Produces a keypoint-sequence file at *kps_sequence_save_path* and the
    extracted audio at *audio_save_path* via a subprocess.
    """
    subprocess.call([
        python_executable, "scripts/extract_kps_sequence_and_audio.py",
        "--video_path", video_path,
        "--kps_sequence_save_path", kps_sequence_save_path,
        "--audio_save_path", audio_save_path,
    ])
|
|
|
def auto_crop_image(image_path, expand_percent, crop_size=(512, 512)):
    """Detect the first face in *image_path*, crop around it, and resize.

    The crop box is estimated from the eye landmarks, expanded by
    *expand_percent*, center-cropped to the aspect ratio of *crop_size*,
    then resized to exactly *crop_size*.

    Side effects: writes three debug images ("assumed_head.png",
    "expanded_face.png", "final_cropped_img.png") to the current working
    directory and OVERWRITES *image_path* in place with the final PNG.

    Returns the resized PIL image, or None when no face is detected.
    """
    # NOTE(review): `device` is computed and logged but never passed to
    # RetinaFace below — detection backend selection happens elsewhere, if at all.
    if torch.cuda.is_available():
        device = 'cuda'
        print("Using GPU for RetinaFace detection.")
    else:
        device = 'cpu'
        print("Using CPU for RetinaFace detection.")

    img = Image.open(image_path)

    faces = RetinaFace.detect_faces(image_path)

    if not faces:
        print("No faces detected.")
        return None

    # Only the first detected face is used.
    face = list(faces.values())[0]
    landmarks = face['landmarks']

    right_eye = landmarks['right_eye']
    left_eye = landmarks['left_eye']
    # NOTE(review): the mouth landmarks are extracted but never used.
    right_mouth = landmarks['mouth_right']
    left_mouth = landmarks['mouth_left']

    # Horizontal inter-eye distance drives the assumed head size below.
    eye_distance = abs(right_eye[0] - left_eye[0])

    # Heuristic head box: 4.5x eye distance wide, 6.5x tall.
    head_width = eye_distance * 4.5
    head_height = eye_distance * 6.5

    eye_center_x = (right_eye[0] + left_eye[0]) // 2
    eye_center_y = (right_eye[1] + left_eye[1]) // 2

    # Head box centered on the eyes, clamped to the image bounds.
    head_left = max(0, int(eye_center_x - head_width // 2))
    head_top = max(0, int(eye_center_y - head_height // 2))
    head_right = min(img.width, int(eye_center_x + head_width // 2))
    head_bottom = min(img.height, int(eye_center_y + head_height // 2))

    # Debug output: the raw estimated head region.
    assumed_head_img = img.crop((head_left, head_top, head_right, head_bottom))
    assumed_head_img.save("assumed_head.png", format='PNG')

    # Grow the head box by expand_percent in each dimension.
    expanded_w = int(head_width * (1 + expand_percent))
    expanded_h = int(head_height * (1 + expand_percent))

    center_x, center_y = head_left + head_width // 2, head_top + head_height // 2
    left = max(0, center_x - expanded_w // 2)
    right = min(img.width, center_x + expanded_w // 2)
    top = max(0, center_y - expanded_h // 2)
    bottom = min(img.height, center_y + expanded_h // 2)

    # Debug output: the expanded crop region.
    cropped_img = img.crop((left, top, right, bottom))
    cropped_img.save("expanded_face.png", format='PNG')

    cropped_width, cropped_height = cropped_img.size
    aspect_ratio = cropped_width / cropped_height

    target_width = crop_size[0]
    target_height = crop_size[1]

    # Center-crop to the target aspect ratio: trim width if too wide,
    # otherwise trim height.
    if aspect_ratio > target_width / target_height:
        new_width = int(cropped_height * target_width / target_height)
        left_crop = (cropped_width - new_width) // 2
        right_crop = left_crop + new_width
        top_crop = 0
        bottom_crop = cropped_height
    else:
        new_height = int(cropped_width * target_height / target_width)
        top_crop = (cropped_height - new_height) // 2
        bottom_crop = top_crop + new_height
        left_crop = 0
        right_crop = cropped_width

    # Debug output: the aspect-corrected crop before resizing.
    final_cropped_img = cropped_img.crop((left_crop, top_crop, right_crop, bottom_crop))
    final_cropped_img.save("final_cropped_img.png", format='PNG')

    # High-quality downscale/upscale to the exact requested size.
    resized_img = final_cropped_img.resize(crop_size, resample=Image.LANCZOS)

    # Overwrites the caller's input file in place.
    resized_img.save(image_path, format='PNG')
    return resized_img
|
|
|
|
|
def generate_output_video(reference_image_path, audio_path, kps_path, output_path, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop, crop_width, crop_height, crop_expansion, image_width, image_height, low_vram):
    """Optionally face-crop the reference image, then run inference.py.

    Builds the inference command line, records it to executed_command.txt
    for reproducibility, and blocks until the subprocess finishes.
    Returns (output_path, reference_image_path) for the UI widgets.
    """
    print("auto cropping...")
    if auto_crop:
        auto_crop_image(reference_image_path, crop_expansion, crop_size=(crop_width, crop_height))

    print("starting inference...")
    # Flag/value pairs in the exact order inference.py receives them.
    options = {
        "--reference_image_path": reference_image_path,
        "--audio_path": audio_path,
        "--kps_path": kps_path,
        "--output_path": output_path,
        "--retarget_strategy": retarget_strategy,
        "--num_inference_steps": str(num_inference_steps),
        "--reference_attention_weight": str(reference_attention_weight),
        "--audio_attention_weight": str(audio_attention_weight),
        "--image_width": str(image_width),
        "--image_height": str(image_height),
    }
    command = [python_executable, "inference.py"]
    for flag, value in options.items():
        command += [flag, value]

    if low_vram:
        command.append("--save_gpu_memory")

    # Keep a record of the exact command for debugging / reproduction.
    with open("executed_command.txt", "w") as log_file:
        log_file.write(" ".join(command))

    subprocess.call(command)
    return output_path, reference_image_path
|
|
|
def sanitize_folder_name(name):
    """Return *name* with characters illegal in Windows paths replaced by '_'.

    Covers ``< > : " / \\ | ? *`` and ASCII control characters (0x00-0x1F),
    so the result is safe to use as a file or folder name on any platform.
    """
    return re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', name)
|
|
|
|
|
def process_input(reference_image, target_input, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop, crop_width, crop_height, crop_expansion, image_width, image_height, low_vram):
    """End-to-end pipeline behind the "Generate Talking Video" button.

    Prepares a per-run working directory, extracts (or converts) the driving
    audio and keypoint sequence from *target_input*, chooses a unique output
    filename under outputs/, and delegates to generate_output_video.

    Returns (output_video_path, cropped_reference_image_path).
    Raises ValueError when the target file's type cannot be determined or
    is neither video nor audio.
    """
    base_temp = "temp_process"
    os.makedirs(base_temp, exist_ok=True)

    stem = sanitize_folder_name(os.path.splitext(os.path.basename(reference_image))[0])
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    work_dir = os.path.join(base_temp, f"{stem}_{stamp}")
    os.makedirs(work_dir, exist_ok=True)

    kind = filetype.guess(target_input)
    if not kind:
        raise ValueError("Cannot determine file type. Please provide a valid video or audio file.")

    mime_type = kind.mime
    if mime_type.startswith("video/"):
        # Driving video: pull out both the audio track and the kps sequence.
        audio_path = os.path.join(work_dir, "target_audio.mp3")
        kps_path = os.path.join(work_dir, "kps.pth")
        print("generating generate_kps_sequence_and_audio...")
        generate_kps_sequence_and_audio(target_input, kps_path, audio_path)
    elif mime_type.startswith("audio/"):
        # Audio-only input: no keypoint sequence; convert to MP3 if needed.
        audio_path = target_input
        if mime_type != "audio/mpeg":
            mp3_path = os.path.join(work_dir, "target_audio_converted.mp3")
            convert_audio_to_mp3(target_input, mp3_path)
            audio_path = mp3_path
        kps_path = ""
    else:
        raise ValueError("Unsupported file type. Please provide a video or audio file.")

    output_dir = "outputs"
    os.makedirs(output_dir, exist_ok=True)
    prefix = sanitize_folder_name(f"{stem}_result_")

    # First unused serial number gives a collision-free output name.
    serial = 1
    while os.path.exists(os.path.join(output_dir, f"{prefix}{serial:04d}.mp4")):
        serial += 1
    output_path = os.path.join(output_dir, f"{prefix}{serial:04d}.mp4")

    return generate_output_video(reference_image, audio_path, kps_path, output_path, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop, crop_width, crop_height, crop_expansion, image_width, image_height, low_vram)
|
|
|
def launch_interface():
    """Build and launch the Gradio UI for V-Express image-to-animation.

    Layout: left column holds the reference image plus generation/crop
    controls; middle column the target video/audio upload with an inline
    preview and usage notes; right column the generated video and the
    cropped reference image. Blocks until the Gradio server exits.
    """
    # Values accepted by inference.py's --retarget_strategy flag.
    retarget_strategies = ["fix_face", "no_retarget", "offset_retarget", "naive_retarget"]

    with gr.Blocks() as demo:
        gr.Markdown("# Tencent AI Lab - V-Express Image to Animation V4 : https://www.patreon.com/posts/105251204")
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Reference Image", format="png", type="filepath", height=512)
                generate_button = gr.Button("Generate Talking Video")
                # Hidden toggle: still wired into process_input, just not shown.
                low_vram = gr.Checkbox(label="Low VRAM - Greatly reduces VRAM usage but takes longer", value=False, visible=False)
                crop_button = gr.Button("Crop Image")
                with gr.Row():
                    with gr.Column(min_width=0):
                        image_width = gr.Number(label="Target Video Width", value=512)
                    with gr.Column(min_width=0):
                        image_height = gr.Number(label="Target Video Height", value=512)

                with gr.Row():
                    with gr.Column(min_width=0):
                        retarget_strategy = gr.Dropdown(retarget_strategies, label="Retarget Strategy", value="fix_face")
                    with gr.Column(min_width=0):
                        inference_steps = gr.Slider(10, 90, step=1, label="Number of Inference Steps", value=30)

                with gr.Row():
                    with gr.Column(min_width=0):
                        reference_attention = gr.Slider(0.80, 1.1, step=0.01, label="Reference Attention Weight", value=0.95)
                    with gr.Column(min_width=0):
                        audio_attention = gr.Slider(1.0, 5.0, step=0.1, label="Audio Attention Weight", value=3.0)

                with gr.Row(visible=True) as crop_size_row:
                    with gr.Column(min_width=0):
                        auto_crop = gr.Checkbox(label="Auto Crop Image", value=True)
                    with gr.Column(min_width=0):
                        crop_expansion = gr.Slider(0.0, 1.0, step=0.01, label="Face Focus Expansion Percent", value=0.15)
                with gr.Row():
                    with gr.Column(min_width=0):
                        crop_width = gr.Number(label="Crop Width", value=512)
                    with gr.Column(min_width=0):
                        crop_height = gr.Number(label="Crop Height", value=512)

            with gr.Column():
                # NOTE(review): label says "Image or Video" but the accepted
                # extensions are video/audio only; ".m4v" is listed twice.
                input_video = gr.File(
                    label="Target Input (Image or Video)",
                    type="filepath",
                    file_count="single",
                    file_types=[
                        ".mp4", ".avi", ".mov", ".wmv", ".flv", ".mkv", ".webm",
                        ".3gp", ".m4v", ".mpg", ".mpeg", ".m2v", ".m4v", ".mts",
                        ".mp3", ".wav", ".aac", ".flac", ".m4a", ".wma", ".ogg"
                    ],
                    height=512 )
                # Hidden previews toggled by display_media on upload.
                video_output = gr.Video(visible=False)
                audio_output = gr.Audio(visible=False)

                input_video.change(display_media, inputs=input_video, outputs=[video_output, audio_output])
                btn_open_outputs = gr.Button("Open Outputs Folder")
                btn_open_outputs.click(fn=open_folder)
                gr.Markdown("""

Retarget Strategies

Only target audio : fix_face

Input picture and target video (same person - best practice) select : no_retarget

Input picture and target video (different person) select : offset_retarget or naive_retarget

Please look examples in Tests folder to see which settings you like most. I feel like offset_retarget is best

You can turn up reference_attention_weight to make the model maintain higher character consistency, and turn down audio_attention_weight to reduce mouth artifacts. E.g. setting both values to 1.0
""")

            with gr.Column():
                output_video = gr.Video(label="Generated Video", height=512)
                output_image = gr.Image(label="Cropped Image")

        # Full generation pipeline: inputs must match process_input's signature order.
        generate_button.click(
            fn=process_input,
            inputs=[
                input_image,
                input_video,
                retarget_strategy,
                inference_steps,
                reference_attention,
                audio_attention,
                auto_crop,
                crop_width,
                crop_height,
                crop_expansion,
                image_width,
                image_height,
                low_vram
            ],
            outputs=[output_video, output_image]
        )

        # Standalone crop preview without running inference.
        crop_button.click(
            fn=crop_and_save_image,
            inputs=[
                input_image,
                auto_crop,
                crop_width,
                crop_height,
                crop_expansion
            ],
            outputs=output_image
        )

    demo.queue()
    demo.launch(inbrowser=True, share=args.share)
|
|
|
|
|
# Entry point: build the UI and start the Gradio server (blocks until shutdown).
launch_interface()