# MuseTalk / app.py — Hugging Face Space entry point.
# (Header repaired: the original lines were web-page residue from the
# hf.co file viewer — author "trymonolith", commit 64e548c "Update app.py".)
#!/usr/bin/env python3
"""
MuseTalk - Audio-Driven Video Generation Space
Self-hosted Gradio interface for MuseTalk
"""
import gradio as gr
import os
import tempfile
from pathlib import Path
from inference import MuseTalkInference
# Lazily-created singleton inference engine (built on first request, then reused).
inference_engine = None


def initialize_engine():
    """Return the shared MuseTalkInference instance, creating it on first call.

    Returns:
        The module-level ``MuseTalkInference`` singleton.
    """
    global inference_engine
    # Fast path: engine already constructed by an earlier request.
    if inference_engine is not None:
        return inference_engine
    inference_engine = MuseTalkInference()
    return inference_engine
# Validation functions
def validate_audio(audio_path, max_size_mb=100):
    """Validate an uploaded audio file.

    Args:
        audio_path: Filesystem path to the audio file (may be None or empty).
        max_size_mb: Maximum allowed file size in megabytes (default 100,
            matching the original hard-coded limit).

    Returns:
        Tuple ``(is_valid, message)`` where ``is_valid`` is a bool and
        ``message`` explains the result.
    """
    if not audio_path:
        return False, "Please upload an audio file"
    if not os.path.exists(audio_path):
        return False, "Audio file not found"
    # Reject oversized uploads to protect the host from disk/memory pressure.
    file_size = os.path.getsize(audio_path) / (1024 * 1024)
    if file_size > max_size_mb:
        return False, f"Audio file too large ({file_size:.1f}MB, max {max_size_mb:.0f}MB)"
    return True, "Audio file valid"
def validate_video(video_path, max_size_mb=500):
    """Validate an uploaded video or image file.

    Args:
        video_path: Filesystem path to the video/image file (may be None or empty).
        max_size_mb: Maximum allowed file size in megabytes (default 500,
            matching the original hard-coded limit).

    Returns:
        Tuple ``(is_valid, message)`` where ``is_valid`` is a bool and
        ``message`` explains the result.
    """
    if not video_path:
        return False, "Please upload a video or image file"
    if not os.path.exists(video_path):
        return False, "Video/image file not found"
    # Reject oversized uploads to protect the host from disk/memory pressure.
    file_size = os.path.getsize(video_path) / (1024 * 1024)
    if file_size > max_size_mb:
        return False, f"Video/image file too large ({file_size:.1f}MB, max {max_size_mb:.0f}MB)"
    return True, "Video/image file valid"
def generate_lipsync_video(audio_file, video_file, fps, quality):
    """Generate a lip-synced video using the MuseTalk inference engine.

    Args:
        audio_file: Path to the driving audio file.
        video_file: Path to the reference video or image.
        fps: Target output frame rate (numeric; coerced to int before inference).
        quality: Quality label ("Low"/"Medium"/"High"). NOTE(review): only
            echoed in the status message — it is never passed to the engine;
            confirm whether the engine should consume it.

    Returns:
        Tuple ``(output_video_path or None, status_message)`` suitable for
        the Gradio ``(Video, Textbox)`` outputs.
    """
    try:
        # Validate both inputs before doing any heavy work.
        audio_valid, audio_msg = validate_audio(audio_file)
        if not audio_valid:
            return None, f"Audio validation failed: {audio_msg}"
        video_valid, video_msg = validate_video(video_file)
        if not video_valid:
            return None, f"Video validation failed: {video_msg}"
        # Lazily initialize the shared inference engine.
        engine = initialize_engine()
        # Unique output path per request: the original fixed name
        # ("musetalk_output.mp4" in the shared temp dir) let concurrent
        # requests overwrite each other's results.
        fd, output_path = tempfile.mkstemp(suffix=".mp4", prefix="musetalk_")
        os.close(fd)  # only the path is needed; the engine writes the file

        def progress_callback(progress, status):
            # Surface progress in the Space logs (no UI hook is wired here).
            print(f"[{progress}%] {status}")

        # Run inference.
        result_path = engine.generate(
            audio_path=audio_file,
            video_path=video_file,
            output_path=output_path,
            fps=int(fps),
            progress_callback=progress_callback,
        )
        return result_path, f"Successfully generated lip-synced video (Quality: {quality})"
    except Exception as e:
        # Report the failure to the UI status box instead of crashing the request.
        error_msg = f"Error during generation: {str(e)}"
        print(error_msg)
        return None, error_msg
# Create Gradio interface. Component creation order inside each context
# manager determines on-screen layout, so statement order matters here.
with gr.Blocks(title="MuseTalk - Audio-Driven Video Generation") as demo:
    gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
    gr.Markdown("Generate realistic lip-synced videos from audio")
    # Main title and description
    gr.Markdown(
        """
        ## MuseTalk - AI Audio-Driven Video Generation
        MuseTalk generates realistic lip-synced videos from audio input.
        This is a self-hosted Space running on Hugging Face.
        """
    )
    with gr.Row():
        gr.Markdown(
            """
            ### Features
            - Audio-driven video generation
            - Realistic lip-sync
            - Customizable video parameters
            """
        )
    gr.Markdown("### Input Files")
    # Two-column input area: audio on the left, reference video/image on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Audio")
            # type="filepath" hands the handler a path string (what the
            # validators and engine expect), not raw sample data.
            audio_input = gr.Audio(
                label="Upload Audio",
                type="filepath",
                format="wav"
            )
        with gr.Column():
            gr.Markdown("#### Video/Image")
            # gr.File (not gr.Video) so a still image is also accepted.
            video_input = gr.File(
                label="Upload Video or Image",
                file_count="single",
                file_types=["video", "image"]
            )
    gr.Markdown("### Parameters")
    with gr.Row():
        fps_slider = gr.Slider(
            minimum=20,
            maximum=60,
            value=25,
            step=1,
            label="FPS (Frames Per Second)"
        )
        # NOTE(review): quality is only echoed in the status message by the
        # handler; it is not forwarded to the inference engine — confirm intent.
        quality_radio = gr.Radio(
            choices=["Low", "Medium", "High"],
            value="Medium",
            label="Quality"
        )
    gr.Markdown("### Generation")
    generate_button = gr.Button("Generate Lip-Synced Video", variant="primary")
    # Outputs: the rendered MP4 plus a free-text status/error line.
    output_video = gr.Video(
        label="Generated Video",
        format="mp4"
    )
    status_text = gr.Textbox(
        label="Status",
        interactive=False,
        lines=3
    )
    # Connect generate button to inference function
    generate_button.click(
        fn=generate_lipsync_video,
        inputs=[audio_input, video_input, fps_slider, quality_radio],
        outputs=[output_video, status_text]
    )
    # Accordion sections (collapsed by default) with static help text.
    with gr.Accordion("About MuseTalk", open=False):
        gr.Markdown(
            """
            ### About MuseTalk
            MuseTalk is an AI model for audio-driven video generation that produces
            realistic lip-synced videos. The model operates in latent space using
            efficient single-step inpainting, enabling fast inference.
            **Key Features:**
            - Audio-driven lip-sync generation
            - Supports multiple languages (Chinese, English, Japanese, etc.)
            - Efficient inference on consumer hardware
            - High-quality 30fps+ output
            **Model Architecture:**
            - Uses whisper-tiny for audio feature extraction
            - DWPose for face detection and alignment
            - Latent space inpainting (not diffusion-based)
            - Supports 256x256 face region size
            """
        )
    with gr.Accordion("Documentation & Setup", open=False):
        gr.Markdown(
            """
            ### How to Use
            1. **Upload Audio**: Select an audio file (WAV, MP3, M4A, OGG) up to 10 minutes
            2. **Upload Video/Image**: Select a reference video or image with a face
            3. **Adjust Parameters**:
            - FPS: Output video frame rate (20-60)
            - Quality: Output quality level (Low/Medium/High)
            4. **Generate**: Click "Generate Lip-Synced Video"
            5. **Download**: Your generated video will appear below
            ### Supported Formats
            **Audio**: WAV, MP3, M4A, OGG (up to 10 minutes)
            **Video**: MP4, AVI, MOV, MKV (H264/H265 codec)
            **Image**: PNG, JPG, JPEG, BMP (with clear face visible)
            ### Technical Details
            - **Device**: CPU-based inference with PyTorch
            - **Memory**: Optimized for 4GB+ VRAM devices
            - **Speed**: ~1-5 minutes depending on video length and quality
            - **Output**: MP4 format with H264 codec
            """
        )
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 — the Hugging Face Spaces default;
    # share=False since the Space itself provides the public URL.
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)