# MuseTalk / app.py — Hugging Face Space entry point.
# (Header repaired: the original lines were web-page residue from the
# hf.co file viewer — author "trymonolith", commit 64e548c "Update app.py".)
#!/usr/bin/env python3
"""
MuseTalk - Audio-Driven Video Generation Space
Self-hosted Gradio interface for MuseTalk
"""
import gradio as gr
import os
import tempfile
from pathlib import Path
from inference import MuseTalkInference
# Lazily-created singleton inference engine (built on first request, then reused).
inference_engine = None


def initialize_engine():
    """Return the shared MuseTalkInference instance, creating it on first call.

    Returns:
        The module-level ``MuseTalkInference`` singleton.
    """
    global inference_engine
    # Fast path: engine already constructed by an earlier request.
    if inference_engine is not None:
        return inference_engine
    inference_engine = MuseTalkInference()
    return inference_engine
# Validation functions
def validate_audio(audio_path, max_size_mb=100):
    """Validate an uploaded audio file.

    Args:
        audio_path: Filesystem path to the audio file (may be None or empty).
        max_size_mb: Maximum allowed file size in megabytes (default 100,
            matching the original hard-coded limit).

    Returns:
        Tuple ``(is_valid, message)`` where ``is_valid`` is a bool and
        ``message`` explains the result.
    """
    if not audio_path:
        return False, "Please upload an audio file"
    if not os.path.exists(audio_path):
        return False, "Audio file not found"
    # Reject oversized uploads to protect the host from disk/memory pressure.
    file_size = os.path.getsize(audio_path) / (1024 * 1024)
    if file_size > max_size_mb:
        return False, f"Audio file too large ({file_size:.1f}MB, max {max_size_mb:.0f}MB)"
    return True, "Audio file valid"
def validate_video(video_path, max_size_mb=500):
    """Validate an uploaded video or image file.

    Args:
        video_path: Filesystem path to the video/image file (may be None or empty).
        max_size_mb: Maximum allowed file size in megabytes (default 500,
            matching the original hard-coded limit).

    Returns:
        Tuple ``(is_valid, message)`` where ``is_valid`` is a bool and
        ``message`` explains the result.
    """
    if not video_path:
        return False, "Please upload a video or image file"
    if not os.path.exists(video_path):
        return False, "Video/image file not found"
    # Reject oversized uploads to protect the host from disk/memory pressure.
    file_size = os.path.getsize(video_path) / (1024 * 1024)
    if file_size > max_size_mb:
        return False, f"Video/image file too large ({file_size:.1f}MB, max {max_size_mb:.0f}MB)"
    return True, "Video/image file valid"
def generate_lipsync_video(audio_file, video_file, fps, quality):
    """Generate a lip-synced video using the MuseTalk inference engine.

    Args:
        audio_file: Path to the driving audio file.
        video_file: Path to the reference video or image.
        fps: Target output frame rate (numeric; coerced to int before inference).
        quality: Quality label ("Low"/"Medium"/"High"). NOTE(review): only
            echoed in the status message — it is never passed to the engine;
            confirm whether the engine should consume it.

    Returns:
        Tuple ``(output_video_path or None, status_message)`` suitable for
        the Gradio ``(Video, Textbox)`` outputs.
    """
    try:
        # Validate both inputs before doing any heavy work.
        audio_valid, audio_msg = validate_audio(audio_file)
        if not audio_valid:
            return None, f"Audio validation failed: {audio_msg}"
        video_valid, video_msg = validate_video(video_file)
        if not video_valid:
            return None, f"Video validation failed: {video_msg}"
        # Lazily initialize the shared inference engine.
        engine = initialize_engine()
        # Unique output path per request: the original fixed name
        # ("musetalk_output.mp4" in the shared temp dir) let concurrent
        # requests overwrite each other's results.
        fd, output_path = tempfile.mkstemp(suffix=".mp4", prefix="musetalk_")
        os.close(fd)  # only the path is needed; the engine writes the file

        def progress_callback(progress, status):
            # Surface progress in the Space logs (no UI hook is wired here).
            print(f"[{progress}%] {status}")

        # Run inference.
        result_path = engine.generate(
            audio_path=audio_file,
            video_path=video_file,
            output_path=output_path,
            fps=int(fps),
            progress_callback=progress_callback,
        )
        return result_path, f"Successfully generated lip-synced video (Quality: {quality})"
    except Exception as e:
        # Report the failure to the UI status box instead of crashing the request.
        error_msg = f"Error during generation: {str(e)}"
        print(error_msg)
        return None, error_msg
# Create Gradio interface. Component creation order inside each context
# manager determines on-screen layout, so statement order matters here.
with gr.Blocks(title="MuseTalk - Audio-Driven Video Generation") as demo:
    gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
    gr.Markdown("Generate realistic lip-synced videos from audio")
    # Main title and description
    gr.Markdown(
        """
        ## MuseTalk - AI Audio-Driven Video Generation
        MuseTalk generates realistic lip-synced videos from audio input.
        This is a self-hosted Space running on Hugging Face.
        """
    )
    with gr.Row():
        gr.Markdown(
            """
            ### Features
            - Audio-driven video generation
            - Realistic lip-sync
            - Customizable video parameters
            """
        )
    gr.Markdown("### Input Files")
    # Two-column input area: audio on the left, reference video/image on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Audio")
            # type="filepath" hands the handler a path string (what the
            # validators and engine expect), not raw sample data.
            audio_input = gr.Audio(
                label="Upload Audio",
                type="filepath",
                format="wav"
            )
        with gr.Column():
            gr.Markdown("#### Video/Image")
            # gr.File (not gr.Video) so a still image is also accepted.
            video_input = gr.File(
                label="Upload Video or Image",
                file_count="single",
                file_types=["video", "image"]
            )
    gr.Markdown("### Parameters")
    with gr.Row():
        fps_slider = gr.Slider(
            minimum=20,
            maximum=60,
            value=25,
            step=1,
            label="FPS (Frames Per Second)"
        )
        # NOTE(review): quality is only echoed in the status message by the
        # handler; it is not forwarded to the inference engine — confirm intent.
        quality_radio = gr.Radio(
            choices=["Low", "Medium", "High"],
            value="Medium",
            label="Quality"
        )
    gr.Markdown("### Generation")
    generate_button = gr.Button("Generate Lip-Synced Video", variant="primary")
    # Outputs: the rendered MP4 plus a free-text status/error line.
    output_video = gr.Video(
        label="Generated Video",
        format="mp4"
    )
    status_text = gr.Textbox(
        label="Status",
        interactive=False,
        lines=3
    )
    # Connect generate button to inference function
    generate_button.click(
        fn=generate_lipsync_video,
        inputs=[audio_input, video_input, fps_slider, quality_radio],
        outputs=[output_video, status_text]
    )
    # Accordion sections (collapsed by default) with static help text.
    with gr.Accordion("About MuseTalk", open=False):
        gr.Markdown(
            """
            ### About MuseTalk
            MuseTalk is an AI model for audio-driven video generation that produces
            realistic lip-synced videos. The model operates in latent space using
            efficient single-step inpainting, enabling fast inference.
            **Key Features:**
            - Audio-driven lip-sync generation
            - Supports multiple languages (Chinese, English, Japanese, etc.)
            - Efficient inference on consumer hardware
            - High-quality 30fps+ output
            **Model Architecture:**
            - Uses whisper-tiny for audio feature extraction
            - DWPose for face detection and alignment
            - Latent space inpainting (not diffusion-based)
            - Supports 256x256 face region size
            """
        )
    with gr.Accordion("Documentation & Setup", open=False):
        gr.Markdown(
            """
            ### How to Use
            1. **Upload Audio**: Select an audio file (WAV, MP3, M4A, OGG) up to 10 minutes
            2. **Upload Video/Image**: Select a reference video or image with a face
            3. **Adjust Parameters**:
            - FPS: Output video frame rate (20-60)
            - Quality: Output quality level (Low/Medium/High)
            4. **Generate**: Click "Generate Lip-Synced Video"
            5. **Download**: Your generated video will appear below
            ### Supported Formats
            **Audio**: WAV, MP3, M4A, OGG (up to 10 minutes)
            **Video**: MP4, AVI, MOV, MKV (H264/H265 codec)
            **Image**: PNG, JPG, JPEG, BMP (with clear face visible)
            ### Technical Details
            - **Device**: CPU-based inference with PyTorch
            - **Memory**: Optimized for 4GB+ VRAM devices
            - **Speed**: ~1-5 minutes depending on video length and quality
            - **Output**: MP4 format with H264 codec
            """
        )
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 — the Hugging Face Spaces default;
    # share=False since the Space itself provides the public URL.
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)