Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,75 +5,224 @@ Self-hosted Gradio interface for MuseTalk
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
-
# MuseTalk - AI Audio-Driven Video Generation
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
#
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
def
|
| 23 |
-
"""
|
| 24 |
-
if
|
| 25 |
-
return "Please upload
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# Create Gradio interface
|
| 31 |
-
with gr.Blocks(title="MuseTalk") as demo:
|
| 32 |
gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
|
| 33 |
gr.Markdown("Generate realistic lip-synced videos from audio")
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
with gr.Row():
|
| 36 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
gr.Markdown("## Input Files")
|
| 39 |
|
| 40 |
with gr.Row():
|
| 41 |
with gr.Column():
|
| 42 |
-
gr.Markdown("### Audio")
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
with gr.Column():
|
| 46 |
-
gr.Markdown("### Video/Image")
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
gr.Markdown("##
|
| 50 |
|
| 51 |
with gr.Row():
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
| 58 |
-
output_video = gr.Video(label="Result")
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
outputs=[status_box, output_video]
|
| 64 |
)
|
| 65 |
|
| 66 |
-
gr.
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
if __name__ == "__main__":
|
| 79 |
-
demo.launch()
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
+
import os
|
| 9 |
+
import tempfile
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from inference import MuseTalkInference
|
| 12 |
|
| 13 |
+
# Initialize inference engine
|
| 14 |
+
inference_engine = None
|
|
|
|
| 15 |
|
| 16 |
+
def initialize_engine():
    """Return the shared MuseTalkInference instance, creating it on first use.

    The instance is kept in the module-level ``inference_engine`` variable so
    later calls reuse it instead of constructing a new one.
    """
    global inference_engine

    # Fast path: an engine was already built by an earlier call.
    if inference_engine is not None:
        return inference_engine

    inference_engine = MuseTalkInference()
    return inference_engine
|
| 21 |
|
| 22 |
+
# Validation functions
|
| 23 |
+
def validate_audio(audio_path, max_size_mb=100):
    """Check that an uploaded audio file exists and is within the size limit.

    Args:
        audio_path: Filesystem path of the uploaded audio. A falsy value
            (``None`` or ``""``) means nothing was uploaded.
        max_size_mb: Largest accepted size in MB (1024 * 1024 bytes).
            Defaults to 100, the limit that used to be hard-coded.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is user-facing text that
        either explains the failure or confirms the file is valid.
    """
    if not audio_path:
        return False, "Please upload an audio file"

    # isfile() instead of exists(): a directory exists but is not a usable
    # upload, so it must be rejected here rather than inside the engine.
    if not os.path.isfile(audio_path):
        return False, "Audio file not found"

    file_size = os.path.getsize(audio_path) / (1024 * 1024)
    if file_size > max_size_mb:
        return False, f"Audio file too large ({file_size:.1f}MB, max {max_size_mb}MB)"

    return True, "Audio file valid"
|
| 37 |
|
| 38 |
+
def validate_video(video_path, max_size_mb=500):
    """Check that an uploaded video/image file exists and is within the size limit.

    Args:
        video_path: Filesystem path of the uploaded video or image. A falsy
            value (``None`` or ``""``) means nothing was uploaded.
        max_size_mb: Largest accepted size in MB (1024 * 1024 bytes).
            Defaults to 500, the limit that used to be hard-coded.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is user-facing text that
        either explains the failure or confirms the file is valid.
    """
    if not video_path:
        return False, "Please upload a video or image file"

    # isfile() instead of exists(): a directory exists but is not a usable
    # upload, so it must be rejected here rather than inside the engine.
    if not os.path.isfile(video_path):
        return False, "Video/image file not found"

    file_size = os.path.getsize(video_path) / (1024 * 1024)
    if file_size > max_size_mb:
        return False, f"Video/image file too large ({file_size:.1f}MB, max {max_size_mb}MB)"

    return True, "Video/image file valid"
|
| 52 |
+
|
| 53 |
+
def generate_lipsync_video(audio_file, video_file, fps, quality):
    """Generate a lip-synced video using MuseTalk inference.

    Args:
        audio_file: Path of the driving audio file.
        video_file: Path of the reference video or image.
        fps: Output frame rate; converted with ``int()`` before being passed
            to the engine.
        quality: Quality label selected in the UI. It is only echoed in the
            success message; it is not passed to ``engine.generate``.

    Returns:
        A ``(result_path, status_message)`` tuple. ``result_path`` is ``None``
        when validation fails or generation raises; the message then
        describes the problem.
    """
    output_path = None
    try:
        # Validate inputs before loading the (lazily created) engine.
        audio_valid, audio_msg = validate_audio(audio_file)
        if not audio_valid:
            return None, f"Audio validation failed: {audio_msg}"

        video_valid, video_msg = validate_video(video_file)
        if not video_valid:
            return None, f"Video validation failed: {video_msg}"

        engine = initialize_engine()

        # Give every request its own output file. A single fixed name in the
        # shared temp directory lets overlapping requests overwrite each
        # other's result. mkstemp creates the file; only the path is needed,
        # so the descriptor is closed right away.
        # NOTE(review): successful outputs are left in the temp directory;
        # confirm whether periodic cleanup is needed on long-running hosts.
        fd, output_path = tempfile.mkstemp(prefix="musetalk_output_", suffix=".mp4")
        os.close(fd)

        def progress_callback(progress, status):
            # Progress is reported on stdout only.
            print(f"[{progress}%] {status}")

        result_path = engine.generate(
            audio_path=audio_file,
            video_path=video_file,
            output_path=output_path,
            fps=int(fps),
            progress_callback=progress_callback
        )

        return result_path, f"Successfully generated lip-synced video (Quality: {quality})"

    except Exception as e:
        # UI boundary: report the failure as a status message instead of
        # letting the exception propagate to the caller.
        if output_path is not None:
            # Best-effort removal of the placeholder/partial output file.
            try:
                os.remove(output_path)
            except OSError:
                pass
        error_msg = f"Error during generation: {str(e)}"
        print(error_msg)
        return None, error_msg
|
| 91 |
|
| 92 |
# Create Gradio interface
|
| 93 |
+
# Declarative page layout: components are rendered in the order they are
# created, so the statement order below is the on-screen order.
with gr.Blocks(title="MuseTalk - Audio-Driven Video Generation") as demo:
    # Page heading and one-line tagline.
    gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
    gr.Markdown("Generate realistic lip-synced videos from audio")

    # Main title and description
    gr.Markdown(
        """
        ## MuseTalk - AI Audio-Driven Video Generation

        MuseTalk generates realistic lip-synced videos from audio input.
        This is a self-hosted Space running on Hugging Face.
        """
    )

    # Feature summary.
    with gr.Row():
        gr.Markdown(
            """
            ### Features
            - Audio-driven video generation
            - Realistic lip-sync
            - Customizable video parameters
            """
        )

    gr.Markdown("### Input Files")

    # Two side-by-side upload columns: audio on the left, video/image on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Audio")
            # type="filepath": the handler receives a path string.
            audio_input = gr.Audio(
                label="Upload Audio",
                type="filepath",
                format="wav"
            )

        with gr.Column():
            gr.Markdown("#### Video/Image")
            # NOTE(review): depending on the Gradio version, gr.File may hand
            # the handler a file object instead of a path string, while
            # validate_video() calls os.path.exists() on it - confirm.
            video_input = gr.File(
                label="Upload Video or Image",
                file_count="single",
                file_types=["video", "image"]
            )

    gr.Markdown("### Parameters")

    with gr.Row():
        # Frame rate selector: 20-60 in steps of 1, default 25.
        fps_slider = gr.Slider(
            minimum=20,
            maximum=60,
            value=25,
            step=1,
            label="FPS (Frames Per Second)"
        )

        # Quality label passed to the handler as its `quality` argument.
        quality_radio = gr.Radio(
            choices=["Low", "Medium", "High"],
            value="Medium",
            label="Quality"
        )

    gr.Markdown("### Generation")

    generate_button = gr.Button("Generate Lip-Synced Video", variant="primary")

    output_video = gr.Video(
        label="Generated Video",
        format="mp4"
    )

    # Read-only box that shows the status/error message returned by the handler.
    status_text = gr.Textbox(
        label="Status",
        interactive=False,
        lines=3
    )

    # Connect generate button to inference function.
    # `inputs` order matches the handler's parameters (audio_file, video_file,
    # fps, quality); `outputs` order matches its (result_path, message) return.
    generate_button.click(
        fn=generate_lipsync_video,
        inputs=[audio_input, video_input, fps_slider, quality_radio],
        outputs=[output_video, status_text]
    )

    # Accordion sections (collapsed by default: open=False)
    with gr.Accordion("About MuseTalk", open=False):
        gr.Markdown(
            """
            ### About MuseTalk

            MuseTalk is an AI model for audio-driven video generation that produces
            realistic lip-synced videos. The model operates in latent space using
            efficient single-step inpainting, enabling fast inference.

            **Key Features:**
            - Audio-driven lip-sync generation
            - Supports multiple languages (Chinese, English, Japanese, etc.)
            - Efficient inference on consumer hardware
            - High-quality 30fps+ output

            **Model Architecture:**
            - Uses whisper-tiny for audio feature extraction
            - DWPose for face detection and alignment
            - Latent space inpainting (not diffusion-based)
            - Supports 256x256 face region size
            """
        )

    # NOTE(review): the limits and hardware notes in this help text (e.g.
    # "up to 10 minutes", CPU vs. VRAM) are not enforced or established by
    # the code in this file - confirm they match the deployed behavior.
    with gr.Accordion("Documentation & Setup", open=False):
        gr.Markdown(
            """
            ### How to Use

            1. **Upload Audio**: Select an audio file (WAV, MP3, M4A, OGG) up to 10 minutes
            2. **Upload Video/Image**: Select a reference video or image with a face
            3. **Adjust Parameters**:
               - FPS: Output video frame rate (20-60)
               - Quality: Output quality level (Low/Medium/High)
            4. **Generate**: Click "Generate Lip-Synced Video"
            5. **Download**: Your generated video will appear below

            ### Supported Formats

            **Audio**: WAV, MP3, M4A, OGG (up to 10 minutes)
            **Video**: MP4, AVI, MOV, MKV (H264/H265 codec)
            **Image**: PNG, JPG, JPEG, BMP (with clear face visible)

            ### Technical Details

            - **Device**: CPU-based inference with PyTorch
            - **Memory**: Optimized for 4GB+ VRAM devices
            - **Speed**: ~1-5 minutes depending on video length and quality
            - **Output**: MP4 format with H264 codec
            """
        )
|
| 226 |
|
| 227 |
if __name__ == "__main__":
    # Start the web server only when run as a script. It listens on all
    # network interfaces (0.0.0.0) on port 7860 and does not request a
    # public Gradio share link.
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)
|