#!/usr/bin/env python3
"""
MuseTalk - Audio-Driven Video Generation Space
Self-hosted Gradio interface for MuseTalk
"""
import gradio as gr
import os
import tempfile
from pathlib import Path

from inference import MuseTalkInference
# Module-level singleton: the heavyweight inference engine is built lazily
# on the first request instead of at import time.
inference_engine = None


def initialize_engine():
    """Return the shared MuseTalkInference instance, creating it if needed."""
    global inference_engine
    if inference_engine is not None:
        return inference_engine
    inference_engine = MuseTalkInference()
    return inference_engine
# Validation functions
def validate_audio(audio_path):
    """Check that an uploaded audio file exists and is within the size cap.

    Returns a ``(is_valid, message)`` tuple.
    """
    if not audio_path:
        return False, "Please upload an audio file"
    candidate = Path(audio_path)
    if not candidate.exists():
        return False, "Audio file not found"
    # Reject oversized uploads (limit: 100MB).
    size_mb = candidate.stat().st_size / (1024 * 1024)
    if size_mb > 100:
        return False, f"Audio file too large ({size_mb:.1f}MB, max 100MB)"
    return True, "Audio file valid"
def validate_video(video_path):
    """Check that an uploaded video/image exists and is within the size cap.

    Returns a ``(is_valid, message)`` tuple.
    """
    if not video_path:
        return False, "Please upload a video or image file"
    candidate = Path(video_path)
    if not candidate.exists():
        return False, "Video/image file not found"
    # Reject oversized uploads (limit: 500MB).
    size_mb = candidate.stat().st_size / (1024 * 1024)
    if size_mb > 500:
        return False, f"Video/image file too large ({size_mb:.1f}MB, max 500MB)"
    return True, "Video/image file valid"
def generate_lipsync_video(audio_file, video_file, fps, quality):
    """Generate a lip-synced video using MuseTalk inference.

    Args:
        audio_file: Path to the driving audio file (from ``gr.Audio``,
            ``type="filepath"``).
        video_file: Path to the reference video or image (from ``gr.File``).
        fps: Requested output frame rate; coerced to ``int`` before inference.
        quality: UI quality choice ("Low"/"Medium"/"High"); currently only
            echoed in the status message.

    Returns:
        ``(output_path, status_message)`` — ``output_path`` is ``None`` when
        validation or inference fails.
    """
    try:
        # Validate inputs before touching the (expensive) engine.
        audio_valid, audio_msg = validate_audio(audio_file)
        if not audio_valid:
            return None, f"Audio validation failed: {audio_msg}"
        video_valid, video_msg = validate_video(video_file)
        if not video_valid:
            return None, f"Video validation failed: {video_msg}"

        # Lazily initialize the shared inference engine.
        engine = initialize_engine()

        # BUGFIX: use a unique output path per request. The previous fixed
        # name ("musetalk_output.mp4" in the shared tempdir) let concurrent
        # requests overwrite each other's results and could serve stale files.
        fd, output_path = tempfile.mkstemp(prefix="musetalk_", suffix=".mp4")
        os.close(fd)  # only the path is needed; the engine writes the file

        def progress_callback(progress, status):
            # Progress is only logged server-side; Gradio is not updated here.
            print(f"[{progress}%] {status}")

        # NOTE(review): `quality` is accepted but never forwarded to
        # engine.generate — confirm whether MuseTalkInference supports a
        # quality parameter and wire it through if so.
        result_path = engine.generate(
            audio_path=audio_file,
            video_path=video_file,
            output_path=output_path,
            fps=int(fps),
            progress_callback=progress_callback
        )
        return result_path, f"Successfully generated lip-synced video (Quality: {quality})"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        error_msg = f"Error during generation: {str(e)}"
        print(error_msg)
        return None, error_msg
# ---------------------------------------------------------------------------
# Gradio interface.
# Layout is declarative: statement order inside each context manager
# (Blocks / Row / Column / Accordion) fixes the on-screen order, so do not
# reorder these statements.
# ---------------------------------------------------------------------------
with gr.Blocks(title="MuseTalk - Audio-Driven Video Generation") as demo:
    gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
    gr.Markdown("Generate realistic lip-synced videos from audio")
    # Main title and description
    gr.Markdown(
        """
        ## MuseTalk - AI Audio-Driven Video Generation
        MuseTalk generates realistic lip-synced videos from audio input.
        This is a self-hosted Space running on Hugging Face.
        """
    )
    with gr.Row():
        gr.Markdown(
            """
            ### Features
            - Audio-driven video generation
            - Realistic lip-sync
            - Customizable video parameters
            """
        )
    # --- Inputs: audio track + reference video/image, side by side ---------
    gr.Markdown("### Input Files")
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Audio")
            audio_input = gr.Audio(
                label="Upload Audio",
                type="filepath",  # callback receives a path string, not raw samples
                format="wav"
            )
        with gr.Column():
            gr.Markdown("#### Video/Image")
            video_input = gr.File(
                label="Upload Video or Image",
                file_count="single",
                file_types=["video", "image"]
            )
    # --- Generation parameters ---------------------------------------------
    gr.Markdown("### Parameters")
    with gr.Row():
        fps_slider = gr.Slider(
            minimum=20,
            maximum=60,
            value=25,  # default frame rate
            step=1,
            label="FPS (Frames Per Second)"
        )
        quality_radio = gr.Radio(
            choices=["Low", "Medium", "High"],
            value="Medium",
            label="Quality"
        )
    # --- Trigger and outputs -----------------------------------------------
    gr.Markdown("### Generation")
    generate_button = gr.Button("Generate Lip-Synced Video", variant="primary")
    output_video = gr.Video(
        label="Generated Video",
        format="mp4"
    )
    status_text = gr.Textbox(
        label="Status",
        interactive=False,  # read-only: written by the callback, not the user
        lines=3
    )
    # Connect generate button to inference function; outputs map 1:1 to the
    # (video_path, status_message) tuple returned by generate_lipsync_video.
    generate_button.click(
        fn=generate_lipsync_video,
        inputs=[audio_input, video_input, fps_slider, quality_radio],
        outputs=[output_video, status_text]
    )
    # Accordion sections: collapsed help/about text.
    with gr.Accordion("About MuseTalk", open=False):
        gr.Markdown(
            """
            ### About MuseTalk
            MuseTalk is an AI model for audio-driven video generation that produces
            realistic lip-synced videos. The model operates in latent space using
            efficient single-step inpainting, enabling fast inference.
            **Key Features:**
            - Audio-driven lip-sync generation
            - Supports multiple languages (Chinese, English, Japanese, etc.)
            - Efficient inference on consumer hardware
            - High-quality 30fps+ output
            **Model Architecture:**
            - Uses whisper-tiny for audio feature extraction
            - DWPose for face detection and alignment
            - Latent space inpainting (not diffusion-based)
            - Supports 256x256 face region size
            """
        )
    with gr.Accordion("Documentation & Setup", open=False):
        gr.Markdown(
            """
            ### How to Use
            1. **Upload Audio**: Select an audio file (WAV, MP3, M4A, OGG) up to 10 minutes
            2. **Upload Video/Image**: Select a reference video or image with a face
            3. **Adjust Parameters**:
               - FPS: Output video frame rate (20-60)
               - Quality: Output quality level (Low/Medium/High)
            4. **Generate**: Click "Generate Lip-Synced Video"
            5. **Download**: Your generated video will appear below
            ### Supported Formats
            **Audio**: WAV, MP3, M4A, OGG (up to 10 minutes)
            **Video**: MP4, AVI, MOV, MKV (H264/H265 codec)
            **Image**: PNG, JPG, JPEG, BMP (with clear face visible)
            ### Technical Details
            - **Device**: CPU-based inference with PyTorch
            - **Memory**: Optimized for 4GB+ VRAM devices
            - **Speed**: ~1-5 minutes depending on video length and quality
            - **Output**: MP4 format with H264 codec
            """
        )

if __name__ == "__main__":
    # Standard HF Spaces binding: all interfaces, port 7860, no public share link.
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)