#!/usr/bin/env python3
"""
MuseTalk - Audio-Driven Video Generation Space
Self-hosted Gradio interface for MuseTalk
"""
import gradio as gr
import os
import tempfile
from pathlib import Path

from inference import MuseTalkInference

# Lazily-created singleton engine; model loading is expensive, so it is
# deferred until the first generation request.
inference_engine = None


def initialize_engine():
    """Return the shared MuseTalkInference instance, creating it on first use."""
    global inference_engine
    if inference_engine is None:
        inference_engine = MuseTalkInference()
    return inference_engine


# --- Validation functions -------------------------------------------------

def validate_audio(audio_path):
    """Validate an uploaded audio file.

    Args:
        audio_path: Filesystem path to the audio file (may be None/empty).

    Returns:
        (bool, str): validity flag and a human-readable status message.
    """
    if not audio_path:
        return False, "Please upload an audio file"
    if not os.path.exists(audio_path):
        return False, "Audio file not found"
    # Check file size (max 100MB)
    file_size = os.path.getsize(audio_path) / (1024 * 1024)
    if file_size > 100:
        return False, f"Audio file too large ({file_size:.1f}MB, max 100MB)"
    return True, "Audio file valid"


def validate_video(video_path):
    """Validate an uploaded video or image file.

    Args:
        video_path: Filesystem path to the video/image file (may be None/empty).

    Returns:
        (bool, str): validity flag and a human-readable status message.
    """
    if not video_path:
        return False, "Please upload a video or image file"
    if not os.path.exists(video_path):
        return False, "Video/image file not found"
    # Check file size (max 500MB)
    file_size = os.path.getsize(video_path) / (1024 * 1024)
    if file_size > 500:
        return False, f"Video/image file too large ({file_size:.1f}MB, max 500MB)"
    return True, "Video/image file valid"


def generate_lipsync_video(audio_file, video_file, fps, quality):
    """Generate lip-synced video using MuseTalk inference.

    Args:
        audio_file: Path to the driving audio file.
        video_file: Path to the reference video or image.
        fps: Output frame rate (numeric; coerced to int).
        quality: Quality label from the UI (Low/Medium/High).

    Returns:
        (str | None, str): path to the generated video (or None on failure)
        and a status message for the UI.
    """
    try:
        # Validate inputs before touching the (heavy) inference engine.
        audio_valid, audio_msg = validate_audio(audio_file)
        if not audio_valid:
            return None, f"Audio validation failed: {audio_msg}"

        video_valid, video_msg = validate_video(video_file)
        if not video_valid:
            return None, f"Video validation failed: {video_msg}"

        # Initialize inference engine (lazy singleton).
        engine = initialize_engine()

        # Create a unique temporary output file per request. A fixed name
        # ("musetalk_output.mp4") would be clobbered by concurrent requests
        # in a multi-user Space.
        fd, output_path = tempfile.mkstemp(prefix="musetalk_", suffix=".mp4")
        os.close(fd)  # engine writes to the path; we only need the name

        # Define progress callback (logged to stdout; Gradio progress hook
        # could be wired here later).
        def progress_callback(progress, status):
            print(f"[{progress}%] {status}")

        # Run inference.
        # NOTE(review): `quality` is only echoed in the status message and is
        # never forwarded to the engine — confirm whether engine.generate
        # supports a quality setting.
        result_path = engine.generate(
            audio_path=audio_file,
            video_path=video_file,
            output_path=output_path,
            fps=int(fps),
            progress_callback=progress_callback
        )

        return result_path, f"Successfully generated lip-synced video (Quality: {quality})"

    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of
        # crashing the Gradio worker.
        error_msg = f"Error during generation: {str(e)}"
        print(error_msg)
        return None, error_msg


# --- Gradio interface -----------------------------------------------------

with gr.Blocks(title="MuseTalk - Audio-Driven Video Generation") as demo:
    gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
    gr.Markdown("Generate realistic lip-synced videos from audio")

    # Main title and description
    gr.Markdown(
        """
        ## MuseTalk - AI Audio-Driven Video Generation

        MuseTalk generates realistic lip-synced videos from audio input.
        This is a self-hosted Space running on Hugging Face.
        """
    )

    with gr.Row():
        gr.Markdown(
            """
            ### Features
            - Audio-driven video generation
            - Realistic lip-sync
            - Customizable video parameters
            """
        )

    gr.Markdown("### Input Files")
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Audio")
            audio_input = gr.Audio(
                label="Upload Audio",
                type="filepath",
                format="wav"
            )
        with gr.Column():
            gr.Markdown("#### Video/Image")
            video_input = gr.File(
                label="Upload Video or Image",
                file_count="single",
                file_types=["video", "image"]
            )

    gr.Markdown("### Parameters")
    with gr.Row():
        fps_slider = gr.Slider(
            minimum=20,
            maximum=60,
            value=25,
            step=1,
            label="FPS (Frames Per Second)"
        )
        quality_radio = gr.Radio(
            choices=["Low", "Medium", "High"],
            value="Medium",
            label="Quality"
        )

    gr.Markdown("### Generation")
    generate_button = gr.Button("Generate Lip-Synced Video", variant="primary")

    output_video = gr.Video(
        label="Generated Video",
        format="mp4"
    )
    status_text = gr.Textbox(
        label="Status",
        interactive=False,
        lines=3
    )

    # Connect generate button to inference function
    generate_button.click(
        fn=generate_lipsync_video,
        inputs=[audio_input, video_input, fps_slider, quality_radio],
        outputs=[output_video, status_text]
    )

    # Accordion sections
    with gr.Accordion("About MuseTalk", open=False):
        gr.Markdown(
            """
            ### About MuseTalk

            MuseTalk is an AI model for audio-driven video generation
            that produces realistic lip-synced videos. The model operates
            in latent space using efficient single-step inpainting,
            enabling fast inference.

            **Key Features:**
            - Audio-driven lip-sync generation
            - Supports multiple languages (Chinese, English, Japanese, etc.)
            - Efficient inference on consumer hardware
            - High-quality 30fps+ output

            **Model Architecture:**
            - Uses whisper-tiny for audio feature extraction
            - DWPose for face detection and alignment
            - Latent space inpainting (not diffusion-based)
            - Supports 256x256 face region size
            """
        )

    with gr.Accordion("Documentation & Setup", open=False):
        gr.Markdown(
            """
            ### How to Use
            1. **Upload Audio**: Select an audio file (WAV, MP3, M4A, OGG) up to 10 minutes
            2. **Upload Video/Image**: Select a reference video or image with a face
            3. **Adjust Parameters**:
               - FPS: Output video frame rate (20-60)
               - Quality: Output quality level (Low/Medium/High)
            4. **Generate**: Click "Generate Lip-Synced Video"
            5. **Download**: Your generated video will appear below

            ### Supported Formats
            **Audio**: WAV, MP3, M4A, OGG (up to 10 minutes)
            **Video**: MP4, AVI, MOV, MKV (H264/H265 codec)
            **Image**: PNG, JPG, JPEG, BMP (with clear face visible)

            ### Technical Details
            - **Device**: CPU-based inference with PyTorch
            - **Memory**: Optimized for 4GB+ VRAM devices
            - **Speed**: ~1-5 minutes depending on video length and quality
            - **Output**: MP4 format with H264 codec
            """
        )


if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)