trymonolith committed on
Commit
64e548c
·
verified ·
1 Parent(s): 7f36f80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -43
app.py CHANGED
@@ -5,75 +5,224 @@ Self-hosted Gradio interface for MuseTalk
5
  """
6
 
7
  import gradio as gr
 
 
 
 
8
 
9
- # Welcome message
10
- WELCOME_TEXT = """
11
- # MuseTalk - AI Audio-Driven Video Generation
12
 
13
- MuseTalk generates realistic lip-synced videos from audio input.
14
- This is a self-hosted Space running on Hugging Face.
 
 
 
15
 
16
- ## Features
17
- - Audio-driven video generation
18
- - Realistic lip-sync
19
- - Customizable video parameters
20
- """
 
 
 
 
 
 
 
 
 
 
21
 
22
- def generate_video(audio_file, video_file):
23
- """Placeholder function for video generation"""
24
- if audio_file is None or video_file is None:
25
- return "Please upload both audio and video files", None
 
 
 
 
 
 
 
 
26
 
27
- status = "MuseTalk generation would proceed here with proper installation"
28
- return status, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # Create Gradio interface
31
- with gr.Blocks(title="MuseTalk") as demo:
32
  gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
33
  gr.Markdown("Generate realistic lip-synced videos from audio")
34
 
 
 
 
 
 
 
 
 
 
 
35
  with gr.Row():
36
- gr.Markdown(WELCOME_TEXT)
 
 
 
 
 
 
 
37
 
38
- gr.Markdown("## Input Files")
39
 
40
  with gr.Row():
41
  with gr.Column():
42
- gr.Markdown("### Audio")
43
- audio_file = gr.Audio(label="Upload Audio", type="filepath")
 
 
 
 
44
 
45
  with gr.Column():
46
- gr.Markdown("### Video/Image")
47
- video_file = gr.File(label="Upload Video or Image", file_types=["video", "image"])
 
 
 
 
48
 
49
- gr.Markdown("## Settings")
50
 
51
  with gr.Row():
52
- fps = gr.Slider(20, 60, value=30, label="FPS")
53
- quality = gr.Radio(["Low", "Medium", "High"], value="Medium", label="Quality")
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
56
 
57
- status_box = gr.Textbox(label="Status", interactive=False, lines=2)
58
- output_video = gr.Video(label="Result")
59
 
60
- generate_btn.click(
61
- fn=generate_video,
62
- inputs=[audio_file, video_file],
63
- outputs=[status_box, output_video]
64
  )
65
 
66
- gr.Markdown("""
67
- ## Setup Instructions
 
 
 
 
 
 
 
 
 
 
68
 
69
- To fully enable MuseTalk, install from: https://github.com/TMElyralab/MuseTalk
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- Requirements:
72
- - Python 3.8+
73
- - PyTorch
74
- - CUDA (for GPU)
75
- - ffmpeg
76
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  if __name__ == "__main__":
79
- demo.launch()
 
5
  """
6
 
7
  import gradio as gr
8
+ import os
9
+ import tempfile
10
+ from pathlib import Path
11
+ from inference import MuseTalkInference
12
 
13
# Initialize inference engine
# Module-level singleton, created lazily so that importing app.py (e.g. during
# a Space build step) does not trigger model loading.
inference_engine = None

def initialize_engine():
    """Return the shared MuseTalkInference engine, constructing it on first call.

    NOTE(review): not guarded by a lock — two concurrent first requests could
    each construct an engine; confirm the Gradio queue serializes this.
    """
    global inference_engine
    if inference_engine is None:
        inference_engine = MuseTalkInference()
    return inference_engine
21
 
22
# Validation functions
def validate_audio(audio_path):
    """Check an uploaded audio file before inference.

    Returns a ``(is_valid, message)`` tuple rather than raising, so the
    caller can surface the message directly in the UI status box.
    """
    # Guard clauses: missing upload, then missing file on disk.
    if not audio_path:
        return False, "Please upload an audio file"
    if not os.path.exists(audio_path):
        return False, "Audio file not found"

    # Reject anything above 100MB to keep inference time and disk use bounded.
    size_mb = os.path.getsize(audio_path) / (1024 * 1024)
    if size_mb > 100:
        return False, f"Audio file too large ({size_mb:.1f}MB, max 100MB)"

    return True, "Audio file valid"
 
38
def validate_video(video_path):
    """Check an uploaded video/image file before inference.

    Mirrors ``validate_audio`` but with a larger 500MB cap, since reference
    videos are expected to be bigger than audio tracks. Returns a
    ``(is_valid, message)`` tuple.
    """
    # Guard clauses: missing upload, then missing file on disk.
    if not video_path:
        return False, "Please upload a video or image file"
    if not os.path.exists(video_path):
        return False, "Video/image file not found"

    # Size cap keeps frame extraction and face detection time bounded.
    size_mb = os.path.getsize(video_path) / (1024 * 1024)
    if size_mb > 500:
        return False, f"Video/image file too large ({size_mb:.1f}MB, max 500MB)"

    return True, "Video/image file valid"
52
+
53
def generate_lipsync_video(audio_file, video_file, fps, quality):
    """Generate a lip-synced video using MuseTalk inference.

    Parameters
    ----------
    audio_file : str | None
        Path of the uploaded audio (gr.Audio with type="filepath").
    video_file : str | None
        Path of the uploaded reference video or image.
    fps : float
        Target output frame rate; coerced to int before inference.
    quality : str
        One of "Low"/"Medium"/"High". NOTE(review): only echoed in the
        status message — it is never passed to ``engine.generate``; confirm
        whether MuseTalkInference supports a quality setting.

    Returns
    -------
    tuple
        ``(output_video_path_or_None, status_message)`` for the Gradio
        Video and Textbox outputs.
    """
    try:
        # Validate both inputs up front, before touching the (expensive,
        # lazily-loaded) inference engine.
        audio_valid, audio_msg = validate_audio(audio_file)
        if not audio_valid:
            return None, f"Audio validation failed: {audio_msg}"

        video_valid, video_msg = validate_video(video_file)
        if not video_valid:
            return None, f"Video validation failed: {video_msg}"

        engine = initialize_engine()

        # BUG FIX: the previous fixed path (tempdir/"musetalk_output.mp4")
        # was shared by every request, so concurrent generations clobbered
        # each other's result. mkstemp yields a unique per-request path.
        fd, output_path = tempfile.mkstemp(prefix="musetalk_", suffix=".mp4")
        os.close(fd)  # the engine writes the file itself; we only need the path

        def progress_callback(progress, status):
            # Surface engine progress in the server log.
            print(f"[{progress}%] {status}")

        # Run inference; the engine returns the path of the finished video.
        result_path = engine.generate(
            audio_path=audio_file,
            video_path=video_file,
            output_path=output_path,
            fps=int(fps),
            progress_callback=progress_callback
        )

        return result_path, f"Successfully generated lip-synced video (Quality: {quality})"

    except Exception as e:
        # Top-level boundary for the Gradio callback: report the failure in
        # the status box instead of crashing the UI worker.
        error_msg = f"Error during generation: {str(e)}"
        print(error_msg)
        return None, error_msg
 
92
# Create Gradio interface
# Declarative layout: headers, two upload columns, parameter row, a generate
# button wired to generate_lipsync_video, and two collapsed info accordions.
with gr.Blocks(title="MuseTalk - Audio-Driven Video Generation") as demo:
    gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
    gr.Markdown("Generate realistic lip-synced videos from audio")

    # Main title and description
    gr.Markdown(
        """
        ## MuseTalk - AI Audio-Driven Video Generation

        MuseTalk generates realistic lip-synced videos from audio input.
        This is a self-hosted Space running on Hugging Face.
        """
    )

    with gr.Row():
        gr.Markdown(
            """
            ### Features
            - Audio-driven video generation
            - Realistic lip-sync
            - Customizable video parameters
            """
        )

    gr.Markdown("### Input Files")

    # Two side-by-side upload columns: audio drives the lip motion, the
    # video/image supplies the face to animate.
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Audio")
            audio_input = gr.Audio(
                label="Upload Audio",
                type="filepath",  # handler receives a path string, not samples
                format="wav"
            )

        with gr.Column():
            gr.Markdown("#### Video/Image")
            video_input = gr.File(
                label="Upload Video or Image",
                file_count="single",
                file_types=["video", "image"]
            )

    gr.Markdown("### Parameters")

    with gr.Row():
        fps_slider = gr.Slider(
            minimum=20,
            maximum=60,
            value=25,
            step=1,
            label="FPS (Frames Per Second)"
        )

        # NOTE(review): quality is only echoed in the status message by the
        # handler, it does not reach the engine — see generate_lipsync_video.
        quality_radio = gr.Radio(
            choices=["Low", "Medium", "High"],
            value="Medium",
            label="Quality"
        )

    gr.Markdown("### Generation")

    generate_button = gr.Button("Generate Lip-Synced Video", variant="primary")

    output_video = gr.Video(
        label="Generated Video",
        format="mp4"
    )

    status_text = gr.Textbox(
        label="Status",
        interactive=False,
        lines=3
    )

    # Connect generate button to inference function
    generate_button.click(
        fn=generate_lipsync_video,
        inputs=[audio_input, video_input, fps_slider, quality_radio],
        outputs=[output_video, status_text]
    )

    # Accordion sections (collapsed by default to keep the UI compact)
    with gr.Accordion("About MuseTalk", open=False):
        gr.Markdown(
            """
            ### About MuseTalk

            MuseTalk is an AI model for audio-driven video generation that produces
            realistic lip-synced videos. The model operates in latent space using
            efficient single-step inpainting, enabling fast inference.

            **Key Features:**
            - Audio-driven lip-sync generation
            - Supports multiple languages (Chinese, English, Japanese, etc.)
            - Efficient inference on consumer hardware
            - High-quality 30fps+ output

            **Model Architecture:**
            - Uses whisper-tiny for audio feature extraction
            - DWPose for face detection and alignment
            - Latent space inpainting (not diffusion-based)
            - Supports 256x256 face region size
            """
        )

    with gr.Accordion("Documentation & Setup", open=False):
        gr.Markdown(
            """
            ### How to Use

            1. **Upload Audio**: Select an audio file (WAV, MP3, M4A, OGG) up to 10 minutes
            2. **Upload Video/Image**: Select a reference video or image with a face
            3. **Adjust Parameters**:
               - FPS: Output video frame rate (20-60)
               - Quality: Output quality level (Low/Medium/High)
            4. **Generate**: Click "Generate Lip-Synced Video"
            5. **Download**: Your generated video will appear below

            ### Supported Formats

            **Audio**: WAV, MP3, M4A, OGG (up to 10 minutes)
            **Video**: MP4, AVI, MOV, MKV (H264/H265 codec)
            **Image**: PNG, JPG, JPEG, BMP (with clear face visible)

            ### Technical Details

            - **Device**: CPU-based inference with PyTorch
            - **Memory**: Optimized for 4GB+ VRAM devices
            - **Speed**: ~1-5 minutes depending on video length and quality
            - **Output**: MP4 format with H264 codec
            """
        )
226
 
227
if __name__ == "__main__":
    # Bind to all interfaces on the standard Hugging Face Spaces port.
    # NOTE(review): server_name="0.0.0.0" exposes the app network-wide when
    # run locally — confirm this is intended outside of a Spaces container.
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)