trymonolith committed on
Commit
64e548c
·
verified ·
1 Parent(s): 7f36f80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -43
app.py CHANGED
@@ -5,75 +5,224 @@ Self-hosted Gradio interface for MuseTalk
5
  """
6
 
7
  import gradio as gr
 
 
 
 
8
 
9
- # Welcome message
10
- WELCOME_TEXT = """
11
- # MuseTalk - AI Audio-Driven Video Generation
12
 
13
- MuseTalk generates realistic lip-synced videos from audio input.
14
- This is a self-hosted Space running on Hugging Face.
 
 
 
15
 
16
- ## Features
17
- - Audio-driven video generation
18
- - Realistic lip-sync
19
- - Customizable video parameters
20
- """
 
 
 
 
 
 
 
 
 
 
21
 
22
- def generate_video(audio_file, video_file):
23
- """Placeholder function for video generation"""
24
- if audio_file is None or video_file is None:
25
- return "Please upload both audio and video files", None
 
 
 
 
 
 
 
 
26
 
27
- status = "MuseTalk generation would proceed here with proper installation"
28
- return status, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # Create Gradio interface
31
- with gr.Blocks(title="MuseTalk") as demo:
32
  gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
33
  gr.Markdown("Generate realistic lip-synced videos from audio")
34
 
 
 
 
 
 
 
 
 
 
 
35
  with gr.Row():
36
- gr.Markdown(WELCOME_TEXT)
 
 
 
 
 
 
 
37
 
38
- gr.Markdown("## Input Files")
39
 
40
  with gr.Row():
41
  with gr.Column():
42
- gr.Markdown("### Audio")
43
- audio_file = gr.Audio(label="Upload Audio", type="filepath")
 
 
 
 
44
 
45
  with gr.Column():
46
- gr.Markdown("### Video/Image")
47
- video_file = gr.File(label="Upload Video or Image", file_types=["video", "image"])
 
 
 
 
48
 
49
- gr.Markdown("## Settings")
50
 
51
  with gr.Row():
52
- fps = gr.Slider(20, 60, value=30, label="FPS")
53
- quality = gr.Radio(["Low", "Medium", "High"], value="Medium", label="Quality")
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
56
 
57
- status_box = gr.Textbox(label="Status", interactive=False, lines=2)
58
- output_video = gr.Video(label="Result")
59
 
60
- generate_btn.click(
61
- fn=generate_video,
62
- inputs=[audio_file, video_file],
63
- outputs=[status_box, output_video]
64
  )
65
 
66
- gr.Markdown("""
67
- ## Setup Instructions
 
 
 
 
 
 
 
 
 
 
68
 
69
- To fully enable MuseTalk, install from: https://github.com/TMElyralab/MuseTalk
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- Requirements:
72
- - Python 3.8+
73
- - PyTorch
74
- - CUDA (for GPU)
75
- - ffmpeg
76
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  if __name__ == "__main__":
79
- demo.launch()
 
5
  """
6
 
7
  import gradio as gr
8
+ import os
9
+ import tempfile
10
+ from pathlib import Path
11
+ from inference import MuseTalkInference
12
 
13
# Initialize inference engine
# Module-level singleton, created lazily so that importing app.py (e.g. during
# a Space build step) does not trigger model loading.
inference_engine = None

def initialize_engine():
    """Return the shared MuseTalkInference engine, constructing it on first call.

    NOTE(review): not guarded by a lock — two concurrent first requests could
    each construct an engine; confirm the Gradio queue serializes this.
    """
    global inference_engine
    if inference_engine is None:
        inference_engine = MuseTalkInference()
    return inference_engine
21
 
22
# Validation functions
def validate_audio(audio_path):
    """Check an uploaded audio file before inference.

    Returns a ``(is_valid, message)`` tuple rather than raising, so the
    caller can surface the message directly in the UI status box.
    """
    # Guard clauses: missing upload, then missing file on disk.
    if not audio_path:
        return False, "Please upload an audio file"
    if not os.path.exists(audio_path):
        return False, "Audio file not found"

    # Reject anything above 100MB to keep inference time and disk use bounded.
    size_mb = os.path.getsize(audio_path) / (1024 * 1024)
    if size_mb > 100:
        return False, f"Audio file too large ({size_mb:.1f}MB, max 100MB)"

    return True, "Audio file valid"
 
38
def validate_video(video_path):
    """Check an uploaded video/image file before inference.

    Mirrors ``validate_audio`` but with a larger 500MB cap, since reference
    videos are expected to be bigger than audio tracks. Returns a
    ``(is_valid, message)`` tuple.
    """
    # Guard clauses: missing upload, then missing file on disk.
    if not video_path:
        return False, "Please upload a video or image file"
    if not os.path.exists(video_path):
        return False, "Video/image file not found"

    # Size cap keeps frame extraction and face detection time bounded.
    size_mb = os.path.getsize(video_path) / (1024 * 1024)
    if size_mb > 500:
        return False, f"Video/image file too large ({size_mb:.1f}MB, max 500MB)"

    return True, "Video/image file valid"
52
+
53
def generate_lipsync_video(audio_file, video_file, fps, quality):
    """Generate a lip-synced video using MuseTalk inference.

    Parameters
    ----------
    audio_file : str | None
        Path of the uploaded audio (gr.Audio with type="filepath").
    video_file : str | None
        Path of the uploaded reference video or image.
    fps : float
        Target output frame rate; coerced to int before inference.
    quality : str
        One of "Low"/"Medium"/"High". NOTE(review): only echoed in the
        status message — it is never passed to ``engine.generate``; confirm
        whether MuseTalkInference supports a quality setting.

    Returns
    -------
    tuple
        ``(output_video_path_or_None, status_message)`` for the Gradio
        Video and Textbox outputs.
    """
    try:
        # Validate both inputs up front, before touching the (expensive,
        # lazily-loaded) inference engine.
        audio_valid, audio_msg = validate_audio(audio_file)
        if not audio_valid:
            return None, f"Audio validation failed: {audio_msg}"

        video_valid, video_msg = validate_video(video_file)
        if not video_valid:
            return None, f"Video validation failed: {video_msg}"

        engine = initialize_engine()

        # BUG FIX: the previous fixed path (tempdir/"musetalk_output.mp4")
        # was shared by every request, so concurrent generations clobbered
        # each other's result. mkstemp yields a unique per-request path.
        fd, output_path = tempfile.mkstemp(prefix="musetalk_", suffix=".mp4")
        os.close(fd)  # the engine writes the file itself; we only need the path

        def progress_callback(progress, status):
            # Surface engine progress in the server log.
            print(f"[{progress}%] {status}")

        # Run inference; the engine returns the path of the finished video.
        result_path = engine.generate(
            audio_path=audio_file,
            video_path=video_file,
            output_path=output_path,
            fps=int(fps),
            progress_callback=progress_callback
        )

        return result_path, f"Successfully generated lip-synced video (Quality: {quality})"

    except Exception as e:
        # Top-level boundary for the Gradio callback: report the failure in
        # the status box instead of crashing the UI worker.
        error_msg = f"Error during generation: {str(e)}"
        print(error_msg)
        return None, error_msg
 
92
# Create Gradio interface
# Declarative layout: headers, two upload columns, parameter row, a generate
# button wired to generate_lipsync_video, and two collapsed info accordions.
with gr.Blocks(title="MuseTalk - Audio-Driven Video Generation") as demo:
    gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
    gr.Markdown("Generate realistic lip-synced videos from audio")

    # Main title and description
    gr.Markdown(
        """
        ## MuseTalk - AI Audio-Driven Video Generation

        MuseTalk generates realistic lip-synced videos from audio input.
        This is a self-hosted Space running on Hugging Face.
        """
    )

    with gr.Row():
        gr.Markdown(
            """
            ### Features
            - Audio-driven video generation
            - Realistic lip-sync
            - Customizable video parameters
            """
        )

    gr.Markdown("### Input Files")

    # Two side-by-side upload columns: audio drives the lip motion, the
    # video/image supplies the face to animate.
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Audio")
            audio_input = gr.Audio(
                label="Upload Audio",
                type="filepath",  # handler receives a path string, not samples
                format="wav"
            )

        with gr.Column():
            gr.Markdown("#### Video/Image")
            video_input = gr.File(
                label="Upload Video or Image",
                file_count="single",
                file_types=["video", "image"]
            )

    gr.Markdown("### Parameters")

    with gr.Row():
        fps_slider = gr.Slider(
            minimum=20,
            maximum=60,
            value=25,
            step=1,
            label="FPS (Frames Per Second)"
        )

        # NOTE(review): quality is only echoed in the status message by the
        # handler, it does not reach the engine — see generate_lipsync_video.
        quality_radio = gr.Radio(
            choices=["Low", "Medium", "High"],
            value="Medium",
            label="Quality"
        )

    gr.Markdown("### Generation")

    generate_button = gr.Button("Generate Lip-Synced Video", variant="primary")

    output_video = gr.Video(
        label="Generated Video",
        format="mp4"
    )

    status_text = gr.Textbox(
        label="Status",
        interactive=False,
        lines=3
    )

    # Connect generate button to inference function
    generate_button.click(
        fn=generate_lipsync_video,
        inputs=[audio_input, video_input, fps_slider, quality_radio],
        outputs=[output_video, status_text]
    )

    # Accordion sections (collapsed by default to keep the UI compact)
    with gr.Accordion("About MuseTalk", open=False):
        gr.Markdown(
            """
            ### About MuseTalk

            MuseTalk is an AI model for audio-driven video generation that produces
            realistic lip-synced videos. The model operates in latent space using
            efficient single-step inpainting, enabling fast inference.

            **Key Features:**
            - Audio-driven lip-sync generation
            - Supports multiple languages (Chinese, English, Japanese, etc.)
            - Efficient inference on consumer hardware
            - High-quality 30fps+ output

            **Model Architecture:**
            - Uses whisper-tiny for audio feature extraction
            - DWPose for face detection and alignment
            - Latent space inpainting (not diffusion-based)
            - Supports 256x256 face region size
            """
        )

    with gr.Accordion("Documentation & Setup", open=False):
        gr.Markdown(
            """
            ### How to Use

            1. **Upload Audio**: Select an audio file (WAV, MP3, M4A, OGG) up to 10 minutes
            2. **Upload Video/Image**: Select a reference video or image with a face
            3. **Adjust Parameters**:
               - FPS: Output video frame rate (20-60)
               - Quality: Output quality level (Low/Medium/High)
            4. **Generate**: Click "Generate Lip-Synced Video"
            5. **Download**: Your generated video will appear below

            ### Supported Formats

            **Audio**: WAV, MP3, M4A, OGG (up to 10 minutes)
            **Video**: MP4, AVI, MOV, MKV (H264/H265 codec)
            **Image**: PNG, JPG, JPEG, BMP (with clear face visible)

            ### Technical Details

            - **Device**: CPU-based inference with PyTorch
            - **Memory**: Optimized for 4GB+ VRAM devices
            - **Speed**: ~1-5 minutes depending on video length and quality
            - **Output**: MP4 format with H264 codec
            """
        )
226
 
227
if __name__ == "__main__":
    # Bind to all interfaces on the standard Hugging Face Spaces port.
    # NOTE(review): server_name="0.0.0.0" exposes the app network-wide when
    # run locally — confirm this is intended outside of a Spaces container.
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)