| import spaces |
| import gradio as gr |
| import torch |
| from diffusers import DiffusionPipeline |
| from diffusers.utils import load_image, export_to_video |
| import os |
| import random |
| import numpy as np |
| from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip |
| from PIL import Image, ImageOps |
|
|
| |
|
|
| |
# Sigma schedule passed as `sigmas=` to the pipeline call in `generate`.
# It has eight entries, the same count as the `num_inference_steps=8`
# used in that call.
DISTILLED_SIGMA_VALUES = [
    1.0,
    0.99375,
    0.9875,
    0.98125,
    0.975,
    0.909375,
    0.725,
    0.421875,
]
|
|
# Build the pipeline once at import time: distilled LTX-2 checkpoint plus the
# custom audio-to-video pipeline code, loaded in bfloat16 and moved to CUDA.
print("Loading LTX-2 Distilled Pipeline...")
pipe = DiffusionPipeline.from_pretrained(
    "rootonchair/LTX-2-19b-distilled",
    torch_dtype=torch.bfloat16,
    custom_pipeline="multimodalart/ltx2-audio-to-video",
)
pipe.to("cuda")

# Load the static camera-control LoRA, fuse it at scale 0.8, then unload the
# LoRA weights from the pipeline.
print("Loading and Fusing Camera Control LoRA...")
pipe.load_lora_weights(
    "Lightricks/LTX-2-19b-LoRA-Camera-Control-Static",
    adapter_name="camera_control",
)
pipe.fuse_lora(lora_scale=0.8)
pipe.unload_lora_weights()
|
|
| |
def save_video_with_audio(video_frames, audio_path, fps=24):
    """Write the generated video to an MP4 file with the input audio attached.

    If the audio is longer than the video it is trimmed to the video's
    duration before muxing. The result is encoded with libx264 / aac.

    Args:
        video_frames: One of
            * a list of frames, each convertible with ``np.array``; if the
              first element is itself a list, only that first inner list is
              used;
            * a string path to an existing video file;
            * any other object, which is handed to ``export_to_video``.
        audio_path: Path of the audio file to attach.
        fps: Frame rate used to build the clip and to encode the output.

    Returns:
        The file name (relative to the current working directory) of the
        written MP4.
    """
    import tempfile
    import uuid

    # A UUID-based name cannot collide between concurrent requests, unlike a
    # small random integer. This also keeps the temporary audio file that
    # moviepy derives from the output name unique per call.
    output_filename = f"output_{uuid.uuid4().hex}.mp4"

    temp_path = None
    clip = None
    audio_clip = None
    final_clip = None
    try:
        if isinstance(video_frames, list):
            is_nested = bool(video_frames) and isinstance(video_frames[0], list)
            frames_to_process = video_frames[0] if is_nested else video_frames
            np_frames = [np.array(img) for img in frames_to_process]
            clip = ImageSequenceClip(np_frames, fps=fps)
        elif isinstance(video_frames, str):
            clip = VideoFileClip(video_frames)
        else:
            # Use a per-call temporary file so parallel requests never
            # overwrite each other's intermediate video.
            fd, temp_path = tempfile.mkstemp(suffix=".mp4")
            os.close(fd)
            export_to_video(video_frames, temp_path, fps=fps)
            clip = VideoFileClip(temp_path)

        audio_clip = AudioFileClip(audio_path)
        if audio_clip.duration > clip.duration:
            audio_clip = audio_clip.subclipped(0, clip.duration)

        final_clip = clip.with_audio(audio_clip)
        final_clip.write_videofile(
            output_filename,
            fps=fps,
            codec="libx264",
            audio_codec="aac",
            logger="bar",
        )
    finally:
        # Release readers even when a step above failed, then remove the
        # intermediate file (only after the clip reading it is closed).
        for resource in (final_clip, audio_clip, clip):
            if resource is not None:
                resource.close()
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)

    return output_filename
|
|
def infer_aspect_ratio(image):
    """Pick the supported aspect ratio nearest to the image's own ratio.

    The image's ``width / height`` value is compared, by absolute difference,
    against 1:1, 16:9 and 9:16. On a tie the entry listed first wins.

    Args:
        image: Object exposing ``size`` as a ``(width, height)`` pair.

    Returns:
        A tuple ``(ratio_key, (target_width, target_height))`` where
        ``ratio_key`` is ``"1:1"``, ``"16:9"`` or ``"9:16"``.
    """
    # (label, numeric ratio, output resolution), listed in tie-breaking order.
    candidates = (
        ("1:1", 1.0, (512, 512)),
        ("16:9", 16 / 9, (768, 512)),
        ("9:16", 9 / 16, (512, 768)),
    )

    img_w, img_h = image.size
    actual = img_w / img_h

    best_label = None
    best_size = None
    best_gap = None
    for label, ratio, size in candidates:
        gap = abs(ratio - actual)
        # Strict comparison keeps the earliest candidate when gaps are equal.
        if best_gap is None or gap < best_gap:
            best_label, best_size, best_gap = label, size, gap

    return best_label, best_size
|
|
def process_image_for_aspect_ratio(image):
    """Fit an image to the resolution of its nearest supported aspect ratio.

    The ratio and target size come from ``infer_aspect_ratio``; the image is
    then passed through ``ImageOps.fit`` with LANCZOS resampling and a
    centred (0.5, 0.5) anchor.

    Args:
        image: The source image.

    Returns:
        A tuple ``(processed_image, target_width, target_height,
        aspect_ratio_key)``.
    """
    ratio_label, target_size = infer_aspect_ratio(image)
    out_w, out_h = target_size

    fitted = ImageOps.fit(
        image,
        (out_w, out_h),
        centering=(0.5, 0.5),
        method=Image.LANCZOS,
    )
    return fitted, out_w, out_h, ratio_label
|
|
def get_audio_duration(audio_path):
    """Return a slider update matching the length of the given audio file.

    The duration is clamped to the range of the "Video Duration" slider
    (1.0 to 12.0 seconds) and rounded to the nearest half second, so the
    value pushed to the slider is always one it can represent. Audio shorter
    than one second therefore maps to 1.0 rather than to a value below the
    slider's minimum.

    Args:
        audio_path: Path to the audio file, or ``None`` when no audio is set.

    Returns:
        ``gr.update(value=...)`` with the computed duration, or an empty
        ``gr.update()`` (slider left unchanged) when ``audio_path`` is
        ``None`` or the file cannot be read.
    """
    # Mirror the slider definition: minimum=1.0, maximum=12.0.
    min_seconds = 1.0
    max_seconds = 12.0

    if audio_path is None:
        return gr.update()

    try:
        audio_clip = AudioFileClip(audio_path)
        try:
            duration = audio_clip.duration
        finally:
            # Always release the reader, even if reading the duration fails.
            audio_clip.close()

        clamped_duration = max(min_seconds, min(duration, max_seconds))
        # Snap to the slider's 0.5 s step.
        rounded_duration = round(clamped_duration * 2) / 2

        return gr.update(value=rounded_duration)
    except Exception as e:
        # Best effort only: a failure here must not break the UI, so the
        # slider is simply left as it is.
        print(f"Error getting audio duration: {e}")
        return gr.update()
|
|
| |
def _frames_for_duration(video_duration, fps):
    """Return the frame count for a duration, as ``8 * k + 1`` with a floor of 9."""
    total_frames = int(video_duration * fps)
    # Nearest multiple of 8, ties rounding up. The built-in round() sends
    # ties to the even neighbour, which made half-second durations alternate
    # between rounding down and rounding up (e.g. 36 -> 32 but 60 -> 64).
    base_block = (total_frames + 4) // 8 * 8
    # NOTE(review): the "+ 1" and the minimum of 9 are presumably constraints
    # of the pipeline's frame-count handling — confirm against the pipeline.
    return max(base_block + 1, 9)


@spaces.GPU(duration=85, size='xlarge')
def generate(
    image_path,
    audio_path,
    prompt,
    negative_prompt,
    video_duration,
    seed,
    progress=gr.Progress(track_tqdm=True)
):
    """Generate a video from an image and an audio file and mux the audio in.

    Args:
        image_path: Path to the conditioning image.
        audio_path: Path to the driving audio file.
        prompt: Text prompt passed to the pipeline.
        negative_prompt: Negative prompt passed to the pipeline.
        video_duration: Requested length in seconds; converted to a frame
            count at 24 fps.
        seed: Integer seed; ``-1`` or ``None`` (an emptied input field)
            selects a random seed.
        progress: Gradio progress tracker (tracks tqdm output).

    Returns:
        A tuple ``(output_video_path, seed)`` where ``seed`` is the seed
        actually used.

    Raises:
        gr.Error: If the image or the audio file is missing.
    """
    if not image_path or not audio_path:
        raise gr.Error("Please provide both an image and an audio file.")

    # A cleared number field arrives as None; treat it like -1 instead of
    # letting manual_seed() fail on it.
    if seed is None or seed == -1:
        seed = random.randint(0, 1000000)
    seed = int(seed)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    original_image = load_image(image_path)
    image, width, height, detected_ratio = process_image_for_aspect_ratio(original_image)

    print(f"Generating with seed: {seed}, Resolution: {width}x{height} ({detected_ratio}), Duration: {video_duration}s")

    fps = 24.0
    num_frames = _frames_for_duration(video_duration, fps)

    print(f"Calculated frames: {num_frames}")

    # With return_dict=False the pipeline result is unpacked as a pair; only
    # the first element (the video) is used here.
    video_output, _ = pipe(
        image=image,
        audio=audio_path,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=width,
        height=height,
        num_frames=num_frames,
        frame_rate=fps,
        num_inference_steps=8,
        sigmas=DISTILLED_SIGMA_VALUES,
        guidance_scale=1.0,
        generator=generator,
        return_dict=False,
    )

    output_video_path = save_video_with_audio(video_output, audio_path, fps=fps)

    return output_video_path, seed
|
|
| |
|
|
# Page CSS: limits the main column to 800px wide and centres it.
css = """
#col-container { max-width: 800px; margin: 0 auto; }
"""

# NOTE(review): the source arrived with its indentation flattened; the
# component nesting below is reconstructed — confirm against the intended layout.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# ⚡ LTX-2 Distilled Audio-to-Video")
        gr.Markdown("Generate lip-synced or audio-reactive video from a single image using the distilled 8-step LTX-2 model.")

        # Inputs on the left, generated result on the right.
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Input Image", type="filepath", height=300)
                input_audio = gr.Audio(label="Input Audio", type="filepath")

            with gr.Column():
                result_video = gr.Video(label="Generated Video")

        prompt = gr.Textbox(
            label="Prompt",
            value="A person speaking, lips moving in sync with the words, talking head",
            lines=2,
        )

        with gr.Row():
            video_duration = gr.Slider(
                label="Video Duration (Seconds)",
                minimum=1.0,
                maximum=12.0,
                step=0.5,
                value=4.0,
            )

        with gr.Accordion("Advanced Settings", open=False):
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                value="low quality, worst quality, deformed, distorted",
                placeholder="Usually ignored by distilled models with guidance 1.0",
            )
            seed = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)

        run_btn = gr.Button("Generate Video", variant="primary")

        # Hidden field that receives the seed returned by `generate`.
        used_seed = gr.Number(label="Used Seed", visible=False)

    # Whenever the audio input changes, set the duration slider from it.
    input_audio.change(
        fn=get_audio_duration,
        inputs=[input_audio],
        outputs=[video_duration],
    )

    # Input order must match the positional parameters of `generate`.
    run_btn.click(
        fn=generate,
        inputs=[input_image, input_audio, prompt, negative_prompt, video_duration, seed],
        outputs=[result_video, used_seed],
    )
|
|
# When run as a script, enable the request queue and start the app.
if __name__ == "__main__":
    demo.queue()
    demo.launch()