import spaces
import gradio as gr
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image, export_to_video
import os
import random
import numpy as np
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
from PIL import Image, ImageOps

# --- 1. Model Setup & Configuration ---

# Define the specific distilled sigmas (from LTX-2 documentation).
# The list has 8 entries and is passed as `sigmas=` together with
# `num_inference_steps=8` in `generate()`; keep the two in sync.
DISTILLED_SIGMA_VALUES = [
    1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875
]

# The pipeline is created once at import time; `generate()` uses this
# module-level `pipe` for every request.
print("Loading LTX-2 Distilled Pipeline...")
pipe = DiffusionPipeline.from_pretrained(
    "rootonchair/LTX-2-19b-distilled",
    custom_pipeline="multimodalart/ltx2-audio-to-video",
    torch_dtype=torch.bfloat16
)
pipe.to("cuda")

# Load the camera-control LoRA, fuse it at scale 0.8, then unload the adapter.
# NOTE(review): presumably the fused weights remain in the base model after
# `unload_lora_weights()` — confirm against the diffusers LoRA documentation.
print("Loading and Fusing Camera Control LoRA...")
pipe.load_lora_weights("Lightricks/LTX-2-19b-LoRA-Camera-Control-Static", adapter_name="camera_control")
pipe.fuse_lora(lora_scale=0.8)
pipe.unload_lora_weights()

# --- 2. Helper Functions ---
def save_video_with_audio(video_frames, audio_path, fps=24):
    """
    Combines the generated video frames with the original input audio.

    Args:
        video_frames: The pipeline's video output in one of three forms:
            a list of frames (or a batch list whose first element is the
            list of frames), a path to an existing video file, or any other
            object that ``export_to_video`` accepts.
        audio_path: Path of the audio file to attach to the video.
        fps: Frame rate used for the frame sequence and the written file.

    Returns:
        Path of the written ``output_<id>.mp4`` file, relative to the
        current working directory.

    Raises:
        ValueError: If ``video_frames`` is a list that contains no frames.
    """
    # Local imports: only this helper needs them.
    import tempfile
    import uuid

    # A UUID keeps concurrent requests from overwriting each other's output,
    # which a small random integer could not guarantee.
    output_filename = f"output_{uuid.uuid4().hex}.mp4"

    clip = None
    audio_clip = None
    final_clip = None
    temp_path = None

    try:
        # 1. Handle Diffusers Output Formats
        if isinstance(video_frames, list):
            # A nested list is treated as a batch; only the first item is used.
            if video_frames and isinstance(video_frames[0], list):
                frames_to_process = video_frames[0]
            else:
                frames_to_process = video_frames
            if not frames_to_process:
                raise ValueError("No video frames to save.")
            np_frames = [np.array(img) for img in frames_to_process]
            clip = ImageSequenceClip(np_frames, fps=fps)
        elif isinstance(video_frames, str):
            clip = VideoFileClip(video_frames)
        else:
            # Unique intermediate file so parallel requests do not share (and
            # clobber) one fixed temp name; it is removed in the finally block.
            fd, temp_path = tempfile.mkstemp(suffix=".mp4")
            os.close(fd)
            export_to_video(video_frames, temp_path, fps=fps)
            clip = VideoFileClip(temp_path)

        # 2. Load and Process Audio
        audio_clip = AudioFileClip(audio_path)

        # Audio longer than the video is trimmed; shorter audio is used as is.
        if audio_clip.duration > clip.duration:
            audio_clip = audio_clip.subclipped(0, clip.duration)

        # 3. Combine and Save
        final_clip = clip.with_audio(audio_clip)

        final_clip.write_videofile(
            output_filename,
            fps=fps,
            codec="libx264",
            audio_codec="aac",
            logger="bar"
        )
    finally:
        # Release readers even when loading or encoding fails.
        for resource in (final_clip, audio_clip, clip):
            if resource is not None:
                resource.close()
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file is not fatal.
                pass

    return output_filename

def infer_aspect_ratio(image):
    """
    Choose the supported aspect ratio nearest to the image's own ratio.

    The image's width/height ratio is compared (by absolute difference) with
    the nominal value of each supported label; on a tie the label listed
    first wins.

    Note: the "16:9" and "9:16" labels are nominal. The resolutions paired
    with them, 768x512 and 512x768, are 3:2 and 2:3 respectively.

    Args:
        image: Object exposing ``size`` as a ``(width, height)`` pair.

    Returns:
        Tuple ``(label, (target_width, target_height))``.
    """
    # One row per option: (label, nominal ratio, output resolution as (W, H)).
    options = (
        ("1:1", 1.0, (512, 512)),
        ("16:9", 16 / 9, (768, 512)),   # ~1.778
        ("9:16", 9 / 16, (512, 768)),   # ~0.5625
    )

    img_w, img_h = image.size
    actual_ratio = img_w / img_h

    # min() keeps the earliest row among equally distant candidates.
    label, _, resolution = min(
        options,
        key=lambda option: abs(option[1] - actual_ratio),
    )
    return label, resolution

def process_image_for_aspect_ratio(image):
    """
    Fit the image to the resolution of its closest supported aspect ratio.

    The target is chosen by ``infer_aspect_ratio``; the image is then
    center-cropped and resized to exactly that size with ``ImageOps.fit``
    using LANCZOS resampling.

    Args:
        image: Input image (passed to ``infer_aspect_ratio`` and
            ``ImageOps.fit``).

    Returns:
        Tuple ``(processed_image, width, height, aspect_ratio_label)``.
    """
    ratio_label, target_size = infer_aspect_ratio(image)
    out_width, out_height = target_size

    # centering=(0.5, 0.5) crops symmetrically around the image center.
    fitted = ImageOps.fit(
        image,
        target_size,
        method=Image.LANCZOS,
        centering=(0.5, 0.5),
    )

    return fitted, out_width, out_height, ratio_label

def get_audio_duration(audio_path):
    """
    Gets the duration of an audio file and returns an updated slider value.

    The duration is rounded to the nearest 0.5 seconds and clamped to the
    1.0-12.0 second range of the "Video Duration" slider, so very short
    audio no longer produces a value below the slider's minimum.

    Args:
        audio_path: Path to the audio file, or None when the audio input
            was cleared.

    Returns:
        ``gr.update(value=...)`` with the suggested duration, or a no-op
        ``gr.update()`` when there is no audio or it cannot be read.
    """
    if audio_path is None:
        return gr.update()

    # Bounds of the duration slider this value is written into.
    min_seconds, max_seconds = 1.0, 12.0

    try:
        audio_clip = AudioFileClip(audio_path)
        try:
            duration = audio_clip.duration
        finally:
            # Always release the reader, even if reading the duration fails.
            audio_clip.close()

        rounded_duration = round(duration * 2) / 2  # Round to nearest 0.5
        clamped_duration = max(min_seconds, min(rounded_duration, max_seconds))

        return gr.update(value=clamped_duration)
    except Exception as e:
        # Best-effort UI helper: on any failure keep the slider unchanged.
        print(f"Error getting audio duration: {e}")
        return gr.update()

# --- 3. Inference Function ---
@spaces.GPU(duration=85, size='xlarge')
def generate(
    image_path, 
    audio_path, 
    prompt, 
    negative_prompt,
    video_duration,
    seed,
    progress=gr.Progress(track_tqdm=True)
):
    """
    Generate a video from an image and an audio file and attach the audio.

    Args:
        image_path: Path of the conditioning image.
        audio_path: Path of the driving audio file.
        prompt: Text prompt passed to the pipeline.
        negative_prompt: Negative prompt passed to the pipeline.
        video_duration: Requested video length in seconds.
        seed: Seed for the random generator; -1 or None picks a random seed.
        progress: Gradio progress tracker (tracks tqdm progress bars).

    Returns:
        Tuple ``(output_video_path, seed)`` where ``seed`` is the seed that
        was actually used.

    Raises:
        gr.Error: If the image or the audio file is missing.
    """
    if not image_path or not audio_path:
        raise gr.Error("Please provide both an image and an audio file.")

    # Set reproducibility. A cleared seed field arrives as None; treat it like
    # -1 instead of failing inside manual_seed().
    if seed is None or seed == -1:
        seed = random.randint(0, 1000000)
    seed = int(seed)  # manual_seed() requires an int
    generator = torch.Generator(device="cuda").manual_seed(seed)
    
    # 1. Load and Preprocess Image (auto-detect aspect ratio)
    original_image = load_image(image_path)
    image, width, height, detected_ratio = process_image_for_aspect_ratio(original_image)

    print(f"Generating with seed: {seed}, Resolution: {width}x{height} ({detected_ratio}), Duration: {video_duration}s")

    # 2. Calculate Frames
    fps = 24.0
    # LTX-2 constraint: (num_frames - 1) % 8 == 0
    total_frames = int(video_duration * fps)
    
    # Round to nearest valid block of 8, plus 1
    # Example: 4 seconds * 24 = 96 frames. 
    # 96 is divisible by 8. So we take 96 + 1 = 97 frames.
    # Note: round() sends exact halves to the even multiple, e.g.
    # 36 frames -> 32 + 1 but 60 frames -> 64 + 1.
    base_block = round(total_frames / 8) * 8
    num_frames = base_block + 1
    
    # Ensure sane minimum
    num_frames = max(num_frames, 9)
    
    print(f"Calculated frames: {num_frames}")

    # 3. Run Inference
    # With return_dict=False the pipeline returns a 2-tuple; only the first
    # element (the video) is used here.
    video_output, _ = pipe(
        image=image,
        audio=audio_path,
        prompt=prompt,
        negative_prompt=negative_prompt, 
        width=width,
        height=height,
        num_frames=num_frames,
        frame_rate=fps,
        num_inference_steps=8, # Distilled uses 8 steps
        sigmas=DISTILLED_SIGMA_VALUES, 
        guidance_scale=1.0, 
        generator=generator,
        return_dict=False,
    )

    # 4. Post-process: Add audio
    output_video_path = save_video_with_audio(video_output, audio_path, fps=fps)
    
    return output_video_path, seed

# --- 4. Gradio Interface Definition ---

# Custom CSS: limits the main column to 800px wide and centers it.
css = """
#col-container { max-width: 800px; margin: 0 auto; }
"""

# Build the UI. Components are declared in layout order; event handlers are
# wired up after the layout, still inside the Blocks context.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# ⚡ LTX-2 Distilled Audio-to-Video")
        gr.Markdown("Generate lip-synced or audio-reactive video from a single image using the distilled 8-step LTX-2 model.")

        with gr.Row():
            with gr.Column():
                # type="filepath": the handlers receive file paths, not data.
                input_image = gr.Image(label="Input Image", type="filepath", height=300)
                input_audio = gr.Audio(label="Input Audio", type="filepath")
            
            with gr.Column():
                result_video = gr.Video(label="Generated Video")
        
        prompt = gr.Textbox(
            label="Prompt", 
            value="A person speaking, lips moving in sync with the words, talking head",
            lines=2
        )
        
        with gr.Row():
            # Range 1-12 s in 0.5 s steps; get_audio_duration() writes a
            # suggested value into this slider.
            video_duration = gr.Slider(
                label="Video Duration (Seconds)",
                minimum=1.0,
                maximum=12.0,
                step=0.5,
                value=4.0,
            )

        with gr.Accordion("Advanced Settings", open=False):
            negative_prompt = gr.Textbox(
                label="Negative Prompt", 
                value="low quality, worst quality, deformed, distorted",
                placeholder="Usually ignored by distilled models with guidance 1.0"
            )
            # -1 makes generate() pick a random seed.
            seed = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)

        run_btn = gr.Button("Generate Video", variant="primary")
        
        # Output info
        # Hidden field that receives the second return value of generate().
        used_seed = gr.Number(label="Used Seed", visible=False)

    # Event Logic
    
    # Auto-update video duration when audio is uploaded
    input_audio.change(
        fn=get_audio_duration,
        inputs=[input_audio],
        outputs=[video_duration]
    )
    
    # The inputs are listed in the same order as generate()'s positional
    # parameters; the outputs match its (video path, seed) return tuple.
    run_btn.click(
        fn=generate,
        inputs=[
            input_image, 
            input_audio, 
            prompt, 
            negative_prompt, 
            video_duration,
            seed
        ],
        outputs=[result_video, used_seed]
    )

# Launch the app (with the queue enabled) only when run as a script.
if __name__ == "__main__":
    demo.queue().launch()