import os
import tempfile
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import torch
from PIL import Image
from diffusers import StableVideoDiffusionPipeline

# -------------------------
# Load SVD model once
# -------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# You can cache this in HF Spaces; first run will download weights
SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt"

# Output geometry / timing the pipeline is conditioned on.
TARGET_H, TARGET_W = 576, 1024
NUM_FRAMES = 25
FPS = 7

print(f"Loading SVD pipeline '{SVD_MODEL_ID}' on {DEVICE}...")
pipe = StableVideoDiffusionPipeline.from_pretrained(
    SVD_MODEL_ID,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)
if DEVICE == "cuda":
    # Per diffusers docs, model CPU offload manages GPU placement itself;
    # do NOT also call pipe.to("cuda") or the memory savings are lost.
    pipe.enable_model_cpu_offload()
else:
    pipe = pipe.to(DEVICE)


def _prepare_image(image: Image.Image) -> Image.Image:
    """Resize *image* to TARGET_W x TARGET_H preserving aspect ratio.

    Scales so the image covers the target box, then center-crops the
    excess — no distortion, unlike a naive resize.
    """
    img = image.convert("RGB")
    scale = max(TARGET_W / img.width, TARGET_H / img.height)
    new_w = round(img.width * scale)
    new_h = round(img.height * scale)
    img = img.resize((new_w, new_h), Image.Resampling.BICUBIC)
    left = (new_w - TARGET_W) // 2
    top = (new_h - TARGET_H) // 2
    return img.crop((left, top, left + TARGET_W, top + TARGET_H))


def _encode_video(frames: list) -> str:
    """Encode a list of PIL frames into an H.264 mp4 via ffmpeg-python.

    Returns the path of the written file.

    Raises:
        RuntimeError: if the ffmpeg subprocess exits non-zero.
    """
    # Local import keeps ffmpeg-python optional until a video is generated.
    import ffmpeg

    np_frames = [np.array(f, dtype=np.uint8) for f in frames]
    height, width, _ = np_frames[0].shape

    # Create the output path, closing the handle so ffmpeg can open it
    # (required on Windows; avoids a leaked fd everywhere else).
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_out = tmp.name

    process = (
        ffmpeg
        .input(
            "pipe:",
            format="rawvideo",
            pix_fmt="rgb24",
            s=f"{width}x{height}",
            r=FPS,
        )
        .output(
            video_out,
            vcodec="libx264",
            pix_fmt="yuv420p",
            r=FPS,
            loglevel="error",
        )
        .overwrite_output()
        .run_async(pipe_stdin=True)
    )
    try:
        for frame in np_frames:
            process.stdin.write(frame.tobytes())
    finally:
        process.stdin.close()
    if process.wait() != 0:
        raise RuntimeError("ffmpeg failed to encode the video")
    return video_out


# -------------------------
# Core processing
# -------------------------
def process_image(image: Optional[Image.Image], command: str) -> Tuple[Optional[str], str]:
    """
    Generate a short video from a single image using Stable Video Diffusion.

    NOTE: SVD is image-conditioned only — *command* is NOT consumed by the
    pipeline; it is echoed back in the log text for user feedback.

    Returns:
        video_path: path to generated mp4 video (None if no image given)
        logs: textual info
    """
    if image is None:
        return None, "No image uploaded."

    command = (command or "").strip()
    if not command:
        command = "a short smooth camera motion"  # default prompt

    # SVD expects a fixed input size; resize keeping aspect, then center-crop.
    img = _prepare_image(image)

    # Inference — fixed seed for reproducible output.
    with torch.no_grad():
        video_frames = pipe(
            img,
            num_frames=NUM_FRAMES,        # length of video
            decode_chunk_size=8,          # trade VRAM for speed during decode
            fps=FPS,                      # fps conditioning signal
            generator=torch.Generator(device=DEVICE).manual_seed(42),
        ).frames  # List[List[PIL.Image]]: batch x frames
    frames = video_frames[0]

    video_out = _encode_video(frames)
    logs = f"Generated video with {len(frames)} frames at {FPS} fps using command: {command}"
    return video_out, logs


# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # DeepSite – SVD Image → Video
        Upload an image and enter a command (prompt).
        The app generates a short video using Stable Video Diffusion.
        """
    )
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(
                label="Upload image",
                type="pil",
            )
            input_command = gr.Textbox(
                label="Command / Prompt",
                placeholder="e.g. 'slow zoom-out', 'pan to the right', etc.",
                lines=2,
            )
            run_btn = gr.Button("Generate Video")
        with gr.Column():
            output_video = gr.Video(label="Output video")
            output_text = gr.Textbox(label="Logs / Info", lines=6, interactive=False)

    run_btn.click(
        fn=process_image,
        inputs=[input_image, input_command],
        outputs=[output_video, output_text],
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))