File size: 4,120 Bytes
1a27df3
1278826
1a27df3
cc06cda
1a27df3
 
1278826
 
1a27df3
1278826
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a27df3
1278826
 
 
 
 
1a27df3
 
 
 
 
 
1278826
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a27df3
1278826
 
 
 
 
 
 
 
 
 
1a27df3
1278826
 
 
1a27df3
cc06cda
1278826
 
 
 
 
cc06cda
1278826
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc06cda
 
b35c850
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
from typing import Optional, Tuple

import gradio as gr
from PIL import Image

import torch
from diffusers import StableVideoDiffusionPipeline

# -------------------------
# Load SVD model once
# -------------------------

# Module-level, so the (large) pipeline is loaded exactly once per process.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# You can cache this in HF Spaces; first run will download weights
SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt"

print(f"Loading SVD pipeline '{SVD_MODEL_ID}' on {DEVICE}...")
pipe = StableVideoDiffusionPipeline.from_pretrained(
    SVD_MODEL_ID,
    # fp16 halves VRAM on GPU; CPU kernels generally require fp32.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)

if DEVICE == "cuda":
    # enable_model_cpu_offload() manages device placement itself; the
    # diffusers docs advise NOT calling .to("cuda") first, so on GPU we
    # let the offload hook own placement instead of moving the pipe.
    pipe.enable_model_cpu_offload()
else:
    pipe = pipe.to(DEVICE)

# -------------------------
# Core processing
# -------------------------

def _prepare_image(image: Image.Image, target_w: int, target_h: int) -> Image.Image:
    """Resize *image* to cover (target_w, target_h) preserving aspect ratio, then center-crop."""
    img = image.convert("RGB")
    # Scale so the image fully covers the target box (no distortion).
    scale = max(target_w / img.width, target_h / img.height)
    new_w = max(target_w, round(img.width * scale))
    new_h = max(target_h, round(img.height * scale))
    img = img.resize((new_w, new_h), Image.BICUBIC)
    # Center-crop the excess along the longer dimension.
    left = (new_w - target_w) // 2
    top = (new_h - target_h) // 2
    return img.crop((left, top, left + target_w, top + target_h))


def _write_video(frames, fps: int) -> str:
    """Encode a list of PIL frames into an H.264 mp4 via ffmpeg; return the file path."""
    import ffmpeg
    import numpy as np
    import tempfile

    # Convert PIL frames to contiguous uint8 RGB arrays for the rawvideo pipe.
    np_frames = [np.asarray(f, dtype=np.uint8) for f in frames]
    height, width, _ = np_frames[0].shape

    # delete=False so Gradio can serve the file after we return; close the
    # handle immediately so the descriptor is not leaked.
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    video_out = tmp.name
    tmp.close()

    process = (
        ffmpeg
        .input(
            "pipe:",
            format="rawvideo",
            pix_fmt="rgb24",
            s=f"{width}x{height}",
            r=fps,
        )
        .output(
            video_out,
            vcodec="libx264",
            pix_fmt="yuv420p",  # broadest player compatibility
            r=fps,
            loglevel="error",
        )
        .overwrite_output()
        .run_async(pipe_stdin=True)
    )

    # Always close stdin and reap the child, even if a write fails —
    # otherwise ffmpeg blocks forever waiting for more input.
    try:
        for frame in np_frames:
            process.stdin.write(frame.tobytes())
    finally:
        process.stdin.close()
        process.wait()

    return video_out


def process_image(image: Optional[Image.Image], command: str) -> Tuple[Optional[str], str]:
    """
    Generate a short video from a single image using Stable Video Diffusion.

    Args:
        image: Source PIL image; ``None`` returns an error message instead.
        command: Free-text prompt. NOTE(review): SVD is image-conditioned
            only — the pipeline call below does not consume this text; it is
            merely echoed in the logs. Confirm whether prompt support is
            actually intended.

    Returns:
        ``(video_path, logs)``: path to the generated mp4 (``None`` when no
        image was supplied) and a human-readable status string.
    """
    if image is None:
        return None, "No image uploaded."

    command = (command or "").strip()
    if not command:
        command = "a short smooth camera motion"  # default prompt

    # SVD-XT was trained at 576x1024; resize keeping aspect ratio, then
    # center-crop (as opposed to a distorting direct resize).
    target_h, target_w = 576, 1024
    num_frames = 25  # length of video
    fps = 7          # frames per second (single source of truth)

    img = _prepare_image(image, target_w, target_h)

    # Inference
    with torch.no_grad():
        video_frames = pipe(
            img,
            num_frames=num_frames,
            decode_chunk_size=8,  # lower = less VRAM, slower decode
            fps=fps,
            # Fixed seed for reproducible output across runs.
            generator=torch.Generator(device=DEVICE).manual_seed(42),
        ).frames  # List[List[PIL.Image]]: batch x frames

    frames = video_frames[0]
    video_out = _write_video(frames, fps)

    logs = f"Generated video with {len(frames)} frames at {fps} fps using command: {command}"
    return video_out, logs

# -------------------------
# Gradio UI
# -------------------------

# Declarative layout: two columns (inputs left, outputs right) with a single
# button wired to process_image.
with gr.Blocks() as demo:
    # Page header / usage instructions.
    gr.Markdown(
        """
        # DeepSite – SVD Image → Video

        Upload an image and enter a command (prompt).
        The app generates a short video using Stable Video Diffusion.
        """
    )

    with gr.Row():
        # Left column: user inputs.
        with gr.Column():
            input_image = gr.Image(
                label="Upload image",
                # Deliver a PIL.Image to process_image (not a numpy array).
                type="pil",
            )
            # NOTE(review): SVD does not consume a text prompt; this value is
            # only echoed in the logs by process_image — confirm intent.
            input_command = gr.Textbox(
                label="Command / Prompt",
                placeholder="e.g. 'slow zoom-out', 'pan to the right', etc.",
                lines=2,
            )
            run_btn = gr.Button("Generate Video")

        # Right column: generated video and status text.
        with gr.Column():
            output_video = gr.Video(label="Output video")
            output_text = gr.Textbox(label="Logs / Info", lines=6, interactive=False)

    # Wire the button: (image, command) -> (video path, log string).
    run_btn.click(
        fn=process_image,
        inputs=[input_image, input_command],
        outputs=[output_video, output_text],
    )

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside a container;
    # the port comes from the PORT env var (HF Spaces sets it), default 7860.
    port = int(os.getenv("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=port)