# Source: Boka73 — "Update app.py" (1278826 verified)
import os
from typing import Optional, Tuple
import gradio as gr
from PIL import Image
import torch
from diffusers import StableVideoDiffusionPipeline
# -------------------------
# Load SVD model once
# -------------------------
# Prefer GPU when available; CPU works but is very slow and needs fp32.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Weights are downloaded on first run and cached (e.g. in HF Spaces).
SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt"

print(f"Loading SVD pipeline '{SVD_MODEL_ID}' on {DEVICE}...")
pipe = StableVideoDiffusionPipeline.from_pretrained(
    SVD_MODEL_ID,
    # fp16 halves VRAM on GPU; CPU inference requires fp32.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)
if DEVICE == "cuda":
    # Model CPU offload manages device placement itself, moving submodules
    # onto the GPU only while they run.  Do NOT also call pipe.to("cuda")
    # first — that loads the whole pipeline into VRAM and defeats the
    # memory savings the offload is meant to provide.
    pipe.enable_model_cpu_offload()
else:
    pipe = pipe.to(DEVICE)
# -------------------------
# Core processing
# -------------------------
def _prepare_image(image: Image.Image, target_w: int, target_h: int) -> Image.Image:
    """Resize *image* to cover (target_w, target_h) keeping aspect ratio, then center-crop.

    The original code resized directly to the target size, which distorts
    non-16:9 inputs; SVD quality degrades on stretched conditioning images.
    """
    img = image.convert("RGB")
    # Scale so the image fully covers the target rectangle.
    scale = max(target_w / img.width, target_h / img.height)
    new_w = max(target_w, round(img.width * scale))
    new_h = max(target_h, round(img.height * scale))
    img = img.resize((new_w, new_h), Image.BICUBIC)
    # Center-crop the overshoot on the longer axis.
    left = (new_w - target_w) // 2
    top = (new_h - target_h) // 2
    return img.crop((left, top, left + target_w, top + target_h))


def _write_video(frames, fps: int) -> str:
    """Encode a list of PIL frames into an H.264 mp4 and return its path.

    Streams raw RGB frames into ffmpeg via stdin; the output file is a
    NamedTemporaryFile path that the caller (Gradio) is responsible for.
    """
    # Local imports: third-party deps only needed for this code path.
    import ffmpeg
    import numpy as np
    import tempfile

    np_frames = [np.asarray(f, dtype=np.uint8) for f in frames]
    height, width, _ = np_frames[0].shape

    # Close the handle immediately (fixes a file-handle leak in the original);
    # ffmpeg writes to the path, we only need the name.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_out = tmp.name

    process = (
        ffmpeg
        .input(
            "pipe:",
            format="rawvideo",
            pix_fmt="rgb24",
            s=f"{width}x{height}",
            r=fps,
        )
        .output(
            video_out,
            vcodec="libx264",
            pix_fmt="yuv420p",  # widest player compatibility
            r=fps,
            loglevel="error",
        )
        .overwrite_output()
        .run_async(pipe_stdin=True)
    )
    try:
        for frame in np_frames:
            process.stdin.write(frame.tobytes())
    finally:
        # Always close stdin and reap the child, even if a write fails,
        # so a broken pipe does not leave a zombie ffmpeg process.
        process.stdin.close()
        process.wait()
    return video_out


def process_image(
    image: Optional[Image.Image],
    command: str,
    num_frames: int = 25,
    fps: int = 7,
    seed: int = 42,
) -> Tuple[Optional[str], str]:
    """
    Generate a short video from a single image using Stable Video Diffusion.

    Args:
        image: conditioning image, or None (returns an error message).
        command: user prompt; NOTE(review): SVD is image-only conditioned,
            so this text is currently only echoed in the logs.
        num_frames: number of frames to generate (default 25, as before).
        fps: frames per second for both generation and encoding (default 7).
        seed: RNG seed for reproducible output (default 42).

    Returns:
        (video_path, logs): path to the generated mp4 (or None on error)
        and a human-readable status string.
    """
    if image is None:
        return None, "No image uploaded."

    command = (command or "").strip()
    if not command:
        command = "a short smooth camera motion"  # default prompt

    # SVD-XT expects 1024x576 conditioning images.
    target_h, target_w = 576, 1024
    img = _prepare_image(image, target_w, target_h)

    # Inference
    with torch.no_grad():
        video_frames = pipe(
            img,
            num_frames=num_frames,
            decode_chunk_size=8,  # trade VRAM for speed when decoding latents
            fps=fps,
            generator=torch.Generator(device=DEVICE).manual_seed(seed),
        ).frames  # List[List[PIL.Image]]: batch x frames
    frames = video_frames[0]

    video_out = _write_video(frames, fps)
    logs = f"Generated video with {len(frames)} frames at {fps} fps using command: {command}"
    return video_out, logs
# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks() as demo:
    # Page header / usage instructions.
    gr.Markdown(
        """
        # DeepSite – SVD Image → Video
        Upload an image and enter a command (prompt).
        The app generates a short video using Stable Video Diffusion.
        """
    )
    with gr.Row():
        # Left column: inputs plus the trigger button.
        with gr.Column():
            img_in = gr.Image(label="Upload image", type="pil")
            cmd_in = gr.Textbox(
                label="Command / Prompt",
                placeholder="e.g. 'slow zoom-out', 'pan to the right', etc.",
                lines=2,
            )
            generate_btn = gr.Button("Generate Video")
        # Right column: generated video and a read-only log panel.
        with gr.Column():
            video_out = gr.Video(label="Output video")
            log_out = gr.Textbox(label="Logs / Info", lines=6, interactive=False)
    # Wire the button to the SVD generation function.
    generate_btn.click(
        fn=process_image,
        inputs=[img_in, cmd_in],
        outputs=[video_out, log_out],
    )
if __name__ == "__main__":
    # Bind on all interfaces; HF Spaces injects PORT, 7860 is the local default.
    port = int(os.environ.get("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=port)