import os
import tempfile
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import torch
from PIL import Image
from diffusers import StableVideoDiffusionPipeline

# -------------------------
# Load SVD model once
# -------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# You can cache this in HF Spaces; first run will download weights
SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt"

# Output geometry / timing the pipeline is conditioned on.
TARGET_H, TARGET_W = 576, 1024
NUM_FRAMES = 25
FPS = 7

print(f"Loading SVD pipeline '{SVD_MODEL_ID}' on {DEVICE}...")
pipe = StableVideoDiffusionPipeline.from_pretrained(
    SVD_MODEL_ID,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)
if DEVICE == "cuda":
    # Per diffusers docs, model CPU offload manages GPU placement itself;
    # do NOT also call pipe.to("cuda") or the memory savings are lost.
    pipe.enable_model_cpu_offload()
else:
    pipe = pipe.to(DEVICE)


def _prepare_image(image: Image.Image) -> Image.Image:
    """Resize *image* to TARGET_W x TARGET_H preserving aspect ratio.

    Scales so the image covers the target box, then center-crops the
    excess — no distortion, unlike a naive resize.
    """
    img = image.convert("RGB")
    scale = max(TARGET_W / img.width, TARGET_H / img.height)
    new_w = round(img.width * scale)
    new_h = round(img.height * scale)
    img = img.resize((new_w, new_h), Image.Resampling.BICUBIC)
    left = (new_w - TARGET_W) // 2
    top = (new_h - TARGET_H) // 2
    return img.crop((left, top, left + TARGET_W, top + TARGET_H))


def _encode_video(frames: list) -> str:
    """Encode a list of PIL frames into an H.264 mp4 via ffmpeg-python.

    Returns the path of the written file.

    Raises:
        RuntimeError: if the ffmpeg subprocess exits non-zero.
    """
    # Local import keeps ffmpeg-python optional until a video is generated.
    import ffmpeg

    np_frames = [np.array(f, dtype=np.uint8) for f in frames]
    height, width, _ = np_frames[0].shape

    # Create the output path, closing the handle so ffmpeg can open it
    # (required on Windows; avoids a leaked fd everywhere else).
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_out = tmp.name

    process = (
        ffmpeg
        .input(
            "pipe:",
            format="rawvideo",
            pix_fmt="rgb24",
            s=f"{width}x{height}",
            r=FPS,
        )
        .output(
            video_out,
            vcodec="libx264",
            pix_fmt="yuv420p",
            r=FPS,
            loglevel="error",
        )
        .overwrite_output()
        .run_async(pipe_stdin=True)
    )
    try:
        for frame in np_frames:
            process.stdin.write(frame.tobytes())
    finally:
        process.stdin.close()
    if process.wait() != 0:
        raise RuntimeError("ffmpeg failed to encode the video")
    return video_out


# -------------------------
# Core processing
# -------------------------
def process_image(image: Optional[Image.Image], command: str) -> Tuple[Optional[str], str]:
    """
    Generate a short video from a single image using Stable Video Diffusion.

    NOTE: SVD is image-conditioned only — *command* is NOT consumed by the
    pipeline; it is echoed back in the log text for user feedback.

    Returns:
        video_path: path to generated mp4 video (None if no image given)
        logs: textual info
    """
    if image is None:
        return None, "No image uploaded."

    command = (command or "").strip()
    if not command:
        command = "a short smooth camera motion"  # default prompt

    # SVD expects a fixed input size; resize keeping aspect, then center-crop.
    img = _prepare_image(image)

    # Inference — fixed seed for reproducible output.
    with torch.no_grad():
        video_frames = pipe(
            img,
            num_frames=NUM_FRAMES,        # length of video
            decode_chunk_size=8,          # trade VRAM for speed during decode
            fps=FPS,                      # fps conditioning signal
            generator=torch.Generator(device=DEVICE).manual_seed(42),
        ).frames  # List[List[PIL.Image]]: batch x frames
    frames = video_frames[0]

    video_out = _encode_video(frames)
    logs = f"Generated video with {len(frames)} frames at {FPS} fps using command: {command}"
    return video_out, logs


# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # DeepSite – SVD Image → Video
        Upload an image and enter a command (prompt).
        The app generates a short video using Stable Video Diffusion.
        """
    )
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(
                label="Upload image",
                type="pil",
            )
            input_command = gr.Textbox(
                label="Command / Prompt",
                placeholder="e.g. 'slow zoom-out', 'pan to the right', etc.",
                lines=2,
            )
            run_btn = gr.Button("Generate Video")
        with gr.Column():
            output_video = gr.Video(label="Output video")
            output_text = gr.Textbox(label="Logs / Info", lines=6, interactive=False)

    run_btn.click(
        fn=process_image,
        inputs=[input_image, input_command],
        outputs=[output_video, output_text],
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))