# Hugging Face Space – Stable Video Diffusion image-to-video demo
"""Gradio demo: generate a short video from an image with Stable Video Diffusion."""
import os
from typing import Optional, Tuple

import gradio as gr
import torch
from diffusers import StableVideoDiffusionPipeline
from PIL import Image

# -------------------------
# Load SVD model once
# -------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Weights download on first run; HF Spaces caches them for later runs.
SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt"

print(f"Loading SVD pipeline '{SVD_MODEL_ID}' on {DEVICE}...")
pipe = StableVideoDiffusionPipeline.from_pretrained(
    SVD_MODEL_ID,
    # fp16 halves VRAM on GPU; CPU inference needs fp32.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)
if DEVICE == "cuda":
    # Model CPU offload manages device placement itself; calling
    # .to("cuda") first defeats the offload hooks and wastes VRAM,
    # so use one or the other — never both.
    pipe.enable_model_cpu_offload()
else:
    pipe = pipe.to(DEVICE)
# -------------------------
# Core processing
# -------------------------
def _prepare_image(image: Image.Image, target_w: int, target_h: int) -> Image.Image:
    """Resize *image* preserving aspect ratio, then center-crop to target_w x target_h.

    (The original code stretched the image despite claiming to crop; this
    implements the stated intent so faces/objects are not distorted.)
    """
    img = image.convert("RGB")
    # Scale so the image covers the target in both dimensions.
    scale = max(target_w / img.width, target_h / img.height)
    img = img.resize((round(img.width * scale), round(img.height * scale)), Image.BICUBIC)
    left = (img.width - target_w) // 2
    top = (img.height - target_h) // 2
    return img.crop((left, top, left + target_w, top + target_h))


def _write_mp4(frames, fps: int) -> str:
    """Encode a list of PIL frames into an H.264 mp4 file; return its path."""
    # Imported lazily, matching the original: keeps module import light.
    import ffmpeg
    import numpy as np
    import tempfile

    np_frames = [np.asarray(f, dtype=np.uint8) for f in frames]
    height, width, _ = np_frames[0].shape

    # Close the handle immediately so ffmpeg can (re)open the path,
    # and so we don't leak an open file descriptor.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_out = tmp.name

    proc = (
        ffmpeg
        .input(
            "pipe:",
            format="rawvideo",
            pix_fmt="rgb24",
            s=f"{width}x{height}",
            r=fps,
        )
        .output(
            video_out,
            vcodec="libx264",
            pix_fmt="yuv420p",  # broadest player compatibility
            r=fps,
            loglevel="error",
        )
        .overwrite_output()
        .run_async(pipe_stdin=True)
    )
    try:
        for frame in np_frames:
            proc.stdin.write(frame.tobytes())
    finally:
        # Always close stdin and reap the process, even if a write fails,
        # so we never leave a zombie ffmpeg behind.
        proc.stdin.close()
        proc.wait()
    return video_out


def process_image(image: Optional[Image.Image], command: str) -> Tuple[Optional[str], str]:
    """Generate a short video from a single image using Stable Video Diffusion.

    Args:
        image: uploaded image, or ``None`` if the user submitted nothing.
        command: free-text command. NOTE(review): SVD is image-conditioned
            only — the text is echoed into the logs but does not steer
            generation.

    Returns:
        Tuple of (path to the generated mp4, or ``None`` on missing input;
        human-readable log string).
    """
    if image is None:
        return None, "No image uploaded."

    command = (command or "").strip() or "a short smooth camera motion"

    # Single source of truth for the video parameters (fps was previously
    # hard-coded in four separate places).
    fps = 7
    num_frames = 25
    target_w, target_h = 1024, 576  # SVD-xt native resolution (W x H)

    img = _prepare_image(image, target_w, target_h)

    # Inference
    with torch.no_grad():
        result = pipe(
            img,
            num_frames=num_frames,       # length of video
            decode_chunk_size=8,         # lower VRAM use when decoding
            fps=fps,
            generator=torch.Generator(device=DEVICE).manual_seed(42),  # reproducible
        )
    frames = result.frames[0]  # .frames is batch x frames; batch size is 1

    video_out = _write_mp4(frames, fps)
    logs = f"Generated video with {len(frames)} frames at {fps} fps using command: {command}"
    return video_out, logs
# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # DeepSite – SVD Image → Video
        Upload an image and enter a command (prompt).
        The app generates a short video using Stable Video Diffusion.
        """
    )
    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            image_in = gr.Image(label="Upload image", type="pil")
            command_in = gr.Textbox(
                label="Command / Prompt",
                placeholder="e.g. 'slow zoom-out', 'pan to the right', etc.",
                lines=2,
            )
            generate_btn = gr.Button("Generate Video")
        # Right column: generated video plus a read-only log pane.
        with gr.Column():
            video_out = gr.Video(label="Output video")
            info_out = gr.Textbox(label="Logs / Info", lines=6, interactive=False)

    generate_btn.click(
        fn=process_image,
        inputs=[image_in, command_in],
        outputs=[video_out, info_out],
    )

if __name__ == "__main__":
    # Bind to all interfaces; honour the PORT env var (HF Spaces sets it).
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))