import os
import tempfile
from typing import Optional, Tuple

import ffmpeg
import gradio as gr
import numpy as np
import torch
from PIL import Image
from diffusers import StableVideoDiffusionPipeline
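# Note: ffmpeg-python is only a thin wrapper around the ffmpeg binary. On a
# Hugging Face Space the binary has to be installed separately, e.g. by
# listing "ffmpeg" in the Space's packages.txt (an assumption about this
# deployment, not something this file enforces).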
# -------------------------
# Load SVD model once
# -------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# You can cache this in HF Spaces; first run will download weights
SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt"
print(f"Loading SVD pipeline '{SVD_MODEL_ID}' on {DEVICE}...")
pipe = StableVideoDiffusionPipeline.from_pretrained(
    SVD_MODEL_ID,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)
if DEVICE == "cuda":
    # Offload moves submodules to the GPU on demand; calling .to("cuda")
    # as well would defeat the memory savings.
    pipe.enable_model_cpu_offload()
else:
    pipe = pipe.to(DEVICE)
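# Optional: on a Space with persistent storage, pointing the Hugging Face
# cache at the persistent disk (e.g. HF_HOME=/data/.huggingface in the Space
# settings) avoids re-downloading the multi-gigabyte weights on every
# restart. The /data mount assumes persistent storage is enabled.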
# -------------------------
# Core processing
# -------------------------
def process_image(image: Optional[Image.Image], command: str) -> Tuple[Optional[str], str]:
    """
    Generate a short video from a single image using Stable Video Diffusion.

    Note: SVD conditions only on the input image; the text command is echoed
    in the logs but does not steer the generation.

    Returns:
        video_path: path to the generated mp4 video
        logs: textual info
    """
    if image is None:
        return None, "No image uploaded."
    command = (command or "").strip()
    if not command:
        command = "a short smooth camera motion"  # default label for the logs
    # Preprocess: SVD-XT was trained at 1024x576. Scale the image so it
    # covers the target resolution (preserving aspect ratio), then center-crop.
    target_w, target_h = 1024, 576
    img = image.convert("RGB")
    scale = max(target_w / img.width, target_h / img.height)
    img = img.resize((round(img.width * scale), round(img.height * scale)), Image.BICUBIC)
    left = (img.width - target_w) // 2
    top = (img.height - target_h) // 2
    img = img.crop((left, top, left + target_w, top + target_h))
    # Inference
    with torch.no_grad():
        video_frames = pipe(
            img,
            num_frames=25,        # length of the clip in frames
            decode_chunk_size=8,  # decode latents in chunks to limit VRAM use
            fps=7,                # frames-per-second micro-conditioning
            generator=torch.Generator(device=DEVICE).manual_seed(42),
        ).frames  # List[List[PIL.Image]]: batch x frames
    frames = video_frames[0]
    # Encode the frames to an mp4 with ffmpeg-python (imported above)
    np_frames = [np.array(f) for f in frames]  # PIL -> HxWx3 uint8 arrays
    height, width, _ = np_frames[0].shape
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    video_out = tmp.name
    tmp.close()  # ffmpeg reopens the path itself
    process = (
        ffmpeg
        .input(
            "pipe:",
            format="rawvideo",
            pix_fmt="rgb24",
            s=f"{width}x{height}",
            r=7,
        )
        .output(
            video_out,
            vcodec="libx264",
            pix_fmt="yuv420p",
            r=7,
            loglevel="error",
        )
        .overwrite_output()
        .run_async(pipe_stdin=True)
    )
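    # The graph above compiles to roughly this command line:
    #   ffmpeg -f rawvideo -pix_fmt rgb24 -s {width}x{height} -r 7 -i pipe: \
    #          -vcodec libx264 -pix_fmt yuv420p -r 7 -loglevel error -y out.mp4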
    for frame in np_frames:
        process.stdin.write(frame.astype("uint8").tobytes())
    process.stdin.close()
    process.wait()
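    # Alternative (a sketch, assuming a recent diffusers version): the bundled
    # export helper writes the same frames without a manual ffmpeg graph:
    #   from diffusers.utils import export_to_video
    #   export_to_video(frames, video_out, fps=7)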
    logs = f"Generated video with {len(frames)} frames at 7 fps using command: {command}"
    return video_out, logs
# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # DeepSite – SVD Image → Video
        Upload an image and the app generates a short video with
        Stable Video Diffusion. SVD animates the image directly and takes no
        text prompt; anything typed in the command box is only recorded in
        the logs.
        """
    )
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(
                label="Upload image",
                type="pil",
            )
            input_command = gr.Textbox(
                label="Command / Note",
                placeholder="e.g. 'slow zoom-out', 'pan to the right', etc.",
                info="Logged for reference only; SVD does not condition on text.",
                lines=2,
            )
            run_btn = gr.Button("Generate Video")
        with gr.Column():
            output_video = gr.Video(label="Output video")
            output_text = gr.Textbox(label="Logs / Info", lines=6, interactive=False)

    run_btn.click(
        fn=process_image,
        inputs=[input_image, input_command],
        outputs=[output_video, output_text],
    )
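# For long generations on shared hardware it can help to enable Gradio's
# request queue before launching, e.g. demo.queue(max_size=8);
# max_size=8 is illustrative, not a required value.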
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))