# Source: Boka73 — "Update app.py" (1278826 verified)
import os
from typing import Optional, Tuple
import gradio as gr
from PIL import Image
import torch
from diffusers import StableVideoDiffusionPipeline
# -------------------------
# Load SVD model once
# -------------------------
# Prefer GPU when available; CPU works but is very slow and needs fp32.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Weights are downloaded on first run and cached (e.g. in HF Spaces).
SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt"

print(f"Loading SVD pipeline '{SVD_MODEL_ID}' on {DEVICE}...")
pipe = StableVideoDiffusionPipeline.from_pretrained(
    SVD_MODEL_ID,
    # fp16 halves VRAM on GPU; CPU inference requires fp32.
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)
if DEVICE == "cuda":
    # Model CPU offload manages device placement itself, moving submodules
    # onto the GPU only while they run.  Do NOT also call pipe.to("cuda")
    # first — that loads the whole pipeline into VRAM and defeats the
    # memory savings the offload is meant to provide.
    pipe.enable_model_cpu_offload()
else:
    pipe = pipe.to(DEVICE)
# -------------------------
# Core processing
# -------------------------
def _prepare_image(image: Image.Image, target_w: int, target_h: int) -> Image.Image:
    """Resize *image* to cover (target_w, target_h) keeping aspect ratio, then center-crop.

    The original code resized directly to the target size, which distorts
    non-16:9 inputs; SVD quality degrades on stretched conditioning images.
    """
    img = image.convert("RGB")
    # Scale so the image fully covers the target rectangle.
    scale = max(target_w / img.width, target_h / img.height)
    new_w = max(target_w, round(img.width * scale))
    new_h = max(target_h, round(img.height * scale))
    img = img.resize((new_w, new_h), Image.BICUBIC)
    # Center-crop the overshoot on the longer axis.
    left = (new_w - target_w) // 2
    top = (new_h - target_h) // 2
    return img.crop((left, top, left + target_w, top + target_h))


def _write_video(frames, fps: int) -> str:
    """Encode a list of PIL frames into an H.264 mp4 and return its path.

    Streams raw RGB frames into ffmpeg via stdin; the output file is a
    NamedTemporaryFile path that the caller (Gradio) is responsible for.
    """
    # Local imports: third-party deps only needed for this code path.
    import ffmpeg
    import numpy as np
    import tempfile

    np_frames = [np.asarray(f, dtype=np.uint8) for f in frames]
    height, width, _ = np_frames[0].shape

    # Close the handle immediately (fixes a file-handle leak in the original);
    # ffmpeg writes to the path, we only need the name.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_out = tmp.name

    process = (
        ffmpeg
        .input(
            "pipe:",
            format="rawvideo",
            pix_fmt="rgb24",
            s=f"{width}x{height}",
            r=fps,
        )
        .output(
            video_out,
            vcodec="libx264",
            pix_fmt="yuv420p",  # widest player compatibility
            r=fps,
            loglevel="error",
        )
        .overwrite_output()
        .run_async(pipe_stdin=True)
    )
    try:
        for frame in np_frames:
            process.stdin.write(frame.tobytes())
    finally:
        # Always close stdin and reap the child, even if a write fails,
        # so a broken pipe does not leave a zombie ffmpeg process.
        process.stdin.close()
        process.wait()
    return video_out


def process_image(
    image: Optional[Image.Image],
    command: str,
    num_frames: int = 25,
    fps: int = 7,
    seed: int = 42,
) -> Tuple[Optional[str], str]:
    """
    Generate a short video from a single image using Stable Video Diffusion.

    Args:
        image: conditioning image, or None (returns an error message).
        command: user prompt; NOTE(review): SVD is image-only conditioned,
            so this text is currently only echoed in the logs.
        num_frames: number of frames to generate (default 25, as before).
        fps: frames per second for both generation and encoding (default 7).
        seed: RNG seed for reproducible output (default 42).

    Returns:
        (video_path, logs): path to the generated mp4 (or None on error)
        and a human-readable status string.
    """
    if image is None:
        return None, "No image uploaded."

    command = (command or "").strip()
    if not command:
        command = "a short smooth camera motion"  # default prompt

    # SVD-XT expects 1024x576 conditioning images.
    target_h, target_w = 576, 1024
    img = _prepare_image(image, target_w, target_h)

    # Inference
    with torch.no_grad():
        video_frames = pipe(
            img,
            num_frames=num_frames,
            decode_chunk_size=8,  # trade VRAM for speed when decoding latents
            fps=fps,
            generator=torch.Generator(device=DEVICE).manual_seed(seed),
        ).frames  # List[List[PIL.Image]]: batch x frames
    frames = video_frames[0]

    video_out = _write_video(frames, fps)
    logs = f"Generated video with {len(frames)} frames at {fps} fps using command: {command}"
    return video_out, logs
# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks() as demo:
    # Page header / usage instructions.
    gr.Markdown(
        """
        # DeepSite – SVD Image → Video
        Upload an image and enter a command (prompt).
        The app generates a short video using Stable Video Diffusion.
        """
    )
    with gr.Row():
        # Left column: inputs plus the trigger button.
        with gr.Column():
            img_in = gr.Image(label="Upload image", type="pil")
            cmd_in = gr.Textbox(
                label="Command / Prompt",
                placeholder="e.g. 'slow zoom-out', 'pan to the right', etc.",
                lines=2,
            )
            generate_btn = gr.Button("Generate Video")
        # Right column: generated video and a read-only log panel.
        with gr.Column():
            video_out = gr.Video(label="Output video")
            log_out = gr.Textbox(label="Logs / Info", lines=6, interactive=False)
    # Wire the button to the SVD generation function.
    generate_btn.click(
        fn=process_image,
        inputs=[img_in, cmd_in],
        outputs=[video_out, log_out],
    )
if __name__ == "__main__":
    # Bind on all interfaces; HF Spaces injects PORT, 7860 is the local default.
    port = int(os.environ.get("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=port)