# VOID-api / app.py — Hugging Face Space by Devteamdl
# (verified commit 8505af9: "Reduce GPU duration to 120s for compat with
#  non-PRO ZeroGPU quotas")
"""
VOID – Video Object and Interaction Deletion
Gradio demo for Hugging Face Spaces (ZeroGPU)
"""
import os
import sys
import tempfile
import numpy as np
import torch
import torch.nn.functional as F
import imageio
import mediapy as media
import spaces
import gradio as gr
from huggingface_hub import hf_hub_download, snapshot_download
from safetensors.torch import load_file
from diffusers import DDIMScheduler
from PIL import Image
# ── project imports ────────────────────────────────────────────────────────────
sys.path.insert(0, os.path.dirname(__file__))
from videox_fun.models import (
AutoencoderKLCogVideoX,
CogVideoXTransformer3DModel,
T5EncoderModel,
T5Tokenizer,
)
from videox_fun.pipeline import CogVideoXFunInpaintPipeline
from videox_fun.utils.fp8_optimization import convert_weight_dtype_wrapper
from videox_fun.utils.utils import temporal_padding
# ── constants ──────────────────────────────────────────────────────────────────
# Set these env vars in your HF Space settings, or hardcode once weights are public.
BASE_MODEL_ID = os.environ.get("BASE_MODEL_ID", "alibaba-pai/CogVideoX-Fun-V1.5-5b-InP")
VOID_MODEL_ID = os.environ.get("VOID_MODEL_ID", "your-hf-username/VOID")
VOID_CKPT_FILE = "void_pass1.safetensors"  # Pass-1 checkpoint filename inside VOID_MODEL_ID
SAMPLE_SIZE = (384, 672)  # H × W — all input videos and masks are resized to this
MAX_VID_LEN = 197  # hard cap on frames read from an uploaded video
TEMPORAL_WIN = 72  # number of frames the pipeline denoises in one run
FPS = 12  # frame rate of the saved output mp4
WEIGHT_DTYPE = torch.bfloat16  # compute dtype for vae / text encoder / transformer
# Negative prompt passed to every generation to steer away from common artifacts.
NEG_PROMPT = (
"The video is not of a high quality, it has a low resolution. "
"Watermark present in each frame. The background is solid. "
"Strange body and strange trajectory. Distortion."
)
# ── model loading (once at startup, lives in CPU RAM between GPU requests) ─────
print("Loading VOID pipeline …")
HF_TOKEN = os.environ.get("HF_TOKEN")  # needed while the weight repos are private
# Download base model to local cache (custom from_pretrained needs a local path)
base_model_path = snapshot_download(repo_id=BASE_MODEL_ID, token=HF_TOKEN)
# Weights are loaded as fp8 then the module is cast/viewed as bf16.
# NOTE(review): convert_weight_dtype_wrapper below presumably re-wraps the fp8
# weights for bf16 compute — confirm in videox_fun.utils.fp8_optimization.
transformer = CogVideoXTransformer3DModel.from_pretrained(
    base_model_path,
    subfolder="transformer",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float8_e4m3fn,  # qfloat8 to save VRAM
    use_vae_mask=True,
    stack_mask=False,
).to(WEIGHT_DTYPE)
# Load VOID Pass-1 checkpoint
ckpt_path = hf_hub_download(repo_id=VOID_MODEL_ID, filename=VOID_CKPT_FILE, token=HF_TOKEN)
state_dict = load_file(ckpt_path)
# Unwrap checkpoints that nest the weights under a "state_dict" key.
state_dict = state_dict.get("state_dict", state_dict)
# Adapt patch_embed channels if they differ (mask-conditioning channels added)
param_name = "patch_embed.proj.weight"
if state_dict[param_name].size(1) != transformer.state_dict()[param_name].size(1):
    # Copy only the first and last `feat_dim` input-channel groups from the
    # checkpoint; any middle channels keep the base model's initialization.
    feat_dim = 16 * 8  # latent_channels * feat_scale
    new_weight = transformer.state_dict()[param_name].clone()
    new_weight[:, :feat_dim] = state_dict[param_name][:, :feat_dim]
    new_weight[:, -feat_dim:] = state_dict[param_name][:, -feat_dim:]
    state_dict[param_name] = new_weight
# strict=False: tolerate keys that are missing from / extra in the checkpoint.
transformer.load_state_dict(state_dict, strict=False)
vae = AutoencoderKLCogVideoX.from_pretrained(
    base_model_path, subfolder="vae"
).to(WEIGHT_DTYPE)
tokenizer = T5Tokenizer.from_pretrained(base_model_path, subfolder="tokenizer")
text_encoder = T5EncoderModel.from_pretrained(
    base_model_path, subfolder="text_encoder", torch_dtype=WEIGHT_DTYPE
)
scheduler = DDIMScheduler.from_pretrained(base_model_path, subfolder="scheduler")
pipeline = CogVideoXFunInpaintPipeline(
    vae=vae,
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    transformer=transformer,
    scheduler=scheduler,
)
convert_weight_dtype_wrapper(transformer, WEIGHT_DTYPE)
# diffusers CPU offload: each sub-model is moved to GPU only while it runs,
# keeping peak VRAM low enough for ZeroGPU.
pipeline.enable_model_cpu_offload()
print("VOID pipeline ready.")
# ── helpers ────────────────────────────────────────────────────────────────────
def load_video_tensor(path: str) -> torch.Tensor:
    """Read a video file and return it as (1, C, T, H, W) float32 in [0, 1],
    truncated to MAX_VID_LEN frames and spatially resized to SAMPLE_SIZE."""
    raw = np.array(media.read_video(path))[:MAX_VID_LEN]   # (T, H, W, C) uint8
    vid = torch.from_numpy(raw).permute(3, 0, 1, 2)        # (C, T, H, W)
    vid = vid.float().div(255.0)
    # Area interpolation over the last two dims (C acts as batch, T as channels).
    vid = F.interpolate(vid, SAMPLE_SIZE, mode="area")
    return vid.unsqueeze(0)                                # (1, C, T, H, W)
def load_quadmask_tensor(path: str) -> torch.Tensor:
    """
    Read a quadmask video and return (1, 1, T, H, W) float32 in [0, 1].
    Quadmask pixel values:
        0   → primary object (to erase)
        63  → overlap / interaction zone
        127 → affected region (shadows, reflections …)
        255 → background (keep)
    After quantisation the mask is inverted so 255 = "erase", 0 = "keep",
    matching the pipeline's internal convention.
    """
    frames = media.read_video(path)[:MAX_VID_LEN]
    if frames.ndim == 4:
        frames = frames[..., 0]  # take first channel, grayscale
    mask = torch.from_numpy(np.array(frames)).float().unsqueeze(0)   # (1, T, H, W)
    mask = F.interpolate(mask, SAMPLE_SIZE, mode="area").unsqueeze(0)  # (1, 1, T, H, W)
    # Snap interpolated values back to the four canonical levels.
    quant = torch.zeros_like(mask)                 # <= 31 stays 0
    quant[(mask > 31) & (mask <= 95)] = 63.0
    quant[(mask > 95) & (mask <= 191)] = 127.0
    quant[mask > 191] = 255.0
    # Invert and normalise: erase regions become 1.0, background 0.0.
    return (255.0 - quant) / 255.0
def tensor_to_mp4(video: torch.Tensor) -> str:
    """Save a (1, C, T, H, W) video tensor in [0, 1] to a temp mp4 and return its path.

    The caller is responsible for eventually deleting the file (delete=False).
    """
    frames = video[0].permute(1, 2, 3, 0).cpu().float().numpy()  # (T, H, W, C)
    frames = (frames * 255).clip(0, 255).astype(np.uint8)
    # Close the handle before writing: the original left the fd open for the
    # process lifetime (fd leak), and an open file can't be reopened by name
    # on Windows. The `with` block only reserves a unique path.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out_path = tmp.name
    imageio.mimsave(out_path, frames, fps=FPS)
    return out_path
# ── inference ──────────────────────────────────────────────────────────────────
@spaces.GPU(duration=120)
def run_inpaint(
    input_video_path: str,
    mask_video_path: str,
    prompt: str,
    num_steps: int,
    guidance_scale: float,
    seed: int,
) -> str:
    """Erase the masked object from the video and return the output mp4 path."""
    # Validate user input up front, before any GPU work is scheduled.
    if not input_video_path or not mask_video_path:
        raise gr.Error("Please upload both an input video and a quadmask video.")
    if not prompt.strip():
        raise gr.Error("Please enter a prompt describing the scene after removal.")
    rng = torch.Generator(device="cuda").manual_seed(int(seed))
    video = load_video_tensor(input_video_path)
    mask = load_quadmask_tensor(mask_video_path)
    # Pad both streams to the temporal window the pipeline expects.
    video = temporal_padding(video, min_length=TEMPORAL_WIN, max_length=MAX_VID_LEN)
    mask = temporal_padding(mask, min_length=TEMPORAL_WIN, max_length=MAX_VID_LEN)
    with torch.no_grad():
        out = pipeline(
            prompt=prompt,
            negative_prompt=NEG_PROMPT,
            height=SAMPLE_SIZE[0],
            width=SAMPLE_SIZE[1],
            num_frames=TEMPORAL_WIN,
            video=video,
            mask_video=mask,
            generator=rng,
            guidance_scale=guidance_scale,
            num_inference_steps=num_steps,
            strength=1.0,
            use_trimask=True,
            use_vae_mask=True,
            stack_mask=False,
            zero_out_mask_region=False,
        ).videos
    return tensor_to_mp4(out)
# ── Gradio UI ──────────────────────────────────────────────────────────────────
# Markdown help text shown in the UI. Fixed mojibake: "β€”" was the UTF-8
# bytes of an em dash ("—") mis-decoded; rendered garbled to users.
QUADMASK_EXPLAINER = """
### Quadmask format
The quadmask is a **grayscale video** where each pixel value encodes what role that region plays:
| Pixel value | Meaning |
|-------------|---------|
| **0** (black) | Primary object to remove |
| **63** (dark grey) | Overlap of primary object / affected zone |
| **127** (mid grey) | Affected region — shadows, reflections, new and old trajectories |
| **255** (white) | Background — keep as-is |
Use the **VLM-Mask-Reasoner** pipeline included in the repo to generate quadmasks automatically.
"""
# Bundled demo clips live next to this file under sample/<name>/.
SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "sample")
# Each row matches run_inpaint's inputs:
# [input video path, quadmask video path, prompt, num_steps, guidance_scale, seed]
EXAMPLES = [
    [
        os.path.join(SAMPLE_DIR, "BigBen", "input_video.mp4"),
        os.path.join(SAMPLE_DIR, "BigBen", "quadmask_0.mp4"),
        "A video of London's skyline reflecting in the Thames",
        30, 1.0, 42,
    ],
    [
        os.path.join(SAMPLE_DIR, "trampoline", "input_video.mp4"),
        os.path.join(SAMPLE_DIR, "trampoline", "quadmask_0.mp4"),
        "A video of an empty trampoline.",
        30, 1.0, 42,
    ],
    [
        os.path.join(SAMPLE_DIR, "spinner", "input_video.mp4"),
        os.path.join(SAMPLE_DIR, "spinner", "quadmask_0.mp4"),
        "A video of two spinningtops spinning.",
        30, 1.0, 42,
    ],
    [
        os.path.join(SAMPLE_DIR, "ducky-float", "input_video.mp4"),
        os.path.join(SAMPLE_DIR, "ducky-float", "quadmask_0.mp4"),
        "A video of a rubber ducky.",
        30, 1.0, 42,
    ],
    [
        os.path.join(SAMPLE_DIR, "lime", "input_video.mp4"),
        os.path.join(SAMPLE_DIR, "lime", "quadmask_0.mp4"),
        "A lime falls on the table.",
        30, 1.0, 42,
    ],
    [
        os.path.join(SAMPLE_DIR, "moving_ball", "input_video.mp4"),
        os.path.join(SAMPLE_DIR, "moving_ball", "quadmask_0.mp4"),
        "A ball rolls off the table.",
        30, 1.0, 42,
    ],
    [
        os.path.join(SAMPLE_DIR, "pillow", "input_video.mp4"),
        os.path.join(SAMPLE_DIR, "pillow", "quadmask_0.mp4"),
        "Two pillows placed on the table.",
        30, 1.0, 42,
    ],
    [
        os.path.join(SAMPLE_DIR, "bowling", "input_video.mp4"),
        os.path.join(SAMPLE_DIR, "bowling", "quadmask_0.mp4"),
        "Bowling pins standing on the grass.",
        30, 1.0, 42,
    ],
    [
        os.path.join(SAMPLE_DIR, "crush-can", "input_video.mp4"),
        os.path.join(SAMPLE_DIR, "crush-can", "quadmask_0.mp4"),
        "A soda can on the table.",
        30, 1.0, 42,
    ],
    [
        os.path.join(SAMPLE_DIR, "toast-shmello", "input_video.mp4"),
        os.path.join(SAMPLE_DIR, "toast-shmello", "quadmask_0.mp4"),
        "A marshmallow dessert.",
        30, 1.0, 42,
    ],
]
# UI definition. Fixed mojibake in user-visible strings: "β€”" → "—" (em dash)
# and "πŸ’»" → "💻" (laptop emoji); layout and wiring are unchanged.
with gr.Blocks(title="VOID – Video Object & Interaction Deletion") as demo:
    gr.Markdown(
        """
# VOID – Video Object and Interaction Deletion
[🌐 Project Page](https://void-model.github.io/) | [💻 GitHub](https://github.com/Netflix/void-model)
Upload a video and its **quadmask**, enter a prompt describing the scene *after* removal,
and VOID will erase the object along with its physical interactions.
> Built on **CogVideoX-Fun-V1.5-5B** fine-tuned for interaction-aware video inpainting.
"""
    )
    with gr.Row():
        with gr.Column():
            # Left column: all user inputs.
            input_video = gr.Video(label="Input video", sources=["upload"])
            mask_video = gr.Video(label="Quadmask video", sources=["upload"])
            prompt = gr.Textbox(
                label="Prompt — describe the scene after removal",
                placeholder="e.g. A wooden table with nothing on it.",
                lines=2,
            )
            with gr.Accordion("Advanced settings", open=False):
                num_steps = gr.Slider(10, 50, value=30, step=1, label="Inference steps")
                guidance_scale = gr.Slider(1.0, 10.0, value=1.0, step=0.5, label="Guidance scale")
                seed = gr.Number(value=42, label="Seed", precision=0)
            run_btn = gr.Button("Run VOID", variant="primary")
        with gr.Column():
            # Right column: result player plus the quadmask format reference.
            output_video = gr.Video(label="Inpainted output", interactive=False)
            gr.Markdown(QUADMASK_EXPLAINER)
    gr.Examples(
        examples=EXAMPLES,
        inputs=[input_video, mask_video, prompt, num_steps, guidance_scale, seed],
        outputs=[output_video],
        cache_examples=False,  # inference is GPU-gated; don't pre-render at build time
        label="Sample sequences — click to load inputs",
    )
    run_btn.click(
        fn=run_inpaint,
        inputs=[input_video, mask_video, prompt, num_steps, guidance_scale, seed],
        outputs=[output_video],
    )
if __name__ == "__main__":
    demo.launch()