""" VOID – Video Object and Interaction Deletion Gradio demo for Hugging Face Spaces (ZeroGPU) """ import os import sys import tempfile import numpy as np import torch import torch.nn.functional as F import imageio import mediapy as media import spaces import gradio as gr from huggingface_hub import hf_hub_download, snapshot_download from safetensors.torch import load_file from diffusers import DDIMScheduler from PIL import Image # ── project imports ──────────────────────────────────────────────────────────── sys.path.insert(0, os.path.dirname(__file__)) from videox_fun.models import ( AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, T5EncoderModel, T5Tokenizer, ) from videox_fun.pipeline import CogVideoXFunInpaintPipeline from videox_fun.utils.fp8_optimization import convert_weight_dtype_wrapper from videox_fun.utils.utils import temporal_padding # ── constants ────────────────────────────────────────────────────────────────── # Set these env vars in your HF Space settings, or hardcode once weights are public. BASE_MODEL_ID = os.environ.get("BASE_MODEL_ID", "alibaba-pai/CogVideoX-Fun-V1.5-5b-InP") VOID_MODEL_ID = os.environ.get("VOID_MODEL_ID", "your-hf-username/VOID") VOID_CKPT_FILE = "void_pass1.safetensors" SAMPLE_SIZE = (384, 672) # H × W MAX_VID_LEN = 197 TEMPORAL_WIN = 72 FPS = 12 WEIGHT_DTYPE = torch.bfloat16 NEG_PROMPT = ( "The video is not of a high quality, it has a low resolution. " "Watermark present in each frame. The background is solid. " "Strange body and strange trajectory. Distortion." 
) # ── model loading (once at startup, lives in CPU RAM between GPU requests) ───── print("Loading VOID pipeline …") HF_TOKEN = os.environ.get("HF_TOKEN") # Download base model to local cache (custom from_pretrained needs a local path) base_model_path = snapshot_download(repo_id=BASE_MODEL_ID, token=HF_TOKEN) transformer = CogVideoXTransformer3DModel.from_pretrained( base_model_path, subfolder="transformer", low_cpu_mem_usage=True, torch_dtype=torch.float8_e4m3fn, # qfloat8 to save VRAM use_vae_mask=True, stack_mask=False, ).to(WEIGHT_DTYPE) # Load VOID Pass-1 checkpoint ckpt_path = hf_hub_download(repo_id=VOID_MODEL_ID, filename=VOID_CKPT_FILE, token=HF_TOKEN) state_dict = load_file(ckpt_path) state_dict = state_dict.get("state_dict", state_dict) # Adapt patch_embed channels if they differ (mask-conditioning channels added) param_name = "patch_embed.proj.weight" if state_dict[param_name].size(1) != transformer.state_dict()[param_name].size(1): feat_dim = 16 * 8 # latent_channels * feat_scale new_weight = transformer.state_dict()[param_name].clone() new_weight[:, :feat_dim] = state_dict[param_name][:, :feat_dim] new_weight[:, -feat_dim:] = state_dict[param_name][:, -feat_dim:] state_dict[param_name] = new_weight transformer.load_state_dict(state_dict, strict=False) vae = AutoencoderKLCogVideoX.from_pretrained( base_model_path, subfolder="vae" ).to(WEIGHT_DTYPE) tokenizer = T5Tokenizer.from_pretrained(base_model_path, subfolder="tokenizer") text_encoder = T5EncoderModel.from_pretrained( base_model_path, subfolder="text_encoder", torch_dtype=WEIGHT_DTYPE ) scheduler = DDIMScheduler.from_pretrained(base_model_path, subfolder="scheduler") pipeline = CogVideoXFunInpaintPipeline( vae=vae, tokenizer=tokenizer, text_encoder=text_encoder, transformer=transformer, scheduler=scheduler, ) convert_weight_dtype_wrapper(transformer, WEIGHT_DTYPE) pipeline.enable_model_cpu_offload() print("VOID pipeline ready.") # ── helpers 
# ──────────────────────────────────────────────────────────────────────────────
def load_video_tensor(path: str) -> torch.Tensor:
    """Return (1, C, T, H, W) float32 in [0, 1] resized to SAMPLE_SIZE.

    Frames beyond MAX_VID_LEN are dropped so downstream temporal padding
    stays bounded.
    """
    frames = media.read_video(path)
    t = torch.from_numpy(np.array(frames))[:MAX_VID_LEN]  # (T, H, W, C) uint8
    t = t.permute(3, 0, 1, 2).float() / 255.0             # (C, T, H, W)
    # F.interpolate treats dim 0 (C) as batch and dim 1 (T) as channels here,
    # so the spatial resize is applied frame-by-frame without mixing time steps.
    t = F.interpolate(t, SAMPLE_SIZE, mode="area").unsqueeze(0)
    return t


def load_quadmask_tensor(path: str) -> torch.Tensor:
    """
    Return (1, 1, T, H, W) float32 in [0, 1].

    Quadmask pixel values:
          0 → primary object (to erase)
         63 → overlap / interaction zone
        127 → affected region (shadows, reflections …)
        255 → background (keep)

    After quantisation the mask is inverted so 255 = "erase", 0 = "keep",
    matching the pipeline's internal convention.
    """
    frames = media.read_video(path)[:MAX_VID_LEN]
    if frames.ndim == 4:
        frames = frames[..., 0]  # take first channel, grayscale
    m = torch.from_numpy(np.array(frames)).unsqueeze(0).float()  # (1, T, H, W)
    m = F.interpolate(m, SAMPLE_SIZE, mode="area").unsqueeze(0)  # (1, 1, T, H, W)
    # Area resampling blends levels at region borders; snap every pixel back
    # to the four canonical values before inverting.
    m = torch.where(m <= 31, torch.zeros_like(m), m)
    m = torch.where((m > 31) & (m <= 95), torch.full_like(m, 63), m)
    m = torch.where((m > 95) & (m <= 191), torch.full_like(m, 127), m)
    m = torch.where(m > 191, torch.full_like(m, 255), m)
    m = 255.0 - m  # invert: high = erase, low = keep
    return m / 255.0


def tensor_to_mp4(video: torch.Tensor) -> str:
    """Save (1, C, T, H, W) in [0, 1] to a temp mp4 and return the path."""
    frames = video[0].permute(1, 2, 3, 0).cpu().float().numpy()  # (T, H, W, C)
    frames = (frames * 255).clip(0, 255).astype(np.uint8)
    # Close the handle before imageio writes to the path: keeping it open
    # leaks a file descriptor per request and fails on platforms that forbid
    # a second open of an already-open NamedTemporaryFile (e.g. Windows).
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out_path = tmp.name
    imageio.mimsave(out_path, frames, fps=FPS)
    return out_path


# ── inference ──────────────────────────────────────────────────────────────────
@spaces.GPU(duration=120)
def run_inpaint(
    input_video_path: str,
    mask_video_path: str,
    prompt: str,
    num_steps: int,
    guidance_scale: float,
    seed: int,
) -> str:
    """Run the VOID inpainting pipeline and return the output mp4 path.

    Raises gr.Error for missing/empty user inputs so Gradio shows a friendly
    message instead of a stack trace.
    """
    if not input_video_path or not mask_video_path:
        raise gr.Error("Please upload both an input video and a quadmask video.")
    if not prompt.strip():
        raise gr.Error("Please enter a prompt describing the scene after removal.")

    generator = torch.Generator(device="cuda").manual_seed(int(seed))

    input_video = load_video_tensor(input_video_path)
    input_mask = load_quadmask_tensor(mask_video_path)
    # Pad both tensors identically so video and mask stay frame-aligned.
    input_video = temporal_padding(input_video, min_length=TEMPORAL_WIN, max_length=MAX_VID_LEN)
    input_mask = temporal_padding(input_mask, min_length=TEMPORAL_WIN, max_length=MAX_VID_LEN)

    with torch.no_grad():
        result = pipeline(
            prompt=prompt,
            negative_prompt=NEG_PROMPT,
            height=SAMPLE_SIZE[0],
            width=SAMPLE_SIZE[1],
            num_frames=TEMPORAL_WIN,
            video=input_video,
            mask_video=input_mask,
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=int(num_steps),  # slider values can arrive as floats
            strength=1.0,
            use_trimask=True,
            use_vae_mask=True,
            stack_mask=False,
            zero_out_mask_region=False,
        ).videos
    return tensor_to_mp4(result)


# ── Gradio UI ──────────────────────────────────────────────────────────────────
QUADMASK_EXPLAINER = """
### Quadmask format

The quadmask is a **grayscale video** where each pixel value encodes what role that region plays:

| Pixel value | Meaning |
|-------------|---------|
| **0** (black) | Primary object to remove |
| **63** (dark grey) | Overlap of primary object / affected zone |
| **127** (mid grey) | Affected region — shadows, reflections, new and old trajectories |
| **255** (white) | Background — keep as-is |

Use the **VLM-Mask-Reasoner** pipeline included in the repo to generate quadmasks automatically.
"""

SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "sample")


def _example_row(folder: str, prompt: str) -> list:
    """Build one gr.Examples row from a bundled sample folder and its prompt."""
    return [
        os.path.join(SAMPLE_DIR, folder, "input_video.mp4"),
        os.path.join(SAMPLE_DIR, folder, "quadmask_0.mp4"),
        prompt,
        30,
        1.0,
        42,
    ]


EXAMPLES = [
    _example_row("BigBen", "A video of London's skyline reflecting in the Thames"),
    _example_row("trampoline", "A video of an empty trampoline."),
    _example_row("spinner", "A video of two spinningtops spinning."),
    _example_row("ducky-float", "A video of a rubber ducky."),
    _example_row("lime", "A lime falls on the table."),
    _example_row("moving_ball", "A ball rolls off the table."),
    _example_row("pillow", "Two pillows placed on the table."),
    _example_row("bowling", "Bowling pins standing on the grass."),
    _example_row("crush-can", "A soda can on the table."),
    _example_row("toast-shmello", "A marshmallow dessert."),
]

with gr.Blocks(title="VOID – Video Object & Interaction Deletion") as demo:
    gr.Markdown(
        """
# VOID – Video Object and Interaction Deletion

[🌐 Project Page](https://void-model.github.io/) | [💻 GitHub](https://github.com/Netflix/void-model)

Upload a video and its **quadmask**, enter a prompt describing the scene *after* removal,
and VOID will erase the object along with its physical interactions.

> Built on **CogVideoX-Fun-V1.5-5B** fine-tuned for interaction-aware video inpainting.
"""
    )
    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input video", sources=["upload"])
            mask_video = gr.Video(label="Quadmask video", sources=["upload"])
            prompt = gr.Textbox(
                label="Prompt — describe the scene after removal",
                placeholder="e.g. A wooden table with nothing on it.",
                lines=2,
            )
            with gr.Accordion("Advanced settings", open=False):
                num_steps = gr.Slider(10, 50, value=30, step=1, label="Inference steps")
                guidance_scale = gr.Slider(1.0, 10.0, value=1.0, step=0.5, label="Guidance scale")
                seed = gr.Number(value=42, label="Seed", precision=0)
            run_btn = gr.Button("Run VOID", variant="primary")
        with gr.Column():
            output_video = gr.Video(label="Inpainted output", interactive=False)
            gr.Markdown(QUADMASK_EXPLAINER)

    gr.Examples(
        examples=EXAMPLES,
        inputs=[input_video, mask_video, prompt, num_steps, guidance_scale, seed],
        outputs=[output_video],
        cache_examples=False,
        label="Sample sequences — click to load inputs",
    )

    run_btn.click(
        fn=run_inpaint,
        inputs=[input_video, mask_video, prompt, num_steps, guidance_scale, seed],
        outputs=[output_video],
    )

if __name__ == "__main__":
    demo.launch()