| """ |
| VOID — Video Object and Interaction Deletion |
| Gradio demo for Hugging Face Spaces (ZeroGPU) |
| """ |
|
|
| import os |
| import sys |
| import tempfile |
|
|
| import numpy as np |
| import torch |
| import torch.nn.functional as F |
| import imageio |
| import mediapy as media |
| import spaces |
| import gradio as gr |
| from huggingface_hub import hf_hub_download, snapshot_download |
| from safetensors.torch import load_file |
| from diffusers import DDIMScheduler |
| from PIL import Image |
|
|
| |
| sys.path.insert(0, os.path.dirname(__file__)) |
|
|
| from videox_fun.models import ( |
| AutoencoderKLCogVideoX, |
| CogVideoXTransformer3DModel, |
| T5EncoderModel, |
| T5Tokenizer, |
| ) |
| from videox_fun.pipeline import CogVideoXFunInpaintPipeline |
| from videox_fun.utils.fp8_optimization import convert_weight_dtype_wrapper |
| from videox_fun.utils.utils import temporal_padding |
|
|
| |
| |
# --- Model sources --------------------------------------------------------
# Both repo ids can be overridden through environment variables of the same
# name; the second argument is the fallback used when the variable is unset.
BASE_MODEL_ID = os.environ.get(
    "BASE_MODEL_ID",
    "alibaba-pai/CogVideoX-Fun-V1.5-5b-InP",
)
# NOTE(review): this fallback reads like a placeholder repo id — confirm that
# deployments always set VOID_MODEL_ID.
VOID_MODEL_ID = os.environ.get(
    "VOID_MODEL_ID",
    "your-hf-username/VOID",
)
# File fetched from VOID_MODEL_ID when the pipeline is assembled.
VOID_CKPT_FILE = "void_pass1.safetensors"

# --- Inference settings ---------------------------------------------------
# (height, width): run_inpaint passes index 0 as `height` and index 1 as
# `width`; the loaders resize every frame to this size.
SAMPLE_SIZE = (384, 672)
# Loaders keep at most this many frames; also the `max_length` handed to
# temporal_padding.
MAX_VID_LEN = 197
# `min_length` for temporal_padding and the `num_frames` given to the pipeline.
TEMPORAL_WIN = 72
# Frame rate used when the result is written to mp4.
FPS = 12
# dtype the transformer, VAE and text encoder are converted to / loaded in.
WEIGHT_DTYPE = torch.bfloat16
# Fixed negative prompt supplied on every pipeline call.
NEG_PROMPT = " ".join(
    [
        "The video is not of a high quality, it has a low resolution.",
        "Watermark present in each frame. The background is solid.",
        "Strange body and strange trajectory. Distortion.",
    ]
)
|
|
| |
# The pipeline is assembled once, at import time, and reused by every request.
print("Loading VOID pipeline …")

# Optional Hugging Face access token; None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Download the base model repository and remember where it landed locally.
base_model_path = snapshot_download(repo_id=BASE_MODEL_ID, token=HF_TOKEN)

# NOTE(review): the weights are read as float8_e4m3fn and immediately cast to
# WEIGHT_DTYPE. Any tensor that the VOID checkpoint below does not overwrite
# keeps the float8 rounding — confirm this is intended.
transformer = CogVideoXTransformer3DModel.from_pretrained(
    base_model_path,
    subfolder="transformer",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float8_e4m3fn,
    use_vae_mask=True,
    stack_mask=False,
).to(WEIGHT_DTYPE)

# Fetch the VOID fine-tuned weights; accept either a flat mapping or one
# nested under a "state_dict" key.
ckpt_path = hf_hub_download(repo_id=VOID_MODEL_ID, filename=VOID_CKPT_FILE, token=HF_TOKEN)
state_dict = load_file(ckpt_path)
state_dict = state_dict.get("state_dict", state_dict)

# When the checkpoint's patch-embedding has a different input width than the
# model's, start from the model's own weight and copy over only the leading
# and trailing `feat_dim` input columns from the checkpoint.
param_name = "patch_embed.proj.weight"
if (
    param_name in state_dict
    and state_dict[param_name].size(1) != transformer.state_dict()[param_name].size(1)
):
    feat_dim = 16 * 8  # presumably latent channels x patch features — TODO confirm
    new_weight = transformer.state_dict()[param_name].clone()
    new_weight[:, :feat_dim] = state_dict[param_name][:, :feat_dim]
    new_weight[:, -feat_dim:] = state_dict[param_name][:, -feat_dim:]
    state_dict[param_name] = new_weight

# strict=False tolerates key mismatches; report them instead of dropping the
# information silently so a wrong checkpoint is noticeable in the logs.
missing_keys, unexpected_keys = transformer.load_state_dict(state_dict, strict=False)
print(
    f"VOID checkpoint loaded (missing keys: {len(missing_keys)}, "
    f"unexpected keys: {len(unexpected_keys)})"
)

# Remaining pipeline components all come from the base model snapshot.
vae = AutoencoderKLCogVideoX.from_pretrained(
    base_model_path, subfolder="vae"
).to(WEIGHT_DTYPE)
tokenizer = T5Tokenizer.from_pretrained(base_model_path, subfolder="tokenizer")
text_encoder = T5EncoderModel.from_pretrained(
    base_model_path, subfolder="text_encoder", torch_dtype=WEIGHT_DTYPE
)
scheduler = DDIMScheduler.from_pretrained(base_model_path, subfolder="scheduler")

pipeline = CogVideoXFunInpaintPipeline(
    vae=vae,
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    transformer=transformer,
    scheduler=scheduler,
)
# Helper from videox_fun.utils.fp8_optimization; presumably makes the
# transformer's layers compute in WEIGHT_DTYPE — verify against that module.
convert_weight_dtype_wrapper(transformer, WEIGHT_DTYPE)
pipeline.enable_model_cpu_offload()

print("VOID pipeline ready.")
|
|
|
|
| |
def load_video_tensor(path: str) -> torch.Tensor:
    """Read a video file into a normalised, resized 5-D tensor.

    Args:
        path: Location of the video to read with ``mediapy``.

    Returns:
        A float32 tensor laid out as (1, C, T, H, W) with values in [0, 1],
        holding at most ``MAX_VID_LEN`` frames, each resized to
        ``SAMPLE_SIZE``.
    """
    # Keep only the leading frames, then materialise them as a plain ndarray.
    clip = np.array(media.read_video(path)[:MAX_VID_LEN])
    # (T, H, W, C) -> (C, T, H, W), scaled from 0..255 down to 0..1.
    frames_cthw = torch.from_numpy(clip).permute(3, 0, 1, 2).float() / 255.0
    # On a 4-D input interpolate resizes the last two axes only, so every
    # frame of every channel is resized independently.
    resized = F.interpolate(frames_cthw, SAMPLE_SIZE, mode="area")
    return resized.unsqueeze(0)
|
|
|
|
def load_quadmask_tensor(path: str) -> torch.Tensor:
    """Read a quadmask video and convert it to the inverted, normalised mask.

    Input pixel values are snapped to one of four levels:

        <= 31        -> 0    primary object (to erase)
        32 .. 95     -> 63   overlap / interaction zone
        96 .. 191    -> 127  affected region (shadows, reflections, ...)
        > 191        -> 255  background (keep)

    The snapped mask is then inverted (``255 - level``) and divided by 255, so
    in the returned tensor 1.0 marks "erase" and 0.0 marks "keep".

    Args:
        path: Location of the quadmask video to read with ``mediapy``.

    Returns:
        A float32 tensor laid out as (1, 1, T, H, W) with values in [0, 1],
        holding at most ``MAX_VID_LEN`` frames resized to ``SAMPLE_SIZE``.
    """
    raw = media.read_video(path)[:MAX_VID_LEN]
    if raw.ndim == 4:
        # Colour-encoded mask: only the first channel is used.
        raw = raw[..., 0]

    # (T, H, W) -> (1, T, H, W) for the spatial resize, then -> (1, 1, T, H, W).
    mask = torch.from_numpy(np.array(raw)).unsqueeze(0).float()
    mask = F.interpolate(mask, SAMPLE_SIZE, mode="area").unsqueeze(0)

    # bucketize(right=False) yields index i where edges[i-1] < v <= edges[i],
    # i.e. exactly the four half-open ranges listed in the docstring.
    edges = torch.tensor([31.0, 95.0, 191.0])
    levels = torch.tensor([0.0, 63.0, 127.0, 255.0])
    snapped = levels[torch.bucketize(mask, edges)]

    inverted = 255.0 - snapped
    return inverted / 255.0
|
|
|
|
def tensor_to_mp4(video: torch.Tensor) -> str:
    """Write a video tensor to a temporary ``.mp4`` file.

    Args:
        video: Tensor laid out as (1, C, T, H, W) with values in [0, 1].
            Only the first batch element is written.

    Returns:
        Path of the newly created mp4 file. The file is not deleted
        automatically; the caller owns it.
    """
    # (C, T, H, W) -> (T, H, W, C) on the CPU, then scale to 8-bit.
    frames = video[0].permute(1, 2, 3, 0).cpu().float().numpy()
    frames = (frames * 255).clip(0, 255).astype(np.uint8)

    # Reserve a unique path, but close the handle right away: the original
    # left it open, leaking a descriptor per call and keeping the file open
    # while imageio writes to the same path.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out_path = tmp.name

    imageio.mimsave(out_path, frames, fps=FPS)
    return out_path
|
|
|
|
| |
# `spaces.GPU` is the decorator from the `spaces` package; duration=120 is the
# value this app passes to it (presumably a per-call GPU time budget in seconds).
@spaces.GPU(duration=120)
def run_inpaint(
    input_video_path: str,
    mask_video_path: str,
    prompt: str,
    num_steps: int,
    guidance_scale: float,
    seed: int,
) -> str:
    """Run VOID on an uploaded video / quadmask pair and return the result.

    Args:
        input_video_path: Path of the video to edit.
        mask_video_path: Path of the matching quadmask video.
        prompt: Text describing the scene after removal.
        num_steps: Number of inference steps passed to the pipeline.
        guidance_scale: Guidance scale passed to the pipeline.
        seed: Seed for the CUDA random generator.

    Returns:
        Path of a temporary mp4 file containing the inpainted video.

    Raises:
        gr.Error: If a video is missing, the prompt is empty, or no seed was
            given.
    """
    if not input_video_path or not mask_video_path:
        raise gr.Error("Please upload both an input video and a quadmask video.")
    # `prompt` can be None (e.g. programmatic calls); check it before .strip()
    # so the user gets the friendly message instead of an AttributeError.
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt describing the scene after removal.")
    # A cleared number field yields None, which int() cannot convert.
    if seed is None:
        raise gr.Error("Please enter an integer seed.")

    generator = torch.Generator(device="cuda").manual_seed(int(seed))

    input_video = load_video_tensor(input_video_path)
    input_mask = load_quadmask_tensor(mask_video_path)

    # Bring both clips to a length the pipeline accepts (at least
    # TEMPORAL_WIN frames, at most MAX_VID_LEN).
    input_video = temporal_padding(input_video, min_length=TEMPORAL_WIN, max_length=MAX_VID_LEN)
    input_mask = temporal_padding(input_mask, min_length=TEMPORAL_WIN, max_length=MAX_VID_LEN)

    with torch.no_grad():
        result = pipeline(
            prompt=prompt,
            negative_prompt=NEG_PROMPT,
            height=SAMPLE_SIZE[0],
            width=SAMPLE_SIZE[1],
            num_frames=TEMPORAL_WIN,
            video=input_video,
            mask_video=input_mask,
            generator=generator,
            # UI sliders may hand numbers over as floats; coerce explicitly.
            guidance_scale=float(guidance_scale),
            num_inference_steps=int(num_steps),
            strength=1.0,
            use_trimask=True,
            use_vae_mask=True,
            stack_mask=False,
            zero_out_mask_region=False,
        ).videos

    return tensor_to_mp4(result)
|
|
|
|
| |
# Markdown help text describing the quadmask input format; rendered in the UI.
# The dashes below replace characters that were mis-encoded in the source.
QUADMASK_EXPLAINER = """
### Quadmask format

The quadmask is a **grayscale video** where each pixel value encodes what role that region plays:

| Pixel value | Meaning |
|-------------|---------|
| **0** (black) | Primary object to remove |
| **63** (dark grey) | Overlap of primary object / affected zone |
| **127** (mid grey) | Affected region — shadows, reflections, new and old trajectories |
| **255** (white) | Background — keep as-is |

Use the **VLM-Mask-Reasoner** pipeline included in the repo to generate quadmasks automatically.
"""
|
|
# Folder that holds the bundled sample clips, next to this file.
SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "sample")

# (sample sub-folder, prompt) for each bundled example, in display order.
_SAMPLE_CASES = (
    ("BigBen", "A video of London's skyline reflecting in the Thames"),
    ("trampoline", "A video of an empty trampoline."),
    ("spinner", "A video of two spinningtops spinning."),
    ("ducky-float", "A video of a rubber ducky."),
    ("lime", "A lime falls on the table."),
    ("moving_ball", "A ball rolls off the table."),
    ("pillow", "Two pillows placed on the table."),
    ("bowling", "Bowling pins standing on the grass."),
    ("crush-can", "A soda can on the table."),
    ("toast-shmello", "A marshmallow dessert."),
)

# One row per example, in the order the UI inputs expect:
# [input video, quadmask video, prompt, inference steps, guidance scale, seed].
EXAMPLES = [
    [
        os.path.join(SAMPLE_DIR, folder, "input_video.mp4"),
        os.path.join(SAMPLE_DIR, folder, "quadmask_0.mp4"),
        caption,
        30,
        1.0,
        42,
    ]
    for folder, caption in _SAMPLE_CASES
]
|
|
# Intro markdown shown at the top of the page.
# NOTE(review): the dash and link icons were mis-encoded in the source; the
# first link icon is a best guess — adjust if the original is known.
_HEADER_MD = """
# VOID — Video Object and Interaction Deletion

[🌐 Project Page](https://void-model.github.io/) | [💻 GitHub](https://github.com/Netflix/void-model)

Upload a video and its **quadmask**, enter a prompt describing the scene *after* removal,
and VOID will erase the object along with its physical interactions.

> Built on **CogVideoX-Fun-V1.5-5B** fine-tuned for interaction-aware video inpainting.
"""

# Page layout: inputs and settings in the left column, result in the right
# column, followed by the quadmask help text and the clickable samples.
# NOTE(review): nesting was reconstructed from a flattened source — confirm
# that the run button belongs outside the "Advanced settings" accordion.
with gr.Blocks(title="VOID — Video Object & Interaction Deletion") as demo:
    gr.Markdown(_HEADER_MD)

    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input video", sources=["upload"])
            mask_video = gr.Video(label="Quadmask video", sources=["upload"])
            prompt = gr.Textbox(
                label="Prompt — describe the scene after removal",
                placeholder="e.g. A wooden table with nothing on it.",
                lines=2,
            )
            with gr.Accordion("Advanced settings", open=False):
                num_steps = gr.Slider(10, 50, value=30, step=1, label="Inference steps")
                guidance_scale = gr.Slider(1.0, 10.0, value=1.0, step=0.5, label="Guidance scale")
                seed = gr.Number(value=42, label="Seed", precision=0)
            run_btn = gr.Button("Run VOID", variant="primary")

        with gr.Column():
            output_video = gr.Video(label="Inpainted output", interactive=False)

    gr.Markdown(QUADMASK_EXPLAINER)

    # Examples are not cached (cache_examples=False) and no function is
    # attached to them here.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[input_video, mask_video, prompt, num_steps, guidance_scale, seed],
        outputs=[output_video],
        cache_examples=False,
        label="Sample sequences — click to load inputs",
    )

    # Inference runs only when the button is pressed.
    run_btn.click(
        fn=run_inpaint,
        inputs=[input_video, mask_video, prompt, num_steps, guidance_scale, seed],
        outputs=[output_video],
    )


if __name__ == "__main__":
    demo.launch()
|
|