Duplicate from TencentARC/VideoPainter

Browse files

Co-authored-by: Yuxuan BIAN <BianYx@users.noreply.huggingface.co>

Files changed (10) hide show

.gitattributes +37 -0
License.txt +99 -0
README.md +489 -0
VideoPainter/checkpoints/branch/config.json +32 -0
VideoPainter/checkpoints/branch/diffusion_pytorch_model.safetensors +3 -0
VideoPainterID/checkpoints/pytorch_lora_weights.safetensors +3 -0
assets/method.jpg +3 -0
assets/teaser.jpg +3 -0
config.json +32 -0
i3d_rgb_imagenet.pt +3 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,37 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/method.jpg filter=lfs diff=lfs merge=lfs -text
+assets/teaser.jpg filter=lfs diff=lfs merge=lfs -text

License.txt ADDED Viewed

	@@ -0,0 +1,99 @@

+This project, "VideoPainter", is fine-tuned with the assistance of "CogVideo 5B", which is subject to the The CogVideoX License. Details of the The CogVideoX License can be found in this file.
+In addition, usage of any components originally developed or modified by us, is also subject to the following requirement:
+Copyright (C) 2025 THL A29 Limited, a Tencent company.  All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this Software and associated documentation files, to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sublicense copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+- You agree to use the VideoPainter only for academic, research and education purposes, and refrain from using it for any commercial or production purposes under any circumstances.
+- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+For avoidance of doubts, "Software" means the VideoPainter model inference code, training code, parameters and weights made available under this license.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+Open Source Model Licensed under the The CogVideoX License:
+--------------------------------------------------------------------
+1. CogVideo 5B
+Copyright 2024 CogVideo Model Team @ Zhipu AI
+Terms of the The CogVideoX License:
+--------------------------------------------------------------------
+The CogVideoX License
+1. Definitions
+“Licensor” means the CogVideoX Model Team that distributes its Software.
+“Software” means the CogVideoX model parameters made available under this license.
+2. License Grant
+Under the terms and conditions of this license, the licensor hereby grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license. The intellectual property rights of the generated content belong to the user to the extent permitted by applicable local laws.
+This license allows you to freely use all open-source models in this repository for academic research. Users who wish to use the models for commercial purposes must register and obtain a basic commercial license in https://open.bigmodel.cn/mla/form .
+Users who have registered and obtained the basic commercial license can use the models for commercial activities for free, but must comply with all terms and conditions of this license. Additionally, the number of service users (visits) for your commercial activities must not exceed 1 million visits per month.
+If the number of service users (visits) for your commercial activities exceeds 1 million visits per month, you need to contact our business team to obtain more commercial licenses.
+The above copyright statement and this license statement should be included in all copies or significant portions of this software.
+3. Restriction
+You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any military, or illegal purposes.
+You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.
+4. Disclaimer
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+5. Limitation of Liability
+EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+6. Dispute Resolution
+This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
+Note that the license is subject to update to a more comprehensive version.  For any questions related to the license and copyright, please contact us at license@zhipuai.cn.
+1. 定义
+“许可方”是指分发其软件的 CogVideoX 模型团队。
+“软件”是指根据本许可提供的 CogVideoX 模型参数。
+2. 许可授予
+根据本许可的条款和条件，许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可。生成内容的知识产权所属，可根据适用当地法律的规定，在法律允许的范围内由用户享有生成内容的知识产权或其他权利。
+本许可允许您免费使用本仓库中的所有开源模型进行学术研究。对于希望将模型用于商业目的的用户，需在 https://open.bigmodel.cn/mla/form 完成登记并获得基础商用授权。
+经过登记并获得基础商用授权的用户可以免费使用本模型进行商业活动，但必须遵守本许可的所有条款和条件。
+在本许可证下，您的商业活动的服务用户数量（访问量）不得超过100万人次访问 / 每月。如果超过，您需要与我们的商业团队联系以获得更多的商业许可。
+上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。
+3.限制
+您不得出于任何军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。
+您不得利用本软件从事任何危害国家安全和国家统一、危害社会公共利益、侵犯人身权益的行为。
+4.免责声明
+本软件“按原样”提供，不提供任何明示或暗示的保证，包括但不限于对适销性、特定用途的适用性和非侵权性的保证。
+在任何情况下，作者或版权持有人均不对任何索赔、损害或其他责任负责，无论是在合同诉讼、侵权行为还是其他方面，由软件或软件的使用或其他交易引起、由软件引起或与之相关 软件。
+5. 责任限制
+除适用法律禁止的范围外，在任何情况下且根据任何法律理论，无论是基于侵权行为、疏忽、合同、责任或其他原因，任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性、 或间接损害，或任何其他商业损失，即使许可人已被告知此类损害的可能性。
+6.争议解决
+本许可受中华人民共和国法律管辖并按其解释。 因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。
+请注意，许可证可能会更新到更全面的版本。 有关许可和版权的任何问题，请通过 license@zhipuai.cn 与我们联系。

README.md ADDED Viewed

	@@ -0,0 +1,489 @@

+---
+language:
+- en
+base_model:
+- THUDM/CogVideoX-5b
+- THUDM/CogVideoX-5b-I2V
+- THUDM/CogVideoX1.5-5B
+- THUDM/CogVideoX1.5-5B-I2V
+tags:
+- video
+- video inpainting
+- video editing
+---
+# VideoPainter
+This repository contains the implementation of the paper "VideoPainter: Any-length Video Inpainting and Editing with Plug-and-Play Context Control"
+Keywords: Video Inpainting, Video Editing, Video Generation
+> [Yuxuan Bian](https://yxbian23.github.io/)<sup>12</sup>, [Zhaoyang Zhang](https://zzyfd.github.io/#/)<sup>1‡</sup>, [Xuan Ju](https://juxuan27.github.io/)<sup>2</sup>, [Mingdeng Cao](https://openreview.net/profile?id=~Mingdeng_Cao1)<sup>3</sup>, [Liangbin Xie](https://liangbinxie.github.io/)<sup>4</sup>, [Ying Shan](https://www.linkedin.com/in/YingShanProfile/)<sup>1</sup>, [Qiang Xu](https://cure-lab.github.io/)<sup>2✉</sup><br>
+> <sup>1</sup>ARC Lab, Tencent PCG <sup>2</sup>The Chinese University of Hong Kong <sup>3</sup>The University of Tokyo <sup>4</sup>University of Macau <sup>‡</sup>Project Lead <sup>✉</sup>Corresponding Author
+<p align="center">
+<a href='https://yxbian23.github.io/project/video-painter'><img src='https://img.shields.io/badge/Project-Page-Green'></a> &nbsp;
+<a href="https://arxiv.org/abs/2503.05639"><img src="https://img.shields.io/badge/arXiv-2503.05639-b31b1b.svg"></a> &nbsp;
+<a href="https://github.com/TencentARC/VideoPainter"><img src="https://img.shields.io/badge/GitHub-Code-black?logo=github"></a> &nbsp;
+<a href="https://youtu.be/HYzNfsD3A0s"><img src="https://img.shields.io/badge/YouTube-Video-red?logo=youtube"></a> &nbsp;
+<a href='https://huggingface.co/datasets/TencentARC/VPData'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-blue'></a> &nbsp;
+<a href='https://huggingface.co/datasets/TencentARC/VPBench'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Benchmark-blue'></a> &nbsp;
+<a href="https://huggingface.co/TencentARC/VideoPainter"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue"></a>
+</p>
+**Your star means a lot to us as we develop this project!** ⭐⭐⭐
+**VPData and VPBench have been fully uploaded (contain 390K mask sequences and video captions). Welcome to use our biggest video segmentation dataset VPData with video captions!** 🔥🔥🔥
+**📖 Table of Contents**
+- [VideoPainter](#videopainter)
+  - [🔥 Update Log](#-update-log)
+  - [📌 TODO](#todo)
+  - [🛠️ Method Overview](#️-method-overview)
+  - [🚀 Getting Started](#-getting-started)
+    - [Environment Requirement 🌍](#environment-requirement-)
+    - [Data Download ⬇️](#data-download-️)
+  - [🏃🏼 Running Scripts](#-running-scripts)
+    - [Training 🤯](#training-)
+    - [Inference 📜](#inference-)
+    - [Evaluation 📏](#evaluation-)
+  - [🤝🏼 Cite Us](#-cite-us)
+  - [💖 Acknowledgement](#-acknowledgement)
+## 🔥 Update Log
+- [2025/3/09] 📢 📢  [VideoPainter](https://huggingface.co/TencentARC/VideoPainter) is released, an efficient, any-length video inpainting & editing framework with plug-and-play context control.
+- [2025/3/09] 📢 📢  [VPData](https://huggingface.co/datasets/TencentARC/VPData) and [VPBench](https://huggingface.co/datasets/TencentARC/VPBench) are released, the largest video inpainting dataset with precise segmentation masks and dense video captions (>390K clips).
+- [2025/3/25] 📢 📢  The 390K+ high-quality video segmentation masks of [VPData](https://huggingface.co/datasets/TencentARC/VPData) have been fully released.
+- [2025/3/25] 📢 📢  The raw videos of videovo subset have been uploaded to [VPData](https://huggingface.co/datasets/TencentARC/VPData), to solve the raw video link expiration issue.
+## TODO
+- [x] Release training and inference code
+- [x] Release evaluation code
+- [x] Release [VideoPainter checkpoints](https://huggingface.co/TencentARC/VideoPainter) (based on CogVideoX-5B)
+- [x] Release [VPData and VPBench](https://huggingface.co/collections/TencentARC/videopainter-67cc49c6146a48a2ba93d159) for large-scale training and evaluation.
+- [x] Release gradio demo
+- [ ] Data preprocessing code
+## 🛠️ Method Overview
+We propose a novel dual-stream paradigm VideoPainter that incorporates an efficient context encoder (comprising only 6\% of the backbone parameters) to process masked videos and inject backbone-aware background contextual cues to any pre-trained video DiT, producing semantically consistent content in a plug-and-play manner. This architectural separation significantly reduces the model's learning complexity while enabling nuanced integration of crucial background context. We also introduce a novel target region ID resampling technique that enables any-length video inpainting, greatly enhancing our practical applicability. Additionally, we establish a scalable dataset pipeline leveraging current vision understanding models, contributing VPData and VPBench to facilitate segmentation-based inpainting training and assessment, the largest video inpainting dataset and benchmark to date with over 390K diverse clips. Using inpainting as a pipeline basis, we also explore downstream applications including video editing and video editing pair data generation, demonstrating competitive performance and significant practical potential.
+![](assets/teaser.jpg)
+## 🚀 Getting Started
+<details>
+<summary><b>Environment Requirement 🌍</b></summary>
+Clone the repo:
+```
+git clone https://github.com/TencentARC/VideoPainter.git
+```
+We recommend that you first use `conda` to create a virtual environment and install the needed libraries. For example:
+```
+conda create -n videopainter python=3.10 -y
+conda activate videopainter
+pip install -r requirements.txt
+```
+Then, you can install diffusers (implemented in this repo) with:
+```
+cd ./diffusers
+pip install -e .
+```
+After that, you can install the required ffmpeg through:
+```
+conda install -c conda-forge ffmpeg -y
+```
+Optionally, you can install sam2 for the gradio demo through:
+```
+cd ./app
+pip install -e .
+```
+</details>
+<details>
+<summary><b>VPBench and VPData Download ⬇️</b></summary>
+You can download the VPBench [here](https://huggingface.co/datasets/TencentARC/VPBench), and the VPData [here](https://huggingface.co/datasets/TencentARC/VPData) (as well as the Davis we re-processed), which are used for training and testing VideoPainter. By downloading the data, you are agreeing to the terms and conditions of the license. The data structure should be like:
+```
+|-- data
+    |-- davis
+        |-- JPEGImages_432_240
+        |-- test_masks
+        |-- davis_caption
+        |-- test.json
+        |-- train.json
+    |-- videovo/raw_video
+        |-- 000005000
+            |-- 000005000000.0.mp4
+            |-- 000005000001.0.mp4
+            |-- ...
+        |-- 000005001
+        |-- ...
+    |-- pexels/pexels/raw_video
+        |-- 000000000
+            |-- 000000000000_852038.mp4
+            |-- 000000000001_852057.mp4
+            |-- ...
+        |-- 000000001
+        |-- ...
+    |-- video_inpainting
+        |-- videovo
+            |-- 000005000000/all_masks.npz
+            |-- 000005000001/all_masks.npz
+            |-- ...
+        |-- pexels
+            |-- ...
+    |-- pexels_videovo_train_dataset.csv
+    |-- pexels_videovo_val_dataset.csv
+    |-- pexels_videovo_test_dataset.csv
+    |-- our_video_inpaint.csv
+    |-- our_video_inpaint_long.csv
+    |-- our_video_edit.csv
+    |-- our_video_edit_long.csv
+    |-- pexels.csv
+    |-- videovo.csv
+```
+You can download the VPBench, and put the benchmark to the `data` folder by:
+```
+git lfs install
+git clone https://huggingface.co/datasets/TencentARC/VPBench
+mv VPBench data
+cd data
+unzip pexels.zip
+unzip videovo.zip
+unzip davis.zip
+unzip video_inpainting.zip
+```
+You can download the VPData (only mask and text annotations due to the space limit), and put the dataset to the `data` folder by:
+```
+git lfs install
+git clone https://huggingface.co/datasets/TencentARC/VPData
+mv VPData data
+# 1. unzip the masks in VPData
+python data_utils/unzip_folder.py --source_dir ./data/videovo_masks --target_dir ./data/video_inpainting/videovo
+python data_utils/unzip_folder.py --source_dir ./data/pexels_masks --target_dir ./data/video_inpainting/pexels
+# 2. unzip the raw videos in Videovo subset in VPData
+python data_utils/unzip_folder.py --source_dir ./data/videovo_raw_videos --target_dir ./data/videovo/raw_video
+```
+Note: *Due to the space limit, you need to run the following script to download the raw videos of the Pexels subset in VPData. The format should be consistent with VPData/VPBench above (after downloading the VPData/VPBench, the script will automatically place the raw videos of VPData into the corresponding dataset directories that have been created by VPBench).*
+```
+cd data_utils
+python VPData_download.py
+```
+</details>
+<details>
+<summary><b>Checkpoints</b></summary>
+Checkpoints of VideoPainter can be downloaded from [here](https://huggingface.co/TencentARC/VideoPainter). The ckpt folder contains:
+- VideoPainter pretrained checkpoints for CogVideoX-5b-I2V
+- VideoPainter IP Adapter pretrained checkpoints for CogVideoX-5b-I2V
+- pretrained CogVideoX-5b-I2V checkpoint from [HuggingFace](https://huggingface.co/THUDM/CogVideoX-5b-I2V).
+You can download the checkpoints, and put the checkpoints to the `ckpt` folder by:
+```
+git lfs install
+git clone https://huggingface.co/TencentARC/VideoPainter
+mv VideoPainter ckpt
+```
+You also need to download the base model [CogVideoX-5B-I2V](https://huggingface.co/THUDM/CogVideoX-5b-I2V) by:
+```
+git lfs install
+cd ckpt
+git clone https://huggingface.co/THUDM/CogVideoX-5b-I2V
+```
+[Optional] You need to download [FLUX.1-Fill-dev](https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev/) for first frame inpainting:
+```
+git lfs install
+cd ckpt
+git clone https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev
+mv FLUX.1-Fill-dev flux_inp
+```
+[Optional] You need to download [SAM2](https://huggingface.co/facebook/sam2-hiera-large) for video segmentation in gradio demo:
+```
+git lfs install
+cd ckpt
+wget https://huggingface.co/facebook/sam2-hiera-large/resolve/main/sam2_hiera_large.pt
+```
+You can also choose the segmentation checkpoints of other sizes to balance efficiency and performance, such as [SAM2-Tiny](https://huggingface.co/facebook/sam2-hiera-tiny).
+The ckpt structure should be like:
+```
+|-- ckpt
+    |-- VideoPainter/checkpoints
+        |-- branch
+            |-- config.json
+            |-- diffusion_pytorch_model.safetensors
+    |-- VideoPainterID/checkpoints
+        |-- pytorch_lora_weights.safetensors
+    |-- CogVideoX-5b-I2V
+        |-- scheduler
+        |-- transformer
+        |-- vae
+        |-- ...
+    |-- flux_inp
+        |-- scheduler
+        |-- transformer
+        |-- vae
+        |-- ...
+    |-- sam2_hiera_large.pt
+```
+</details>
+## 🏃🏼 Running Scripts
+<details>
+<summary><b>Training 🤯</b></summary>
+You can train the VideoPainter using the script:
+```
+# cd train
+# bash VideoPainter.sh
+export MODEL_PATH="../ckpt/CogVideoX-5b-I2V"
+export CACHE_PATH="$HOME/.cache"
+export DATASET_PATH="../data/videovo/raw_video"
+export PROJECT_NAME="pexels_videovo-inpainting"
+export RUNS_NAME="VideoPainter"
+export OUTPUT_PATH="./${PROJECT_NAME}/${RUNS_NAME}"
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export TOKENIZERS_PARALLELISM=false
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+accelerate launch --config_file accelerate_config_machine_single_ds.yaml  --machine_rank 0 \
+  train_cogvideox_inpainting_i2v_video.py \
+  --pretrained_model_name_or_path $MODEL_PATH \
+  --cache_dir $CACHE_PATH \
+  --meta_file_path ../data/pexels_videovo_train_dataset.csv \
+  --val_meta_file_path ../data/pexels_videovo_val_dataset.csv \
+  --instance_data_root $DATASET_PATH \
+  --dataloader_num_workers 1 \
+  --num_validation_videos 1 \
+  --validation_epochs 1 \
+  --seed 42 \
+  --mixed_precision bf16 \
+  --output_dir $OUTPUT_PATH \
+  --height 480 \
+  --width 720 \
+  --fps 8 \
+  --max_num_frames 49 \
+  --video_reshape_mode "resize" \
+  --skip_frames_start 0 \
+  --skip_frames_end 0 \
+  --max_text_seq_length 226 \
+  --branch_layer_num 2 \
+  --train_batch_size 1 \
+  --num_train_epochs 10 \
+  --checkpointing_steps 1024 \
+  --validating_steps 256 \
+  --gradient_accumulation_steps 1 \
+  --learning_rate 1e-5 \
+  --lr_scheduler cosine_with_restarts \
+  --lr_warmup_steps 1000 \
+  --lr_num_cycles 1 \
+  --enable_slicing \
+  --enable_tiling \
+  --noised_image_dropout 0.05 \
+  --gradient_checkpointing \
+  --optimizer AdamW \
+  --adam_beta1 0.9 \
+  --adam_beta2 0.95 \
+  --max_grad_norm 1.0 \
+  --allow_tf32 \
+  --report_to wandb \
+  --tracker_name $PROJECT_NAME \
+  --runs_name $RUNS_NAME \
+  --inpainting_loss_weight 1.0 \
+  --mix_train_ratio 0 \
+  --first_frame_gt \
+  --mask_add \
+  --mask_transform_prob 0.3 \
+  --p_brush 0.4 \
+  --p_rect 0.1 \
+  --p_ellipse 0.1 \
+  --p_circle 0.1 \
+  --p_random_brush 0.3
+# cd train
+# bash VideoPainterID.sh
+export MODEL_PATH="../ckpt/CogVideoX-5b-I2V"
+export BRANCH_MODEL_PATH="../ckpt/VideoPainter/checkpoints/branch"
+export CACHE_PATH="$HOME/.cache"
+export DATASET_PATH="../data/videovo/raw_video"
+export PROJECT_NAME="pexels_videovo-inpainting"
+export RUNS_NAME="VideoPainterID"
+export OUTPUT_PATH="./${PROJECT_NAME}/${RUNS_NAME}"
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export TOKENIZERS_PARALLELISM=false
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+accelerate launch --config_file accelerate_config_machine_single_ds_wo_cpu.yaml --machine_rank 0 \
+  train_cogvideox_inpainting_i2v_video_resample.py \
+  --pretrained_model_name_or_path $MODEL_PATH \
+  --cogvideox_branch_name_or_path $BRANCH_MODEL_PATH \
+  --cache_dir $CACHE_PATH \
+  --meta_file_path ../data/pexels_videovo_train_dataset.csv \
+  --val_meta_file_path ../data/pexels_videovo_val_dataset.csv \
+  --instance_data_root $DATASET_PATH \
+  --dataloader_num_workers 1 \
+  --num_validation_videos 1 \
+  --validation_epochs 1 \
+  --seed 42 \
+  --rank 256 \
+  --lora_alpha 128 \
+  --mixed_precision bf16 \
+  --output_dir $OUTPUT_PATH \
+  --height 480 \
+  --width 720 \
+  --fps 8 \
+  --max_num_frames 49 \
+  --video_reshape_mode "resize" \
+  --skip_frames_start 0 \
+  --skip_frames_end 0 \
+  --max_text_seq_length 226 \
+  --branch_layer_num 2 \
+  --train_batch_size 1 \
+  --num_train_epochs 10 \
+  --checkpointing_steps 256 \
+  --validating_steps 128 \
+  --gradient_accumulation_steps 1 \
+  --learning_rate 5e-5 \
+  --lr_scheduler cosine_with_restarts \
+  --lr_warmup_steps 200 \
+  --lr_num_cycles 1 \
+  --enable_slicing \
+  --enable_tiling \
+  --noised_image_dropout 0.05 \
+  --gradient_checkpointing \
+  --optimizer AdamW \
+  --adam_beta1 0.9 \
+  --adam_beta2 0.95 \
+  --max_grad_norm 1.0 \
+  --allow_tf32 \
+  --report_to wandb \
+  --tracker_name $PROJECT_NAME \
+  --runs_name $RUNS_NAME \
+  --inpainting_loss_weight 1.0 \
+  --mix_train_ratio 0 \
+  --first_frame_gt \
+  --mask_add \
+  --mask_transform_prob 0.3 \
+  --p_brush 0.4 \
+  --p_rect 0.1 \
+  --p_ellipse 0.1 \
+  --p_circle 0.1 \
+  --p_random_brush 0.3 \
+  --id_pool_resample_learnable
+```
+</details>
+<details>
+<summary><b>Inference 📜</b></summary>
+You can run inference for video inpainting or editing with the script:
+```
+cd infer
+# video inpainting
+bash inpaint.sh
+# video inpainting with ID resampling
+bash inpaint_id_resample.sh
+# video editing
+bash edit.sh
+```
+Our VideoPainter can also function as a video editing pair data generator; you can run inference with the script:
+```
+bash edit_bench.sh
+```
+Since VideoPainter is trained on public Internet videos, it primarily performs well on general scenarios. For high-quality industrial applications (e.g., product exhibitions, virtual try-on), we recommend training the model on your domain-specific data. We welcome and appreciate any contributions of trained models from the community!
+</details>
+<details>
+<summary><b>Gradio Demo 🖌️</b></summary>
+You can also run inference through the gradio demo:
+```
+# cd app
+CUDA_VISIBLE_DEVICES=0 python app.py \
+    --model_path ../ckpt/CogVideoX-5b-I2V \
+    --inpainting_branch ../ckpt/VideoPainter/checkpoints/branch \
+    --id_adapter ../ckpt/VideoPainterID/checkpoints \
+    --img_inpainting_model ../ckpt/flux_inp
+```
+</details>
+<details>
+<summary><b>Evaluation 📏</b></summary>
+You can evaluate using the script:
+```
+cd evaluate
+# video inpainting
+bash eval_inpainting.sh
+# video inpainting with ID resampling
+bash eval_inpainting_id_resample.sh
+# video editing
+bash eval_edit.sh
+# video editing with ID resampling
+bash eval_editing_id_resample.sh
+```
+</details>
+## 🤝🏼 Cite Us
+```
+@article{bian2025videopainter,
+  title={VideoPainter: Any-length Video Inpainting and Editing with Plug-and-Play Context Control},
+  author={Bian, Yuxuan and Zhang, Zhaoyang and Ju, Xuan and Cao, Mingdeng and Xie, Liangbin and Shan, Ying and Xu, Qiang},
+  journal={arXiv preprint arXiv:2503.05639},
+  year={2025}
+}
+```
+## 💖 Acknowledgement
+<span id="acknowledgement"></span>
+Our code is modified based on [diffusers](https://github.com/huggingface/diffusers) and [CogVideoX](https://github.com/THUDM/CogVideo), thanks to all the contributors!

VideoPainter/checkpoints/branch/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_class_name": "CogvideoXBranchModel",
+  "_diffusers_version": "0.31.0.dev0",
+  "_name_or_path": "/group/40005/yuxuanbian/hf_models/CogVideoX-5b-I2V",
+  "activation_fn": "gelu-approximate",
+  "attention_bias": true,
+  "attention_head_dim": 64,
+  "dropout": 0.0,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "id_pool_resample_learnable": false,
+  "in_channels": 32,
+  "max_text_seq_length": 226,
+  "norm_elementwise_affine": true,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 48,
+  "num_layers": 2,
+  "out_channels": 16,
+  "patch_size": 2,
+  "sample_frames": 49,
+  "sample_height": 60,
+  "sample_width": 90,
+  "spatial_interpolation_scale": 1.875,
+  "temporal_compression_ratio": 4,
+  "temporal_interpolation_scale": 1.0,
+  "text_embed_dim": 4096,
+  "time_embed_dim": 512,
+  "timestep_activation_fn": "silu",
+  "use_learned_positional_embeddings": true,
+  "use_rotary_positional_embeddings": true,
+  "wo_text": false
+}

VideoPainter/checkpoints/branch/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d01728cb0cb605b591f41cbea033db22d5ae72d0b37565957feae71b089be8e
+size 712360464

VideoPainterID/checkpoints/pytorch_lora_weights.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8651aac99c672b7e9ca497e62449d223640432c7dd26a85e6697c0efd87dad1f
+size 528527480

assets/method.jpg ADDED Viewed

Git LFS Details

SHA256: 3f52eb3838b2447353603a76be5881389848f471f23a5e4f4d8d346be86bfbbc
Pointer size: 131 Bytes
Size of remote file: 995 kB

assets/teaser.jpg ADDED Viewed

Git LFS Details

SHA256: 9374cf8c7765411bc1b7dd00b3ec3fe8dbdcd50fe3f14eadfbe6b3c33029707d
Pointer size: 132 Bytes
Size of remote file: 2.72 MB

config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_class_name": "VideoPainter",
+  "_diffusers_version": "0.31.0.dev0",
+  "_name_or_path": "hf_models/CogVideoX-5b-I2V",
+  "activation_fn": "gelu-approximate",
+  "attention_bias": true,
+  "attention_head_dim": 64,
+  "dropout": 0.0,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "id_pool_resample_learnable": false,
+  "in_channels": 32,
+  "max_text_seq_length": 226,
+  "norm_elementwise_affine": true,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 48,
+  "num_layers": 2,
+  "out_channels": 16,
+  "patch_size": 2,
+  "sample_frames": 49,
+  "sample_height": 60,
+  "sample_width": 90,
+  "spatial_interpolation_scale": 1.875,
+  "temporal_compression_ratio": 4,
+  "temporal_interpolation_scale": 1.0,
+  "text_embed_dim": 4096,
+  "time_embed_dim": 512,
+  "timestep_activation_fn": "silu",
+  "use_learned_positional_embeddings": true,
+  "use_rotary_positional_embeddings": true,
+  "wo_text": false
+}

i3d_rgb_imagenet.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2609088c2e8c868187c9921c50bc225329a9057ed75e76120e0b4a397a2c7538
+size 50883138